sdg-hub 0.1.0a4__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. sdg_hub/_version.py +2 -2
  2. sdg_hub/blocks/__init__.py +41 -5
  3. sdg_hub/blocks/block.py +58 -16
  4. sdg_hub/blocks/llmblock.py +121 -193
  5. sdg_hub/blocks/openaichatblock.py +556 -0
  6. sdg_hub/blocks/utilblocks.py +500 -43
  7. sdg_hub/checkpointer.py +139 -0
  8. sdg_hub/configs/annotations/detailed_annotations.yaml +28 -0
  9. sdg_hub/configs/annotations/simple_annotations.yaml +9 -0
  10. sdg_hub/configs/knowledge/atomic_facts.yaml +1 -0
  11. sdg_hub/configs/knowledge/detailed_summary.yaml +1 -0
  12. sdg_hub/configs/knowledge/extractive_summary.yaml +1 -0
  13. sdg_hub/configs/knowledge/generate_questions.yaml +82 -0
  14. sdg_hub/configs/knowledge/generate_responses.yaml +86 -0
  15. sdg_hub/configs/skills/contexts.yaml +18 -11
  16. sdg_hub/configs/skills/evaluate_freeform_pair.yaml +79 -12
  17. sdg_hub/configs/skills/evaluate_freeform_questions.yaml +60 -28
  18. sdg_hub/configs/skills/evaluate_grounded_pair.yaml +95 -30
  19. sdg_hub/configs/skills/freeform_questions.yaml +21 -16
  20. sdg_hub/configs/skills/freeform_responses.yaml +19 -25
  21. sdg_hub/configs/skills/router.yaml +53 -6
  22. sdg_hub/flow.py +366 -33
  23. sdg_hub/flow_runner.py +437 -0
  24. sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +21 -9
  25. sdg_hub/flows/generation/skills/{agentic_improve_skill.yaml → improve_responses.yaml} +26 -31
  26. sdg_hub/flows/generation/skills/synth_skills.yaml +4 -4
  27. sdg_hub/pipeline.py +67 -12
  28. sdg_hub/prompts.py +52 -0
  29. sdg_hub/sdg.py +128 -86
  30. sdg_hub/utils/__init__.py +5 -0
  31. sdg_hub/utils/config_validation.py +91 -0
  32. sdg_hub/utils/error_handling.py +94 -0
  33. sdg_hub/utils/path_resolution.py +62 -0
  34. sdg_hub/utils/validation_result.py +10 -0
  35. sdg_hub-0.1.2.dist-info/METADATA +190 -0
  36. sdg_hub-0.1.2.dist-info/RECORD +89 -0
  37. {sdg_hub-0.1.0a4.dist-info → sdg_hub-0.1.2.dist-info}/WHEEL +1 -1
  38. sdg_hub/blocks/filterblock.py +0 -76
  39. sdg_hub/blocks/iterblock.py +0 -31
  40. sdg_hub/blocks/rmblocks.py +0 -194
  41. sdg_hub/configs/annotations/simple.yaml +0 -10
  42. sdg_hub/configs/knowledge/data_recipe/default_recipe.yaml +0 -3
  43. sdg_hub/configs/skills/data_recipe/default_recipe.yaml +0 -6
  44. sdg_hub/flows/annotation/emotion/detailed_description.yaml +0 -19
  45. sdg_hub/flows/annotation/emotion/detailed_description_icl.yaml +0 -19
  46. sdg_hub/flows/annotation/emotion/simple.yaml +0 -19
  47. sdg_hub/utils/chunking.py +0 -73
  48. sdg_hub/utils/docprocessor.py +0 -357
  49. sdg_hub/utils/parse_and_convert.py +0 -392
  50. sdg_hub-0.1.0a4.dist-info/METADATA +0 -309
  51. sdg_hub-0.1.0a4.dist-info/RECORD +0 -90
  52. /sdg_hub/configs/{knowledge/data_recipe → reasoning}/__init__.py +0 -0
  53. /sdg_hub/configs/skills/{_G_.yaml → icl_examples/STEM.yaml} +0 -0
  54. /sdg_hub/configs/skills/{data_recipe → icl_examples}/__init__.py +0 -0
  55. /sdg_hub/configs/skills/{_A_.yaml → icl_examples/coding.yaml} +0 -0
  56. /sdg_hub/configs/skills/{_B_.yaml → icl_examples/extraction.yaml} +0 -0
  57. /sdg_hub/configs/skills/{_C_.yaml → icl_examples/humanities.yaml} +0 -0
  58. /sdg_hub/configs/skills/{_D_.yaml → icl_examples/math.yaml} +0 -0
  59. /sdg_hub/configs/skills/{_E_.yaml → icl_examples/reasoning.yaml} +0 -0
  60. /sdg_hub/configs/skills/{_F_.yaml → icl_examples/roleplay.yaml} +0 -0
  61. /sdg_hub/configs/skills/{_H_.yaml → icl_examples/writing.yaml} +0 -0
  62. {sdg_hub-0.1.0a4.dist-info → sdg_hub-0.1.2.dist-info}/licenses/LICENSE +0 -0
  63. {sdg_hub-0.1.0a4.dist-info → sdg_hub-0.1.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,62 @@ sdg_hub/utils/path_resolution.py (new file)
+ """
+ Path resolution utilities for SDG Hub.
+
+ This module provides utilities for resolving file paths relative to one or more
+ base directories, with support for both single directory and multiple directory
+ search paths.
+ """
+
+ # Standard
+ from typing import List, Union
+ import os
+
+
+ def resolve_path(filename: str, search_dirs: Union[str, List[str]]) -> str:
+     """Resolve a file path relative to one or more search directories.
+
+     Files are checked in the following order:
+     1. Absolute path is always used as-is
+     2. Checked relative to each directory in search_dirs (in order)
+     3. If not found, returns the original filename (assumes relative to current directory)
+
+     Parameters
+     ----------
+     filename : str
+         The path to the file to resolve.
+     search_dirs : Union[str, List[str]]
+         Directory or list of directories in which to search for the file.
+
+     Returns
+     -------
+     str
+         Resolved file path.
+
+     Examples
+     --------
+     >>> resolve_path("config.yaml", "/path/to/base")
+     '/path/to/base/config.yaml' # if file exists
+
+     >>> resolve_path("config.yaml", ["/path1", "/path2"])
+     '/path1/config.yaml' # if file exists in path1
+     '/path2/config.yaml' # if file exists in path2 but not path1
+
+     >>> resolve_path("/absolute/path/file.yaml", ["/path1", "/path2"])
+     '/absolute/path/file.yaml' # absolute path always used as-is
+     """
+     # Handle absolute paths - always use as-is
+     if os.path.isabs(filename):
+         return filename
+
+     # Convert single directory to list for uniform handling
+     if isinstance(search_dirs, str):
+         search_dirs = [search_dirs]
+
+     # Check each directory in order
+     for directory in search_dirs:
+         full_file_path = os.path.join(directory, filename)
+         if os.path.isfile(full_file_path):
+             return full_file_path
+
+     # If not found in any search directory, return the original filename
+     # This assumes the path is relative to the current directory
+     return filename
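A minimal usage sketch of the new helper, assuming the 0.1.2 wheel is installed; the directory names below are illustrative, not taken from the package:

```python
# Usage sketch for the new path resolution helper (sdg_hub 0.1.2).
# The search directories here are illustrative placeholders.
from sdg_hub.utils.path_resolution import resolve_path

# Look for a config first in a flow-local directory, then in a shared fallback.
config_path = resolve_path(
    "atomic_facts.yaml",
    ["/opt/flows/knowledge", "/opt/sdg_hub/configs/knowledge"],
)

# If the file exists in neither directory, the original relative name comes back
# unchanged and resolution is deferred to the current working directory.
print(config_path)
```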
@@ -0,0 +1,10 @@ sdg_hub/utils/validation_result.py (new file)
+ from typing import List
+
+
+ class ValidationResult:
+     def __init__(self, valid: bool, errors: List[str]):
+         self.valid = valid
+         self.errors = errors
+
+     def __repr__(self):
+         return f"ValidationResult(valid={self.valid}, errors={self.errors})"
@@ -0,0 +1,190 @@ sdg_hub-0.1.2.dist-info/METADATA (new file)
+ Metadata-Version: 2.4
+ Name: sdg_hub
+ Version: 0.1.2
+ Summary: Synthetic Data Generation
+ Author-email: Red Hat AI Innovation <abhandwa@redhat.com>
+ License: Apache-2.0
+ Project-URL: homepage, https://ai-innovation.team/
+ Project-URL: source, https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub
+ Project-URL: issues, https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/issues
+ Classifier: Development Status :: 3 - Alpha
+ Classifier: Environment :: Console
+ Classifier: License :: OSI Approved :: Apache Software License
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Operating System :: MacOS :: MacOS X
+ Classifier: Operating System :: POSIX :: Linux
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: Implementation :: CPython
+ Requires-Python: >=3.9
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: click<9.0.0,>=8.1.7
+ Requires-Dist: datasets<4.0.0,>=2.18.0
+ Requires-Dist: httpx<1.0.0,>=0.25.0
+ Requires-Dist: jinja2
+ Requires-Dist: openai<2.0.0,>=1.13.3
+ Requires-Dist: rich
+ Requires-Dist: tenacity!=8.4.0,>=8.3.0
+ Requires-Dist: tqdm<5.0.0,>=4.66.2
+ Provides-Extra: web-interface
+ Requires-Dist: flask>=3.0.2; extra == "web-interface"
+ Requires-Dist: pyyaml>=6.0.1; extra == "web-interface"
+ Requires-Dist: flask-wtf>=1.2.2; extra == "web-interface"
+ Provides-Extra: vllm
+ Requires-Dist: vllm<0.8.4,>=0.8.0; extra == "vllm"
+ Requires-Dist: torch>=2.0.0; extra == "vllm"
+ Requires-Dist: transformers>=4.37.0; extra == "vllm"
+ Requires-Dist: accelerate>=0.21.0; extra == "vllm"
+ Requires-Dist: xformers>=0.0.22.post7; extra == "vllm"
+ Provides-Extra: examples
+ Requires-Dist: tabulate>=0.9.0; extra == "examples"
+ Requires-Dist: transformers>=4.37.0; extra == "examples"
+ Requires-Dist: langchain-text-splitters; extra == "examples"
+ Requires-Dist: docling>=2.3.0; extra == "examples"
+ Provides-Extra: dev
+ Requires-Dist: pre-commit<4.0,>=3.0.4; extra == "dev"
+ Requires-Dist: pylint<4.0,>=2.16.2; extra == "dev"
+ Requires-Dist: pylint-pydantic; extra == "dev"
+ Requires-Dist: pytest; extra == "dev"
+ Requires-Dist: pytest-asyncio; extra == "dev"
+ Requires-Dist: pytest-cov; extra == "dev"
+ Requires-Dist: pytest-html; extra == "dev"
+ Requires-Dist: tox<5,>=4.4.2; extra == "dev"
+ Dynamic: license-file
+
+ # SDG Hub: Synthetic Data Generation Toolkit
+
+ [![Build](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/actions/workflows/pypi.yaml/badge.svg?branch=main)](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/actions/workflows/pypi.yaml)
+ [![Release](https://img.shields.io/github/v/release/Red-Hat-AI-Innovation-Team/sdg_hub)](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/releases)
+ [![License](https://img.shields.io/github/license/Red-Hat-AI-Innovation-Team/sdg_hub)](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/blob/main/LICENSE)
+ [![Tests](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/actions/workflows/test.yml/badge.svg)](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/actions/workflows/test.yml)
+ [![codecov](https://codecov.io/gh/Red-Hat-AI-Innovation-Team/sdg_hub/graph/badge.svg?token=SP75BCXWO2)](https://codecov.io/gh/Red-Hat-AI-Innovation-Team/sdg_hub)
+
+ <html>
+ <h3 align="center">
+ A modular, scalable, and efficient solution for creating synthetic data generation flows in a "low-code" manner.
+ </h3>
+ <h3 align="center">
+ <a href="http://ai-innovation.team/sdg_hub">Documentation</a> |
+ <a href="examples/">Examples</a> |
+ <a href="https://www.youtube.com/watch?v=aGKCViWjAmA">Video Tutorial</a>
+ </h3>
+ </html>
+
+ SDG Hub is designed to simplify data creation for LLMs, allowing users to chain computational units and build powerful flows for generating data and processing tasks. Define complex workflows using nothing but YAML configuration files.
+
+ **📖 Full documentation available at: [https://ai-innovation.team/sdg_hub](https://ai-innovation.team/sdg_hub)**
+
+ ---
+
+ ## ✨ Key Features
+
+ - **Low-Code Flow Creation**: Build sophisticated data generation pipelines using
+ simple YAML configuration files without writing any code.
+
+ - **Modular Block System**: Compose workflows from reusable, self-contained
+ blocks that handle LLM calls, data transformations, and filtering.
+
+ - **LLM-Agnostic**: Works with any language model through configurable
+ prompt templates and generation parameters.
+
+ - **Prompt Engineering Friendly**: Tune LLM behavior by editing declarative YAML prompts.
+
+ ## 🚀 Installation
+
+ ### Stable Release (Recommended)
+
+ ```bash
+ pip install sdg-hub
+ ```
+
+ ### Development Version
+
+ ```bash
+ pip install git+https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub.git
+ ```
+
+ ## 🏁 Quick Start
+
+ ### Prerequisites
+
+ Before getting started, make sure you have:
+ - Python 3.8 or higher
+ - LLM Inference Endpoint exposed through OpenAI API
+
+ ### Simple Example
+
+ Here's the simplest way to get started:
+
+ ```python
+ from sdg_hub.flow_runner import run_flow
+
+ # Run a basic knowledge generation flow
+ run_flow(
+     ds_path="my_data.jsonl",
+     save_path="output.jsonl",
+     endpoint="http://0.0.0.0:8000/v1",
+     flow_path="flows/generation/knowledge/synth_knowledge.yaml"
+ )
+ ```
+
+ ### Advanced Configuration
+ You can invoke any built-in flow using run_flow:
+ ```python
+ from sdg_hub.flow_runner import run_flow
+
+ run_flow(
+     ds_path="path/to/dataset.jsonl",
+     save_path="path/to/output.jsonl",
+     endpoint="http://0.0.0.0:8000/v1",
+     flow_path="path/to/flow.yaml",
+     checkpoint_dir="path/to/checkpoints",
+     batch_size=8,
+     num_workers=32,
+     save_freq=2,
+ )
+ ```
+
+ ### 📂 Available Built-in Flows
+
+ You can start with any of these YAML flows out of the box:
+
+ #### 🔎 **Knowledge Flows**
+
+ | Flow | Description |
+ |------|-------------|
+ | `synth_knowledge.yaml` | Produces document-grounded questions and answers for factual memorization |
+ | `synth_knowledge1.5.yaml` | Improved version that builds intermediate representations for better recall |
+
+ #### 🧠 **Skills Flows**
+
+ | Flow | Description |
+ |------|-------------|
+ | `synth_skills.yaml` | Freeform skills QA generation (eg: "Create a new github issue to add type hints") |
+ | `synth_grounded_skills.yaml` | Domain-specific skill generation (eg: "From the given conversation create a table for feature requests") |
+ | `improve_responses.yaml` | Uses planning and critique-based refinement to improve generated answers |
+
+ All these can be found here: [flows](src/sdg_hub/flows)
+
+ ## 📺 Video Tutorial
+
+ For a comprehensive walkthrough of sdg_hub:
+
+ [![SDG Hub Tutorial](https://img.youtube.com/vi/aGKCViWjAmA/0.jpg)](https://www.youtube.com/watch?v=aGKCViWjAmA)
+
+ ## 🤝 Contributing
+
+ We welcome contributions from the community! Whether it's bug reports, feature requests, documentation improvements, or code contributions, please check out our [contribution guidelines](CONTRIBUTING.md).
+
+ ## 📄 License
+
+ This project is licensed under the Apache 2.0 License - see the [LICENSE](LICENSE) file for details.
+
+ ---
+
+ Built with ❤️ by the Red Hat AI Innovation Team
@@ -0,0 +1,89 @@ sdg_hub-0.1.2.dist-info/RECORD (new file)
+ sdg_hub/__init__.py,sha256=5Wa6onDndPvG4iwnjq2jK747t3-7XKdQn2WfHfq1sFc,67
+ sdg_hub/_version.py,sha256=bSmADqydH8nBu-J4lG8UVuR7hnU_zcwhnSav2oQ0W0A,511
+ sdg_hub/checkpointer.py,sha256=R0pNKL_q7-BerxmIarY0w1nFYaq7fGnoRRkCVL6Z-Gw,5053
+ sdg_hub/flow.py,sha256=14WDZfb-VDUBwXsVo9u5oMuWD6aOm-GWtIdT64z4j-0,18050
+ sdg_hub/flow_runner.py,sha256=xeAIdx2r86kwtdrMFysjR1N-j4teonvbSHKg-m1VNSs,14584
+ sdg_hub/logger_config.py,sha256=7uHEJVRfym1c4n95DOKHelLXqAus8uHsZYmzLsEjqpo,422
+ sdg_hub/pipeline.py,sha256=mahktfoCMVnuBnvLNjAVOAoFKNQo-wb0Dz1_xdYhKDM,3852
+ sdg_hub/prompts.py,sha256=Gto1KcIhO-50ERvZx1Qzu-eAhSlIkOjYH9F6j2eIPfY,17482
+ sdg_hub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ sdg_hub/registry.py,sha256=Sc_HNxo4n0pgWMiEDd_sLjxaSXAMZFiHJIhQKqjywwk,3772
+ sdg_hub/sdg.py,sha256=8SKrSnqyvJAwE2Muf9lXw9ONRcDzqmCtaEzFHCYW4CY,6914
+ sdg_hub/blocks/__init__.py,sha256=I-kMjIM7E1NrPLyBuUi0yNoXnuw_kTK3A7ybyt3pOxU,936
+ sdg_hub/blocks/block.py,sha256=zdeyDyYiY0EdD3xS7kZR2hRZCRkbygQ4WONp_zv3X7w,3051
+ sdg_hub/blocks/llmblock.py,sha256=nWslPFZSCiyL7MXQurOk6Jx29UOsgnVDMI3PTwje7kg,13678
+ sdg_hub/blocks/openaichatblock.py,sha256=BWsWFEozWptwe1MMaz-_ZmgQPsNbCRun6ZlaKD3ICxQ,20016
+ sdg_hub/blocks/utilblocks.py,sha256=U2PQk26cwHOgofk5IenHjrao08gbqPFOBNRy5QJ-EEY,18290
+ sdg_hub/configs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ sdg_hub/configs/annotations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ sdg_hub/configs/annotations/cot_reflection.yaml,sha256=60EdsTe1y7GoUIAWYSGfMa3EKI3oLZKCvDuKU7wHgQU,1737
+ sdg_hub/configs/annotations/detailed_annotations.yaml,sha256=in21xmlhxDJGEaWh1IgINh33tEyW9AuyG3k4pWBuKSM,1520
+ sdg_hub/configs/annotations/detailed_description.yaml,sha256=FsGbQMBxf1MAOi0nhrQ4icxcwYMlRura_ji9Pmeh1AA,192
+ sdg_hub/configs/annotations/detailed_description_icl.yaml,sha256=NDdwo5EShnYZjm1Fn80sZTAwfnwpPigixP2hvJ8--cU,679
+ sdg_hub/configs/annotations/simple_annotations.yaml,sha256=e2F_Ow8EG_me4XJ2cnBTlKb9y1FmdX0DHKkiMqiwdUQ,188
+ sdg_hub/configs/knowledge/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ sdg_hub/configs/knowledge/atomic_facts.yaml,sha256=bIfQr0q0FyReO94v_lpLO56FikARCvFmZza-ISZTOnA,2453
+ sdg_hub/configs/knowledge/auxilary_instructions.yaml,sha256=aCgIjvNacdC2ZHThEvhZKvwORK6KqErVvVYQYQrIDLE,2034
+ sdg_hub/configs/knowledge/detailed_summary.yaml,sha256=_Mc_i9vaLp1OPKexSOURV5gbXEG41p1eELUukOhz8oM,388
+ sdg_hub/configs/knowledge/evaluate_faithfulness.yaml,sha256=iuvx5vNNm_jzHlmcKF83StaDYezRz2vQn3JUHM-TMdQ,3054
+ sdg_hub/configs/knowledge/evaluate_question.yaml,sha256=02mikEAJCUEkREBo7KxPY9H6iTUHQN-4cRkn2XMlVQ8,1915
+ sdg_hub/configs/knowledge/evaluate_relevancy.yaml,sha256=ASh8A1HAYO1h1tQRrwGnkUmK1n-WDKLdfW_LbSW1ipQ,3690
+ sdg_hub/configs/knowledge/extractive_summary.yaml,sha256=TYgJ7WQc7NFkf3GeRsbx6lwfA_xFnEOYGELewSqorp0,399
+ sdg_hub/configs/knowledge/generate_code_questions_responses.yaml,sha256=cIus2JYMYDvxHFVSU9QVa-1IK5KoChb3rCU2b4b9UmI,908
+ sdg_hub/configs/knowledge/generate_questions.yaml,sha256=iJtttZrVvlXFraUSrMowqTCLoJOLDbBndcTNMPTO8A4,2788
+ sdg_hub/configs/knowledge/generate_questions_responses.yaml,sha256=H9nb_5xGP7k6HtC3VboXqpiI5kQ9Xp3vjhXH3YIFesk,2525
+ sdg_hub/configs/knowledge/generate_responses.yaml,sha256=wwiB7lSB9yEB1XG2SIEIRtHkSlKh3NGJAmDaq2J6-ZY,2483
+ sdg_hub/configs/knowledge/mcq_generation.yaml,sha256=d4VKegnVIexwCn0e2AJs-0DC6XdLyUBGaCsQVwzICUE,3152
+ sdg_hub/configs/knowledge/router.yaml,sha256=9m_cX3xl808Vwrcq2PACyX45QFPkrV2nVYIY8x10JBU,119
+ sdg_hub/configs/knowledge/simple_generate_qa.yaml,sha256=OsuZP9SxQeUhTsHdhUO10mnjJ1u_6xekW5IQucFpRco,1565
+ sdg_hub/configs/reasoning/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ sdg_hub/configs/reasoning/dynamic_cot.yaml,sha256=6XY_mFpB_oKFQ7U2CmHTqkJRGVHgOvpNmIDfhksYW6o,2641
+ sdg_hub/configs/skills/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ sdg_hub/configs/skills/analyzer.yaml,sha256=QBtyjaU6HBZqzNOmev_W4_scn_hH7Rfxd2xL_LcPLho,2261
+ sdg_hub/configs/skills/annotation.yaml,sha256=k5nJ357kUr0Uvq7Hkt3Ey22UbgSjgSjIomjHFfjaQnY,916
+ sdg_hub/configs/skills/contexts.yaml,sha256=MZ2QpuGhTce6kuEsMleaGblljhGG-yhXBuH42htA2P4,1161
+ sdg_hub/configs/skills/critic.yaml,sha256=Dr7anOKa7Xx1oDonXzsCfXwKIl4hUTArx2Sb_rgpLQI,1808
+ sdg_hub/configs/skills/evaluate_freeform_pair.yaml,sha256=MOI0-GyKrJ_O4v1mm8A1lIKxXfwcS3dA7GjlpDEuXRU,4055
+ sdg_hub/configs/skills/evaluate_freeform_questions.yaml,sha256=yDmLd-3A9pN5VLaT4lAcJ_ZvCY43LYlcS1KEdxpBRjU,2559
+ sdg_hub/configs/skills/evaluate_grounded_pair.yaml,sha256=vMQtsHpNxPOOHnkzqWPp-N1gSfwPqTbfcKmNfhb9WS8,4648
+ sdg_hub/configs/skills/evaluate_grounded_questions.yaml,sha256=9yr97azFhMdOfYp11BFtDSIhhP4wjQMOxYZnKWKlCPU,3115
+ sdg_hub/configs/skills/freeform_questions.yaml,sha256=N6R3c1jNiSSw6T-OUJULpLnPHuaSXjvoNjSqTKL6EOY,1500
+ sdg_hub/configs/skills/freeform_responses.yaml,sha256=4URTMsPpgSDOVj71Gw3lL82QWnUFR37iE72BIMwwv7c,1544
+ sdg_hub/configs/skills/grounded_questions.yaml,sha256=t6pKjt5Fp_ThZueB7JBrUKuQLQY_At-Y9O67OtrIXMo,1898
+ sdg_hub/configs/skills/grounded_responses.yaml,sha256=kVOeBp3BjKCFKG2qConXIQVVPI1EgcKJgKn6DFAkl1s,1860
+ sdg_hub/configs/skills/judge.yaml,sha256=FxnJA_wdmyMyMqGEZDAT8hc2itO845mGDNXgpmV2EUU,3203
+ sdg_hub/configs/skills/planner.yaml,sha256=yNF6t0EnmwYt1EV9Y3-vkmPcbOQRtvoLr8MITuiUw_A,2086
+ sdg_hub/configs/skills/respond.yaml,sha256=K1Q5X5_Q1k60hNDbHDjMYBzxbyOIEEHTQcXW6qQ4Ve0,108
+ sdg_hub/configs/skills/revised_responder.yaml,sha256=rjypOJbhZV9PuOD9YhlYgymxOJV8Zdzzz54x6Fxn2bY,2875
+ sdg_hub/configs/skills/router.yaml,sha256=7YnFp6H5wYD8W5Qn1Ac4r9dGBSFUDhZSNwmglQ99PgQ,3545
+ sdg_hub/configs/skills/simple_generate_qa_freeform.yaml,sha256=j8cJtEKSvtA__rE08iU6oz2XnfIgj0HiLVL8-6RhK3c,1431
+ sdg_hub/configs/skills/simple_generate_qa_grounded.yaml,sha256=tvX9EN5TArFesOOqpdN3hb-IHe7O82a2twQd-gzyCgw,1500
+ sdg_hub/configs/skills/icl_examples/STEM.yaml,sha256=5dcLC5jXOEeDasBkTunnHYrlddI3HcHYnEAXZcrd0ds,8412
+ sdg_hub/configs/skills/icl_examples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ sdg_hub/configs/skills/icl_examples/coding.yaml,sha256=a5m-pUcV9xUb54gQ5U3vsU1RBXzOmsfX0CjTW7U62zo,5240
+ sdg_hub/configs/skills/icl_examples/extraction.yaml,sha256=P751l6NvFRkINWz-bX5jgnd_if2bl3d_NlhGI7g81xw,4654
+ sdg_hub/configs/skills/icl_examples/humanities.yaml,sha256=tZyiJ4Q3gG4uuoDXw6g__lX3ySEUaRZW2GhW1ustwaM,11370
+ sdg_hub/configs/skills/icl_examples/math.yaml,sha256=hNq-QudlXrg9CWLpJdrZ4v3vifGTWhyp2gcfwPdR3_o,6776
+ sdg_hub/configs/skills/icl_examples/reasoning.yaml,sha256=eesIlH9SO07TVF20gy18MZrcDzLhSmynd_F_lvg0oQg,4335
+ sdg_hub/configs/skills/icl_examples/roleplay.yaml,sha256=LYEyA7wv7QWQscUNQr0K_lotNoWSfuoAEncx3PCRYIs,6997
+ sdg_hub/configs/skills/icl_examples/writing.yaml,sha256=El-57IjZ5IvdcmCHyHvX_M2RFFkEos572220be8ecrQ,11335
+ sdg_hub/flows/generation/knowledge/mmlu_bench.yaml,sha256=Rueuxr_n1zabE_nGqOgUfh5hqVmEONRka9NLiZANSew,346
+ sdg_hub/flows/generation/knowledge/simple_knowledge.yaml,sha256=o4uyfs1nDiECcNROdsvHKiM46NYvQufo9dF4XSGpY54,298
+ sdg_hub/flows/generation/knowledge/synth_knowledge.yaml,sha256=ZTZvevfwDQSKUwPcv1i5IzIchsRHSEN03eTefedQmU8,2172
+ sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml,sha256=KYMdStAsfWKZhoFzEwTfl8XhF0qRSc6WsgJbzLWCw-U,3634
+ sdg_hub/flows/generation/skills/improve_responses.yaml,sha256=wUV0awTmKHNZ62pHiw_yz-IdG0OYgT_dCwlMUlZS3TA,2683
+ sdg_hub/flows/generation/skills/simple_freeform_skill.yaml,sha256=iVEomFH1E52JA7KLmTIwkS1PnzxUJVPMgbK2O-m80As,309
+ sdg_hub/flows/generation/skills/simple_grounded_skill.yaml,sha256=LTLxqdgbLIKSJonuIRHhcRSpit1EawwNvytWzXWXe2E,309
+ sdg_hub/flows/generation/skills/synth_grounded_skills.yaml,sha256=91Dm--agpmbm02hIVnFhEndjppKsQEWXDbckR9GAzKM,2045
+ sdg_hub/flows/generation/skills/synth_skills.yaml,sha256=9lhQcxXXbN4V9ztPph4fyjUtctll2FYtKY-V4grQdy4,1492
+ sdg_hub/utils/__init__.py,sha256=Jfs1DAVSYDNn8dfs0Uq2MguSwu77NyhP-KufSJICiBQ,278
+ sdg_hub/utils/config_validation.py,sha256=g92GxN73Mjr0cXvc5amB_Fn4iV9-iKeWmPz9HwLPmNY,3426
+ sdg_hub/utils/datautils.py,sha256=0t_SZ_UXBKl8uL6rVp3SUh8YKRbzKlh2oO5gr2cKyEw,389
+ sdg_hub/utils/error_handling.py,sha256=UvPEmtdpbBL71Zx8DWpIqd8869kEY2dlCH11iDgMfec,1847
+ sdg_hub/utils/path_resolution.py,sha256=M7hnwoyRQTKgwGC3Ld1_KmKaO_8Lu0PCk6JtQrLp67Q,2006
+ sdg_hub/utils/validation_result.py,sha256=O3zF6r49LQ9StAf_oWmK2bg-JfTQw6rpbHtHr9lI4ks,264
+ sdg_hub-0.1.2.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ sdg_hub-0.1.2.dist-info/METADATA,sha256=4etDbH6APmsl8vh-b5H8-8r7pVCYBRWbqlRbf6gmYcY,7247
+ sdg_hub-0.1.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ sdg_hub-0.1.2.dist-info/top_level.txt,sha256=TqI7d-HE1n6zkXFkU0nF3A1Ct0P0pBaqI675uFokhx4,8
+ sdg_hub-0.1.2.dist-info/RECORD,,
@@ -1,5 +1,5 @@ sdg_hub-0.1.2.dist-info/WHEEL
  Wheel-Version: 1.0
- Generator: setuptools (80.3.1)
+ Generator: setuptools (80.9.0)
  Root-Is-Purelib: true
  Tag: py3-none-any
@@ -1,76 +0,0 @@ sdg_hub/blocks/filterblock.py (removed)
- # SPDX-License-Identifier: Apache-2.0
- # Standard
- import operator
-
- # Third Party
- from datasets import Dataset
-
- # Local
- from .block import Block
- from ..registry import BlockRegistry
- from ..logger_config import setup_logger
-
- logger = setup_logger(__name__)
-
-
- @BlockRegistry.register("FilterByValueBlock")
- class FilterByValueBlock(Block):
-     def __init__(
-         self, filter_column, filter_value, operation, convert_dtype=None, **batch_kwargs
-     ) -> None:
-         """
-         Initializes a new instance of the FilterByValueBlock class.
-
-         Parameters:
-         - filter_column (str): The name of the column in the dataset to apply the filter on.
-         - filter_value (any or list of any): The value(s) to filter by.
-         - operation (callable): A function that takes two arguments (column value and filter value) and returns a boolean indicating whether the row should be included in the filtered dataset.
-         - convert_dtype (callable, optional): A function to convert the data type of the filter column before applying the filter. Defaults to None.
-         - **batch_kwargs: Additional kwargs for batch processing.
-
-         Returns:
-         None
-         """
-         super().__init__(block_name=self.__class__.__name__)
-         self.value = filter_value if isinstance(filter_value, list) else [filter_value]
-         self.column_name = filter_column
-         self.operation = operation
-         self.convert_dtype = convert_dtype
-         self.num_procs = batch_kwargs.get("num_procs", 1)
-
-     def _convert_dtype(self, sample):
-         try:
-             sample[self.column_name] = self.convert_dtype(sample[self.column_name])
-         except ValueError as e:
-             logger.error(
-                 "Error converting dtype: %s, filling with None to be filtered later", e
-             )
-             sample[self.column_name] = None
-         return sample
-
-     def generate(self, samples) -> Dataset:
-         if self.convert_dtype:
-             samples = samples.map(
-                 self._convert_dtype,
-                 num_proc=self.num_procs,
-             )
-
-         if self.operation == operator.contains:
-             samples = samples.filter(
-                 lambda x: self.operation(self.value, x[self.column_name]),
-                 num_proc=self.num_procs,
-             )
-
-         samples = samples.filter(
-             lambda x: x[self.column_name] is not None,
-             num_proc=self.num_procs,
-         )
-
-         samples = samples.filter(
-             lambda x: any(
-                 self.operation(x[self.column_name], value) for value in self.value
-             ),
-             num_proc=self.num_procs,
-         )
-
-         return samples
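For context, a minimal sketch of how the removed block was driven, based only on its docstring and `generate` method above; it requires the older 0.1.0a4 wheel, and the column name, values, and threshold are illustrative:

```python
# Sketch of the removed FilterByValueBlock (present in sdg-hub 0.1.0a4 only).
# Column name, values, and threshold are illustrative.
import operator

from datasets import Dataset
from sdg_hub.blocks.filterblock import FilterByValueBlock

ds = Dataset.from_list([{"score": "2.0"}, {"score": "0.5"}, {"score": "bad"}])

# Keep rows whose score is >= 1.0. convert_dtype coerces strings to float and
# replaces unparseable values with None so they are dropped by the None filter.
block = FilterByValueBlock(
    filter_column="score",
    filter_value=1.0,
    operation=operator.ge,
    convert_dtype=float,
    num_procs=1,
)

filtered = block.generate(ds)
print(filtered["score"])  # [2.0]
```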
@@ -1,31 +0,0 @@ sdg_hub/blocks/iterblock.py (removed)
- # Third Party
- from datasets import Dataset
-
- # Local
- from .block import Block
- from ..registry import BlockRegistry
- from ..logger_config import setup_logger
-
- logger = setup_logger(__name__)
-
-
- @BlockRegistry.register("IterBlock")
- class IterBlock(Block):
-     def __init__(self, block_name, num_iters, block_type, block_kwargs, **kwargs):
-         super().__init__(block_name)
-         self.num_iters = num_iters
-         self.block = block_type(**block_kwargs)
-         self.gen_kwargs = kwargs.get("gen_kwargs", {})
-         self.gen_kwargs = kwargs.get("gen_kwargs", {})
-
-     def generate(self, samples, **gen_kwargs) -> Dataset:
-         generated_samples = []
-         num_iters = self.num_iters
-
-         for _ in range(num_iters):
-             batch_generated = self.block.generate(
-                 samples, **{**self.gen_kwargs, **gen_kwargs}
-             )
-             generated_samples.extend(batch_generated)
-
-         return Dataset.from_list(generated_samples)
@@ -1,194 +0,0 @@ sdg_hub/blocks/rmblocks.py (removed)
- """Module containing blocks for scoring responses using Reward Models."""
-
- # Standard
- from typing import Dict, List
- import json
- from urllib.parse import urljoin
-
- # Third Party
- from datasets import Dataset
- import requests
-
- # Local
- from .block import Block
- from ..logger_config import setup_logger
- from ..registry import BlockRegistry
-
- logger = setup_logger(__name__)
-
-
- @BlockRegistry.register("PRMBlock")
- class PRMBlock(Block):
-     """A block for scoring responses using a ProcessReward Model (PRM) via HTTP API.
-
-     This block sends prompts and responses to a PRM endpoint and returns reward scores
-     for each step in the response.
-     """
-
-     def __init__(
-         self,
-         block_name: str,
-         host: str,
-         port: int,
-         model_name: str,
-         prompt_col: str,
-         response_col: str,
-         output_col: str = "step_rewards",
-         system_prompt: str = None,
-         endpoint: str = "pooling",
-         step_separator: str = "\n\n",
-         step_fill_token: str = "<extra_0>",
-     ) -> None:
-         r"""Initialize the PRM (Process Reward Model) Block.
-
-         Parameters
-         ----------
-         block_name : str
-             Name of the block
-         host : str
-             Hostname of the PRM service (e.g., "0.0.0.0" or "localhost")
-         port : int
-             Port number the service is running on
-         model_name : str
-             Name of the PRM model to use
-         prompt_col : str
-             Column name containing the prompt
-         response_col : str
-             Column name containing the response
-         output_col : str, optional
-             Column name to store the reward scores, by default "step_rewards"
-         system_prompt : str, optional
-             Optional system prompt to use for scoring, by default None
-         endpoint : str, optional
-             API endpoint name, by default "pooling"
-         step_separator : str, optional
-             Separator between steps in the response, by default "\n\n"
-         step_fill_token : str, optional
-             Model specific fill token for steps in the response, by default "<extra_0>" used by Qwen2.5-Math-PRM
-         """
-         super().__init__(block_name)
-         # Construct base URL from host and port
-         self.base_url = f"http://{host.strip('/')}:{port}/"
-         self.endpoint = endpoint.strip("/")
-
-         # Construct the full API URL using urljoin
-         self.api_url = urljoin(self.base_url, self.endpoint)
-         logger.info(f"Initialized PRMBlock with API URL: {self.api_url}")
-
-         self.model_name = model_name
-         self.prompt_col = prompt_col
-         self.response_col = response_col
-         self.output_col = output_col
-         self.system_prompt = system_prompt
-         self.step_separator = step_separator
-         self.step_fill_token = step_fill_token
-
-     def _post_request(self, messages: List[Dict]) -> requests.Response:
-         """Make POST request to PRM API endpoint.
-
-         Parameters
-         ----------
-         messages : List[Dict]
-             List of message dictionaries to send to the API
-
-         Returns
-         -------
-         requests.Response
-             Response from the API
-         """
-         headers = {"User-Agent": "PRMBlock Client"}
-         prompt = {"model": self.model_name, "messages": messages}
-         response = requests.post(self.api_url, headers=headers, json=prompt)
-         return response
-
-     def _format_messages(self, sample: Dict) -> List[Dict]:
-         """Format input sample into messages for the PRM API.
-
-         Parameters
-         ----------
-         sample : Dict
-             Input sample containing prompt and response
-
-         Returns
-         -------
-         List[Dict]
-             Formatted messages for the API
-         """
-         messages = []
-         if self.system_prompt:
-             messages.append({"role": "system", "content": self.system_prompt})
-
-         messages.append({"role": "user", "content": sample[self.prompt_col]})
-         messages.append(
-             {
-                 "role": "assistant",
-                 "content": self.step_fill_token.join(sample[self.response_col].split(self.step_separator))
-                 + self.step_fill_token,
-             }
-         )
-         return messages
-
-     def _extract_rewards(self, response: requests.Response) -> List[float]:
-         """Extract reward scores from API response.
-
-         Parameters
-         ----------
-         response : requests.Response
-             Response from the API
-
-         Returns
-         -------
-         List[float]
-             List of reward scores
-         """
-         try:
-             response_data = response.json()
-             rewards = [x[1] for x in response_data["data"][0]["data"]]
-             return rewards
-         except (KeyError, IndexError, json.JSONDecodeError) as e:
-             logger.error(f"Error extracting rewards from response: {e}")
-             return []
-
-     def _generate(self, sample: dict) -> dict:
-         """Generate reward scores for the input samples.
-
-         Parameters
-         ----------
-         sample : dict
-             Input sample to score
-
-         Returns
-         -------
-         dict
-             Dictionary with added reward scores column
-         """
-         messages = self._format_messages(sample)
-         rm_response = self._post_request(messages)
-
-         if rm_response.status_code != 200:
-             logger.error(f"API request failed with status {rm_response.status_code}")
-             rewards = [0.0] * len(
-                 sample[self.response_col].split(self.step_separator)
-             ) # Default to 0 scores on failure
-         else:
-             rewards = self._extract_rewards(rm_response)
-
-         sample[self.output_col] = rewards
-         return sample
-
-     def generate(self, samples: Dataset, batch_size: int = 4) -> Dataset:
-         """Generate reward scores for the input samples.
-
-         Parameters
-         ----------
-         samples : Dataset
-             Input dataset containing samples to score
-         batch_size : int, optional
-             Number of processes to use for parallel processing, by default 4
-
-         Returns
-         -------
-         Dataset
-             Dataset with added reward scores
-         """
-         return samples.map(self._generate, num_proc=batch_size)
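For context, a sketch of the request payload the removed block posted to its `pooling` endpoint, reconstructed from `_format_messages` and `_post_request` above; the model name and texts are placeholders:

```python
# Reconstruction of the payload PRMBlock sent to http://<host>:<port>/pooling.
# Model name and texts are placeholders; only the shape follows the code above.
step_separator = "\n\n"
step_fill_token = "<extra_0>"

prompt = "What is 2 + 2? Answer step by step."
response = "First, 2 + 2 means adding two and two.\n\nTherefore the answer is 4."

messages = [
    {"role": "user", "content": prompt},
    {
        "role": "assistant",
        # Each step is terminated by the model-specific fill token, which the
        # reward model scores individually (one reward per fill token).
        "content": step_fill_token.join(response.split(step_separator)) + step_fill_token,
    },
]

payload = {"model": "Qwen/Qwen2.5-Math-PRM-7B", "messages": messages}
print(payload)
```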
@@ -1,10 +0,0 @@ sdg_hub/configs/annotations/simple.yaml (removed)
- system: ~
- introduction: |
-   Task Description: {{ simple_task_description }}
- principles: ~
- examples: ~
- generation: |
-   Here is the query for annotation:
-   {{ prompt }}
- start_tags: [""]
- end_tags: [""]