PyPI - sdg-hub - Versions diffs - 0.5.1__tar.gz → 0.6.0__tar.gz - Mend

sdg-hub 0.5.1__tar.gz → 0.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (205) hide show

{sdg_hub-0.5.1 → sdg_hub-0.6.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sdg_hub
-Version: 0.5.1
+Version: 0.6.0
 Summary: Synthetic Data Generation
 Author-email: Red Hat AI Innovation <abhandwa@redhat.com>
 License: Apache-2.0
@@ -28,23 +28,17 @@ Requires-Dist: httpx<1.0.0,>=0.25.0
 Requires-Dist: jinja2
 Requires-Dist: litellm<1.75.0,>=1.73.0
 Requires-Dist: rich
+Requires-Dist: pandas
 Requires-Dist: pydantic<3.0.0,>=2.0.0
 Requires-Dist: python-dotenv<2.0.0,>=1.0.0
 Requires-Dist: tenacity!=8.4.0,>=8.3.0
 Requires-Dist: tqdm<5.0.0,>=4.66.2
-Provides-Extra: vllm
-Requires-Dist: vllm>=0.9.1; extra == "vllm"
-Requires-Dist: torch>=2.0.0; extra == "vllm"
-Requires-Dist: transformers>=4.37.0; extra == "vllm"
-Requires-Dist: accelerate>=0.21.0; extra == "vllm"
-Requires-Dist: xformers>=0.0.22.post7; extra == "vllm"
 Provides-Extra: examples
 Requires-Dist: tabulate>=0.9.0; extra == "examples"
 Requires-Dist: transformers>=4.37.0; extra == "examples"
 Requires-Dist: langchain-text-splitters; extra == "examples"
 Requires-Dist: docling>=2.3.0; extra == "examples"
 Requires-Dist: scikit-learn; extra == "examples"
-Requires-Dist: pandas; extra == "examples"
 Requires-Dist: polars; extra == "examples"
 Requires-Dist: matplotlib; extra == "examples"
 Requires-Dist: spacy; extra == "examples"

{sdg_hub-0.5.1 → sdg_hub-0.6.0}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/document_pre_processing.ipynb RENAMED Viewed

@@ -34,7 +34,7 @@
    "source": [
     "# Step 1: Document Processing Pipeline\n",
     "# Define the directory containing raw documents to be processed\n",
-    "data_dir = 'document_collection/'\n",
+    "data_dir = \"document_collection/\"\n",
     "\n",
     "# Run the document parser to convert documents to markdown\n",
     "# - input-dir: Directory containing source documents\n",
@@ -68,7 +68,7 @@
     "import glob\n",
     "\n",
     "# In our example above docling step produces markdown of all the pdf files in the document_collection\n",
-    "with open(glob.glob(f'{data_dir}/*.md')[0], 'r') as f:\n",
+    "with open(glob.glob(f\"{data_dir}/*.md\")[0], \"r\") as f:\n",
     "    text = f.read()"
    ]
   },
@@ -81,26 +81,22 @@
    "source": [
     "# Step 4: Text Chunking and Dataset Creation\n",
     "\n",
-    "from markdown_it import MarkdownIt  \n",
+    "from markdown_it import MarkdownIt\n",
     "from typing import List\n",
-    "import datasets  \n",
+    "import datasets\n",
     "\n",
     "\n",
-    "def chunk_markdown(\n",
-    "    text: str,\n",
-    "    max_tokens: int = 200,\n",
-    "    overlap: int = 50\n",
-    ") -> List[str]:\n",
+    "def chunk_markdown(text: str, max_tokens: int = 200, overlap: int = 50) -> List[str]:\n",
     "    \"\"\"\n",
     "    Splits Markdown text into chunks at block-level elements\n",
     "    (headings, paragraphs, lists, tables, code, blockquotes).\n",
     "    Adds overlap (in words) between all consecutive chunks.\n",
-    "    \n",
+    "\n",
     "    Args:\n",
     "        text: The markdown text to be chunked\n",
     "        max_tokens: Maximum number of words per chunk\n",
     "        overlap: Number of overlapping words between consecutive chunks\n",
-    "    \n",
+    "\n",
     "    Returns:\n",
     "        List of text chunks with specified overlap\n",
     "    \"\"\"\n",
@@ -150,7 +146,7 @@
     "\n",
     "\n",
     "# Prepare seed data for the SDG-Hub knowledge pipeline.\n",
-    "# \n",
+    "#\n",
     "# The seed data requires the following fields:\n",
     "#   - document_outline: A concise title or summary that accurately represents the entire document.\n",
     "#     For documents covering multiple themes, consider providing multiple outlines (one per section).\n",
@@ -161,7 +157,7 @@
     "# The code below creates a HuggingFace Dataset from the document chunks,\n",
     "# then maps the required ICL fields to each entry, and finally saves the result as a JSONL file.\n",
     "\n",
-    "seed_data = datasets.Dataset.from_dict({'document': chunks})\n",
+    "seed_data = datasets.Dataset.from_dict({\"document\": chunks})\n",
     "\n",
     "icl = {\n",
     "    \"document_outline\": \"The document contains excerpts from FINTRAC regulations designed to combat money laundering and terrorist financing in Canada\",\n",
@@ -169,14 +165,14 @@
     "    \"icl_query_1\": \"In Canada, what are the methods for verifying someone's identity?\",\n",
     "    \"icl_query_2\": \"In Canada, why is it important to confirm a client's identity?\",\n",
     "    \"icl_query_3\": \"In Canada, can I use Reliance method to verify identity of a person?\",\n",
-    "    \"domain\": \"Finance\"\n",
+    "    \"domain\": \"Finance\",\n",
     "}\n",
     "\n",
     "# Map the ICL fields to each document chunk (if you want to use the same ICL for all, as shown here)\n",
     "seed_data = seed_data.map(lambda x: icl)\n",
     "\n",
     "# Save the seed data to a JSONL file for downstream use\n",
-    "seed_data.to_json('seed_data.jsonl', orient='records', lines=True)"
+    "seed_data.to_json(\"seed_data.jsonl\", orient=\"records\", lines=True)"
    ]
   },
   {

{sdg_hub-0.5.1 → sdg_hub-0.6.0}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/knowledge_generation.ipynb RENAMED Viewed

@@ -71,7 +71,7 @@
     "# Required to run the flow with async mode\n",
     "import nest_asyncio\n",
     "\n",
-    "nest_asyncio.apply()  "
+    "nest_asyncio.apply()"
    ]
   },
   {
@@ -80,82 +80,90 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "def create_seed_data_from_quality_benchmark(run_on_validation=None, seed_data_path=None):\n",
+    "def create_seed_data_from_quality_benchmark(\n",
+    "    run_on_validation=None, seed_data_path=None\n",
+    "):\n",
     "    \"\"\"\n",
     "    Create seed data from QuALITY Benchmark dataset.\n",
-    "    \n",
+    "\n",
     "    Args:\n",
     "        run_on_validation (bool, optional): If True, use validation subset. If None, reads from env.\n",
     "        seed_data_path (str, optional): Path to save seed data. If None, reads from env.\n",
-    "    \n",
+    "\n",
     "    Returns:\n",
     "        datasets.Dataset: The processed corpus\n",
     "    \"\"\"\n",
     "    # Use environment variables as defaults if not provided\n",
     "    if run_on_validation is None:\n",
-    "        run_on_validation = os.getenv('RUN_ON_VALIDATION_SET', 'true').lower() == 'true'\n",
+    "        run_on_validation = os.getenv(\"RUN_ON_VALIDATION_SET\", \"true\").lower() == \"true\"\n",
     "    if seed_data_path is None:\n",
-    "        seed_data_path = os.getenv('SEED_DATA_PATH', 'seed_data_val.jsonl')\n",
-    "    \n",
+    "        seed_data_path = os.getenv(\"SEED_DATA_PATH\", \"seed_data_val.jsonl\")\n",
+    "\n",
     "    # Load QuALITY Benchmark dataset\n",
     "    print(\"Loading QuALITY Benchmark dataset...\")\n",
-    "    quality_corpus = load_dataset(\"zitongyang/entigraph-quality-corpus\", split='train').remove_columns(['entity', 'entigraph']).rename_columns({'raw': 'document', 'uid': 'document_outline'})\n",
-    "    \n",
+    "    quality_corpus = (\n",
+    "        load_dataset(\"zitongyang/entigraph-quality-corpus\", split=\"train\")\n",
+    "        .remove_columns([\"entity\", \"entigraph\"])\n",
+    "        .rename_columns({\"raw\": \"document\", \"uid\": \"document_outline\"})\n",
+    "    )\n",
+    "\n",
     "    # Define seed examples for knowledge tuning\n",
     "    seed_examples = {\n",
     "        \"icl_document\": (\n",
-    "          \"The coastal town of Willow Creek, once renowned for its pristine beaches, now struggles with rampant pollution. Plastic debris and oil spills have devastated marine life, prompting a decline in tourism and fishing industries. Residents have organized weekly clean-up initiatives, but the scale of the problem overwhelms their efforts.\",\n",
-    "          \"Technologists at the local university have developed an AI-powered buoy system to combat this. The buoys, equipped with solar panels and filtration technology, can identify and absorb oil spills while collecting microplastics. Data from the buoys is shared publicly, raising awareness and pressuring corporations to adopt sustainable practices. Though costly, the project has sparked hope for revitalizing the ecosystem and economy.\"\n",
+    "            \"The coastal town of Willow Creek, once renowned for its pristine beaches, now struggles with rampant pollution. Plastic debris and oil spills have devastated marine life, prompting a decline in tourism and fishing industries. Residents have organized weekly clean-up initiatives, but the scale of the problem overwhelms their efforts.\",\n",
+    "            \"Technologists at the local university have developed an AI-powered buoy system to combat this. The buoys, equipped with solar panels and filtration technology, can identify and absorb oil spills while collecting microplastics. Data from the buoys is shared publicly, raising awareness and pressuring corporations to adopt sustainable practices. Though costly, the project has sparked hope for revitalizing the ecosystem and economy.\",\n",
     "        ),\n",
     "        \"icl_query_1\": \"How does the technological solution address the economic *and* environmental challenges highlighted in the document?\",\n",
     "        \"icl_query_2\": \"What implicit values or priorities do the community's actions (clean-up initiatives) and the technologists' project reflect, and how do these align or contrast?\",\n",
     "        \"icl_query_3\": \"Imagine the buoy project succeeds. What unintended consequences might arise from its impact, considering document's themes?\",\n",
-    "        \"domain\": \"articles/essays\"\n",
+    "        \"domain\": \"articles/essays\",\n",
     "    }\n",
-    "    \n",
+    "\n",
     "    # Add seed examples to the corpus\n",
     "    quality_corpus = quality_corpus.map(lambda x: seed_examples)\n",
-    "    \n",
+    "\n",
     "    if run_on_validation:\n",
     "        # Validation set - use predefined document IDs for consistent evaluation\n",
     "        DOC_UIDS = [\n",
-    "            ' Defining Decay Down by David Plotz',\n",
-    "            ' Fight Clubbed by David Plotz',\n",
-    "            ' I, Antichrist? by Jeffrey Goldberg',\n",
+    "            \" Defining Decay Down by David Plotz\",\n",
+    "            \" Fight Clubbed by David Plotz\",\n",
+    "            \" I, Antichrist? by Jeffrey Goldberg\",\n",
     "            \" It's Time To Keelhaul U-Haul! by Jeffrey Goldberg\",\n",
     "            \" My Father's Estate by Ben Stein\",\n",
     "            '\"Phone Me in Central Park\" by McConnell, James V.',\n",
-    "            'A Coffin for Jacob by Ludwig, Edward W.',\n",
-    "            'A Fall of Glass by Lee, Stanley R.',\n",
-    "            'A Filbert Is a Nut by Raphael, Rick',\n",
-    "            'A Gift from Earth by Banister, Manly',\n",
-    "            'A Gleeb for Earth by Schafhauser, Charles',\n",
-    "            'A Good Year for the Roses? by David Edelstein',\n",
-    "            'A Pail of Air by Leiber, Fritz',\n",
-    "            'A Planet Named Joe by Hunter, Evan',\n",
+    "            \"A Coffin for Jacob by Ludwig, Edward W.\",\n",
+    "            \"A Fall of Glass by Lee, Stanley R.\",\n",
+    "            \"A Filbert Is a Nut by Raphael, Rick\",\n",
+    "            \"A Gift from Earth by Banister, Manly\",\n",
+    "            \"A Gleeb for Earth by Schafhauser, Charles\",\n",
+    "            \"A Good Year for the Roses? by David Edelstein\",\n",
+    "            \"A Pail of Air by Leiber, Fritz\",\n",
+    "            \"A Planet Named Joe by Hunter, Evan\",\n",
     "            \"AI: what's the worst that could happen? by Harry Armstrong\",\n",
-    "            'Accidental Death by Baily, Peter',\n",
-    "            'All Day September by Kuykendall, Roger',\n",
-    "            'Ambition by Bade, William L.',\n",
-    "            'And Then the Town Took Off by Wilson, Richard',\n",
-    "            'Atom Mystery [Young Atom Detective] by Coombs, Charles Ira',\n",
-    "            'Beach Scene by King, Marshall',\n",
-    "            'Big Ancestor by Wallace, F. L. (Floyd L.)',\n",
-    "            'Birds of a Feather by Silverberg, Robert',\n",
-    "            'Bodyguard by Gold, H. L. (Horace Leonard)'\n",
+    "            \"Accidental Death by Baily, Peter\",\n",
+    "            \"All Day September by Kuykendall, Roger\",\n",
+    "            \"Ambition by Bade, William L.\",\n",
+    "            \"And Then the Town Took Off by Wilson, Richard\",\n",
+    "            \"Atom Mystery [Young Atom Detective] by Coombs, Charles Ira\",\n",
+    "            \"Beach Scene by King, Marshall\",\n",
+    "            \"Big Ancestor by Wallace, F. L. (Floyd L.)\",\n",
+    "            \"Birds of a Feather by Silverberg, Robert\",\n",
+    "            \"Bodyguard by Gold, H. L. (Horace Leonard)\",\n",
     "        ]\n",
-    "        \n",
+    "\n",
     "        # Filter corpus to validation set\n",
-    "        quality_corpus = quality_corpus.filter(lambda x: x['document_outline'] in DOC_UIDS)\n",
+    "        quality_corpus = quality_corpus.filter(\n",
+    "            lambda x: x[\"document_outline\"] in DOC_UIDS\n",
+    "        )\n",
     "        print(f\"Running on validation set with {len(quality_corpus)} documents\")\n",
     "    else:\n",
     "        # Use full dataset for training\n",
     "        print(f\"Running on full dataset with {len(quality_corpus)} documents\")\n",
-    "    \n",
+    "\n",
     "    # Save the seed data\n",
-    "    quality_corpus.to_json(seed_data_path, orient='records', lines=True)\n",
+    "    quality_corpus.to_json(seed_data_path, orient=\"records\", lines=True)\n",
     "    print(f\"Saved seed data to: {seed_data_path}\")\n",
-    "    \n",
+    "\n",
     "    return quality_corpus"
    ]
   },
@@ -166,19 +174,22 @@
    "outputs": [],
    "source": [
     "# Load seed data. If one is not provided, create it from the quality benchmark dataset.\n",
-    "seed_data_path = os.getenv('SEED_DATA_PATH', 'seed_data.jsonl')\n",
+    "seed_data_path = os.getenv(\"SEED_DATA_PATH\", \"seed_data.jsonl\")\n",
     "\n",
     "if not os.path.exists(seed_data_path):\n",
     "    print(f\"{seed_data_path} not found. Creating seed data...\")\n",
-    "    quality_corpus = create_seed_data_from_quality_benchmark(seed_data_path=seed_data_path)\n",
+    "    quality_corpus = create_seed_data_from_quality_benchmark(\n",
+    "        seed_data_path=seed_data_path\n",
+    "    )\n",
     "else:\n",
     "    print(f\"Loading existing seed data from {seed_data_path}\")\n",
-    "    quality_corpus = load_dataset('json', data_files=seed_data_path, split='train')\n",
+    "    quality_corpus = load_dataset(\"json\", data_files=seed_data_path, split=\"train\")\n",
     "\n",
     "# Subsample the seed data. Useful for debugging.\n",
-    "subsample = int(os.getenv('SEED_DATA_SUBSAMPLE', '0'))\n",
+    "subsample = int(os.getenv(\"SEED_DATA_SUBSAMPLE\", \"0\"))\n",
     "if subsample > 0:\n",
-    "    quality_corpus = quality_corpus.select(range(subsample))"
+    "    quality_corpus = quality_corpus.select(range(subsample))\n",
+    "quality_corpus = quality_corpus.to_pandas()"
    ]
   },
   {
@@ -200,14 +211,20 @@
    "source": [
     "# Setup model configuration in flow object\n",
     "def set_model_config(flow_object):\n",
-    "    model_provider = os.getenv('MODEL_PROVIDER', 'hosted_vllm')\n",
+    "    model_provider = os.getenv(\"MODEL_PROVIDER\", \"hosted_vllm\")\n",
     "    print(f\"Using model provider: {model_provider}\")\n",
     "    # Set model provider\n",
-    "    if model_provider == 'hosted_vllm':    \n",
-    "        vllm_model = os.getenv('VLLM_MODEL', 'hosted_vllm/meta-llama/Llama-3.3-70B-Instruct')\n",
-    "        vllm_api_base = os.getenv('VLLM_API_BASE', 'http://localhost:8000/v1')\n",
-    "        vllm_api_key = os.getenv('VLLM_API_KEY', 'EMPTY')\n",
-    "        enable_reasoning = os.getenv('ENABLE_REASONING', 'false').lower() in ('1', 'true', 'yes')\n",
+    "    if model_provider == \"hosted_vllm\":\n",
+    "        vllm_model = os.getenv(\n",
+    "            \"VLLM_MODEL\", \"hosted_vllm/meta-llama/Llama-3.3-70B-Instruct\"\n",
+    "        )\n",
+    "        vllm_api_base = os.getenv(\"VLLM_API_BASE\", \"http://localhost:8000/v1\")\n",
+    "        vllm_api_key = os.getenv(\"VLLM_API_KEY\", \"EMPTY\")\n",
+    "        enable_reasoning = os.getenv(\"ENABLE_REASONING\", \"false\").lower() in (\n",
+    "            \"1\",\n",
+    "            \"true\",\n",
+    "            \"yes\",\n",
+    "        )\n",
     "        print(f\"Using reasoning: {enable_reasoning}\")\n",
     "        flow_object.set_model_config(\n",
     "            model=vllm_model,\n",
@@ -215,30 +232,30 @@
     "            api_key=vllm_api_key,\n",
     "            enable_reasoning=enable_reasoning,\n",
     "        )\n",
-    "    elif model_provider == 'openai':\n",
-    "        openai_api_key = os.getenv('OPENAI_API_KEY')\n",
-    "        openai_model = os.getenv('OPENAI_MODEL', 'openai/gpt-4')\n",
+    "    elif model_provider == \"openai\":\n",
+    "        openai_api_key = os.getenv(\"OPENAI_API_KEY\")\n",
+    "        openai_model = os.getenv(\"OPENAI_MODEL\", \"openai/gpt-4\")\n",
     "        flow_object.set_model_config(\n",
     "            model=openai_model,\n",
     "            api_key=openai_api_key,\n",
     "        )\n",
-    "    elif model_provider == 'ollama':\n",
-    "        ollama_model = os.getenv('OLLAMA_MODEL', 'ollama/gemma2')\n",
-    "        ollama_api_base = os.getenv('OLLAMA_API_BASE', 'http://localhost:11434')\n",
+    "    elif model_provider == \"ollama\":\n",
+    "        ollama_model = os.getenv(\"OLLAMA_MODEL\", \"ollama/gemma2\")\n",
+    "        ollama_api_base = os.getenv(\"OLLAMA_API_BASE\", \"http://localhost:11434\")\n",
     "        flow_object.set_model_config(\n",
     "            model=ollama_model,\n",
     "            api_base=ollama_api_base,\n",
     "        )\n",
-    "    elif model_provider == 'maas':\n",
-    "        maas_model = os.getenv('MAAS_MODEL')\n",
-    "        maas_api_base = os.getenv('MAAS_API_BASE')\n",
-    "        maas_api_key = os.getenv('MAAS_API_KEY')\n",
+    "    elif model_provider == \"maas\":\n",
+    "        maas_model = os.getenv(\"MAAS_MODEL\")\n",
+    "        maas_api_base = os.getenv(\"MAAS_API_BASE\")\n",
+    "        maas_api_key = os.getenv(\"MAAS_API_KEY\")\n",
     "        flow_object.set_model_config(\n",
     "            model=maas_model,\n",
     "            api_base=maas_api_base,\n",
     "            api_key=maas_api_key,\n",
     "        )\n",
-    "    return flow_object "
+    "    return flow_object"
    ]
   },
   {
@@ -273,10 +290,14 @@
    "outputs": [],
    "source": [
     "# Get runtime parameters\n",
-    "enable_reasoning = os.getenv('ENABLE_REASONING', 'false').lower() in ('1', 'true', 'yes')\n",
-    "number_of_summaries = int(os.getenv('NUMBER_OF_SUMMARIES', '50'))\n",
-    "max_concurrency = int(os.getenv('MAX_CONCURRENCY', '50'))\n",
-    "save_data_path = os.getenv('OUTPUT_DATA_FOLDER', '')"
+    "enable_reasoning = os.getenv(\"ENABLE_REASONING\", \"false\").lower() in (\n",
+    "    \"1\",\n",
+    "    \"true\",\n",
+    "    \"yes\",\n",
+    ")\n",
+    "number_of_summaries = int(os.getenv(\"NUMBER_OF_SUMMARIES\", \"50\"))\n",
+    "max_concurrency = int(os.getenv(\"MAX_CONCURRENCY\", \"50\"))\n",
+    "save_data_path = os.getenv(\"OUTPUT_DATA_FOLDER\", \"\")"
    ]
   },
   {
@@ -292,28 +313,32 @@
     "\n",
     "# Set model configuration\n",
     "flow = set_model_config(flow)\n",
-    "number_of_summaries = int(os.getenv('NUMBER_OF_SUMMARIES', '50'))\n",
+    "number_of_summaries = int(os.getenv(\"NUMBER_OF_SUMMARIES\", \"50\"))\n",
     "# Generate data for extractive summary\n",
     "if enable_reasoning:\n",
     "    # Increase max tokens to accommodate reasoning content\n",
     "    runtime_params = {\n",
-    "        'question_generation': {'max_tokens': 1024}, \n",
-    "        'gen_extractive_summary': {'n': number_of_summaries, 'max_tokens': 6000}\n",
-    "        }\n",
-    "else:\n",
-    "    runtime_params = {\n",
-    "    'gen_extractive_summary': {\n",
-    "        'n': number_of_summaries\n",
+    "        \"question_generation\": {\"max_tokens\": 1024},\n",
+    "        \"gen_extractive_summary\": {\"n\": number_of_summaries, \"max_tokens\": 6000},\n",
     "    }\n",
-    "}\n",
+    "else:\n",
+    "    runtime_params = {\"gen_extractive_summary\": {\"n\": number_of_summaries}}\n",
     "\n",
-    "extractive_summary_generated_data = flow.generate(quality_corpus, runtime_params=runtime_params, max_concurrency=max_concurrency)\n",
+    "extractive_summary_generated_data = flow.generate(\n",
+    "    quality_corpus, runtime_params=runtime_params, max_concurrency=max_concurrency\n",
+    ")\n",
     "\n",
-    "extractive_summary_generated_data.to_json(os.path.join(save_data_path, 'extractive_summary', 'gen.jsonl'), orient='records', lines=True)\n",
+    "os.makedirs(os.path.join(save_data_path, \"extractive_summary\"), exist_ok=True)\n",
+    "\n",
+    "extractive_summary_generated_data.to_json(\n",
+    "    os.path.join(save_data_path, \"extractive_summary\", \"gen.jsonl\"),\n",
+    "    orient=\"records\",\n",
+    "    lines=True,\n",
+    ")\n",
     "\n",
     "print(f\"✓ Extractive summary: {len(extractive_summary_generated_data)} records\")\n",
     "\n",
-    "print(f\"✓ Columns: {list(extractive_summary_generated_data.column_names)}\")"
+    "print(f\"✓ Columns: {list(extractive_summary_generated_data.columns.tolist())}\")"
    ]
   },
   {
@@ -333,21 +358,27 @@
     "if enable_reasoning:\n",
     "    # Increase max tokens to accommodate reasoning content\n",
     "    runtime_params = {\n",
-    "        'question_generation': {'max_tokens': 1024}, \n",
-    "        'gen_detailed_summary': {'n': number_of_summaries, 'max_tokens': 6000}\n",
-    "        }\n",
+    "        \"question_generation\": {\"max_tokens\": 1024},\n",
+    "        \"gen_detailed_summary\": {\"n\": number_of_summaries, \"max_tokens\": 6000},\n",
+    "    }\n",
     "else:\n",
-    "    runtime_params = ({'gen_detailed_summary': {\n",
-    "        'n': number_of_summaries\n",
-    "    }})\n",
+    "    runtime_params = {\"gen_detailed_summary\": {\"n\": number_of_summaries}}\n",
     "# Generate data for detailed summary\n",
-    "detailed_summary_generated_data = flow.generate(quality_corpus, runtime_params=runtime_params, max_concurrency=50)\n",
+    "detailed_summary_generated_data = flow.generate(\n",
+    "    quality_corpus, runtime_params=runtime_params, max_concurrency=50\n",
+    ")\n",
+    "\n",
+    "os.makedirs(os.path.join(save_data_path, \"detailed_summary\"), exist_ok=True)\n",
     "\n",
-    "detailed_summary_generated_data.to_json(os.path.join(save_data_path, 'detailed_summary', 'gen.jsonl'), orient='records', lines=True)\n",
+    "detailed_summary_generated_data.to_json(\n",
+    "    os.path.join(save_data_path, \"detailed_summary\", \"gen.jsonl\"),\n",
+    "    orient=\"records\",\n",
+    "    lines=True,\n",
+    ")\n",
     "\n",
     "print(f\"✓ Detailed summary: {len(detailed_summary_generated_data)} records\")\n",
     "\n",
-    "print(f\"✓ Columns: {list(detailed_summary_generated_data.column_names)}\")"
+    "print(f\"✓ Columns: {list(detailed_summary_generated_data.columns.tolist())}\")"
    ]
   },
   {
@@ -356,7 +387,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Generate similar data for key facts \n",
+    "# Generate similar data for key facts\n",
     "flow_name = \"Key Facts Knowledge Tuning Dataset Generation Flow\"\n",
     "flow_path = FlowRegistry.get_flow_path(flow_name)\n",
     "flow = Flow.from_yaml(flow_path)\n",
@@ -367,17 +398,25 @@
     "if enable_reasoning:\n",
     "    # Increase max tokens for Question Generation to accommodate reasoning content\n",
     "    runtime_params = {\n",
-    "        'generate_key_fact_qa': {'max_tokens': 6000}, \n",
-    "        }\n",
+    "        \"generate_key_fact_qa\": {\"max_tokens\": 6000},\n",
+    "    }\n",
     "\n",
     "# Generate data for key facts summary\n",
-    "key_facts_generated_data = flow.generate(quality_corpus, runtime_params=runtime_params, max_concurrency=max_concurrency)\n",
+    "key_facts_generated_data = flow.generate(\n",
+    "    quality_corpus, runtime_params=runtime_params, max_concurrency=max_concurrency\n",
+    ")\n",
+    "\n",
+    "os.makedirs(os.path.join(save_data_path, \"key_facts_to_qa\"), exist_ok=True)\n",
     "\n",
-    "key_facts_generated_data.to_json(os.path.join(save_data_path, 'key_facts_to_qa', 'gen.jsonl'), orient='records', lines=True)\n",
+    "key_facts_generated_data.to_json(\n",
+    "    os.path.join(save_data_path, \"key_facts_to_qa\", \"gen.jsonl\"),\n",
+    "    orient=\"records\",\n",
+    "    lines=True,\n",
+    ")\n",
     "\n",
     "print(f\"✓ Key facts: {len(key_facts_generated_data)} records\")\n",
     "\n",
-    "print(f\"✓ Columns: {list(key_facts_generated_data.column_names)}\")"
+    "print(f\"✓ Columns: {list(key_facts_generated_data.columns.tolist())}\")"
    ]
   },
   {
@@ -396,16 +435,24 @@
     "if enable_reasoning:\n",
     "    # Increase max tokens to accommodate reasoning content\n",
     "    runtime_params = {\n",
-    "        'question_generation': {'max_tokens': 2048}, \n",
-    "        }\n",
+    "        \"question_generation\": {\"max_tokens\": 2048},\n",
+    "    }\n",
+    "\n",
+    "document_based_generated_data = flow.generate(\n",
+    "    quality_corpus, runtime_params=runtime_params, max_concurrency=max_concurrency\n",
+    ")\n",
+    "\n",
+    "os.makedirs(os.path.join(save_data_path, \"document_based_qa\"), exist_ok=True)\n",
     "\n",
-    "document_based_generated_data = flow.generate(quality_corpus, runtime_params=runtime_params, max_concurrency=max_concurrency)\n",
-    "    \n",
-    "document_based_generated_data.to_json(os.path.join(save_data_path, 'document_based_qa', 'gen.jsonl'), orient='records', lines=True)\n",
+    "document_based_generated_data.to_json(\n",
+    "    os.path.join(save_data_path, \"document_based_qa\", \"gen.jsonl\"),\n",
+    "    orient=\"records\",\n",
+    "    lines=True,\n",
+    ")\n",
     "\n",
     "print(f\"✓ Document based: {len(document_based_generated_data)} records\")\n",
     "\n",
-    "print(f\"✓ Columns: {list(document_based_generated_data.column_names)}\")"
+    "print(f\"✓ Columns: {list(document_based_generated_data.columns.tolist())}\")"
    ]
   },
   {

sdg-hub 0.5.1__tar.gz → 0.6.0__tar.gz

sdg-hub 0.5.1__tar.gz → 0.6.0__tar.gz