sdg-hub 0.5.0__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sdg_hub/_version.py CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
28
28
  commit_id: COMMIT_ID
29
29
  __commit_id__: COMMIT_ID
30
30
 
31
- __version__ = version = '0.5.0'
32
- __version_tuple__ = version_tuple = (0, 5, 0)
31
+ __version__ = version = '0.5.1'
32
+ __version_tuple__ = version_tuple = (0, 5, 1)
33
33
 
34
34
  __commit_id__ = commit_id = None
@@ -7,13 +7,16 @@ This module provides the LLMParserBlock for extracting specific fields
7
7
 
8
8
  # Standard
9
9
  from typing import Any
10
+ from weakref import finalize
11
+ import json
10
12
 
11
13
  # Third Party
12
- from datasets import Dataset
14
+ from datasets import Dataset, load_dataset
13
15
  from pydantic import Field, model_validator
14
16
 
15
17
  # Local
16
18
  from ...utils.logger_config import setup_logger
19
+ from ...utils.temp_manager import cleanup_path, create_temp_dir, create_temp_file
17
20
  from ..base import BaseBlock
18
21
  from ..registry import BlockRegistry
19
22
 
@@ -26,6 +29,8 @@ logger = setup_logger(__name__)
26
29
  "Extracts specified fields from LLM response objects",
27
30
  )
28
31
  class LLMParserBlock(BaseBlock):
32
+ _flow_requires_jsonl_tmp: bool = True
33
+
29
34
  """Block for extracting fields from LLM response objects.
30
35
 
31
36
  This block extracts specified fields from chat completion response objects.
@@ -314,7 +319,54 @@ class LLMParserBlock(BaseBlock):
314
319
  logger.warning("No samples to process, returning empty dataset")
315
320
  return Dataset.from_list([])
316
321
 
317
- new_data = []
318
- for sample in samples:
319
- new_data.extend(self._generate(sample))
320
- return Dataset.from_list(new_data)
322
+ tmp_jsonl_path = kwargs.get("_flow_tmp_jsonl_path")
323
+ cleanup_locally = False
324
+
325
+ if tmp_jsonl_path is None:
326
+ tmp_jsonl_path = str(
327
+ create_temp_file(
328
+ prefix=f"{self.block_name}_llm_parser", suffix=".jsonl"
329
+ )
330
+ )
331
+ cleanup_locally = True
332
+
333
+ rows_written = 0
334
+ batch = []
335
+ with open(tmp_jsonl_path, "w") as f:
336
+ for sample in samples:
337
+ out = self._generate(sample)
338
+ for row in out:
339
+ batch.append(json.dumps(row) + "\n")
340
+ rows_written += 1
341
+ if len(batch) >= 5:
342
+ f.writelines(batch)
343
+ batch.clear()
344
+ if batch:
345
+ f.writelines(batch)
346
+
347
+ if rows_written == 0:
348
+ if cleanup_locally:
349
+ cleanup_path(tmp_jsonl_path)
350
+ return Dataset.from_list([])
351
+
352
+ hf_cache_dir = None
353
+ try:
354
+ hf_cache_dir = create_temp_dir(
355
+ prefix=f"{self.block_name}_llm_parser_hf_cache"
356
+ )
357
+ ret = load_dataset(
358
+ "json",
359
+ data_files=tmp_jsonl_path,
360
+ split="train",
361
+ keep_in_memory=False,
362
+ cache_dir=str(hf_cache_dir),
363
+ )
364
+ finalize(ret, cleanup_path, hf_cache_dir)
365
+ return ret
366
+ except Exception:
367
+ if hf_cache_dir is not None:
368
+ cleanup_path(hf_cache_dir)
369
+ raise
370
+ finally:
371
+ if cleanup_locally:
372
+ cleanup_path(tmp_jsonl_path)
@@ -7,14 +7,17 @@ start/end tags, custom regex patterns, and cleanup operations.
7
7
 
8
8
  # Standard
9
9
  from typing import Any, Optional
10
+ from weakref import finalize
11
+ import json
10
12
  import re
11
13
 
12
14
  # Third Party
13
- from datasets import Dataset
15
+ from datasets import Dataset, load_dataset
14
16
  from pydantic import Field, field_validator, model_validator
15
17
 
16
18
  # Local
17
19
  from ...utils.logger_config import setup_logger
20
+ from ...utils.temp_manager import cleanup_path, create_temp_dir, create_temp_file
18
21
  from ..base import BaseBlock
19
22
  from ..registry import BlockRegistry
20
23
 
@@ -27,6 +30,8 @@ logger = setup_logger(__name__)
27
30
  "Parses and post-processes text content using tags or regex patterns",
28
31
  )
29
32
  class TextParserBlock(BaseBlock):
33
+ _flow_requires_jsonl_tmp: bool = True
34
+
30
35
  """Block for parsing and post-processing text content.
31
36
 
32
37
  This block handles text parsing using start/end tags, custom regex patterns,
@@ -317,7 +322,54 @@ class TextParserBlock(BaseBlock):
317
322
  logger.warning("No samples to parse, returning empty dataset")
318
323
  return Dataset.from_list([])
319
324
 
320
- new_data = []
321
- for sample in samples:
322
- new_data.extend(self._generate(sample))
323
- return Dataset.from_list(new_data)
325
+ tmp_jsonl_path = kwargs.get("_flow_tmp_jsonl_path")
326
+ cleanup_locally = False
327
+
328
+ if tmp_jsonl_path is None:
329
+ tmp_jsonl_path = str(
330
+ create_temp_file(
331
+ prefix=f"{self.block_name}_text_parser", suffix=".jsonl"
332
+ )
333
+ )
334
+ cleanup_locally = True
335
+
336
+ rows_written = 0
337
+ batch = []
338
+ with open(tmp_jsonl_path, "w") as f:
339
+ for sample in samples:
340
+ out = self._generate(sample)
341
+ for row in out:
342
+ batch.append(json.dumps(row) + "\n")
343
+ rows_written += 1
344
+ if len(batch) >= 5:
345
+ f.writelines(batch)
346
+ batch.clear()
347
+ if batch:
348
+ f.writelines(batch)
349
+
350
+ if rows_written == 0:
351
+ if cleanup_locally:
352
+ cleanup_path(tmp_jsonl_path)
353
+ return Dataset.from_list([])
354
+
355
+ hf_cache_dir = None
356
+ try:
357
+ hf_cache_dir = create_temp_dir(
358
+ prefix=f"{self.block_name}_text_parser_hf_cache"
359
+ )
360
+ ret = load_dataset(
361
+ "json",
362
+ data_files=tmp_jsonl_path,
363
+ split="train",
364
+ keep_in_memory=False,
365
+ cache_dir=str(hf_cache_dir),
366
+ )
367
+ finalize(ret, cleanup_path, hf_cache_dir)
368
+ return ret
369
+ except Exception:
370
+ if hf_cache_dir is not None:
371
+ cleanup_path(hf_cache_dir)
372
+ raise
373
+ finally:
374
+ if cleanup_locally:
375
+ cleanup_path(tmp_jsonl_path)
sdg_hub/core/flow/base.py CHANGED
@@ -5,6 +5,8 @@
5
5
  from datetime import datetime
6
6
  from pathlib import Path
7
7
  from typing import Any, Optional, Union
8
+ from weakref import finalize
9
+ import gc
8
10
  import time
9
11
  import uuid
10
12
 
@@ -37,6 +39,11 @@ from ..utils.flow_metrics import (
37
39
  )
38
40
  from ..utils.logger_config import setup_logger
39
41
  from ..utils.path_resolution import resolve_path
42
+ from ..utils.temp_manager import (
43
+ cleanup_path,
44
+ create_temp_dir,
45
+ create_temp_file,
46
+ )
40
47
  from ..utils.time_estimator import estimate_execution_time
41
48
  from ..utils.yaml_utils import save_flow_yaml
42
49
  from .checkpointer import FlowCheckpointer
@@ -580,6 +587,7 @@ class Flow(BaseModel):
580
587
  # Use provided logger or fall back to global logger
581
588
  exec_logger = flow_logger if flow_logger is not None else logger
582
589
  current_dataset = dataset
590
+ current_dataset_temp_path: Optional[Path] = None
583
591
 
584
592
  # Execute blocks in sequence
585
593
  for i, block in enumerate(self.blocks):
@@ -591,6 +599,14 @@ class Flow(BaseModel):
591
599
  # Prepare block execution parameters
592
600
  block_kwargs = self._prepare_block_kwargs(block, runtime_params)
593
601
 
602
+ block_temp_jsonl_path: Optional[Path] = None
603
+ dataset_temp_dir: Optional[Path] = None
604
+ if getattr(block, "_flow_requires_jsonl_tmp", False):
605
+ block_temp_jsonl_path = create_temp_file(
606
+ prefix=f"{block.block_name}_parser", suffix=".jsonl"
607
+ )
608
+ block_kwargs["_flow_tmp_jsonl_path"] = str(block_temp_jsonl_path)
609
+
594
610
  # Add max_concurrency to block kwargs if provided
595
611
  if max_concurrency is not None:
596
612
  block_kwargs["_flow_max_concurrency"] = max_concurrency
@@ -610,6 +626,28 @@ class Flow(BaseModel):
610
626
  f"Block '{block.block_name}' produced empty dataset"
611
627
  )
612
628
 
629
+ # Here, we write the dataset object to disk and reload it.
630
+ # This is done because the HF Datasets library creates a ton of intermediate
631
+ # objects, and holds on to them even after the objects have fulfilled
632
+ # their purpose. To flush these objects, HF recommends implementing
633
+ # this `save_to_disk` and `load_from_disk` hack.
634
+ # https://github.com/huggingface/datasets/blob/main/src/datasets/arrow_dataset.py#L1029
635
+ previous_temp_path = current_dataset_temp_path
636
+ dataset_temp_dir = create_temp_dir(prefix=f"flow_{block.block_name}")
637
+ current_dataset.save_to_disk(str(dataset_temp_dir))
638
+ del current_dataset
639
+ gc.collect()
640
+ current_dataset = datasets.load_from_disk(
641
+ str(dataset_temp_dir), keep_in_memory=False
642
+ )
643
+ finalize(current_dataset, cleanup_path, dataset_temp_dir)
644
+ current_dataset_temp_path = dataset_temp_dir
645
+ if previous_temp_path and previous_temp_path != dataset_temp_dir:
646
+ cleanup_path(previous_temp_path)
647
+
648
+ if block_temp_jsonl_path is not None:
649
+ cleanup_path(block_temp_jsonl_path)
650
+
613
651
  # Capture metrics after successful execution
614
652
  execution_time = time.perf_counter() - start_time
615
653
  output_rows = len(current_dataset)
@@ -638,6 +676,10 @@ class Flow(BaseModel):
638
676
  )
639
677
 
640
678
  except Exception as exc:
679
+ if block_temp_jsonl_path is not None:
680
+ cleanup_path(block_temp_jsonl_path)
681
+ if dataset_temp_dir is not None:
682
+ cleanup_path(dataset_temp_dir)
641
683
  # Capture metrics for failed execution
642
684
  execution_time = time.perf_counter() - start_time
643
685
  self._block_metrics.append(
@@ -661,6 +703,13 @@ class Flow(BaseModel):
661
703
  f"Block '{block.block_name}' execution failed: {exc}"
662
704
  ) from exc
663
705
 
706
+ if current_dataset_temp_path is not None:
707
+ final_temp_path = current_dataset_temp_path
708
+ current_dataset = datasets.load_from_disk(
709
+ str(final_temp_path), keep_in_memory=True
710
+ )
711
+ cleanup_path(final_temp_path)
712
+
664
713
  return current_dataset
665
714
 
666
715
  def _prepare_block_kwargs(
@@ -0,0 +1,57 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ """Utilities for managing temporary files and directories used by the flow."""
3
+
4
+ from __future__ import annotations
5
+
6
+ from pathlib import Path
7
+ from typing import Optional, Union
8
+
9
+ # Standard
10
+ import os
11
+ import shutil
12
+ import tempfile
13
+
14
+ TEMP_ROOT_DIR_NAME = ".tmp_sdg_buffer"
15
+
16
+
17
+ def _get_temp_root() -> Path:
18
+ root = Path.cwd() / TEMP_ROOT_DIR_NAME
19
+ root.mkdir(parents=True, exist_ok=True)
20
+ return root
21
+
22
+
23
+ def _format_prefix(prefix: str) -> str:
24
+ return f"{prefix}_" if prefix and not prefix.endswith("_") else prefix
25
+
26
+
27
+ def create_temp_dir(prefix: str = "tmp", suffix: str = "") -> Path:
28
+ """Create a unique temporary directory."""
29
+ root = _get_temp_root()
30
+ name = tempfile.mkdtemp(prefix=_format_prefix(prefix), suffix=suffix, dir=root)
31
+ return Path(name)
32
+
33
+
34
+ def create_temp_file(prefix: str = "tmp", suffix: str = "") -> Path:
35
+ """Create a unique temporary file."""
36
+ root = _get_temp_root()
37
+ fd, name = tempfile.mkstemp(prefix=_format_prefix(prefix), suffix=suffix, dir=root)
38
+ os.close(fd)
39
+ return Path(name)
40
+
41
+
42
+ def cleanup_path(path: Optional[Union[str, os.PathLike]]) -> None:
43
+ """Remove a temporary file or directory if it exists."""
44
+ if not path:
45
+ return
46
+
47
+ target = Path(path)
48
+ if not target.exists():
49
+ return
50
+
51
+ if target.is_dir():
52
+ shutil.rmtree(target, ignore_errors=True)
53
+ else:
54
+ try:
55
+ target.unlink()
56
+ except FileNotFoundError:
57
+ pass
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sdg_hub
3
- Version: 0.5.0
3
+ Version: 0.5.1
4
4
  Summary: Synthetic Data Generation
5
5
  Author-email: Red Hat AI Innovation <abhandwa@redhat.com>
6
6
  License: Apache-2.0
@@ -1,5 +1,5 @@
1
1
  sdg_hub/__init__.py,sha256=TlkZT40-70urdcWLqv3kupaJj8s-SVgd2QyvlSFwb4A,510
2
- sdg_hub/_version.py,sha256=fvHpBU3KZKRinkriKdtAt3crenOyysELF-M9y3ozg3U,704
2
+ sdg_hub/_version.py,sha256=cYMOhuaBHd0MIZmumuccsEQ-AxM8LIJy9dsBAWgOpqE,704
3
3
  sdg_hub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
4
  sdg_hub/core/__init__.py,sha256=e3BoejbqjYhasf9t__L4qE52lkD9EBjx4o--2kqKdro,460
5
5
  sdg_hub/core/blocks/__init__.py,sha256=8Rn1SglH8V3jGmTD_cG-h7qk9ktAab2eaBdyk7RN_hY,865
@@ -11,9 +11,9 @@ sdg_hub/core/blocks/llm/__init__.py,sha256=AyS0dd3pkPPXH5a9aj4mT5HsKjX2vjXfkmQc6
11
11
  sdg_hub/core/blocks/llm/error_handler.py,sha256=7T-019ZFB9qgZoX1ybIiXyaLjPzrF96qcKmUu6vmO6g,12178
12
12
  sdg_hub/core/blocks/llm/llm_chat_block.py,sha256=MHhI2x9i6LrfDXgvAy2_6YxgyoD7j6BpCgNGsM69xDg,22194
13
13
  sdg_hub/core/blocks/llm/llm_chat_with_parsing_retry_block.py,sha256=DW4b09IqXmcshvXawFheDyaLp3rz7vpO5VBrKdUQYW8,31703
14
- sdg_hub/core/blocks/llm/llm_parser_block.py,sha256=aoHqsDDhaIgCDfPpv7acc0DVN-zUgzFflRVB4win0aM,12012
14
+ sdg_hub/core/blocks/llm/llm_parser_block.py,sha256=pCTaxAML5uFERZx0KTunvgVPHm1H2154VTvF79bGrB8,13699
15
15
  sdg_hub/core/blocks/llm/prompt_builder_block.py,sha256=fkJd718X1oYlMY1cjo_8WCO16Gl8Tm0bUPWR78E_uws,13935
16
- sdg_hub/core/blocks/llm/text_parser_block.py,sha256=975HK6NfXiU9Any4UDMpBNidRpyhHmc76BXUN69SVyc,12566
16
+ sdg_hub/core/blocks/llm/text_parser_block.py,sha256=NGwBdFmfbY3rbm_T7bqTJmaREo2MpSpQwgLrnHHZHqU,14255
17
17
  sdg_hub/core/blocks/transform/__init__.py,sha256=lF9InjOzA6p_mjiwV-a2Kwstq9kqRiQ-dEwbsmR9yQs,825
18
18
  sdg_hub/core/blocks/transform/duplicate_columns.py,sha256=SaP7rIF4ZFEFFa50aU2xGNIuddXaEZrKxdWfHjzFpVI,2833
19
19
  sdg_hub/core/blocks/transform/index_based_mapper.py,sha256=XC_a7Skbd3mu7f4ra8fGWPxMwqUMSjJkQ7Ag7vflwJA,8235
@@ -23,7 +23,7 @@ sdg_hub/core/blocks/transform/rename_columns.py,sha256=W2hcDSJY6L73ZpElUhOML2sGL
23
23
  sdg_hub/core/blocks/transform/text_concat.py,sha256=_-B__Hob1WwgwkILPIZvTnsDzuwtoX1hKviyzHlnnes,3149
24
24
  sdg_hub/core/blocks/transform/uniform_col_val_setter.py,sha256=XnjiT29z3PzIPy8M-mmE2w-Miab6Ed5ahy32SaxTCTE,3263
25
25
  sdg_hub/core/flow/__init__.py,sha256=0_m_htuZfPxk8xQ9IKfp0Pz-JRE4O7lYMUFrKyLNoLA,409
26
- sdg_hub/core/flow/base.py,sha256=64YJJujNRaSIbT1YKn9nAxij_hdJ9xRVH_uiUY1IUcI,55788
26
+ sdg_hub/core/flow/base.py,sha256=Z2P8QBLl7HWVISdI585hxnIiTu9FhnjlTXn-ngr36Jk,58189
27
27
  sdg_hub/core/flow/checkpointer.py,sha256=stm5ZtjjEiLk9ZkAAnoQQn5Y8Yl_d7qCsQLZTrCXR48,11867
28
28
  sdg_hub/core/flow/metadata.py,sha256=cFrpJjWOaK87aCuRFyC3Pdf83oYU93mrmZEMdUnhsN8,10540
29
29
  sdg_hub/core/flow/registry.py,sha256=N6KfX-L7QRkooznIFxDuhRZYuDA5g3N5zC-KRm2jVhk,12109
@@ -36,6 +36,7 @@ sdg_hub/core/utils/flow_identifier.py,sha256=aAHfK_G9AwEtMglLRMdMpi_AI1dciub5UqB
36
36
  sdg_hub/core/utils/flow_metrics.py,sha256=3G-xbfr-rFA578wV4KUbQePTMVGZHr9-rXvyYL4Kt2Q,12604
37
37
  sdg_hub/core/utils/logger_config.py,sha256=6_cnsIHtSAdq1iTTZ7Q7nAJ1dmldlxSZ0AB49yLiQ20,2034
38
38
  sdg_hub/core/utils/path_resolution.py,sha256=yWof4kGNpQ5dKcrVHg0h9KfOKLZ6ROjdfsLAZsQT5rM,2000
39
+ sdg_hub/core/utils/temp_manager.py,sha256=moSPWMxoDEw5FmeuwKTC8f3tYcarQDN0ozv0796CeGg,1484
39
40
  sdg_hub/core/utils/time_estimator.py,sha256=rM3_R-Ka5DEtvOtlJoA_5pXSyQ6tT6t4h6qh3_5BCZo,12639
40
41
  sdg_hub/core/utils/yaml_utils.py,sha256=tShCd-FFkp0xlKnLe7dXsMOR4AvT9d2qRUmu4ZnPSEY,1458
41
42
  sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -77,8 +78,8 @@ sdg_hub/flows/text_analysis/structured_insights/extract_entities.yaml,sha256=Q_S
77
78
  sdg_hub/flows/text_analysis/structured_insights/extract_keywords.yaml,sha256=_nPPMdHnxag_lYbhYUjGJGo-CvRwWvwdGX7cQhdZ1S0,847
78
79
  sdg_hub/flows/text_analysis/structured_insights/flow.yaml,sha256=BBV18SdvuVTAESjwkJ7V1jbb-cSTBvNl3SCycd0oEQ4,4934
79
80
  sdg_hub/flows/text_analysis/structured_insights/summarize.yaml,sha256=WXwQak1pF8e1OwnOoI1EHu8QB6iUNW89rfkTdi1Oq54,687
80
- sdg_hub-0.5.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
81
- sdg_hub-0.5.0.dist-info/METADATA,sha256=z4tCCtWlTBzu5DF1K44RtWjIs7ZNL6__2Aae7I0EfxQ,9775
82
- sdg_hub-0.5.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
83
- sdg_hub-0.5.0.dist-info/top_level.txt,sha256=TqI7d-HE1n6zkXFkU0nF3A1Ct0P0pBaqI675uFokhx4,8
84
- sdg_hub-0.5.0.dist-info/RECORD,,
81
+ sdg_hub-0.5.1.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
82
+ sdg_hub-0.5.1.dist-info/METADATA,sha256=f5pTZHWrt0JQPHysvca3M7U7HU0Yus5jnGK8KrT2U-g,9775
83
+ sdg_hub-0.5.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
84
+ sdg_hub-0.5.1.dist-info/top_level.txt,sha256=TqI7d-HE1n6zkXFkU0nF3A1Ct0P0pBaqI675uFokhx4,8
85
+ sdg_hub-0.5.1.dist-info/RECORD,,