sdg-hub 0.5.0__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sdg_hub/_version.py CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
28
28
  commit_id: COMMIT_ID
29
29
  __commit_id__: COMMIT_ID
30
30
 
31
- __version__ = version = '0.5.0'
32
- __version_tuple__ = version_tuple = (0, 5, 0)
31
+ __version__ = version = '0.5.1'
32
+ __version_tuple__ = version_tuple = (0, 5, 1)
33
33
 
34
34
  __commit_id__ = commit_id = None
@@ -7,13 +7,16 @@ This module provides the LLMParserBlock for extracting specific fields
7
7
 
8
8
  # Standard
9
9
  from typing import Any
10
+ from weakref import finalize
11
+ import json
10
12
 
11
13
  # Third Party
12
- from datasets import Dataset
14
+ from datasets import Dataset, load_dataset
13
15
  from pydantic import Field, model_validator
14
16
 
15
17
  # Local
16
18
  from ...utils.logger_config import setup_logger
19
+ from ...utils.temp_manager import cleanup_path, create_temp_dir, create_temp_file
17
20
  from ..base import BaseBlock
18
21
  from ..registry import BlockRegistry
19
22
 
@@ -26,6 +29,8 @@ logger = setup_logger(__name__)
26
29
  "Extracts specified fields from LLM response objects",
27
30
  )
28
31
  class LLMParserBlock(BaseBlock):
32
+ _flow_requires_jsonl_tmp: bool = True
33
+
29
34
  """Block for extracting fields from LLM response objects.
30
35
 
31
36
  This block extracts specified fields from chat completion response objects.
@@ -314,7 +319,54 @@ class LLMParserBlock(BaseBlock):
314
319
  logger.warning("No samples to process, returning empty dataset")
315
320
  return Dataset.from_list([])
316
321
 
317
- new_data = []
318
- for sample in samples:
319
- new_data.extend(self._generate(sample))
320
- return Dataset.from_list(new_data)
322
+ tmp_jsonl_path = kwargs.get("_flow_tmp_jsonl_path")
323
+ cleanup_locally = False
324
+
325
+ if tmp_jsonl_path is None:
326
+ tmp_jsonl_path = str(
327
+ create_temp_file(
328
+ prefix=f"{self.block_name}_llm_parser", suffix=".jsonl"
329
+ )
330
+ )
331
+ cleanup_locally = True
332
+
333
+ rows_written = 0
334
+ batch = []
335
+ with open(tmp_jsonl_path, "w") as f:
336
+ for sample in samples:
337
+ out = self._generate(sample)
338
+ for row in out:
339
+ batch.append(json.dumps(row) + "\n")
340
+ rows_written += 1
341
+ if len(batch) >= 5:
342
+ f.writelines(batch)
343
+ batch.clear()
344
+ if batch:
345
+ f.writelines(batch)
346
+
347
+ if rows_written == 0:
348
+ if cleanup_locally:
349
+ cleanup_path(tmp_jsonl_path)
350
+ return Dataset.from_list([])
351
+
352
+ hf_cache_dir = None
353
+ try:
354
+ hf_cache_dir = create_temp_dir(
355
+ prefix=f"{self.block_name}_llm_parser_hf_cache"
356
+ )
357
+ ret = load_dataset(
358
+ "json",
359
+ data_files=tmp_jsonl_path,
360
+ split="train",
361
+ keep_in_memory=False,
362
+ cache_dir=str(hf_cache_dir),
363
+ )
364
+ finalize(ret, cleanup_path, hf_cache_dir)
365
+ return ret
366
+ except Exception:
367
+ if hf_cache_dir is not None:
368
+ cleanup_path(hf_cache_dir)
369
+ raise
370
+ finally:
371
+ if cleanup_locally:
372
+ cleanup_path(tmp_jsonl_path)
@@ -7,14 +7,17 @@ start/end tags, custom regex patterns, and cleanup operations.
7
7
 
8
8
  # Standard
9
9
  from typing import Any, Optional
10
+ from weakref import finalize
11
+ import json
10
12
  import re
11
13
 
12
14
  # Third Party
13
- from datasets import Dataset
15
+ from datasets import Dataset, load_dataset
14
16
  from pydantic import Field, field_validator, model_validator
15
17
 
16
18
  # Local
17
19
  from ...utils.logger_config import setup_logger
20
+ from ...utils.temp_manager import cleanup_path, create_temp_dir, create_temp_file
18
21
  from ..base import BaseBlock
19
22
  from ..registry import BlockRegistry
20
23
 
@@ -27,6 +30,8 @@ logger = setup_logger(__name__)
27
30
  "Parses and post-processes text content using tags or regex patterns",
28
31
  )
29
32
  class TextParserBlock(BaseBlock):
33
+ _flow_requires_jsonl_tmp: bool = True
34
+
30
35
  """Block for parsing and post-processing text content.
31
36
 
32
37
  This block handles text parsing using start/end tags, custom regex patterns,
@@ -317,7 +322,54 @@ class TextParserBlock(BaseBlock):
317
322
  logger.warning("No samples to parse, returning empty dataset")
318
323
  return Dataset.from_list([])
319
324
 
320
- new_data = []
321
- for sample in samples:
322
- new_data.extend(self._generate(sample))
323
- return Dataset.from_list(new_data)
325
+ tmp_jsonl_path = kwargs.get("_flow_tmp_jsonl_path")
326
+ cleanup_locally = False
327
+
328
+ if tmp_jsonl_path is None:
329
+ tmp_jsonl_path = str(
330
+ create_temp_file(
331
+ prefix=f"{self.block_name}_text_parser", suffix=".jsonl"
332
+ )
333
+ )
334
+ cleanup_locally = True
335
+
336
+ rows_written = 0
337
+ batch = []
338
+ with open(tmp_jsonl_path, "w") as f:
339
+ for sample in samples:
340
+ out = self._generate(sample)
341
+ for row in out:
342
+ batch.append(json.dumps(row) + "\n")
343
+ rows_written += 1
344
+ if len(batch) >= 5:
345
+ f.writelines(batch)
346
+ batch.clear()
347
+ if batch:
348
+ f.writelines(batch)
349
+
350
+ if rows_written == 0:
351
+ if cleanup_locally:
352
+ cleanup_path(tmp_jsonl_path)
353
+ return Dataset.from_list([])
354
+
355
+ hf_cache_dir = None
356
+ try:
357
+ hf_cache_dir = create_temp_dir(
358
+ prefix=f"{self.block_name}_text_parser_hf_cache"
359
+ )
360
+ ret = load_dataset(
361
+ "json",
362
+ data_files=tmp_jsonl_path,
363
+ split="train",
364
+ keep_in_memory=False,
365
+ cache_dir=str(hf_cache_dir),
366
+ )
367
+ finalize(ret, cleanup_path, hf_cache_dir)
368
+ return ret
369
+ except Exception:
370
+ if hf_cache_dir is not None:
371
+ cleanup_path(hf_cache_dir)
372
+ raise
373
+ finally:
374
+ if cleanup_locally:
375
+ cleanup_path(tmp_jsonl_path)
sdg_hub/core/flow/base.py CHANGED
@@ -5,6 +5,8 @@
5
5
  from datetime import datetime
6
6
  from pathlib import Path
7
7
  from typing import Any, Optional, Union
8
+ from weakref import finalize
9
+ import gc
8
10
  import time
9
11
  import uuid
10
12
 
@@ -37,6 +39,11 @@ from ..utils.flow_metrics import (
37
39
  )
38
40
  from ..utils.logger_config import setup_logger
39
41
  from ..utils.path_resolution import resolve_path
42
+ from ..utils.temp_manager import (
43
+ cleanup_path,
44
+ create_temp_dir,
45
+ create_temp_file,
46
+ )
40
47
  from ..utils.time_estimator import estimate_execution_time
41
48
  from ..utils.yaml_utils import save_flow_yaml
42
49
  from .checkpointer import FlowCheckpointer
@@ -580,6 +587,7 @@ class Flow(BaseModel):
580
587
  # Use provided logger or fall back to global logger
581
588
  exec_logger = flow_logger if flow_logger is not None else logger
582
589
  current_dataset = dataset
590
+ current_dataset_temp_path: Optional[Path] = None
583
591
 
584
592
  # Execute blocks in sequence
585
593
  for i, block in enumerate(self.blocks):
@@ -591,6 +599,14 @@ class Flow(BaseModel):
591
599
  # Prepare block execution parameters
592
600
  block_kwargs = self._prepare_block_kwargs(block, runtime_params)
593
601
 
602
+ block_temp_jsonl_path: Optional[Path] = None
603
+ dataset_temp_dir: Optional[Path] = None
604
+ if getattr(block, "_flow_requires_jsonl_tmp", False):
605
+ block_temp_jsonl_path = create_temp_file(
606
+ prefix=f"{block.block_name}_parser", suffix=".jsonl"
607
+ )
608
+ block_kwargs["_flow_tmp_jsonl_path"] = str(block_temp_jsonl_path)
609
+
594
610
  # Add max_concurrency to block kwargs if provided
595
611
  if max_concurrency is not None:
596
612
  block_kwargs["_flow_max_concurrency"] = max_concurrency
@@ -610,6 +626,28 @@ class Flow(BaseModel):
610
626
  f"Block '{block.block_name}' produced empty dataset"
611
627
  )
612
628
 
629
+ # Here, we write the dataset object to disk and reload it.
630
+ # This is done because the HF Datasets library creates a ton of intermediate
631
+ # objects, and holds on to them even after the objects have fulfilled
632
+ # their purpose. To flush these objects, HF recommends implementing
633
+ # this `save_to_disk` and `load_from_disk` hack.
634
+ # https://github.com/huggingface/datasets/blob/main/src/datasets/arrow_dataset.py#L1029
635
+ previous_temp_path = current_dataset_temp_path
636
+ dataset_temp_dir = create_temp_dir(prefix=f"flow_{block.block_name}")
637
+ current_dataset.save_to_disk(str(dataset_temp_dir))
638
+ del current_dataset
639
+ gc.collect()
640
+ current_dataset = datasets.load_from_disk(
641
+ str(dataset_temp_dir), keep_in_memory=False
642
+ )
643
+ finalize(current_dataset, cleanup_path, dataset_temp_dir)
644
+ current_dataset_temp_path = dataset_temp_dir
645
+ if previous_temp_path and previous_temp_path != dataset_temp_dir:
646
+ cleanup_path(previous_temp_path)
647
+
648
+ if block_temp_jsonl_path is not None:
649
+ cleanup_path(block_temp_jsonl_path)
650
+
613
651
  # Capture metrics after successful execution
614
652
  execution_time = time.perf_counter() - start_time
615
653
  output_rows = len(current_dataset)
@@ -638,6 +676,10 @@ class Flow(BaseModel):
638
676
  )
639
677
 
640
678
  except Exception as exc:
679
+ if block_temp_jsonl_path is not None:
680
+ cleanup_path(block_temp_jsonl_path)
681
+ if dataset_temp_dir is not None:
682
+ cleanup_path(dataset_temp_dir)
641
683
  # Capture metrics for failed execution
642
684
  execution_time = time.perf_counter() - start_time
643
685
  self._block_metrics.append(
@@ -661,6 +703,13 @@ class Flow(BaseModel):
661
703
  f"Block '{block.block_name}' execution failed: {exc}"
662
704
  ) from exc
663
705
 
706
+ if current_dataset_temp_path is not None:
707
+ final_temp_path = current_dataset_temp_path
708
+ current_dataset = datasets.load_from_disk(
709
+ str(final_temp_path), keep_in_memory=True
710
+ )
711
+ cleanup_path(final_temp_path)
712
+
664
713
  return current_dataset
665
714
 
666
715
  def _prepare_block_kwargs(
@@ -0,0 +1,57 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ """Utilities for managing temporary files and directories used by the flow."""
3
+
4
+ from __future__ import annotations
5
+
6
+ from pathlib import Path
7
+ from typing import Optional, Union
8
+
9
+ # Standard
10
+ import os
11
+ import shutil
12
+ import tempfile
13
+
14
+ TEMP_ROOT_DIR_NAME = ".tmp_sdg_buffer"
15
+
16
+
17
+ def _get_temp_root() -> Path:
18
+ root = Path.cwd() / TEMP_ROOT_DIR_NAME
19
+ root.mkdir(parents=True, exist_ok=True)
20
+ return root
21
+
22
+
23
+ def _format_prefix(prefix: str) -> str:
24
+ return f"{prefix}_" if prefix and not prefix.endswith("_") else prefix
25
+
26
+
27
+ def create_temp_dir(prefix: str = "tmp", suffix: str = "") -> Path:
28
+ """Create a unique temporary directory."""
29
+ root = _get_temp_root()
30
+ name = tempfile.mkdtemp(prefix=_format_prefix(prefix), suffix=suffix, dir=root)
31
+ return Path(name)
32
+
33
+
34
+ def create_temp_file(prefix: str = "tmp", suffix: str = "") -> Path:
35
+ """Create a unique temporary file."""
36
+ root = _get_temp_root()
37
+ fd, name = tempfile.mkstemp(prefix=_format_prefix(prefix), suffix=suffix, dir=root)
38
+ os.close(fd)
39
+ return Path(name)
40
+
41
+
42
+ def cleanup_path(path: Optional[Union[str, os.PathLike]]) -> None:
43
+ """Remove a temporary file or directory if it exists."""
44
+ if not path:
45
+ return
46
+
47
+ target = Path(path)
48
+ if not target.exists():
49
+ return
50
+
51
+ if target.is_dir():
52
+ shutil.rmtree(target, ignore_errors=True)
53
+ else:
54
+ try:
55
+ target.unlink()
56
+ except FileNotFoundError:
57
+ pass
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sdg_hub
3
- Version: 0.5.0
3
+ Version: 0.5.1
4
4
  Summary: Synthetic Data Generation
5
5
  Author-email: Red Hat AI Innovation <abhandwa@redhat.com>
6
6
  License: Apache-2.0
@@ -1,5 +1,5 @@
1
1
  sdg_hub/__init__.py,sha256=TlkZT40-70urdcWLqv3kupaJj8s-SVgd2QyvlSFwb4A,510
2
- sdg_hub/_version.py,sha256=fvHpBU3KZKRinkriKdtAt3crenOyysELF-M9y3ozg3U,704
2
+ sdg_hub/_version.py,sha256=cYMOhuaBHd0MIZmumuccsEQ-AxM8LIJy9dsBAWgOpqE,704
3
3
  sdg_hub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
4
  sdg_hub/core/__init__.py,sha256=e3BoejbqjYhasf9t__L4qE52lkD9EBjx4o--2kqKdro,460
5
5
  sdg_hub/core/blocks/__init__.py,sha256=8Rn1SglH8V3jGmTD_cG-h7qk9ktAab2eaBdyk7RN_hY,865
@@ -11,9 +11,9 @@ sdg_hub/core/blocks/llm/__init__.py,sha256=AyS0dd3pkPPXH5a9aj4mT5HsKjX2vjXfkmQc6
11
11
  sdg_hub/core/blocks/llm/error_handler.py,sha256=7T-019ZFB9qgZoX1ybIiXyaLjPzrF96qcKmUu6vmO6g,12178
12
12
  sdg_hub/core/blocks/llm/llm_chat_block.py,sha256=MHhI2x9i6LrfDXgvAy2_6YxgyoD7j6BpCgNGsM69xDg,22194
13
13
  sdg_hub/core/blocks/llm/llm_chat_with_parsing_retry_block.py,sha256=DW4b09IqXmcshvXawFheDyaLp3rz7vpO5VBrKdUQYW8,31703
14
- sdg_hub/core/blocks/llm/llm_parser_block.py,sha256=aoHqsDDhaIgCDfPpv7acc0DVN-zUgzFflRVB4win0aM,12012
14
+ sdg_hub/core/blocks/llm/llm_parser_block.py,sha256=pCTaxAML5uFERZx0KTunvgVPHm1H2154VTvF79bGrB8,13699
15
15
  sdg_hub/core/blocks/llm/prompt_builder_block.py,sha256=fkJd718X1oYlMY1cjo_8WCO16Gl8Tm0bUPWR78E_uws,13935
16
- sdg_hub/core/blocks/llm/text_parser_block.py,sha256=975HK6NfXiU9Any4UDMpBNidRpyhHmc76BXUN69SVyc,12566
16
+ sdg_hub/core/blocks/llm/text_parser_block.py,sha256=NGwBdFmfbY3rbm_T7bqTJmaREo2MpSpQwgLrnHHZHqU,14255
17
17
  sdg_hub/core/blocks/transform/__init__.py,sha256=lF9InjOzA6p_mjiwV-a2Kwstq9kqRiQ-dEwbsmR9yQs,825
18
18
  sdg_hub/core/blocks/transform/duplicate_columns.py,sha256=SaP7rIF4ZFEFFa50aU2xGNIuddXaEZrKxdWfHjzFpVI,2833
19
19
  sdg_hub/core/blocks/transform/index_based_mapper.py,sha256=XC_a7Skbd3mu7f4ra8fGWPxMwqUMSjJkQ7Ag7vflwJA,8235
@@ -23,7 +23,7 @@ sdg_hub/core/blocks/transform/rename_columns.py,sha256=W2hcDSJY6L73ZpElUhOML2sGL
23
23
  sdg_hub/core/blocks/transform/text_concat.py,sha256=_-B__Hob1WwgwkILPIZvTnsDzuwtoX1hKviyzHlnnes,3149
24
24
  sdg_hub/core/blocks/transform/uniform_col_val_setter.py,sha256=XnjiT29z3PzIPy8M-mmE2w-Miab6Ed5ahy32SaxTCTE,3263
25
25
  sdg_hub/core/flow/__init__.py,sha256=0_m_htuZfPxk8xQ9IKfp0Pz-JRE4O7lYMUFrKyLNoLA,409
26
- sdg_hub/core/flow/base.py,sha256=64YJJujNRaSIbT1YKn9nAxij_hdJ9xRVH_uiUY1IUcI,55788
26
+ sdg_hub/core/flow/base.py,sha256=Z2P8QBLl7HWVISdI585hxnIiTu9FhnjlTXn-ngr36Jk,58189
27
27
  sdg_hub/core/flow/checkpointer.py,sha256=stm5ZtjjEiLk9ZkAAnoQQn5Y8Yl_d7qCsQLZTrCXR48,11867
28
28
  sdg_hub/core/flow/metadata.py,sha256=cFrpJjWOaK87aCuRFyC3Pdf83oYU93mrmZEMdUnhsN8,10540
29
29
  sdg_hub/core/flow/registry.py,sha256=N6KfX-L7QRkooznIFxDuhRZYuDA5g3N5zC-KRm2jVhk,12109
@@ -36,6 +36,7 @@ sdg_hub/core/utils/flow_identifier.py,sha256=aAHfK_G9AwEtMglLRMdMpi_AI1dciub5UqB
36
36
  sdg_hub/core/utils/flow_metrics.py,sha256=3G-xbfr-rFA578wV4KUbQePTMVGZHr9-rXvyYL4Kt2Q,12604
37
37
  sdg_hub/core/utils/logger_config.py,sha256=6_cnsIHtSAdq1iTTZ7Q7nAJ1dmldlxSZ0AB49yLiQ20,2034
38
38
  sdg_hub/core/utils/path_resolution.py,sha256=yWof4kGNpQ5dKcrVHg0h9KfOKLZ6ROjdfsLAZsQT5rM,2000
39
+ sdg_hub/core/utils/temp_manager.py,sha256=moSPWMxoDEw5FmeuwKTC8f3tYcarQDN0ozv0796CeGg,1484
39
40
  sdg_hub/core/utils/time_estimator.py,sha256=rM3_R-Ka5DEtvOtlJoA_5pXSyQ6tT6t4h6qh3_5BCZo,12639
40
41
  sdg_hub/core/utils/yaml_utils.py,sha256=tShCd-FFkp0xlKnLe7dXsMOR4AvT9d2qRUmu4ZnPSEY,1458
41
42
  sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -77,8 +78,8 @@ sdg_hub/flows/text_analysis/structured_insights/extract_entities.yaml,sha256=Q_S
77
78
  sdg_hub/flows/text_analysis/structured_insights/extract_keywords.yaml,sha256=_nPPMdHnxag_lYbhYUjGJGo-CvRwWvwdGX7cQhdZ1S0,847
78
79
  sdg_hub/flows/text_analysis/structured_insights/flow.yaml,sha256=BBV18SdvuVTAESjwkJ7V1jbb-cSTBvNl3SCycd0oEQ4,4934
79
80
  sdg_hub/flows/text_analysis/structured_insights/summarize.yaml,sha256=WXwQak1pF8e1OwnOoI1EHu8QB6iUNW89rfkTdi1Oq54,687
80
- sdg_hub-0.5.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
81
- sdg_hub-0.5.0.dist-info/METADATA,sha256=z4tCCtWlTBzu5DF1K44RtWjIs7ZNL6__2Aae7I0EfxQ,9775
82
- sdg_hub-0.5.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
83
- sdg_hub-0.5.0.dist-info/top_level.txt,sha256=TqI7d-HE1n6zkXFkU0nF3A1Ct0P0pBaqI675uFokhx4,8
84
- sdg_hub-0.5.0.dist-info/RECORD,,
81
+ sdg_hub-0.5.1.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
82
+ sdg_hub-0.5.1.dist-info/METADATA,sha256=f5pTZHWrt0JQPHysvca3M7U7HU0Yus5jnGK8KrT2U-g,9775
83
+ sdg_hub-0.5.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
84
+ sdg_hub-0.5.1.dist-info/top_level.txt,sha256=TqI7d-HE1n6zkXFkU0nF3A1Ct0P0pBaqI675uFokhx4,8
85
+ sdg_hub-0.5.1.dist-info/RECORD,,