sdg-hub 0.4.2__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. sdg_hub/_version.py +2 -2
  2. sdg_hub/core/blocks/__init__.py +0 -22
  3. sdg_hub/core/blocks/llm/llm_parser_block.py +57 -5
  4. sdg_hub/core/blocks/llm/text_parser_block.py +57 -5
  5. sdg_hub/core/blocks/transform/rename_columns.py +19 -0
  6. sdg_hub/core/flow/base.py +57 -80
  7. sdg_hub/core/utils/temp_manager.py +57 -0
  8. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/flow.yaml +5 -1
  9. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/flow.yaml +5 -1
  10. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/flow.yaml +5 -1
  11. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +6 -1
  12. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/flow.yaml +5 -1
  13. {sdg_hub-0.4.2.dist-info → sdg_hub-0.5.1.dist-info}/METADATA +2 -2
  14. {sdg_hub-0.4.2.dist-info → sdg_hub-0.5.1.dist-info}/RECORD +17 -27
  15. sdg_hub/core/blocks/deprecated_blocks/__init__.py +0 -29
  16. sdg_hub/core/blocks/deprecated_blocks/combine_columns.py +0 -93
  17. sdg_hub/core/blocks/deprecated_blocks/duplicate_columns.py +0 -88
  18. sdg_hub/core/blocks/deprecated_blocks/filter_by_value.py +0 -103
  19. sdg_hub/core/blocks/deprecated_blocks/flatten_columns.py +0 -94
  20. sdg_hub/core/blocks/deprecated_blocks/llmblock.py +0 -479
  21. sdg_hub/core/blocks/deprecated_blocks/rename_columns.py +0 -88
  22. sdg_hub/core/blocks/deprecated_blocks/sample_populator.py +0 -58
  23. sdg_hub/core/blocks/deprecated_blocks/selector.py +0 -97
  24. sdg_hub/core/blocks/deprecated_blocks/set_to_majority_value.py +0 -88
  25. sdg_hub/core/flow/migration.py +0 -198
  26. {sdg_hub-0.4.2.dist-info → sdg_hub-0.5.1.dist-info}/WHEEL +0 -0
  27. {sdg_hub-0.4.2.dist-info → sdg_hub-0.5.1.dist-info}/licenses/LICENSE +0 -0
  28. {sdg_hub-0.4.2.dist-info → sdg_hub-0.5.1.dist-info}/top_level.txt +0 -0
sdg_hub/_version.py CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
  commit_id: COMMIT_ID
  __commit_id__: COMMIT_ID
 
- __version__ = version = '0.4.2'
- __version_tuple__ = version_tuple = (0, 4, 2)
+ __version__ = version = '0.5.1'
+ __version_tuple__ = version_tuple = (0, 5, 1)
 
  __commit_id__ = commit_id = None
sdg_hub/core/blocks/__init__.py CHANGED
@@ -5,17 +5,6 @@ This package provides various block implementations for data generation, process
 
  # Local
  from .base import BaseBlock
- from .deprecated_blocks import (
-     CombineColumnsBlock,
-     DuplicateColumns,
-     FilterByValueBlock,
-     FlattenColumnsBlock,
-     LLMBlock,
-     RenameColumns,
-     SamplePopulatorBlock,
-     SelectorBlock,
-     SetToMajorityValue,
- )
  from .filtering import ColumnValueFilterBlock
  from .llm import LLMChatBlock, LLMParserBlock, PromptBuilderBlock, TextParserBlock
  from .registry import BlockRegistry
@@ -28,8 +17,6 @@ from .transform import (
      UniformColumnValueSetter,
  )
 
- # All blocks moved to deprecated_blocks or transform modules
-
  __all__ = [
      "BaseBlock",
      "BlockRegistry",
@@ -40,15 +27,6 @@ __all__ = [
      "RenameColumnsBlock",
      "TextConcatBlock",
      "UniformColumnValueSetter",
-     "CombineColumnsBlock",  # Deprecated
-     "DuplicateColumns",  # Deprecated
-     "FilterByValueBlock",  # Deprecated
-     "FlattenColumnsBlock",  # Deprecated
-     "RenameColumns",  # Deprecated
-     "SamplePopulatorBlock",  # Deprecated
-     "SelectorBlock",  # Deprecated
-     "SetToMajorityValue",  # Deprecated
-     "LLMBlock",  # Deprecated
      "LLMChatBlock",
      "LLMParserBlock",
      "TextParserBlock",
sdg_hub/core/blocks/llm/llm_parser_block.py CHANGED
@@ -7,13 +7,16 @@ This module provides the LLMParserBlock for extracting specific fields
 
  # Standard
  from typing import Any
+ from weakref import finalize
+ import json
 
  # Third Party
- from datasets import Dataset
+ from datasets import Dataset, load_dataset
  from pydantic import Field, model_validator
 
  # Local
  from ...utils.logger_config import setup_logger
+ from ...utils.temp_manager import cleanup_path, create_temp_dir, create_temp_file
  from ..base import BaseBlock
  from ..registry import BlockRegistry
 
@@ -26,6 +29,8 @@ logger = setup_logger(__name__)
      "Extracts specified fields from LLM response objects",
  )
  class LLMParserBlock(BaseBlock):
+     _flow_requires_jsonl_tmp: bool = True
+
      """Block for extracting fields from LLM response objects.
 
      This block extracts specified fields from chat completion response objects.
@@ -314,7 +319,54 @@ class LLMParserBlock(BaseBlock):
              logger.warning("No samples to process, returning empty dataset")
              return Dataset.from_list([])
 
-         new_data = []
-         for sample in samples:
-             new_data.extend(self._generate(sample))
-         return Dataset.from_list(new_data)
+         tmp_jsonl_path = kwargs.get("_flow_tmp_jsonl_path")
+         cleanup_locally = False
+
+         if tmp_jsonl_path is None:
+             tmp_jsonl_path = str(
+                 create_temp_file(
+                     prefix=f"{self.block_name}_llm_parser", suffix=".jsonl"
+                 )
+             )
+             cleanup_locally = True
+
+         rows_written = 0
+         batch = []
+         with open(tmp_jsonl_path, "w") as f:
+             for sample in samples:
+                 out = self._generate(sample)
+                 for row in out:
+                     batch.append(json.dumps(row) + "\n")
+                     rows_written += 1
+                     if len(batch) >= 5:
+                         f.writelines(batch)
+                         batch.clear()
+             if batch:
+                 f.writelines(batch)
+
+         if rows_written == 0:
+             if cleanup_locally:
+                 cleanup_path(tmp_jsonl_path)
+             return Dataset.from_list([])
+
+         hf_cache_dir = None
+         try:
+             hf_cache_dir = create_temp_dir(
+                 prefix=f"{self.block_name}_llm_parser_hf_cache"
+             )
+             ret = load_dataset(
+                 "json",
+                 data_files=tmp_jsonl_path,
+                 split="train",
+                 keep_in_memory=False,
+                 cache_dir=str(hf_cache_dir),
+             )
+             finalize(ret, cleanup_path, hf_cache_dir)
+             return ret
+         except Exception:
+             if hf_cache_dir is not None:
+                 cleanup_path(hf_cache_dir)
+             raise
+         finally:
+             if cleanup_locally:
+                 cleanup_path(tmp_jsonl_path)
sdg_hub/core/blocks/llm/text_parser_block.py CHANGED
@@ -7,14 +7,17 @@ start/end tags, custom regex patterns, and cleanup operations.
 
  # Standard
  from typing import Any, Optional
+ from weakref import finalize
+ import json
  import re
 
  # Third Party
- from datasets import Dataset
+ from datasets import Dataset, load_dataset
  from pydantic import Field, field_validator, model_validator
 
  # Local
  from ...utils.logger_config import setup_logger
+ from ...utils.temp_manager import cleanup_path, create_temp_dir, create_temp_file
  from ..base import BaseBlock
  from ..registry import BlockRegistry
 
@@ -27,6 +30,8 @@ logger = setup_logger(__name__)
      "Parses and post-processes text content using tags or regex patterns",
  )
  class TextParserBlock(BaseBlock):
+     _flow_requires_jsonl_tmp: bool = True
+
      """Block for parsing and post-processing text content.
 
      This block handles text parsing using start/end tags, custom regex patterns,
@@ -317,7 +322,54 @@ class TextParserBlock(BaseBlock):
              logger.warning("No samples to parse, returning empty dataset")
              return Dataset.from_list([])
 
-         new_data = []
-         for sample in samples:
-             new_data.extend(self._generate(sample))
-         return Dataset.from_list(new_data)
+         tmp_jsonl_path = kwargs.get("_flow_tmp_jsonl_path")
+         cleanup_locally = False
+
+         if tmp_jsonl_path is None:
+             tmp_jsonl_path = str(
+                 create_temp_file(
+                     prefix=f"{self.block_name}_text_parser", suffix=".jsonl"
+                 )
+             )
+             cleanup_locally = True
+
+         rows_written = 0
+         batch = []
+         with open(tmp_jsonl_path, "w") as f:
+             for sample in samples:
+                 out = self._generate(sample)
+                 for row in out:
+                     batch.append(json.dumps(row) + "\n")
+                     rows_written += 1
+                     if len(batch) >= 5:
+                         f.writelines(batch)
+                         batch.clear()
+             if batch:
+                 f.writelines(batch)
+
+         if rows_written == 0:
+             if cleanup_locally:
+                 cleanup_path(tmp_jsonl_path)
+             return Dataset.from_list([])
+
+         hf_cache_dir = None
+         try:
+             hf_cache_dir = create_temp_dir(
+                 prefix=f"{self.block_name}_text_parser_hf_cache"
+             )
+             ret = load_dataset(
+                 "json",
+                 data_files=tmp_jsonl_path,
+                 split="train",
+                 keep_in_memory=False,
+                 cache_dir=str(hf_cache_dir),
+             )
+             finalize(ret, cleanup_path, hf_cache_dir)
+             return ret
+         except Exception:
+             if hf_cache_dir is not None:
+                 cleanup_path(hf_cache_dir)
+             raise
+         finally:
+             if cleanup_locally:
+                 cleanup_path(tmp_jsonl_path)
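Both parser blocks above now stream generated rows to a temporary JSONL file in small batches and reload the result through the Hugging Face `datasets` JSON loader, instead of accumulating every row in a Python list. A minimal sketch of that pattern, with plain `tempfile` calls standing in for sdg_hub's temp_manager helpers and `make_rows` as a hypothetical stand-in for `_generate`:

# Sketch of the spill-to-JSONL-and-reload pattern; tempfile and make_rows are
# illustrative stand-ins, not sdg_hub APIs.
import json
import os
import shutil
import tempfile
from weakref import finalize

from datasets import Dataset, load_dataset


def make_rows(sample: dict) -> list[dict]:
    # Hypothetical per-sample parsing step (the real blocks use _generate()).
    return [{"text": sample["text"].upper()}]


def build_dataset(samples: list[dict]) -> Dataset:
    fd, tmp_jsonl = tempfile.mkstemp(suffix=".jsonl")
    os.close(fd)
    cache_dir = tempfile.mkdtemp(prefix="hf_json_cache_")

    # Stream rows to disk instead of holding them all in memory.
    with open(tmp_jsonl, "w") as f:
        for sample in samples:
            for row in make_rows(sample):
                f.write(json.dumps(row) + "\n")

    # Reload via the JSON loader with a dedicated cache dir; the Arrow data is
    # memory-mapped (keep_in_memory=False) rather than held in RAM.
    ds = load_dataset(
        "json",
        data_files=tmp_jsonl,
        split="train",
        keep_in_memory=False,
        cache_dir=cache_dir,
    )
    # Tie cache cleanup to the dataset's lifetime, mirroring the finalize() call above.
    finalize(ds, shutil.rmtree, cache_dir, ignore_errors=True)
    os.remove(tmp_jsonl)
    return ds


print(build_dataset([{"text": "hello"}, {"text": "world"}]))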
sdg_hub/core/blocks/transform/rename_columns.py CHANGED
@@ -64,6 +64,25 @@ class RenameColumnsBlock(BaseBlock):
          -------
          Dataset
              Dataset with renamed columns.
+
+         Raises
+         ------
+         ValueError
+             If attempting to rename to a column name that already exists.
          """
+         # Check for column name collisions
+         # Strict validation: no target column name can be an existing column name
+         # This prevents chained/circular renames which can be confusing
+         existing_cols = set(samples.column_names)
+         target_cols = set(self.input_cols.values())
+
+         collision = target_cols & existing_cols
+         if collision:
+             raise ValueError(
+                 f"Cannot rename to existing column names: {sorted(collision)}. "
+                 "Target column names must not already exist in the dataset. "
+                 "Chained renames are not supported."
+             )
+
          # Rename columns using HuggingFace datasets method
          return samples.rename_columns(self.input_cols)
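The stricter validation above is also why the flow YAML changes further down split the old combined rename into two sequential RenameColumnsBlock steps: a mapping like `{document: raw_document, summary: document}` now trips the collision check, because `document` is both a source and a target column. A rough illustration of the rule against a plain Hugging Face dataset (`rename_with_check` is a hypothetical helper, not part of sdg_hub):

# Hypothetical helper mimicking the collision rule added above.
from datasets import Dataset


def rename_with_check(ds: Dataset, mapping: dict[str, str]) -> Dataset:
    collision = set(mapping.values()) & set(ds.column_names)
    if collision:
        raise ValueError(f"Cannot rename to existing column names: {sorted(collision)}")
    return ds.rename_columns(mapping)


ds = Dataset.from_dict({"document": ["doc text"], "summary": ["summary text"]})

# The old single-step mapping now fails: 'document' is both a source and a target.
try:
    rename_with_check(ds, {"document": "raw_document", "summary": "document"})
except ValueError as exc:
    print(exc)

# Split into two sequential renames, as the updated flow YAMLs do.
ds = rename_with_check(ds, {"document": "raw_document"})
ds = rename_with_check(ds, {"summary": "document"})
print(ds.column_names)  # ['raw_document', 'document']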
sdg_hub/core/flow/base.py CHANGED
@@ -5,6 +5,8 @@
  from datetime import datetime
  from pathlib import Path
  from typing import Any, Optional, Union
+ from weakref import finalize
+ import gc
  import time
  import uuid
 
@@ -37,11 +39,15 @@ from ..utils.flow_metrics import (
  )
  from ..utils.logger_config import setup_logger
  from ..utils.path_resolution import resolve_path
+ from ..utils.temp_manager import (
+     cleanup_path,
+     create_temp_dir,
+     create_temp_file,
+ )
  from ..utils.time_estimator import estimate_execution_time
  from ..utils.yaml_utils import save_flow_yaml
  from .checkpointer import FlowCheckpointer
  from .metadata import DatasetRequirements, FlowMetadata
- from .migration import FlowMigration
  from .validation import FlowValidator
 
  logger = setup_logger(__name__)
@@ -73,8 +79,6 @@ class Flow(BaseModel):
      model_config = ConfigDict(extra="forbid", arbitrary_types_allowed=True)
 
      # Private attributes (not serialized)
-     _migrated_runtime_params: dict[str, dict[str, Any]] = {}
-     _llm_client: Any = None  # Only used for backward compatibility with old YAMLs
      _model_config_set: bool = False  # Track if model configuration has been set
      _block_metrics: list[dict[str, Any]] = PrivateAttr(
          default_factory=list
@@ -113,16 +117,13 @@
          return self
 
      @classmethod
-     def from_yaml(cls, yaml_path: str, client: Any = None) -> "Flow":
+     def from_yaml(cls, yaml_path: str) -> "Flow":
          """Load flow from YAML configuration file.
 
          Parameters
          ----------
          yaml_path : str
              Path to the YAML flow configuration file.
-         client : Any, optional
-             LLM client instance. Required for backward compatibility with old format YAMLs
-             that use deprecated LLMBlocks. Ignored for new format YAMLs.
 
          Returns
          -------
@@ -153,21 +154,6 @@
          except yaml.YAMLError as exc:
              raise FlowValidationError(f"Invalid YAML in {yaml_path}: {exc}") from exc
 
-         # Check if this is an old format flow and migrate if necessary
-         migrated_runtime_params = None
-         is_old_format = FlowMigration.is_old_format(flow_config)
-         if is_old_format:
-             logger.info(f"Detected old format flow, migrating: {yaml_path}")
-             if client is None:
-                 logger.warning(
-                     "Old format YAML detected but no client provided. LLMBlocks may fail."
-                 )
-             flow_config, migrated_runtime_params = FlowMigration.migrate_to_new_format(
-                 flow_config, yaml_path
-             )
-             # Save migrated config back to YAML to persist id
-             save_flow_yaml(yaml_path, flow_config, "migrated to new format")
-
          # Validate YAML structure
          validator = FlowValidator()
          validation_errors = validator.validate_yaml_structure(flow_config)
@@ -194,19 +180,6 @@
 
          for i, block_config in enumerate(block_configs):
              try:
-                 # Inject client for deprecated LLMBlocks if this is an old format flow
-                 if (
-                     is_old_format
-                     and block_config.get("block_type") == "LLMBlock"
-                     and client is not None
-                 ):
-                     if "block_config" not in block_config:
-                         block_config["block_config"] = {}
-                     block_config["block_config"]["client"] = client
-                     logger.debug(
-                         f"Injected client for deprecated LLMBlock: {block_config['block_config'].get('block_name')}"
-                     )
-
                  block = cls._create_block_from_config(block_config, yaml_dir)
                  blocks.append(block)
              except Exception as exc:
@@ -228,12 +201,6 @@
              )
          else:
              logger.debug(f"Flow already had id: {flow.metadata.id}")
-         # Store migrated runtime params and client for backward compatibility
-         if migrated_runtime_params:
-             flow._migrated_runtime_params = migrated_runtime_params
-         if is_old_format and client is not None:
-             flow._llm_client = client
-
          # Check if this is a flow without LLM blocks
          llm_blocks = flow._detect_llm_blocks()
          if not llm_blocks:
@@ -484,12 +451,6 @@
          self._block_metrics = []
          run_start = time.perf_counter()
 
-         # Merge migrated runtime params with provided ones (provided ones take precedence)
-         merged_runtime_params = self._migrated_runtime_params.copy()
-         if runtime_params:
-             merged_runtime_params.update(runtime_params)
-         runtime_params = merged_runtime_params
-
          # Execute flow with metrics capture, ensuring metrics are always displayed/saved
          final_dataset = None
          execution_successful = False
@@ -626,6 +587,7 @@
          # Use provided logger or fall back to global logger
          exec_logger = flow_logger if flow_logger is not None else logger
          current_dataset = dataset
+         current_dataset_temp_path: Optional[Path] = None
 
          # Execute blocks in sequence
          for i, block in enumerate(self.blocks):
@@ -637,6 +599,14 @@
              # Prepare block execution parameters
              block_kwargs = self._prepare_block_kwargs(block, runtime_params)
 
+             block_temp_jsonl_path: Optional[Path] = None
+             dataset_temp_dir: Optional[Path] = None
+             if getattr(block, "_flow_requires_jsonl_tmp", False):
+                 block_temp_jsonl_path = create_temp_file(
+                     prefix=f"{block.block_name}_parser", suffix=".jsonl"
+                 )
+                 block_kwargs["_flow_tmp_jsonl_path"] = str(block_temp_jsonl_path)
+
              # Add max_concurrency to block kwargs if provided
              if max_concurrency is not None:
                  block_kwargs["_flow_max_concurrency"] = max_concurrency
@@ -647,22 +617,8 @@
              input_cols = set(current_dataset.column_names)
 
              try:
-                 # Check if this is a deprecated block and skip validations
-                 is_deprecated_block = (
-                     hasattr(block, "__class__")
-                     and hasattr(block.__class__, "__module__")
-                     and "deprecated_blocks" in block.__class__.__module__
-                 )
-
-                 if is_deprecated_block:
-                     exec_logger.debug(
-                         f"Skipping validations for deprecated block: {block.block_name}"
-                     )
-                     # Call generate() directly to skip validations, but keep the runtime params
-                     current_dataset = block.generate(current_dataset, **block_kwargs)
-                 else:
-                     # Execute block with validation and logging
-                     current_dataset = block(current_dataset, **block_kwargs)
+                 # Execute block with validation and logging
+                 current_dataset = block(current_dataset, **block_kwargs)
 
                  # Validate output
                  if len(current_dataset) == 0:
@@ -670,6 +626,28 @@
                          f"Block '{block.block_name}' produced empty dataset"
                      )
 
+                 # Here, we write the dataset object to disk and reload it.
+                 # This is done because the HF Datasets library creates a ton of
+                 # intermediate objects and holds on to them even after they have
+                 # fulfilled their purpose. To flush these objects, HF recommends
+                 # this `save_to_disk` / `load_from_disk` hack:
+                 # https://github.com/huggingface/datasets/blob/main/src/datasets/arrow_dataset.py#L1029
+                 previous_temp_path = current_dataset_temp_path
+                 dataset_temp_dir = create_temp_dir(prefix=f"flow_{block.block_name}")
+                 current_dataset.save_to_disk(str(dataset_temp_dir))
+                 del current_dataset
+                 gc.collect()
+                 current_dataset = datasets.load_from_disk(
+                     str(dataset_temp_dir), keep_in_memory=False
+                 )
+                 finalize(current_dataset, cleanup_path, dataset_temp_dir)
+                 current_dataset_temp_path = dataset_temp_dir
+                 if previous_temp_path and previous_temp_path != dataset_temp_dir:
+                     cleanup_path(previous_temp_path)
+
+                 if block_temp_jsonl_path is not None:
+                     cleanup_path(block_temp_jsonl_path)
+
                  # Capture metrics after successful execution
                  execution_time = time.perf_counter() - start_time
                  output_rows = len(current_dataset)
@@ -698,6 +676,10 @@
                  )
 
              except Exception as exc:
+                 if block_temp_jsonl_path is not None:
+                     cleanup_path(block_temp_jsonl_path)
+                 if dataset_temp_dir is not None:
+                     cleanup_path(dataset_temp_dir)
                  # Capture metrics for failed execution
                  execution_time = time.perf_counter() - start_time
                  self._block_metrics.append(
@@ -721,12 +703,21 @@
                      f"Block '{block.block_name}' execution failed: {exc}"
                  ) from exc
 
+         if current_dataset_temp_path is not None:
+             final_temp_path = current_dataset_temp_path
+             current_dataset = datasets.load_from_disk(
+                 str(final_temp_path), keep_in_memory=True
+             )
+             cleanup_path(final_temp_path)
+
          return current_dataset
 
      def _prepare_block_kwargs(
-         self, block: BaseBlock, runtime_params: dict[str, dict[str, Any]]
+         self, block: BaseBlock, runtime_params: Optional[dict[str, dict[str, Any]]]
      ) -> dict[str, Any]:
          """Prepare execution parameters for a block."""
+         if runtime_params is None:
+             return {}
          return runtime_params.get(block.block_name, {})
 
      def set_model_config(
@@ -1114,22 +1105,8 @@
              if max_concurrency is not None:
                  block_kwargs["_flow_max_concurrency"] = max_concurrency
 
-             # Check if this is a deprecated block and skip validations
-             is_deprecated_block = (
-                 hasattr(block, "__class__")
-                 and hasattr(block.__class__, "__module__")
-                 and "deprecated_blocks" in block.__class__.__module__
-             )
-
-             if is_deprecated_block:
-                 logger.debug(
-                     f"Dry run: Skipping validations for deprecated block: {block.block_name}"
-                 )
-                 # Call generate() directly to skip validations, but keep the runtime params
-                 current_dataset = block.generate(current_dataset, **block_kwargs)
-             else:
-                 # Execute block with validation and logging
-                 current_dataset = block(current_dataset, **block_kwargs)
+             # Execute block with validation and logging
+             current_dataset = block(current_dataset, **block_kwargs)
 
              block_execution_time = (
                  time.perf_counter() - block_start_time
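After every block, `_execute_blocks` now performs the save/reload round-trip shown in the hunk above: persist the intermediate dataset, drop the in-memory object, and reload it memory-mapped so Arrow buffers from earlier blocks can actually be released. A standalone sketch of that round-trip, with `tempfile` standing in for sdg_hub's temp_manager helpers:

# Standalone sketch of the save_to_disk / load_from_disk round-trip; tempfile
# stands in for sdg_hub's temp_manager helpers.
import gc
import shutil
import tempfile
from weakref import finalize

import datasets
from datasets import Dataset

current_dataset = Dataset.from_dict({"text": ["a", "b", "c"]})
current_dataset = current_dataset.map(lambda row: {"text": row["text"] * 2})  # a block's work

# Persist the intermediate result, drop the in-memory object, and reload it
# memory-mapped so intermediate Arrow objects can be garbage collected.
temp_dir = tempfile.mkdtemp(prefix="flow_block_")
current_dataset.save_to_disk(temp_dir)
del current_dataset
gc.collect()
current_dataset = datasets.load_from_disk(temp_dir, keep_in_memory=False)

# Remove the temp dir once nothing references the reloaded dataset anymore.
finalize(current_dataset, shutil.rmtree, temp_dir, ignore_errors=True)
print(current_dataset[0])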
sdg_hub/core/utils/temp_manager.py ADDED
@@ -0,0 +1,57 @@
+ # SPDX-License-Identifier: Apache-2.0
+ """Utilities for managing temporary files and directories used by the flow."""
+
+ from __future__ import annotations
+
+ from pathlib import Path
+ from typing import Optional, Union
+
+ # Standard
+ import os
+ import shutil
+ import tempfile
+
+ TEMP_ROOT_DIR_NAME = ".tmp_sdg_buffer"
+
+
+ def _get_temp_root() -> Path:
+     root = Path.cwd() / TEMP_ROOT_DIR_NAME
+     root.mkdir(parents=True, exist_ok=True)
+     return root
+
+
+ def _format_prefix(prefix: str) -> str:
+     return f"{prefix}_" if prefix and not prefix.endswith("_") else prefix
+
+
+ def create_temp_dir(prefix: str = "tmp", suffix: str = "") -> Path:
+     """Create a unique temporary directory."""
+     root = _get_temp_root()
+     name = tempfile.mkdtemp(prefix=_format_prefix(prefix), suffix=suffix, dir=root)
+     return Path(name)
+
+
+ def create_temp_file(prefix: str = "tmp", suffix: str = "") -> Path:
+     """Create a unique temporary file."""
+     root = _get_temp_root()
+     fd, name = tempfile.mkstemp(prefix=_format_prefix(prefix), suffix=suffix, dir=root)
+     os.close(fd)
+     return Path(name)
+
+
+ def cleanup_path(path: Optional[Union[str, os.PathLike]]) -> None:
+     """Remove a temporary file or directory if it exists."""
+     if not path:
+         return
+
+     target = Path(path)
+     if not target.exists():
+         return
+
+     if target.is_dir():
+         shutil.rmtree(target, ignore_errors=True)
+     else:
+         try:
+             target.unlink()
+         except FileNotFoundError:
+             pass
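The new helpers keep all scratch data under a single `.tmp_sdg_buffer` directory in the current working directory, so an interrupted run leaves one obvious folder to clean up. A small usage sketch, assuming sdg_hub 0.5.1 is installed (the import path follows the file location above):

# Usage sketch for the temp_manager helpers added above; assumes sdg_hub 0.5.1.
from sdg_hub.core.utils.temp_manager import (
    cleanup_path,
    create_temp_dir,
    create_temp_file,
)

# Both paths live under ./.tmp_sdg_buffer/ in the current working directory.
jsonl_path = create_temp_file(prefix="my_block", suffix=".jsonl")
cache_dir = create_temp_dir(prefix="my_block_hf_cache")

try:
    jsonl_path.write_text('{"text": "hello"}\n')
    # ... hand the paths to a block or to load_dataset(..., cache_dir=str(cache_dir)) ...
finally:
    # cleanup_path handles files and directories alike and ignores missing paths.
    cleanup_path(jsonl_path)
    cleanup_path(cache_dir)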
sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/flow.yaml CHANGED
@@ -77,9 +77,13 @@ blocks:
          - ''
    - block_type: RenameColumnsBlock
      block_config:
-       block_name: rename_to_document_column
+       block_name: rename_to_raw_document_column
        input_cols:
          document: raw_document
+   - block_type: RenameColumnsBlock
+     block_config:
+       block_name: rename_to_document_column
+       input_cols:
          summary: document
    - block_type: PromptBuilderBlock
      block_config:
sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/flow.yaml CHANGED
@@ -79,9 +79,13 @@ blocks:
          - ''
    - block_type: RenameColumnsBlock
      block_config:
-       block_name: rename_to_document_column
+       block_name: rename_to_raw_document_column
        input_cols:
          document: raw_document
+   - block_type: RenameColumnsBlock
+     block_config:
+       block_name: rename_to_document_column
+       input_cols:
          summary: document
    - block_type: PromptBuilderBlock
      block_config:
sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/flow.yaml CHANGED
@@ -72,9 +72,13 @@ blocks:
        parsing_pattern: '(?:^|\n)\s*\d+\.\s+(.*?)(?=\n\s*\d+\.\s+|\Z)'
    - block_type: RenameColumnsBlock
      block_config:
-       block_name: rename_to_document_column
+       block_name: rename_to_raw_document_column
        input_cols:
          document: raw_document
+   - block_type: RenameColumnsBlock
+     block_config:
+       block_name: rename_to_document_column
+       input_cols:
          atomic_facts: document
    - block_type: PromptBuilderBlock
      block_config:
sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml CHANGED
@@ -134,10 +134,15 @@ blocks:
        input_cols: [summary_detailed, summary_extractive, summary_atomic_facts, base_document]
        output_cols: [summary, dataset_type]
 
+   - block_type: RenameColumnsBlock
+     block_config:
+       block_name: rename_to_raw_document_column
+       input_cols: {document: raw_document}
+
    - block_type: RenameColumnsBlock
      block_config:
        block_name: rename_to_document_column
-       input_cols: {document: raw_document, summary: document}
+       input_cols: {summary: document}
 
    - block_type: PromptBuilderBlock
      block_config:
sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/flow.yaml CHANGED
@@ -135,10 +135,14 @@ blocks:
        input_cols: [summary_detailed, summary_extractive, summary_atomic_facts, base_document]
        output_cols: [summary, dataset_type]
 
+   - block_type: RenameColumnsBlock
+     block_config:
+       block_name: rename_to_raw_document_column
+       input_cols: {document: raw_document}
    - block_type: RenameColumnsBlock
      block_config:
        block_name: rename_to_document_column
-       input_cols: {document: raw_document, summary: document}
+       input_cols: {summary: document}
 
    - block_type: PromptBuilderBlock
      block_config:
{sdg_hub-0.4.2.dist-info → sdg_hub-0.5.1.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: sdg_hub
- Version: 0.4.2
+ Version: 0.5.1
  Summary: Synthetic Data Generation
  Author-email: Red Hat AI Innovation <abhandwa@redhat.com>
  License: Apache-2.0
@@ -23,7 +23,7 @@ Requires-Python: >=3.10
  Description-Content-Type: text/markdown
  License-File: LICENSE
  Requires-Dist: click<9.0.0,>=8.1.7
- Requires-Dist: datasets<4.0.0,>=2.18.0
+ Requires-Dist: datasets>=4.0.0
  Requires-Dist: httpx<1.0.0,>=0.25.0
  Requires-Dist: jinja2
  Requires-Dist: litellm<1.75.0,>=1.73.0