sdg-hub 0.4.2__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sdg_hub/_version.py +2 -2
- sdg_hub/core/blocks/__init__.py +0 -22
- sdg_hub/core/blocks/llm/llm_parser_block.py +57 -5
- sdg_hub/core/blocks/llm/text_parser_block.py +57 -5
- sdg_hub/core/blocks/transform/rename_columns.py +19 -0
- sdg_hub/core/flow/base.py +57 -80
- sdg_hub/core/utils/temp_manager.py +57 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/flow.yaml +5 -1
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/flow.yaml +5 -1
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/flow.yaml +5 -1
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +6 -1
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/flow.yaml +5 -1
- {sdg_hub-0.4.2.dist-info → sdg_hub-0.5.1.dist-info}/METADATA +2 -2
- {sdg_hub-0.4.2.dist-info → sdg_hub-0.5.1.dist-info}/RECORD +17 -27
- sdg_hub/core/blocks/deprecated_blocks/__init__.py +0 -29
- sdg_hub/core/blocks/deprecated_blocks/combine_columns.py +0 -93
- sdg_hub/core/blocks/deprecated_blocks/duplicate_columns.py +0 -88
- sdg_hub/core/blocks/deprecated_blocks/filter_by_value.py +0 -103
- sdg_hub/core/blocks/deprecated_blocks/flatten_columns.py +0 -94
- sdg_hub/core/blocks/deprecated_blocks/llmblock.py +0 -479
- sdg_hub/core/blocks/deprecated_blocks/rename_columns.py +0 -88
- sdg_hub/core/blocks/deprecated_blocks/sample_populator.py +0 -58
- sdg_hub/core/blocks/deprecated_blocks/selector.py +0 -97
- sdg_hub/core/blocks/deprecated_blocks/set_to_majority_value.py +0 -88
- sdg_hub/core/flow/migration.py +0 -198
- {sdg_hub-0.4.2.dist-info → sdg_hub-0.5.1.dist-info}/WHEEL +0 -0
- {sdg_hub-0.4.2.dist-info → sdg_hub-0.5.1.dist-info}/licenses/LICENSE +0 -0
- {sdg_hub-0.4.2.dist-info → sdg_hub-0.5.1.dist-info}/top_level.txt +0 -0
sdg_hub/_version.py
CHANGED

@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID
 
-__version__ = version = '0.4.2'
-__version_tuple__ = version_tuple = (0, 4, 2)
+__version__ = version = '0.5.1'
+__version_tuple__ = version_tuple = (0, 5, 1)
 
 __commit_id__ = commit_id = None
sdg_hub/core/blocks/__init__.py
CHANGED

@@ -5,17 +5,6 @@ This package provides various block implementations for data generation, process
 
 # Local
 from .base import BaseBlock
-from .deprecated_blocks import (
-    CombineColumnsBlock,
-    DuplicateColumns,
-    FilterByValueBlock,
-    FlattenColumnsBlock,
-    LLMBlock,
-    RenameColumns,
-    SamplePopulatorBlock,
-    SelectorBlock,
-    SetToMajorityValue,
-)
 from .filtering import ColumnValueFilterBlock
 from .llm import LLMChatBlock, LLMParserBlock, PromptBuilderBlock, TextParserBlock
 from .registry import BlockRegistry
@@ -28,8 +17,6 @@ from .transform import (
     UniformColumnValueSetter,
 )
 
-# All blocks moved to deprecated_blocks or transform modules
-
 __all__ = [
     "BaseBlock",
     "BlockRegistry",
@@ -40,15 +27,6 @@ __all__ = [
     "RenameColumnsBlock",
     "TextConcatBlock",
     "UniformColumnValueSetter",
-    "CombineColumnsBlock",  # Deprecated
-    "DuplicateColumns",  # Deprecated
-    "FilterByValueBlock",  # Deprecated
-    "FlattenColumnsBlock",  # Deprecated
-    "RenameColumns",  # Deprecated
-    "SamplePopulatorBlock",  # Deprecated
-    "SelectorBlock",  # Deprecated
-    "SetToMajorityValue",  # Deprecated
-    "LLMBlock",  # Deprecated
     "LLMChatBlock",
     "LLMParserBlock",
     "TextParserBlock",
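The removal above also shows what downstream imports have to look like from 0.5.1 onward. A minimal sketch, assuming sdg_hub 0.5.1 is installed; it uses only names still listed in `__all__` above, and the `LLMBlock` fallback shown here is illustrative, not an sdg_hub recommendation:

# Sketch: imports against the 0.5.1 block API, using only names kept in __all__ above.
from sdg_hub.core.blocks import (
    BaseBlock,
    BlockRegistry,
    LLMChatBlock,
    LLMParserBlock,
    RenameColumnsBlock,
    TextParserBlock,
)

# The deprecated re-exports (LLMBlock, RenameColumns, SelectorBlock, ...) were removed
# in this release, so legacy imports now fail and callers must migrate.
try:
    from sdg_hub.core.blocks import LLMBlock  # removed in 0.5.1
except ImportError:
    LLMBlock = None  # hypothetical fallback; migrate to the replacement blocks instead
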
sdg_hub/core/blocks/llm/llm_parser_block.py
CHANGED

@@ -7,13 +7,16 @@ This module provides the LLMParserBlock for extracting specific fields
 
 # Standard
 from typing import Any
+from weakref import finalize
+import json
 
 # Third Party
-from datasets import Dataset
+from datasets import Dataset, load_dataset
 from pydantic import Field, model_validator
 
 # Local
 from ...utils.logger_config import setup_logger
+from ...utils.temp_manager import cleanup_path, create_temp_dir, create_temp_file
 from ..base import BaseBlock
 from ..registry import BlockRegistry
 
@@ -26,6 +29,8 @@ logger = setup_logger(__name__)
     "Extracts specified fields from LLM response objects",
 )
 class LLMParserBlock(BaseBlock):
+    _flow_requires_jsonl_tmp: bool = True
+
     """Block for extracting fields from LLM response objects.
 
     This block extracts specified fields from chat completion response objects.
@@ -314,7 +319,54 @@ class LLMParserBlock(BaseBlock):
             logger.warning("No samples to process, returning empty dataset")
             return Dataset.from_list([])
 
-
-
-
-
+        tmp_jsonl_path = kwargs.get("_flow_tmp_jsonl_path")
+        cleanup_locally = False
+
+        if tmp_jsonl_path is None:
+            tmp_jsonl_path = str(
+                create_temp_file(
+                    prefix=f"{self.block_name}_llm_parser", suffix=".jsonl"
+                )
+            )
+            cleanup_locally = True
+
+        rows_written = 0
+        batch = []
+        with open(tmp_jsonl_path, "w") as f:
+            for sample in samples:
+                out = self._generate(sample)
+                for row in out:
+                    batch.append(json.dumps(row) + "\n")
+                    rows_written += 1
+                    if len(batch) >= 5:
+                        f.writelines(batch)
+                        batch.clear()
+            if batch:
+                f.writelines(batch)
+
+        if rows_written == 0:
+            if cleanup_locally:
+                cleanup_path(tmp_jsonl_path)
+            return Dataset.from_list([])
+
+        hf_cache_dir = None
+        try:
+            hf_cache_dir = create_temp_dir(
+                prefix=f"{self.block_name}_llm_parser_hf_cache"
+            )
+            ret = load_dataset(
+                "json",
+                data_files=tmp_jsonl_path,
+                split="train",
+                keep_in_memory=False,
+                cache_dir=str(hf_cache_dir),
+            )
+            finalize(ret, cleanup_path, hf_cache_dir)
+            return ret
+        except Exception:
+            if hf_cache_dir is not None:
+                cleanup_path(hf_cache_dir)
+            raise
+        finally:
+            if cleanup_locally:
+                cleanup_path(tmp_jsonl_path)
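Both parser blocks (this one and TextParserBlock below) now stream parsed rows to a JSONL file and reload them with `load_dataset`, so the result stays Arrow-backed on disk instead of accumulating in a Python list. A self-contained sketch of the same pattern, assuming only the `datasets` library; the `rows_to_dataset` helper and the temp-file prefixes are illustrative, not sdg_hub APIs, and error handling plus the empty-input case are omitted:

import json
import os
import shutil
from pathlib import Path
from tempfile import mkdtemp, mkstemp
from weakref import finalize

from datasets import load_dataset


def rows_to_dataset(rows, prefix="parser"):
    """Spill an iterable of dict rows to JSONL, then load it back memory-mapped."""
    fd, jsonl_path = mkstemp(prefix=prefix, suffix=".jsonl")
    with os.fdopen(fd, "w") as f:
        for row in rows:
            f.write(json.dumps(row) + "\n")

    cache_dir = mkdtemp(prefix=f"{prefix}_hf_cache")
    ds = load_dataset(
        "json",
        data_files=jsonl_path,
        split="train",
        keep_in_memory=False,  # keep the Arrow table on disk, not in RAM
        cache_dir=cache_dir,
    )
    # Drop the Arrow cache when the dataset object is garbage-collected,
    # mirroring the finalize() call in the block above.
    finalize(ds, shutil.rmtree, cache_dir, ignore_errors=True)
    Path(jsonl_path).unlink(missing_ok=True)  # the JSONL is no longer needed once cached
    return ds


# Usage: ds = rows_to_dataset({"text": f"row {i}"} for i in range(1000))
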
sdg_hub/core/blocks/llm/text_parser_block.py
CHANGED

@@ -7,14 +7,17 @@ start/end tags, custom regex patterns, and cleanup operations.
 
 # Standard
 from typing import Any, Optional
+from weakref import finalize
+import json
 import re
 
 # Third Party
-from datasets import Dataset
+from datasets import Dataset, load_dataset
 from pydantic import Field, field_validator, model_validator
 
 # Local
 from ...utils.logger_config import setup_logger
+from ...utils.temp_manager import cleanup_path, create_temp_dir, create_temp_file
 from ..base import BaseBlock
 from ..registry import BlockRegistry
 
@@ -27,6 +30,8 @@ logger = setup_logger(__name__)
     "Parses and post-processes text content using tags or regex patterns",
 )
 class TextParserBlock(BaseBlock):
+    _flow_requires_jsonl_tmp: bool = True
+
     """Block for parsing and post-processing text content.
 
     This block handles text parsing using start/end tags, custom regex patterns,
@@ -317,7 +322,54 @@ class TextParserBlock(BaseBlock):
             logger.warning("No samples to parse, returning empty dataset")
             return Dataset.from_list([])
 
-
-
-
-
+        tmp_jsonl_path = kwargs.get("_flow_tmp_jsonl_path")
+        cleanup_locally = False
+
+        if tmp_jsonl_path is None:
+            tmp_jsonl_path = str(
+                create_temp_file(
+                    prefix=f"{self.block_name}_text_parser", suffix=".jsonl"
+                )
+            )
+            cleanup_locally = True
+
+        rows_written = 0
+        batch = []
+        with open(tmp_jsonl_path, "w") as f:
+            for sample in samples:
+                out = self._generate(sample)
+                for row in out:
+                    batch.append(json.dumps(row) + "\n")
+                    rows_written += 1
+                    if len(batch) >= 5:
+                        f.writelines(batch)
+                        batch.clear()
+            if batch:
+                f.writelines(batch)
+
+        if rows_written == 0:
+            if cleanup_locally:
+                cleanup_path(tmp_jsonl_path)
+            return Dataset.from_list([])
+
+        hf_cache_dir = None
+        try:
+            hf_cache_dir = create_temp_dir(
+                prefix=f"{self.block_name}_text_parser_hf_cache"
+            )
+            ret = load_dataset(
+                "json",
+                data_files=tmp_jsonl_path,
+                split="train",
+                keep_in_memory=False,
+                cache_dir=str(hf_cache_dir),
+            )
+            finalize(ret, cleanup_path, hf_cache_dir)
+            return ret
+        except Exception:
+            if hf_cache_dir is not None:
+                cleanup_path(hf_cache_dir)
+            raise
+        finally:
+            if cleanup_locally:
+                cleanup_path(tmp_jsonl_path)
sdg_hub/core/blocks/transform/rename_columns.py
CHANGED

@@ -64,6 +64,25 @@ class RenameColumnsBlock(BaseBlock):
         -------
         Dataset
             Dataset with renamed columns.
+
+        Raises
+        ------
+        ValueError
+            If attempting to rename to a column name that already exists.
         """
+        # Check for column name collisions
+        # Strict validation: no target column name can be an existing column name
+        # This prevents chained/circular renames which can be confusing
+        existing_cols = set(samples.column_names)
+        target_cols = set(self.input_cols.values())
+
+        collision = target_cols & existing_cols
+        if collision:
+            raise ValueError(
+                f"Cannot rename to existing column names: {sorted(collision)}. "
+                "Target column names must not already exist in the dataset. "
+                "Chained renames are not supported."
+            )
+
         # Rename columns using HuggingFace datasets method
         return samples.rename_columns(self.input_cols)
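The stricter validation rejects any mapping whose target name already exists, which is why the flow YAMLs further down split the old single rename (document -> raw_document, summary -> document) into two RenameColumnsBlock steps. A small sketch of the same check against a plain `datasets.Dataset`; the `check_rename` helper is illustrative, not part of sdg_hub:

from datasets import Dataset


def check_rename(ds: Dataset, mapping: dict[str, str]) -> None:
    """Mirror of the stricter 0.5.1 validation: targets must be fresh column names."""
    collision = set(mapping.values()) & set(ds.column_names)
    if collision:
        raise ValueError(f"Cannot rename to existing column names: {sorted(collision)}")


ds = Dataset.from_dict({"document": ["doc text"], "summary": ["summary text"]})

try:
    check_rename(ds, {"summary": "document"})  # "document" already exists -> rejected
except ValueError as err:
    print(err)

# The updated flows therefore rename in two passes: free the name, then reuse it.
step1 = ds.rename_columns({"document": "raw_document"})
step2 = step1.rename_columns({"summary": "document"})
print(step2.column_names)  # ['raw_document', 'document']
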
sdg_hub/core/flow/base.py
CHANGED

@@ -5,6 +5,8 @@
 from datetime import datetime
 from pathlib import Path
 from typing import Any, Optional, Union
+from weakref import finalize
+import gc
 import time
 import uuid
 
@@ -37,11 +39,15 @@ from ..utils.flow_metrics import (
 )
 from ..utils.logger_config import setup_logger
 from ..utils.path_resolution import resolve_path
+from ..utils.temp_manager import (
+    cleanup_path,
+    create_temp_dir,
+    create_temp_file,
+)
 from ..utils.time_estimator import estimate_execution_time
 from ..utils.yaml_utils import save_flow_yaml
 from .checkpointer import FlowCheckpointer
 from .metadata import DatasetRequirements, FlowMetadata
-from .migration import FlowMigration
 from .validation import FlowValidator
 
 logger = setup_logger(__name__)
@@ -73,8 +79,6 @@ class Flow(BaseModel):
     model_config = ConfigDict(extra="forbid", arbitrary_types_allowed=True)
 
     # Private attributes (not serialized)
-    _migrated_runtime_params: dict[str, dict[str, Any]] = {}
-    _llm_client: Any = None  # Only used for backward compatibility with old YAMLs
     _model_config_set: bool = False  # Track if model configuration has been set
     _block_metrics: list[dict[str, Any]] = PrivateAttr(
         default_factory=list
@@ -113,16 +117,13 @@
         return self
 
     @classmethod
-    def from_yaml(cls, yaml_path: str
+    def from_yaml(cls, yaml_path: str) -> "Flow":
         """Load flow from YAML configuration file.
 
         Parameters
         ----------
         yaml_path : str
             Path to the YAML flow configuration file.
-        client : Any, optional
-            LLM client instance. Required for backward compatibility with old format YAMLs
-            that use deprecated LLMBlocks. Ignored for new format YAMLs.
 
         Returns
         -------
@@ -153,21 +154,6 @@
         except yaml.YAMLError as exc:
             raise FlowValidationError(f"Invalid YAML in {yaml_path}: {exc}") from exc
 
-        # Check if this is an old format flow and migrate if necessary
-        migrated_runtime_params = None
-        is_old_format = FlowMigration.is_old_format(flow_config)
-        if is_old_format:
-            logger.info(f"Detected old format flow, migrating: {yaml_path}")
-            if client is None:
-                logger.warning(
-                    "Old format YAML detected but no client provided. LLMBlocks may fail."
-                )
-            flow_config, migrated_runtime_params = FlowMigration.migrate_to_new_format(
-                flow_config, yaml_path
-            )
-            # Save migrated config back to YAML to persist id
-            save_flow_yaml(yaml_path, flow_config, "migrated to new format")
-
         # Validate YAML structure
         validator = FlowValidator()
         validation_errors = validator.validate_yaml_structure(flow_config)
@@ -194,19 +180,6 @@
 
         for i, block_config in enumerate(block_configs):
             try:
-                # Inject client for deprecated LLMBlocks if this is an old format flow
-                if (
-                    is_old_format
-                    and block_config.get("block_type") == "LLMBlock"
-                    and client is not None
-                ):
-                    if "block_config" not in block_config:
-                        block_config["block_config"] = {}
-                    block_config["block_config"]["client"] = client
-                    logger.debug(
-                        f"Injected client for deprecated LLMBlock: {block_config['block_config'].get('block_name')}"
-                    )
-
                 block = cls._create_block_from_config(block_config, yaml_dir)
                 blocks.append(block)
             except Exception as exc:
@@ -228,12 +201,6 @@
             )
         else:
             logger.debug(f"Flow already had id: {flow.metadata.id}")
-        # Store migrated runtime params and client for backward compatibility
-        if migrated_runtime_params:
-            flow._migrated_runtime_params = migrated_runtime_params
-        if is_old_format and client is not None:
-            flow._llm_client = client
-
         # Check if this is a flow without LLM blocks
         llm_blocks = flow._detect_llm_blocks()
         if not llm_blocks:
@@ -484,12 +451,6 @@
         self._block_metrics = []
         run_start = time.perf_counter()
 
-        # Merge migrated runtime params with provided ones (provided ones take precedence)
-        merged_runtime_params = self._migrated_runtime_params.copy()
-        if runtime_params:
-            merged_runtime_params.update(runtime_params)
-        runtime_params = merged_runtime_params
-
         # Execute flow with metrics capture, ensuring metrics are always displayed/saved
         final_dataset = None
         execution_successful = False
@@ -626,6 +587,7 @@
         # Use provided logger or fall back to global logger
        exec_logger = flow_logger if flow_logger is not None else logger
         current_dataset = dataset
+        current_dataset_temp_path: Optional[Path] = None
 
         # Execute blocks in sequence
         for i, block in enumerate(self.blocks):
@@ -637,6 +599,14 @@
             # Prepare block execution parameters
             block_kwargs = self._prepare_block_kwargs(block, runtime_params)
 
+            block_temp_jsonl_path: Optional[Path] = None
+            dataset_temp_dir: Optional[Path] = None
+            if getattr(block, "_flow_requires_jsonl_tmp", False):
+                block_temp_jsonl_path = create_temp_file(
+                    prefix=f"{block.block_name}_parser", suffix=".jsonl"
+                )
+                block_kwargs["_flow_tmp_jsonl_path"] = str(block_temp_jsonl_path)
+
             # Add max_concurrency to block kwargs if provided
             if max_concurrency is not None:
                 block_kwargs["_flow_max_concurrency"] = max_concurrency
@@ -647,22 +617,8 @@
             input_cols = set(current_dataset.column_names)
 
             try:
-                #
-                is_deprecated_block = (
-                    hasattr(block, "__class__")
-                    and hasattr(block.__class__, "__module__")
-                    and "deprecated_blocks" in block.__class__.__module__
-                )
-
-                if is_deprecated_block:
-                    exec_logger.debug(
-                        f"Skipping validations for deprecated block: {block.block_name}"
-                    )
-                    # Call generate() directly to skip validations, but keep the runtime params
-                    current_dataset = block.generate(current_dataset, **block_kwargs)
-                else:
-                    # Execute block with validation and logging
-                    current_dataset = block(current_dataset, **block_kwargs)
+                # Execute block with validation and logging
+                current_dataset = block(current_dataset, **block_kwargs)
 
                 # Validate output
                 if len(current_dataset) == 0:
@@ -670,6 +626,28 @@
                         f"Block '{block.block_name}' produced empty dataset"
                     )
 
+                # Here, we write and reload dataset object from and to disk.
+                # This is done because HF Datasets library creates a ton of intermediate
+                # objects, and holds on to them even after the objects have fulfilled
+                # their purpose. To get flush these objects, HF recommends to implement
+                # this `save_to_disk` and `load_from_disk` hack.
+                # https://github.com/huggingface/datasets/blob/main/src/datasets/arrow_dataset.py#L1029
+                previous_temp_path = current_dataset_temp_path
+                dataset_temp_dir = create_temp_dir(prefix=f"flow_{block.block_name}")
+                current_dataset.save_to_disk(str(dataset_temp_dir))
+                del current_dataset
+                gc.collect()
+                current_dataset = datasets.load_from_disk(
+                    str(dataset_temp_dir), keep_in_memory=False
+                )
+                finalize(current_dataset, cleanup_path, dataset_temp_dir)
+                current_dataset_temp_path = dataset_temp_dir
+                if previous_temp_path and previous_temp_path != dataset_temp_dir:
+                    cleanup_path(previous_temp_path)
+
+                if block_temp_jsonl_path is not None:
+                    cleanup_path(block_temp_jsonl_path)
+
                 # Capture metrics after successful execution
                 execution_time = time.perf_counter() - start_time
                 output_rows = len(current_dataset)
@@ -698,6 +676,10 @@
                 )
 
             except Exception as exc:
+                if block_temp_jsonl_path is not None:
+                    cleanup_path(block_temp_jsonl_path)
+                if dataset_temp_dir is not None:
+                    cleanup_path(dataset_temp_dir)
                 # Capture metrics for failed execution
                 execution_time = time.perf_counter() - start_time
                 self._block_metrics.append(
@@ -721,12 +703,21 @@
                     f"Block '{block.block_name}' execution failed: {exc}"
                 ) from exc
 
+        if current_dataset_temp_path is not None:
+            final_temp_path = current_dataset_temp_path
+            current_dataset = datasets.load_from_disk(
+                str(final_temp_path), keep_in_memory=True
+            )
+            cleanup_path(final_temp_path)
+
        return current_dataset
 
     def _prepare_block_kwargs(
-        self, block: BaseBlock, runtime_params: dict[str, dict[str, Any]]
+        self, block: BaseBlock, runtime_params: Optional[dict[str, dict[str, Any]]]
     ) -> dict[str, Any]:
         """Prepare execution parameters for a block."""
+        if runtime_params is None:
+            return {}
         return runtime_params.get(block.block_name, {})
 
     def set_model_config(
@@ -1114,22 +1105,8 @@
         if max_concurrency is not None:
             block_kwargs["_flow_max_concurrency"] = max_concurrency
 
-        #
-        is_deprecated_block = (
-            hasattr(block, "__class__")
-            and hasattr(block.__class__, "__module__")
-            and "deprecated_blocks" in block.__class__.__module__
-        )
-
-        if is_deprecated_block:
-            logger.debug(
-                f"Dry run: Skipping validations for deprecated block: {block.block_name}"
-            )
-            # Call generate() directly to skip validations, but keep the runtime params
-            current_dataset = block.generate(current_dataset, **block_kwargs)
-        else:
-            # Execute block with validation and logging
-            current_dataset = block(current_dataset, **block_kwargs)
+        # Execute block with validation and logging
+        current_dataset = block(current_dataset, **block_kwargs)
 
         block_execution_time = (
             time.perf_counter() - block_start_time
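The per-block `save_to_disk`/`load_from_disk` round trip above is the memory-flush pattern referenced in the HF datasets source: persisting and reloading with `keep_in_memory=False` drops the intermediate in-process buffers and leaves the table memory-mapped from disk. A self-contained sketch of the same idea, with `tempfile`/`shutil` standing in for sdg_hub's `temp_manager` helpers:

import gc
import shutil
import tempfile
from weakref import finalize

import datasets
from datasets import Dataset

ds = Dataset.from_dict({"text": [f"row {i}" for i in range(10_000)]})

# Persist the current state, drop the in-memory object, then memory-map it back.
tmp_dir = tempfile.mkdtemp(prefix="flow_block_")
ds.save_to_disk(tmp_dir)
del ds
gc.collect()

ds = datasets.load_from_disk(tmp_dir, keep_in_memory=False)
# Tie the temp directory's lifetime to the reloaded dataset, as the flow does.
finalize(ds, shutil.rmtree, tmp_dir, ignore_errors=True)

print(len(ds), ds.column_names)
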
sdg_hub/core/utils/temp_manager.py
ADDED

@@ -0,0 +1,57 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Utilities for managing temporary files and directories used by the flow."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Optional, Union
+
+# Standard
+import os
+import shutil
+import tempfile
+
+TEMP_ROOT_DIR_NAME = ".tmp_sdg_buffer"
+
+
+def _get_temp_root() -> Path:
+    root = Path.cwd() / TEMP_ROOT_DIR_NAME
+    root.mkdir(parents=True, exist_ok=True)
+    return root
+
+
+def _format_prefix(prefix: str) -> str:
+    return f"{prefix}_" if prefix and not prefix.endswith("_") else prefix
+
+
+def create_temp_dir(prefix: str = "tmp", suffix: str = "") -> Path:
+    """Create a unique temporary directory."""
+    root = _get_temp_root()
+    name = tempfile.mkdtemp(prefix=_format_prefix(prefix), suffix=suffix, dir=root)
+    return Path(name)
+
+
+def create_temp_file(prefix: str = "tmp", suffix: str = "") -> Path:
+    """Create a unique temporary file."""
+    root = _get_temp_root()
+    fd, name = tempfile.mkstemp(prefix=_format_prefix(prefix), suffix=suffix, dir=root)
+    os.close(fd)
+    return Path(name)
+
+
+def cleanup_path(path: Optional[Union[str, os.PathLike]]) -> None:
+    """Remove a temporary file or directory if it exists."""
+    if not path:
+        return
+
+    target = Path(path)
+    if not target.exists():
+        return
+
+    if target.is_dir():
+        shutil.rmtree(target, ignore_errors=True)
+    else:
+        try:
+            target.unlink()
+        except FileNotFoundError:
+            pass
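Based only on the module added above, this is how the three helpers compose: everything lands under a project-local `.tmp_sdg_buffer` directory in the current working directory (the prefixes used here are illustrative):

from sdg_hub.core.utils.temp_manager import (
    cleanup_path,
    create_temp_dir,
    create_temp_file,
)

# Both helpers create their artifacts under ./.tmp_sdg_buffer in the CWD.
jsonl_path = create_temp_file(prefix="my_block_parser", suffix=".jsonl")
cache_dir = create_temp_dir(prefix="my_block_hf_cache")

try:
    jsonl_path.write_text('{"text": "hello"}\n')
    # ... hand jsonl_path / cache_dir to load_dataset, etc. ...
finally:
    # cleanup_path handles files and directories alike, and ignores missing paths.
    cleanup_path(jsonl_path)
    cleanup_path(cache_dir)
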
sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/flow.yaml
CHANGED

@@ -77,9 +77,13 @@ blocks:
       - ''
   - block_type: RenameColumnsBlock
     block_config:
-      block_name:
+      block_name: rename_to_raw_document_column
       input_cols:
         document: raw_document
+  - block_type: RenameColumnsBlock
+    block_config:
+      block_name: rename_to_document_column
+      input_cols:
         summary: document
   - block_type: PromptBuilderBlock
     block_config:
sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/flow.yaml
CHANGED

@@ -79,9 +79,13 @@ blocks:
       - ''
   - block_type: RenameColumnsBlock
     block_config:
-      block_name:
+      block_name: rename_to_raw_document_column
      input_cols:
         document: raw_document
+  - block_type: RenameColumnsBlock
+    block_config:
+      block_name: rename_to_document_column
+      input_cols:
         summary: document
   - block_type: PromptBuilderBlock
     block_config:
sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/flow.yaml
CHANGED

@@ -72,9 +72,13 @@ blocks:
       parsing_pattern: '(?:^|\n)\s*\d+\.\s+(.*?)(?=\n\s*\d+\.\s+|\Z)'
   - block_type: RenameColumnsBlock
     block_config:
-      block_name:
+      block_name: rename_to_raw_document_column
       input_cols:
         document: raw_document
+  - block_type: RenameColumnsBlock
+    block_config:
+      block_name: rename_to_document_column
+      input_cols:
         atomic_facts: document
   - block_type: PromptBuilderBlock
     block_config:
sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml
CHANGED

@@ -134,10 +134,15 @@ blocks:
       input_cols: [summary_detailed, summary_extractive, summary_atomic_facts, base_document]
       output_cols: [summary, dataset_type]
 
+  - block_type: RenameColumnsBlock
+    block_config:
+      block_name: rename_to_raw_document_column
+      input_cols: {document: raw_document}
+
   - block_type: RenameColumnsBlock
     block_config:
       block_name: rename_to_document_column
-      input_cols: {
+      input_cols: {summary: document}
 
   - block_type: PromptBuilderBlock
     block_config:
sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/flow.yaml
CHANGED

@@ -135,10 +135,14 @@ blocks:
       input_cols: [summary_detailed, summary_extractive, summary_atomic_facts, base_document]
       output_cols: [summary, dataset_type]
 
+  - block_type: RenameColumnsBlock
+    block_config:
+      block_name: rename_to_raw_document_column
+      input_cols: {document: raw_document}
   - block_type: RenameColumnsBlock
     block_config:
       block_name: rename_to_document_column
-      input_cols: {
+      input_cols: {summary: document}
 
   - block_type: PromptBuilderBlock
     block_config:
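Loading one of the updated flows illustrates the other API change in this release: `Flow.from_yaml` now takes only the YAML path, since the legacy `client` argument and the `FlowMigration` shim were removed. A hedged sketch; the filesystem path is illustrative, as in an installed wheel the flow YAMLs live inside the package:

from sdg_hub.core.flow.base import Flow

# 0.5.1: from_yaml(yaml_path) only; old-format YAMLs and the client= argument
# are no longer supported.
flow = Flow.from_yaml(
    "sdg_hub/flows/qa_generation/document_grounded_qa/"
    "enhanced_multi_summary_qa/detailed_summary/flow.yaml"
)
print(flow.metadata.id)
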
{sdg_hub-0.4.2.dist-info → sdg_hub-0.5.1.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sdg_hub
-Version: 0.4.2
+Version: 0.5.1
 Summary: Synthetic Data Generation
 Author-email: Red Hat AI Innovation <abhandwa@redhat.com>
 License: Apache-2.0
@@ -23,7 +23,7 @@ Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: click<9.0.0,>=8.1.7
-Requires-Dist: datasets
+Requires-Dist: datasets>=4.0.0
 Requires-Dist: httpx<1.0.0,>=0.25.0
 Requires-Dist: jinja2
 Requires-Dist: litellm<1.75.0,>=1.73.0