sdg-hub 0.4.2__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sdg_hub/_version.py +2 -2
- sdg_hub/core/blocks/__init__.py +0 -22
- sdg_hub/core/blocks/transform/rename_columns.py +19 -0
- sdg_hub/core/flow/base.py +8 -80
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/flow.yaml +5 -1
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/flow.yaml +5 -1
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/flow.yaml +5 -1
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +6 -1
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/flow.yaml +5 -1
- {sdg_hub-0.4.2.dist-info → sdg_hub-0.5.0.dist-info}/METADATA +2 -2
- {sdg_hub-0.4.2.dist-info → sdg_hub-0.5.0.dist-info}/RECORD +14 -25
- sdg_hub/core/blocks/deprecated_blocks/__init__.py +0 -29
- sdg_hub/core/blocks/deprecated_blocks/combine_columns.py +0 -93
- sdg_hub/core/blocks/deprecated_blocks/duplicate_columns.py +0 -88
- sdg_hub/core/blocks/deprecated_blocks/filter_by_value.py +0 -103
- sdg_hub/core/blocks/deprecated_blocks/flatten_columns.py +0 -94
- sdg_hub/core/blocks/deprecated_blocks/llmblock.py +0 -479
- sdg_hub/core/blocks/deprecated_blocks/rename_columns.py +0 -88
- sdg_hub/core/blocks/deprecated_blocks/sample_populator.py +0 -58
- sdg_hub/core/blocks/deprecated_blocks/selector.py +0 -97
- sdg_hub/core/blocks/deprecated_blocks/set_to_majority_value.py +0 -88
- sdg_hub/core/flow/migration.py +0 -198
- {sdg_hub-0.4.2.dist-info → sdg_hub-0.5.0.dist-info}/WHEEL +0 -0
- {sdg_hub-0.4.2.dist-info → sdg_hub-0.5.0.dist-info}/licenses/LICENSE +0 -0
- {sdg_hub-0.4.2.dist-info → sdg_hub-0.5.0.dist-info}/top_level.txt +0 -0
sdg_hub/_version.py
CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
|
|
28
28
|
commit_id: COMMIT_ID
|
29
29
|
__commit_id__: COMMIT_ID
|
30
30
|
|
31
|
-
__version__ = version = '0.4.2'
|
32
|
-
__version_tuple__ = version_tuple = (0, 4, 2)
|
31
|
+
__version__ = version = '0.5.0'
|
32
|
+
__version_tuple__ = version_tuple = (0, 5, 0)
|
33
33
|
|
34
34
|
__commit_id__ = commit_id = None
|
sdg_hub/core/blocks/__init__.py
CHANGED
@@ -5,17 +5,6 @@ This package provides various block implementations for data generation, process
|
|
5
5
|
|
6
6
|
# Local
|
7
7
|
from .base import BaseBlock
|
8
|
-
from .deprecated_blocks import (
|
9
|
-
CombineColumnsBlock,
|
10
|
-
DuplicateColumns,
|
11
|
-
FilterByValueBlock,
|
12
|
-
FlattenColumnsBlock,
|
13
|
-
LLMBlock,
|
14
|
-
RenameColumns,
|
15
|
-
SamplePopulatorBlock,
|
16
|
-
SelectorBlock,
|
17
|
-
SetToMajorityValue,
|
18
|
-
)
|
19
8
|
from .filtering import ColumnValueFilterBlock
|
20
9
|
from .llm import LLMChatBlock, LLMParserBlock, PromptBuilderBlock, TextParserBlock
|
21
10
|
from .registry import BlockRegistry
|
@@ -28,8 +17,6 @@ from .transform import (
|
|
28
17
|
UniformColumnValueSetter,
|
29
18
|
)
|
30
19
|
|
31
|
-
# All blocks moved to deprecated_blocks or transform modules
|
32
|
-
|
33
20
|
__all__ = [
|
34
21
|
"BaseBlock",
|
35
22
|
"BlockRegistry",
|
@@ -40,15 +27,6 @@ __all__ = [
|
|
40
27
|
"RenameColumnsBlock",
|
41
28
|
"TextConcatBlock",
|
42
29
|
"UniformColumnValueSetter",
|
43
|
-
"CombineColumnsBlock", # Deprecated
|
44
|
-
"DuplicateColumns", # Deprecated
|
45
|
-
"FilterByValueBlock", # Deprecated
|
46
|
-
"FlattenColumnsBlock", # Deprecated
|
47
|
-
"RenameColumns", # Deprecated
|
48
|
-
"SamplePopulatorBlock", # Deprecated
|
49
|
-
"SelectorBlock", # Deprecated
|
50
|
-
"SetToMajorityValue", # Deprecated
|
51
|
-
"LLMBlock", # Deprecated
|
52
30
|
"LLMChatBlock",
|
53
31
|
"LLMParserBlock",
|
54
32
|
"TextParserBlock",
|
@@ -64,6 +64,25 @@ class RenameColumnsBlock(BaseBlock):
|
|
64
64
|
-------
|
65
65
|
Dataset
|
66
66
|
Dataset with renamed columns.
|
67
|
+
|
68
|
+
Raises
|
69
|
+
------
|
70
|
+
ValueError
|
71
|
+
If attempting to rename to a column name that already exists.
|
67
72
|
"""
|
73
|
+
# Check for column name collisions
|
74
|
+
# Strict validation: no target column name can be an existing column name
|
75
|
+
# This prevents chained/circular renames which can be confusing
|
76
|
+
existing_cols = set(samples.column_names)
|
77
|
+
target_cols = set(self.input_cols.values())
|
78
|
+
|
79
|
+
collision = target_cols & existing_cols
|
80
|
+
if collision:
|
81
|
+
raise ValueError(
|
82
|
+
f"Cannot rename to existing column names: {sorted(collision)}. "
|
83
|
+
"Target column names must not already exist in the dataset. "
|
84
|
+
"Chained renames are not supported."
|
85
|
+
)
|
86
|
+
|
68
87
|
# Rename columns using HuggingFace datasets method
|
69
88
|
return samples.rename_columns(self.input_cols)
|
sdg_hub/core/flow/base.py
CHANGED
@@ -41,7 +41,6 @@ from ..utils.time_estimator import estimate_execution_time
|
|
41
41
|
from ..utils.yaml_utils import save_flow_yaml
|
42
42
|
from .checkpointer import FlowCheckpointer
|
43
43
|
from .metadata import DatasetRequirements, FlowMetadata
|
44
|
-
from .migration import FlowMigration
|
45
44
|
from .validation import FlowValidator
|
46
45
|
|
47
46
|
logger = setup_logger(__name__)
|
@@ -73,8 +72,6 @@ class Flow(BaseModel):
|
|
73
72
|
model_config = ConfigDict(extra="forbid", arbitrary_types_allowed=True)
|
74
73
|
|
75
74
|
# Private attributes (not serialized)
|
76
|
-
_migrated_runtime_params: dict[str, dict[str, Any]] = {}
|
77
|
-
_llm_client: Any = None # Only used for backward compatibility with old YAMLs
|
78
75
|
_model_config_set: bool = False # Track if model configuration has been set
|
79
76
|
_block_metrics: list[dict[str, Any]] = PrivateAttr(
|
80
77
|
default_factory=list
|
@@ -113,16 +110,13 @@ class Flow(BaseModel):
|
|
113
110
|
return self
|
114
111
|
|
115
112
|
@classmethod
|
116
|
-
def from_yaml(cls, yaml_path: str, client: Any = None) -> "Flow":
|
113
|
+
def from_yaml(cls, yaml_path: str) -> "Flow":
|
117
114
|
"""Load flow from YAML configuration file.
|
118
115
|
|
119
116
|
Parameters
|
120
117
|
----------
|
121
118
|
yaml_path : str
|
122
119
|
Path to the YAML flow configuration file.
|
123
|
-
client : Any, optional
|
124
|
-
LLM client instance. Required for backward compatibility with old format YAMLs
|
125
|
-
that use deprecated LLMBlocks. Ignored for new format YAMLs.
|
126
120
|
|
127
121
|
Returns
|
128
122
|
-------
|
@@ -153,21 +147,6 @@ class Flow(BaseModel):
|
|
153
147
|
except yaml.YAMLError as exc:
|
154
148
|
raise FlowValidationError(f"Invalid YAML in {yaml_path}: {exc}") from exc
|
155
149
|
|
156
|
-
# Check if this is an old format flow and migrate if necessary
|
157
|
-
migrated_runtime_params = None
|
158
|
-
is_old_format = FlowMigration.is_old_format(flow_config)
|
159
|
-
if is_old_format:
|
160
|
-
logger.info(f"Detected old format flow, migrating: {yaml_path}")
|
161
|
-
if client is None:
|
162
|
-
logger.warning(
|
163
|
-
"Old format YAML detected but no client provided. LLMBlocks may fail."
|
164
|
-
)
|
165
|
-
flow_config, migrated_runtime_params = FlowMigration.migrate_to_new_format(
|
166
|
-
flow_config, yaml_path
|
167
|
-
)
|
168
|
-
# Save migrated config back to YAML to persist id
|
169
|
-
save_flow_yaml(yaml_path, flow_config, "migrated to new format")
|
170
|
-
|
171
150
|
# Validate YAML structure
|
172
151
|
validator = FlowValidator()
|
173
152
|
validation_errors = validator.validate_yaml_structure(flow_config)
|
@@ -194,19 +173,6 @@ class Flow(BaseModel):
|
|
194
173
|
|
195
174
|
for i, block_config in enumerate(block_configs):
|
196
175
|
try:
|
197
|
-
# Inject client for deprecated LLMBlocks if this is an old format flow
|
198
|
-
if (
|
199
|
-
is_old_format
|
200
|
-
and block_config.get("block_type") == "LLMBlock"
|
201
|
-
and client is not None
|
202
|
-
):
|
203
|
-
if "block_config" not in block_config:
|
204
|
-
block_config["block_config"] = {}
|
205
|
-
block_config["block_config"]["client"] = client
|
206
|
-
logger.debug(
|
207
|
-
f"Injected client for deprecated LLMBlock: {block_config['block_config'].get('block_name')}"
|
208
|
-
)
|
209
|
-
|
210
176
|
block = cls._create_block_from_config(block_config, yaml_dir)
|
211
177
|
blocks.append(block)
|
212
178
|
except Exception as exc:
|
@@ -228,12 +194,6 @@ class Flow(BaseModel):
|
|
228
194
|
)
|
229
195
|
else:
|
230
196
|
logger.debug(f"Flow already had id: {flow.metadata.id}")
|
231
|
-
# Store migrated runtime params and client for backward compatibility
|
232
|
-
if migrated_runtime_params:
|
233
|
-
flow._migrated_runtime_params = migrated_runtime_params
|
234
|
-
if is_old_format and client is not None:
|
235
|
-
flow._llm_client = client
|
236
|
-
|
237
197
|
# Check if this is a flow without LLM blocks
|
238
198
|
llm_blocks = flow._detect_llm_blocks()
|
239
199
|
if not llm_blocks:
|
@@ -484,12 +444,6 @@ class Flow(BaseModel):
|
|
484
444
|
self._block_metrics = []
|
485
445
|
run_start = time.perf_counter()
|
486
446
|
|
487
|
-
# Merge migrated runtime params with provided ones (provided ones take precedence)
|
488
|
-
merged_runtime_params = self._migrated_runtime_params.copy()
|
489
|
-
if runtime_params:
|
490
|
-
merged_runtime_params.update(runtime_params)
|
491
|
-
runtime_params = merged_runtime_params
|
492
|
-
|
493
447
|
# Execute flow with metrics capture, ensuring metrics are always displayed/saved
|
494
448
|
final_dataset = None
|
495
449
|
execution_successful = False
|
@@ -647,22 +601,8 @@ class Flow(BaseModel):
|
|
647
601
|
input_cols = set(current_dataset.column_names)
|
648
602
|
|
649
603
|
try:
|
650
|
-
#
|
651
|
-
|
652
|
-
hasattr(block, "__class__")
|
653
|
-
and hasattr(block.__class__, "__module__")
|
654
|
-
and "deprecated_blocks" in block.__class__.__module__
|
655
|
-
)
|
656
|
-
|
657
|
-
if is_deprecated_block:
|
658
|
-
exec_logger.debug(
|
659
|
-
f"Skipping validations for deprecated block: {block.block_name}"
|
660
|
-
)
|
661
|
-
# Call generate() directly to skip validations, but keep the runtime params
|
662
|
-
current_dataset = block.generate(current_dataset, **block_kwargs)
|
663
|
-
else:
|
664
|
-
# Execute block with validation and logging
|
665
|
-
current_dataset = block(current_dataset, **block_kwargs)
|
604
|
+
# Execute block with validation and logging
|
605
|
+
current_dataset = block(current_dataset, **block_kwargs)
|
666
606
|
|
667
607
|
# Validate output
|
668
608
|
if len(current_dataset) == 0:
|
@@ -724,9 +664,11 @@ class Flow(BaseModel):
|
|
724
664
|
return current_dataset
|
725
665
|
|
726
666
|
def _prepare_block_kwargs(
|
727
|
-
self, block: BaseBlock, runtime_params: dict[str, dict[str, Any]]
|
667
|
+
self, block: BaseBlock, runtime_params: Optional[dict[str, dict[str, Any]]]
|
728
668
|
) -> dict[str, Any]:
|
729
669
|
"""Prepare execution parameters for a block."""
|
670
|
+
if runtime_params is None:
|
671
|
+
return {}
|
730
672
|
return runtime_params.get(block.block_name, {})
|
731
673
|
|
732
674
|
def set_model_config(
|
@@ -1114,22 +1056,8 @@ class Flow(BaseModel):
|
|
1114
1056
|
if max_concurrency is not None:
|
1115
1057
|
block_kwargs["_flow_max_concurrency"] = max_concurrency
|
1116
1058
|
|
1117
|
-
#
|
1118
|
-
|
1119
|
-
hasattr(block, "__class__")
|
1120
|
-
and hasattr(block.__class__, "__module__")
|
1121
|
-
and "deprecated_blocks" in block.__class__.__module__
|
1122
|
-
)
|
1123
|
-
|
1124
|
-
if is_deprecated_block:
|
1125
|
-
logger.debug(
|
1126
|
-
f"Dry run: Skipping validations for deprecated block: {block.block_name}"
|
1127
|
-
)
|
1128
|
-
# Call generate() directly to skip validations, but keep the runtime params
|
1129
|
-
current_dataset = block.generate(current_dataset, **block_kwargs)
|
1130
|
-
else:
|
1131
|
-
# Execute block with validation and logging
|
1132
|
-
current_dataset = block(current_dataset, **block_kwargs)
|
1059
|
+
# Execute block with validation and logging
|
1060
|
+
current_dataset = block(current_dataset, **block_kwargs)
|
1133
1061
|
|
1134
1062
|
block_execution_time = (
|
1135
1063
|
time.perf_counter() - block_start_time
|
@@ -77,9 +77,13 @@ blocks:
|
|
77
77
|
- ''
|
78
78
|
- block_type: RenameColumnsBlock
|
79
79
|
block_config:
|
80
|
-
block_name:
|
80
|
+
block_name: rename_to_raw_document_column
|
81
81
|
input_cols:
|
82
82
|
document: raw_document
|
83
|
+
- block_type: RenameColumnsBlock
|
84
|
+
block_config:
|
85
|
+
block_name: rename_to_document_column
|
86
|
+
input_cols:
|
83
87
|
summary: document
|
84
88
|
- block_type: PromptBuilderBlock
|
85
89
|
block_config:
|
@@ -79,9 +79,13 @@ blocks:
|
|
79
79
|
- ''
|
80
80
|
- block_type: RenameColumnsBlock
|
81
81
|
block_config:
|
82
|
-
block_name:
|
82
|
+
block_name: rename_to_raw_document_column
|
83
83
|
input_cols:
|
84
84
|
document: raw_document
|
85
|
+
- block_type: RenameColumnsBlock
|
86
|
+
block_config:
|
87
|
+
block_name: rename_to_document_column
|
88
|
+
input_cols:
|
85
89
|
summary: document
|
86
90
|
- block_type: PromptBuilderBlock
|
87
91
|
block_config:
|
sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/flow.yaml
CHANGED
@@ -72,9 +72,13 @@ blocks:
|
|
72
72
|
parsing_pattern: '(?:^|\n)\s*\d+\.\s+(.*?)(?=\n\s*\d+\.\s+|\Z)'
|
73
73
|
- block_type: RenameColumnsBlock
|
74
74
|
block_config:
|
75
|
-
block_name:
|
75
|
+
block_name: rename_to_raw_document_column
|
76
76
|
input_cols:
|
77
77
|
document: raw_document
|
78
|
+
- block_type: RenameColumnsBlock
|
79
|
+
block_config:
|
80
|
+
block_name: rename_to_document_column
|
81
|
+
input_cols:
|
78
82
|
atomic_facts: document
|
79
83
|
- block_type: PromptBuilderBlock
|
80
84
|
block_config:
|
@@ -134,10 +134,15 @@ blocks:
|
|
134
134
|
input_cols: [summary_detailed, summary_extractive, summary_atomic_facts, base_document]
|
135
135
|
output_cols: [summary, dataset_type]
|
136
136
|
|
137
|
+
- block_type: RenameColumnsBlock
|
138
|
+
block_config:
|
139
|
+
block_name: rename_to_raw_document_column
|
140
|
+
input_cols: {document: raw_document}
|
141
|
+
|
137
142
|
- block_type: RenameColumnsBlock
|
138
143
|
block_config:
|
139
144
|
block_name: rename_to_document_column
|
140
|
-
input_cols: {
|
145
|
+
input_cols: {summary: document}
|
141
146
|
|
142
147
|
- block_type: PromptBuilderBlock
|
143
148
|
block_config:
|
sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/flow.yaml
CHANGED
@@ -135,10 +135,14 @@ blocks:
|
|
135
135
|
input_cols: [summary_detailed, summary_extractive, summary_atomic_facts, base_document]
|
136
136
|
output_cols: [summary, dataset_type]
|
137
137
|
|
138
|
+
- block_type: RenameColumnsBlock
|
139
|
+
block_config:
|
140
|
+
block_name: rename_to_raw_document_column
|
141
|
+
input_cols: {document: raw_document}
|
138
142
|
- block_type: RenameColumnsBlock
|
139
143
|
block_config:
|
140
144
|
block_name: rename_to_document_column
|
141
|
-
input_cols: {
|
145
|
+
input_cols: {summary: document}
|
142
146
|
|
143
147
|
- block_type: PromptBuilderBlock
|
144
148
|
block_config:
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: sdg_hub
|
3
|
-
Version: 0.4.2
|
3
|
+
Version: 0.5.0
|
4
4
|
Summary: Synthetic Data Generation
|
5
5
|
Author-email: Red Hat AI Innovation <abhandwa@redhat.com>
|
6
6
|
License: Apache-2.0
|
@@ -23,7 +23,7 @@ Requires-Python: >=3.10
|
|
23
23
|
Description-Content-Type: text/markdown
|
24
24
|
License-File: LICENSE
|
25
25
|
Requires-Dist: click<9.0.0,>=8.1.7
|
26
|
-
Requires-Dist: datasets
|
26
|
+
Requires-Dist: datasets>=4.0.0
|
27
27
|
Requires-Dist: httpx<1.0.0,>=0.25.0
|
28
28
|
Requires-Dist: jinja2
|
29
29
|
Requires-Dist: litellm<1.75.0,>=1.73.0
|
@@ -1,20 +1,10 @@
|
|
1
1
|
sdg_hub/__init__.py,sha256=TlkZT40-70urdcWLqv3kupaJj8s-SVgd2QyvlSFwb4A,510
|
2
|
-
sdg_hub/_version.py,sha256=
|
2
|
+
sdg_hub/_version.py,sha256=fvHpBU3KZKRinkriKdtAt3crenOyysELF-M9y3ozg3U,704
|
3
3
|
sdg_hub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
4
4
|
sdg_hub/core/__init__.py,sha256=e3BoejbqjYhasf9t__L4qE52lkD9EBjx4o--2kqKdro,460
|
5
|
-
sdg_hub/core/blocks/__init__.py,sha256=
|
5
|
+
sdg_hub/core/blocks/__init__.py,sha256=8Rn1SglH8V3jGmTD_cG-h7qk9ktAab2eaBdyk7RN_hY,865
|
6
6
|
sdg_hub/core/blocks/base.py,sha256=-SOdBpJwtRTMsrmCEuLjUBQMRCo_PLYlHEBRrz8sF9g,13031
|
7
7
|
sdg_hub/core/blocks/registry.py,sha256=FuEN_pnq-nSH1LguY3_oCubT6Kz3SuJjk3TcUpLT-lw,10695
|
8
|
-
sdg_hub/core/blocks/deprecated_blocks/__init__.py,sha256=RDu3MWFStDQko-TKkx8tGoB1UTatP_RSldZK43zHDvY,889
|
9
|
-
sdg_hub/core/blocks/deprecated_blocks/combine_columns.py,sha256=HCvpaYsAwgx1Dm0vIshcWsKoVsRT0KrmKp9j4oqtByc,2757
|
10
|
-
sdg_hub/core/blocks/deprecated_blocks/duplicate_columns.py,sha256=maCaaEs0EMMzt7L1xm7fAH3ylaFMHEkeC_dtOw3FrjU,2694
|
11
|
-
sdg_hub/core/blocks/deprecated_blocks/filter_by_value.py,sha256=-fuuMKj2g2MrijMBTd0PWtYBbf9anQ2UkYXHigCxxJI,3328
|
12
|
-
sdg_hub/core/blocks/deprecated_blocks/flatten_columns.py,sha256=IenCskrPEv09h2uT6aZKCQzaxgA_3kAzOeJSd-R_-EA,2839
|
13
|
-
sdg_hub/core/blocks/deprecated_blocks/llmblock.py,sha256=34lzC43BODpMk5AwlWA1ctdYPmN7cA6WL5vMXaI0P0Y,20385
|
14
|
-
sdg_hub/core/blocks/deprecated_blocks/rename_columns.py,sha256=thp-mHtkRmUw_nYKpldy_mLWR2AvC5YUhbqDETM6-T0,2620
|
15
|
-
sdg_hub/core/blocks/deprecated_blocks/sample_populator.py,sha256=UdueMApxOmPWaxxMrw7b1v74fKJBfqqRATEBqgmVtNw,1737
|
16
|
-
sdg_hub/core/blocks/deprecated_blocks/selector.py,sha256=nWecsVsW8DvBcqAF_LOqXmW-5MQ28uN3d1y6wkSy38c,2960
|
17
|
-
sdg_hub/core/blocks/deprecated_blocks/set_to_majority_value.py,sha256=44TQu-rK5isia-otMVB1zHd8D-wWmu3C8CI1NLtfY5s,2729
|
18
8
|
sdg_hub/core/blocks/filtering/__init__.py,sha256=isxSVSvDqkMjG8dQSl3Q2M4g5c1t9fTjBSA21icf-yA,275
|
19
9
|
sdg_hub/core/blocks/filtering/column_value_filter.py,sha256=2Z9j_CiiTn5mHZ9gfXU-itLXDmeXSh0UI0x1x7j-LQ0,6001
|
20
10
|
sdg_hub/core/blocks/llm/__init__.py,sha256=AyS0dd3pkPPXH5a9aj4mT5HsKjX2vjXfkmQc6rkFV4A,795
|
@@ -29,14 +19,13 @@ sdg_hub/core/blocks/transform/duplicate_columns.py,sha256=SaP7rIF4ZFEFFa50aU2xGN
|
|
29
19
|
sdg_hub/core/blocks/transform/index_based_mapper.py,sha256=XC_a7Skbd3mu7f4ra8fGWPxMwqUMSjJkQ7Ag7vflwJA,8235
|
30
20
|
sdg_hub/core/blocks/transform/json_structure_block.py,sha256=hm-0M0NAyUREgJRPyV1u-laorgX6MZ1o17E9rNBhN78,5010
|
31
21
|
sdg_hub/core/blocks/transform/melt_columns.py,sha256=vaYa5Taq6GhNZYWFL4uPK3-SfN2BsKEm-wvjd2EYYoI,4382
|
32
|
-
sdg_hub/core/blocks/transform/rename_columns.py,sha256=
|
22
|
+
sdg_hub/core/blocks/transform/rename_columns.py,sha256=W2hcDSJY6L73ZpElUhOML2sGLM9Y-v0gSo3xEF1LXDc,2749
|
33
23
|
sdg_hub/core/blocks/transform/text_concat.py,sha256=_-B__Hob1WwgwkILPIZvTnsDzuwtoX1hKviyzHlnnes,3149
|
34
24
|
sdg_hub/core/blocks/transform/uniform_col_val_setter.py,sha256=XnjiT29z3PzIPy8M-mmE2w-Miab6Ed5ahy32SaxTCTE,3263
|
35
25
|
sdg_hub/core/flow/__init__.py,sha256=0_m_htuZfPxk8xQ9IKfp0Pz-JRE4O7lYMUFrKyLNoLA,409
|
36
|
-
sdg_hub/core/flow/base.py,sha256=
|
26
|
+
sdg_hub/core/flow/base.py,sha256=64YJJujNRaSIbT1YKn9nAxij_hdJ9xRVH_uiUY1IUcI,55788
|
37
27
|
sdg_hub/core/flow/checkpointer.py,sha256=stm5ZtjjEiLk9ZkAAnoQQn5Y8Yl_d7qCsQLZTrCXR48,11867
|
38
28
|
sdg_hub/core/flow/metadata.py,sha256=cFrpJjWOaK87aCuRFyC3Pdf83oYU93mrmZEMdUnhsN8,10540
|
39
|
-
sdg_hub/core/flow/migration.py,sha256=6and-RBqV0t2gRipr1GiOOVnyBJdtyyjw1kO08Z--d4,7558
|
40
29
|
sdg_hub/core/flow/registry.py,sha256=N6KfX-L7QRkooznIFxDuhRZYuDA5g3N5zC-KRm2jVhk,12109
|
41
30
|
sdg_hub/core/flow/validation.py,sha256=pUJvgaUjLpKNwvW6djcqVOF-HShOjegEmGOnUnoX4BA,9722
|
42
31
|
sdg_hub/core/utils/__init__.py,sha256=KcT56JhobC5sBg0MKEMn5hc4OyKa9_Vnn45Mt_kS4jQ,610
|
@@ -55,14 +44,14 @@ sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/gener
|
|
55
44
|
sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_question_list.yaml,sha256=qHOgUNrQz2vjUjJiEHNGWxDDXwjJlP1kofTxeGgLyPI,1461
|
56
45
|
sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
57
46
|
sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/detailed_summary.yaml,sha256=Ik6gAml0O-jPq8jpXBAkURzYkQuFOnDZb4LDwjmfAiE,381
|
58
|
-
sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/flow.yaml,sha256=
|
47
|
+
sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/flow.yaml,sha256=cxNpPh60mcvzxfczMH8hw66Ql3S8O-cWCCDeauO736c,5649
|
59
48
|
sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/doc_direct_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
60
49
|
sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/doc_direct_qa/flow.yaml,sha256=smPWVUZRCt58EagWDmJVmTBQj8qMcjpzh-Q3GSuFrz0,4413
|
61
50
|
sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
62
51
|
sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/extractive_summary.yaml,sha256=SeapWoOx3fhN5SvWYuHss_9prLE8xSkOic7JkbDHSR0,4081
|
63
|
-
sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/flow.yaml,sha256=
|
52
|
+
sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/flow.yaml,sha256=7dVc0_g7Ex5SfdX57pqtk9gmH_lC6Cdm3HC-lg8OiXQ,5817
|
64
53
|
sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
65
|
-
sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/flow.yaml,sha256=
|
54
|
+
sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/flow.yaml,sha256=7X4N19TcyHUo7pNo3C6Zv3w6br7hjzEfgv06XUVDaQo,3330
|
66
55
|
sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/key_facts_summary.yaml,sha256=YKMX_CuvcThG_bdNCAIXdVBkMvB72I89RGq2ltSSgc8,3298
|
67
56
|
sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
68
57
|
sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -72,14 +61,14 @@ sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/ev
|
|
72
61
|
sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_question.yaml,sha256=zwzklXup6khRkR88avgrJTcjaMcV1wnbeYaML5oPuNs,1767
|
73
62
|
sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_relevancy.yaml,sha256=cA8igo7jMrRXaWW6k0of6KOp7YnxLtPj0fP4DbrmZNQ,3647
|
74
63
|
sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/extractive_summary.yaml,sha256=fcMV7LaCFZo4D29nwhGJXqFFuZMYVLo9XYjv8zcU6zs,364
|
75
|
-
sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml,sha256=
|
64
|
+
sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml,sha256=km0ggcmFsZJGc2TfyYLkzPTrHGmcOB-jBAHInqySisk,9176
|
76
65
|
sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/generate_questions_responses.yaml,sha256=yX8aLY8dJSDML9ZJhnj9RzPbN8tH2xfcM4Gc6xZuwqQ,2596
|
77
66
|
sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
78
67
|
sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
79
68
|
sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/atomic_facts_ja.yaml,sha256=OjPZaSCOSLxEWgW3pmNwF7mmLhGhFGTmKL_3rKdqeW4,2488
|
80
69
|
sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/detailed_summary_ja.yaml,sha256=nEy_RcotHGiiENrmUANpKkbIFsrARAeSwECrBeHi2so,391
|
81
70
|
sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/extractive_summary_ja.yaml,sha256=V90W0IeJQZTFThA8v0UOs3DtZbtU3BI9jkpChw1BULo,402
|
82
|
-
sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/flow.yaml,sha256=
|
71
|
+
sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/flow.yaml,sha256=Q6RusV-_HHMr5jlFNOP6UVuEf8d6btHENMOP3MnB3u0,9291
|
83
72
|
sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/generate_questions_responses_ja.yaml,sha256=96SQqXG7fmb-50SdX85sgVtrFcQ-oNKe_0BoQdZmY5g,2638
|
84
73
|
sdg_hub/flows/text_analysis/__init__.py,sha256=WStks4eM_KHNTVsHglcj8vFghmI0PH9P1hUrijBLbwc,125
|
85
74
|
sdg_hub/flows/text_analysis/structured_insights/__init__.py,sha256=_DT4NR05JD9CZoSWROPr2lC6se0VjSqQPZJJlEV79mk,274
|
@@ -88,8 +77,8 @@ sdg_hub/flows/text_analysis/structured_insights/extract_entities.yaml,sha256=Q_S
|
|
88
77
|
sdg_hub/flows/text_analysis/structured_insights/extract_keywords.yaml,sha256=_nPPMdHnxag_lYbhYUjGJGo-CvRwWvwdGX7cQhdZ1S0,847
|
89
78
|
sdg_hub/flows/text_analysis/structured_insights/flow.yaml,sha256=BBV18SdvuVTAESjwkJ7V1jbb-cSTBvNl3SCycd0oEQ4,4934
|
90
79
|
sdg_hub/flows/text_analysis/structured_insights/summarize.yaml,sha256=WXwQak1pF8e1OwnOoI1EHu8QB6iUNW89rfkTdi1Oq54,687
|
91
|
-
sdg_hub-0.
|
92
|
-
sdg_hub-0.
|
93
|
-
sdg_hub-0.
|
94
|
-
sdg_hub-0.
|
95
|
-
sdg_hub-0.
|
80
|
+
sdg_hub-0.5.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
81
|
+
sdg_hub-0.5.0.dist-info/METADATA,sha256=z4tCCtWlTBzu5DF1K44RtWjIs7ZNL6__2Aae7I0EfxQ,9775
|
82
|
+
sdg_hub-0.5.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
83
|
+
sdg_hub-0.5.0.dist-info/top_level.txt,sha256=TqI7d-HE1n6zkXFkU0nF3A1Ct0P0pBaqI675uFokhx4,8
|
84
|
+
sdg_hub-0.5.0.dist-info/RECORD,,
|
@@ -1,29 +0,0 @@
|
|
1
|
-
# SPDX-License-Identifier: Apache-2.0
|
2
|
-
"""Deprecated blocks for backwards compatibility.
|
3
|
-
|
4
|
-
This module contains deprecated block implementations that are maintained
|
5
|
-
for backwards compatibility. These blocks should not be used in new code.
|
6
|
-
"""
|
7
|
-
|
8
|
-
# Local
|
9
|
-
from .combine_columns import CombineColumnsBlock
|
10
|
-
from .duplicate_columns import DuplicateColumns
|
11
|
-
from .filter_by_value import FilterByValueBlock
|
12
|
-
from .flatten_columns import FlattenColumnsBlock
|
13
|
-
from .llmblock import LLMBlock
|
14
|
-
from .rename_columns import RenameColumns
|
15
|
-
from .sample_populator import SamplePopulatorBlock
|
16
|
-
from .selector import SelectorBlock
|
17
|
-
from .set_to_majority_value import SetToMajorityValue
|
18
|
-
|
19
|
-
__all__ = [
|
20
|
-
"CombineColumnsBlock",
|
21
|
-
"DuplicateColumns",
|
22
|
-
"FilterByValueBlock",
|
23
|
-
"FlattenColumnsBlock",
|
24
|
-
"LLMBlock",
|
25
|
-
"RenameColumns",
|
26
|
-
"SamplePopulatorBlock",
|
27
|
-
"SelectorBlock",
|
28
|
-
"SetToMajorityValue",
|
29
|
-
]
|
@@ -1,93 +0,0 @@
|
|
1
|
-
# SPDX-License-Identifier: Apache-2.0
|
2
|
-
"""DEPRECATED: CombineColumnsBlock for backward compatibility.
|
3
|
-
|
4
|
-
This module provides a deprecated wrapper for the old CombineColumnsBlock interface.
|
5
|
-
Use transform.CombineColumnsBlock instead.
|
6
|
-
"""
|
7
|
-
|
8
|
-
# Standard
|
9
|
-
from typing import Any
|
10
|
-
import warnings
|
11
|
-
|
12
|
-
# Third Party
|
13
|
-
from datasets import Dataset
|
14
|
-
|
15
|
-
# Local
|
16
|
-
from ...utils.logger_config import setup_logger
|
17
|
-
from ..base import BaseBlock
|
18
|
-
from ..registry import BlockRegistry
|
19
|
-
from ..transform.text_concat import TextConcatBlock
|
20
|
-
|
21
|
-
logger = setup_logger(__name__)
|
22
|
-
|
23
|
-
|
24
|
-
@BlockRegistry.register(
|
25
|
-
"CombineColumnsBlock",
|
26
|
-
"deprecated",
|
27
|
-
"DEPRECATED: Use TextConcatBlock instead. Combines multiple columns into a single column using a separator",
|
28
|
-
)
|
29
|
-
class CombineColumnsBlock(BaseBlock):
|
30
|
-
r"""DEPRECATED: Combine multiple columns into a single column using a separator.
|
31
|
-
|
32
|
-
.. deprecated::
|
33
|
-
Use `sdg_hub.blocks.transform.CombineColumnsBlock` instead.
|
34
|
-
This class will be removed in a future version.
|
35
|
-
|
36
|
-
This block concatenates values from multiple columns into a single output column,
|
37
|
-
using a specified separator between values.
|
38
|
-
|
39
|
-
Parameters
|
40
|
-
----------
|
41
|
-
block_name : str
|
42
|
-
Name of the block.
|
43
|
-
columns : List[str]
|
44
|
-
List of column names to combine.
|
45
|
-
output_col : str
|
46
|
-
Name of the column to store combined values.
|
47
|
-
separator : str, optional
|
48
|
-
String to use as separator between combined values, by default "\\n\\n".
|
49
|
-
**batch_kwargs : Dict[str, Any]
|
50
|
-
Additional keyword arguments for batch processing.
|
51
|
-
"""
|
52
|
-
|
53
|
-
def __init__(
|
54
|
-
self,
|
55
|
-
block_name: str,
|
56
|
-
columns: list[str],
|
57
|
-
output_col: str,
|
58
|
-
separator: str = "\n\n",
|
59
|
-
**batch_kwargs: dict[str, Any],
|
60
|
-
) -> None:
|
61
|
-
warnings.warn(
|
62
|
-
"CombineColumnsBlock is deprecated. Use sdg_hub.blocks.transform.TextConcatBlock instead.",
|
63
|
-
DeprecationWarning,
|
64
|
-
stacklevel=2,
|
65
|
-
)
|
66
|
-
|
67
|
-
# Initialize with dummy values for BaseBlock validation
|
68
|
-
super().__init__(
|
69
|
-
block_name=block_name, input_cols=columns, output_cols=[output_col]
|
70
|
-
)
|
71
|
-
|
72
|
-
# Create the new implementation
|
73
|
-
self._impl = TextConcatBlock(
|
74
|
-
block_name=block_name,
|
75
|
-
input_cols=columns,
|
76
|
-
output_cols=[output_col],
|
77
|
-
separator=separator,
|
78
|
-
)
|
79
|
-
|
80
|
-
def generate(self, samples: Dataset, **kwargs: Any) -> Dataset:
|
81
|
-
"""Generate a dataset with combined columns.
|
82
|
-
|
83
|
-
Parameters
|
84
|
-
----------
|
85
|
-
samples : Dataset
|
86
|
-
Input dataset to process.
|
87
|
-
|
88
|
-
Returns
|
89
|
-
-------
|
90
|
-
Dataset
|
91
|
-
Dataset with combined values stored in output column.
|
92
|
-
"""
|
93
|
-
return self._impl.generate(samples)
|
@@ -1,88 +0,0 @@
|
|
1
|
-
# SPDX-License-Identifier: Apache-2.0
|
2
|
-
"""Deprecated DuplicateColumns for backwards compatibility.
|
3
|
-
|
4
|
-
This module provides a deprecated wrapper around DuplicateColumnsBlock
|
5
|
-
to maintain backwards compatibility with existing code and configurations.
|
6
|
-
"""
|
7
|
-
|
8
|
-
# Standard
|
9
|
-
from typing import Any
|
10
|
-
import warnings
|
11
|
-
|
12
|
-
# Third Party
|
13
|
-
from datasets import Dataset
|
14
|
-
|
15
|
-
# Local
|
16
|
-
from ...utils.logger_config import setup_logger
|
17
|
-
from ..base import BaseBlock
|
18
|
-
from ..registry import BlockRegistry
|
19
|
-
from ..transform import DuplicateColumnsBlock
|
20
|
-
|
21
|
-
logger = setup_logger(__name__)
|
22
|
-
|
23
|
-
|
24
|
-
@BlockRegistry.register(
|
25
|
-
"DuplicateColumns",
|
26
|
-
"deprecated",
|
27
|
-
"DEPRECATED: Use DuplicateColumnsBlock instead. Duplicates existing columns with new names according to a mapping dictionary",
|
28
|
-
)
|
29
|
-
class DuplicateColumns(BaseBlock):
|
30
|
-
"""DEPRECATED: Block for duplicating existing columns with new names.
|
31
|
-
|
32
|
-
This block is deprecated and maintained only for backwards compatibility.
|
33
|
-
Please use DuplicateColumnsBlock instead.
|
34
|
-
|
35
|
-
This block creates copies of existing columns with new names as specified
|
36
|
-
in the columns mapping dictionary.
|
37
|
-
"""
|
38
|
-
|
39
|
-
def __init__(
|
40
|
-
self,
|
41
|
-
block_name: str,
|
42
|
-
columns_map: dict[str, str],
|
43
|
-
) -> None:
|
44
|
-
"""Initialize the deprecated DuplicateColumns.
|
45
|
-
|
46
|
-
Parameters
|
47
|
-
----------
|
48
|
-
block_name : str
|
49
|
-
Name of the block.
|
50
|
-
columns_map : Dict[str, str]
|
51
|
-
Dictionary mapping existing column names to new column names.
|
52
|
-
Keys are existing column names, values are new column names.
|
53
|
-
"""
|
54
|
-
# Issue deprecation warning
|
55
|
-
warnings.warn(
|
56
|
-
"DuplicateColumns is deprecated and will be removed in a future version. "
|
57
|
-
"Please use DuplicateColumnsBlock instead.",
|
58
|
-
DeprecationWarning,
|
59
|
-
stacklevel=2,
|
60
|
-
)
|
61
|
-
|
62
|
-
# Map old signature to new signature
|
63
|
-
super().__init__(
|
64
|
-
block_name=block_name,
|
65
|
-
input_cols=columns_map,
|
66
|
-
output_cols=list(columns_map.values()),
|
67
|
-
)
|
68
|
-
|
69
|
-
# Create the new block instance with mapped parameters
|
70
|
-
self._new_block = DuplicateColumnsBlock(
|
71
|
-
block_name=block_name,
|
72
|
-
input_cols=columns_map,
|
73
|
-
)
|
74
|
-
|
75
|
-
def generate(self, samples: Dataset, **kwargs: Any) -> Dataset:
|
76
|
-
"""Generate dataset with duplicated columns using the new DuplicateColumnsBlock.
|
77
|
-
|
78
|
-
Parameters
|
79
|
-
----------
|
80
|
-
samples : Dataset
|
81
|
-
The input dataset to duplicate columns from.
|
82
|
-
|
83
|
-
Returns
|
84
|
-
-------
|
85
|
-
Dataset
|
86
|
-
The dataset with additional duplicated columns.
|
87
|
-
"""
|
88
|
-
return self._new_block.generate(samples, **kwargs)
|