sdg-hub 0.4.2__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25) hide show
  1. sdg_hub/_version.py +2 -2
  2. sdg_hub/core/blocks/__init__.py +0 -22
  3. sdg_hub/core/blocks/transform/rename_columns.py +19 -0
  4. sdg_hub/core/flow/base.py +8 -80
  5. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/flow.yaml +5 -1
  6. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/flow.yaml +5 -1
  7. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/flow.yaml +5 -1
  8. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +6 -1
  9. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/flow.yaml +5 -1
  10. {sdg_hub-0.4.2.dist-info → sdg_hub-0.5.0.dist-info}/METADATA +2 -2
  11. {sdg_hub-0.4.2.dist-info → sdg_hub-0.5.0.dist-info}/RECORD +14 -25
  12. sdg_hub/core/blocks/deprecated_blocks/__init__.py +0 -29
  13. sdg_hub/core/blocks/deprecated_blocks/combine_columns.py +0 -93
  14. sdg_hub/core/blocks/deprecated_blocks/duplicate_columns.py +0 -88
  15. sdg_hub/core/blocks/deprecated_blocks/filter_by_value.py +0 -103
  16. sdg_hub/core/blocks/deprecated_blocks/flatten_columns.py +0 -94
  17. sdg_hub/core/blocks/deprecated_blocks/llmblock.py +0 -479
  18. sdg_hub/core/blocks/deprecated_blocks/rename_columns.py +0 -88
  19. sdg_hub/core/blocks/deprecated_blocks/sample_populator.py +0 -58
  20. sdg_hub/core/blocks/deprecated_blocks/selector.py +0 -97
  21. sdg_hub/core/blocks/deprecated_blocks/set_to_majority_value.py +0 -88
  22. sdg_hub/core/flow/migration.py +0 -198
  23. {sdg_hub-0.4.2.dist-info → sdg_hub-0.5.0.dist-info}/WHEEL +0 -0
  24. {sdg_hub-0.4.2.dist-info → sdg_hub-0.5.0.dist-info}/licenses/LICENSE +0 -0
  25. {sdg_hub-0.4.2.dist-info → sdg_hub-0.5.0.dist-info}/top_level.txt +0 -0
sdg_hub/_version.py CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
28
28
  commit_id: COMMIT_ID
29
29
  __commit_id__: COMMIT_ID
30
30
 
31
- __version__ = version = '0.4.2'
32
- __version_tuple__ = version_tuple = (0, 4, 2)
31
+ __version__ = version = '0.5.0'
32
+ __version_tuple__ = version_tuple = (0, 5, 0)
33
33
 
34
34
  __commit_id__ = commit_id = None
@@ -5,17 +5,6 @@ This package provides various block implementations for data generation, process
5
5
 
6
6
  # Local
7
7
  from .base import BaseBlock
8
- from .deprecated_blocks import (
9
- CombineColumnsBlock,
10
- DuplicateColumns,
11
- FilterByValueBlock,
12
- FlattenColumnsBlock,
13
- LLMBlock,
14
- RenameColumns,
15
- SamplePopulatorBlock,
16
- SelectorBlock,
17
- SetToMajorityValue,
18
- )
19
8
  from .filtering import ColumnValueFilterBlock
20
9
  from .llm import LLMChatBlock, LLMParserBlock, PromptBuilderBlock, TextParserBlock
21
10
  from .registry import BlockRegistry
@@ -28,8 +17,6 @@ from .transform import (
28
17
  UniformColumnValueSetter,
29
18
  )
30
19
 
31
- # All blocks moved to deprecated_blocks or transform modules
32
-
33
20
  __all__ = [
34
21
  "BaseBlock",
35
22
  "BlockRegistry",
@@ -40,15 +27,6 @@ __all__ = [
40
27
  "RenameColumnsBlock",
41
28
  "TextConcatBlock",
42
29
  "UniformColumnValueSetter",
43
- "CombineColumnsBlock", # Deprecated
44
- "DuplicateColumns", # Deprecated
45
- "FilterByValueBlock", # Deprecated
46
- "FlattenColumnsBlock", # Deprecated
47
- "RenameColumns", # Deprecated
48
- "SamplePopulatorBlock", # Deprecated
49
- "SelectorBlock", # Deprecated
50
- "SetToMajorityValue", # Deprecated
51
- "LLMBlock", # Deprecated
52
30
  "LLMChatBlock",
53
31
  "LLMParserBlock",
54
32
  "TextParserBlock",
@@ -64,6 +64,25 @@ class RenameColumnsBlock(BaseBlock):
64
64
  -------
65
65
  Dataset
66
66
  Dataset with renamed columns.
67
+
68
+ Raises
69
+ ------
70
+ ValueError
71
+ If attempting to rename to a column name that already exists.
67
72
  """
73
+ # Check for column name collisions
74
+ # Strict validation: no target column name can be an existing column name
75
+ # This prevents chained/circular renames which can be confusing
76
+ existing_cols = set(samples.column_names)
77
+ target_cols = set(self.input_cols.values())
78
+
79
+ collision = target_cols & existing_cols
80
+ if collision:
81
+ raise ValueError(
82
+ f"Cannot rename to existing column names: {sorted(collision)}. "
83
+ "Target column names must not already exist in the dataset. "
84
+ "Chained renames are not supported."
85
+ )
86
+
68
87
  # Rename columns using HuggingFace datasets method
69
88
  return samples.rename_columns(self.input_cols)
sdg_hub/core/flow/base.py CHANGED
@@ -41,7 +41,6 @@ from ..utils.time_estimator import estimate_execution_time
41
41
  from ..utils.yaml_utils import save_flow_yaml
42
42
  from .checkpointer import FlowCheckpointer
43
43
  from .metadata import DatasetRequirements, FlowMetadata
44
- from .migration import FlowMigration
45
44
  from .validation import FlowValidator
46
45
 
47
46
  logger = setup_logger(__name__)
@@ -73,8 +72,6 @@ class Flow(BaseModel):
73
72
  model_config = ConfigDict(extra="forbid", arbitrary_types_allowed=True)
74
73
 
75
74
  # Private attributes (not serialized)
76
- _migrated_runtime_params: dict[str, dict[str, Any]] = {}
77
- _llm_client: Any = None # Only used for backward compatibility with old YAMLs
78
75
  _model_config_set: bool = False # Track if model configuration has been set
79
76
  _block_metrics: list[dict[str, Any]] = PrivateAttr(
80
77
  default_factory=list
@@ -113,16 +110,13 @@ class Flow(BaseModel):
113
110
  return self
114
111
 
115
112
  @classmethod
116
- def from_yaml(cls, yaml_path: str, client: Any = None) -> "Flow":
113
+ def from_yaml(cls, yaml_path: str) -> "Flow":
117
114
  """Load flow from YAML configuration file.
118
115
 
119
116
  Parameters
120
117
  ----------
121
118
  yaml_path : str
122
119
  Path to the YAML flow configuration file.
123
- client : Any, optional
124
- LLM client instance. Required for backward compatibility with old format YAMLs
125
- that use deprecated LLMBlocks. Ignored for new format YAMLs.
126
120
 
127
121
  Returns
128
122
  -------
@@ -153,21 +147,6 @@ class Flow(BaseModel):
153
147
  except yaml.YAMLError as exc:
154
148
  raise FlowValidationError(f"Invalid YAML in {yaml_path}: {exc}") from exc
155
149
 
156
- # Check if this is an old format flow and migrate if necessary
157
- migrated_runtime_params = None
158
- is_old_format = FlowMigration.is_old_format(flow_config)
159
- if is_old_format:
160
- logger.info(f"Detected old format flow, migrating: {yaml_path}")
161
- if client is None:
162
- logger.warning(
163
- "Old format YAML detected but no client provided. LLMBlocks may fail."
164
- )
165
- flow_config, migrated_runtime_params = FlowMigration.migrate_to_new_format(
166
- flow_config, yaml_path
167
- )
168
- # Save migrated config back to YAML to persist id
169
- save_flow_yaml(yaml_path, flow_config, "migrated to new format")
170
-
171
150
  # Validate YAML structure
172
151
  validator = FlowValidator()
173
152
  validation_errors = validator.validate_yaml_structure(flow_config)
@@ -194,19 +173,6 @@ class Flow(BaseModel):
194
173
 
195
174
  for i, block_config in enumerate(block_configs):
196
175
  try:
197
- # Inject client for deprecated LLMBlocks if this is an old format flow
198
- if (
199
- is_old_format
200
- and block_config.get("block_type") == "LLMBlock"
201
- and client is not None
202
- ):
203
- if "block_config" not in block_config:
204
- block_config["block_config"] = {}
205
- block_config["block_config"]["client"] = client
206
- logger.debug(
207
- f"Injected client for deprecated LLMBlock: {block_config['block_config'].get('block_name')}"
208
- )
209
-
210
176
  block = cls._create_block_from_config(block_config, yaml_dir)
211
177
  blocks.append(block)
212
178
  except Exception as exc:
@@ -228,12 +194,6 @@ class Flow(BaseModel):
228
194
  )
229
195
  else:
230
196
  logger.debug(f"Flow already had id: {flow.metadata.id}")
231
- # Store migrated runtime params and client for backward compatibility
232
- if migrated_runtime_params:
233
- flow._migrated_runtime_params = migrated_runtime_params
234
- if is_old_format and client is not None:
235
- flow._llm_client = client
236
-
237
197
  # Check if this is a flow without LLM blocks
238
198
  llm_blocks = flow._detect_llm_blocks()
239
199
  if not llm_blocks:
@@ -484,12 +444,6 @@ class Flow(BaseModel):
484
444
  self._block_metrics = []
485
445
  run_start = time.perf_counter()
486
446
 
487
- # Merge migrated runtime params with provided ones (provided ones take precedence)
488
- merged_runtime_params = self._migrated_runtime_params.copy()
489
- if runtime_params:
490
- merged_runtime_params.update(runtime_params)
491
- runtime_params = merged_runtime_params
492
-
493
447
  # Execute flow with metrics capture, ensuring metrics are always displayed/saved
494
448
  final_dataset = None
495
449
  execution_successful = False
@@ -647,22 +601,8 @@ class Flow(BaseModel):
647
601
  input_cols = set(current_dataset.column_names)
648
602
 
649
603
  try:
650
- # Check if this is a deprecated block and skip validations
651
- is_deprecated_block = (
652
- hasattr(block, "__class__")
653
- and hasattr(block.__class__, "__module__")
654
- and "deprecated_blocks" in block.__class__.__module__
655
- )
656
-
657
- if is_deprecated_block:
658
- exec_logger.debug(
659
- f"Skipping validations for deprecated block: {block.block_name}"
660
- )
661
- # Call generate() directly to skip validations, but keep the runtime params
662
- current_dataset = block.generate(current_dataset, **block_kwargs)
663
- else:
664
- # Execute block with validation and logging
665
- current_dataset = block(current_dataset, **block_kwargs)
604
+ # Execute block with validation and logging
605
+ current_dataset = block(current_dataset, **block_kwargs)
666
606
 
667
607
  # Validate output
668
608
  if len(current_dataset) == 0:
@@ -724,9 +664,11 @@ class Flow(BaseModel):
724
664
  return current_dataset
725
665
 
726
666
  def _prepare_block_kwargs(
727
- self, block: BaseBlock, runtime_params: dict[str, dict[str, Any]]
667
+ self, block: BaseBlock, runtime_params: Optional[dict[str, dict[str, Any]]]
728
668
  ) -> dict[str, Any]:
729
669
  """Prepare execution parameters for a block."""
670
+ if runtime_params is None:
671
+ return {}
730
672
  return runtime_params.get(block.block_name, {})
731
673
 
732
674
  def set_model_config(
@@ -1114,22 +1056,8 @@ class Flow(BaseModel):
1114
1056
  if max_concurrency is not None:
1115
1057
  block_kwargs["_flow_max_concurrency"] = max_concurrency
1116
1058
 
1117
- # Check if this is a deprecated block and skip validations
1118
- is_deprecated_block = (
1119
- hasattr(block, "__class__")
1120
- and hasattr(block.__class__, "__module__")
1121
- and "deprecated_blocks" in block.__class__.__module__
1122
- )
1123
-
1124
- if is_deprecated_block:
1125
- logger.debug(
1126
- f"Dry run: Skipping validations for deprecated block: {block.block_name}"
1127
- )
1128
- # Call generate() directly to skip validations, but keep the runtime params
1129
- current_dataset = block.generate(current_dataset, **block_kwargs)
1130
- else:
1131
- # Execute block with validation and logging
1132
- current_dataset = block(current_dataset, **block_kwargs)
1059
+ # Execute block with validation and logging
1060
+ current_dataset = block(current_dataset, **block_kwargs)
1133
1061
 
1134
1062
  block_execution_time = (
1135
1063
  time.perf_counter() - block_start_time
@@ -77,9 +77,13 @@ blocks:
77
77
  - ''
78
78
  - block_type: RenameColumnsBlock
79
79
  block_config:
80
- block_name: rename_to_document_column
80
+ block_name: rename_to_raw_document_column
81
81
  input_cols:
82
82
  document: raw_document
83
+ - block_type: RenameColumnsBlock
84
+ block_config:
85
+ block_name: rename_to_document_column
86
+ input_cols:
83
87
  summary: document
84
88
  - block_type: PromptBuilderBlock
85
89
  block_config:
@@ -79,9 +79,13 @@ blocks:
79
79
  - ''
80
80
  - block_type: RenameColumnsBlock
81
81
  block_config:
82
- block_name: rename_to_document_column
82
+ block_name: rename_to_raw_document_column
83
83
  input_cols:
84
84
  document: raw_document
85
+ - block_type: RenameColumnsBlock
86
+ block_config:
87
+ block_name: rename_to_document_column
88
+ input_cols:
85
89
  summary: document
86
90
  - block_type: PromptBuilderBlock
87
91
  block_config:
@@ -72,9 +72,13 @@ blocks:
72
72
  parsing_pattern: '(?:^|\n)\s*\d+\.\s+(.*?)(?=\n\s*\d+\.\s+|\Z)'
73
73
  - block_type: RenameColumnsBlock
74
74
  block_config:
75
- block_name: rename_to_document_column
75
+ block_name: rename_to_raw_document_column
76
76
  input_cols:
77
77
  document: raw_document
78
+ - block_type: RenameColumnsBlock
79
+ block_config:
80
+ block_name: rename_to_document_column
81
+ input_cols:
78
82
  atomic_facts: document
79
83
  - block_type: PromptBuilderBlock
80
84
  block_config:
@@ -134,10 +134,15 @@ blocks:
134
134
  input_cols: [summary_detailed, summary_extractive, summary_atomic_facts, base_document]
135
135
  output_cols: [summary, dataset_type]
136
136
 
137
+ - block_type: RenameColumnsBlock
138
+ block_config:
139
+ block_name: rename_to_raw_document_column
140
+ input_cols: {document: raw_document}
141
+
137
142
  - block_type: RenameColumnsBlock
138
143
  block_config:
139
144
  block_name: rename_to_document_column
140
- input_cols: {document: raw_document, summary: document}
145
+ input_cols: {summary: document}
141
146
 
142
147
  - block_type: PromptBuilderBlock
143
148
  block_config:
@@ -135,10 +135,14 @@ blocks:
135
135
  input_cols: [summary_detailed, summary_extractive, summary_atomic_facts, base_document]
136
136
  output_cols: [summary, dataset_type]
137
137
 
138
+ - block_type: RenameColumnsBlock
139
+ block_config:
140
+ block_name: rename_to_raw_document_column
141
+ input_cols: {document: raw_document}
138
142
  - block_type: RenameColumnsBlock
139
143
  block_config:
140
144
  block_name: rename_to_document_column
141
- input_cols: {document: raw_document, summary: document}
145
+ input_cols: {summary: document}
142
146
 
143
147
  - block_type: PromptBuilderBlock
144
148
  block_config:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sdg_hub
3
- Version: 0.4.2
3
+ Version: 0.5.0
4
4
  Summary: Synthetic Data Generation
5
5
  Author-email: Red Hat AI Innovation <abhandwa@redhat.com>
6
6
  License: Apache-2.0
@@ -23,7 +23,7 @@ Requires-Python: >=3.10
23
23
  Description-Content-Type: text/markdown
24
24
  License-File: LICENSE
25
25
  Requires-Dist: click<9.0.0,>=8.1.7
26
- Requires-Dist: datasets<4.0.0,>=2.18.0
26
+ Requires-Dist: datasets>=4.0.0
27
27
  Requires-Dist: httpx<1.0.0,>=0.25.0
28
28
  Requires-Dist: jinja2
29
29
  Requires-Dist: litellm<1.75.0,>=1.73.0
@@ -1,20 +1,10 @@
1
1
  sdg_hub/__init__.py,sha256=TlkZT40-70urdcWLqv3kupaJj8s-SVgd2QyvlSFwb4A,510
2
- sdg_hub/_version.py,sha256=A45grTqzrHuDn1CT9K5GVUbY4_Q3OSTcXAl3zdHzcEI,704
2
+ sdg_hub/_version.py,sha256=fvHpBU3KZKRinkriKdtAt3crenOyysELF-M9y3ozg3U,704
3
3
  sdg_hub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
4
  sdg_hub/core/__init__.py,sha256=e3BoejbqjYhasf9t__L4qE52lkD9EBjx4o--2kqKdro,460
5
- sdg_hub/core/blocks/__init__.py,sha256=5FsbkcO-dmBv6MqO96TPn9FKKPTQZQCv20j4wR7UvQw,1502
5
+ sdg_hub/core/blocks/__init__.py,sha256=8Rn1SglH8V3jGmTD_cG-h7qk9ktAab2eaBdyk7RN_hY,865
6
6
  sdg_hub/core/blocks/base.py,sha256=-SOdBpJwtRTMsrmCEuLjUBQMRCo_PLYlHEBRrz8sF9g,13031
7
7
  sdg_hub/core/blocks/registry.py,sha256=FuEN_pnq-nSH1LguY3_oCubT6Kz3SuJjk3TcUpLT-lw,10695
8
- sdg_hub/core/blocks/deprecated_blocks/__init__.py,sha256=RDu3MWFStDQko-TKkx8tGoB1UTatP_RSldZK43zHDvY,889
9
- sdg_hub/core/blocks/deprecated_blocks/combine_columns.py,sha256=HCvpaYsAwgx1Dm0vIshcWsKoVsRT0KrmKp9j4oqtByc,2757
10
- sdg_hub/core/blocks/deprecated_blocks/duplicate_columns.py,sha256=maCaaEs0EMMzt7L1xm7fAH3ylaFMHEkeC_dtOw3FrjU,2694
11
- sdg_hub/core/blocks/deprecated_blocks/filter_by_value.py,sha256=-fuuMKj2g2MrijMBTd0PWtYBbf9anQ2UkYXHigCxxJI,3328
12
- sdg_hub/core/blocks/deprecated_blocks/flatten_columns.py,sha256=IenCskrPEv09h2uT6aZKCQzaxgA_3kAzOeJSd-R_-EA,2839
13
- sdg_hub/core/blocks/deprecated_blocks/llmblock.py,sha256=34lzC43BODpMk5AwlWA1ctdYPmN7cA6WL5vMXaI0P0Y,20385
14
- sdg_hub/core/blocks/deprecated_blocks/rename_columns.py,sha256=thp-mHtkRmUw_nYKpldy_mLWR2AvC5YUhbqDETM6-T0,2620
15
- sdg_hub/core/blocks/deprecated_blocks/sample_populator.py,sha256=UdueMApxOmPWaxxMrw7b1v74fKJBfqqRATEBqgmVtNw,1737
16
- sdg_hub/core/blocks/deprecated_blocks/selector.py,sha256=nWecsVsW8DvBcqAF_LOqXmW-5MQ28uN3d1y6wkSy38c,2960
17
- sdg_hub/core/blocks/deprecated_blocks/set_to_majority_value.py,sha256=44TQu-rK5isia-otMVB1zHd8D-wWmu3C8CI1NLtfY5s,2729
18
8
  sdg_hub/core/blocks/filtering/__init__.py,sha256=isxSVSvDqkMjG8dQSl3Q2M4g5c1t9fTjBSA21icf-yA,275
19
9
  sdg_hub/core/blocks/filtering/column_value_filter.py,sha256=2Z9j_CiiTn5mHZ9gfXU-itLXDmeXSh0UI0x1x7j-LQ0,6001
20
10
  sdg_hub/core/blocks/llm/__init__.py,sha256=AyS0dd3pkPPXH5a9aj4mT5HsKjX2vjXfkmQc6rkFV4A,795
@@ -29,14 +19,13 @@ sdg_hub/core/blocks/transform/duplicate_columns.py,sha256=SaP7rIF4ZFEFFa50aU2xGN
29
19
  sdg_hub/core/blocks/transform/index_based_mapper.py,sha256=XC_a7Skbd3mu7f4ra8fGWPxMwqUMSjJkQ7Ag7vflwJA,8235
30
20
  sdg_hub/core/blocks/transform/json_structure_block.py,sha256=hm-0M0NAyUREgJRPyV1u-laorgX6MZ1o17E9rNBhN78,5010
31
21
  sdg_hub/core/blocks/transform/melt_columns.py,sha256=vaYa5Taq6GhNZYWFL4uPK3-SfN2BsKEm-wvjd2EYYoI,4382
32
- sdg_hub/core/blocks/transform/rename_columns.py,sha256=qeB5L2utqDQnutUetH1VKZSqDiJSH_yUp5EFCV-XCVI,1998
22
+ sdg_hub/core/blocks/transform/rename_columns.py,sha256=W2hcDSJY6L73ZpElUhOML2sGLM9Y-v0gSo3xEF1LXDc,2749
33
23
  sdg_hub/core/blocks/transform/text_concat.py,sha256=_-B__Hob1WwgwkILPIZvTnsDzuwtoX1hKviyzHlnnes,3149
34
24
  sdg_hub/core/blocks/transform/uniform_col_val_setter.py,sha256=XnjiT29z3PzIPy8M-mmE2w-Miab6Ed5ahy32SaxTCTE,3263
35
25
  sdg_hub/core/flow/__init__.py,sha256=0_m_htuZfPxk8xQ9IKfp0Pz-JRE4O7lYMUFrKyLNoLA,409
36
- sdg_hub/core/flow/base.py,sha256=4kR-dKXAlLFSwm3YWdT8EoedCIGJT56agcot3tQb6VY,59508
26
+ sdg_hub/core/flow/base.py,sha256=64YJJujNRaSIbT1YKn9nAxij_hdJ9xRVH_uiUY1IUcI,55788
37
27
  sdg_hub/core/flow/checkpointer.py,sha256=stm5ZtjjEiLk9ZkAAnoQQn5Y8Yl_d7qCsQLZTrCXR48,11867
38
28
  sdg_hub/core/flow/metadata.py,sha256=cFrpJjWOaK87aCuRFyC3Pdf83oYU93mrmZEMdUnhsN8,10540
39
- sdg_hub/core/flow/migration.py,sha256=6and-RBqV0t2gRipr1GiOOVnyBJdtyyjw1kO08Z--d4,7558
40
29
  sdg_hub/core/flow/registry.py,sha256=N6KfX-L7QRkooznIFxDuhRZYuDA5g3N5zC-KRm2jVhk,12109
41
30
  sdg_hub/core/flow/validation.py,sha256=pUJvgaUjLpKNwvW6djcqVOF-HShOjegEmGOnUnoX4BA,9722
42
31
  sdg_hub/core/utils/__init__.py,sha256=KcT56JhobC5sBg0MKEMn5hc4OyKa9_Vnn45Mt_kS4jQ,610
@@ -55,14 +44,14 @@ sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/gener
55
44
  sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_question_list.yaml,sha256=qHOgUNrQz2vjUjJiEHNGWxDDXwjJlP1kofTxeGgLyPI,1461
56
45
  sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
57
46
  sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/detailed_summary.yaml,sha256=Ik6gAml0O-jPq8jpXBAkURzYkQuFOnDZb4LDwjmfAiE,381
58
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/flow.yaml,sha256=fUdzY9dtU69o99Uq8FIPycgVWdLD-1kbY97Bh-Vo2A0,5538
47
+ sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/flow.yaml,sha256=cxNpPh60mcvzxfczMH8hw66Ql3S8O-cWCCDeauO736c,5649
59
48
  sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/doc_direct_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
60
49
  sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/doc_direct_qa/flow.yaml,sha256=smPWVUZRCt58EagWDmJVmTBQj8qMcjpzh-Q3GSuFrz0,4413
61
50
  sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
62
51
  sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/extractive_summary.yaml,sha256=SeapWoOx3fhN5SvWYuHss_9prLE8xSkOic7JkbDHSR0,4081
63
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/flow.yaml,sha256=iNNIfofFE7awK7iivtIFWxjfjy8QviMugOPPnOTySKA,5706
52
+ sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/flow.yaml,sha256=7dVc0_g7Ex5SfdX57pqtk9gmH_lC6Cdm3HC-lg8OiXQ,5817
64
53
  sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
65
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/flow.yaml,sha256=CIUZNYhvszT-jpz1Hvh6nS2y5W34P529ZOMp8thEQ9k,3219
54
+ sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/flow.yaml,sha256=7X4N19TcyHUo7pNo3C6Zv3w6br7hjzEfgv06XUVDaQo,3330
66
55
  sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/key_facts_summary.yaml,sha256=YKMX_CuvcThG_bdNCAIXdVBkMvB72I89RGq2ltSSgc8,3298
67
56
  sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
68
57
  sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -72,14 +61,14 @@ sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/ev
72
61
  sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_question.yaml,sha256=zwzklXup6khRkR88avgrJTcjaMcV1wnbeYaML5oPuNs,1767
73
62
  sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_relevancy.yaml,sha256=cA8igo7jMrRXaWW6k0of6KOp7YnxLtPj0fP4DbrmZNQ,3647
74
63
  sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/extractive_summary.yaml,sha256=fcMV7LaCFZo4D29nwhGJXqFFuZMYVLo9XYjv8zcU6zs,364
75
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml,sha256=HR8sf7RUZKr8UqKztBj-nlvyrve1UMUu8x8qgYM6O14,9055
64
+ sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml,sha256=km0ggcmFsZJGc2TfyYLkzPTrHGmcOB-jBAHInqySisk,9176
76
65
  sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/generate_questions_responses.yaml,sha256=yX8aLY8dJSDML9ZJhnj9RzPbN8tH2xfcM4Gc6xZuwqQ,2596
77
66
  sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
78
67
  sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
79
68
  sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/atomic_facts_ja.yaml,sha256=OjPZaSCOSLxEWgW3pmNwF7mmLhGhFGTmKL_3rKdqeW4,2488
80
69
  sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/detailed_summary_ja.yaml,sha256=nEy_RcotHGiiENrmUANpKkbIFsrARAeSwECrBeHi2so,391
81
70
  sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/extractive_summary_ja.yaml,sha256=V90W0IeJQZTFThA8v0UOs3DtZbtU3BI9jkpChw1BULo,402
82
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/flow.yaml,sha256=jumjKmKshSd8hoTYpyBJ0nMOADeQmxBmNPY7yfa_xQ8,9171
71
+ sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/flow.yaml,sha256=Q6RusV-_HHMr5jlFNOP6UVuEf8d6btHENMOP3MnB3u0,9291
83
72
  sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/generate_questions_responses_ja.yaml,sha256=96SQqXG7fmb-50SdX85sgVtrFcQ-oNKe_0BoQdZmY5g,2638
84
73
  sdg_hub/flows/text_analysis/__init__.py,sha256=WStks4eM_KHNTVsHglcj8vFghmI0PH9P1hUrijBLbwc,125
85
74
  sdg_hub/flows/text_analysis/structured_insights/__init__.py,sha256=_DT4NR05JD9CZoSWROPr2lC6se0VjSqQPZJJlEV79mk,274
@@ -88,8 +77,8 @@ sdg_hub/flows/text_analysis/structured_insights/extract_entities.yaml,sha256=Q_S
88
77
  sdg_hub/flows/text_analysis/structured_insights/extract_keywords.yaml,sha256=_nPPMdHnxag_lYbhYUjGJGo-CvRwWvwdGX7cQhdZ1S0,847
89
78
  sdg_hub/flows/text_analysis/structured_insights/flow.yaml,sha256=BBV18SdvuVTAESjwkJ7V1jbb-cSTBvNl3SCycd0oEQ4,4934
90
79
  sdg_hub/flows/text_analysis/structured_insights/summarize.yaml,sha256=WXwQak1pF8e1OwnOoI1EHu8QB6iUNW89rfkTdi1Oq54,687
91
- sdg_hub-0.4.2.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
92
- sdg_hub-0.4.2.dist-info/METADATA,sha256=5qbw9_DoVmfntmQlvz4VPdQXdUXoLO8Zhrxbc1uY7b0,9783
93
- sdg_hub-0.4.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
94
- sdg_hub-0.4.2.dist-info/top_level.txt,sha256=TqI7d-HE1n6zkXFkU0nF3A1Ct0P0pBaqI675uFokhx4,8
95
- sdg_hub-0.4.2.dist-info/RECORD,,
80
+ sdg_hub-0.5.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
81
+ sdg_hub-0.5.0.dist-info/METADATA,sha256=z4tCCtWlTBzu5DF1K44RtWjIs7ZNL6__2Aae7I0EfxQ,9775
82
+ sdg_hub-0.5.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
83
+ sdg_hub-0.5.0.dist-info/top_level.txt,sha256=TqI7d-HE1n6zkXFkU0nF3A1Ct0P0pBaqI675uFokhx4,8
84
+ sdg_hub-0.5.0.dist-info/RECORD,,
@@ -1,29 +0,0 @@
1
- # SPDX-License-Identifier: Apache-2.0
2
- """Deprecated blocks for backwards compatibility.
3
-
4
- This module contains deprecated block implementations that are maintained
5
- for backwards compatibility. These blocks should not be used in new code.
6
- """
7
-
8
- # Local
9
- from .combine_columns import CombineColumnsBlock
10
- from .duplicate_columns import DuplicateColumns
11
- from .filter_by_value import FilterByValueBlock
12
- from .flatten_columns import FlattenColumnsBlock
13
- from .llmblock import LLMBlock
14
- from .rename_columns import RenameColumns
15
- from .sample_populator import SamplePopulatorBlock
16
- from .selector import SelectorBlock
17
- from .set_to_majority_value import SetToMajorityValue
18
-
19
- __all__ = [
20
- "CombineColumnsBlock",
21
- "DuplicateColumns",
22
- "FilterByValueBlock",
23
- "FlattenColumnsBlock",
24
- "LLMBlock",
25
- "RenameColumns",
26
- "SamplePopulatorBlock",
27
- "SelectorBlock",
28
- "SetToMajorityValue",
29
- ]
@@ -1,93 +0,0 @@
1
- # SPDX-License-Identifier: Apache-2.0
2
- """DEPRECATED: CombineColumnsBlock for backward compatibility.
3
-
4
- This module provides a deprecated wrapper for the old CombineColumnsBlock interface.
5
- Use transform.CombineColumnsBlock instead.
6
- """
7
-
8
- # Standard
9
- from typing import Any
10
- import warnings
11
-
12
- # Third Party
13
- from datasets import Dataset
14
-
15
- # Local
16
- from ...utils.logger_config import setup_logger
17
- from ..base import BaseBlock
18
- from ..registry import BlockRegistry
19
- from ..transform.text_concat import TextConcatBlock
20
-
21
- logger = setup_logger(__name__)
22
-
23
-
24
- @BlockRegistry.register(
25
- "CombineColumnsBlock",
26
- "deprecated",
27
- "DEPRECATED: Use TextConcatBlock instead. Combines multiple columns into a single column using a separator",
28
- )
29
- class CombineColumnsBlock(BaseBlock):
30
- r"""DEPRECATED: Combine multiple columns into a single column using a separator.
31
-
32
- .. deprecated::
33
- Use `sdg_hub.blocks.transform.CombineColumnsBlock` instead.
34
- This class will be removed in a future version.
35
-
36
- This block concatenates values from multiple columns into a single output column,
37
- using a specified separator between values.
38
-
39
- Parameters
40
- ----------
41
- block_name : str
42
- Name of the block.
43
- columns : List[str]
44
- List of column names to combine.
45
- output_col : str
46
- Name of the column to store combined values.
47
- separator : str, optional
48
- String to use as separator between combined values, by default "\\n\\n".
49
- **batch_kwargs : Dict[str, Any]
50
- Additional keyword arguments for batch processing.
51
- """
52
-
53
- def __init__(
54
- self,
55
- block_name: str,
56
- columns: list[str],
57
- output_col: str,
58
- separator: str = "\n\n",
59
- **batch_kwargs: dict[str, Any],
60
- ) -> None:
61
- warnings.warn(
62
- "CombineColumnsBlock is deprecated. Use sdg_hub.blocks.transform.TextConcatBlock instead.",
63
- DeprecationWarning,
64
- stacklevel=2,
65
- )
66
-
67
- # Initialize with dummy values for BaseBlock validation
68
- super().__init__(
69
- block_name=block_name, input_cols=columns, output_cols=[output_col]
70
- )
71
-
72
- # Create the new implementation
73
- self._impl = TextConcatBlock(
74
- block_name=block_name,
75
- input_cols=columns,
76
- output_cols=[output_col],
77
- separator=separator,
78
- )
79
-
80
- def generate(self, samples: Dataset, **kwargs: Any) -> Dataset:
81
- """Generate a dataset with combined columns.
82
-
83
- Parameters
84
- ----------
85
- samples : Dataset
86
- Input dataset to process.
87
-
88
- Returns
89
- -------
90
- Dataset
91
- Dataset with combined values stored in output column.
92
- """
93
- return self._impl.generate(samples)
@@ -1,88 +0,0 @@
1
- # SPDX-License-Identifier: Apache-2.0
2
- """Deprecated DuplicateColumns for backwards compatibility.
3
-
4
- This module provides a deprecated wrapper around DuplicateColumnsBlock
5
- to maintain backwards compatibility with existing code and configurations.
6
- """
7
-
8
- # Standard
9
- from typing import Any
10
- import warnings
11
-
12
- # Third Party
13
- from datasets import Dataset
14
-
15
- # Local
16
- from ...utils.logger_config import setup_logger
17
- from ..base import BaseBlock
18
- from ..registry import BlockRegistry
19
- from ..transform import DuplicateColumnsBlock
20
-
21
- logger = setup_logger(__name__)
22
-
23
-
24
- @BlockRegistry.register(
25
- "DuplicateColumns",
26
- "deprecated",
27
- "DEPRECATED: Use DuplicateColumnsBlock instead. Duplicates existing columns with new names according to a mapping dictionary",
28
- )
29
- class DuplicateColumns(BaseBlock):
30
- """DEPRECATED: Block for duplicating existing columns with new names.
31
-
32
- This block is deprecated and maintained only for backwards compatibility.
33
- Please use DuplicateColumnsBlock instead.
34
-
35
- This block creates copies of existing columns with new names as specified
36
- in the columns mapping dictionary.
37
- """
38
-
39
- def __init__(
40
- self,
41
- block_name: str,
42
- columns_map: dict[str, str],
43
- ) -> None:
44
- """Initialize the deprecated DuplicateColumns.
45
-
46
- Parameters
47
- ----------
48
- block_name : str
49
- Name of the block.
50
- columns_map : Dict[str, str]
51
- Dictionary mapping existing column names to new column names.
52
- Keys are existing column names, values are new column names.
53
- """
54
- # Issue deprecation warning
55
- warnings.warn(
56
- "DuplicateColumns is deprecated and will be removed in a future version. "
57
- "Please use DuplicateColumnsBlock instead.",
58
- DeprecationWarning,
59
- stacklevel=2,
60
- )
61
-
62
- # Map old signature to new signature
63
- super().__init__(
64
- block_name=block_name,
65
- input_cols=columns_map,
66
- output_cols=list(columns_map.values()),
67
- )
68
-
69
- # Create the new block instance with mapped parameters
70
- self._new_block = DuplicateColumnsBlock(
71
- block_name=block_name,
72
- input_cols=columns_map,
73
- )
74
-
75
- def generate(self, samples: Dataset, **kwargs: Any) -> Dataset:
76
- """Generate dataset with duplicated columns using the new DuplicateColumnsBlock.
77
-
78
- Parameters
79
- ----------
80
- samples : Dataset
81
- The input dataset to duplicate columns from.
82
-
83
- Returns
84
- -------
85
- Dataset
86
- The dataset with additional duplicated columns.
87
- """
88
- return self._new_block.generate(samples, **kwargs)