sdg-hub 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sdg_hub/_version.py +16 -3
- sdg_hub/core/blocks/deprecated_blocks/selector.py +1 -1
- sdg_hub/core/blocks/evaluation/evaluate_faithfulness_block.py +175 -416
- sdg_hub/core/blocks/evaluation/evaluate_relevancy_block.py +174 -415
- sdg_hub/core/blocks/evaluation/verify_question_block.py +180 -415
- sdg_hub/core/blocks/llm/__init__.py +2 -0
- sdg_hub/core/blocks/llm/client_manager.py +61 -24
- sdg_hub/core/blocks/llm/config.py +1 -0
- sdg_hub/core/blocks/llm/llm_chat_block.py +62 -7
- sdg_hub/core/blocks/llm/llm_chat_with_parsing_retry_block.py +653 -0
- sdg_hub/core/blocks/llm/text_parser_block.py +75 -30
- sdg_hub/core/blocks/registry.py +49 -35
- sdg_hub/core/blocks/transform/index_based_mapper.py +1 -1
- sdg_hub/core/flow/base.py +370 -20
- sdg_hub/core/flow/checkpointer.py +333 -0
- sdg_hub/core/flow/metadata.py +45 -0
- sdg_hub/core/flow/migration.py +12 -1
- sdg_hub/core/flow/registry.py +121 -58
- sdg_hub/core/flow/validation.py +12 -0
- sdg_hub/core/utils/__init__.py +2 -1
- sdg_hub/core/utils/datautils.py +81 -1
- sdg_hub/core/utils/flow_id_words.yaml +231 -0
- sdg_hub/core/utils/flow_identifier.py +94 -0
- sdg_hub/core/utils/yaml_utils.py +59 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +1 -7
- {sdg_hub-0.2.0.dist-info → sdg_hub-0.2.2.dist-info}/METADATA +59 -31
- {sdg_hub-0.2.0.dist-info → sdg_hub-0.2.2.dist-info}/RECORD +30 -25
- {sdg_hub-0.2.0.dist-info → sdg_hub-0.2.2.dist-info}/WHEEL +0 -0
- {sdg_hub-0.2.0.dist-info → sdg_hub-0.2.2.dist-info}/licenses/LICENSE +0 -0
- {sdg_hub-0.2.0.dist-info → sdg_hub-0.2.2.dist-info}/top_level.txt +0 -0
@@ -48,6 +48,9 @@ class TextParserBlock(BaseBlock):
|
|
48
48
|
Regex pattern for custom parsing.
|
49
49
|
parser_cleanup_tags : Optional[List[str]]
|
50
50
|
List of tags to clean from parsed output.
|
51
|
+
expand_lists : bool
|
52
|
+
Whether to expand list inputs into individual rows (True) or preserve lists (False).
|
53
|
+
Default is True for backward compatibility.
|
51
54
|
"""
|
52
55
|
|
53
56
|
start_tags: list[str] = Field(
|
@@ -62,6 +65,10 @@ class TextParserBlock(BaseBlock):
|
|
62
65
|
parser_cleanup_tags: Optional[list[str]] = Field(
|
63
66
|
default=None, description="List of tags to clean from parsed output"
|
64
67
|
)
|
68
|
+
expand_lists: bool = Field(
|
69
|
+
default=True,
|
70
|
+
description="Whether to expand list inputs into individual rows (True) or preserve lists (False). ",
|
71
|
+
)
|
65
72
|
|
66
73
|
@field_validator("start_tags", "end_tags", mode="before")
|
67
74
|
@classmethod
|
@@ -237,36 +244,74 @@ class TextParserBlock(BaseBlock):
|
|
237
244
|
logger.warning(f"Input column '{input_column}' contains empty list")
|
238
245
|
return []
|
239
246
|
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
247
|
+
if not self.expand_lists:
|
248
|
+
# When expand_lists=False, preserve the list structure
|
249
|
+
# Parse each response in the list and collect results as lists
|
250
|
+
all_parsed_outputs = {col: [] for col in self.output_cols}
|
251
|
+
valid_responses = 0
|
252
|
+
|
253
|
+
for i, response in enumerate(raw_output):
|
254
|
+
if not response or not isinstance(response, str):
|
255
|
+
logger.warning(
|
256
|
+
f"List item {i} in column '{input_column}' contains invalid data "
|
257
|
+
f"(empty or non-string): {type(response)}"
|
258
|
+
)
|
259
|
+
continue
|
260
|
+
|
261
|
+
parsed_outputs = self._parse(response)
|
262
|
+
|
263
|
+
if not parsed_outputs or not any(
|
264
|
+
len(value) > 0 for value in parsed_outputs.values()
|
265
|
+
):
|
266
|
+
logger.warning(
|
267
|
+
f"Failed to parse content from list item {i}. Raw output length: {len(response)}, "
|
268
|
+
f"parsing method: {'regex' if self.parsing_pattern else 'tags'}"
|
269
|
+
)
|
270
|
+
continue
|
271
|
+
|
272
|
+
valid_responses += 1
|
273
|
+
# Collect all parsed values for each column as lists
|
274
|
+
for col in self.output_cols:
|
275
|
+
all_parsed_outputs[col].extend(parsed_outputs.get(col, []))
|
276
|
+
|
277
|
+
if valid_responses == 0:
|
278
|
+
return []
|
279
|
+
|
280
|
+
# Return single row with lists as values
|
281
|
+
return [{**sample, **all_parsed_outputs}]
|
282
|
+
|
283
|
+
else:
|
284
|
+
# When expand_lists=True, use existing expanding behavior
|
285
|
+
all_results = []
|
286
|
+
for i, response in enumerate(raw_output):
|
287
|
+
if not response or not isinstance(response, str):
|
288
|
+
logger.warning(
|
289
|
+
f"List item {i} in column '{input_column}' contains invalid data "
|
290
|
+
f"(empty or non-string): {type(response)}"
|
291
|
+
)
|
292
|
+
continue
|
293
|
+
|
294
|
+
parsed_outputs = self._parse(response)
|
295
|
+
|
296
|
+
if not parsed_outputs or not any(
|
297
|
+
len(value) > 0 for value in parsed_outputs.values()
|
298
|
+
):
|
299
|
+
logger.warning(
|
300
|
+
f"Failed to parse content from list item {i}. Raw output length: {len(response)}, "
|
301
|
+
f"parsing method: {'regex' if self.parsing_pattern else 'tags'}"
|
302
|
+
)
|
303
|
+
continue
|
304
|
+
|
305
|
+
# Create output rows for this response
|
306
|
+
max_length = max(len(value) for value in parsed_outputs.values())
|
307
|
+
for values in zip(
|
308
|
+
*(lst[:max_length] for lst in parsed_outputs.values())
|
309
|
+
):
|
310
|
+
all_results.append(
|
311
|
+
{**sample, **dict(zip(parsed_outputs.keys(), values))}
|
312
|
+
)
|
313
|
+
|
314
|
+
return all_results
|
270
315
|
|
271
316
|
# Handle string inputs (existing logic)
|
272
317
|
elif isinstance(raw_output, str):
|
sdg_hub/core/blocks/registry.py
CHANGED
@@ -164,8 +164,10 @@ class BlockRegistry:
|
|
164
164
|
) from exc
|
165
165
|
|
166
166
|
@classmethod
|
167
|
-
def
|
168
|
-
"""
|
167
|
+
def _get(cls, block_name: str) -> type:
|
168
|
+
"""Internal method to get a block class with enhanced error handling.
|
169
|
+
|
170
|
+
This is a private method used by the framework internals (Flow system).
|
169
171
|
|
170
172
|
Parameters
|
171
173
|
----------
|
@@ -216,29 +218,6 @@ class BlockRegistry:
|
|
216
218
|
|
217
219
|
return metadata.block_class
|
218
220
|
|
219
|
-
@classmethod
|
220
|
-
def info(cls, block_name: str) -> BlockMetadata:
|
221
|
-
"""Get metadata for a specific block.
|
222
|
-
|
223
|
-
Parameters
|
224
|
-
----------
|
225
|
-
block_name : str
|
226
|
-
Name of the block.
|
227
|
-
|
228
|
-
Returns
|
229
|
-
-------
|
230
|
-
BlockMetadata
|
231
|
-
The block's metadata.
|
232
|
-
|
233
|
-
Raises
|
234
|
-
------
|
235
|
-
KeyError
|
236
|
-
If the block is not found.
|
237
|
-
"""
|
238
|
-
if block_name not in cls._metadata:
|
239
|
-
raise KeyError(f"Block '{block_name}' not found in registry.")
|
240
|
-
return cls._metadata[block_name]
|
241
|
-
|
242
221
|
@classmethod
|
243
222
|
def categories(cls) -> list[str]:
|
244
223
|
"""Get all available categories.
|
@@ -251,8 +230,8 @@ class BlockRegistry:
|
|
251
230
|
return sorted(cls._categories.keys())
|
252
231
|
|
253
232
|
@classmethod
|
254
|
-
def
|
255
|
-
"""Get all blocks in a specific category.
|
233
|
+
def _get_category_blocks(cls, category: str) -> list[str]:
|
234
|
+
"""Get all blocks in a specific category (private method).
|
256
235
|
|
257
236
|
Parameters
|
258
237
|
----------
|
@@ -278,20 +257,55 @@ class BlockRegistry:
|
|
278
257
|
return sorted(cls._categories[category])
|
279
258
|
|
280
259
|
@classmethod
|
281
|
-
def
|
282
|
-
|
260
|
+
def list_blocks(
|
261
|
+
cls,
|
262
|
+
category: Optional[str] = None,
|
263
|
+
*,
|
264
|
+
grouped: bool = False,
|
265
|
+
include_deprecated: bool = True,
|
266
|
+
) -> list[str] | dict[str, list[str]]:
|
267
|
+
"""
|
268
|
+
List registered blocks, optionally filtered by category.
|
269
|
+
|
270
|
+
Args:
|
271
|
+
category: If provided, return only blocks in this category.
|
272
|
+
grouped: If True (and category is None), return a dict
|
273
|
+
mapping categories to lists of blocks.
|
274
|
+
include_deprecated: If True, return deprecated blocks.
|
283
275
|
|
284
276
|
Returns
|
285
277
|
-------
|
286
|
-
Dict[str, List[str]]
|
287
|
-
|
278
|
+
List[str] | Dict[str, List[str]]
|
279
|
+
If grouped is False, returns a list of block names.
|
280
|
+
If grouped is True, returns a dict mapping categories to lists of block names.
|
288
281
|
"""
|
289
|
-
|
290
|
-
|
291
|
-
|
282
|
+
|
283
|
+
def filter_deprecated(block_names: list[str]) -> list[str]:
|
284
|
+
if include_deprecated:
|
285
|
+
return block_names
|
286
|
+
return [name for name in block_names if not cls._metadata[name].deprecated]
|
287
|
+
|
288
|
+
if category:
|
289
|
+
block_names = cls._get_category_blocks(category)
|
290
|
+
return filter_deprecated(block_names)
|
291
|
+
|
292
|
+
if grouped:
|
293
|
+
result = {}
|
294
|
+
for cat, blocks in cls._categories.items():
|
295
|
+
filtered = filter_deprecated(sorted(blocks))
|
296
|
+
if filtered:
|
297
|
+
result[cat] = filtered
|
298
|
+
return result
|
299
|
+
|
300
|
+
# Flat list of all block names (across all categories)
|
301
|
+
all_block_names = []
|
302
|
+
for blocks in cls._categories.values():
|
303
|
+
all_block_names.extend(blocks)
|
304
|
+
filtered = filter_deprecated(sorted(all_block_names))
|
305
|
+
return filtered
|
292
306
|
|
293
307
|
@classmethod
|
294
|
-
def
|
308
|
+
def discover_blocks(cls) -> None:
|
295
309
|
"""Print a Rich-formatted table of all available blocks."""
|
296
310
|
if not cls._metadata:
|
297
311
|
console.print("[yellow]No blocks registered yet.[/yellow]")
|
@@ -174,7 +174,7 @@ class IndexBasedMapperBlock(BaseBlock):
|
|
174
174
|
sample[output_col] = sample[source_col]
|
175
175
|
return sample
|
176
176
|
|
177
|
-
def generate(self, samples: Dataset) -> Dataset:
|
177
|
+
def generate(self, samples: Dataset, **kwargs) -> Dataset:
|
178
178
|
"""Generate a new dataset with selected values.
|
179
179
|
|
180
180
|
Parameters
|