sdg-hub 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. sdg_hub/_version.py +16 -3
  2. sdg_hub/core/blocks/deprecated_blocks/selector.py +1 -1
  3. sdg_hub/core/blocks/evaluation/evaluate_faithfulness_block.py +175 -416
  4. sdg_hub/core/blocks/evaluation/evaluate_relevancy_block.py +174 -415
  5. sdg_hub/core/blocks/evaluation/verify_question_block.py +180 -415
  6. sdg_hub/core/blocks/llm/__init__.py +2 -0
  7. sdg_hub/core/blocks/llm/client_manager.py +61 -24
  8. sdg_hub/core/blocks/llm/config.py +1 -0
  9. sdg_hub/core/blocks/llm/llm_chat_block.py +62 -7
  10. sdg_hub/core/blocks/llm/llm_chat_with_parsing_retry_block.py +653 -0
  11. sdg_hub/core/blocks/llm/text_parser_block.py +75 -30
  12. sdg_hub/core/blocks/registry.py +49 -35
  13. sdg_hub/core/blocks/transform/index_based_mapper.py +1 -1
  14. sdg_hub/core/flow/base.py +370 -20
  15. sdg_hub/core/flow/checkpointer.py +333 -0
  16. sdg_hub/core/flow/metadata.py +45 -0
  17. sdg_hub/core/flow/migration.py +12 -1
  18. sdg_hub/core/flow/registry.py +121 -58
  19. sdg_hub/core/flow/validation.py +12 -0
  20. sdg_hub/core/utils/__init__.py +2 -1
  21. sdg_hub/core/utils/datautils.py +81 -1
  22. sdg_hub/core/utils/flow_id_words.yaml +231 -0
  23. sdg_hub/core/utils/flow_identifier.py +94 -0
  24. sdg_hub/core/utils/yaml_utils.py +59 -0
  25. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +1 -7
  26. {sdg_hub-0.2.0.dist-info → sdg_hub-0.2.2.dist-info}/METADATA +59 -31
  27. {sdg_hub-0.2.0.dist-info → sdg_hub-0.2.2.dist-info}/RECORD +30 -25
  28. {sdg_hub-0.2.0.dist-info → sdg_hub-0.2.2.dist-info}/WHEEL +0 -0
  29. {sdg_hub-0.2.0.dist-info → sdg_hub-0.2.2.dist-info}/licenses/LICENSE +0 -0
  30. {sdg_hub-0.2.0.dist-info → sdg_hub-0.2.2.dist-info}/top_level.txt +0 -0
@@ -48,6 +48,9 @@ class TextParserBlock(BaseBlock):
48
48
  Regex pattern for custom parsing.
49
49
  parser_cleanup_tags : Optional[List[str]]
50
50
  List of tags to clean from parsed output.
51
+ expand_lists : bool
52
+ Whether to expand list inputs into individual rows (True) or preserve lists (False).
53
+ Default is True for backward compatibility.
51
54
  """
52
55
 
53
56
  start_tags: list[str] = Field(
@@ -62,6 +65,10 @@ class TextParserBlock(BaseBlock):
62
65
  parser_cleanup_tags: Optional[list[str]] = Field(
63
66
  default=None, description="List of tags to clean from parsed output"
64
67
  )
68
+ expand_lists: bool = Field(
69
+ default=True,
70
+ description="Whether to expand list inputs into individual rows (True) or preserve lists (False). ",
71
+ )
65
72
 
66
73
  @field_validator("start_tags", "end_tags", mode="before")
67
74
  @classmethod
@@ -237,36 +244,74 @@ class TextParserBlock(BaseBlock):
237
244
  logger.warning(f"Input column '{input_column}' contains empty list")
238
245
  return []
239
246
 
240
- all_results = []
241
- for i, response in enumerate(raw_output):
242
- if not response or not isinstance(response, str):
243
- logger.warning(
244
- f"List item {i} in column '{input_column}' contains invalid data "
245
- f"(empty or non-string): {type(response)}"
246
- )
247
- continue
248
-
249
- parsed_outputs = self._parse(response)
250
-
251
- if not parsed_outputs or not any(
252
- len(value) > 0 for value in parsed_outputs.values()
253
- ):
254
- logger.warning(
255
- f"Failed to parse content from list item {i}. Raw output length: {len(response)}, "
256
- f"parsing method: {'regex' if self.parsing_pattern else 'tags'}"
257
- )
258
- continue
259
-
260
- # Create output rows for this response
261
- max_length = max(len(value) for value in parsed_outputs.values())
262
- for values in zip(
263
- *(lst[:max_length] for lst in parsed_outputs.values())
264
- ):
265
- all_results.append(
266
- {**sample, **dict(zip(parsed_outputs.keys(), values))}
267
- )
268
-
269
- return all_results
247
+ if not self.expand_lists:
248
+ # When expand_lists=False, preserve the list structure
249
+ # Parse each response in the list and collect results as lists
250
+ all_parsed_outputs = {col: [] for col in self.output_cols}
251
+ valid_responses = 0
252
+
253
+ for i, response in enumerate(raw_output):
254
+ if not response or not isinstance(response, str):
255
+ logger.warning(
256
+ f"List item {i} in column '{input_column}' contains invalid data "
257
+ f"(empty or non-string): {type(response)}"
258
+ )
259
+ continue
260
+
261
+ parsed_outputs = self._parse(response)
262
+
263
+ if not parsed_outputs or not any(
264
+ len(value) > 0 for value in parsed_outputs.values()
265
+ ):
266
+ logger.warning(
267
+ f"Failed to parse content from list item {i}. Raw output length: {len(response)}, "
268
+ f"parsing method: {'regex' if self.parsing_pattern else 'tags'}"
269
+ )
270
+ continue
271
+
272
+ valid_responses += 1
273
+ # Collect all parsed values for each column as lists
274
+ for col in self.output_cols:
275
+ all_parsed_outputs[col].extend(parsed_outputs.get(col, []))
276
+
277
+ if valid_responses == 0:
278
+ return []
279
+
280
+ # Return single row with lists as values
281
+ return [{**sample, **all_parsed_outputs}]
282
+
283
+ else:
284
+ # When expand_lists=True, use existing expanding behavior
285
+ all_results = []
286
+ for i, response in enumerate(raw_output):
287
+ if not response or not isinstance(response, str):
288
+ logger.warning(
289
+ f"List item {i} in column '{input_column}' contains invalid data "
290
+ f"(empty or non-string): {type(response)}"
291
+ )
292
+ continue
293
+
294
+ parsed_outputs = self._parse(response)
295
+
296
+ if not parsed_outputs or not any(
297
+ len(value) > 0 for value in parsed_outputs.values()
298
+ ):
299
+ logger.warning(
300
+ f"Failed to parse content from list item {i}. Raw output length: {len(response)}, "
301
+ f"parsing method: {'regex' if self.parsing_pattern else 'tags'}"
302
+ )
303
+ continue
304
+
305
+ # Create output rows for this response
306
+ max_length = max(len(value) for value in parsed_outputs.values())
307
+ for values in zip(
308
+ *(lst[:max_length] for lst in parsed_outputs.values())
309
+ ):
310
+ all_results.append(
311
+ {**sample, **dict(zip(parsed_outputs.keys(), values))}
312
+ )
313
+
314
+ return all_results
270
315
 
271
316
  # Handle string inputs (existing logic)
272
317
  elif isinstance(raw_output, str):
@@ -164,8 +164,10 @@ class BlockRegistry:
164
164
  ) from exc
165
165
 
166
166
  @classmethod
167
- def get(cls, block_name: str) -> type:
168
- """Get a block class with enhanced error handling.
167
+ def _get(cls, block_name: str) -> type:
168
+ """Internal method to get a block class with enhanced error handling.
169
+
170
+ This is a private method used by the framework internals (Flow system).
169
171
 
170
172
  Parameters
171
173
  ----------
@@ -216,29 +218,6 @@ class BlockRegistry:
216
218
 
217
219
  return metadata.block_class
218
220
 
219
- @classmethod
220
- def info(cls, block_name: str) -> BlockMetadata:
221
- """Get metadata for a specific block.
222
-
223
- Parameters
224
- ----------
225
- block_name : str
226
- Name of the block.
227
-
228
- Returns
229
- -------
230
- BlockMetadata
231
- The block's metadata.
232
-
233
- Raises
234
- ------
235
- KeyError
236
- If the block is not found.
237
- """
238
- if block_name not in cls._metadata:
239
- raise KeyError(f"Block '{block_name}' not found in registry.")
240
- return cls._metadata[block_name]
241
-
242
221
  @classmethod
243
222
  def categories(cls) -> list[str]:
244
223
  """Get all available categories.
@@ -251,8 +230,8 @@ class BlockRegistry:
251
230
  return sorted(cls._categories.keys())
252
231
 
253
232
  @classmethod
254
- def category(cls, category: str) -> list[str]:
255
- """Get all blocks in a specific category.
233
+ def _get_category_blocks(cls, category: str) -> list[str]:
234
+ """Get all blocks in a specific category (private method).
256
235
 
257
236
  Parameters
258
237
  ----------
@@ -278,20 +257,55 @@ class BlockRegistry:
278
257
  return sorted(cls._categories[category])
279
258
 
280
259
  @classmethod
281
- def all(cls) -> dict[str, list[str]]:
282
- """List all blocks organized by category.
260
+ def list_blocks(
261
+ cls,
262
+ category: Optional[str] = None,
263
+ *,
264
+ grouped: bool = False,
265
+ include_deprecated: bool = True,
266
+ ) -> list[str] | dict[str, list[str]]:
267
+ """
268
+ List registered blocks, optionally filtered by category.
269
+
270
+ Args:
271
+ category: If provided, return only blocks in this category.
272
+ grouped: If True (and category is None), return a dict
273
+ mapping categories to lists of blocks.
274
+ include_deprecated: If True, return deprecated blocks.
283
275
 
284
276
  Returns
285
277
  -------
286
- Dict[str, List[str]]
287
- Dictionary mapping categories to lists of block names.
278
+ List[str] | Dict[str, List[str]]
279
+ If grouped is False, returns a list of block names.
280
+ If grouped is True, returns a dict mapping categories to lists of block names.
288
281
  """
289
- return {
290
- category: sorted(blocks) for category, blocks in cls._categories.items()
291
- }
282
+
283
+ def filter_deprecated(block_names: list[str]) -> list[str]:
284
+ if include_deprecated:
285
+ return block_names
286
+ return [name for name in block_names if not cls._metadata[name].deprecated]
287
+
288
+ if category:
289
+ block_names = cls._get_category_blocks(category)
290
+ return filter_deprecated(block_names)
291
+
292
+ if grouped:
293
+ result = {}
294
+ for cat, blocks in cls._categories.items():
295
+ filtered = filter_deprecated(sorted(blocks))
296
+ if filtered:
297
+ result[cat] = filtered
298
+ return result
299
+
300
+ # Flat list of all block names (across all categories)
301
+ all_block_names = []
302
+ for blocks in cls._categories.values():
303
+ all_block_names.extend(blocks)
304
+ filtered = filter_deprecated(sorted(all_block_names))
305
+ return filtered
292
306
 
293
307
  @classmethod
294
- def show(cls) -> None:
308
+ def discover_blocks(cls) -> None:
295
309
  """Print a Rich-formatted table of all available blocks."""
296
310
  if not cls._metadata:
297
311
  console.print("[yellow]No blocks registered yet.[/yellow]")
@@ -174,7 +174,7 @@ class IndexBasedMapperBlock(BaseBlock):
174
174
  sample[output_col] = sample[source_col]
175
175
  return sample
176
176
 
177
- def generate(self, samples: Dataset) -> Dataset:
177
+ def generate(self, samples: Dataset, **kwargs) -> Dataset:
178
178
  """Generate a new dataset with selected values.
179
179
 
180
180
  Parameters