sdg-hub 0.1.4__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (145)
  1. sdg_hub/__init__.py +28 -1
  2. sdg_hub/_version.py +2 -2
  3. sdg_hub/core/__init__.py +22 -0
  4. sdg_hub/core/blocks/__init__.py +58 -0
  5. sdg_hub/core/blocks/base.py +313 -0
  6. sdg_hub/core/blocks/deprecated_blocks/__init__.py +29 -0
  7. sdg_hub/core/blocks/deprecated_blocks/combine_columns.py +93 -0
  8. sdg_hub/core/blocks/deprecated_blocks/duplicate_columns.py +88 -0
  9. sdg_hub/core/blocks/deprecated_blocks/filter_by_value.py +103 -0
  10. sdg_hub/core/blocks/deprecated_blocks/flatten_columns.py +94 -0
  11. sdg_hub/core/blocks/deprecated_blocks/llmblock.py +479 -0
  12. sdg_hub/core/blocks/deprecated_blocks/rename_columns.py +88 -0
  13. sdg_hub/core/blocks/deprecated_blocks/sample_populator.py +58 -0
  14. sdg_hub/core/blocks/deprecated_blocks/selector.py +97 -0
  15. sdg_hub/core/blocks/deprecated_blocks/set_to_majority_value.py +88 -0
  16. sdg_hub/core/blocks/evaluation/__init__.py +9 -0
  17. sdg_hub/core/blocks/evaluation/evaluate_faithfulness_block.py +564 -0
  18. sdg_hub/core/blocks/evaluation/evaluate_relevancy_block.py +564 -0
  19. sdg_hub/core/blocks/evaluation/verify_question_block.py +564 -0
  20. sdg_hub/core/blocks/filtering/__init__.py +12 -0
  21. sdg_hub/core/blocks/filtering/column_value_filter.py +188 -0
  22. sdg_hub/core/blocks/llm/__init__.py +27 -0
  23. sdg_hub/core/blocks/llm/client_manager.py +398 -0
  24. sdg_hub/core/blocks/llm/config.py +336 -0
  25. sdg_hub/core/blocks/llm/error_handler.py +368 -0
  26. sdg_hub/core/blocks/llm/llm_chat_block.py +542 -0
  27. sdg_hub/core/blocks/llm/llm_chat_with_parsing_retry_block.py +491 -0
  28. sdg_hub/core/blocks/llm/prompt_builder_block.py +368 -0
  29. sdg_hub/core/blocks/llm/text_parser_block.py +357 -0
  30. sdg_hub/core/blocks/registry.py +331 -0
  31. sdg_hub/core/blocks/transform/__init__.py +23 -0
  32. sdg_hub/core/blocks/transform/duplicate_columns.py +88 -0
  33. sdg_hub/core/blocks/transform/index_based_mapper.py +225 -0
  34. sdg_hub/core/blocks/transform/melt_columns.py +126 -0
  35. sdg_hub/core/blocks/transform/rename_columns.py +69 -0
  36. sdg_hub/core/blocks/transform/text_concat.py +102 -0
  37. sdg_hub/core/blocks/transform/uniform_col_val_setter.py +101 -0
  38. sdg_hub/core/flow/__init__.py +20 -0
  39. sdg_hub/core/flow/base.py +1209 -0
  40. sdg_hub/core/flow/checkpointer.py +333 -0
  41. sdg_hub/core/flow/metadata.py +389 -0
  42. sdg_hub/core/flow/migration.py +198 -0
  43. sdg_hub/core/flow/registry.py +393 -0
  44. sdg_hub/core/flow/validation.py +277 -0
  45. sdg_hub/{utils → core/utils}/__init__.py +7 -4
  46. sdg_hub/core/utils/datautils.py +63 -0
  47. sdg_hub/core/utils/error_handling.py +208 -0
  48. sdg_hub/core/utils/flow_id_words.yaml +231 -0
  49. sdg_hub/core/utils/flow_identifier.py +94 -0
  50. sdg_hub/{utils → core/utils}/path_resolution.py +2 -2
  51. sdg_hub/core/utils/yaml_utils.py +59 -0
  52. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/atomic_facts.yaml +40 -0
  53. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/detailed_summary.yaml +13 -0
  54. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_faithfulness.yaml +64 -0
  55. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_question.yaml +29 -0
  56. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_relevancy.yaml +81 -0
  57. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/extractive_summary.yaml +13 -0
  58. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +192 -0
  59. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/generate_questions_responses.yaml +54 -0
  60. sdg_hub-0.2.1.dist-info/METADATA +221 -0
  61. sdg_hub-0.2.1.dist-info/RECORD +68 -0
  62. sdg_hub/blocks/__init__.py +0 -42
  63. sdg_hub/blocks/block.py +0 -96
  64. sdg_hub/blocks/llmblock.py +0 -375
  65. sdg_hub/blocks/openaichatblock.py +0 -556
  66. sdg_hub/blocks/utilblocks.py +0 -597
  67. sdg_hub/checkpointer.py +0 -139
  68. sdg_hub/configs/annotations/cot_reflection.yaml +0 -34
  69. sdg_hub/configs/annotations/detailed_annotations.yaml +0 -28
  70. sdg_hub/configs/annotations/detailed_description.yaml +0 -10
  71. sdg_hub/configs/annotations/detailed_description_icl.yaml +0 -32
  72. sdg_hub/configs/annotations/simple_annotations.yaml +0 -9
  73. sdg_hub/configs/knowledge/__init__.py +0 -0
  74. sdg_hub/configs/knowledge/atomic_facts.yaml +0 -46
  75. sdg_hub/configs/knowledge/auxilary_instructions.yaml +0 -35
  76. sdg_hub/configs/knowledge/detailed_summary.yaml +0 -18
  77. sdg_hub/configs/knowledge/evaluate_faithfulness.yaml +0 -68
  78. sdg_hub/configs/knowledge/evaluate_question.yaml +0 -38
  79. sdg_hub/configs/knowledge/evaluate_relevancy.yaml +0 -84
  80. sdg_hub/configs/knowledge/extractive_summary.yaml +0 -18
  81. sdg_hub/configs/knowledge/generate_code_questions_responses.yaml +0 -39
  82. sdg_hub/configs/knowledge/generate_questions.yaml +0 -82
  83. sdg_hub/configs/knowledge/generate_questions_responses.yaml +0 -56
  84. sdg_hub/configs/knowledge/generate_responses.yaml +0 -86
  85. sdg_hub/configs/knowledge/mcq_generation.yaml +0 -83
  86. sdg_hub/configs/knowledge/router.yaml +0 -12
  87. sdg_hub/configs/knowledge/simple_generate_qa.yaml +0 -34
  88. sdg_hub/configs/reasoning/__init__.py +0 -0
  89. sdg_hub/configs/reasoning/dynamic_cot.yaml +0 -40
  90. sdg_hub/configs/skills/__init__.py +0 -0
  91. sdg_hub/configs/skills/analyzer.yaml +0 -48
  92. sdg_hub/configs/skills/annotation.yaml +0 -36
  93. sdg_hub/configs/skills/contexts.yaml +0 -28
  94. sdg_hub/configs/skills/critic.yaml +0 -60
  95. sdg_hub/configs/skills/evaluate_freeform_pair.yaml +0 -111
  96. sdg_hub/configs/skills/evaluate_freeform_questions.yaml +0 -78
  97. sdg_hub/configs/skills/evaluate_grounded_pair.yaml +0 -119
  98. sdg_hub/configs/skills/evaluate_grounded_questions.yaml +0 -51
  99. sdg_hub/configs/skills/freeform_questions.yaml +0 -34
  100. sdg_hub/configs/skills/freeform_responses.yaml +0 -39
  101. sdg_hub/configs/skills/grounded_questions.yaml +0 -38
  102. sdg_hub/configs/skills/grounded_responses.yaml +0 -59
  103. sdg_hub/configs/skills/icl_examples/STEM.yaml +0 -56
  104. sdg_hub/configs/skills/icl_examples/__init__.py +0 -0
  105. sdg_hub/configs/skills/icl_examples/coding.yaml +0 -97
  106. sdg_hub/configs/skills/icl_examples/extraction.yaml +0 -36
  107. sdg_hub/configs/skills/icl_examples/humanities.yaml +0 -71
  108. sdg_hub/configs/skills/icl_examples/math.yaml +0 -85
  109. sdg_hub/configs/skills/icl_examples/reasoning.yaml +0 -30
  110. sdg_hub/configs/skills/icl_examples/roleplay.yaml +0 -45
  111. sdg_hub/configs/skills/icl_examples/writing.yaml +0 -80
  112. sdg_hub/configs/skills/judge.yaml +0 -53
  113. sdg_hub/configs/skills/planner.yaml +0 -67
  114. sdg_hub/configs/skills/respond.yaml +0 -8
  115. sdg_hub/configs/skills/revised_responder.yaml +0 -78
  116. sdg_hub/configs/skills/router.yaml +0 -59
  117. sdg_hub/configs/skills/simple_generate_qa_freeform.yaml +0 -27
  118. sdg_hub/configs/skills/simple_generate_qa_grounded.yaml +0 -31
  119. sdg_hub/flow.py +0 -477
  120. sdg_hub/flow_runner.py +0 -450
  121. sdg_hub/flows/generation/knowledge/mmlu_bench.yaml +0 -13
  122. sdg_hub/flows/generation/knowledge/simple_knowledge.yaml +0 -12
  123. sdg_hub/flows/generation/knowledge/synth_knowledge.yaml +0 -89
  124. sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +0 -136
  125. sdg_hub/flows/generation/skills/improve_responses.yaml +0 -103
  126. sdg_hub/flows/generation/skills/simple_freeform_skill.yaml +0 -12
  127. sdg_hub/flows/generation/skills/simple_grounded_skill.yaml +0 -12
  128. sdg_hub/flows/generation/skills/synth_grounded_skills.yaml +0 -80
  129. sdg_hub/flows/generation/skills/synth_skills.yaml +0 -59
  130. sdg_hub/pipeline.py +0 -121
  131. sdg_hub/prompts.py +0 -80
  132. sdg_hub/registry.py +0 -122
  133. sdg_hub/sdg.py +0 -206
  134. sdg_hub/utils/config_validation.py +0 -91
  135. sdg_hub/utils/datautils.py +0 -14
  136. sdg_hub/utils/error_handling.py +0 -94
  137. sdg_hub/utils/validation_result.py +0 -10
  138. sdg_hub-0.1.4.dist-info/METADATA +0 -190
  139. sdg_hub-0.1.4.dist-info/RECORD +0 -89
  140. sdg_hub/{logger_config.py → core/utils/logger_config.py} +1 -1
  141. /sdg_hub/{configs/__init__.py → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/README.md} +0 -0
  142. /sdg_hub/{configs/annotations → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab}/__init__.py +0 -0
  143. {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.1.dist-info}/WHEEL +0 -0
  144. {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.1.dist-info}/licenses/LICENSE +0 -0
  145. {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.1.dist-info}/top_level.txt +0 -0
sdg_hub/core/blocks/llm/text_parser_block.py
@@ -0,0 +1,357 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Text parser block for parsing and post-processing LLM outputs.
+
+This module provides the TextParserBlock for handling output parsing using
+start/end tags, custom regex patterns, and cleanup operations.
+"""
+
+# Standard
+from typing import Any, Optional
+import re
+
+# Third Party
+from datasets import Dataset
+from pydantic import Field, field_validator, model_validator
+
+# Local
+from ...utils.logger_config import setup_logger
+from ..base import BaseBlock
+from ..registry import BlockRegistry
+
+logger = setup_logger(__name__)
+
+
+@BlockRegistry.register(
+    "TextParserBlock",
+    "llm",
+    "Parses and post-processes LLM outputs using tags or regex patterns",
+)
+class TextParserBlock(BaseBlock):
+    """Block for parsing and post-processing LLM outputs.
+
+    This block handles output parsing using start/end tags, custom regex patterns,
+    and cleanup operations. It expects exactly one input column containing raw LLM output.
+
+    Attributes
+    ----------
+    block_name : str
+        Unique identifier for this block instance.
+    input_cols : Union[str, List[str], Dict[str, Any], None]
+        Input column name(s) containing raw LLM output. Must specify exactly one column.
+    output_cols : Union[str, List[str], Dict[str, Any], None]
+        Output column name(s) for parsed results.
+    start_tags : List[str]
+        List of start tags for tag-based parsing.
+    end_tags : List[str]
+        List of end tags for tag-based parsing.
+    parsing_pattern : Optional[str]
+        Regex pattern for custom parsing.
+    parser_cleanup_tags : Optional[List[str]]
+        List of tags to clean from parsed output.
+    expand_lists : bool
+        Whether to expand list inputs into individual rows (True) or preserve lists (False).
+        Default is True for backward compatibility.
+    """
+
+    start_tags: list[str] = Field(
+        default_factory=list, description="List of start tags for tag-based parsing"
+    )
+    end_tags: list[str] = Field(
+        default_factory=list, description="List of end tags for tag-based parsing"
+    )
+    parsing_pattern: Optional[str] = Field(
+        default=None, description="Regex pattern for custom parsing"
+    )
+    parser_cleanup_tags: Optional[list[str]] = Field(
+        default=None, description="List of tags to clean from parsed output"
+    )
+    expand_lists: bool = Field(
+        default=True,
+        description="Whether to expand list inputs into individual rows (True) or preserve lists (False).",
+    )
+
+    @field_validator("start_tags", "end_tags", mode="before")
+    @classmethod
+    def normalize_tags(cls, v):
+        """Normalize tag lists to ensure they are always lists."""
+        if v is None:
+            return []
+        if isinstance(v, str):
+            return [v]
+        if isinstance(v, list):
+            return v
+        raise ValueError(f"Tags must be a string, list, or None, got {type(v)}")
+
+    @field_validator("parser_cleanup_tags", mode="before")
+    @classmethod
+    def normalize_cleanup_tags(cls, v):
+        """Normalize cleanup tags to ensure they are always lists when not None."""
+        if v is None:
+            return None
+        if isinstance(v, str):
+            return [v]
+        if isinstance(v, list):
+            return v
+        raise ValueError(f"Cleanup tags must be a string, list, or None, got {type(v)}")
+
+    @model_validator(mode="after")
+    def validate_parsing_configuration(self):
+        """Validate that parsing configuration is consistent."""
+        # Validate that at least one parsing method is configured
+        has_regex = self.parsing_pattern is not None
+        has_tags = bool(self.start_tags) or bool(self.end_tags)
+
+        if not has_regex and not has_tags:
+            raise ValueError(
+                "TextParserBlock requires at least one parsing method: "
+                "either 'parsing_pattern' (regex) or 'start_tags'/'end_tags' (tag-based parsing)"
+            )
+
+        # Validate tag parsing configuration
+        if has_tags:
+            if len(self.start_tags) != len(self.end_tags):
+                raise ValueError(
+                    f"start_tags and end_tags must have the same length. "
+                    f"Got {len(self.start_tags)} start_tags and {len(self.end_tags)} end_tags"
+                )
+
+        # We can't validate against output_cols here since they might not be normalized yet
+        # This validation will be moved to _validate_custom
+
+        return self
+
+    def _validate_custom(self, dataset: Dataset) -> None:
+        """Validate TextParserBlock specific requirements.
+
+        Parameters
+        ----------
+        dataset : Dataset
+            The dataset to validate.
+
+        Raises
+        ------
+        ValueError
+            If TextParserBlock requirements are not met.
+        """
+        # Validate that we have exactly one input column
+        if len(self.input_cols) == 0:
+            raise ValueError("TextParserBlock expects at least one input column")
+        if len(self.input_cols) > 1:
+            logger.warning(
+                f"TextParserBlock expects exactly one input column, but got {len(self.input_cols)}. "
+                f"Using the first column: {self.input_cols[0]}"
+            )
+
+        # Validate tag parsing against output columns (can only be done after model creation)
+        has_tags = bool(self.start_tags) or bool(self.end_tags)
+        if has_tags and len(self.start_tags) != len(self.output_cols):
+            raise ValueError(
+                f"When using tag-based parsing, the number of tag pairs must match output_cols. "
+                f"Got {len(self.start_tags)} tag pairs and {len(self.output_cols)} output columns"
+            )
+
+    def _extract_matches(
+        self, text: str, start_tag: Optional[str], end_tag: Optional[str]
+    ) -> list[str]:
+        if not text:
+            return []
+        if not start_tag and not end_tag:
+            return [text.strip()]
+
+        pattern = ""
+        if start_tag:
+            pattern += re.escape(start_tag)
+        pattern += r"(.*?)"
+        if end_tag:
+            pattern += re.escape(end_tag)
+        elif start_tag:
+            pattern += "$"
+
+        return [match.strip() for match in re.findall(pattern, text, re.DOTALL)]
+
+    def _parse(self, generated_string: str) -> dict[str, list[str]]:
+        if self.parsing_pattern is not None:
+            return self._parse_with_regex(generated_string)
+        return self._parse_with_tags(generated_string)
+
+    def _parse_with_regex(self, generated_string: str) -> dict[str, list[str]]:
+        """Parse using regex pattern."""
+        if self.parsing_pattern is None:
+            raise ValueError("parsing_pattern is required for regex parsing")
+        pattern = re.compile(self.parsing_pattern, re.DOTALL)
+        all_matches = pattern.findall(generated_string)
+        matches: dict[str, list[str]] = {
+            column_name: [] for column_name in self.output_cols
+        }
+
+        logger.debug(
+            f"Regex parsing found {len(all_matches)} matches with pattern: {self.parsing_pattern}"
+        )
+
+        if all_matches and isinstance(all_matches[0], tuple):
+            return self._process_tuple_matches(all_matches, matches)
+        return self._process_single_matches(all_matches, matches)
+
+    def _parse_with_tags(self, generated_string: str) -> dict[str, list[str]]:
+        """Parse using start/end tags."""
+        matches: dict[str, list[str]] = {
+            column_name: [] for column_name in self.output_cols
+        }
+
+        for start_tag, end_tag, output_col in zip(
+            self.start_tags, self.end_tags, self.output_cols
+        ):
+            extracted = self._extract_matches(generated_string, start_tag, end_tag)
+            matches[output_col] = extracted
+            logger.debug(
+                f"Tag parsing for '{output_col}' with tags '{start_tag}'/'{end_tag}' found {len(extracted)} matches"
+            )
+
+        return matches
+
+    def _process_tuple_matches(
+        self, all_matches: list, matches: dict[str, list[str]]
+    ) -> dict[str, list[str]]:
+        """Process regex matches that are tuples."""
+        for match in all_matches:
+            for column_name, value in zip(self.output_cols, match):
+                value = self._clean_value(value.strip())
+                matches[column_name].append(value)
+        return matches
+
+    def _process_single_matches(
+        self, all_matches: list, matches: dict[str, list[str]]
+    ) -> dict[str, list[str]]:
+        """Process regex matches that are single values."""
+        cleaned_matches = [self._clean_value(match.strip()) for match in all_matches]
+        matches[self.output_cols[0]] = cleaned_matches
+        return matches
+
+    def _clean_value(self, value: str) -> str:
+        """Clean value by removing cleanup tags."""
+        if self.parser_cleanup_tags:
+            for clean_tag in self.parser_cleanup_tags:
+                value = value.replace(clean_tag, "")
+        return value
+
+    def _generate(self, sample: dict) -> list[dict]:
+        input_column = self.input_cols[0]
+        raw_output = sample[input_column]
+
+        # Handle list inputs (e.g., from LLMChatBlock with n > 1)
+        if isinstance(raw_output, list):
+            if not raw_output:
+                logger.warning(f"Input column '{input_column}' contains empty list")
+                return []
+
+            if not self.expand_lists:
+                # When expand_lists=False, preserve the list structure
+                # Parse each response in the list and collect results as lists
+                all_parsed_outputs = {col: [] for col in self.output_cols}
+                valid_responses = 0
+
+                for i, response in enumerate(raw_output):
+                    if not response or not isinstance(response, str):
+                        logger.warning(
+                            f"List item {i} in column '{input_column}' contains invalid data "
+                            f"(empty or non-string): {type(response)}"
+                        )
+                        continue
+
+                    parsed_outputs = self._parse(response)
+
+                    if not parsed_outputs or not any(
+                        len(value) > 0 for value in parsed_outputs.values()
+                    ):
+                        logger.warning(
+                            f"Failed to parse content from list item {i}. Raw output length: {len(response)}, "
+                            f"parsing method: {'regex' if self.parsing_pattern else 'tags'}"
+                        )
+                        continue
+
+                    valid_responses += 1
+                    # Collect all parsed values for each column as lists
+                    for col in self.output_cols:
+                        all_parsed_outputs[col].extend(parsed_outputs.get(col, []))
+
+                if valid_responses == 0:
+                    return []
+
+                # Return single row with lists as values
+                # TODO: This breaks retry counting in LLMChatWithParsingRetryBlock until LLMChatWithParsingRetryBlock is re-based
+                # which expects one row per successful parse for counting
+                return [{**sample, **all_parsed_outputs}]
+
+            else:
+                # When expand_lists=True, use existing expanding behavior
+                all_results = []
+                for i, response in enumerate(raw_output):
+                    if not response or not isinstance(response, str):
+                        logger.warning(
+                            f"List item {i} in column '{input_column}' contains invalid data "
+                            f"(empty or non-string): {type(response)}"
+                        )
+                        continue
+
+                    parsed_outputs = self._parse(response)
+
+                    if not parsed_outputs or not any(
+                        len(value) > 0 for value in parsed_outputs.values()
+                    ):
+                        logger.warning(
+                            f"Failed to parse content from list item {i}. Raw output length: {len(response)}, "
+                            f"parsing method: {'regex' if self.parsing_pattern else 'tags'}"
+                        )
+                        continue
+
+                    # Create output rows for this response
+                    max_length = max(len(value) for value in parsed_outputs.values())
+                    for values in zip(
+                        *(lst[:max_length] for lst in parsed_outputs.values())
+                    ):
+                        all_results.append(
+                            {**sample, **dict(zip(parsed_outputs.keys(), values))}
+                        )
+
+                return all_results
+
+        # Handle string inputs (existing logic)
+        elif isinstance(raw_output, str):
+            if not raw_output:
+                logger.warning(f"Input column '{input_column}' contains empty string")
+                return []
+
+            parsed_outputs = self._parse(raw_output)
+
+            if not parsed_outputs or not any(
+                len(value) > 0 for value in parsed_outputs.values()
+            ):
+                logger.warning(
+                    f"Failed to parse any content from input. Raw output length: {len(raw_output)}, "
+                    f"parsing method: {'regex' if self.parsing_pattern else 'tags'}"
+                )
+                return []
+
+            result = []
+            max_length = max(len(value) for value in parsed_outputs.values())
+            for values in zip(*(lst[:max_length] for lst in parsed_outputs.values())):
+                result.append({**sample, **dict(zip(parsed_outputs.keys(), values))})
+            return result
+
+        else:
+            logger.warning(
+                f"Input column '{input_column}' contains invalid data type: {type(raw_output)}. "
+                f"Expected str or List[str]"
+            )
+            return []
+
+    def generate(self, samples: Dataset, **kwargs: Any) -> Dataset:
+        logger.debug(f"Parsing outputs for {len(samples)} samples")
+        if len(samples) == 0:
+            logger.warning("No samples to parse, returning empty dataset")
+            return Dataset.from_list([])
+
+        new_data = []
+        for sample in samples:
+            new_data.extend(self._generate(sample))
+        return Dataset.from_list(new_data)
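A minimal usage sketch for the new TextParserBlock (illustrative, not part of the diff). The constructor fields follow the class docstring above; it assumes that BaseBlock (added in sdg_hub/core/blocks/base.py, not shown in this hunk) accepts block_name, input_cols, and output_cols as keyword arguments, and the column names and sample text are made up.

# Hedged sketch: tag-based and regex-based parsing with the TextParserBlock above.
from datasets import Dataset

from sdg_hub.core.blocks.llm.text_parser_block import TextParserBlock

# Tag-based parsing: one (start_tag, end_tag) pair per output column.
tag_parser = TextParserBlock(
    block_name="qa_parser",
    input_cols=["raw_output"],
    output_cols=["question", "answer"],
    start_tags=["[Q]", "[A]"],
    end_tags=["[/Q]", "[/A]"],
)

samples = Dataset.from_list(
    [{"raw_output": "[Q]What does SDG stand for?[/Q][A]Synthetic data generation.[/A]"}]
)
parsed = tag_parser.generate(samples)
print(parsed[0]["question"], "|", parsed[0]["answer"])

# Regex-based parsing: one capture group per output column,
# with optional cleanup of stray markdown fences in the parsed value.
score_parser = TextParserBlock(
    block_name="score_parser",
    input_cols=["raw_output"],
    output_cols=["score"],
    parsing_pattern=r"Score:\s*(\d+)",
    parser_cleanup_tags=["```"],
)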
sdg_hub/core/blocks/registry.py
@@ -0,0 +1,331 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Enhanced BlockRegistry with metadata and better error handling.
+
+This module provides a clean registry system for blocks with metadata,
+categorization, and improved error handling.
+"""
+
+# Standard
+from dataclasses import dataclass
+from difflib import get_close_matches
+from typing import Optional
+import inspect
+
+# Third Party
+from rich.console import Console
+from rich.table import Table
+
+# Local
+from ..utils.logger_config import setup_logger
+
+logger = setup_logger(__name__)
+console = Console()
+
+
+@dataclass
+class BlockMetadata:
+    """Metadata for registered blocks.
+
+    Parameters
+    ----------
+    name : str
+        The registered name of the block.
+    block_class : Type
+        The actual block class.
+    category : str
+        Category for organization (e.g., 'llm', 'utility', 'filtering').
+    description : str, optional
+        Human-readable description of what the block does.
+    deprecated : bool, optional
+        Whether this block is deprecated.
+    replacement : str, optional
+        Suggested replacement if deprecated.
+    """
+
+    name: str
+    block_class: type
+    category: str
+    description: str = ""
+    deprecated: bool = False
+    replacement: Optional[str] = None
+
+    def __post_init__(self) -> None:
+        """Validate metadata after initialization."""
+        if not self.name:
+            raise ValueError("Block name cannot be empty")
+        if not inspect.isclass(self.block_class):
+            raise ValueError("block_class must be a class")
+
+
+class BlockRegistry:
+    """Registry for block classes with metadata and enhanced error handling."""
+
+    _metadata: dict[str, BlockMetadata] = {}
+    _categories: dict[str, set[str]] = {}
+
+    @classmethod
+    def register(
+        cls,
+        block_name: str,
+        category: str,
+        description: str = "",
+        deprecated: bool = False,
+        replacement: Optional[str] = None,
+    ):
+        """Register a block class with metadata.
+
+        Parameters
+        ----------
+        block_name : str
+            Name under which to register the block.
+        category : str
+            Category for organization.
+        description : str, optional
+            Human-readable description of the block.
+        deprecated : bool, optional
+            Whether this block is deprecated.
+        replacement : str, optional
+            Suggested replacement if deprecated.
+
+        Returns
+        -------
+        callable
+            Decorator function.
+        """
+
+        def decorator(block_class: type) -> type:
+            # Validate the class
+            cls._validate_block_class(block_class)
+
+            # Create metadata
+            metadata = BlockMetadata(
+                name=block_name,
+                block_class=block_class,
+                category=category,
+                description=description,
+                deprecated=deprecated,
+                replacement=replacement,
+            )
+
+            # Register the metadata
+            cls._metadata[block_name] = metadata
+
+            # Update category index
+            if category not in cls._categories:
+                cls._categories[category] = set()
+            cls._categories[category].add(block_name)
+
+            logger.debug(
+                f"Registered block '{block_name}' "
+                f"({block_class.__name__}) in category '{category}'"
+            )
+
+            if deprecated:
+                warning_msg = f"Block '{block_name}' is deprecated."
+                if replacement:
+                    warning_msg += f" Use '{replacement}' instead."
+                logger.warning(warning_msg)
+
+            return block_class
+
+        return decorator
+
+    @classmethod
+    def _validate_block_class(cls, block_class: type) -> None:
+        """Validate that a class is a proper block class.
+
+        Parameters
+        ----------
+        block_class : Type
+            The class to validate.
+
+        Raises
+        ------
+        ValueError
+            If the class is not a valid block class.
+        """
+        if not inspect.isclass(block_class):
+            raise ValueError(f"Expected a class, got {type(block_class)}")
+
+        # Validate BaseBlock inheritance
+        try:
+            # Local
+            from .base import BaseBlock
+
+            if not issubclass(block_class, BaseBlock):
+                raise ValueError(
+                    f"Block class '{block_class.__name__}' must inherit from BaseBlock"
+                )
+        except ImportError as exc:
+            # BaseBlock not available, check for generate method
+            if not hasattr(block_class, "generate"):
+                raise ValueError(
+                    f"Block class '{block_class.__name__}' must implement 'generate' method"
+                ) from exc
+
+    @classmethod
+    def get(cls, block_name: str) -> type:
+        """Get a block class with enhanced error handling.
+
+        Parameters
+        ----------
+        block_name : str
+            Name of the block to retrieve.
+
+        Returns
+        -------
+        Type
+            The block class.
+
+        Raises
+        ------
+        KeyError
+            If the block is not found, with helpful suggestions.
+        """
+        if block_name not in cls._metadata:
+            available_blocks = list(cls._metadata.keys())
+            suggestions = get_close_matches(
+                block_name, available_blocks, n=3, cutoff=0.6
+            )
+
+            error_msg = f"Block '{block_name}' not found in registry."
+
+            if suggestions:
+                error_msg += f" Did you mean: {', '.join(suggestions)}?"
+
+            if available_blocks:
+                error_msg += (
+                    f"\nAvailable blocks: {', '.join(sorted(available_blocks))}"
+                )
+
+            if cls._categories:
+                error_msg += (
+                    f"\nCategories: {', '.join(sorted(cls._categories.keys()))}"
+                )
+
+            logger.error(error_msg)
+            raise KeyError(error_msg)
+
+        metadata = cls._metadata[block_name]
+
+        if metadata.deprecated:
+            warning_msg = f"Block '{block_name}' is deprecated."
+            if metadata.replacement:
+                warning_msg += f" Use '{metadata.replacement}' instead."
+            logger.warning(warning_msg)
+
+        return metadata.block_class
+
+    @classmethod
+    def info(cls, block_name: str) -> BlockMetadata:
+        """Get metadata for a specific block.
+
+        Parameters
+        ----------
+        block_name : str
+            Name of the block.
+
+        Returns
+        -------
+        BlockMetadata
+            The block's metadata.
+
+        Raises
+        ------
+        KeyError
+            If the block is not found.
+        """
+        if block_name not in cls._metadata:
+            raise KeyError(f"Block '{block_name}' not found in registry.")
+        return cls._metadata[block_name]
+
+    @classmethod
+    def categories(cls) -> list[str]:
+        """Get all available categories.
+
+        Returns
+        -------
+        List[str]
+            Sorted list of categories.
+        """
+        return sorted(cls._categories.keys())
+
+    @classmethod
+    def category(cls, category: str) -> list[str]:
+        """Get all blocks in a specific category.
+
+        Parameters
+        ----------
+        category : str
+            The category to filter by.
+
+        Returns
+        -------
+        List[str]
+            List of block names in the category.
+
+        Raises
+        ------
+        KeyError
+            If the category doesn't exist.
+        """
+        if category not in cls._categories:
+            available_categories = sorted(cls._categories.keys())
+            raise KeyError(
+                f"Category '{category}' not found. "
+                f"Available categories: {', '.join(available_categories)}"
+            )
+        return sorted(cls._categories[category])
+
+    @classmethod
+    def all(cls) -> dict[str, list[str]]:
+        """List all blocks organized by category.
+
+        Returns
+        -------
+        Dict[str, List[str]]
+            Dictionary mapping categories to lists of block names.
+        """
+        return {
+            category: sorted(blocks) for category, blocks in cls._categories.items()
+        }
+
+    @classmethod
+    def discover_blocks(cls) -> None:
+        """Print a Rich-formatted table of all available blocks."""
+        if not cls._metadata:
+            console.print("[yellow]No blocks registered yet.[/yellow]")
+            return
+
+        table = Table(
+            title="Available Blocks", show_header=True, header_style="bold magenta"
+        )
+        table.add_column("Block Name", style="cyan", no_wrap=True)
+        table.add_column("Category", style="green")
+        table.add_column("Description", style="white")
+
+        # Sort blocks by category, then by name
+        sorted_blocks = sorted(
+            cls._metadata.items(), key=lambda x: (x[1].category, x[0])
+        )
+
+        for name, metadata in sorted_blocks:
+            description = metadata.description or "No description"
+
+            # Show deprecated blocks with a warning indicator in the name
+            block_name = f"⚠️ {name}" if metadata.deprecated else name
+
+            table.add_row(block_name, metadata.category, description)
+
+        console.print(table)
+
+        # Show summary
+        total_blocks = len(cls._metadata)
+        total_categories = len(cls._categories)
+        deprecated_count = sum(1 for m in cls._metadata.values() if m.deprecated)
+
+        console.print(
+            f"\n[bold]Summary:[/bold] {total_blocks} blocks across {total_categories} categories"
+        )
+        if deprecated_count > 0:
+            console.print(f"[yellow]⚠️ {deprecated_count} deprecated blocks[/yellow]")