sdg-hub 0.1.4__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
- sdg_hub/__init__.py +28 -1
- sdg_hub/_version.py +2 -2
- sdg_hub/core/__init__.py +22 -0
- sdg_hub/core/blocks/__init__.py +58 -0
- sdg_hub/core/blocks/base.py +313 -0
- sdg_hub/core/blocks/deprecated_blocks/__init__.py +29 -0
- sdg_hub/core/blocks/deprecated_blocks/combine_columns.py +93 -0
- sdg_hub/core/blocks/deprecated_blocks/duplicate_columns.py +88 -0
- sdg_hub/core/blocks/deprecated_blocks/filter_by_value.py +103 -0
- sdg_hub/core/blocks/deprecated_blocks/flatten_columns.py +94 -0
- sdg_hub/core/blocks/deprecated_blocks/llmblock.py +479 -0
- sdg_hub/core/blocks/deprecated_blocks/rename_columns.py +88 -0
- sdg_hub/core/blocks/deprecated_blocks/sample_populator.py +58 -0
- sdg_hub/core/blocks/deprecated_blocks/selector.py +97 -0
- sdg_hub/core/blocks/deprecated_blocks/set_to_majority_value.py +88 -0
- sdg_hub/core/blocks/evaluation/__init__.py +9 -0
- sdg_hub/core/blocks/evaluation/evaluate_faithfulness_block.py +564 -0
- sdg_hub/core/blocks/evaluation/evaluate_relevancy_block.py +564 -0
- sdg_hub/core/blocks/evaluation/verify_question_block.py +564 -0
- sdg_hub/core/blocks/filtering/__init__.py +12 -0
- sdg_hub/core/blocks/filtering/column_value_filter.py +188 -0
- sdg_hub/core/blocks/llm/__init__.py +27 -0
- sdg_hub/core/blocks/llm/client_manager.py +398 -0
- sdg_hub/core/blocks/llm/config.py +336 -0
- sdg_hub/core/blocks/llm/error_handler.py +368 -0
- sdg_hub/core/blocks/llm/llm_chat_block.py +542 -0
- sdg_hub/core/blocks/llm/llm_chat_with_parsing_retry_block.py +491 -0
- sdg_hub/core/blocks/llm/prompt_builder_block.py +368 -0
- sdg_hub/core/blocks/llm/text_parser_block.py +357 -0
- sdg_hub/core/blocks/registry.py +331 -0
- sdg_hub/core/blocks/transform/__init__.py +23 -0
- sdg_hub/core/blocks/transform/duplicate_columns.py +88 -0
- sdg_hub/core/blocks/transform/index_based_mapper.py +225 -0
- sdg_hub/core/blocks/transform/melt_columns.py +126 -0
- sdg_hub/core/blocks/transform/rename_columns.py +69 -0
- sdg_hub/core/blocks/transform/text_concat.py +102 -0
- sdg_hub/core/blocks/transform/uniform_col_val_setter.py +101 -0
- sdg_hub/core/flow/__init__.py +20 -0
- sdg_hub/core/flow/base.py +1209 -0
- sdg_hub/core/flow/checkpointer.py +333 -0
- sdg_hub/core/flow/metadata.py +389 -0
- sdg_hub/core/flow/migration.py +198 -0
- sdg_hub/core/flow/registry.py +393 -0
- sdg_hub/core/flow/validation.py +277 -0
- sdg_hub/{utils → core/utils}/__init__.py +7 -4
- sdg_hub/core/utils/datautils.py +63 -0
- sdg_hub/core/utils/error_handling.py +208 -0
- sdg_hub/core/utils/flow_id_words.yaml +231 -0
- sdg_hub/core/utils/flow_identifier.py +94 -0
- sdg_hub/{utils → core/utils}/path_resolution.py +2 -2
- sdg_hub/core/utils/yaml_utils.py +59 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/atomic_facts.yaml +40 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/detailed_summary.yaml +13 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_faithfulness.yaml +64 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_question.yaml +29 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_relevancy.yaml +81 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/extractive_summary.yaml +13 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +192 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/generate_questions_responses.yaml +54 -0
- sdg_hub-0.2.1.dist-info/METADATA +221 -0
- sdg_hub-0.2.1.dist-info/RECORD +68 -0
- sdg_hub/blocks/__init__.py +0 -42
- sdg_hub/blocks/block.py +0 -96
- sdg_hub/blocks/llmblock.py +0 -375
- sdg_hub/blocks/openaichatblock.py +0 -556
- sdg_hub/blocks/utilblocks.py +0 -597
- sdg_hub/checkpointer.py +0 -139
- sdg_hub/configs/annotations/cot_reflection.yaml +0 -34
- sdg_hub/configs/annotations/detailed_annotations.yaml +0 -28
- sdg_hub/configs/annotations/detailed_description.yaml +0 -10
- sdg_hub/configs/annotations/detailed_description_icl.yaml +0 -32
- sdg_hub/configs/annotations/simple_annotations.yaml +0 -9
- sdg_hub/configs/knowledge/__init__.py +0 -0
- sdg_hub/configs/knowledge/atomic_facts.yaml +0 -46
- sdg_hub/configs/knowledge/auxilary_instructions.yaml +0 -35
- sdg_hub/configs/knowledge/detailed_summary.yaml +0 -18
- sdg_hub/configs/knowledge/evaluate_faithfulness.yaml +0 -68
- sdg_hub/configs/knowledge/evaluate_question.yaml +0 -38
- sdg_hub/configs/knowledge/evaluate_relevancy.yaml +0 -84
- sdg_hub/configs/knowledge/extractive_summary.yaml +0 -18
- sdg_hub/configs/knowledge/generate_code_questions_responses.yaml +0 -39
- sdg_hub/configs/knowledge/generate_questions.yaml +0 -82
- sdg_hub/configs/knowledge/generate_questions_responses.yaml +0 -56
- sdg_hub/configs/knowledge/generate_responses.yaml +0 -86
- sdg_hub/configs/knowledge/mcq_generation.yaml +0 -83
- sdg_hub/configs/knowledge/router.yaml +0 -12
- sdg_hub/configs/knowledge/simple_generate_qa.yaml +0 -34
- sdg_hub/configs/reasoning/__init__.py +0 -0
- sdg_hub/configs/reasoning/dynamic_cot.yaml +0 -40
- sdg_hub/configs/skills/__init__.py +0 -0
- sdg_hub/configs/skills/analyzer.yaml +0 -48
- sdg_hub/configs/skills/annotation.yaml +0 -36
- sdg_hub/configs/skills/contexts.yaml +0 -28
- sdg_hub/configs/skills/critic.yaml +0 -60
- sdg_hub/configs/skills/evaluate_freeform_pair.yaml +0 -111
- sdg_hub/configs/skills/evaluate_freeform_questions.yaml +0 -78
- sdg_hub/configs/skills/evaluate_grounded_pair.yaml +0 -119
- sdg_hub/configs/skills/evaluate_grounded_questions.yaml +0 -51
- sdg_hub/configs/skills/freeform_questions.yaml +0 -34
- sdg_hub/configs/skills/freeform_responses.yaml +0 -39
- sdg_hub/configs/skills/grounded_questions.yaml +0 -38
- sdg_hub/configs/skills/grounded_responses.yaml +0 -59
- sdg_hub/configs/skills/icl_examples/STEM.yaml +0 -56
- sdg_hub/configs/skills/icl_examples/__init__.py +0 -0
- sdg_hub/configs/skills/icl_examples/coding.yaml +0 -97
- sdg_hub/configs/skills/icl_examples/extraction.yaml +0 -36
- sdg_hub/configs/skills/icl_examples/humanities.yaml +0 -71
- sdg_hub/configs/skills/icl_examples/math.yaml +0 -85
- sdg_hub/configs/skills/icl_examples/reasoning.yaml +0 -30
- sdg_hub/configs/skills/icl_examples/roleplay.yaml +0 -45
- sdg_hub/configs/skills/icl_examples/writing.yaml +0 -80
- sdg_hub/configs/skills/judge.yaml +0 -53
- sdg_hub/configs/skills/planner.yaml +0 -67
- sdg_hub/configs/skills/respond.yaml +0 -8
- sdg_hub/configs/skills/revised_responder.yaml +0 -78
- sdg_hub/configs/skills/router.yaml +0 -59
- sdg_hub/configs/skills/simple_generate_qa_freeform.yaml +0 -27
- sdg_hub/configs/skills/simple_generate_qa_grounded.yaml +0 -31
- sdg_hub/flow.py +0 -477
- sdg_hub/flow_runner.py +0 -450
- sdg_hub/flows/generation/knowledge/mmlu_bench.yaml +0 -13
- sdg_hub/flows/generation/knowledge/simple_knowledge.yaml +0 -12
- sdg_hub/flows/generation/knowledge/synth_knowledge.yaml +0 -89
- sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +0 -136
- sdg_hub/flows/generation/skills/improve_responses.yaml +0 -103
- sdg_hub/flows/generation/skills/simple_freeform_skill.yaml +0 -12
- sdg_hub/flows/generation/skills/simple_grounded_skill.yaml +0 -12
- sdg_hub/flows/generation/skills/synth_grounded_skills.yaml +0 -80
- sdg_hub/flows/generation/skills/synth_skills.yaml +0 -59
- sdg_hub/pipeline.py +0 -121
- sdg_hub/prompts.py +0 -80
- sdg_hub/registry.py +0 -122
- sdg_hub/sdg.py +0 -206
- sdg_hub/utils/config_validation.py +0 -91
- sdg_hub/utils/datautils.py +0 -14
- sdg_hub/utils/error_handling.py +0 -94
- sdg_hub/utils/validation_result.py +0 -10
- sdg_hub-0.1.4.dist-info/METADATA +0 -190
- sdg_hub-0.1.4.dist-info/RECORD +0 -89
- sdg_hub/{logger_config.py → core/utils/logger_config.py} +1 -1
- /sdg_hub/{configs/__init__.py → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/README.md} +0 -0
- /sdg_hub/{configs/annotations → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab}/__init__.py +0 -0
- {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.1.dist-info}/WHEEL +0 -0
- {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.1.dist-info}/licenses/LICENSE +0 -0
- {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.1.dist-info}/top_level.txt +0 -0
sdg_hub/core/blocks/llm/text_parser_block.py (new file)
@@ -0,0 +1,357 @@
# SPDX-License-Identifier: Apache-2.0
"""Text parser block for parsing and post-processing LLM outputs.

This module provides the TextParserBlock for handling output parsing using
start/end tags, custom regex patterns, and cleanup operations.
"""

# Standard
from typing import Any, Optional
import re

# Third Party
from datasets import Dataset
from pydantic import Field, field_validator, model_validator

# Local
from ...utils.logger_config import setup_logger
from ..base import BaseBlock
from ..registry import BlockRegistry

logger = setup_logger(__name__)


@BlockRegistry.register(
    "TextParserBlock",
    "llm",
    "Parses and post-processes LLM outputs using tags or regex patterns",
)
class TextParserBlock(BaseBlock):
    """Block for parsing and post-processing LLM outputs.

    This block handles output parsing using start/end tags, custom regex patterns,
    and cleanup operations. It expects exactly one input column containing raw LLM output.

    Attributes
    ----------
    block_name : str
        Unique identifier for this block instance.
    input_cols : Union[str, List[str], Dict[str, Any], None]
        Input column name(s) containing raw LLM output. Must specify exactly one column.
    output_cols : Union[str, List[str], Dict[str, Any], None]
        Output column name(s) for parsed results.
    start_tags : List[str]
        List of start tags for tag-based parsing.
    end_tags : List[str]
        List of end tags for tag-based parsing.
    parsing_pattern : Optional[str]
        Regex pattern for custom parsing.
    parser_cleanup_tags : Optional[List[str]]
        List of tags to clean from parsed output.
    expand_lists : bool
        Whether to expand list inputs into individual rows (True) or preserve lists (False).
        Default is True for backward compatibility.
    """

    start_tags: list[str] = Field(
        default_factory=list, description="List of start tags for tag-based parsing"
    )
    end_tags: list[str] = Field(
        default_factory=list, description="List of end tags for tag-based parsing"
    )
    parsing_pattern: Optional[str] = Field(
        default=None, description="Regex pattern for custom parsing"
    )
    parser_cleanup_tags: Optional[list[str]] = Field(
        default=None, description="List of tags to clean from parsed output"
    )
    expand_lists: bool = Field(
        default=True,
        description="Whether to expand list inputs into individual rows (True) or preserve lists (False). ",
    )

    @field_validator("start_tags", "end_tags", mode="before")
    @classmethod
    def normalize_tags(cls, v):
        """Normalize tag lists to ensure they are always lists."""
        if v is None:
            return []
        if isinstance(v, str):
            return [v]
        if isinstance(v, list):
            return v
        raise ValueError(f"Tags must be a string, list, or None, got {type(v)}")

    @field_validator("parser_cleanup_tags", mode="before")
    @classmethod
    def normalize_cleanup_tags(cls, v):
        """Normalize cleanup tags to ensure they are always lists when not None."""
        if v is None:
            return None
        if isinstance(v, str):
            return [v]
        if isinstance(v, list):
            return v
        raise ValueError(f"Cleanup tags must be a string, list, or None, got {type(v)}")

    @model_validator(mode="after")
    def validate_parsing_configuration(self):
        """Validate that parsing configuration is consistent."""
        # Validate that at least one parsing method is configured
        has_regex = self.parsing_pattern is not None
        has_tags = bool(self.start_tags) or bool(self.end_tags)

        if not has_regex and not has_tags:
            raise ValueError(
                "TextParserBlock requires at least one parsing method: "
                "either 'parsing_pattern' (regex) or 'start_tags'/'end_tags' (tag-based parsing)"
            )

        # Validate tag parsing configuration
        if has_tags:
            if len(self.start_tags) != len(self.end_tags):
                raise ValueError(
                    f"start_tags and end_tags must have the same length. "
                    f"Got {len(self.start_tags)} start_tags and {len(self.end_tags)} end_tags"
                )

        # We can't validate against output_cols here since they might not be normalized yet
        # This validation will be moved to _validate_custom

        return self

    def _validate_custom(self, dataset: Dataset) -> None:
        """Validate TextParserBlock specific requirements.

        Parameters
        ----------
        dataset : Dataset
            The dataset to validate.

        Raises
        ------
        ValueError
            If TextParserBlock requirements are not met.
        """
        # Validate that we have exactly one input column
        if len(self.input_cols) == 0:
            raise ValueError("TextParserBlock expects at least one input column")
        if len(self.input_cols) > 1:
            logger.warning(
                f"TextParserBlock expects exactly one input column, but got {len(self.input_cols)}. "
                f"Using the first column: {self.input_cols[0]}"
            )

        # Validate tag parsing against output columns (can only be done after model creation)
        has_tags = bool(self.start_tags) or bool(self.end_tags)
        if has_tags and len(self.start_tags) != len(self.output_cols):
            raise ValueError(
                f"When using tag-based parsing, the number of tag pairs must match output_cols. "
                f"Got {len(self.start_tags)} tag pairs and {len(self.output_cols)} output columns"
            )

    def _extract_matches(
        self, text: str, start_tag: Optional[str], end_tag: Optional[str]
    ) -> list[str]:
        if not text:
            return []
        if not start_tag and not end_tag:
            return [text.strip()]

        pattern = ""
        if start_tag:
            pattern += re.escape(start_tag)
        pattern += r"(.*?)"
        if end_tag:
            pattern += re.escape(end_tag)
        elif start_tag:
            pattern += "$"

        return [match.strip() for match in re.findall(pattern, text, re.DOTALL)]

    def _parse(self, generated_string: str) -> dict[str, list[str]]:
        if self.parsing_pattern is not None:
            return self._parse_with_regex(generated_string)
        return self._parse_with_tags(generated_string)

    def _parse_with_regex(self, generated_string: str) -> dict[str, list[str]]:
        """Parse using regex pattern."""
        if self.parsing_pattern is None:
            raise ValueError("parsing_pattern is required for regex parsing")
        pattern = re.compile(self.parsing_pattern, re.DOTALL)
        all_matches = pattern.findall(generated_string)
        matches: dict[str, list[str]] = {
            column_name: [] for column_name in self.output_cols
        }

        logger.debug(
            f"Regex parsing found {len(all_matches)} matches with pattern: {self.parsing_pattern}"
        )

        if all_matches and isinstance(all_matches[0], tuple):
            return self._process_tuple_matches(all_matches, matches)
        return self._process_single_matches(all_matches, matches)

    def _parse_with_tags(self, generated_string: str) -> dict[str, list[str]]:
        """Parse using start/end tags."""
        matches: dict[str, list[str]] = {
            column_name: [] for column_name in self.output_cols
        }

        for start_tag, end_tag, output_col in zip(
            self.start_tags, self.end_tags, self.output_cols
        ):
            extracted = self._extract_matches(generated_string, start_tag, end_tag)
            matches[output_col] = extracted
            logger.debug(
                f"Tag parsing for '{output_col}' with tags '{start_tag}'/'{end_tag}' found {len(extracted)} matches"
            )

        return matches

    def _process_tuple_matches(
        self, all_matches: list, matches: dict[str, list[str]]
    ) -> dict[str, list[str]]:
        """Process regex matches that are tuples."""
        for match in all_matches:
            for column_name, value in zip(self.output_cols, match):
                value = self._clean_value(value.strip())
                matches[column_name].append(value)
        return matches

    def _process_single_matches(
        self, all_matches: list, matches: dict[str, list[str]]
    ) -> dict[str, list[str]]:
        """Process regex matches that are single values."""
        cleaned_matches = [self._clean_value(match.strip()) for match in all_matches]
        matches[self.output_cols[0]] = cleaned_matches
        return matches

    def _clean_value(self, value: str) -> str:
        """Clean value by removing cleanup tags."""
        if self.parser_cleanup_tags:
            for clean_tag in self.parser_cleanup_tags:
                value = value.replace(clean_tag, "")
        return value

    def _generate(self, sample: dict) -> list[dict]:
        input_column = self.input_cols[0]
        raw_output = sample[input_column]

        # Handle list inputs (e.g., from LLMChatBlock with n > 1)
        if isinstance(raw_output, list):
            if not raw_output:
                logger.warning(f"Input column '{input_column}' contains empty list")
                return []

            if not self.expand_lists:
                # When expand_lists=False, preserve the list structure
                # Parse each response in the list and collect results as lists
                all_parsed_outputs = {col: [] for col in self.output_cols}
                valid_responses = 0

                for i, response in enumerate(raw_output):
                    if not response or not isinstance(response, str):
                        logger.warning(
                            f"List item {i} in column '{input_column}' contains invalid data "
                            f"(empty or non-string): {type(response)}"
                        )
                        continue

                    parsed_outputs = self._parse(response)

                    if not parsed_outputs or not any(
                        len(value) > 0 for value in parsed_outputs.values()
                    ):
                        logger.warning(
                            f"Failed to parse content from list item {i}. Raw output length: {len(response)}, "
                            f"parsing method: {'regex' if self.parsing_pattern else 'tags'}"
                        )
                        continue

                    valid_responses += 1
                    # Collect all parsed values for each column as lists
                    for col in self.output_cols:
                        all_parsed_outputs[col].extend(parsed_outputs.get(col, []))

                if valid_responses == 0:
                    return []

                # Return single row with lists as values
                # TODO: This breaks retry counting in LLMChatWithParsingRetryBlock until LLMChatWithParsingRetryBlock is re-based
                # which expects one row per successful parse for counting
                return [{**sample, **all_parsed_outputs}]

            else:
                # When expand_lists=True, use existing expanding behavior
                all_results = []
                for i, response in enumerate(raw_output):
                    if not response or not isinstance(response, str):
                        logger.warning(
                            f"List item {i} in column '{input_column}' contains invalid data "
                            f"(empty or non-string): {type(response)}"
                        )
                        continue

                    parsed_outputs = self._parse(response)

                    if not parsed_outputs or not any(
                        len(value) > 0 for value in parsed_outputs.values()
                    ):
                        logger.warning(
                            f"Failed to parse content from list item {i}. Raw output length: {len(response)}, "
                            f"parsing method: {'regex' if self.parsing_pattern else 'tags'}"
                        )
                        continue

                    # Create output rows for this response
                    max_length = max(len(value) for value in parsed_outputs.values())
                    for values in zip(
                        *(lst[:max_length] for lst in parsed_outputs.values())
                    ):
                        all_results.append(
                            {**sample, **dict(zip(parsed_outputs.keys(), values))}
                        )

                return all_results

        # Handle string inputs (existing logic)
        elif isinstance(raw_output, str):
            if not raw_output:
                logger.warning(f"Input column '{input_column}' contains empty string")
                return []

            parsed_outputs = self._parse(raw_output)

            if not parsed_outputs or not any(
                len(value) > 0 for value in parsed_outputs.values()
            ):
                logger.warning(
                    f"Failed to parse any content from input. Raw output length: {len(raw_output)}, "
                    f"parsing method: {'regex' if self.parsing_pattern else 'tags'}"
                )
                return []

            result = []
            max_length = max(len(value) for value in parsed_outputs.values())
            for values in zip(*(lst[:max_length] for lst in parsed_outputs.values())):
                result.append({**sample, **dict(zip(parsed_outputs.keys(), values))})
            return result

        else:
            logger.warning(
                f"Input column '{input_column}' contains invalid data type: {type(raw_output)}. "
                f"Expected str or List[str]"
            )
            return []

    def generate(self, samples: Dataset, **kwargs: Any) -> Dataset:
        logger.debug(f"Parsing outputs for {len(samples)} samples")
        if len(samples) == 0:
            logger.warning("No samples to parse, returning empty dataset")
            return Dataset.from_list([])

        new_data = []
        for sample in samples:
            new_data.extend(self._generate(sample))
        return Dataset.from_list(new_data)
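
For orientation, here is a minimal usage sketch of TextParserBlock as added in 0.2.1. It is not taken from the diff: the import path follows the new package layout in the file listing above, the column names and tag strings are illustrative, and it assumes that BaseBlock (sdg_hub/core/blocks/base.py, not reproduced here) accepts block_name, input_cols, and output_cols as keyword arguments, as the class docstring describes.

# Hypothetical usage sketch; names, tags, and constructor behavior of BaseBlock are assumptions.
from datasets import Dataset
from sdg_hub.core.blocks.llm.text_parser_block import TextParserBlock

parser = TextParserBlock(
    block_name="qa_parser",
    input_cols=["raw_output"],            # exactly one column holding raw LLM text
    output_cols=["question", "answer"],   # one output column per start/end tag pair
    start_tags=["[QUESTION]", "[ANSWER]"],
    end_tags=["[/QUESTION]", "[/ANSWER]"],
)

samples = Dataset.from_list(
    [
        {
            "raw_output": "[QUESTION]What does sdg_hub do?[/QUESTION]"
            "[ANSWER]It builds synthetic data generation flows.[/ANSWER]"
        }
    ]
)

parsed = parser.generate(samples)
# Each output row should carry "question" and "answer" alongside "raw_output";
# a regex alternative would set parsing_pattern instead of start_tags/end_tags.
print(parsed[0])

When an upstream LLMChatBlock returns a list of completions per row, expand_lists=True (the default) fans each successfully parsed completion out into its own row, while expand_lists=False keeps one row per sample with list-valued output columns.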
sdg_hub/core/blocks/registry.py (new file)
@@ -0,0 +1,331 @@
# SPDX-License-Identifier: Apache-2.0
"""Enhanced BlockRegistry with metadata and better error handling.

This module provides a clean registry system for blocks with metadata,
categorization, and improved error handling.
"""

# Standard
from dataclasses import dataclass
from difflib import get_close_matches
from typing import Optional
import inspect

# Third Party
from rich.console import Console
from rich.table import Table

# Local
from ..utils.logger_config import setup_logger

logger = setup_logger(__name__)
console = Console()


@dataclass
class BlockMetadata:
    """Metadata for registered blocks.

    Parameters
    ----------
    name : str
        The registered name of the block.
    block_class : Type
        The actual block class.
    category : str
        Category for organization (e.g., 'llm', 'utility', 'filtering').
    description : str, optional
        Human-readable description of what the block does.
    deprecated : bool, optional
        Whether this block is deprecated.
    replacement : str, optional
        Suggested replacement if deprecated.
    """

    name: str
    block_class: type
    category: str
    description: str = ""
    deprecated: bool = False
    replacement: Optional[str] = None

    def __post_init__(self) -> None:
        """Validate metadata after initialization."""
        if not self.name:
            raise ValueError("Block name cannot be empty")
        if not inspect.isclass(self.block_class):
            raise ValueError("block_class must be a class")


class BlockRegistry:
    """Registry for block classes with metadata and enhanced error handling."""

    _metadata: dict[str, BlockMetadata] = {}
    _categories: dict[str, set[str]] = {}

    @classmethod
    def register(
        cls,
        block_name: str,
        category: str,
        description: str = "",
        deprecated: bool = False,
        replacement: Optional[str] = None,
    ):
        """Register a block class with metadata.

        Parameters
        ----------
        block_name : str
            Name under which to register the block.
        category : str
            Category for organization.
        description : str, optional
            Human-readable description of the block.
        deprecated : bool, optional
            Whether this block is deprecated.
        replacement : str, optional
            Suggested replacement if deprecated.

        Returns
        -------
        callable
            Decorator function.
        """

        def decorator(block_class: type) -> type:
            # Validate the class
            cls._validate_block_class(block_class)

            # Create metadata
            metadata = BlockMetadata(
                name=block_name,
                block_class=block_class,
                category=category,
                description=description,
                deprecated=deprecated,
                replacement=replacement,
            )

            # Register the metadata
            cls._metadata[block_name] = metadata

            # Update category index
            if category not in cls._categories:
                cls._categories[category] = set()
            cls._categories[category].add(block_name)

            logger.debug(
                f"Registered block '{block_name}' "
                f"({block_class.__name__}) in category '{category}'"
            )

            if deprecated:
                warning_msg = f"Block '{block_name}' is deprecated."
                if replacement:
                    warning_msg += f" Use '{replacement}' instead."
                logger.warning(warning_msg)

            return block_class

        return decorator

    @classmethod
    def _validate_block_class(cls, block_class: type) -> None:
        """Validate that a class is a proper block class.

        Parameters
        ----------
        block_class : Type
            The class to validate.

        Raises
        ------
        ValueError
            If the class is not a valid block class.
        """
        if not inspect.isclass(block_class):
            raise ValueError(f"Expected a class, got {type(block_class)}")

        # Validate BaseBlock inheritance
        try:
            # Local
            from .base import BaseBlock

            if not issubclass(block_class, BaseBlock):
                raise ValueError(
                    f"Block class '{block_class.__name__}' must inherit from BaseBlock"
                )
        except ImportError as exc:
            # BaseBlock not available, check for generate method
            if not hasattr(block_class, "generate"):
                raise ValueError(
                    f"Block class '{block_class.__name__}' must implement 'generate' method"
                ) from exc

    @classmethod
    def get(cls, block_name: str) -> type:
        """Get a block class with enhanced error handling.

        Parameters
        ----------
        block_name : str
            Name of the block to retrieve.

        Returns
        -------
        Type
            The block class.

        Raises
        ------
        KeyError
            If the block is not found, with helpful suggestions.
        """
        if block_name not in cls._metadata:
            available_blocks = list(cls._metadata.keys())
            suggestions = get_close_matches(
                block_name, available_blocks, n=3, cutoff=0.6
            )

            error_msg = f"Block '{block_name}' not found in registry."

            if suggestions:
                error_msg += f" Did you mean: {', '.join(suggestions)}?"

            if available_blocks:
                error_msg += (
                    f"\nAvailable blocks: {', '.join(sorted(available_blocks))}"
                )

            if cls._categories:
                error_msg += (
                    f"\nCategories: {', '.join(sorted(cls._categories.keys()))}"
                )

            logger.error(error_msg)
            raise KeyError(error_msg)

        metadata = cls._metadata[block_name]

        if metadata.deprecated:
            warning_msg = f"Block '{block_name}' is deprecated."
            if metadata.replacement:
                warning_msg += f" Use '{metadata.replacement}' instead."
            logger.warning(warning_msg)

        return metadata.block_class

    @classmethod
    def info(cls, block_name: str) -> BlockMetadata:
        """Get metadata for a specific block.

        Parameters
        ----------
        block_name : str
            Name of the block.

        Returns
        -------
        BlockMetadata
            The block's metadata.

        Raises
        ------
        KeyError
            If the block is not found.
        """
        if block_name not in cls._metadata:
            raise KeyError(f"Block '{block_name}' not found in registry.")
        return cls._metadata[block_name]

    @classmethod
    def categories(cls) -> list[str]:
        """Get all available categories.

        Returns
        -------
        List[str]
            Sorted list of categories.
        """
        return sorted(cls._categories.keys())

    @classmethod
    def category(cls, category: str) -> list[str]:
        """Get all blocks in a specific category.

        Parameters
        ----------
        category : str
            The category to filter by.

        Returns
        -------
        List[str]
            List of block names in the category.

        Raises
        ------
        KeyError
            If the category doesn't exist.
        """
        if category not in cls._categories:
            available_categories = sorted(cls._categories.keys())
            raise KeyError(
                f"Category '{category}' not found. "
                f"Available categories: {', '.join(available_categories)}"
            )
        return sorted(cls._categories[category])

    @classmethod
    def all(cls) -> dict[str, list[str]]:
        """List all blocks organized by category.

        Returns
        -------
        Dict[str, List[str]]
            Dictionary mapping categories to lists of block names.
        """
        return {
            category: sorted(blocks) for category, blocks in cls._categories.items()
        }

    @classmethod
    def discover_blocks(cls) -> None:
        """Print a Rich-formatted table of all available blocks."""
        if not cls._metadata:
            console.print("[yellow]No blocks registered yet.[/yellow]")
            return

        table = Table(
            title="Available Blocks", show_header=True, header_style="bold magenta"
        )
        table.add_column("Block Name", style="cyan", no_wrap=True)
        table.add_column("Category", style="green")
        table.add_column("Description", style="white")

        # Sort blocks by category, then by name
        sorted_blocks = sorted(
            cls._metadata.items(), key=lambda x: (x[1].category, x[0])
        )

        for name, metadata in sorted_blocks:
            description = metadata.description or "No description"

            # Show deprecated blocks with a warning indicator in the name
            block_name = f"⚠️ {name}" if metadata.deprecated else name

            table.add_row(block_name, metadata.category, description)

        console.print(table)

        # Show summary
        total_blocks = len(cls._metadata)
        total_categories = len(cls._categories)
        deprecated_count = sum(1 for m in cls._metadata.values() if m.deprecated)

        console.print(
            f"\n[bold]Summary:[/bold] {total_blocks} blocks across {total_categories} categories"
        )
        if deprecated_count > 0:
            console.print(f"[yellow]⚠️ {deprecated_count} deprecated blocks[/yellow]")