sdg-hub 0.3.1__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sdg_hub/__init__.py +0 -2
- sdg_hub/_version.py +2 -2
- sdg_hub/core/__init__.py +1 -2
- sdg_hub/core/blocks/__init__.py +2 -4
- sdg_hub/core/blocks/base.py +61 -6
- sdg_hub/core/blocks/filtering/column_value_filter.py +3 -2
- sdg_hub/core/blocks/llm/__init__.py +2 -4
- sdg_hub/core/blocks/llm/llm_chat_block.py +251 -265
- sdg_hub/core/blocks/llm/llm_chat_with_parsing_retry_block.py +216 -98
- sdg_hub/core/blocks/llm/llm_parser_block.py +320 -0
- sdg_hub/core/blocks/llm/text_parser_block.py +53 -152
- sdg_hub/core/flow/__init__.py +3 -4
- sdg_hub/core/flow/base.py +11 -73
- sdg_hub/core/flow/metadata.py +1 -68
- sdg_hub/core/flow/registry.py +0 -1
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/flow.yaml +51 -12
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/doc_direct_qa/__init__.py +0 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/doc_direct_qa/flow.yaml +158 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/flow.yaml +51 -12
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/flow.yaml +14 -3
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +147 -28
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/README.md +0 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/__init__.py +0 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/atomic_facts_ja.yaml +41 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/detailed_summary_ja.yaml +14 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/extractive_summary_ja.yaml +14 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/flow.yaml +303 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/generate_questions_responses_ja.yaml +55 -0
- sdg_hub/flows/text_analysis/structured_insights/flow.yaml +28 -5
- {sdg_hub-0.3.1.dist-info → sdg_hub-0.4.1.dist-info}/METADATA +2 -1
- {sdg_hub-0.3.1.dist-info → sdg_hub-0.4.1.dist-info}/RECORD +34 -30
- sdg_hub/core/blocks/evaluation/__init__.py +0 -9
- sdg_hub/core/blocks/evaluation/evaluate_faithfulness_block.py +0 -323
- sdg_hub/core/blocks/evaluation/evaluate_relevancy_block.py +0 -323
- sdg_hub/core/blocks/evaluation/verify_question_block.py +0 -329
- sdg_hub/core/blocks/llm/client_manager.py +0 -472
- sdg_hub/core/blocks/llm/config.py +0 -337
- {sdg_hub-0.3.1.dist-info → sdg_hub-0.4.1.dist-info}/WHEEL +0 -0
- {sdg_hub-0.3.1.dist-info → sdg_hub-0.4.1.dist-info}/licenses/LICENSE +0 -0
- {sdg_hub-0.3.1.dist-info → sdg_hub-0.4.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,320 @@
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
2
|
+
"""LLM parser block for extracting fields from LLM response objects.
|
3
|
+
|
4
|
+
This module provides the LLMParserBlock for extracting specific fields
|
5
|
+
(content, reasoning_content, tool_calls) from chat completion response objects.
|
6
|
+
"""
|
7
|
+
|
8
|
+
# Standard
|
9
|
+
from typing import Any
|
10
|
+
|
11
|
+
# Third Party
|
12
|
+
from datasets import Dataset
|
13
|
+
from pydantic import Field, model_validator
|
14
|
+
|
15
|
+
# Local
|
16
|
+
from ...utils.logger_config import setup_logger
|
17
|
+
from ..base import BaseBlock
|
18
|
+
from ..registry import BlockRegistry
|
19
|
+
|
20
|
+
logger = setup_logger(__name__)
|
21
|
+
|
22
|
+
|
23
|
+
@BlockRegistry.register(
    "LLMParserBlock",
    "llm",
    "Extracts specified fields from LLM response objects",
)
class LLMParserBlock(BaseBlock):
    """Block for extracting fields from LLM response objects.

    This block extracts specified fields from chat completion response objects.
    It expects exactly one input column containing response objects (dict or list of dicts).

    Attributes
    ----------
    block_name : str
        Unique identifier for this block instance.
    input_cols : Union[str, List[str], Dict[str, Any], None]
        Input column name(s) containing LLM response objects. Must specify exactly one column.
    output_cols : Union[str, List[str], Dict[str, Any], None]
        Output column name(s) for extracted fields. Note: this is overwritten by
        ``validate_extraction_configuration`` with the computed prefixed field names.
    extract_content : bool
        Whether to extract 'content' field from responses.
    extract_reasoning_content : bool
        Whether to extract 'reasoning_content' field from responses.
    extract_tool_calls : bool
        Whether to extract 'tool_calls' field from responses.
    expand_lists : bool
        Whether to expand list inputs into individual rows (True) or preserve lists (False).
        Default is True for backward compatibility.
    field_prefix : str
        Prefix to add to output field names. Default is empty string, in which case
        the block falls back to ``"<block_name>_"`` as the prefix (see
        ``validate_extraction_configuration``).
        Example: 'llm_' results in 'llm_content', 'llm_reasoning_content', 'llm_tool_calls'.
    """

    # Pydantic model fields controlling which response keys are extracted and
    # how list-valued inputs are laid out in the output dataset.
    extract_content: bool = Field(
        default=True,
        description="Whether to extract 'content' field from responses.",
    )
    extract_reasoning_content: bool = Field(
        default=False,
        description="Whether to extract 'reasoning_content' field from responses.",
    )
    extract_tool_calls: bool = Field(
        default=False,
        description="Whether to extract 'tool_calls' field from responses.",
    )
    expand_lists: bool = Field(
        default=True,
        description="Whether to expand list inputs into individual rows (True) or preserve lists (False).",
    )
    field_prefix: str = Field(
        default="",
        description="Prefix to add to output field names (e.g., 'llm_' results in 'llm_content', 'llm_reasoning_content').",
    )

    @model_validator(mode="after")
    def validate_extraction_configuration(self):
        """Validate that at least one extraction field is enabled and pre-compute field names."""
        if not any(
            [
                self.extract_content,
                self.extract_reasoning_content,
                self.extract_tool_calls,
            ]
        ):
            raise ValueError(
                "LLMParserBlock requires at least one extraction field to be enabled: "
                "extract_content, extract_reasoning_content, or extract_tool_calls"
            )

        # Pre-compute prefixed field names for efficiency
        # An empty field_prefix falls back to "<block_name>_" — presumably to keep
        # output columns unique across multiple parser blocks; confirm with callers.
        prefix = self.field_prefix
        if prefix == "":
            prefix = self.block_name + "_"
        self._content_field = f"{prefix}content"
        self._reasoning_content_field = f"{prefix}reasoning_content"
        self._tool_calls_field = f"{prefix}tool_calls"

        # Advertise output columns for standard collision checks
        self.output_cols = self._get_output_columns()

        return self

    def _validate_custom(self, dataset: Dataset) -> None:
        """Validate LLMParserBlock specific requirements.

        Requires at least one input column; extra columns beyond the first are
        tolerated with a warning (only ``input_cols[0]`` is ever read).

        Parameters
        ----------
        dataset : Dataset
            The dataset to validate.

        Raises
        ------
        ValueError
            If LLMParserBlock requirements are not met.
        """
        # Validate that we have exactly one input column
        if len(self.input_cols) == 0:
            raise ValueError("LLMParserBlock expects at least one input column")
        if len(self.input_cols) > 1:
            logger.warning(
                f"LLMParserBlock expects exactly one input column, but got {len(self.input_cols)}. "
                f"Using the first column: {self.input_cols[0]}"
            )

    def _extract_fields_from_response(self, response: dict) -> dict[str, Any]:
        """Extract specified fields from a single response object.

        Present-but-None fields are coerced to benign defaults ("" for text
        fields, [] for tool_calls); absent fields are skipped with a warning.

        Parameters
        ----------
        response : dict
            Response object from chat completion API

        Returns
        -------
        dict[str, Any]
            Dictionary with extracted fields using prefixed field names

        Raises
        ------
        ValueError
            If none of the requested fields are found in the response
        """
        extracted = {}
        missing_fields = []

        if self.extract_content:
            if "content" not in response:
                missing_fields.append("content")
            else:
                if response["content"] is None:
                    ## skip this field
                    logger.warning("Content field is None, using empty string instead")
                    extracted[self._content_field] = ""
                else:
                    extracted[self._content_field] = response["content"]

        if self.extract_reasoning_content:
            if "reasoning_content" not in response:
                missing_fields.append("reasoning_content")
            else:
                if response["reasoning_content"] is None:
                    ## skip this field
                    logger.warning(
                        "Reasoning content field is None, using empty string instead"
                    )
                    extracted[self._reasoning_content_field] = ""
                else:
                    extracted[self._reasoning_content_field] = response[
                        "reasoning_content"
                    ]

        if self.extract_tool_calls:
            if "tool_calls" not in response:
                missing_fields.append("tool_calls")
            else:
                if response["tool_calls"] is None:
                    ## skip this field
                    logger.warning("Tool calls field is None, using empty list instead")
                    extracted[self._tool_calls_field] = []
                else:
                    extracted[self._tool_calls_field] = response["tool_calls"]

        if missing_fields:
            logger.warning(
                f"Requested fields {missing_fields} not found in response. Available keys: {list(response.keys())}"
            )

        if not extracted:
            raise ValueError(
                f"No requested fields found in response. Available keys: {list(response.keys())}"
            )
        return extracted

    def _get_output_columns(self) -> list[str]:
        """Get the list of output columns based on extraction settings."""
        columns = []
        if self.extract_content:
            columns.append(self._content_field)
        if self.extract_reasoning_content:
            columns.append(self._reasoning_content_field)
        if self.extract_tool_calls:
            columns.append(self._tool_calls_field)
        return columns

    def _generate(self, sample: dict) -> list[dict]:
        # Dispatch a single dataset row to the list or dict handler based on
        # the runtime type of the input column's value. Returns zero or more
        # output rows (a row can fan out when expand_lists=True).
        input_column = self.input_cols[0]
        raw_output = sample[input_column]

        # Handle list inputs (e.g., from LLMChatBlock with n > 1)
        if isinstance(raw_output, list):
            return self._process_list_input(sample, raw_output, input_column)

        # Handle single dict input
        elif isinstance(raw_output, dict):
            return self._process_single_input(sample, raw_output)

        else:
            logger.warning(
                f"Input column '{input_column}' contains invalid data type: {type(raw_output)}. "
                f"Expected dict or list[dict]"
            )
            return []

    def _process_list_input(
        self, sample: dict, raw_output: list, input_column: str
    ) -> list[dict]:
        """Process list of response objects."""
        if not raw_output:
            logger.warning(f"Input column '{input_column}' contains empty list")
            return []

        if not self.expand_lists:
            # Preserve list structure - collect all extracted fields as lists
            return self._process_list_preserve_structure(
                sample, raw_output, input_column
            )
        else:
            # Expand lists - create individual rows for each response
            return self._process_list_expand_rows(sample, raw_output, input_column)

    def _process_list_preserve_structure(
        self, sample: dict, raw_output: list, input_column: str
    ) -> list[dict]:
        """Process list input while preserving list structure.

        Produces a single output row whose extracted columns hold one list
        entry per valid response. Non-dict or unparsable items are skipped
        with a warning.

        Raises
        ------
        ValueError
            If no item in the list yields any extracted field.
        """
        output_columns = self._get_output_columns()
        all_extracted = {col: [] for col in output_columns}
        valid_responses = 0

        for i, response in enumerate(raw_output):
            if not isinstance(response, dict):
                logger.warning(
                    f"List item {i} in column '{input_column}' is not a dict"
                )
                continue

            try:
                extracted = self._extract_fields_from_response(response)
                valid_responses += 1
                # NOTE(review): items missing a requested field contribute no
                # entry for that column, so per-column lists can end up with
                # different lengths — confirm downstream consumers tolerate this.
                for col in output_columns:
                    if col in extracted:
                        all_extracted[col].append(extracted[col])
            except ValueError as e:
                logger.warning(f"Failed to extract fields from list item {i}: {e}")
                continue

        if valid_responses == 0:
            raise ValueError(
                f"No valid responses found in list input for column '{input_column}'"
            )

        # Return single row with lists as values
        return [{**sample, **all_extracted}]

    def _process_list_expand_rows(
        self, sample: dict, raw_output: list, input_column: str
    ) -> list[dict]:
        """Process list input by expanding into individual rows.

        Each valid response becomes its own output row (original sample columns
        duplicated). Non-dict or unparsable items are skipped with a warning.

        Raises
        ------
        ValueError
            If no item in the list yields any extracted field.
        """
        all_results = []

        for i, response in enumerate(raw_output):
            if not isinstance(response, dict):
                logger.warning(
                    f"List item {i} in column '{input_column}' is not a dict"
                )
                continue

            try:
                extracted = self._extract_fields_from_response(response)
                # Create a row for this response
                result_row = {**sample, **extracted}
                all_results.append(result_row)
            except ValueError as e:
                logger.warning(f"Failed to extract fields from list item {i}: {e}")
                continue

        if not all_results:
            raise ValueError(
                f"No valid responses found in list input for column '{input_column}'"
            )

        return all_results

    def _process_single_input(self, sample: dict, raw_output: dict) -> list[dict]:
        """Process single response object."""
        # _extract_fields_from_response now raises ValueError if no fields found
        extracted = self._extract_fields_from_response(raw_output)
        return [{**sample, **extracted}]

    def generate(self, samples: Dataset, **kwargs: Any) -> Dataset:
        # Map every input row through _generate; the output row count may
        # differ from the input (rows can be dropped or fanned out).
        logger.debug(f"Extracting fields from {len(samples)} samples")
        if len(samples) == 0:
            logger.warning("No samples to process, returning empty dataset")
            return Dataset.from_list([])

        new_data = []
        for sample in samples:
            new_data.extend(self._generate(sample))
        return Dataset.from_list(new_data)
|
@@ -1,7 +1,7 @@
|
|
1
1
|
# SPDX-License-Identifier: Apache-2.0
|
2
|
-
"""Text parser block for parsing and post-processing
|
2
|
+
"""Text parser block for parsing and post-processing text content.
|
3
3
|
|
4
|
-
This module provides the TextParserBlock for handling
|
4
|
+
This module provides the TextParserBlock for handling text parsing using
|
5
5
|
start/end tags, custom regex patterns, and cleanup operations.
|
6
6
|
"""
|
7
7
|
|
@@ -24,20 +24,21 @@ logger = setup_logger(__name__)
|
|
24
24
|
@BlockRegistry.register(
|
25
25
|
"TextParserBlock",
|
26
26
|
"llm",
|
27
|
-
"Parses and post-processes
|
27
|
+
"Parses and post-processes text content using tags or regex patterns",
|
28
28
|
)
|
29
29
|
class TextParserBlock(BaseBlock):
|
30
|
-
"""Block for parsing and post-processing
|
30
|
+
"""Block for parsing and post-processing text content.
|
31
31
|
|
32
|
-
This block handles
|
33
|
-
and cleanup operations. It expects exactly one input column containing
|
32
|
+
This block handles text parsing using start/end tags, custom regex patterns,
|
33
|
+
and cleanup operations. It expects exactly one input column containing text content
|
34
|
+
as either a string or a list of strings.
|
34
35
|
|
35
36
|
Attributes
|
36
37
|
----------
|
37
38
|
block_name : str
|
38
39
|
Unique identifier for this block instance.
|
39
40
|
input_cols : Union[str, List[str], Dict[str, Any], None]
|
40
|
-
Input column name(s) containing
|
41
|
+
Input column name(s) containing text content (str or List[str]). Must specify exactly one column.
|
41
42
|
output_cols : Union[str, List[str], Dict[str, Any], None]
|
42
43
|
Output column name(s) for parsed results.
|
43
44
|
start_tags : List[str]
|
@@ -51,10 +52,6 @@ class TextParserBlock(BaseBlock):
|
|
51
52
|
expand_lists : bool
|
52
53
|
Whether to expand list inputs into individual rows (True) or preserve lists (False).
|
53
54
|
Default is True for backward compatibility.
|
54
|
-
save_reasoning_content : bool
|
55
|
-
Whether to save the reasoning content to the output.
|
56
|
-
reasoning_content_field : Optional[str]
|
57
|
-
The field name of the reasoning content to save to the output.
|
58
55
|
"""
|
59
56
|
|
60
57
|
start_tags: list[str] = Field(
|
@@ -69,18 +66,6 @@ class TextParserBlock(BaseBlock):
|
|
69
66
|
parser_cleanup_tags: Optional[list[str]] = Field(
|
70
67
|
default=None, description="List of tags to clean from parsed output"
|
71
68
|
)
|
72
|
-
expand_lists: bool = Field(
|
73
|
-
default=True,
|
74
|
-
description="Whether to expand list inputs into individual rows (True) or preserve lists (False). ",
|
75
|
-
)
|
76
|
-
save_reasoning_content: bool = Field(
|
77
|
-
default=False,
|
78
|
-
description="Whether to save the reasoning content to the output.",
|
79
|
-
)
|
80
|
-
reasoning_content_field: Optional[str] = Field(
|
81
|
-
default="reasoning_content",
|
82
|
-
description="The field name of the reasoning content to save to the output.",
|
83
|
-
)
|
84
69
|
|
85
70
|
@field_validator("start_tags", "end_tags", mode="before")
|
86
71
|
@classmethod
|
@@ -246,147 +231,67 @@ class TextParserBlock(BaseBlock):
|
|
246
231
|
value = value.replace(clean_tag, "")
|
247
232
|
return value
|
248
233
|
|
249
|
-
def _handle_message(self, sample: dict) -> dict[str, list[str]]:
|
250
|
-
if "content" not in sample:
|
251
|
-
logger.warning(f"Content not found in sample: {sample}")
|
252
|
-
return {}
|
253
|
-
parsed_output = self._parse(sample["content"])
|
254
|
-
if self.save_reasoning_content:
|
255
|
-
parsed_output[self.reasoning_content_field] = [
|
256
|
-
self._get_reasoning_content(sample)
|
257
|
-
]
|
258
|
-
return parsed_output
|
259
|
-
|
260
|
-
def _get_reasoning_content(self, sample: dict) -> str:
|
261
|
-
if self.save_reasoning_content:
|
262
|
-
if self.reasoning_content_field in sample:
|
263
|
-
return sample[self.reasoning_content_field]
|
264
|
-
else:
|
265
|
-
logger.warning(
|
266
|
-
f"Reasoning content field '{self.reasoning_content_field}' not found in response"
|
267
|
-
)
|
268
|
-
return ""
|
269
|
-
|
270
234
|
def _generate(self, sample: dict) -> list[dict]:
|
271
235
|
input_column = self.input_cols[0]
|
272
236
|
raw_output = sample[input_column]
|
273
237
|
|
274
|
-
# Handle list inputs (e.g.,
|
238
|
+
# Handle list inputs (e.g., multiple text strings to process)
|
275
239
|
if isinstance(raw_output, list):
|
276
240
|
if not raw_output:
|
277
241
|
logger.warning(f"Input column '{input_column}' contains empty list")
|
278
242
|
return []
|
279
243
|
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
self.block_name + "_" + self.reasoning_content_field
|
315
|
-
not in all_parsed_outputs
|
316
|
-
):
|
317
|
-
all_parsed_outputs[
|
318
|
-
self.block_name + "_" + self.reasoning_content_field
|
319
|
-
] = []
|
320
|
-
all_parsed_outputs[
|
321
|
-
self.block_name + "_" + self.reasoning_content_field
|
322
|
-
].extend(reasoning_content)
|
323
|
-
|
324
|
-
if valid_responses == 0:
|
325
|
-
return []
|
326
|
-
|
327
|
-
# Return single row with lists as values
|
328
|
-
return [{**sample, **all_parsed_outputs}]
|
329
|
-
|
330
|
-
else:
|
331
|
-
# When expand_lists=True, use existing expanding behavior
|
332
|
-
all_results = []
|
333
|
-
for i, message in enumerate(raw_output):
|
334
|
-
if not message:
|
335
|
-
logger.warning(
|
336
|
-
f"List item {i} in column '{input_column}' is empty"
|
337
|
-
)
|
338
|
-
continue
|
339
|
-
|
340
|
-
parsed_outputs = self._handle_message(message)
|
341
|
-
if self.save_reasoning_content:
|
342
|
-
reasoning_content = parsed_outputs.pop(
|
343
|
-
self.reasoning_content_field
|
344
|
-
)
|
345
|
-
|
346
|
-
if not parsed_outputs or not any(
|
347
|
-
len(value) > 0 for value in parsed_outputs.values()
|
348
|
-
):
|
349
|
-
logger.warning(
|
350
|
-
f"Failed to parse content from list item {i}. Raw output length: {len(message)}, "
|
351
|
-
f"parsing method: {'regex' if self.parsing_pattern else 'tags'}"
|
352
|
-
)
|
353
|
-
continue
|
354
|
-
|
355
|
-
# Create output rows for this response
|
356
|
-
max_length = max(len(value) for value in parsed_outputs.values())
|
357
|
-
for values in zip(
|
358
|
-
*(lst[:max_length] for lst in parsed_outputs.values())
|
359
|
-
):
|
360
|
-
result_row = {
|
361
|
-
**sample,
|
362
|
-
**dict(zip(parsed_outputs.keys(), values)),
|
363
|
-
}
|
364
|
-
if self.save_reasoning_content:
|
365
|
-
result_row[
|
366
|
-
self.block_name + "_" + self.reasoning_content_field
|
367
|
-
] = reasoning_content[0]
|
368
|
-
all_results.append(result_row)
|
369
|
-
|
370
|
-
return all_results
|
371
|
-
|
372
|
-
# Handle dict inputs (existing logic)
|
373
|
-
elif isinstance(raw_output, dict) or isinstance(raw_output, str):
|
374
|
-
if not raw_output:
|
375
|
-
logger.warning(f"Input column '{input_column}' contains empty dict")
|
244
|
+
# Parse each text string in the list and collect results as lists
|
245
|
+
all_parsed_outputs = {col: [] for col in self.output_cols}
|
246
|
+
valid_responses = 0
|
247
|
+
|
248
|
+
for i, message in enumerate(raw_output):
|
249
|
+
# Ensure each item in the list is a string
|
250
|
+
if not isinstance(message, str):
|
251
|
+
logger.warning(
|
252
|
+
f"List item {i} in column '{input_column}' is not a string: {type(message)}. "
|
253
|
+
f"Expected List[str], skipping this item."
|
254
|
+
)
|
255
|
+
continue
|
256
|
+
|
257
|
+
if not message:
|
258
|
+
logger.warning(f"List item {i} in column '{input_column}' is empty")
|
259
|
+
continue
|
260
|
+
|
261
|
+
parsed_outputs = self._parse(message)
|
262
|
+
|
263
|
+
if not parsed_outputs or not any(
|
264
|
+
len(value) > 0 for value in parsed_outputs.values()
|
265
|
+
):
|
266
|
+
logger.warning(
|
267
|
+
f"Failed to parse content from list item {i}. Text length: {len(message)}, "
|
268
|
+
f"parsing method: {'regex' if self.parsing_pattern else 'tags'}"
|
269
|
+
)
|
270
|
+
continue
|
271
|
+
|
272
|
+
valid_responses += 1
|
273
|
+
# Collect all parsed values for each column as lists
|
274
|
+
for col in self.output_cols:
|
275
|
+
all_parsed_outputs[col].extend(parsed_outputs.get(col, []))
|
276
|
+
|
277
|
+
if valid_responses == 0:
|
376
278
|
return []
|
377
279
|
|
378
|
-
|
379
|
-
|
280
|
+
# Return single row with lists as values
|
281
|
+
return [{**sample, **all_parsed_outputs}]
|
282
|
+
# Handle string inputs
|
283
|
+
elif isinstance(raw_output, str):
|
284
|
+
if not raw_output:
|
285
|
+
logger.warning(f"Input column '{input_column}' contains empty string")
|
286
|
+
return []
|
380
287
|
|
381
|
-
parsed_outputs = self.
|
382
|
-
if self.save_reasoning_content:
|
383
|
-
reasoning_content = parsed_outputs.pop(self.reasoning_content_field)
|
288
|
+
parsed_outputs = self._parse(raw_output)
|
384
289
|
|
385
290
|
if not parsed_outputs or not any(
|
386
291
|
len(value) > 0 for value in parsed_outputs.values()
|
387
292
|
):
|
388
293
|
logger.warning(
|
389
|
-
f"Failed to parse any content from input.
|
294
|
+
f"Failed to parse any content from input. Text length: {len(raw_output)}, "
|
390
295
|
f"parsing method: {'regex' if self.parsing_pattern else 'tags'}"
|
391
296
|
)
|
392
297
|
return []
|
@@ -395,10 +300,6 @@ class TextParserBlock(BaseBlock):
|
|
395
300
|
max_length = max(len(value) for value in parsed_outputs.values())
|
396
301
|
for values in zip(*(lst[:max_length] for lst in parsed_outputs.values())):
|
397
302
|
result_row = {**sample, **dict(zip(parsed_outputs.keys(), values))}
|
398
|
-
if self.save_reasoning_content:
|
399
|
-
result_row[self.block_name + "_" + self.reasoning_content_field] = (
|
400
|
-
reasoning_content[0]
|
401
|
-
)
|
402
303
|
result.append(result_row)
|
403
304
|
|
404
305
|
return result
|
@@ -406,7 +307,7 @@ class TextParserBlock(BaseBlock):
|
|
406
307
|
else:
|
407
308
|
logger.warning(
|
408
309
|
f"Input column '{input_column}' contains invalid data type: {type(raw_output)}. "
|
409
|
-
f"Expected
|
310
|
+
f"Expected str or List[str]"
|
410
311
|
)
|
411
312
|
return []
|
412
313
|
|
sdg_hub/core/flow/__init__.py
CHANGED
@@ -1,20 +1,19 @@
|
|
1
1
|
# SPDX-License-Identifier: Apache-2.0
|
2
2
|
"""New flow implementation for SDG Hub.
|
3
3
|
|
4
|
-
This module provides a redesigned Flow class with metadata support
|
5
|
-
dual initialization modes
|
4
|
+
This module provides a redesigned Flow class with metadata support
|
5
|
+
and dual initialization modes.
|
6
6
|
"""
|
7
7
|
|
8
8
|
# Local
|
9
9
|
from .base import Flow
|
10
|
-
from .metadata import FlowMetadata
|
10
|
+
from .metadata import FlowMetadata
|
11
11
|
from .registry import FlowRegistry
|
12
12
|
from .validation import FlowValidator
|
13
13
|
|
14
14
|
__all__ = [
|
15
15
|
"Flow",
|
16
16
|
"FlowMetadata",
|
17
|
-
"FlowParameter",
|
18
17
|
"FlowRegistry",
|
19
18
|
"FlowValidator",
|
20
19
|
]
|