sdg-hub 0.3.1__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. sdg_hub/_version.py +2 -2
  2. sdg_hub/core/blocks/__init__.py +2 -4
  3. sdg_hub/core/blocks/base.py +61 -6
  4. sdg_hub/core/blocks/filtering/column_value_filter.py +3 -2
  5. sdg_hub/core/blocks/llm/__init__.py +2 -4
  6. sdg_hub/core/blocks/llm/llm_chat_block.py +251 -265
  7. sdg_hub/core/blocks/llm/llm_chat_with_parsing_retry_block.py +216 -98
  8. sdg_hub/core/blocks/llm/llm_parser_block.py +320 -0
  9. sdg_hub/core/blocks/llm/text_parser_block.py +53 -152
  10. sdg_hub/core/flow/base.py +7 -4
  11. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/flow.yaml +51 -11
  12. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/doc_direct_qa/__init__.py +0 -0
  13. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/doc_direct_qa/flow.yaml +159 -0
  14. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/flow.yaml +51 -11
  15. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/flow.yaml +14 -2
  16. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +146 -26
  17. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/README.md +0 -0
  18. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/__init__.py +0 -0
  19. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/atomic_facts_ja.yaml +41 -0
  20. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/detailed_summary_ja.yaml +14 -0
  21. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/extractive_summary_ja.yaml +14 -0
  22. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/flow.yaml +304 -0
  23. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/generate_questions_responses_ja.yaml +55 -0
  24. sdg_hub/flows/text_analysis/structured_insights/flow.yaml +28 -4
  25. {sdg_hub-0.3.1.dist-info → sdg_hub-0.4.0.dist-info}/METADATA +1 -1
  26. {sdg_hub-0.3.1.dist-info → sdg_hub-0.4.0.dist-info}/RECORD +29 -25
  27. sdg_hub/core/blocks/evaluation/__init__.py +0 -9
  28. sdg_hub/core/blocks/evaluation/evaluate_faithfulness_block.py +0 -323
  29. sdg_hub/core/blocks/evaluation/evaluate_relevancy_block.py +0 -323
  30. sdg_hub/core/blocks/evaluation/verify_question_block.py +0 -329
  31. sdg_hub/core/blocks/llm/client_manager.py +0 -472
  32. sdg_hub/core/blocks/llm/config.py +0 -337
  33. {sdg_hub-0.3.1.dist-info → sdg_hub-0.4.0.dist-info}/WHEEL +0 -0
  34. {sdg_hub-0.3.1.dist-info → sdg_hub-0.4.0.dist-info}/licenses/LICENSE +0 -0
  35. {sdg_hub-0.3.1.dist-info → sdg_hub-0.4.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,320 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ """LLM parser block for extracting fields from LLM response objects.
3
+
4
+ This module provides the LLMParserBlock for extracting specific fields
5
+ (content, reasoning_content, tool_calls) from chat completion response objects.
6
+ """
7
+
8
+ # Standard
9
+ from typing import Any
10
+
11
+ # Third Party
12
+ from datasets import Dataset
13
+ from pydantic import Field, model_validator
14
+
15
+ # Local
16
+ from ...utils.logger_config import setup_logger
17
+ from ..base import BaseBlock
18
+ from ..registry import BlockRegistry
19
+
20
+ logger = setup_logger(__name__)
21
+
22
+
23
@BlockRegistry.register(
    "LLMParserBlock",
    "llm",
    "Extracts specified fields from LLM response objects",
)
class LLMParserBlock(BaseBlock):
    """Block for extracting fields from LLM response objects.

    This block extracts specified fields from chat completion response objects.
    It expects exactly one input column containing response objects (dict or
    list of dicts).

    Attributes
    ----------
    block_name : str
        Unique identifier for this block instance.
    input_cols : Union[str, List[str], Dict[str, Any], None]
        Input column name(s) containing LLM response objects. Must specify exactly one column.
    output_cols : Union[str, List[str], Dict[str, Any], None]
        Output column name(s) for extracted fields.
    extract_content : bool
        Whether to extract 'content' field from responses.
    extract_reasoning_content : bool
        Whether to extract 'reasoning_content' field from responses.
    extract_tool_calls : bool
        Whether to extract 'tool_calls' field from responses.
    expand_lists : bool
        Whether to expand list inputs into individual rows (True) or preserve lists (False).
        Default is True for backward compatibility.
    field_prefix : str
        Prefix to add to output field names. When left empty (the default),
        ``block_name + "_"`` is used as the prefix so that outputs from
        multiple parser blocks do not collide.
        Example: 'llm_' results in 'llm_content', 'llm_reasoning_content', 'llm_tool_calls'.
    """

    extract_content: bool = Field(
        default=True,
        description="Whether to extract 'content' field from responses.",
    )
    extract_reasoning_content: bool = Field(
        default=False,
        description="Whether to extract 'reasoning_content' field from responses.",
    )
    extract_tool_calls: bool = Field(
        default=False,
        description="Whether to extract 'tool_calls' field from responses.",
    )
    expand_lists: bool = Field(
        default=True,
        description="Whether to expand list inputs into individual rows (True) or preserve lists (False).",
    )
    field_prefix: str = Field(
        default="",
        description=(
            "Prefix to add to output field names (e.g., 'llm_' results in "
            "'llm_content', 'llm_reasoning_content'). An empty value defaults "
            "to '<block_name>_'."
        ),
    )

    @model_validator(mode="after")
    def validate_extraction_configuration(self):
        """Validate that at least one extraction field is enabled and pre-compute field names.

        Raises
        ------
        ValueError
            If none of extract_content / extract_reasoning_content /
            extract_tool_calls is enabled.
        """
        if not any(
            [
                self.extract_content,
                self.extract_reasoning_content,
                self.extract_tool_calls,
            ]
        ):
            raise ValueError(
                "LLMParserBlock requires at least one extraction field to be enabled: "
                "extract_content, extract_reasoning_content, or extract_tool_calls"
            )

        # Pre-compute prefixed field names for efficiency.
        # An empty prefix falls back to "<block_name>_" to keep outputs
        # unique across multiple parser blocks in one flow.
        prefix = self.field_prefix
        if prefix == "":
            prefix = self.block_name + "_"
        self._content_field = f"{prefix}content"
        self._reasoning_content_field = f"{prefix}reasoning_content"
        self._tool_calls_field = f"{prefix}tool_calls"

        # Advertise output columns for standard collision checks
        self.output_cols = self._get_output_columns()

        return self

    def _validate_custom(self, dataset: Dataset) -> None:
        """Validate LLMParserBlock specific requirements.

        Parameters
        ----------
        dataset : Dataset
            The dataset to validate.

        Raises
        ------
        ValueError
            If no input column is configured. Extra input columns only
            produce a warning; the first column is used.
        """
        # Validate that we have exactly one input column
        if len(self.input_cols) == 0:
            raise ValueError("LLMParserBlock expects at least one input column")
        if len(self.input_cols) > 1:
            logger.warning(
                f"LLMParserBlock expects exactly one input column, but got {len(self.input_cols)}. "
                f"Using the first column: {self.input_cols[0]}"
            )

    def _extract_fields_from_response(self, response: dict) -> dict[str, Any]:
        """Extract specified fields from a single response object.

        Parameters
        ----------
        response : dict
            Response object from chat completion API

        Returns
        -------
        dict[str, Any]
            Dictionary with extracted fields using prefixed field names

        Raises
        ------
        ValueError
            If none of the requested fields are found in the response
        """
        extracted: dict[str, Any] = {}
        missing_fields: list[str] = []

        # One spec per extraction target:
        # (enabled, source key, output field, replacement when None, label, kind)
        specs = [
            (
                self.extract_content,
                "content",
                self._content_field,
                "",
                "Content",
                "string",
            ),
            (
                self.extract_reasoning_content,
                "reasoning_content",
                self._reasoning_content_field,
                "",
                "Reasoning content",
                "string",
            ),
            (
                self.extract_tool_calls,
                "tool_calls",
                self._tool_calls_field,
                [],
                "Tool calls",
                "list",
            ),
        ]

        for enabled, key, out_field, fallback, label, kind in specs:
            if not enabled:
                continue
            if key not in response:
                missing_fields.append(key)
                continue
            value = response[key]
            if value is None:
                # A present-but-None field is coerced to an empty value so
                # downstream columns stay type-consistent.
                logger.warning(f"{label} field is None, using empty {kind} instead")
                value = fallback
            extracted[out_field] = value

        if missing_fields:
            logger.warning(
                f"Requested fields {missing_fields} not found in response. Available keys: {list(response.keys())}"
            )

        if not extracted:
            raise ValueError(
                f"No requested fields found in response. Available keys: {list(response.keys())}"
            )
        return extracted

    def _get_output_columns(self) -> list[str]:
        """Get the list of output columns based on extraction settings."""
        columns = []
        if self.extract_content:
            columns.append(self._content_field)
        if self.extract_reasoning_content:
            columns.append(self._reasoning_content_field)
        if self.extract_tool_calls:
            columns.append(self._tool_calls_field)
        return columns

    def _generate(self, sample: dict) -> list[dict]:
        """Dispatch a single sample to list or dict processing.

        Returns a list of output rows (possibly empty when the input value
        has an unsupported type).
        """
        input_column = self.input_cols[0]
        raw_output = sample[input_column]

        # Handle list inputs (e.g., from LLMChatBlock with n > 1)
        if isinstance(raw_output, list):
            return self._process_list_input(sample, raw_output, input_column)

        # Handle single dict input
        elif isinstance(raw_output, dict):
            return self._process_single_input(sample, raw_output)

        else:
            logger.warning(
                f"Input column '{input_column}' contains invalid data type: {type(raw_output)}. "
                f"Expected dict or list[dict]"
            )
            return []

    def _process_list_input(
        self, sample: dict, raw_output: list, input_column: str
    ) -> list[dict]:
        """Process list of response objects."""
        if not raw_output:
            logger.warning(f"Input column '{input_column}' contains empty list")
            return []

        if not self.expand_lists:
            # Preserve list structure - collect all extracted fields as lists
            return self._process_list_preserve_structure(
                sample, raw_output, input_column
            )
        else:
            # Expand lists - create individual rows for each response
            return self._process_list_expand_rows(sample, raw_output, input_column)

    def _process_list_preserve_structure(
        self, sample: dict, raw_output: list, input_column: str
    ) -> list[dict]:
        """Process list input while preserving list structure.

        Produces a single output row whose extracted columns are lists, one
        element per valid response in the input list.

        Raises
        ------
        ValueError
            If no item in the list yields any extracted field.
        """
        output_columns = self._get_output_columns()
        all_extracted: dict[str, list] = {col: [] for col in output_columns}
        valid_responses = 0

        for i, response in enumerate(raw_output):
            if not isinstance(response, dict):
                logger.warning(
                    f"List item {i} in column '{input_column}' is not a dict"
                )
                continue

            try:
                extracted = self._extract_fields_from_response(response)
                valid_responses += 1
                for col in output_columns:
                    if col in extracted:
                        all_extracted[col].append(extracted[col])
            except ValueError as e:
                logger.warning(f"Failed to extract fields from list item {i}: {e}")
                continue

        if valid_responses == 0:
            raise ValueError(
                f"No valid responses found in list input for column '{input_column}'"
            )

        # Return single row with lists as values
        return [{**sample, **all_extracted}]

    def _process_list_expand_rows(
        self, sample: dict, raw_output: list, input_column: str
    ) -> list[dict]:
        """Process list input by expanding into individual rows.

        Each valid response produces its own output row carrying a copy of
        the original sample plus the extracted fields.

        Raises
        ------
        ValueError
            If no item in the list yields any extracted field.
        """
        all_results: list[dict] = []

        for i, response in enumerate(raw_output):
            if not isinstance(response, dict):
                logger.warning(
                    f"List item {i} in column '{input_column}' is not a dict"
                )
                continue

            try:
                extracted = self._extract_fields_from_response(response)
                # Create a row for this response
                result_row = {**sample, **extracted}
                all_results.append(result_row)
            except ValueError as e:
                logger.warning(f"Failed to extract fields from list item {i}: {e}")
                continue

        if not all_results:
            raise ValueError(
                f"No valid responses found in list input for column '{input_column}'"
            )

        return all_results

    def _process_single_input(self, sample: dict, raw_output: dict) -> list[dict]:
        """Process single response object."""
        # _extract_fields_from_response raises ValueError if no fields found
        extracted = self._extract_fields_from_response(raw_output)
        return [{**sample, **extracted}]

    def generate(self, samples: Dataset, **kwargs: Any) -> Dataset:
        """Extract configured fields from every sample in the dataset.

        Parameters
        ----------
        samples : Dataset
            Dataset whose input column holds response objects.

        Returns
        -------
        Dataset
            New dataset with one or more rows per input sample depending on
            ``expand_lists``; empty input yields an empty dataset.
        """
        logger.debug(f"Extracting fields from {len(samples)} samples")
        if len(samples) == 0:
            logger.warning("No samples to process, returning empty dataset")
            return Dataset.from_list([])

        new_data = []
        for sample in samples:
            new_data.extend(self._generate(sample))
        return Dataset.from_list(new_data)
@@ -1,7 +1,7 @@
1
1
  # SPDX-License-Identifier: Apache-2.0
2
- """Text parser block for parsing and post-processing LLM outputs.
2
+ """Text parser block for parsing and post-processing text content.
3
3
 
4
- This module provides the TextParserBlock for handling output parsing using
4
+ This module provides the TextParserBlock for handling text parsing using
5
5
  start/end tags, custom regex patterns, and cleanup operations.
6
6
  """
7
7
 
@@ -24,20 +24,21 @@ logger = setup_logger(__name__)
24
24
  @BlockRegistry.register(
25
25
  "TextParserBlock",
26
26
  "llm",
27
- "Parses and post-processes LLM outputs using tags or regex patterns",
27
+ "Parses and post-processes text content using tags or regex patterns",
28
28
  )
29
29
  class TextParserBlock(BaseBlock):
30
- """Block for parsing and post-processing LLM outputs.
30
+ """Block for parsing and post-processing text content.
31
31
 
32
- This block handles output parsing using start/end tags, custom regex patterns,
33
- and cleanup operations. It expects exactly one input column containing raw LLM output.
32
+ This block handles text parsing using start/end tags, custom regex patterns,
33
+ and cleanup operations. It expects exactly one input column containing text content
34
+ as either a string or a list of strings.
34
35
 
35
36
  Attributes
36
37
  ----------
37
38
  block_name : str
38
39
  Unique identifier for this block instance.
39
40
  input_cols : Union[str, List[str], Dict[str, Any], None]
40
- Input column name(s) containing raw LLM output. Must specify exactly one column.
41
+ Input column name(s) containing text content (str or List[str]). Must specify exactly one column.
41
42
  output_cols : Union[str, List[str], Dict[str, Any], None]
42
43
  Output column name(s) for parsed results.
43
44
  start_tags : List[str]
@@ -51,10 +52,6 @@ class TextParserBlock(BaseBlock):
51
52
  expand_lists : bool
52
53
  Whether to expand list inputs into individual rows (True) or preserve lists (False).
53
54
  Default is True for backward compatibility.
54
- save_reasoning_content : bool
55
- Whether to save the reasoning content to the output.
56
- reasoning_content_field : Optional[str]
57
- The field name of the reasoning content to save to the output.
58
55
  """
59
56
 
60
57
  start_tags: list[str] = Field(
@@ -69,18 +66,6 @@ class TextParserBlock(BaseBlock):
69
66
  parser_cleanup_tags: Optional[list[str]] = Field(
70
67
  default=None, description="List of tags to clean from parsed output"
71
68
  )
72
- expand_lists: bool = Field(
73
- default=True,
74
- description="Whether to expand list inputs into individual rows (True) or preserve lists (False). ",
75
- )
76
- save_reasoning_content: bool = Field(
77
- default=False,
78
- description="Whether to save the reasoning content to the output.",
79
- )
80
- reasoning_content_field: Optional[str] = Field(
81
- default="reasoning_content",
82
- description="The field name of the reasoning content to save to the output.",
83
- )
84
69
 
85
70
  @field_validator("start_tags", "end_tags", mode="before")
86
71
  @classmethod
@@ -246,147 +231,67 @@ class TextParserBlock(BaseBlock):
246
231
  value = value.replace(clean_tag, "")
247
232
  return value
248
233
 
249
- def _handle_message(self, sample: dict) -> dict[str, list[str]]:
250
- if "content" not in sample:
251
- logger.warning(f"Content not found in sample: {sample}")
252
- return {}
253
- parsed_output = self._parse(sample["content"])
254
- if self.save_reasoning_content:
255
- parsed_output[self.reasoning_content_field] = [
256
- self._get_reasoning_content(sample)
257
- ]
258
- return parsed_output
259
-
260
- def _get_reasoning_content(self, sample: dict) -> str:
261
- if self.save_reasoning_content:
262
- if self.reasoning_content_field in sample:
263
- return sample[self.reasoning_content_field]
264
- else:
265
- logger.warning(
266
- f"Reasoning content field '{self.reasoning_content_field}' not found in response"
267
- )
268
- return ""
269
-
270
234
  def _generate(self, sample: dict) -> list[dict]:
271
235
  input_column = self.input_cols[0]
272
236
  raw_output = sample[input_column]
273
237
 
274
- # Handle list inputs (e.g., from LLMChatBlock with n > 1)
238
+ # Handle list inputs (e.g., multiple text strings to process)
275
239
  if isinstance(raw_output, list):
276
240
  if not raw_output:
277
241
  logger.warning(f"Input column '{input_column}' contains empty list")
278
242
  return []
279
243
 
280
- if not self.expand_lists:
281
- # When expand_lists=False, preserve the list structure
282
- # Parse each response in the list and collect results as lists
283
- all_parsed_outputs = {col: [] for col in self.output_cols}
284
- valid_responses = 0
285
-
286
- for i, message in enumerate(raw_output):
287
- if not message:
288
- logger.warning(
289
- f"List item {i} in column '{input_column}' is empty"
290
- )
291
- continue
292
-
293
- parsed_outputs = self._handle_message(message)
294
- if self.save_reasoning_content:
295
- reasoning_content = parsed_outputs.pop(
296
- self.reasoning_content_field
297
- )
298
-
299
- if not parsed_outputs or not any(
300
- len(value) > 0 for value in parsed_outputs.values()
301
- ):
302
- logger.warning(
303
- f"Failed to parse content from list item {i}. Raw output length: {len(message)}, "
304
- f"parsing method: {'regex' if self.parsing_pattern else 'tags'}"
305
- )
306
- continue
307
-
308
- valid_responses += 1
309
- # Collect all parsed values for each column as lists
310
- for col in self.output_cols:
311
- all_parsed_outputs[col].extend(parsed_outputs.get(col, []))
312
- if self.save_reasoning_content:
313
- if (
314
- self.block_name + "_" + self.reasoning_content_field
315
- not in all_parsed_outputs
316
- ):
317
- all_parsed_outputs[
318
- self.block_name + "_" + self.reasoning_content_field
319
- ] = []
320
- all_parsed_outputs[
321
- self.block_name + "_" + self.reasoning_content_field
322
- ].extend(reasoning_content)
323
-
324
- if valid_responses == 0:
325
- return []
326
-
327
- # Return single row with lists as values
328
- return [{**sample, **all_parsed_outputs}]
329
-
330
- else:
331
- # When expand_lists=True, use existing expanding behavior
332
- all_results = []
333
- for i, message in enumerate(raw_output):
334
- if not message:
335
- logger.warning(
336
- f"List item {i} in column '{input_column}' is empty"
337
- )
338
- continue
339
-
340
- parsed_outputs = self._handle_message(message)
341
- if self.save_reasoning_content:
342
- reasoning_content = parsed_outputs.pop(
343
- self.reasoning_content_field
344
- )
345
-
346
- if not parsed_outputs or not any(
347
- len(value) > 0 for value in parsed_outputs.values()
348
- ):
349
- logger.warning(
350
- f"Failed to parse content from list item {i}. Raw output length: {len(message)}, "
351
- f"parsing method: {'regex' if self.parsing_pattern else 'tags'}"
352
- )
353
- continue
354
-
355
- # Create output rows for this response
356
- max_length = max(len(value) for value in parsed_outputs.values())
357
- for values in zip(
358
- *(lst[:max_length] for lst in parsed_outputs.values())
359
- ):
360
- result_row = {
361
- **sample,
362
- **dict(zip(parsed_outputs.keys(), values)),
363
- }
364
- if self.save_reasoning_content:
365
- result_row[
366
- self.block_name + "_" + self.reasoning_content_field
367
- ] = reasoning_content[0]
368
- all_results.append(result_row)
369
-
370
- return all_results
371
-
372
- # Handle dict inputs (existing logic)
373
- elif isinstance(raw_output, dict) or isinstance(raw_output, str):
374
- if not raw_output:
375
- logger.warning(f"Input column '{input_column}' contains empty dict")
244
+ # Parse each text string in the list and collect results as lists
245
+ all_parsed_outputs = {col: [] for col in self.output_cols}
246
+ valid_responses = 0
247
+
248
+ for i, message in enumerate(raw_output):
249
+ # Ensure each item in the list is a string
250
+ if not isinstance(message, str):
251
+ logger.warning(
252
+ f"List item {i} in column '{input_column}' is not a string: {type(message)}. "
253
+ f"Expected List[str], skipping this item."
254
+ )
255
+ continue
256
+
257
+ if not message:
258
+ logger.warning(f"List item {i} in column '{input_column}' is empty")
259
+ continue
260
+
261
+ parsed_outputs = self._parse(message)
262
+
263
+ if not parsed_outputs or not any(
264
+ len(value) > 0 for value in parsed_outputs.values()
265
+ ):
266
+ logger.warning(
267
+ f"Failed to parse content from list item {i}. Text length: {len(message)}, "
268
+ f"parsing method: {'regex' if self.parsing_pattern else 'tags'}"
269
+ )
270
+ continue
271
+
272
+ valid_responses += 1
273
+ # Collect all parsed values for each column as lists
274
+ for col in self.output_cols:
275
+ all_parsed_outputs[col].extend(parsed_outputs.get(col, []))
276
+
277
+ if valid_responses == 0:
376
278
  return []
377
279
 
378
- if isinstance(raw_output, str):
379
- raw_output = {"content": raw_output}
280
+ # Return single row with lists as values
281
+ return [{**sample, **all_parsed_outputs}]
282
+ # Handle string inputs
283
+ elif isinstance(raw_output, str):
284
+ if not raw_output:
285
+ logger.warning(f"Input column '{input_column}' contains empty string")
286
+ return []
380
287
 
381
- parsed_outputs = self._handle_message(raw_output)
382
- if self.save_reasoning_content:
383
- reasoning_content = parsed_outputs.pop(self.reasoning_content_field)
288
+ parsed_outputs = self._parse(raw_output)
384
289
 
385
290
  if not parsed_outputs or not any(
386
291
  len(value) > 0 for value in parsed_outputs.values()
387
292
  ):
388
293
  logger.warning(
389
- f"Failed to parse any content from input. Raw output length: {len(raw_output)}, "
294
+ f"Failed to parse any content from input. Text length: {len(raw_output)}, "
390
295
  f"parsing method: {'regex' if self.parsing_pattern else 'tags'}"
391
296
  )
392
297
  return []
@@ -395,10 +300,6 @@ class TextParserBlock(BaseBlock):
395
300
  max_length = max(len(value) for value in parsed_outputs.values())
396
301
  for values in zip(*(lst[:max_length] for lst in parsed_outputs.values())):
397
302
  result_row = {**sample, **dict(zip(parsed_outputs.keys(), values))}
398
- if self.save_reasoning_content:
399
- result_row[self.block_name + "_" + self.reasoning_content_field] = (
400
- reasoning_content[0]
401
- )
402
303
  result.append(result_row)
403
304
 
404
305
  return result
@@ -406,7 +307,7 @@ class TextParserBlock(BaseBlock):
406
307
  else:
407
308
  logger.warning(
408
309
  f"Input column '{input_column}' contains invalid data type: {type(raw_output)}. "
409
- f"Expected dict or List[dict]"
310
+ f"Expected str or List[str]"
410
311
  )
411
312
  return []
412
313
 
sdg_hub/core/flow/base.py CHANGED
@@ -877,16 +877,19 @@ class Flow(BaseModel):
877
877
  f"Block '{block.block_name}': {param_name} "
878
878
  f"'{old_value}' -> '{param_value}'"
879
879
  )
880
+ ## check if allow extra
881
+ elif block.model_config["extra"] == "allow":
882
+ setattr(block, param_name, param_value)
883
+ logger.debug(
884
+ f"Block '{block.block_name}': {param_name} "
885
+ f"'{old_value}' -> '{param_value}'"
886
+ )
880
887
  else:
881
888
  logger.warning(
882
889
  f"Block '{block.block_name}' ({block.__class__.__name__}) "
883
890
  f"does not have attribute '{param_name}' - skipping"
884
891
  )
885
892
 
886
- # Reinitialize client manager for LLM blocks after updating config
887
- if hasattr(block, "_reinitialize_client_manager"):
888
- block._reinitialize_client_manager()
889
-
890
893
  modified_count += 1
891
894
 
892
895
  if modified_count > 0: