sdg-hub 0.2.0__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sdg_hub/_version.py +2 -2
- sdg_hub/core/blocks/llm/__init__.py +2 -0
- sdg_hub/core/blocks/llm/llm_chat_with_parsing_retry_block.py +491 -0
- sdg_hub/core/blocks/llm/text_parser_block.py +77 -30
- sdg_hub/core/blocks/registry.py +1 -1
- sdg_hub/core/flow/base.py +243 -14
- sdg_hub/core/flow/checkpointer.py +333 -0
- sdg_hub/core/flow/metadata.py +45 -0
- sdg_hub/core/flow/migration.py +12 -1
- sdg_hub/core/flow/registry.py +121 -58
- sdg_hub/core/flow/validation.py +12 -0
- sdg_hub/core/utils/__init__.py +2 -1
- sdg_hub/core/utils/datautils.py +52 -1
- sdg_hub/core/utils/flow_id_words.yaml +231 -0
- sdg_hub/core/utils/flow_identifier.py +94 -0
- sdg_hub/core/utils/yaml_utils.py +59 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +1 -0
- {sdg_hub-0.2.0.dist-info → sdg_hub-0.2.1.dist-info}/METADATA +21 -18
- {sdg_hub-0.2.0.dist-info → sdg_hub-0.2.1.dist-info}/RECORD +22 -17
- {sdg_hub-0.2.0.dist-info → sdg_hub-0.2.1.dist-info}/WHEEL +0 -0
- {sdg_hub-0.2.0.dist-info → sdg_hub-0.2.1.dist-info}/licenses/LICENSE +0 -0
- {sdg_hub-0.2.0.dist-info → sdg_hub-0.2.1.dist-info}/top_level.txt +0 -0
sdg_hub/_version.py
CHANGED
@@ -11,6 +11,7 @@ from .client_manager import LLMClientManager
|
|
11
11
|
from .config import LLMConfig
|
12
12
|
from .error_handler import ErrorCategory, LLMErrorHandler
|
13
13
|
from .llm_chat_block import LLMChatBlock
|
14
|
+
from .llm_chat_with_parsing_retry_block import LLMChatWithParsingRetryBlock
|
14
15
|
from .prompt_builder_block import PromptBuilderBlock
|
15
16
|
from .text_parser_block import TextParserBlock
|
16
17
|
|
@@ -20,6 +21,7 @@ __all__ = [
|
|
20
21
|
"LLMErrorHandler",
|
21
22
|
"ErrorCategory",
|
22
23
|
"LLMChatBlock",
|
24
|
+
"LLMChatWithParsingRetryBlock",
|
23
25
|
"PromptBuilderBlock",
|
24
26
|
"TextParserBlock",
|
25
27
|
]
|
@@ -0,0 +1,491 @@
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
2
|
+
"""Composite block combining LLM chat and text parsing with retry logic.
|
3
|
+
|
4
|
+
This module provides the LLMChatWithParsingRetryBlock that encapsulates the complete
|
5
|
+
LLM generation and parsing workflow with automatic retry on parsing failures.
|
6
|
+
"""
|
7
|
+
|
8
|
+
# Standard
|
9
|
+
from typing import Any, Optional
|
10
|
+
|
11
|
+
# Third Party
|
12
|
+
from datasets import Dataset
|
13
|
+
from pydantic import ConfigDict, Field, field_validator
|
14
|
+
|
15
|
+
from ...utils.error_handling import BlockValidationError
|
16
|
+
|
17
|
+
# Local
|
18
|
+
from ...utils.logger_config import setup_logger
|
19
|
+
from ..base import BaseBlock
|
20
|
+
from ..registry import BlockRegistry
|
21
|
+
from .llm_chat_block import LLMChatBlock
|
22
|
+
from .text_parser_block import TextParserBlock
|
23
|
+
|
24
|
+
logger = setup_logger(__name__)
|
25
|
+
|
26
|
+
|
27
|
+
class MaxRetriesExceededError(Exception):
    """Raised when maximum retry attempts are exceeded without achieving target count.

    Parameters
    ----------
    target_count : int
        Number of successful parses that were required.
    actual_count : int
        Number of successful parses actually obtained.
    max_retries : int
        Number of attempts that were made before giving up.

    Attributes
    ----------
    target_count, actual_count, max_retries : int
        The constructor arguments, stored unchanged so callers can inspect
        them after catching the exception.
    """

    def __init__(self, target_count: int, actual_count: int, max_retries: int):
        self.target_count = target_count
        self.actual_count = actual_count
        self.max_retries = max_retries
        super().__init__(
            f"Failed to achieve target count {target_count} after {max_retries} retries. "
            f"Only got {actual_count} successful parses."
        )

    def __reduce__(self):
        """Return the pickling recipe for this exception.

        ``Exception`` pickles itself as ``(type, self.args)``, and ``args``
        only holds the formatted message here. Rebuilding from that would call
        ``__init__`` with a single argument and fail with ``TypeError``, so
        the three original constructor arguments are supplied instead.
        """
        return (
            type(self),
            (self.target_count, self.actual_count, self.max_retries),
        )
|
38
|
+
|
39
|
+
|
40
|
+
@BlockRegistry.register(
    "LLMChatWithParsingRetryBlock",
    "llm",
    "Composite block combining LLM chat and text parsing with automatic retry on parsing failures",
)
class LLMChatWithParsingRetryBlock(BaseBlock):
    r"""Composite block for LLM generation with parsing retry logic.

    This block combines LLMChatBlock and TextParserBlock into a single cohesive block
    that automatically retries LLM generation when parsing fails, accumulating successful
    results until the target count is reached or max retries exceeded.

    Keyword arguments are routed by name: those matching a field of
    ``LLMChatBlock`` are forwarded to the internal chat block, those matching a
    field of ``TextParserBlock`` are forwarded to the internal parser block.

    Parameters
    ----------
    block_name : str
        Name of the block.
    input_cols : Union[str, List[str]]
        Input column name(s). Should contain the messages list.
    output_cols : Union[str, List[str]]
        Output column name(s) for parsed results.
    model : str
        Model identifier in LiteLLM format.
    api_base : Optional[str]
        Base URL for the API. Required for local models.
    api_key : Optional[str]
        API key for the provider. Falls back to environment variables.
    parsing_max_retries : int, optional
        Maximum number of retry attempts for parsing failures (default: 3).
        This is different from max_retries, which handles LLM network/API failures.

    ### LLM Generation Parameters ###
    async_mode : bool, optional
        Whether to use async processing (default: False).
    timeout : float, optional
        Request timeout in seconds (default: 120.0).
    max_retries : int, optional
        Maximum number of LLM retry attempts for network failures (default: 6).
    temperature : Optional[float], optional
        Sampling temperature (0.0 to 2.0).
    max_tokens : Optional[int], optional
        Maximum tokens to generate.
    top_p : Optional[float], optional
        Nucleus sampling parameter (0.0 to 1.0).
    frequency_penalty : Optional[float], optional
        Frequency penalty (-2.0 to 2.0).
    presence_penalty : Optional[float], optional
        Presence penalty (-2.0 to 2.0).
    stop : Optional[Union[str, List[str]]], optional
        Stop sequences.
    seed : Optional[int], optional
        Random seed for reproducible outputs.
    response_format : Optional[Dict[str, Any]], optional
        Response format specification (e.g., JSON mode).
    stream : Optional[bool], optional
        Whether to stream responses.
    n : Optional[int], optional
        Number of completions to generate per retry attempt.
    logprobs : Optional[bool], optional
        Whether to return log probabilities.
    top_logprobs : Optional[int], optional
        Number of top log probabilities to return.
    user : Optional[str], optional
        End-user identifier.
    extra_headers : Optional[Dict[str, str]], optional
        Additional headers to send with requests.
    extra_body : Optional[Dict[str, Any]], optional
        Additional parameters for the request body.
    provider_specific : Optional[Dict[str, Any]], optional
        Provider-specific parameters.

    ### Text Parser Parameters ###
    start_tags : List[str], optional
        List of start tags for tag-based parsing.
    end_tags : List[str], optional
        List of end tags for tag-based parsing.
    parsing_pattern : Optional[str], optional
        Regex pattern for custom parsing.
    parser_cleanup_tags : Optional[List[str]], optional
        List of tags to clean from parsed output.

    Examples
    --------
    >>> # Basic JSON parsing with retry
    >>> block = LLMChatWithParsingRetryBlock(
    ...     block_name="json_retry_block",
    ...     input_cols="messages",
    ...     output_cols="parsed_json",
    ...     model="openai/gpt-4",
    ...     parsing_max_retries=3,
    ...     parsing_pattern=r'"result":\s*"([^"]*)"',
    ...     n=3
    ... )

    >>> # Tag-based parsing with retry
    >>> block = LLMChatWithParsingRetryBlock(
    ...     block_name="tag_retry_block",
    ...     input_cols="messages",
    ...     output_cols=["explanation", "answer"],
    ...     model="anthropic/claude-3-sonnet-20240229",
    ...     parsing_max_retries=5,
    ...     start_tags=["<explanation>", "<answer>"],
    ...     end_tags=["</explanation>", "</answer>"],
    ...     n=2
    ... )
    """

    model_config = ConfigDict(
        extra="allow"
    )  # Allow extra fields for dynamic forwarding

    # Composite-specific parameters only
    parsing_max_retries: int = Field(
        3, description="Maximum number of retry attempts for parsing failures"
    )

    # Store parameters for internal blocks
    llm_params: dict[str, Any] = Field(default_factory=dict, exclude=True)
    parser_params: dict[str, Any] = Field(default_factory=dict, exclude=True)

    # Internal blocks - excluded from serialization
    llm_chat: Optional[LLMChatBlock] = Field(None, exclude=True)
    text_parser: Optional[TextParserBlock] = Field(None, exclude=True)

    @field_validator("input_cols")
    @classmethod
    def validate_single_input_col(cls, v):
        """Ensure exactly one input column.

        A bare string is wrapped into a one-element list; a one-element list is
        returned unchanged.

        Raises
        ------
        ValueError
            If ``v`` is a list whose length is not 1, or is neither a string
            nor a list.
        """
        if isinstance(v, str):
            return [v]
        if isinstance(v, list):
            if len(v) == 1:
                return v
            raise ValueError(
                f"LLMChatWithParsingRetryBlock expects exactly one input column, got {len(v)}: {v}"
            )
        raise ValueError(f"Invalid input_cols format: {v}")

    @field_validator("parsing_max_retries")
    @classmethod
    def validate_parsing_max_retries(cls, v):
        """Ensure parsing_max_retries is positive.

        Raises
        ------
        ValueError
            If ``v`` is smaller than 1.
        """
        if v < 1:
            raise ValueError("parsing_max_retries must be at least 1")
        return v

    def __init__(self, **kwargs):
        """Initialize with dynamic parameter forwarding.

        Parameters
        ----------
        **kwargs : Any
            Block configuration. ``parsing_max_retries`` is consumed here
            (default 3); every other key is copied into ``llm_params`` and/or
            ``parser_params`` when it names a field of ``LLMChatBlock`` /
            ``TextParserBlock``, and passed to ``BaseBlock`` when it names one
            of its fields. Keys matching none of these are not forwarded.
        """
        # Extract and store composite-specific params before super().__init__
        parsing_max_retries = kwargs.pop("parsing_max_retries", 3)

        # Forward parameters to appropriate internal blocks
        llm_params = {k: v for k, v in kwargs.items() if k in LLMChatBlock.model_fields}
        parser_params = {
            k: v for k, v in kwargs.items() if k in TextParserBlock.model_fields
        }

        # Keep only BaseBlock fields for super().__init__
        base_params = {k: v for k, v in kwargs.items() if k in BaseBlock.model_fields}
        base_params["parsing_max_retries"] = parsing_max_retries
        base_params["llm_params"] = llm_params
        base_params["parser_params"] = parser_params

        # Initialize parent with all valid parameters
        super().__init__(**base_params)

        # Create internal blocks with forwarded parameters
        self._create_internal_blocks()

        # Log initialization only when model is configured
        model = self.llm_params.get("model")
        if model:
            logger.info(
                f"Initialized LLMChatWithParsingRetryBlock '{self.block_name}' with model '{model}'",
                extra={
                    "block_name": self.block_name,
                    "model": model,
                    "async_mode": self.llm_params.get("async_mode", False),
                    "parsing_max_retries": self.parsing_max_retries,
                },
            )

    def _create_internal_blocks(self) -> None:
        """Create and configure the internal blocks using dynamic parameter forwarding.

        The chat block writes to a private ``<block_name>_raw_response`` column
        which the parser block then reads, so the two are wired back to back.
        """
        raw_response_col = f"{self.block_name}_raw_response"

        # 1. LLMChatBlock
        llm_kwargs = {
            **self.llm_params,  # Forward all LLM parameters dynamically first
            "block_name": f"{self.block_name}_llm_chat",  # Override block_name
            "input_cols": self.input_cols,
            "output_cols": [raw_response_col],
        }
        self.llm_chat = LLMChatBlock(**llm_kwargs)

        # 2. TextParserBlock
        text_parser_kwargs = {
            **self.parser_params,  # Forward all parser parameters dynamically first
            "block_name": f"{self.block_name}_text_parser",  # Override block_name
            "input_cols": [raw_response_col],
            "output_cols": self.output_cols,
        }
        self.text_parser = TextParserBlock(**text_parser_kwargs)

    def _reinitialize_client_manager(self) -> None:
        """Reinitialize the internal LLM chat block's client manager.

        This should be called after model configuration changes to ensure
        the internal LLM chat block uses the updated model configuration.
        """
        if self.llm_chat and hasattr(self.llm_chat, "_reinitialize_client_manager"):
            # Update the internal LLM chat block's model config from stored params
            for key in ["model", "api_base", "api_key"]:
                if key in self.llm_params:
                    setattr(self.llm_chat, key, self.llm_params[key])
            # Reinitialize its client manager
            self.llm_chat._reinitialize_client_manager()

    def generate(self, samples: Dataset, **kwargs: Any) -> Dataset:
        """Generate responses with parsing retry logic.

        For each input sample, this method:
        1. Generates LLM responses using the configured n parameter
        2. Attempts to parse the responses using TextParserBlock
        3. Counts successful parses and retries if below target
        4. Accumulates results across retry attempts
        5. Returns final dataset with all successful parses

        Parameters
        ----------
        samples : Dataset
            Input dataset containing the messages column.
        **kwargs : Any
            Additional keyword arguments passed to internal blocks. An ``n``
            entry overrides the configured ``n`` as the per-sample target.

        Returns
        -------
        Dataset
            Dataset with parsed results from successful generations.

        Raises
        ------
        BlockValidationError
            If model is not configured before calling generate().
        MaxRetriesExceededError
            If target count not reached after max retries for any sample.
            When an attempt failed with an exception, the most recent one is
            attached as ``__cause__``.
        """
        # Validate that model is configured
        model = self.llm_params.get("model")
        if not model:
            raise BlockValidationError(
                f"Model not configured for block '{self.block_name}'. "
                "Call flow.set_model_config() before generating."
            )

        logger.info(
            f"Starting LLM generation with parsing retry for {len(samples)} samples",
            extra={
                "block_name": self.block_name,
                "model": model,
                "batch_size": len(samples),
                "parsing_max_retries": self.parsing_max_retries,
            },
        )

        # Target count per sample (number of completions requested). It depends
        # only on kwargs and the stored config, so compute it once.
        target = kwargs.get("n", self.llm_params.get("n")) or 1

        all_results = []

        # Process each sample independently with retry logic
        for sample_idx, sample in enumerate(samples):
            sample_results = []
            total_parsed_count = 0
            # Most recent exception seen for this sample, kept so a final
            # failure can report its underlying cause.
            last_error: Optional[Exception] = None

            logger.debug(
                f"Processing sample {sample_idx} with target count {target}",
                extra={
                    "block_name": self.block_name,
                    "sample_idx": sample_idx,
                    "target_count": target,
                },
            )

            # Retry loop for this sample
            for attempt in range(self.parsing_max_retries):
                if total_parsed_count >= target:
                    break  # Already reached target

                try:
                    # Generate LLM responses for this sample
                    temp_dataset = Dataset.from_list([sample])
                    llm_result = self.llm_chat.generate(temp_dataset, **kwargs)

                    # Parse the responses
                    parsed_result = self.text_parser.generate(llm_result, **kwargs)

                    # Count successful parses and accumulate results
                    new_parsed_count = len(parsed_result)
                    total_parsed_count += new_parsed_count
                    sample_results.extend(parsed_result)

                    logger.debug(
                        f"Attempt {attempt + 1} for sample {sample_idx}: {new_parsed_count} successful parses "
                        f"(total: {total_parsed_count}/{target})",
                        extra={
                            "block_name": self.block_name,
                            "sample_idx": sample_idx,
                            "attempt": attempt + 1,
                            "new_parses": new_parsed_count,
                            "total_parses": total_parsed_count,
                            "target_count": target,
                        },
                    )

                    if total_parsed_count >= target:
                        logger.debug(
                            f"Target reached for sample {sample_idx} after {attempt + 1} attempts",
                            extra={
                                "block_name": self.block_name,
                                "sample_idx": sample_idx,
                                "attempts": attempt + 1,
                                "final_count": total_parsed_count,
                            },
                        )
                        break

                except Exception as e:
                    # Best-effort retry: any failure in generation or parsing
                    # consumes one attempt instead of aborting the whole run.
                    last_error = e
                    logger.warning(
                        f"Error during attempt {attempt + 1} for sample {sample_idx}: {e}",
                        extra={
                            "block_name": self.block_name,
                            "sample_idx": sample_idx,
                            "attempt": attempt + 1,
                            "error": str(e),
                        },
                    )
                    # Continue to next attempt
                    continue

            # Check if we reached the target count
            if total_parsed_count < target:
                # Chain the last underlying error (if any) so the root cause
                # is visible in the traceback rather than only in the logs.
                raise MaxRetriesExceededError(
                    target_count=target,
                    actual_count=total_parsed_count,
                    max_retries=self.parsing_max_retries,
                ) from last_error

            # Trim results to exact target count if we exceeded it
            if total_parsed_count > target:
                sample_results = sample_results[:target]
                logger.debug(
                    f"Trimmed sample {sample_idx} results from {total_parsed_count} to {target}",
                    extra={
                        "block_name": self.block_name,
                        "sample_idx": sample_idx,
                        "trimmed_from": total_parsed_count,
                        "trimmed_to": target,
                    },
                )

            # Add this sample's results to final dataset
            all_results.extend(sample_results)

        logger.info(
            f"LLM generation with parsing retry completed: {len(samples)} input samples → {len(all_results)} output rows",
            extra={
                "block_name": self.block_name,
                "input_samples": len(samples),
                "output_rows": len(all_results),
                "model": model,
            },
        )

        return Dataset.from_list(all_results)

    def _validate_custom(self, dataset: Dataset) -> None:
        """Custom validation for LLMChatWithParsingRetryBlock.

        This method validates the entire chain of internal blocks by simulating
        the data flow through each block to ensure they can all process the data correctly.

        Raises
        ------
        ValueError
            If the input column is missing, no parsing method is configured,
            the internal blocks are not initialized, or either internal
            block's own validation fails.
        """
        # Validate that required input column exists
        if len(self.input_cols) != 1:
            raise ValueError(
                f"LLMChatWithParsingRetryBlock expects exactly one input column, got {len(self.input_cols)}"
            )

        input_col = self.input_cols[0]
        if input_col not in dataset.column_names:
            raise ValueError(
                f"Required input column '{input_col}' not found in dataset. "
                f"Available columns: {dataset.column_names}"
            )

        # Validate parsing configuration
        has_regex = self.parser_params.get("parsing_pattern") is not None
        has_tags = bool(self.parser_params.get("start_tags", [])) or bool(
            self.parser_params.get("end_tags", [])
        )

        if not has_regex and not has_tags:
            raise ValueError(
                "LLMChatWithParsingRetryBlock requires at least one parsing method: "
                "either 'parsing_pattern' (regex) or 'start_tags'/'end_tags' (tag-based parsing)"
            )

        # Validate that internal blocks are initialized
        if not all([self.llm_chat, self.text_parser]):
            raise ValueError(
                "All internal blocks must be initialized before validation"
            )

        # Validate internal blocks
        try:
            logger.debug("Validating internal LLM chat block")
            self.llm_chat._validate_custom(dataset)

            # Create temporary dataset with expected LLM output for parser validation
            temp_data = []
            for sample in dataset:
                temp_sample = dict(sample)
                temp_sample[f"{self.block_name}_raw_response"] = "test output"
                temp_data.append(temp_sample)
            temp_dataset = Dataset.from_list(temp_data)

            logger.debug("Validating internal text parser block")
            self.text_parser._validate_custom(temp_dataset)

            logger.debug("All internal blocks validated successfully")

        except Exception as e:
            logger.error(f"Validation failed in internal blocks: {e}")
            raise ValueError(f"Internal block validation failed: {e}") from e

    def get_internal_blocks_info(self) -> dict[str, Any]:
        """Get information about the internal blocks.

        Returns
        -------
        Dict[str, Any]
            Information about each internal block, or ``None`` for a block
            that has not been created.
        """
        return {
            "llm_chat": self.llm_chat.get_info() if self.llm_chat else None,
            "text_parser": self.text_parser.get_info() if self.text_parser else None,
        }

    def __repr__(self) -> str:
        """String representation of the block."""
        model = self.llm_params.get("model", "not_configured")
        return (
            f"LLMChatWithParsingRetryBlock(name='{self.block_name}', "
            f"model='{model}', parsing_max_retries={self.parsing_max_retries})"
        )
|
@@ -48,6 +48,9 @@ class TextParserBlock(BaseBlock):
|
|
48
48
|
Regex pattern for custom parsing.
|
49
49
|
parser_cleanup_tags : Optional[List[str]]
|
50
50
|
List of tags to clean from parsed output.
|
51
|
+
expand_lists : bool
|
52
|
+
Whether to expand list inputs into individual rows (True) or preserve lists (False).
|
53
|
+
Default is True for backward compatibility.
|
51
54
|
"""
|
52
55
|
|
53
56
|
start_tags: list[str] = Field(
|
@@ -62,6 +65,10 @@ class TextParserBlock(BaseBlock):
|
|
62
65
|
parser_cleanup_tags: Optional[list[str]] = Field(
|
63
66
|
default=None, description="List of tags to clean from parsed output"
|
64
67
|
)
|
68
|
+
expand_lists: bool = Field(
|
69
|
+
default=True,
|
70
|
+
description="Whether to expand list inputs into individual rows (True) or preserve lists (False). ",
|
71
|
+
)
|
65
72
|
|
66
73
|
@field_validator("start_tags", "end_tags", mode="before")
|
67
74
|
@classmethod
|
@@ -237,36 +244,76 @@ class TextParserBlock(BaseBlock):
|
|
237
244
|
logger.warning(f"Input column '{input_column}' contains empty list")
|
238
245
|
return []
|
239
246
|
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
247
|
+
if not self.expand_lists:
|
248
|
+
# When expand_lists=False, preserve the list structure
|
249
|
+
# Parse each response in the list and collect results as lists
|
250
|
+
all_parsed_outputs = {col: [] for col in self.output_cols}
|
251
|
+
valid_responses = 0
|
252
|
+
|
253
|
+
for i, response in enumerate(raw_output):
|
254
|
+
if not response or not isinstance(response, str):
|
255
|
+
logger.warning(
|
256
|
+
f"List item {i} in column '{input_column}' contains invalid data "
|
257
|
+
f"(empty or non-string): {type(response)}"
|
258
|
+
)
|
259
|
+
continue
|
260
|
+
|
261
|
+
parsed_outputs = self._parse(response)
|
262
|
+
|
263
|
+
if not parsed_outputs or not any(
|
264
|
+
len(value) > 0 for value in parsed_outputs.values()
|
265
|
+
):
|
266
|
+
logger.warning(
|
267
|
+
f"Failed to parse content from list item {i}. Raw output length: {len(response)}, "
|
268
|
+
f"parsing method: {'regex' if self.parsing_pattern else 'tags'}"
|
269
|
+
)
|
270
|
+
continue
|
271
|
+
|
272
|
+
valid_responses += 1
|
273
|
+
# Collect all parsed values for each column as lists
|
274
|
+
for col in self.output_cols:
|
275
|
+
all_parsed_outputs[col].extend(parsed_outputs.get(col, []))
|
276
|
+
|
277
|
+
if valid_responses == 0:
|
278
|
+
return []
|
279
|
+
|
280
|
+
# Return single row with lists as values
|
281
|
+
# TODO: This breaks retry counting in LLMChatWithParsingRetryBlock until LLMChatWithParsingRetryBlock is re-based
|
282
|
+
# which expects one row per successful parse for counting
|
283
|
+
return [{**sample, **all_parsed_outputs}]
|
284
|
+
|
285
|
+
else:
|
286
|
+
# When expand_lists=True, use existing expanding behavior
|
287
|
+
all_results = []
|
288
|
+
for i, response in enumerate(raw_output):
|
289
|
+
if not response or not isinstance(response, str):
|
290
|
+
logger.warning(
|
291
|
+
f"List item {i} in column '{input_column}' contains invalid data "
|
292
|
+
f"(empty or non-string): {type(response)}"
|
293
|
+
)
|
294
|
+
continue
|
295
|
+
|
296
|
+
parsed_outputs = self._parse(response)
|
297
|
+
|
298
|
+
if not parsed_outputs or not any(
|
299
|
+
len(value) > 0 for value in parsed_outputs.values()
|
300
|
+
):
|
301
|
+
logger.warning(
|
302
|
+
f"Failed to parse content from list item {i}. Raw output length: {len(response)}, "
|
303
|
+
f"parsing method: {'regex' if self.parsing_pattern else 'tags'}"
|
304
|
+
)
|
305
|
+
continue
|
306
|
+
|
307
|
+
# Create output rows for this response
|
308
|
+
max_length = max(len(value) for value in parsed_outputs.values())
|
309
|
+
for values in zip(
|
310
|
+
*(lst[:max_length] for lst in parsed_outputs.values())
|
311
|
+
):
|
312
|
+
all_results.append(
|
313
|
+
{**sample, **dict(zip(parsed_outputs.keys(), values))}
|
314
|
+
)
|
315
|
+
|
316
|
+
return all_results
|
270
317
|
|
271
318
|
# Handle string inputs (existing logic)
|
272
319
|
elif isinstance(raw_output, str):
|
sdg_hub/core/blocks/registry.py
CHANGED
@@ -291,7 +291,7 @@ class BlockRegistry:
|
|
291
291
|
}
|
292
292
|
|
293
293
|
@classmethod
|
294
|
-
def
|
294
|
+
def discover_blocks(cls) -> None:
|
295
295
|
"""Print a Rich-formatted table of all available blocks."""
|
296
296
|
if not cls._metadata:
|
297
297
|
console.print("[yellow]No blocks registered yet.[/yellow]")
|