sdg-hub 0.5.1__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sdg_hub/_version.py +2 -2
- sdg_hub/core/blocks/base.py +60 -58
- sdg_hub/core/blocks/filtering/column_value_filter.py +29 -16
- sdg_hub/core/blocks/llm/__init__.py +0 -2
- sdg_hub/core/blocks/llm/llm_chat_block.py +42 -36
- sdg_hub/core/blocks/llm/llm_parser_block.py +13 -59
- sdg_hub/core/blocks/llm/prompt_builder_block.py +15 -10
- sdg_hub/core/blocks/llm/text_parser_block.py +14 -61
- sdg_hub/core/blocks/transform/duplicate_columns.py +9 -8
- sdg_hub/core/blocks/transform/index_based_mapper.py +29 -15
- sdg_hub/core/blocks/transform/json_structure_block.py +16 -13
- sdg_hub/core/blocks/transform/melt_columns.py +13 -12
- sdg_hub/core/blocks/transform/rename_columns.py +20 -9
- sdg_hub/core/blocks/transform/text_concat.py +20 -21
- sdg_hub/core/blocks/transform/uniform_col_val_setter.py +6 -5
- sdg_hub/core/flow/base.py +139 -106
- sdg_hub/core/flow/checkpointer.py +34 -36
- sdg_hub/core/flow/validation.py +4 -4
- sdg_hub/core/utils/datautils.py +52 -54
- sdg_hub/core/utils/flow_metrics.py +9 -6
- {sdg_hub-0.5.1.dist-info → sdg_hub-0.6.0.dist-info}/METADATA +2 -8
- {sdg_hub-0.5.1.dist-info → sdg_hub-0.6.0.dist-info}/RECORD +25 -27
- sdg_hub/core/blocks/llm/llm_chat_with_parsing_retry_block.py +0 -771
- sdg_hub/core/utils/temp_manager.py +0 -57
- {sdg_hub-0.5.1.dist-info → sdg_hub-0.6.0.dist-info}/WHEEL +0 -0
- {sdg_hub-0.5.1.dist-info → sdg_hub-0.6.0.dist-info}/licenses/LICENSE +0 -0
- {sdg_hub-0.5.1.dist-info → sdg_hub-0.6.0.dist-info}/top_level.txt +0 -0
|
@@ -1,771 +0,0 @@
|
|
|
1
|
-
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
-
"""Composite block combining LLM chat and text parsing with retry logic.
|
|
3
|
-
|
|
4
|
-
This module provides the LLMChatWithParsingRetryBlock that encapsulates the complete
|
|
5
|
-
LLM generation and parsing workflow with automatic retry on parsing failures.
|
|
6
|
-
"""
|
|
7
|
-
|
|
8
|
-
# Standard
|
|
9
|
-
from typing import Any, Optional
|
|
10
|
-
|
|
11
|
-
# Third Party
|
|
12
|
-
from datasets import Dataset
|
|
13
|
-
from pydantic import ConfigDict, Field, field_validator
|
|
14
|
-
|
|
15
|
-
# Local
|
|
16
|
-
from ...utils.error_handling import BlockValidationError
|
|
17
|
-
from ...utils.logger_config import setup_logger
|
|
18
|
-
from ..base import BaseBlock
|
|
19
|
-
from ..registry import BlockRegistry
|
|
20
|
-
from .llm_chat_block import LLMChatBlock
|
|
21
|
-
from .llm_parser_block import LLMParserBlock
|
|
22
|
-
from .text_parser_block import TextParserBlock
|
|
23
|
-
|
|
24
|
-
logger = setup_logger(__name__)
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
class MaxRetriesExceededError(Exception):
    """Signal that the retry budget ran out before enough parses succeeded.

    Attributes
    ----------
    target_count : int
        Number of successful parses that was requested.
    actual_count : int
        Number of successful parses that were actually obtained.
    max_retries : int
        Size of the retry budget that was used up.
    """

    def __init__(self, target_count: int, actual_count: int, max_retries: int):
        # Compose the human-readable message first so it becomes args[0].
        message = (
            f"Failed to achieve target count {target_count} after {max_retries} retries. "
            f"Only got {actual_count} successful parses."
        )
        super().__init__(message)

        # Keep the raw numbers available for programmatic inspection.
        self.target_count = target_count
        self.actual_count = actual_count
        self.max_retries = max_retries
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
@BlockRegistry.register(
|
|
41
|
-
"LLMChatWithParsingRetryBlock",
|
|
42
|
-
"llm",
|
|
43
|
-
"Composite block combining LLM chat and text parsing with automatic retry on parsing failures",
|
|
44
|
-
)
|
|
45
|
-
class LLMChatWithParsingRetryBlock(BaseBlock):
|
|
46
|
-
"""Composite block for LLM generation with parsing retry logic.
|
|
47
|
-
|
|
48
|
-
This block combines LLMChatBlock and TextParserBlock into a single cohesive block
|
|
49
|
-
that automatically retries LLM generation when parsing fails, accumulating successful
|
|
50
|
-
results until the target count is reached or max retries exceeded.
|
|
51
|
-
|
|
52
|
-
Parameters
|
|
53
|
-
----------
|
|
54
|
-
block_name : str
|
|
55
|
-
Name of the block.
|
|
56
|
-
input_cols : Union[str, List[str]]
|
|
57
|
-
Input column name(s). Should contain the messages list.
|
|
58
|
-
output_cols : Union[str, List[str]]
|
|
59
|
-
Output column name(s) for parsed results.
|
|
60
|
-
model : str
|
|
61
|
-
Model identifier in LiteLLM format.
|
|
62
|
-
api_base : Optional[str]
|
|
63
|
-
Base URL for the API. Required for local models.
|
|
64
|
-
api_key : Optional[str]
|
|
65
|
-
API key for the provider. Falls back to environment variables.
|
|
66
|
-
parsing_max_retries : int, optional
|
|
67
|
-
Maximum number of retry attempts for parsing failures (default: 3).
|
|
68
|
-
This is different from max_retries, which handles LLM network/API failures.
|
|
69
|
-
|
|
70
|
-
**llm_kwargs : Any
|
|
71
|
-
Any LiteLLM completion parameters (model, api_base, api_key, temperature,
|
|
72
|
-
max_tokens, top_p, frequency_penalty, presence_penalty, stop, seed,
|
|
73
|
-
response_format, stream, n, logprobs, top_logprobs, user, extra_headers,
|
|
74
|
-
extra_body, async_mode, timeout, num_retries, etc.).
|
|
75
|
-
See https://docs.litellm.ai/docs/completion/input for full list.
|
|
76
|
-
|
|
77
|
-
### Text Parser Parameters ###
|
|
78
|
-
start_tags : List[str], optional
|
|
79
|
-
List of start tags for tag-based parsing.
|
|
80
|
-
end_tags : List[str], optional
|
|
81
|
-
List of end tags for tag-based parsing.
|
|
82
|
-
parsing_pattern : Optional[str], optional
|
|
83
|
-
Regex pattern for custom parsing.
|
|
84
|
-
parser_cleanup_tags : Optional[List[str]], optional
|
|
85
|
-
List of tags to clean from parsed output.
|
|
86
|
-
|
|
87
|
-
### LLMParserBlock Parameters ###
|
|
88
|
-
extract_content : bool, optional
|
|
89
|
-
Whether to extract 'content' field from responses.
|
|
90
|
-
extract_reasoning_content : bool, optional
|
|
91
|
-
Whether to extract 'reasoning_content' field from responses.
|
|
92
|
-
extract_tool_calls : bool, optional
|
|
93
|
-
Whether to extract 'tool_calls' field from responses.
|
|
94
|
-
expand_lists : bool, optional
|
|
95
|
-
Whether to expand list inputs into individual rows (True) or preserve lists (False).
|
|
96
|
-
field_prefix : Optional[str], optional
|
|
97
|
-
Prefix for the field names in the parsed output.
|
|
98
|
-
|
|
99
|
-
Examples
|
|
100
|
-
--------
|
|
101
|
-
>>> # Basic JSON parsing with retry
|
|
102
|
-
>>> block = LLMChatWithParsingRetryBlock(
|
|
103
|
-
... block_name="json_retry_block",
|
|
104
|
-
... input_cols="messages",
|
|
105
|
-
... output_cols="parsed_json",
|
|
106
|
-
... model="openai/gpt-4",
|
|
107
|
-
... parsing_max_retries=3,
|
|
108
|
-
... parsing_pattern=r'"result":\s*"([^"]*)"',
|
|
109
|
-
... n=3
|
|
110
|
-
... )
|
|
111
|
-
|
|
112
|
-
>>> # Tag-based parsing with retry
|
|
113
|
-
>>> block = LLMChatWithParsingRetryBlock(
|
|
114
|
-
... block_name="tag_retry_block",
|
|
115
|
-
... input_cols="messages",
|
|
116
|
-
... output_cols=["explanation", "answer"],
|
|
117
|
-
... model="anthropic/claude-3-sonnet-20240229",
|
|
118
|
-
... parsing_max_retries=5,
|
|
119
|
-
... start_tags=["<explanation>", "<answer>"],
|
|
120
|
-
... end_tags=["</explanation>", "</answer>"],
|
|
121
|
-
... n=2
|
|
122
|
-
... )
|
|
123
|
-
"""
|
|
124
|
-
|
|
125
|
-
model_config = ConfigDict(
|
|
126
|
-
extra="allow"
|
|
127
|
-
) # Allow extra fields for dynamic forwarding
|
|
128
|
-
|
|
129
|
-
# --- Composite-specific configuration ---
|
|
130
|
-
parsing_max_retries: int = Field(
|
|
131
|
-
3, description="Maximum number of retry attempts for parsing failures"
|
|
132
|
-
)
|
|
133
|
-
|
|
134
|
-
# --- Parser configuration (required for internal TextParserBlock) ---
|
|
135
|
-
start_tags: Optional[list[str]] = Field(
|
|
136
|
-
None, description="Start tags for tag-based parsing"
|
|
137
|
-
)
|
|
138
|
-
end_tags: Optional[list[str]] = Field(
|
|
139
|
-
None, description="End tags for tag-based parsing"
|
|
140
|
-
)
|
|
141
|
-
parsing_pattern: Optional[str] = Field(
|
|
142
|
-
None, description="Regex pattern for custom parsing"
|
|
143
|
-
)
|
|
144
|
-
parser_cleanup_tags: Optional[list[str]] = Field(
|
|
145
|
-
None, description="List of tags to clean from parsed output"
|
|
146
|
-
)
|
|
147
|
-
|
|
148
|
-
### LLMParserBlock Parameters ###
|
|
149
|
-
extract_content: bool = Field(
|
|
150
|
-
default=True, description="Whether to extract 'content' field from responses."
|
|
151
|
-
)
|
|
152
|
-
extract_reasoning_content: bool = Field(
|
|
153
|
-
default=False,
|
|
154
|
-
description="Whether to extract 'reasoning_content' field from responses.",
|
|
155
|
-
)
|
|
156
|
-
extract_tool_calls: bool = Field(
|
|
157
|
-
default=False,
|
|
158
|
-
description="Whether to extract 'tool_calls' field from responses.",
|
|
159
|
-
)
|
|
160
|
-
expand_lists: bool = Field(
|
|
161
|
-
default=True,
|
|
162
|
-
description="Whether to expand list inputs into individual rows (True) or preserve lists (False).",
|
|
163
|
-
)
|
|
164
|
-
field_prefix: Optional[str] = Field(
|
|
165
|
-
default="", description="Prefix for the field names in the parsed output."
|
|
166
|
-
)
|
|
167
|
-
|
|
168
|
-
# Internal blocks - excluded from serialization
|
|
169
|
-
llm_chat: Optional[LLMChatBlock] = Field(None, exclude=True)
|
|
170
|
-
text_parser: Optional[TextParserBlock] = Field(None, exclude=True)
|
|
171
|
-
llm_parser: Optional[LLMParserBlock] = Field(None, exclude=True)
|
|
172
|
-
|
|
173
|
-
@field_validator("input_cols")
@classmethod
def validate_single_input_col(cls, v):
    """Normalise ``input_cols`` to a one-element list.

    Parameters
    ----------
    v : Any
        Candidate value for ``input_cols``.

    Returns
    -------
    list
        ``[v]`` when ``v`` is a string, or ``v`` itself when it is already a
        list holding exactly one entry.

    Raises
    ------
    ValueError
        If ``v`` is a list whose length is not 1, or is neither a string
        nor a list.
    """
    # A bare column name is wrapped so downstream code always sees a list.
    if isinstance(v, str):
        return [v]

    # Anything that is not a list at this point has an unsupported shape.
    if not isinstance(v, list):
        raise ValueError(f"Invalid input_cols format: {v}")

    if len(v) == 1:
        return v

    raise ValueError(
        f"LLMChatWithParsingRetryBlock expects exactly one input column, got {len(v)}: {v}"
    )
|
|
186
|
-
|
|
187
|
-
@field_validator("parsing_max_retries")
@classmethod
def validate_parsing_max_retries(cls, v):
    """Reject retry budgets smaller than one attempt.

    Parameters
    ----------
    v : int
        Candidate value for ``parsing_max_retries``.

    Returns
    -------
    int
        ``v`` unchanged when it is at least 1.

    Raises
    ------
    ValueError
        If ``v`` is below 1.
    """
    below_minimum = v < 1
    if below_minimum:
        raise ValueError("parsing_max_retries must be at least 1")
    return v
|
|
194
|
-
|
|
195
|
-
def __init__(self, **kwargs):
    """Validate the wrapper's own fields, then build its internal blocks.

    Parameters
    ----------
    **kwargs : Any
        Passed unchanged to the parent initialiser and then to
        ``_create_internal_blocks``, which routes them to the internal
        blocks.
    """
    super().__init__(**kwargs)
    self._create_internal_blocks(**kwargs)

    # Nothing to announce until a model has been configured.
    if not (self.llm_chat and self.llm_chat.model):
        return

    log_context = {
        "block_name": self.block_name,
        "model": self.llm_chat.model,
        "parsing_max_retries": self.parsing_max_retries,
    }
    logger.info(
        f"Initialized LLMChatWithParsingRetryBlock '{self.block_name}' with model '{self.llm_chat.model}'",
        extra=log_context,
    )
|
|
210
|
-
|
|
211
|
-
def _extract_params(self, kwargs: dict, block_class) -> dict:
    """Select the constructor arguments destined for one internal block.

    Parameters
    ----------
    kwargs : dict
        Keyword arguments originally given to this wrapper.
    block_class : type
        Class of the internal block being configured. ``LLMChatBlock`` gets
        the permissive routing; every other class gets the parser routing.

    Returns
    -------
    dict
        Parameters to splat into ``block_class(...)``.
    """
    # Names owned by the wrapper itself; they are never forwarded.
    wrapper_only = {
        "block_name",
        "input_cols",
        "output_cols",
        "parsing_max_retries",
    }
    text_parser_keys = {
        "start_tags",
        "end_tags",
        "parsing_pattern",
        "parser_cleanup_tags",
    }
    llm_parser_keys = {
        "extract_content",
        "extract_reasoning_content",
        "extract_tool_calls",
        "expand_lists",
        "field_prefix",
    }

    if block_class == LLMChatBlock:
        # The chat block receives everything that is not wrapper- or
        # parser-specific.
        withheld = wrapper_only | text_parser_keys | llm_parser_keys
        internal_blocks = ["llm_chat", "text_parser", "llm_parser"]

        params = {key: val for key, val in kwargs.items() if key not in withheld}

        # Instance attributes are layered on top of the raw kwargs, skipping
        # private names, the internal blocks themselves and unset values.
        for attr, current in self.__dict__.items():
            if attr in withheld or attr.startswith("_"):
                continue
            if attr in internal_blocks or current is None:
                continue
            params[attr] = current
        return params

    # Parser routing: from kwargs keep only what the target class declares
    # as a model field.
    params = {
        key: val
        for key, val in kwargs.items()
        if key in block_class.model_fields and key not in wrapper_only
    }

    # All nine parser attributes are forwarded, whichever parser class was
    # asked for. NOTE(review): presumably both parser blocks tolerate
    # fields meant for the other one - confirm against their model config.
    for attr in text_parser_keys | llm_parser_keys:
        current = getattr(self, attr, None)
        if current is not None:
            params[attr] = current

    return params
|
|
283
|
-
|
|
284
|
-
def _create_internal_blocks(self, **kwargs):
    """Instantiate the chat, response-parser and text-parser blocks.

    The blocks are wired together by column name: the chat block writes
    ``<block_name>_raw_response``, the LLM parser reads that column, and the
    text parser reads ``<prefix>_content`` (``<prefix>`` being the LLM
    parser's ``field_prefix``, or its block name when the prefix is the
    empty string) and writes this block's ``output_cols``.

    Parameters
    ----------
    **kwargs : Any
        Keyword arguments originally given to the wrapper; they are split
        per block by ``_extract_params``.
    """
    # Resolve all three parameter sets before any internal block exists.
    chat_kwargs = self._extract_params(kwargs, LLMChatBlock)
    text_parser_kwargs = self._extract_params(kwargs, TextParserBlock)
    response_parser_kwargs = self._extract_params(kwargs, LLMParserBlock)

    raw_column = f"{self.block_name}_raw_response"

    # 1. LLMChatBlock
    self.llm_chat = LLMChatBlock(
        block_name=f"{self.block_name}_llm_chat",
        input_cols=self.input_cols,
        output_cols=[raw_column],
        **chat_kwargs,
    )

    # 2. LLMParserBlock
    self.llm_parser = LLMParserBlock(
        block_name=f"{self.block_name}_llm_parser",
        input_cols=[raw_column],
        **response_parser_kwargs,
    )

    # 3. TextParserBlock - its input column name depends on the LLM parser
    # that was just built.
    content_column = f"{self.llm_parser.field_prefix if self.llm_parser.field_prefix!='' else self.llm_parser.block_name}_content"
    self.text_parser = TextParserBlock(
        block_name=f"{self.block_name}_text_parser",
        input_cols=[content_column],
        output_cols=self.output_cols,
        **text_parser_kwargs,
    )
|
|
315
|
-
|
|
316
|
-
def __getattr__(self, name: str) -> Any:
    """Resolve a missing attribute through the matching internal block.

    Python calls this hook only after normal attribute lookup has failed.
    The internal blocks are read straight from the instance dictionary
    rather than with ``hasattr(self, ...)``: on an instance whose blocks
    are not stored yet, ``hasattr`` would land back in this method and
    recurse without bound.

    Parameters
    ----------
    name : str
        Attribute that normal lookup could not find.

    Returns
    -------
    Any
        The value from the text parser or LLM parser for their respective
        parameter names; otherwise the chat block's value, or ``None`` when
        the chat block does not have the attribute either.

    Raises
    ------
    AttributeError
        If no internal block is available to answer the lookup.
    """
    text_parser_fields = {
        "start_tags",
        "end_tags",
        "parsing_pattern",
        "parser_cleanup_tags",
    }
    llm_parser_fields = {
        "extract_content",
        "extract_reasoning_content",
        "extract_tool_calls",
        "expand_lists",
        "field_prefix",
    }

    # Direct dict access never re-triggers __getattr__.
    state = self.__dict__
    text_parser = state.get("text_parser")
    llm_parser = state.get("llm_parser")
    llm_chat = state.get("llm_chat")

    if name in text_parser_fields and text_parser:
        return getattr(text_parser, name)

    if name in llm_parser_fields and llm_parser:
        return getattr(llm_parser, name)

    # Everything else is answered by the chat block. The None default is
    # deliberate: unset LLM parameters read as None instead of raising.
    if llm_chat:
        return getattr(llm_chat, name, None)

    raise AttributeError(
        f"'{self.__class__.__name__}' object has no attribute '{name}'"
    )
|
|
352
|
-
|
|
353
|
-
def __setattr__(self, name: str, value: Any) -> None:
    """Store an attribute and mirror it onto the internal block that owns it.

    Each forwarded name goes to exactly one internal block. Text-parser
    parameters go only to ``text_parser``; they used to be written onto
    ``llm_chat`` as well, because the first ``if`` was not chained to the
    ``if/elif`` after it.

    Parameters
    ----------
    name : str
        Attribute being assigned.
    value : Any
        New value.
    """
    super().__setattr__(name, value)

    # Read internal blocks from the instance dict so this method does not
    # depend on __getattr__ for attributes that are not stored yet.
    state = self.__dict__

    # Do not forward before the block fields exist, nor for private names.
    if "llm_chat" not in state or name.startswith("_"):
        return

    text_parser_fields = {
        "start_tags",
        "end_tags",
        "parsing_pattern",
        "parser_cleanup_tags",
    }
    llm_parser_fields = {
        "extract_content",
        "extract_reasoning_content",
        "extract_tool_calls",
        "expand_lists",
        "field_prefix",
    }
    # Names that belong to the wrapper and must never reach the chat block.
    wrapper_only = {
        "block_name",
        "input_cols",
        "output_cols",
        "parsing_max_retries",
        "llm_chat",
        "llm_parser",
        "text_parser",
    }

    if name in text_parser_fields:
        owner = state.get("text_parser")
    elif name in llm_parser_fields:
        owner = state.get("llm_parser")
    elif name in wrapper_only:
        return
    else:
        # LLM-related parameters (model, temperature, n, ...).
        owner = state.get("llm_chat")

    # The owning block may still be None while the blocks are being built.
    if owner:
        setattr(owner, name, value)
|
|
402
|
-
|
|
403
|
-
def generate(self, samples: Dataset, **kwargs: Any) -> Dataset:
    """Generate responses with parsing retry logic.

    Each input sample is processed on its own. The LLM is called, its
    output is parsed, and the call is repeated until ``n`` parses have
    succeeded or ``parsing_max_retries`` attempts have been made.

    Parameters
    ----------
    samples : Dataset
        Input dataset containing the messages column.
    **kwargs : Any
        Additional keyword arguments passed to the internal blocks. ``n``,
        when present, sets the number of successful parses required per
        sample; otherwise the block's own ``n`` is used, falling back to 1.

    Returns
    -------
    Dataset
        With ``expand_lists`` enabled: up to ``n`` parsed rows per sample.
        Otherwise: one row per sample whose output columns hold lists of
        ``n`` parsed values.

    Raises
    ------
    BlockValidationError
        If model is not configured before calling generate().
    MaxRetriesExceededError
        If the target count is not reached for any sample. The last error
        swallowed during the retries, if any, is attached as ``__cause__``.
    """
    # Validate that model is configured (check internal LLM block).
    if not self.llm_chat or not self.llm_chat.model:
        raise BlockValidationError(
            f"Model not configured for block '{self.block_name}'. "
            f"Call flow.set_model_config() before generating."
        )

    logger.info(
        f"Starting LLM generation with parsing retry for {len(samples)} samples",
        extra={
            "block_name": self.block_name,
            "model": self.llm_chat.model,
            "batch_size": len(samples),
            "parsing_max_retries": self.parsing_max_retries,
        },
    )

    # The requested completion count does not vary between samples, so it
    # is resolved once for the whole call.
    target = kwargs.get("n", getattr(self, "n", None)) or 1

    all_results = []
    for sample_idx, sample in enumerate(samples):
        logger.debug(
            f"Processing sample {sample_idx} with target count {target}",
            extra={
                "block_name": self.block_name,
                "sample_idx": sample_idx,
                "target_count": target,
            },
        )

        expand = self.llm_parser.expand_lists
        collect = (
            self._collect_expanded_rows if expand else self._collect_accumulated_row
        )
        sample_results, parsed_count, last_error = collect(
            sample, sample_idx, target, kwargs
        )

        if parsed_count < target:
            # Chain the last swallowed failure so the root cause is visible.
            raise MaxRetriesExceededError(
                target_count=target,
                actual_count=parsed_count,
                max_retries=self.parsing_max_retries,
            ) from last_error

        # A single attempt can overshoot in expanded mode; keep exactly
        # ``target`` rows.
        if expand and parsed_count > target:
            sample_results = sample_results[:target]
            logger.debug(
                f"Trimmed sample {sample_idx} results from {parsed_count} to {target}",
                extra={
                    "block_name": self.block_name,
                    "sample_idx": sample_idx,
                    "trimmed_from": parsed_count,
                    "trimmed_to": target,
                },
            )

        all_results.extend(sample_results)

    logger.info(
        f"LLM generation with parsing retry completed: {len(samples)} input samples → {len(all_results)} output rows",
        extra={
            "block_name": self.block_name,
            "input_samples": len(samples),
            "output_rows": len(all_results),
            "model": self.llm_chat.model,
        },
    )

    return Dataset.from_list(all_results)


def _log_attempt(
    self, sample_idx: int, attempt: int, new_count: int, total_count: int, target: int
) -> None:
    """Log the outcome of one retry attempt, and target completion if reached."""
    logger.debug(
        f"Attempt {attempt + 1} for sample {sample_idx}: {new_count} successful parses "
        f"(total: {total_count}/{target})",
        extra={
            "block_name": self.block_name,
            "sample_idx": sample_idx,
            "attempt": attempt + 1,
            "new_parses": new_count,
            "total_parses": total_count,
            "target_count": target,
        },
    )
    if total_count >= target:
        logger.debug(
            f"Target reached for sample {sample_idx} after {attempt + 1} attempts",
            extra={
                "block_name": self.block_name,
                "sample_idx": sample_idx,
                "attempts": attempt + 1,
                "final_count": total_count,
            },
        )


def _log_attempt_error(self, sample_idx: int, attempt: int, error: Exception) -> None:
    """Log a failed retry attempt at warning level."""
    logger.warning(
        f"Error during attempt {attempt + 1} for sample {sample_idx}: {error}",
        extra={
            "block_name": self.block_name,
            "sample_idx": sample_idx,
            "attempt": attempt + 1,
            "error": str(error),
        },
    )


def _collect_expanded_rows(
    self, sample: dict, sample_idx: int, target: int, kwargs: dict
) -> tuple:
    """Retry one sample in ``expand_lists=True`` mode, counting parsed rows.

    Returns
    -------
    tuple
        ``(rows, parsed_count, last_error)``. ``rows`` is untrimmed and may
        hold more than ``target`` entries; ``last_error`` is the most recent
        swallowed exception or ``None``.
    """
    rows = []
    parsed_count = 0
    last_error = None

    for attempt in range(self.parsing_max_retries):
        if parsed_count >= target:
            break

        # Any failure in the chain counts as a failed attempt and is
        # retried, hence the deliberately broad except.
        try:
            llm_result = self.llm_chat.generate(Dataset.from_list([sample]), **kwargs)
            llm_parser_result = self.llm_parser.generate(llm_result, **kwargs)
            parsed_result = self.text_parser.generate(llm_parser_result, **kwargs)

            new_count = len(parsed_result)
            parsed_count += new_count
            rows.extend(parsed_result)
            self._log_attempt(sample_idx, attempt, new_count, parsed_count, target)
        except Exception as e:
            last_error = e
            self._log_attempt_error(sample_idx, attempt, e)

    return rows, parsed_count, last_error


def _collect_accumulated_row(
    self, sample: dict, sample_idx: int, target: int, kwargs: dict
) -> tuple:
    """Retry one sample in ``expand_lists=False`` mode, accumulating lists.

    Every raw response is parsed on its own. A parsed row counts only when
    it contains all of ``output_cols``. Accumulation stops at ``target``, so
    the lists never exceed it.

    Returns
    -------
    tuple
        ``(rows, parsed_count, last_error)``. ``rows`` holds one merged row,
        or is empty when nothing parsed; ``last_error`` is the most recent
        swallowed exception or ``None``.
    """
    parser = self.llm_parser
    prefix = parser.field_prefix if parser.field_prefix != "" else parser.block_name
    content_col = f"{prefix}_content"

    accumulated = {col: [] for col in self.output_cols}
    parsed_count = 0
    last_error = None

    for attempt in range(self.parsing_max_retries):
        if parsed_count >= target:
            break

        # Broad except on purpose: any failure means "retry this attempt".
        try:
            llm_result = self.llm_chat.generate(Dataset.from_list([sample]), **kwargs)
            llm_parser_result = self.llm_parser.generate(llm_result, **kwargs)

            # A single response may arrive as a bare value; normalise it.
            raw_responses = llm_parser_result[0][content_col]
            if not isinstance(raw_responses, list):
                raw_responses = [raw_responses]

            new_count = 0
            for response in raw_responses:
                if parsed_count >= target:
                    break

                single = Dataset.from_list([{**sample, content_col: response}])
                try:
                    parsed_result = self.text_parser.generate(single, **kwargs)
                except Exception as parse_e:
                    last_error = parse_e
                    logger.debug(f"Failed to parse individual response: {parse_e}")
                    continue

                for parsed_row in parsed_result:
                    if parsed_count >= target:
                        break
                    # Rows missing any output column are skipped entirely.
                    if all(col in parsed_row for col in self.output_cols):
                        for col in self.output_cols:
                            accumulated[col].append(parsed_row[col])
                        parsed_count += 1
                        new_count += 1

            self._log_attempt(sample_idx, attempt, new_count, parsed_count, target)
        except Exception as e:
            last_error = e
            self._log_attempt_error(sample_idx, attempt, e)

    # Parsed output columns become lists; all other columns are kept as-is.
    rows = [{**sample, **accumulated}] if parsed_count > 0 else []
    return rows, parsed_count, last_error
|
|
687
|
-
|
|
688
|
-
def _validate_custom(self, dataset: Dataset) -> None:
    """Validate this block's configuration and its internal blocks.

    The checks run in order: exactly one input column, that column is
    present in ``dataset``, at least one parsing method is configured, all
    three internal blocks exist, and finally the chat block and the text
    parser run their own ``_validate_custom``.

    Parameters
    ----------
    dataset : Dataset
        Dataset about to be processed.

    Raises
    ------
    ValueError
        If any check fails. Errors raised by an internal block's validation
        are re-raised as ``ValueError`` with the original as ``__cause__``.
    """
    if len(self.input_cols) != 1:
        raise ValueError(
            f"LLMChatWithParsingRetryBlock expects exactly one input column, got {len(self.input_cols)}"
        )

    input_col = self.input_cols[0]
    if input_col not in dataset.column_names:
        raise ValueError(
            f"Required input column '{input_col}' not found in dataset. "
            f"Available columns: {dataset.column_names}"
        )

    # A regex pattern or a set of tags is needed to parse anything.
    has_regex = getattr(self, "parsing_pattern", None) is not None
    has_tags = bool(getattr(self, "start_tags", [])) or bool(
        getattr(self, "end_tags", [])
    )
    if not has_regex and not has_tags:
        raise ValueError(
            "LLMChatWithParsingRetryBlock requires at least one parsing method: "
            "either 'parsing_pattern' (regex) or 'start_tags'/'end_tags' (tag-based parsing)"
        )

    # generate() dereferences all three internal blocks, so llm_parser is
    # checked here alongside the other two.
    if not all([self.llm_chat, self.llm_parser, self.text_parser]):
        raise ValueError(
            "All internal blocks must be initialized before validation"
        )

    try:
        logger.debug("Validating internal LLM chat block")
        self.llm_chat._validate_custom(dataset)

        # Stand-in dataset carrying a placeholder raw-response column.
        raw_column = f"{self.block_name}_raw_response"
        temp_dataset = Dataset.from_list(
            [{**sample, raw_column: "test output"} for sample in dataset]
        )

        logger.debug("Validating internal text parser block")
        self.text_parser._validate_custom(temp_dataset)

        logger.debug("All internal blocks validated successfully")

    except Exception as e:
        logger.error(f"Validation failed in internal blocks: {e}")
        raise ValueError(f"Internal block validation failed: {e}") from e
|
|
746
|
-
|
|
747
|
-
def get_internal_blocks_info(self) -> dict[str, Any]:
    """Collect ``get_info()`` from each internal block.

    Returns
    -------
    dict[str, Any]
        Keys ``"llm_chat"``, ``"llm_parser"`` and ``"text_parser"``, each
        mapped to that block's ``get_info()`` result, or to ``None`` when
        the block is unset.
    """
    summary = {}
    for slot in ("llm_chat", "llm_parser", "text_parser"):
        block = getattr(self, slot)
        summary[slot] = block.get_info() if block else None
    return summary
|
|
760
|
-
|
|
761
|
-
def __repr__(self) -> str:
|
|
762
|
-
"""String representation of the block."""
|
|
763
|
-
model = (
|
|
764
|
-
self.llm_chat.model
|
|
765
|
-
if (self.llm_chat and self.llm_chat.model)
|
|
766
|
-
else "not_configured"
|
|
767
|
-
)
|
|
768
|
-
return (
|
|
769
|
-
f"LLMChatWithParsingRetryBlock(name='{self.block_name}', "
|
|
770
|
-
f"model='{model}', parsing_max_retries={self.parsing_max_retries})"
|
|
771
|
-
)
|