sdg-hub 0.2.1__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sdg_hub/_version.py +16 -3
- sdg_hub/core/blocks/deprecated_blocks/selector.py +1 -1
- sdg_hub/core/blocks/evaluation/evaluate_faithfulness_block.py +175 -416
- sdg_hub/core/blocks/evaluation/evaluate_relevancy_block.py +174 -415
- sdg_hub/core/blocks/evaluation/verify_question_block.py +180 -415
- sdg_hub/core/blocks/llm/client_manager.py +92 -43
- sdg_hub/core/blocks/llm/config.py +1 -0
- sdg_hub/core/blocks/llm/llm_chat_block.py +74 -16
- sdg_hub/core/blocks/llm/llm_chat_with_parsing_retry_block.py +277 -115
- sdg_hub/core/blocks/llm/text_parser_block.py +88 -23
- sdg_hub/core/blocks/registry.py +48 -34
- sdg_hub/core/blocks/transform/__init__.py +2 -0
- sdg_hub/core/blocks/transform/index_based_mapper.py +1 -1
- sdg_hub/core/blocks/transform/json_structure_block.py +142 -0
- sdg_hub/core/flow/base.py +326 -62
- sdg_hub/core/utils/datautils.py +54 -0
- sdg_hub/core/utils/flow_metrics.py +261 -0
- sdg_hub/core/utils/logger_config.py +50 -9
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/__init__.py +0 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/__init__.py +0 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/detailed_summary.yaml +11 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/flow.yaml +159 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/__init__.py +0 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/extractive_summary.yaml +65 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/flow.yaml +161 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_answers.yaml +15 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_multiple_qa.yaml +21 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_question_list.yaml +44 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/__init__.py +0 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/flow.yaml +104 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/key_facts_summary.yaml +61 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +0 -7
- sdg_hub/flows/text_analysis/__init__.py +2 -0
- sdg_hub/flows/text_analysis/structured_insights/__init__.py +6 -0
- sdg_hub/flows/text_analysis/structured_insights/analyze_sentiment.yaml +27 -0
- sdg_hub/flows/text_analysis/structured_insights/extract_entities.yaml +38 -0
- sdg_hub/flows/text_analysis/structured_insights/extract_keywords.yaml +21 -0
- sdg_hub/flows/text_analysis/structured_insights/flow.yaml +153 -0
- sdg_hub/flows/text_analysis/structured_insights/summarize.yaml +21 -0
- {sdg_hub-0.2.1.dist-info → sdg_hub-0.3.0.dist-info}/METADATA +42 -15
- {sdg_hub-0.2.1.dist-info → sdg_hub-0.3.0.dist-info}/RECORD +44 -22
- {sdg_hub-0.2.1.dist-info → sdg_hub-0.3.0.dist-info}/WHEEL +0 -0
- {sdg_hub-0.2.1.dist-info → sdg_hub-0.3.0.dist-info}/licenses/LICENSE +0 -0
- {sdg_hub-0.2.1.dist-info → sdg_hub-0.3.0.dist-info}/top_level.txt +0 -0
@@ -12,9 +12,8 @@ from typing import Any, Optional
|
|
12
12
|
from datasets import Dataset
|
13
13
|
from pydantic import ConfigDict, Field, field_validator
|
14
14
|
|
15
|
-
from ...utils.error_handling import BlockValidationError
|
16
|
-
|
17
15
|
# Local
|
16
|
+
from ...utils.error_handling import BlockValidationError
|
18
17
|
from ...utils.logger_config import setup_logger
|
19
18
|
from ..base import BaseBlock
|
20
19
|
from ..registry import BlockRegistry
|
@@ -147,14 +146,24 @@ class LLMChatWithParsingRetryBlock(BaseBlock):
|
|
147
146
|
extra="allow"
|
148
147
|
) # Allow extra fields for dynamic forwarding
|
149
148
|
|
150
|
-
# Composite-specific
|
149
|
+
# --- Composite-specific configuration ---
|
151
150
|
parsing_max_retries: int = Field(
|
152
151
|
3, description="Maximum number of retry attempts for parsing failures"
|
153
152
|
)
|
154
153
|
|
155
|
-
#
|
156
|
-
|
157
|
-
|
154
|
+
# --- Parser configuration (required for internal TextParserBlock) ---
|
155
|
+
start_tags: Optional[list[str]] = Field(
|
156
|
+
None, description="Start tags for tag-based parsing"
|
157
|
+
)
|
158
|
+
end_tags: Optional[list[str]] = Field(
|
159
|
+
None, description="End tags for tag-based parsing"
|
160
|
+
)
|
161
|
+
parsing_pattern: Optional[str] = Field(
|
162
|
+
None, description="Regex pattern for custom parsing"
|
163
|
+
)
|
164
|
+
parser_cleanup_tags: Optional[list[str]] = Field(
|
165
|
+
None, description="List of tags to clean from parsed output"
|
166
|
+
)
|
158
167
|
|
159
168
|
# Internal blocks - excluded from serialization
|
160
169
|
llm_chat: Optional[LLMChatBlock] = Field(None, exclude=True)
|
@@ -183,60 +192,98 @@ class LLMChatWithParsingRetryBlock(BaseBlock):
|
|
183
192
|
return v
|
184
193
|
|
185
194
|
def __init__(self, **kwargs):
|
186
|
-
"""Initialize with dynamic parameter
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
# Forward parameters to appropriate internal blocks
|
191
|
-
llm_params = {k: v for k, v in kwargs.items() if k in LLMChatBlock.model_fields}
|
192
|
-
parser_params = {
|
193
|
-
k: v for k, v in kwargs.items() if k in TextParserBlock.model_fields
|
194
|
-
}
|
195
|
-
|
196
|
-
# Keep only BaseBlock fields for super().__init__
|
197
|
-
base_params = {k: v for k, v in kwargs.items() if k in BaseBlock.model_fields}
|
198
|
-
base_params["parsing_max_retries"] = parsing_max_retries
|
199
|
-
base_params["llm_params"] = llm_params
|
200
|
-
base_params["parser_params"] = parser_params
|
201
|
-
|
202
|
-
# Initialize parent with all valid parameters
|
203
|
-
super().__init__(**base_params)
|
204
|
-
|
205
|
-
# Create internal blocks with forwarded parameters
|
206
|
-
self._create_internal_blocks()
|
195
|
+
"""Initialize with dynamic parameter routing."""
|
196
|
+
super().__init__(**kwargs)
|
197
|
+
self._create_internal_blocks(**kwargs)
|
207
198
|
|
208
|
-
# Log initialization
|
209
|
-
model
|
210
|
-
if model:
|
199
|
+
# Log initialization if model is configured
|
200
|
+
if hasattr(self, "model") and self.model:
|
211
201
|
logger.info(
|
212
|
-
f"Initialized LLMChatWithParsingRetryBlock '{self.block_name}' with model '{model}'",
|
202
|
+
f"Initialized LLMChatWithParsingRetryBlock '{self.block_name}' with model '{self.model}'",
|
213
203
|
extra={
|
214
204
|
"block_name": self.block_name,
|
215
|
-
"model": model,
|
216
|
-
"async_mode": self.llm_params.get("async_mode", False),
|
205
|
+
"model": self.model,
|
217
206
|
"parsing_max_retries": self.parsing_max_retries,
|
218
207
|
},
|
219
208
|
)
|
220
209
|
|
221
|
-
def
|
222
|
-
"""
|
223
|
-
#
|
224
|
-
|
225
|
-
|
226
|
-
"
|
227
|
-
"
|
228
|
-
"
|
210
|
+
def _extract_params(self, kwargs: dict, block_class) -> dict:
|
211
|
+
"""Extract parameters for specific block class based on its model_fields."""
|
212
|
+
# Exclude parameters that are handled by this wrapper
|
213
|
+
wrapper_params = {
|
214
|
+
"block_name",
|
215
|
+
"input_cols",
|
216
|
+
"output_cols",
|
217
|
+
"parsing_max_retries",
|
229
218
|
}
|
230
|
-
self.llm_chat = LLMChatBlock(**llm_kwargs)
|
231
219
|
|
232
|
-
#
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
"output_cols": self.output_cols,
|
220
|
+
# Extract parameters that the target block accepts
|
221
|
+
params = {
|
222
|
+
k: v
|
223
|
+
for k, v in kwargs.items()
|
224
|
+
if k in block_class.model_fields and k not in wrapper_params
|
238
225
|
}
|
239
|
-
|
226
|
+
|
227
|
+
# Also include declared fields from this composite block that the target block accepts
|
228
|
+
for field_name in self.__class__.model_fields:
|
229
|
+
if (
|
230
|
+
field_name in block_class.model_fields
|
231
|
+
and field_name not in wrapper_params
|
232
|
+
):
|
233
|
+
field_value = getattr(self, field_name)
|
234
|
+
if field_value is not None: # Only forward non-None values
|
235
|
+
params[field_name] = field_value
|
236
|
+
|
237
|
+
return params
|
238
|
+
|
239
|
+
def _create_internal_blocks(self, **kwargs):
|
240
|
+
"""Create internal blocks with parameter routing."""
|
241
|
+
# Route parameters to appropriate blocks
|
242
|
+
llm_params = self._extract_params(kwargs, LLMChatBlock)
|
243
|
+
parser_params = self._extract_params(kwargs, TextParserBlock)
|
244
|
+
|
245
|
+
# 1. LLMChatBlock
|
246
|
+
self.llm_chat = LLMChatBlock(
|
247
|
+
block_name=f"{self.block_name}_llm_chat",
|
248
|
+
input_cols=self.input_cols,
|
249
|
+
output_cols=[f"{self.block_name}_raw_response"],
|
250
|
+
**llm_params,
|
251
|
+
)
|
252
|
+
|
253
|
+
# 2. TextParserBlock
|
254
|
+
self.text_parser = TextParserBlock(
|
255
|
+
block_name=f"{self.block_name}_text_parser",
|
256
|
+
input_cols=[f"{self.block_name}_raw_response"],
|
257
|
+
output_cols=self.output_cols,
|
258
|
+
**parser_params,
|
259
|
+
)
|
260
|
+
|
261
|
+
def __getattr__(self, name: str) -> Any:
|
262
|
+
"""Forward attribute access to appropriate internal block."""
|
263
|
+
# Check each internal block to see which one has this parameter
|
264
|
+
for block_attr, block_class in [
|
265
|
+
("llm_chat", LLMChatBlock),
|
266
|
+
("text_parser", TextParserBlock),
|
267
|
+
]:
|
268
|
+
if hasattr(self, block_attr) and name in block_class.model_fields:
|
269
|
+
internal_block = getattr(self, block_attr)
|
270
|
+
if internal_block is not None:
|
271
|
+
return getattr(internal_block, name)
|
272
|
+
raise AttributeError(
|
273
|
+
f"'{self.__class__.__name__}' object has no attribute '{name}'"
|
274
|
+
)
|
275
|
+
|
276
|
+
def __setattr__(self, name: str, value: Any) -> None:
|
277
|
+
"""Handle dynamic parameter updates from flow.set_model_config()."""
|
278
|
+
super().__setattr__(name, value)
|
279
|
+
|
280
|
+
# Forward to appropriate internal blocks
|
281
|
+
for block_attr, block_class in [
|
282
|
+
("llm_chat", LLMChatBlock),
|
283
|
+
("text_parser", TextParserBlock),
|
284
|
+
]:
|
285
|
+
if hasattr(self, block_attr) and name in block_class.model_fields:
|
286
|
+
setattr(getattr(self, block_attr), name, value)
|
240
287
|
|
241
288
|
def _reinitialize_client_manager(self) -> None:
|
242
289
|
"""Reinitialize the internal LLM chat block's client manager.
|
@@ -245,11 +292,8 @@ class LLMChatWithParsingRetryBlock(BaseBlock):
|
|
245
292
|
the internal LLM chat block uses the updated model configuration.
|
246
293
|
"""
|
247
294
|
if self.llm_chat and hasattr(self.llm_chat, "_reinitialize_client_manager"):
|
248
|
-
#
|
249
|
-
|
250
|
-
if key in self.llm_params:
|
251
|
-
setattr(self.llm_chat, key, self.llm_params[key])
|
252
|
-
# Reinitialize its client manager
|
295
|
+
# The parameters should already be forwarded via __setattr__ magic method
|
296
|
+
# Just reinitialize the client manager with the current configuration
|
253
297
|
self.llm_chat._reinitialize_client_manager()
|
254
298
|
|
255
299
|
def generate(self, samples: Dataset, **kwargs: Any) -> Dataset:
|
@@ -282,8 +326,7 @@ class LLMChatWithParsingRetryBlock(BaseBlock):
|
|
282
326
|
If target count not reached after max retries for any sample.
|
283
327
|
"""
|
284
328
|
# Validate that model is configured
|
285
|
-
|
286
|
-
if not model:
|
329
|
+
if not hasattr(self, "model") or not self.model:
|
287
330
|
raise BlockValidationError(
|
288
331
|
f"Model not configured for block '{self.block_name}'. "
|
289
332
|
f"Call flow.set_model_config() before generating."
|
@@ -293,7 +336,7 @@ class LLMChatWithParsingRetryBlock(BaseBlock):
|
|
293
336
|
f"Starting LLM generation with parsing retry for {len(samples)} samples",
|
294
337
|
extra={
|
295
338
|
"block_name": self.block_name,
|
296
|
-
"model": model,
|
339
|
+
"model": self.model,
|
297
340
|
"batch_size": len(samples),
|
298
341
|
"parsing_max_retries": self.parsing_max_retries,
|
299
342
|
},
|
@@ -303,11 +346,8 @@ class LLMChatWithParsingRetryBlock(BaseBlock):
|
|
303
346
|
|
304
347
|
# Process each sample independently with retry logic
|
305
348
|
for sample_idx, sample in enumerate(samples):
|
306
|
-
sample_results = []
|
307
|
-
total_parsed_count = 0
|
308
|
-
|
309
349
|
# Determine target count for this sample (number of completions requested)
|
310
|
-
target = kwargs.get("n", self
|
350
|
+
target = kwargs.get("n", getattr(self, "n", None)) or 1
|
311
351
|
|
312
352
|
logger.debug(
|
313
353
|
f"Processing sample {sample_idx} with target count {target}",
|
@@ -318,61 +358,183 @@ class LLMChatWithParsingRetryBlock(BaseBlock):
|
|
318
358
|
},
|
319
359
|
)
|
320
360
|
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
try:
|
327
|
-
# Generate LLM responses for this sample
|
328
|
-
temp_dataset = Dataset.from_list([sample])
|
329
|
-
llm_result = self.llm_chat.generate(temp_dataset, **kwargs)
|
330
|
-
|
331
|
-
# Parse the responses
|
332
|
-
parsed_result = self.text_parser.generate(llm_result, **kwargs)
|
333
|
-
|
334
|
-
# Count successful parses and accumulate results
|
335
|
-
new_parsed_count = len(parsed_result)
|
336
|
-
total_parsed_count += new_parsed_count
|
337
|
-
sample_results.extend(parsed_result)
|
338
|
-
|
339
|
-
logger.debug(
|
340
|
-
f"Attempt {attempt + 1} for sample {sample_idx}: {new_parsed_count} successful parses "
|
341
|
-
f"(total: {total_parsed_count}/{target})",
|
342
|
-
extra={
|
343
|
-
"block_name": self.block_name,
|
344
|
-
"sample_idx": sample_idx,
|
345
|
-
"attempt": attempt + 1,
|
346
|
-
"new_parses": new_parsed_count,
|
347
|
-
"total_parses": total_parsed_count,
|
348
|
-
"target_count": target,
|
349
|
-
},
|
350
|
-
)
|
361
|
+
if self.text_parser.expand_lists:
|
362
|
+
# Current behavior for expand_lists=True: count rows directly
|
363
|
+
sample_results = []
|
364
|
+
total_parsed_count = 0
|
351
365
|
|
366
|
+
# Retry loop for this sample
|
367
|
+
for attempt in range(self.parsing_max_retries):
|
352
368
|
if total_parsed_count >= target:
|
369
|
+
break # Already reached target
|
370
|
+
|
371
|
+
try:
|
372
|
+
# Generate LLM responses for this sample
|
373
|
+
temp_dataset = Dataset.from_list([sample])
|
374
|
+
llm_result = self.llm_chat.generate(temp_dataset, **kwargs)
|
375
|
+
|
376
|
+
# Parse the responses
|
377
|
+
parsed_result = self.text_parser.generate(llm_result, **kwargs)
|
378
|
+
|
379
|
+
# Count successful parses and accumulate results
|
380
|
+
new_parsed_count = len(parsed_result)
|
381
|
+
total_parsed_count += new_parsed_count
|
382
|
+
sample_results.extend(parsed_result)
|
383
|
+
|
384
|
+
logger.debug(
|
385
|
+
f"Attempt {attempt + 1} for sample {sample_idx}: {new_parsed_count} successful parses "
|
386
|
+
f"(total: {total_parsed_count}/{target})",
|
387
|
+
extra={
|
388
|
+
"block_name": self.block_name,
|
389
|
+
"sample_idx": sample_idx,
|
390
|
+
"attempt": attempt + 1,
|
391
|
+
"new_parses": new_parsed_count,
|
392
|
+
"total_parses": total_parsed_count,
|
393
|
+
"target_count": target,
|
394
|
+
},
|
395
|
+
)
|
396
|
+
|
397
|
+
if total_parsed_count >= target:
|
398
|
+
logger.debug(
|
399
|
+
f"Target reached for sample {sample_idx} after {attempt + 1} attempts",
|
400
|
+
extra={
|
401
|
+
"block_name": self.block_name,
|
402
|
+
"sample_idx": sample_idx,
|
403
|
+
"attempts": attempt + 1,
|
404
|
+
"final_count": total_parsed_count,
|
405
|
+
},
|
406
|
+
)
|
407
|
+
break
|
408
|
+
|
409
|
+
except Exception as e:
|
410
|
+
logger.warning(
|
411
|
+
f"Error during attempt {attempt + 1} for sample {sample_idx}: {e}",
|
412
|
+
extra={
|
413
|
+
"block_name": self.block_name,
|
414
|
+
"sample_idx": sample_idx,
|
415
|
+
"attempt": attempt + 1,
|
416
|
+
"error": str(e),
|
417
|
+
},
|
418
|
+
)
|
419
|
+
# Continue to next attempt
|
420
|
+
continue
|
421
|
+
|
422
|
+
else:
|
423
|
+
# New behavior for expand_lists=False: parse individual responses and accumulate
|
424
|
+
accumulated_parsed_items = {col: [] for col in self.output_cols}
|
425
|
+
total_parsed_count = 0
|
426
|
+
|
427
|
+
# Retry loop for this sample
|
428
|
+
for attempt in range(self.parsing_max_retries):
|
429
|
+
if total_parsed_count >= target:
|
430
|
+
break # Already reached target
|
431
|
+
|
432
|
+
try:
|
433
|
+
# Generate LLM responses for this sample
|
434
|
+
temp_dataset = Dataset.from_list([sample])
|
435
|
+
llm_result = self.llm_chat.generate(temp_dataset, **kwargs)
|
436
|
+
|
437
|
+
# Get the raw responses (should be a list when n > 1)
|
438
|
+
raw_response_col = f"{self.block_name}_raw_response"
|
439
|
+
raw_responses = llm_result[0][raw_response_col]
|
440
|
+
if not isinstance(raw_responses, list):
|
441
|
+
raw_responses = [raw_responses]
|
442
|
+
|
443
|
+
# Parse each response individually and accumulate successful ones
|
444
|
+
new_parsed_count = 0
|
445
|
+
for response in raw_responses:
|
446
|
+
if total_parsed_count >= target:
|
447
|
+
break # Stop if we've reached target
|
448
|
+
|
449
|
+
# Create temporary dataset with single response for parsing
|
450
|
+
temp_parse_data = [{**sample, raw_response_col: response}]
|
451
|
+
temp_parse_dataset = Dataset.from_list(temp_parse_data)
|
452
|
+
|
453
|
+
# Force expand_lists=True temporarily to get individual parsed items
|
454
|
+
original_expand_lists = self.text_parser.expand_lists
|
455
|
+
try:
|
456
|
+
self.text_parser.expand_lists = True
|
457
|
+
parsed_result = self.text_parser.generate(
|
458
|
+
temp_parse_dataset, **kwargs
|
459
|
+
)
|
460
|
+
except Exception as parse_e:
|
461
|
+
logger.debug(
|
462
|
+
f"Failed to parse individual response: {parse_e}"
|
463
|
+
)
|
464
|
+
continue
|
465
|
+
finally:
|
466
|
+
self.text_parser.expand_lists = original_expand_lists
|
467
|
+
|
468
|
+
# If parsing was successful, accumulate the results
|
469
|
+
if len(parsed_result) > 0:
|
470
|
+
for parsed_row in parsed_result:
|
471
|
+
if total_parsed_count >= target:
|
472
|
+
break
|
473
|
+
|
474
|
+
# Only count as successful if ALL output columns are present
|
475
|
+
if all(
|
476
|
+
col in parsed_row for col in self.output_cols
|
477
|
+
):
|
478
|
+
for col in self.output_cols:
|
479
|
+
accumulated_parsed_items[col].append(
|
480
|
+
parsed_row[col]
|
481
|
+
)
|
482
|
+
total_parsed_count += 1
|
483
|
+
new_parsed_count += 1
|
484
|
+
# If any column is missing, skip this parsed response entirely
|
485
|
+
|
353
486
|
logger.debug(
|
354
|
-
f"
|
487
|
+
f"Attempt {attempt + 1} for sample {sample_idx}: {new_parsed_count} successful parses "
|
488
|
+
f"(total: {total_parsed_count}/{target})",
|
489
|
+
extra={
|
490
|
+
"block_name": self.block_name,
|
491
|
+
"sample_idx": sample_idx,
|
492
|
+
"attempt": attempt + 1,
|
493
|
+
"new_parses": new_parsed_count,
|
494
|
+
"total_parses": total_parsed_count,
|
495
|
+
"target_count": target,
|
496
|
+
},
|
497
|
+
)
|
498
|
+
|
499
|
+
if total_parsed_count >= target:
|
500
|
+
logger.debug(
|
501
|
+
f"Target reached for sample {sample_idx} after {attempt + 1} attempts",
|
502
|
+
extra={
|
503
|
+
"block_name": self.block_name,
|
504
|
+
"sample_idx": sample_idx,
|
505
|
+
"attempts": attempt + 1,
|
506
|
+
"final_count": total_parsed_count,
|
507
|
+
},
|
508
|
+
)
|
509
|
+
break
|
510
|
+
|
511
|
+
except Exception as e:
|
512
|
+
logger.warning(
|
513
|
+
f"Error during attempt {attempt + 1} for sample {sample_idx}: {e}",
|
355
514
|
extra={
|
356
515
|
"block_name": self.block_name,
|
357
516
|
"sample_idx": sample_idx,
|
358
|
-
"
|
359
|
-
"
|
517
|
+
"attempt": attempt + 1,
|
518
|
+
"error": str(e),
|
360
519
|
},
|
361
520
|
)
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
372
|
-
|
373
|
-
|
374
|
-
#
|
375
|
-
|
521
|
+
# Continue to next attempt
|
522
|
+
continue
|
523
|
+
|
524
|
+
# Create final result row with accumulated lists
|
525
|
+
if total_parsed_count > 0:
|
526
|
+
# Trim to exact target count if needed
|
527
|
+
for col in self.output_cols:
|
528
|
+
if len(accumulated_parsed_items[col]) > target:
|
529
|
+
accumulated_parsed_items[col] = accumulated_parsed_items[
|
530
|
+
col
|
531
|
+
][:target]
|
532
|
+
|
533
|
+
# Only add the parsed output columns as lists, preserve other columns as-is
|
534
|
+
final_row = {**sample, **accumulated_parsed_items}
|
535
|
+
sample_results = [final_row]
|
536
|
+
else:
|
537
|
+
sample_results = []
|
376
538
|
|
377
539
|
# Check if we reached the target count
|
378
540
|
if total_parsed_count < target:
|
@@ -382,8 +544,8 @@ class LLMChatWithParsingRetryBlock(BaseBlock):
|
|
382
544
|
max_retries=self.parsing_max_retries,
|
383
545
|
)
|
384
546
|
|
385
|
-
#
|
386
|
-
if total_parsed_count > target:
|
547
|
+
# For expand_lists=True, trim results to exact target count if we exceeded it
|
548
|
+
if self.text_parser.expand_lists and total_parsed_count > target:
|
387
549
|
sample_results = sample_results[:target]
|
388
550
|
logger.debug(
|
389
551
|
f"Trimmed sample {sample_idx} results from {total_parsed_count} to {target}",
|
@@ -404,7 +566,7 @@ class LLMChatWithParsingRetryBlock(BaseBlock):
|
|
404
566
|
"block_name": self.block_name,
|
405
567
|
"input_samples": len(samples),
|
406
568
|
"output_rows": len(all_results),
|
407
|
-
"model": model,
|
569
|
+
"model": self.model,
|
408
570
|
},
|
409
571
|
)
|
410
572
|
|
@@ -430,9 +592,9 @@ class LLMChatWithParsingRetryBlock(BaseBlock):
|
|
430
592
|
)
|
431
593
|
|
432
594
|
# Validate parsing configuration
|
433
|
-
has_regex = self
|
434
|
-
has_tags = bool(self
|
435
|
-
self
|
595
|
+
has_regex = getattr(self, "parsing_pattern", None) is not None
|
596
|
+
has_tags = bool(getattr(self, "start_tags", [])) or bool(
|
597
|
+
getattr(self, "end_tags", [])
|
436
598
|
)
|
437
599
|
|
438
600
|
if not has_regex and not has_tags:
|
@@ -484,7 +646,7 @@ class LLMChatWithParsingRetryBlock(BaseBlock):
|
|
484
646
|
|
485
647
|
def __repr__(self) -> str:
|
486
648
|
"""String representation of the block."""
|
487
|
-
model = self
|
649
|
+
model = getattr(self, "model", "not_configured")
|
488
650
|
return (
|
489
651
|
f"LLMChatWithParsingRetryBlock(name='{self.block_name}', "
|
490
652
|
f"model='{model}', parsing_max_retries={self.parsing_max_retries})"
|