sdg-hub 0.5.1__py3-none-any.whl → 0.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. sdg_hub/_version.py +2 -2
  2. sdg_hub/core/blocks/base.py +60 -58
  3. sdg_hub/core/blocks/filtering/column_value_filter.py +29 -16
  4. sdg_hub/core/blocks/llm/__init__.py +0 -2
  5. sdg_hub/core/blocks/llm/llm_chat_block.py +42 -36
  6. sdg_hub/core/blocks/llm/llm_parser_block.py +13 -59
  7. sdg_hub/core/blocks/llm/prompt_builder_block.py +15 -10
  8. sdg_hub/core/blocks/llm/text_parser_block.py +14 -61
  9. sdg_hub/core/blocks/transform/duplicate_columns.py +9 -8
  10. sdg_hub/core/blocks/transform/index_based_mapper.py +29 -15
  11. sdg_hub/core/blocks/transform/json_structure_block.py +16 -13
  12. sdg_hub/core/blocks/transform/melt_columns.py +13 -12
  13. sdg_hub/core/blocks/transform/rename_columns.py +20 -9
  14. sdg_hub/core/blocks/transform/text_concat.py +20 -21
  15. sdg_hub/core/blocks/transform/uniform_col_val_setter.py +6 -5
  16. sdg_hub/core/flow/base.py +139 -106
  17. sdg_hub/core/flow/checkpointer.py +34 -36
  18. sdg_hub/core/flow/validation.py +4 -4
  19. sdg_hub/core/utils/datautils.py +52 -54
  20. sdg_hub/core/utils/flow_metrics.py +9 -6
  21. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/flow.yaml +1 -0
  22. {sdg_hub-0.5.1.dist-info → sdg_hub-0.6.1.dist-info}/METADATA +5 -9
  23. {sdg_hub-0.5.1.dist-info → sdg_hub-0.6.1.dist-info}/RECORD +26 -28
  24. sdg_hub/core/blocks/llm/llm_chat_with_parsing_retry_block.py +0 -771
  25. sdg_hub/core/utils/temp_manager.py +0 -57
  26. {sdg_hub-0.5.1.dist-info → sdg_hub-0.6.1.dist-info}/WHEEL +0 -0
  27. {sdg_hub-0.5.1.dist-info → sdg_hub-0.6.1.dist-info}/licenses/LICENSE +0 -0
  28. {sdg_hub-0.5.1.dist-info → sdg_hub-0.6.1.dist-info}/top_level.txt +0 -0
sdg_hub/core/blocks/llm/llm_chat_with_parsing_retry_block.py (deleted)
@@ -1,771 +0,0 @@
1
- # SPDX-License-Identifier: Apache-2.0
2
- """Composite block combining LLM chat and text parsing with retry logic.
3
-
4
- This module provides the LLMChatWithParsingRetryBlock that encapsulates the complete
5
- LLM generation and parsing workflow with automatic retry on parsing failures.
6
- """
7
-
8
- # Standard
9
- from typing import Any, Optional
10
-
11
- # Third Party
12
- from datasets import Dataset
13
- from pydantic import ConfigDict, Field, field_validator
14
-
15
- # Local
16
- from ...utils.error_handling import BlockValidationError
17
- from ...utils.logger_config import setup_logger
18
- from ..base import BaseBlock
19
- from ..registry import BlockRegistry
20
- from .llm_chat_block import LLMChatBlock
21
- from .llm_parser_block import LLMParserBlock
22
- from .text_parser_block import TextParserBlock
23
-
24
- logger = setup_logger(__name__)
25
-
26
-
27
- class MaxRetriesExceededError(Exception):
28
- """Raised when maximum retry attempts are exceeded without achieving target count."""
29
-
30
- def __init__(self, target_count: int, actual_count: int, max_retries: int):
31
- self.target_count = target_count
32
- self.actual_count = actual_count
33
- self.max_retries = max_retries
34
- super().__init__(
35
- f"Failed to achieve target count {target_count} after {max_retries} retries. "
36
- f"Only got {actual_count} successful parses."
37
- )
38
-
39
-
40
- @BlockRegistry.register(
41
- "LLMChatWithParsingRetryBlock",
42
- "llm",
43
- "Composite block combining LLM chat and text parsing with automatic retry on parsing failures",
44
- )
45
- class LLMChatWithParsingRetryBlock(BaseBlock):
46
- """Composite block for LLM generation with parsing retry logic.
47
-
48
- This block combines LLMChatBlock and TextParserBlock into a single cohesive block
49
- that automatically retries LLM generation when parsing fails, accumulating successful
50
- results until the target count is reached or max retries exceeded.
51
-
52
- Parameters
53
- ----------
54
- block_name : str
55
- Name of the block.
56
- input_cols : Union[str, List[str]]
57
- Input column name(s). Should contain the messages list.
58
- output_cols : Union[str, List[str]]
59
- Output column name(s) for parsed results.
60
- model : str
61
- Model identifier in LiteLLM format.
62
- api_base : Optional[str]
63
- Base URL for the API. Required for local models.
64
- api_key : Optional[str]
65
- API key for the provider. Falls back to environment variables.
66
- parsing_max_retries : int, optional
67
- Maximum number of retry attempts for parsing failures (default: 3).
68
- This is different from max_retries, which handles LLM network/API failures.
69
-
70
- **llm_kwargs : Any
71
- Any LiteLLM completion parameters (model, api_base, api_key, temperature,
72
- max_tokens, top_p, frequency_penalty, presence_penalty, stop, seed,
73
- response_format, stream, n, logprobs, top_logprobs, user, extra_headers,
74
- extra_body, async_mode, timeout, num_retries, etc.).
75
- See https://docs.litellm.ai/docs/completion/input for full list.
76
-
77
- ### Text Parser Parameters ###
78
- start_tags : List[str], optional
79
- List of start tags for tag-based parsing.
80
- end_tags : List[str], optional
81
- List of end tags for tag-based parsing.
82
- parsing_pattern : Optional[str], optional
83
- Regex pattern for custom parsing.
84
- parser_cleanup_tags : Optional[List[str]], optional
85
- List of tags to clean from parsed output.
86
-
87
- ### LLMParserBlock Parameters ###
88
- extract_content : bool, optional
89
- Whether to extract 'content' field from responses.
90
- extract_reasoning_content : bool, optional
91
- Whether to extract 'reasoning_content' field from responses.
92
- extract_tool_calls : bool, optional
93
- Whether to extract 'tool_calls' field from responses.
94
- expand_lists : bool, optional
95
- Whether to expand list inputs into individual rows (True) or preserve lists (False).
96
- field_prefix : Optional[str], optional
97
- Prefix for the field names in the parsed output.
98
-
99
- Examples
100
- --------
101
- >>> # Basic JSON parsing with retry
102
- >>> block = LLMChatWithParsingRetryBlock(
103
- ... block_name="json_retry_block",
104
- ... input_cols="messages",
105
- ... output_cols="parsed_json",
106
- ... model="openai/gpt-4",
107
- ... parsing_max_retries=3,
108
- ... parsing_pattern=r'"result":\s*"([^"]*)"',
109
- ... n=3
110
- ... )
111
-
112
- >>> # Tag-based parsing with retry
113
- >>> block = LLMChatWithParsingRetryBlock(
114
- ... block_name="tag_retry_block",
115
- ... input_cols="messages",
116
- ... output_cols=["explanation", "answer"],
117
- ... model="anthropic/claude-3-sonnet-20240229",
118
- ... parsing_max_retries=5,
119
- ... start_tags=["<explanation>", "<answer>"],
120
- ... end_tags=["</explanation>", "</answer>"],
121
- ... n=2
122
- ... )
123
- """
124
-
125
- model_config = ConfigDict(
126
- extra="allow"
127
- ) # Allow extra fields for dynamic forwarding
128
-
129
- # --- Composite-specific configuration ---
130
- parsing_max_retries: int = Field(
131
- 3, description="Maximum number of retry attempts for parsing failures"
132
- )
133
-
134
- # --- Parser configuration (required for internal TextParserBlock) ---
135
- start_tags: Optional[list[str]] = Field(
136
- None, description="Start tags for tag-based parsing"
137
- )
138
- end_tags: Optional[list[str]] = Field(
139
- None, description="End tags for tag-based parsing"
140
- )
141
- parsing_pattern: Optional[str] = Field(
142
- None, description="Regex pattern for custom parsing"
143
- )
144
- parser_cleanup_tags: Optional[list[str]] = Field(
145
- None, description="List of tags to clean from parsed output"
146
- )
147
-
148
- ### LLMParserBlock Parameters ###
149
- extract_content: bool = Field(
150
- default=True, description="Whether to extract 'content' field from responses."
151
- )
152
- extract_reasoning_content: bool = Field(
153
- default=False,
154
- description="Whether to extract 'reasoning_content' field from responses.",
155
- )
156
- extract_tool_calls: bool = Field(
157
- default=False,
158
- description="Whether to extract 'tool_calls' field from responses.",
159
- )
160
- expand_lists: bool = Field(
161
- default=True,
162
- description="Whether to expand list inputs into individual rows (True) or preserve lists (False).",
163
- )
164
- field_prefix: Optional[str] = Field(
165
- default="", description="Prefix for the field names in the parsed output."
166
- )
167
-
168
- # Internal blocks - excluded from serialization
169
- llm_chat: Optional[LLMChatBlock] = Field(None, exclude=True)
170
- text_parser: Optional[TextParserBlock] = Field(None, exclude=True)
171
- llm_parser: Optional[LLMParserBlock] = Field(None, exclude=True)
172
-
173
- @field_validator("input_cols")
174
- @classmethod
175
- def validate_single_input_col(cls, v):
176
- """Ensure exactly one input column."""
177
- if isinstance(v, str):
178
- return [v]
179
- if isinstance(v, list) and len(v) == 1:
180
- return v
181
- if isinstance(v, list) and len(v) != 1:
182
- raise ValueError(
183
- f"LLMChatWithParsingRetryBlock expects exactly one input column, got {len(v)}: {v}"
184
- )
185
- raise ValueError(f"Invalid input_cols format: {v}")
186
-
187
- @field_validator("parsing_max_retries")
188
- @classmethod
189
- def validate_parsing_max_retries(cls, v):
190
- """Ensure parsing_max_retries is positive."""
191
- if v < 1:
192
- raise ValueError("parsing_max_retries must be at least 1")
193
- return v
194
-
195
- def __init__(self, **kwargs):
196
- """Initialize with dynamic parameter routing."""
197
- super().__init__(**kwargs)
198
- self._create_internal_blocks(**kwargs)
199
-
200
- # Log initialization if model is configured
201
- if self.llm_chat and self.llm_chat.model:
202
- logger.info(
203
- f"Initialized LLMChatWithParsingRetryBlock '{self.block_name}' with model '{self.llm_chat.model}'",
204
- extra={
205
- "block_name": self.block_name,
206
- "model": self.llm_chat.model,
207
- "parsing_max_retries": self.parsing_max_retries,
208
- },
209
- )
210
-
211
- def _extract_params(self, kwargs: dict, block_class) -> dict:
212
- """Extract parameters for specific block class."""
213
- # Parameters that belong to this wrapper and shouldn't be forwarded
214
- wrapper_params = {
215
- "block_name",
216
- "input_cols",
217
- "output_cols",
218
- "parsing_max_retries",
219
- }
220
-
221
- if block_class == LLMChatBlock:
222
- # LLMChatBlock accepts any parameters via extra="allow"
223
- # Forward everything except wrapper-specific and parser-specific params
224
- parser_specific_params = {
225
- "start_tags",
226
- "end_tags",
227
- "parsing_pattern",
228
- "parser_cleanup_tags",
229
- }
230
- llm_parser_specific_params = {
231
- "extract_content",
232
- "extract_reasoning_content",
233
- "extract_tool_calls",
234
- "expand_lists",
235
- "field_prefix",
236
- }
237
- excluded_params = (
238
- wrapper_params | parser_specific_params | llm_parser_specific_params
239
- )
240
-
241
- # Forward all other kwargs
242
- params = {k: v for k, v in kwargs.items() if k not in excluded_params}
243
-
244
- # Also forward instance attributes that aren't parser-specific
245
- for field_name, field_value in self.__dict__.items():
246
- if (
247
- field_name not in excluded_params
248
- and not field_name.startswith("_")
249
- and field_name not in ["llm_chat", "text_parser", "llm_parser"]
250
- and field_value is not None
251
- ):
252
- params[field_name] = field_value
253
-
254
- else:
255
- # For TextParserBlock, only forward known fields and parser-specific params
256
- non_llm_chat_params = {
257
- "start_tags",
258
- "end_tags",
259
- "parsing_pattern",
260
- "parser_cleanup_tags",
261
- "expand_lists",
262
- "field_prefix",
263
- "extract_content",
264
- "extract_reasoning_content",
265
- "extract_tool_calls",
266
- }
267
-
268
- # Forward parser-specific parameters from kwargs
269
- params = {
270
- k: v
271
- for k, v in kwargs.items()
272
- if k in block_class.model_fields and k not in wrapper_params
273
- }
274
-
275
- # Forward parser-specific instance attributes
276
- for field_name in non_llm_chat_params:
277
- if hasattr(self, field_name):
278
- field_value = getattr(self, field_name)
279
- if field_value is not None:
280
- params[field_name] = field_value
281
-
282
- return params
283
-
284
- def _create_internal_blocks(self, **kwargs):
285
- """Create internal blocks with parameter routing."""
286
- # Route parameters to appropriate blocks
287
- llm_params = self._extract_params(kwargs, LLMChatBlock)
288
- parser_params = self._extract_params(kwargs, TextParserBlock)
289
- llm_parser_params = self._extract_params(kwargs, LLMParserBlock)
290
-
291
- # 1. LLMChatBlock
292
- self.llm_chat = LLMChatBlock(
293
- block_name=f"{self.block_name}_llm_chat",
294
- input_cols=self.input_cols,
295
- output_cols=[f"{self.block_name}_raw_response"],
296
- **llm_params,
297
- )
298
-
299
- # 2. LLMParserBlock
300
- self.llm_parser = LLMParserBlock(
301
- block_name=f"{self.block_name}_llm_parser",
302
- input_cols=[f"{self.block_name}_raw_response"],
303
- **llm_parser_params,
304
- )
305
-
306
- # 2. TextParserBlock
307
- self.text_parser = TextParserBlock(
308
- block_name=f"{self.block_name}_text_parser",
309
- input_cols=[
310
- f"{self.llm_parser.field_prefix if self.llm_parser.field_prefix!='' else self.llm_parser.block_name}_content"
311
- ],
312
- output_cols=self.output_cols,
313
- **parser_params,
314
- )
315
-
316
- def __getattr__(self, name: str) -> Any:
317
- """Forward attribute access to appropriate internal block."""
318
- # Parser-specific parameters go to text_parser
319
- parser_params = {
320
- "start_tags",
321
- "end_tags",
322
- "parsing_pattern",
323
- "parser_cleanup_tags",
324
- }
325
- llm_parser_params = {
326
- "extract_content",
327
- "extract_reasoning_content",
328
- "extract_tool_calls",
329
- "expand_lists",
330
- "field_prefix",
331
- }
332
-
333
- if name in parser_params and hasattr(self, "text_parser") and self.text_parser:
334
- return getattr(self.text_parser, name)
335
-
336
- if (
337
- name in llm_parser_params
338
- and hasattr(self, "llm_parser")
339
- and self.llm_parser
340
- ):
341
- return getattr(self.llm_parser, name)
342
-
343
- # Everything else goes to llm_chat (which accepts any parameters via extra="allow")
344
- if hasattr(self, "llm_chat") and self.llm_chat:
345
- # Always try LLMChatBlock - it will return None for unset attributes
346
- # due to extra="allow", which makes hasattr() work correctly
347
- return getattr(self.llm_chat, name, None)
348
-
349
- raise AttributeError(
350
- f"'{self.__class__.__name__}' object has no attribute '{name}'"
351
- )
352
-
353
- def __setattr__(self, name: str, value: Any) -> None:
354
- """Handle dynamic parameter updates from flow.set_model_config()."""
355
- super().__setattr__(name, value)
356
-
357
- # Don't forward during initialization or for internal attributes
358
- if not hasattr(self, "llm_chat") or name.startswith("_"):
359
- return
360
-
361
- # Parser-specific parameters go to text_parser
362
- parser_params = {
363
- "start_tags",
364
- "end_tags",
365
- "parsing_pattern",
366
- "parser_cleanup_tags",
367
- }
368
- llm_parser_params = {
369
- "extract_content",
370
- "extract_reasoning_content",
371
- "extract_tool_calls",
372
- "expand_lists",
373
- "field_prefix",
374
- }
375
-
376
- if name in parser_params and hasattr(self, "text_parser") and self.text_parser:
377
- setattr(self.text_parser, name, value)
378
-
379
- if (
380
- name in llm_parser_params
381
- and hasattr(self, "llm_parser")
382
- and self.llm_parser
383
- ):
384
- setattr(self.llm_parser, name, value)
385
-
386
- # LLM-related parameters go to llm_chat (which accepts any via extra="allow")
387
- elif (
388
- hasattr(self, "llm_chat")
389
- and self.llm_chat
390
- and name
391
- not in {
392
- "block_name",
393
- "input_cols",
394
- "output_cols",
395
- "parsing_max_retries",
396
- "llm_chat",
397
- "llm_parser",
398
- "text_parser",
399
- }
400
- ):
401
- setattr(self.llm_chat, name, value)
402
-
403
- def generate(self, samples: Dataset, **kwargs: Any) -> Dataset:
404
- """Generate responses with parsing retry logic.
405
-
406
- For each input sample, this method:
407
- 1. Generates LLM responses using the configured n parameter
408
- 2. Attempts to parse the responses using TextParserBlock
409
- 3. Counts successful parses and retries if below target
410
- 4. Accumulates results across retry attempts
411
- 5. Returns final dataset with all successful parses
412
-
413
- Parameters
414
- ----------
415
- samples : Dataset
416
- Input dataset containing the messages column.
417
- **kwargs : Any
418
- Additional keyword arguments passed to internal blocks.
419
-
420
- Returns
421
- -------
422
- Dataset
423
- Dataset with parsed results from successful generations.
424
-
425
- Raises
426
- ------
427
- BlockValidationError
428
- If model is not configured before calling generate().
429
- MaxRetriesExceededError
430
- If target count not reached after max retries for any sample.
431
- """
432
- # Validate that model is configured (check internal LLM block)
433
- if not self.llm_chat or not self.llm_chat.model:
434
- raise BlockValidationError(
435
- f"Model not configured for block '{self.block_name}'. "
436
- f"Call flow.set_model_config() before generating."
437
- )
438
-
439
- logger.info(
440
- f"Starting LLM generation with parsing retry for {len(samples)} samples",
441
- extra={
442
- "block_name": self.block_name,
443
- "model": self.llm_chat.model,
444
- "batch_size": len(samples),
445
- "parsing_max_retries": self.parsing_max_retries,
446
- },
447
- )
448
-
449
- all_results = []
450
-
451
- # Process each sample independently with retry logic
452
- for sample_idx, sample in enumerate(samples):
453
- # Determine target count for this sample (number of completions requested)
454
- target = kwargs.get("n", getattr(self, "n", None)) or 1
455
-
456
- logger.debug(
457
- f"Processing sample {sample_idx} with target count {target}",
458
- extra={
459
- "block_name": self.block_name,
460
- "sample_idx": sample_idx,
461
- "target_count": target,
462
- },
463
- )
464
-
465
- if self.llm_parser.expand_lists:
466
- # Current behavior for expand_lists=True: count rows directly
467
- sample_results = []
468
- total_parsed_count = 0
469
-
470
- # Retry loop for this sample
471
- for attempt in range(self.parsing_max_retries):
472
- if total_parsed_count >= target:
473
- break # Already reached target
474
-
475
- try:
476
- # Generate LLM responses for this sample
477
- temp_dataset = Dataset.from_list([sample])
478
- llm_result = self.llm_chat.generate(temp_dataset, **kwargs)
479
- llm_parser_result = self.llm_parser.generate(
480
- llm_result, **kwargs
481
- )
482
-
483
- # Parse the responses
484
- parsed_result = self.text_parser.generate(
485
- llm_parser_result, **kwargs
486
- )
487
-
488
- # Count successful parses and accumulate results
489
- new_parsed_count = len(parsed_result)
490
- total_parsed_count += new_parsed_count
491
- sample_results.extend(parsed_result)
492
-
493
- logger.debug(
494
- f"Attempt {attempt + 1} for sample {sample_idx}: {new_parsed_count} successful parses "
495
- f"(total: {total_parsed_count}/{target})",
496
- extra={
497
- "block_name": self.block_name,
498
- "sample_idx": sample_idx,
499
- "attempt": attempt + 1,
500
- "new_parses": new_parsed_count,
501
- "total_parses": total_parsed_count,
502
- "target_count": target,
503
- },
504
- )
505
-
506
- if total_parsed_count >= target:
507
- logger.debug(
508
- f"Target reached for sample {sample_idx} after {attempt + 1} attempts",
509
- extra={
510
- "block_name": self.block_name,
511
- "sample_idx": sample_idx,
512
- "attempts": attempt + 1,
513
- "final_count": total_parsed_count,
514
- },
515
- )
516
- break
517
-
518
- except Exception as e:
519
- logger.warning(
520
- f"Error during attempt {attempt + 1} for sample {sample_idx}: {e}",
521
- extra={
522
- "block_name": self.block_name,
523
- "sample_idx": sample_idx,
524
- "attempt": attempt + 1,
525
- "error": str(e),
526
- },
527
- )
528
- # Continue to next attempt
529
- continue
530
-
531
- else:
532
- # New behavior for expand_lists=False: parse individual responses and accumulate
533
- accumulated_parsed_items = {col: [] for col in self.output_cols}
534
- total_parsed_count = 0
535
-
536
- # Retry loop for this sample
537
- for attempt in range(self.parsing_max_retries):
538
- if total_parsed_count >= target:
539
- break # Already reached target
540
-
541
- try:
542
- # Generate LLM responses for this sample
543
- temp_dataset = Dataset.from_list([sample])
544
- llm_result = self.llm_chat.generate(temp_dataset, **kwargs)
545
- llm_parser_result = self.llm_parser.generate(
546
- llm_result, **kwargs
547
- )
548
- # Get the raw responses (should be a list when n > 1)
549
- raw_response_col = f"{self.llm_parser.field_prefix if self.llm_parser.field_prefix!='' else self.llm_parser.block_name}_content"
550
- raw_responses = llm_parser_result[0][raw_response_col]
551
- if not isinstance(raw_responses, list):
552
- raw_responses = [raw_responses]
553
-
554
- # Parse each response individually and accumulate successful ones
555
- new_parsed_count = 0
556
- for response in raw_responses:
557
- if total_parsed_count >= target:
558
- break # Stop if we've reached target
559
-
560
- # Create temporary dataset with single response for parsing
561
- temp_parse_data = [{**sample, raw_response_col: response}]
562
- temp_parse_dataset = Dataset.from_list(temp_parse_data)
563
-
564
- # Force expand_lists=True temporarily to get individual parsed items
565
- original_expand_lists = self.llm_parser.expand_lists
566
- try:
567
- self.llm_parser.expand_lists = (
568
- self.llm_parser.expand_lists
569
- )
570
- parsed_result = self.text_parser.generate(
571
- temp_parse_dataset, **kwargs
572
- )
573
- except Exception as parse_e:
574
- logger.debug(
575
- f"Failed to parse individual response: {parse_e}"
576
- )
577
- continue
578
- finally:
579
- self.llm_parser.expand_lists = original_expand_lists
580
-
581
- # If parsing was successful, accumulate the results
582
- if len(parsed_result) > 0:
583
- for parsed_row in parsed_result:
584
- if total_parsed_count >= target:
585
- break
586
-
587
- # Only count as successful if ALL output columns are present
588
- if all(
589
- col in parsed_row for col in self.output_cols
590
- ):
591
- for col in self.output_cols:
592
- accumulated_parsed_items[col].append(
593
- parsed_row[col]
594
- )
595
- total_parsed_count += 1
596
- new_parsed_count += 1
597
- # If any column is missing, skip this parsed response entirely
598
-
599
- logger.debug(
600
- f"Attempt {attempt + 1} for sample {sample_idx}: {new_parsed_count} successful parses "
601
- f"(total: {total_parsed_count}/{target})",
602
- extra={
603
- "block_name": self.block_name,
604
- "sample_idx": sample_idx,
605
- "attempt": attempt + 1,
606
- "new_parses": new_parsed_count,
607
- "total_parses": total_parsed_count,
608
- "target_count": target,
609
- },
610
- )
611
-
612
- if total_parsed_count >= target:
613
- logger.debug(
614
- f"Target reached for sample {sample_idx} after {attempt + 1} attempts",
615
- extra={
616
- "block_name": self.block_name,
617
- "sample_idx": sample_idx,
618
- "attempts": attempt + 1,
619
- "final_count": total_parsed_count,
620
- },
621
- )
622
- break
623
-
624
- except Exception as e:
625
- logger.warning(
626
- f"Error during attempt {attempt + 1} for sample {sample_idx}: {e}",
627
- extra={
628
- "block_name": self.block_name,
629
- "sample_idx": sample_idx,
630
- "attempt": attempt + 1,
631
- "error": str(e),
632
- },
633
- )
634
- # Continue to next attempt
635
- continue
636
-
637
- # Create final result row with accumulated lists
638
- if total_parsed_count > 0:
639
- # Trim to exact target count if needed
640
- for col in self.output_cols:
641
- if len(accumulated_parsed_items[col]) > target:
642
- accumulated_parsed_items[col] = accumulated_parsed_items[
643
- col
644
- ][:target]
645
-
646
- # Only add the parsed output columns as lists, preserve other columns as-is
647
- final_row = {**sample, **accumulated_parsed_items}
648
- sample_results = [final_row]
649
- else:
650
- sample_results = []
651
-
652
- # Check if we reached the target count
653
- if total_parsed_count < target:
654
- raise MaxRetriesExceededError(
655
- target_count=target,
656
- actual_count=total_parsed_count,
657
- max_retries=self.parsing_max_retries,
658
- )
659
-
660
- # For expand_lists=True, trim results to exact target count if we exceeded it
661
- if self.llm_parser.expand_lists and total_parsed_count > target:
662
- sample_results = sample_results[:target]
663
- logger.debug(
664
- f"Trimmed sample {sample_idx} results from {total_parsed_count} to {target}",
665
- extra={
666
- "block_name": self.block_name,
667
- "sample_idx": sample_idx,
668
- "trimmed_from": total_parsed_count,
669
- "trimmed_to": target,
670
- },
671
- )
672
-
673
- # Add this sample's results to final dataset
674
- all_results.extend(sample_results)
675
-
676
- logger.info(
677
- f"LLM generation with parsing retry completed: {len(samples)} input samples → {len(all_results)} output rows",
678
- extra={
679
- "block_name": self.block_name,
680
- "input_samples": len(samples),
681
- "output_rows": len(all_results),
682
- "model": self.llm_chat.model,
683
- },
684
- )
685
-
686
- return Dataset.from_list(all_results)
687
-
688
- def _validate_custom(self, dataset: Dataset) -> None:
689
- """Custom validation for LLMChatWithParsingRetryBlock.
690
-
691
- This method validates the entire chain of internal blocks by simulating
692
- the data flow through each block to ensure they can all process the data correctly.
693
- """
694
- # Validate that required input column exists
695
- if len(self.input_cols) != 1:
696
- raise ValueError(
697
- f"LLMChatWithParsingRetryBlock expects exactly one input column, got {len(self.input_cols)}"
698
- )
699
-
700
- input_col = self.input_cols[0]
701
- if input_col not in dataset.column_names:
702
- raise ValueError(
703
- f"Required input column '{input_col}' not found in dataset. "
704
- f"Available columns: {dataset.column_names}"
705
- )
706
-
707
- # Validate parsing configuration
708
- has_regex = getattr(self, "parsing_pattern", None) is not None
709
- has_tags = bool(getattr(self, "start_tags", [])) or bool(
710
- getattr(self, "end_tags", [])
711
- )
712
-
713
- if not has_regex and not has_tags:
714
- raise ValueError(
715
- "LLMChatWithParsingRetryBlock requires at least one parsing method: "
716
- "either 'parsing_pattern' (regex) or 'start_tags'/'end_tags' (tag-based parsing)"
717
- )
718
-
719
- # Validate that internal blocks are initialized
720
- if not all([self.llm_chat, self.text_parser]):
721
- raise ValueError(
722
- "All internal blocks must be initialized before validation"
723
- )
724
-
725
- # Validate internal blocks
726
- try:
727
- logger.debug("Validating internal LLM chat block")
728
- self.llm_chat._validate_custom(dataset)
729
-
730
- # Create temporary dataset with expected LLM output for parser validation
731
- temp_data = []
732
- for sample in dataset:
733
- temp_sample = dict(sample)
734
- temp_sample[f"{self.block_name}_raw_response"] = "test output"
735
- temp_data.append(temp_sample)
736
- temp_dataset = Dataset.from_list(temp_data)
737
-
738
- logger.debug("Validating internal text parser block")
739
- self.text_parser._validate_custom(temp_dataset)
740
-
741
- logger.debug("All internal blocks validated successfully")
742
-
743
- except Exception as e:
744
- logger.error(f"Validation failed in internal blocks: {e}")
745
- raise ValueError(f"Internal block validation failed: {e}") from e
746
-
747
- def get_internal_blocks_info(self) -> dict[str, Any]:
748
- """Get information about the internal blocks.
749
-
750
- Returns
751
- -------
752
- Dict[str, Any]
753
- Information about each internal block.
754
- """
755
- return {
756
- "llm_chat": self.llm_chat.get_info() if self.llm_chat else None,
757
- "llm_parser": self.llm_parser.get_info() if self.llm_parser else None,
758
- "text_parser": self.text_parser.get_info() if self.text_parser else None,
759
- }
760
-
761
- def __repr__(self) -> str:
762
- """String representation of the block."""
763
- model = (
764
- self.llm_chat.model
765
- if (self.llm_chat and self.llm_chat.model)
766
- else "not_configured"
767
- )
768
- return (
769
- f"LLMChatWithParsingRetryBlock(name='{self.block_name}', "
770
- f"model='{model}', parsing_max_retries={self.parsing_max_retries})"
771
- )