sdg-hub 0.1.4__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (145)
  1. sdg_hub/__init__.py +28 -1
  2. sdg_hub/_version.py +2 -2
  3. sdg_hub/core/__init__.py +22 -0
  4. sdg_hub/core/blocks/__init__.py +58 -0
  5. sdg_hub/core/blocks/base.py +313 -0
  6. sdg_hub/core/blocks/deprecated_blocks/__init__.py +29 -0
  7. sdg_hub/core/blocks/deprecated_blocks/combine_columns.py +93 -0
  8. sdg_hub/core/blocks/deprecated_blocks/duplicate_columns.py +88 -0
  9. sdg_hub/core/blocks/deprecated_blocks/filter_by_value.py +103 -0
  10. sdg_hub/core/blocks/deprecated_blocks/flatten_columns.py +94 -0
  11. sdg_hub/core/blocks/deprecated_blocks/llmblock.py +479 -0
  12. sdg_hub/core/blocks/deprecated_blocks/rename_columns.py +88 -0
  13. sdg_hub/core/blocks/deprecated_blocks/sample_populator.py +58 -0
  14. sdg_hub/core/blocks/deprecated_blocks/selector.py +97 -0
  15. sdg_hub/core/blocks/deprecated_blocks/set_to_majority_value.py +88 -0
  16. sdg_hub/core/blocks/evaluation/__init__.py +9 -0
  17. sdg_hub/core/blocks/evaluation/evaluate_faithfulness_block.py +564 -0
  18. sdg_hub/core/blocks/evaluation/evaluate_relevancy_block.py +564 -0
  19. sdg_hub/core/blocks/evaluation/verify_question_block.py +564 -0
  20. sdg_hub/core/blocks/filtering/__init__.py +12 -0
  21. sdg_hub/core/blocks/filtering/column_value_filter.py +188 -0
  22. sdg_hub/core/blocks/llm/__init__.py +27 -0
  23. sdg_hub/core/blocks/llm/client_manager.py +398 -0
  24. sdg_hub/core/blocks/llm/config.py +336 -0
  25. sdg_hub/core/blocks/llm/error_handler.py +368 -0
  26. sdg_hub/core/blocks/llm/llm_chat_block.py +542 -0
  27. sdg_hub/core/blocks/llm/llm_chat_with_parsing_retry_block.py +491 -0
  28. sdg_hub/core/blocks/llm/prompt_builder_block.py +368 -0
  29. sdg_hub/core/blocks/llm/text_parser_block.py +357 -0
  30. sdg_hub/core/blocks/registry.py +331 -0
  31. sdg_hub/core/blocks/transform/__init__.py +23 -0
  32. sdg_hub/core/blocks/transform/duplicate_columns.py +88 -0
  33. sdg_hub/core/blocks/transform/index_based_mapper.py +225 -0
  34. sdg_hub/core/blocks/transform/melt_columns.py +126 -0
  35. sdg_hub/core/blocks/transform/rename_columns.py +69 -0
  36. sdg_hub/core/blocks/transform/text_concat.py +102 -0
  37. sdg_hub/core/blocks/transform/uniform_col_val_setter.py +101 -0
  38. sdg_hub/core/flow/__init__.py +20 -0
  39. sdg_hub/core/flow/base.py +1209 -0
  40. sdg_hub/core/flow/checkpointer.py +333 -0
  41. sdg_hub/core/flow/metadata.py +389 -0
  42. sdg_hub/core/flow/migration.py +198 -0
  43. sdg_hub/core/flow/registry.py +393 -0
  44. sdg_hub/core/flow/validation.py +277 -0
  45. sdg_hub/{utils → core/utils}/__init__.py +7 -4
  46. sdg_hub/core/utils/datautils.py +63 -0
  47. sdg_hub/core/utils/error_handling.py +208 -0
  48. sdg_hub/core/utils/flow_id_words.yaml +231 -0
  49. sdg_hub/core/utils/flow_identifier.py +94 -0
  50. sdg_hub/{utils → core/utils}/path_resolution.py +2 -2
  51. sdg_hub/core/utils/yaml_utils.py +59 -0
  52. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/atomic_facts.yaml +40 -0
  53. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/detailed_summary.yaml +13 -0
  54. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_faithfulness.yaml +64 -0
  55. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_question.yaml +29 -0
  56. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_relevancy.yaml +81 -0
  57. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/extractive_summary.yaml +13 -0
  58. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +192 -0
  59. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/generate_questions_responses.yaml +54 -0
  60. sdg_hub-0.2.1.dist-info/METADATA +221 -0
  61. sdg_hub-0.2.1.dist-info/RECORD +68 -0
  62. sdg_hub/blocks/__init__.py +0 -42
  63. sdg_hub/blocks/block.py +0 -96
  64. sdg_hub/blocks/llmblock.py +0 -375
  65. sdg_hub/blocks/openaichatblock.py +0 -556
  66. sdg_hub/blocks/utilblocks.py +0 -597
  67. sdg_hub/checkpointer.py +0 -139
  68. sdg_hub/configs/annotations/cot_reflection.yaml +0 -34
  69. sdg_hub/configs/annotations/detailed_annotations.yaml +0 -28
  70. sdg_hub/configs/annotations/detailed_description.yaml +0 -10
  71. sdg_hub/configs/annotations/detailed_description_icl.yaml +0 -32
  72. sdg_hub/configs/annotations/simple_annotations.yaml +0 -9
  73. sdg_hub/configs/knowledge/__init__.py +0 -0
  74. sdg_hub/configs/knowledge/atomic_facts.yaml +0 -46
  75. sdg_hub/configs/knowledge/auxilary_instructions.yaml +0 -35
  76. sdg_hub/configs/knowledge/detailed_summary.yaml +0 -18
  77. sdg_hub/configs/knowledge/evaluate_faithfulness.yaml +0 -68
  78. sdg_hub/configs/knowledge/evaluate_question.yaml +0 -38
  79. sdg_hub/configs/knowledge/evaluate_relevancy.yaml +0 -84
  80. sdg_hub/configs/knowledge/extractive_summary.yaml +0 -18
  81. sdg_hub/configs/knowledge/generate_code_questions_responses.yaml +0 -39
  82. sdg_hub/configs/knowledge/generate_questions.yaml +0 -82
  83. sdg_hub/configs/knowledge/generate_questions_responses.yaml +0 -56
  84. sdg_hub/configs/knowledge/generate_responses.yaml +0 -86
  85. sdg_hub/configs/knowledge/mcq_generation.yaml +0 -83
  86. sdg_hub/configs/knowledge/router.yaml +0 -12
  87. sdg_hub/configs/knowledge/simple_generate_qa.yaml +0 -34
  88. sdg_hub/configs/reasoning/__init__.py +0 -0
  89. sdg_hub/configs/reasoning/dynamic_cot.yaml +0 -40
  90. sdg_hub/configs/skills/__init__.py +0 -0
  91. sdg_hub/configs/skills/analyzer.yaml +0 -48
  92. sdg_hub/configs/skills/annotation.yaml +0 -36
  93. sdg_hub/configs/skills/contexts.yaml +0 -28
  94. sdg_hub/configs/skills/critic.yaml +0 -60
  95. sdg_hub/configs/skills/evaluate_freeform_pair.yaml +0 -111
  96. sdg_hub/configs/skills/evaluate_freeform_questions.yaml +0 -78
  97. sdg_hub/configs/skills/evaluate_grounded_pair.yaml +0 -119
  98. sdg_hub/configs/skills/evaluate_grounded_questions.yaml +0 -51
  99. sdg_hub/configs/skills/freeform_questions.yaml +0 -34
  100. sdg_hub/configs/skills/freeform_responses.yaml +0 -39
  101. sdg_hub/configs/skills/grounded_questions.yaml +0 -38
  102. sdg_hub/configs/skills/grounded_responses.yaml +0 -59
  103. sdg_hub/configs/skills/icl_examples/STEM.yaml +0 -56
  104. sdg_hub/configs/skills/icl_examples/__init__.py +0 -0
  105. sdg_hub/configs/skills/icl_examples/coding.yaml +0 -97
  106. sdg_hub/configs/skills/icl_examples/extraction.yaml +0 -36
  107. sdg_hub/configs/skills/icl_examples/humanities.yaml +0 -71
  108. sdg_hub/configs/skills/icl_examples/math.yaml +0 -85
  109. sdg_hub/configs/skills/icl_examples/reasoning.yaml +0 -30
  110. sdg_hub/configs/skills/icl_examples/roleplay.yaml +0 -45
  111. sdg_hub/configs/skills/icl_examples/writing.yaml +0 -80
  112. sdg_hub/configs/skills/judge.yaml +0 -53
  113. sdg_hub/configs/skills/planner.yaml +0 -67
  114. sdg_hub/configs/skills/respond.yaml +0 -8
  115. sdg_hub/configs/skills/revised_responder.yaml +0 -78
  116. sdg_hub/configs/skills/router.yaml +0 -59
  117. sdg_hub/configs/skills/simple_generate_qa_freeform.yaml +0 -27
  118. sdg_hub/configs/skills/simple_generate_qa_grounded.yaml +0 -31
  119. sdg_hub/flow.py +0 -477
  120. sdg_hub/flow_runner.py +0 -450
  121. sdg_hub/flows/generation/knowledge/mmlu_bench.yaml +0 -13
  122. sdg_hub/flows/generation/knowledge/simple_knowledge.yaml +0 -12
  123. sdg_hub/flows/generation/knowledge/synth_knowledge.yaml +0 -89
  124. sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +0 -136
  125. sdg_hub/flows/generation/skills/improve_responses.yaml +0 -103
  126. sdg_hub/flows/generation/skills/simple_freeform_skill.yaml +0 -12
  127. sdg_hub/flows/generation/skills/simple_grounded_skill.yaml +0 -12
  128. sdg_hub/flows/generation/skills/synth_grounded_skills.yaml +0 -80
  129. sdg_hub/flows/generation/skills/synth_skills.yaml +0 -59
  130. sdg_hub/pipeline.py +0 -121
  131. sdg_hub/prompts.py +0 -80
  132. sdg_hub/registry.py +0 -122
  133. sdg_hub/sdg.py +0 -206
  134. sdg_hub/utils/config_validation.py +0 -91
  135. sdg_hub/utils/datautils.py +0 -14
  136. sdg_hub/utils/error_handling.py +0 -94
  137. sdg_hub/utils/validation_result.py +0 -10
  138. sdg_hub-0.1.4.dist-info/METADATA +0 -190
  139. sdg_hub-0.1.4.dist-info/RECORD +0 -89
  140. sdg_hub/{logger_config.py → core/utils/logger_config.py} +1 -1
  141. /sdg_hub/{configs/__init__.py → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/README.md} +0 -0
  142. /sdg_hub/{configs/annotations → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab}/__init__.py +0 -0
  143. {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.1.dist-info}/WHEEL +0 -0
  144. {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.1.dist-info}/licenses/LICENSE +0 -0
  145. {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,564 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ """Composite block for faithfulness evaluation of question-answer pairs.
3
+
4
+ This module provides the EvaluateFaithfulnessBlock that encapsulates the complete
5
+ faithfulness evaluation workflow, combining prompt building, LLM chat, text parsing,
6
+ and filtering into a single block for simplified configuration.
7
+ """
8
+
9
+ # Standard
10
+ from typing import Any, Optional, Union
11
+
12
+ # Third Party
13
+ from datasets import Dataset
14
+ from pydantic import ConfigDict, Field, field_validator
15
+
16
+ # Local
17
+ from ...utils.logger_config import setup_logger
18
+ from ..base import BaseBlock
19
+ from ..filtering.column_value_filter import ColumnValueFilterBlock
20
+ from ..llm.llm_chat_block import LLMChatBlock
21
+ from ..llm.prompt_builder_block import PromptBuilderBlock
22
+ from ..llm.text_parser_block import TextParserBlock
23
+ from ..registry import BlockRegistry
24
+
25
+ logger = setup_logger(__name__)
26
+
27
+
28
@BlockRegistry.register(
    "EvaluateFaithfulnessBlock",
    "evaluation",
    "Composite block for faithfulness evaluation of question-answer pairs",
)
class EvaluateFaithfulnessBlock(BaseBlock):
    """Composite block for faithfulness evaluation workflow.

    This block combines four separate blocks into a single cohesive evaluation block:
    1. PromptBuilderBlock - builds evaluation prompt from document and response
    2. LLMChatBlock - generates faithfulness evaluation using LLM
    3. TextParserBlock - parses explanation and judgment from raw output
    4. ColumnValueFilterBlock - filters based on faithfulness judgment

    Unknown keyword arguments are rejected (``extra="forbid"``); use
    ``llm_kwargs`` to forward provider-specific options to the LLM block.

    Parameters
    ----------
    block_name : str
        Name of the block.
    input_cols : List[str]
        Input columns: ["document", "response"]
    output_cols : List[str]
        Output columns: ["faithfulness_explanation", "faithfulness_judgment"]
    prompt_config_path : str
        Path to YAML file containing the faithfulness evaluation prompt template.
    model : Optional[str], optional
        Model identifier in LiteLLM format (e.g., "hosted_vllm/meta-llama/Llama-3.3-70B-Instruct").
        May be left unset at construction time (default: None), but must be set
        before ``generate()`` is called.
    api_base : Optional[str]
        Base URL for the API. Required for local models.
    api_key : Optional[str]
        API key for the provider. Falls back to environment variables.
    filter_value : str, optional
        Value to filter on for faithfulness judgment (default: "YES")
    operation : str, optional
        Filter operation (default: "eq")
    convert_dtype : Optional[str], optional
        Data type conversion for filter column (default: None)
    async_mode : bool, optional
        Whether to use async processing (default: True)
    format_as_messages : bool, optional
        Whether to format prompt as messages (default: True)
    start_tags : List[str], optional
        Start tags for parsing (default: ["[Start of Explanation]", "[Start of Answer]"])
    end_tags : List[str], optional
        End tags for parsing (default: ["[End of Explanation]", "[End of Answer]"])
    parsing_pattern : Optional[str], optional
        Regex pattern for custom parsing. If provided, takes precedence over tag-based parsing.
    parser_cleanup_tags : Optional[List[str]], optional
        List of tags to clean from parsed output.

    ### LLM Generation Parameters ###
    temperature : Optional[float], optional
        Sampling temperature (0.0 to 2.0).
    max_tokens : Optional[int], optional
        Maximum tokens to generate.
    top_p : Optional[float], optional
        Nucleus sampling parameter (0.0 to 1.0).
    frequency_penalty : Optional[float], optional
        Frequency penalty (-2.0 to 2.0).
    presence_penalty : Optional[float], optional
        Presence penalty (-2.0 to 2.0).
    stop : Optional[Union[str, List[str]]], optional
        Stop sequences.
    seed : Optional[int], optional
        Random seed for reproducible outputs.
    response_format : Optional[Dict[str, Any]], optional
        Response format specification (e.g., JSON mode).
    stream : Optional[bool], optional
        Whether to stream responses.
    n : Optional[int], optional
        Number of completions to generate. When n > 1, the output column will contain
        a list of responses for each input sample.
    logprobs : Optional[bool], optional
        Whether to return log probabilities.
    top_logprobs : Optional[int], optional
        Number of top log probabilities to return.
    user : Optional[str], optional
        End-user identifier.
    extra_headers : Optional[Dict[str, str]], optional
        Additional headers to send with requests.
    extra_body : Optional[Dict[str, Any]], optional
        Additional parameters for the request body.
    timeout : float, optional
        Request timeout in seconds (default: 120.0).
    max_retries : int, optional
        Maximum number of retry attempts (default: 6).
    llm_kwargs : Dict[str, Any], optional
        Additional provider-specific parameters forwarded to the internal
        LLMChatBlock. They are applied last, so they override same-named
        values derived from the fields above (default: empty dict).
    """

    model_config = ConfigDict(extra="forbid")

    # Core configuration
    prompt_config_path: str = Field(
        ...,
        description="Path to YAML file containing the faithfulness evaluation prompt template",
    )
    model: Optional[str] = Field(None, description="Model identifier in LiteLLM format")
    api_base: Optional[str] = Field(None, description="Base URL for the API")
    api_key: Optional[str] = Field(
        None,
        description="API key for the provider. Falls back to environment variables.",
    )

    # Filter configuration
    filter_value: str = Field(
        "YES", description="Value to filter on for faithfulness judgment"
    )
    operation: str = Field("eq", description="Filter operation")
    convert_dtype: Optional[str] = Field(
        None, description="Data type conversion for filter column"
    )

    # Processing configuration
    async_mode: bool = Field(True, description="Whether to use async processing")
    format_as_messages: bool = Field(
        True, description="Whether to format prompt as messages"
    )

    # Parser configuration
    start_tags: list[str] = Field(
        ["[Start of Explanation]", "[Start of Answer]"],
        description="Start tags for parsing explanation and judgment",
    )
    end_tags: list[str] = Field(
        ["[End of Explanation]", "[End of Answer]"],
        description="End tags for parsing explanation and judgment",
    )
    parsing_pattern: Optional[str] = Field(
        None,
        description="Regex pattern for custom parsing. If provided, takes precedence over tag-based parsing",
    )
    parser_cleanup_tags: Optional[list[str]] = Field(
        None, description="List of tags to clean from parsed output"
    )

    # LLM generation parameters
    temperature: Optional[float] = Field(
        None, description="Sampling temperature (0.0 to 2.0)"
    )
    max_tokens: Optional[int] = Field(None, description="Maximum tokens to generate")
    top_p: Optional[float] = Field(
        None, description="Nucleus sampling parameter (0.0 to 1.0)"
    )
    frequency_penalty: Optional[float] = Field(
        None, description="Frequency penalty (-2.0 to 2.0)"
    )
    presence_penalty: Optional[float] = Field(
        None, description="Presence penalty (-2.0 to 2.0)"
    )
    stop: Optional[Union[str, list[str]]] = Field(None, description="Stop sequences")
    seed: Optional[int] = Field(
        None, description="Random seed for reproducible outputs"
    )
    response_format: Optional[dict[str, Any]] = Field(
        None, description="Response format specification (e.g., JSON mode)"
    )
    stream: Optional[bool] = Field(None, description="Whether to stream responses")
    n: Optional[int] = Field(
        None,
        description="Number of completions to generate. When n > 1, the output column will contain a list of responses for each input sample",
    )
    logprobs: Optional[bool] = Field(
        None, description="Whether to return log probabilities"
    )
    top_logprobs: Optional[int] = Field(
        None, description="Number of top log probabilities to return"
    )
    user: Optional[str] = Field(None, description="End-user identifier")
    extra_headers: Optional[dict[str, str]] = Field(
        None, description="Additional headers to send with requests"
    )
    extra_body: Optional[dict[str, Any]] = Field(
        None, description="Additional parameters for the request body"
    )
    timeout: float = Field(120.0, description="Request timeout in seconds")
    max_retries: int = Field(6, description="Maximum number of retry attempts")

    # Additional provider-specific parameters
    llm_kwargs: dict[str, Any] = Field(
        default_factory=dict, description="Additional provider-specific parameters"
    )

    # Internal blocks - excluded from serialization
    prompt_builder: Optional[PromptBuilderBlock] = Field(None, exclude=True)
    llm_chat: Optional[LLMChatBlock] = Field(None, exclude=True)
    text_parser: Optional[TextParserBlock] = Field(None, exclude=True)
    filter_block: Optional[ColumnValueFilterBlock] = Field(None, exclude=True)

    @field_validator("input_cols")
    @classmethod
    def validate_input_cols(cls, v):
        """Validate that input columns are exactly ["document", "response"]."""
        expected = ["document", "response"]
        if v != expected:
            raise ValueError(
                f"EvaluateFaithfulnessBlock expects input_cols={expected}, got {v}"
            )
        return v

    @field_validator("output_cols")
    @classmethod
    def validate_output_cols(cls, v):
        """Validate that output columns are exactly ["faithfulness_explanation", "faithfulness_judgment"]."""
        expected = [
            "faithfulness_explanation",
            "faithfulness_judgment",
        ]
        if v != expected:
            raise ValueError(
                f"EvaluateFaithfulnessBlock expects output_cols={expected}, got {v}"
            )
        return v

    def model_post_init(self, __context: Any) -> None:
        """Initialize the internal blocks after Pydantic validation."""
        super().model_post_init(__context)

        # Create internal blocks
        self._create_internal_blocks()

        # Log initialization only when model is configured
        if self.model:
            logger.info(
                f"Initialized EvaluateFaithfulnessBlock '{self.block_name}' with model '{self.model}'",
                extra={
                    "block_name": self.block_name,
                    "model": self.model,
                    "async_mode": self.async_mode,
                    "filter_value": self.filter_value,
                },
            )

    def _create_internal_blocks(self) -> None:
        """Create and configure the four internal blocks from this block's fields."""
        # 1. PromptBuilderBlock
        self.prompt_builder = PromptBuilderBlock(
            block_name=f"{self.block_name}_prompt_builder",
            input_cols=["document", "response"],
            output_cols=["eval_faithfulness_prompt"],
            prompt_config_path=self.prompt_config_path,
            format_as_messages=self.format_as_messages,
        )

        # 2. LLMChatBlock
        llm_kwargs: dict[str, Any] = {
            "block_name": f"{self.block_name}_llm_chat",
            "input_cols": ["eval_faithfulness_prompt"],
            "output_cols": ["raw_eval_faithfulness"],
            "model": self.model,
            "api_base": self.api_base,
            "api_key": self.api_key,
            "async_mode": self.async_mode,
            "timeout": self.timeout,
            "max_retries": self.max_retries,
        }

        # Forward only the generation parameters that were explicitly set, so
        # that unset ones are not passed to LLMChatBlock as None.
        optional_generation_params = (
            "temperature",
            "max_tokens",
            "top_p",
            "frequency_penalty",
            "presence_penalty",
            "stop",
            "seed",
            "response_format",
            "stream",
            "n",
            "logprobs",
            "top_logprobs",
            "user",
            "extra_headers",
            "extra_body",
        )
        for param_name in optional_generation_params:
            param_value = getattr(self, param_name)
            if param_value is not None:
                llm_kwargs[param_name] = param_value

        # Provider-specific extras go last so they take precedence
        llm_kwargs.update(self.llm_kwargs)

        self.llm_chat = LLMChatBlock(**llm_kwargs)

        # 3. TextParserBlock
        text_parser_kwargs: dict[str, Any] = {
            "block_name": f"{self.block_name}_text_parser",
            "input_cols": ["raw_eval_faithfulness"],
            "output_cols": ["faithfulness_explanation", "faithfulness_judgment"],
            "start_tags": self.start_tags,
            "end_tags": self.end_tags,
        }

        # Add optional TextParserBlock parameters if specified
        if self.parsing_pattern is not None:
            text_parser_kwargs["parsing_pattern"] = self.parsing_pattern
        if self.parser_cleanup_tags is not None:
            text_parser_kwargs["parser_cleanup_tags"] = self.parser_cleanup_tags

        self.text_parser = TextParserBlock(**text_parser_kwargs)

        # 4. ColumnValueFilterBlock
        filter_kwargs: dict[str, Any] = {
            "block_name": f"{self.block_name}_filter",
            "input_cols": ["faithfulness_judgment"],
            "output_cols": [],  # Filter blocks don't create new columns
            "filter_value": self.filter_value,
            "operation": self.operation,
        }

        if self.convert_dtype is not None:
            filter_kwargs["convert_dtype"] = self.convert_dtype

        self.filter_block = ColumnValueFilterBlock(**filter_kwargs)

    def _reinitialize_client_manager(self) -> None:
        """Reinitialize the internal LLM chat block's client manager.

        This should be called after model configuration changes to ensure
        the internal LLM chat block uses the updated model configuration.
        Only ``model``, ``api_base`` and ``api_key`` are copied across.
        """
        if self.llm_chat and hasattr(self.llm_chat, "_reinitialize_client_manager"):
            # Update the internal LLM chat block's model config
            self.llm_chat.model = self.model
            self.llm_chat.api_base = self.api_base
            self.llm_chat.api_key = self.api_key
            # Reinitialize its client manager
            self.llm_chat._reinitialize_client_manager()

    def generate(self, samples: Dataset, **kwargs: Any) -> Dataset:
        """Generate faithfulness evaluation for all samples.

        This method chains the four internal blocks in sequence:
        1. Build faithfulness evaluation prompts
        2. Generate LLM responses
        3. Parse explanation and judgment
        4. Filter based on judgment

        Parameters
        ----------
        samples : Dataset
            Input dataset containing 'document' and 'response' columns.
        **kwargs : Any
            Additional keyword arguments passed to internal blocks.

        Returns
        -------
        Dataset
            Dataset with faithfulness evaluation results and filtering applied.

        Raises
        ------
        BlockValidationError
            If model is not configured before calling generate().
        """
        # Validate that model is configured
        if not self.model:
            # Local
            from ...utils.error_handling import BlockValidationError

            raise BlockValidationError(
                f"Model not configured for block '{self.block_name}'. "
                f"Call flow.set_model_config() before generating."
            )
        logger.info(
            f"Starting faithfulness evaluation for {len(samples)} samples",
            extra={
                "block_name": self.block_name,
                "model": self.model,
                "batch_size": len(samples),
            },
        )

        current_dataset = samples

        try:
            # Step 1: Build prompts
            logger.debug("Step 1: Building faithfulness evaluation prompts")
            current_dataset = self.prompt_builder.generate(current_dataset, **kwargs)

            # Step 2: Generate LLM responses
            logger.debug("Step 2: Generating LLM responses")
            current_dataset = self.llm_chat.generate(current_dataset, **kwargs)

            # Step 3: Parse responses
            logger.debug("Step 3: Parsing faithfulness evaluation responses")
            current_dataset = self.text_parser.generate(current_dataset, **kwargs)

            # Step 4: Filter based on judgment
            logger.debug("Step 4: Filtering based on faithfulness judgment")
            original_count = len(current_dataset)
            current_dataset = self.filter_block.generate(current_dataset, **kwargs)
            filtered_count = len(current_dataset)

            logger.info(
                f"Faithfulness evaluation completed: {original_count} → {filtered_count} samples "
                f"(filtered {original_count - filtered_count} samples)",
                extra={
                    "block_name": self.block_name,
                    "original_count": original_count,
                    "filtered_count": filtered_count,
                    # Guard against division by zero when nothing reached the filter
                    "filter_rate": (original_count - filtered_count) / original_count
                    if original_count > 0
                    else 0,
                },
            )

            return current_dataset

        except Exception as e:
            # Log with block context, then let the original exception propagate
            logger.error(
                f"Error during faithfulness evaluation: {e}",
                extra={
                    "block_name": self.block_name,
                    "model": self.model,
                    "error": str(e),
                },
            )
            raise

    def _with_placeholder_columns(
        self, dataset: Dataset, placeholders: dict[str, Any]
    ) -> Dataset:
        """Return a copy of ``dataset`` with every row extended by ``placeholders``.

        Used during validation to fake the output of an upstream internal
        block so the next block in the chain can be validated without
        running anything.
        """
        return Dataset.from_list([{**sample, **placeholders} for sample in dataset])

    def _validate_custom(self, dataset: Dataset) -> None:
        """Custom validation for faithfulness evaluation.

        This method validates the entire chain of internal blocks by simulating
        the data flow through each block to ensure they can all process the data correctly.

        Raises
        ------
        ValueError
            If a required input column is missing, if any internal block is not
            initialized, or if any internal block's own validation fails.
        """
        # Validate that required columns exist
        required_columns = ["document", "response"]
        missing_columns = [
            col for col in required_columns if col not in dataset.column_names
        ]
        if missing_columns:
            raise ValueError(
                f"EvaluateFaithfulnessBlock requires columns {required_columns}, "
                f"missing: {missing_columns}"
            )

        # Validate the entire chain of internal blocks
        if not all(
            [self.prompt_builder, self.llm_chat, self.text_parser, self.filter_block]
        ):
            raise ValueError(
                "All internal blocks must be initialized before validation"
            )

        # Simulate data flow through the chain to validate each block
        current_dataset = dataset

        try:
            # 1. Validate PromptBuilderBlock
            logger.debug("Validating prompt builder block")
            self.prompt_builder._validate_custom(current_dataset)

            # Simulate prompt builder output for next validation
            if "eval_faithfulness_prompt" not in current_dataset.column_names:
                current_dataset = self._with_placeholder_columns(
                    current_dataset,
                    {"eval_faithfulness_prompt": [{"role": "user", "content": "test"}]},
                )

            # 2. Validate LLMChatBlock
            logger.debug("Validating LLM chat block")
            self.llm_chat._validate_custom(current_dataset)

            # Simulate LLM chat output for next validation
            if "raw_eval_faithfulness" not in current_dataset.column_names:
                current_dataset = self._with_placeholder_columns(
                    current_dataset,
                    {
                        "raw_eval_faithfulness": (
                            "[Start of Explanation]Test explanation[End of Explanation]\n[Start of Answer]YES[End of Answer]"
                        )
                    },
                )

            # 3. Validate TextParserBlock
            logger.debug("Validating text parser block")
            self.text_parser._validate_custom(current_dataset)

            # Simulate text parser output for final validation
            if "faithfulness_judgment" not in current_dataset.column_names:
                current_dataset = self._with_placeholder_columns(
                    current_dataset,
                    {
                        "faithfulness_explanation": "Test explanation",
                        "faithfulness_judgment": "YES",
                    },
                )

            # 4. Validate ColumnValueFilterBlock
            logger.debug("Validating filter block")
            self.filter_block._validate_custom(current_dataset)

            logger.debug("All internal blocks validated successfully")

        except Exception as e:
            logger.error(f"Validation failed in internal blocks: {e}")
            raise ValueError(f"Internal block validation failed: {e}") from e

    def get_internal_blocks_info(self) -> dict[str, Any]:
        """Get information about the internal blocks.

        Returns
        -------
        Dict[str, Any]
            Information about each internal block, keyed by "prompt_builder",
            "llm_chat", "text_parser" and "filter". A value is None when the
            corresponding internal block is not set.
        """
        return {
            "prompt_builder": self.prompt_builder.get_info()
            if self.prompt_builder
            else None,
            "llm_chat": self.llm_chat.get_info() if self.llm_chat else None,
            "text_parser": self.text_parser.get_info() if self.text_parser else None,
            "filter": self.filter_block.get_info() if self.filter_block else None,
        }

    def __repr__(self) -> str:
        """String representation of the block."""
        return (
            f"EvaluateFaithfulnessBlock(name='{self.block_name}', "
            f"model='{self.model}', filter_value='{self.filter_value}')"
        )