sdg-hub 0.1.4__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sdg_hub/__init__.py +28 -1
- sdg_hub/_version.py +2 -2
- sdg_hub/core/__init__.py +22 -0
- sdg_hub/core/blocks/__init__.py +58 -0
- sdg_hub/core/blocks/base.py +313 -0
- sdg_hub/core/blocks/deprecated_blocks/__init__.py +29 -0
- sdg_hub/core/blocks/deprecated_blocks/combine_columns.py +93 -0
- sdg_hub/core/blocks/deprecated_blocks/duplicate_columns.py +88 -0
- sdg_hub/core/blocks/deprecated_blocks/filter_by_value.py +103 -0
- sdg_hub/core/blocks/deprecated_blocks/flatten_columns.py +94 -0
- sdg_hub/core/blocks/deprecated_blocks/llmblock.py +479 -0
- sdg_hub/core/blocks/deprecated_blocks/rename_columns.py +88 -0
- sdg_hub/core/blocks/deprecated_blocks/sample_populator.py +58 -0
- sdg_hub/core/blocks/deprecated_blocks/selector.py +97 -0
- sdg_hub/core/blocks/deprecated_blocks/set_to_majority_value.py +88 -0
- sdg_hub/core/blocks/evaluation/__init__.py +9 -0
- sdg_hub/core/blocks/evaluation/evaluate_faithfulness_block.py +564 -0
- sdg_hub/core/blocks/evaluation/evaluate_relevancy_block.py +564 -0
- sdg_hub/core/blocks/evaluation/verify_question_block.py +564 -0
- sdg_hub/core/blocks/filtering/__init__.py +12 -0
- sdg_hub/core/blocks/filtering/column_value_filter.py +188 -0
- sdg_hub/core/blocks/llm/__init__.py +27 -0
- sdg_hub/core/blocks/llm/client_manager.py +398 -0
- sdg_hub/core/blocks/llm/config.py +336 -0
- sdg_hub/core/blocks/llm/error_handler.py +368 -0
- sdg_hub/core/blocks/llm/llm_chat_block.py +542 -0
- sdg_hub/core/blocks/llm/llm_chat_with_parsing_retry_block.py +491 -0
- sdg_hub/core/blocks/llm/prompt_builder_block.py +368 -0
- sdg_hub/core/blocks/llm/text_parser_block.py +357 -0
- sdg_hub/core/blocks/registry.py +331 -0
- sdg_hub/core/blocks/transform/__init__.py +23 -0
- sdg_hub/core/blocks/transform/duplicate_columns.py +88 -0
- sdg_hub/core/blocks/transform/index_based_mapper.py +225 -0
- sdg_hub/core/blocks/transform/melt_columns.py +126 -0
- sdg_hub/core/blocks/transform/rename_columns.py +69 -0
- sdg_hub/core/blocks/transform/text_concat.py +102 -0
- sdg_hub/core/blocks/transform/uniform_col_val_setter.py +101 -0
- sdg_hub/core/flow/__init__.py +20 -0
- sdg_hub/core/flow/base.py +1209 -0
- sdg_hub/core/flow/checkpointer.py +333 -0
- sdg_hub/core/flow/metadata.py +389 -0
- sdg_hub/core/flow/migration.py +198 -0
- sdg_hub/core/flow/registry.py +393 -0
- sdg_hub/core/flow/validation.py +277 -0
- sdg_hub/{utils → core/utils}/__init__.py +7 -4
- sdg_hub/core/utils/datautils.py +63 -0
- sdg_hub/core/utils/error_handling.py +208 -0
- sdg_hub/core/utils/flow_id_words.yaml +231 -0
- sdg_hub/core/utils/flow_identifier.py +94 -0
- sdg_hub/{utils → core/utils}/path_resolution.py +2 -2
- sdg_hub/core/utils/yaml_utils.py +59 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/atomic_facts.yaml +40 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/detailed_summary.yaml +13 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_faithfulness.yaml +64 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_question.yaml +29 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_relevancy.yaml +81 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/extractive_summary.yaml +13 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +192 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/generate_questions_responses.yaml +54 -0
- sdg_hub-0.2.1.dist-info/METADATA +221 -0
- sdg_hub-0.2.1.dist-info/RECORD +68 -0
- sdg_hub/blocks/__init__.py +0 -42
- sdg_hub/blocks/block.py +0 -96
- sdg_hub/blocks/llmblock.py +0 -375
- sdg_hub/blocks/openaichatblock.py +0 -556
- sdg_hub/blocks/utilblocks.py +0 -597
- sdg_hub/checkpointer.py +0 -139
- sdg_hub/configs/annotations/cot_reflection.yaml +0 -34
- sdg_hub/configs/annotations/detailed_annotations.yaml +0 -28
- sdg_hub/configs/annotations/detailed_description.yaml +0 -10
- sdg_hub/configs/annotations/detailed_description_icl.yaml +0 -32
- sdg_hub/configs/annotations/simple_annotations.yaml +0 -9
- sdg_hub/configs/knowledge/__init__.py +0 -0
- sdg_hub/configs/knowledge/atomic_facts.yaml +0 -46
- sdg_hub/configs/knowledge/auxilary_instructions.yaml +0 -35
- sdg_hub/configs/knowledge/detailed_summary.yaml +0 -18
- sdg_hub/configs/knowledge/evaluate_faithfulness.yaml +0 -68
- sdg_hub/configs/knowledge/evaluate_question.yaml +0 -38
- sdg_hub/configs/knowledge/evaluate_relevancy.yaml +0 -84
- sdg_hub/configs/knowledge/extractive_summary.yaml +0 -18
- sdg_hub/configs/knowledge/generate_code_questions_responses.yaml +0 -39
- sdg_hub/configs/knowledge/generate_questions.yaml +0 -82
- sdg_hub/configs/knowledge/generate_questions_responses.yaml +0 -56
- sdg_hub/configs/knowledge/generate_responses.yaml +0 -86
- sdg_hub/configs/knowledge/mcq_generation.yaml +0 -83
- sdg_hub/configs/knowledge/router.yaml +0 -12
- sdg_hub/configs/knowledge/simple_generate_qa.yaml +0 -34
- sdg_hub/configs/reasoning/__init__.py +0 -0
- sdg_hub/configs/reasoning/dynamic_cot.yaml +0 -40
- sdg_hub/configs/skills/__init__.py +0 -0
- sdg_hub/configs/skills/analyzer.yaml +0 -48
- sdg_hub/configs/skills/annotation.yaml +0 -36
- sdg_hub/configs/skills/contexts.yaml +0 -28
- sdg_hub/configs/skills/critic.yaml +0 -60
- sdg_hub/configs/skills/evaluate_freeform_pair.yaml +0 -111
- sdg_hub/configs/skills/evaluate_freeform_questions.yaml +0 -78
- sdg_hub/configs/skills/evaluate_grounded_pair.yaml +0 -119
- sdg_hub/configs/skills/evaluate_grounded_questions.yaml +0 -51
- sdg_hub/configs/skills/freeform_questions.yaml +0 -34
- sdg_hub/configs/skills/freeform_responses.yaml +0 -39
- sdg_hub/configs/skills/grounded_questions.yaml +0 -38
- sdg_hub/configs/skills/grounded_responses.yaml +0 -59
- sdg_hub/configs/skills/icl_examples/STEM.yaml +0 -56
- sdg_hub/configs/skills/icl_examples/__init__.py +0 -0
- sdg_hub/configs/skills/icl_examples/coding.yaml +0 -97
- sdg_hub/configs/skills/icl_examples/extraction.yaml +0 -36
- sdg_hub/configs/skills/icl_examples/humanities.yaml +0 -71
- sdg_hub/configs/skills/icl_examples/math.yaml +0 -85
- sdg_hub/configs/skills/icl_examples/reasoning.yaml +0 -30
- sdg_hub/configs/skills/icl_examples/roleplay.yaml +0 -45
- sdg_hub/configs/skills/icl_examples/writing.yaml +0 -80
- sdg_hub/configs/skills/judge.yaml +0 -53
- sdg_hub/configs/skills/planner.yaml +0 -67
- sdg_hub/configs/skills/respond.yaml +0 -8
- sdg_hub/configs/skills/revised_responder.yaml +0 -78
- sdg_hub/configs/skills/router.yaml +0 -59
- sdg_hub/configs/skills/simple_generate_qa_freeform.yaml +0 -27
- sdg_hub/configs/skills/simple_generate_qa_grounded.yaml +0 -31
- sdg_hub/flow.py +0 -477
- sdg_hub/flow_runner.py +0 -450
- sdg_hub/flows/generation/knowledge/mmlu_bench.yaml +0 -13
- sdg_hub/flows/generation/knowledge/simple_knowledge.yaml +0 -12
- sdg_hub/flows/generation/knowledge/synth_knowledge.yaml +0 -89
- sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +0 -136
- sdg_hub/flows/generation/skills/improve_responses.yaml +0 -103
- sdg_hub/flows/generation/skills/simple_freeform_skill.yaml +0 -12
- sdg_hub/flows/generation/skills/simple_grounded_skill.yaml +0 -12
- sdg_hub/flows/generation/skills/synth_grounded_skills.yaml +0 -80
- sdg_hub/flows/generation/skills/synth_skills.yaml +0 -59
- sdg_hub/pipeline.py +0 -121
- sdg_hub/prompts.py +0 -80
- sdg_hub/registry.py +0 -122
- sdg_hub/sdg.py +0 -206
- sdg_hub/utils/config_validation.py +0 -91
- sdg_hub/utils/datautils.py +0 -14
- sdg_hub/utils/error_handling.py +0 -94
- sdg_hub/utils/validation_result.py +0 -10
- sdg_hub-0.1.4.dist-info/METADATA +0 -190
- sdg_hub-0.1.4.dist-info/RECORD +0 -89
- sdg_hub/{logger_config.py → core/utils/logger_config.py} +1 -1
- /sdg_hub/{configs/__init__.py → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/README.md} +0 -0
- /sdg_hub/{configs/annotations → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab}/__init__.py +0 -0
- {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.1.dist-info}/WHEEL +0 -0
- {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.1.dist-info}/licenses/LICENSE +0 -0
- {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,564 @@
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
2
|
+
"""Composite block for faithfulness evaluation of question-answer pairs.
|
3
|
+
|
4
|
+
This module provides the EvaluateFaithfulnessBlock that encapsulates the complete
|
5
|
+
faithfulness evaluation workflow, combining prompt building, LLM chat, text parsing,
|
6
|
+
and filtering into a single block for simplified configuration.
|
7
|
+
"""
|
8
|
+
|
9
|
+
# Standard
|
10
|
+
from typing import Any, Optional, Union
|
11
|
+
|
12
|
+
# Third Party
|
13
|
+
from datasets import Dataset
|
14
|
+
from pydantic import ConfigDict, Field, field_validator
|
15
|
+
|
16
|
+
# Local
|
17
|
+
from ...utils.logger_config import setup_logger
|
18
|
+
from ..base import BaseBlock
|
19
|
+
from ..filtering.column_value_filter import ColumnValueFilterBlock
|
20
|
+
from ..llm.llm_chat_block import LLMChatBlock
|
21
|
+
from ..llm.prompt_builder_block import PromptBuilderBlock
|
22
|
+
from ..llm.text_parser_block import TextParserBlock
|
23
|
+
from ..registry import BlockRegistry
|
24
|
+
|
25
|
+
logger = setup_logger(__name__)
|
26
|
+
|
27
|
+
|
28
|
+
@BlockRegistry.register(
    "EvaluateFaithfulnessBlock",
    "evaluation",
    "Composite block for faithfulness evaluation of question-answer pairs",
)
class EvaluateFaithfulnessBlock(BaseBlock):
    """Composite block for faithfulness evaluation workflow.

    This block combines four separate blocks into a single cohesive evaluation block:
    1. PromptBuilderBlock - builds evaluation prompt from document and response
    2. LLMChatBlock - generates faithfulness evaluation using LLM
    3. TextParserBlock - parses explanation and judgment from raw output
    4. ColumnValueFilterBlock - filters based on faithfulness judgment

    Unknown constructor arguments are rejected (``extra="forbid"``); pass
    provider-specific options through ``llm_kwargs`` instead.

    Parameters
    ----------
    block_name : str
        Name of the block.
    input_cols : List[str]
        Input columns: ["document", "response"]
    output_cols : List[str]
        Output columns: ["faithfulness_explanation", "faithfulness_judgment"]
    prompt_config_path : str
        Path to YAML file containing the faithfulness evaluation prompt template.
    model : Optional[str]
        Model identifier in LiteLLM format (e.g., "hosted_vllm/meta-llama/Llama-3.3-70B-Instruct").
        May be left unset at construction time, but must be set before
        ``generate()`` is called.
    api_base : Optional[str]
        Base URL for the API. Required for local models.
    api_key : Optional[str]
        API key for the provider. Falls back to environment variables.
    filter_value : str, optional
        Value to filter on for faithfulness judgment (default: "YES")
    operation : str, optional
        Filter operation (default: "eq")
    convert_dtype : Optional[str], optional
        Data type conversion for filter column (default: None)
    async_mode : bool, optional
        Whether to use async processing (default: True)
    format_as_messages : bool, optional
        Whether to format prompt as messages (default: True)
    start_tags : List[str], optional
        Start tags for parsing (default: ["[Start of Explanation]", "[Start of Answer]"])
    end_tags : List[str], optional
        End tags for parsing (default: ["[End of Explanation]", "[End of Answer]"])
    parsing_pattern : Optional[str], optional
        Regex pattern for custom parsing. If provided, takes precedence over tag-based parsing.
    parser_cleanup_tags : Optional[List[str]], optional
        List of tags to clean from parsed output.

    ### LLM Generation Parameters ###
    temperature : Optional[float], optional
        Sampling temperature (0.0 to 2.0).
    max_tokens : Optional[int], optional
        Maximum tokens to generate.
    top_p : Optional[float], optional
        Nucleus sampling parameter (0.0 to 1.0).
    frequency_penalty : Optional[float], optional
        Frequency penalty (-2.0 to 2.0).
    presence_penalty : Optional[float], optional
        Presence penalty (-2.0 to 2.0).
    stop : Optional[Union[str, List[str]]], optional
        Stop sequences.
    seed : Optional[int], optional
        Random seed for reproducible outputs.
    response_format : Optional[Dict[str, Any]], optional
        Response format specification (e.g., JSON mode).
    stream : Optional[bool], optional
        Whether to stream responses.
    n : Optional[int], optional
        Number of completions to generate. When n > 1, the output column will contain
        a list of responses for each input sample.
    logprobs : Optional[bool], optional
        Whether to return log probabilities.
    top_logprobs : Optional[int], optional
        Number of top log probabilities to return.
    user : Optional[str], optional
        End-user identifier.
    extra_headers : Optional[Dict[str, str]], optional
        Additional headers to send with requests.
    extra_body : Optional[Dict[str, Any]], optional
        Additional parameters for the request body.
    timeout : float, optional
        Request timeout in seconds (default: 120.0).
    max_retries : int, optional
        Maximum number of retry attempts (default: 6).
    llm_kwargs : Dict[str, Any], optional
        Additional provider-specific parameters. They are merged into the
        LLMChatBlock arguments last, so they override same-named values.
    """

    model_config = ConfigDict(extra="forbid")

    # Core configuration
    prompt_config_path: str = Field(
        ...,
        description="Path to YAML file containing the faithfulness evaluation prompt template",
    )
    model: Optional[str] = Field(None, description="Model identifier in LiteLLM format")
    api_base: Optional[str] = Field(None, description="Base URL for the API")
    api_key: Optional[str] = Field(
        None,
        description="API key for the provider. Falls back to environment variables.",
    )

    # Filter configuration
    filter_value: str = Field(
        "YES", description="Value to filter on for faithfulness judgment"
    )
    operation: str = Field("eq", description="Filter operation")
    convert_dtype: Optional[str] = Field(
        None, description="Data type conversion for filter column"
    )

    # Processing configuration
    async_mode: bool = Field(True, description="Whether to use async processing")
    format_as_messages: bool = Field(
        True, description="Whether to format prompt as messages"
    )

    # Parser configuration
    start_tags: list[str] = Field(
        ["[Start of Explanation]", "[Start of Answer]"],
        description="Start tags for parsing explanation and judgment",
    )
    end_tags: list[str] = Field(
        ["[End of Explanation]", "[End of Answer]"],
        description="End tags for parsing explanation and judgment",
    )
    parsing_pattern: Optional[str] = Field(
        None,
        description="Regex pattern for custom parsing. If provided, takes precedence over tag-based parsing",
    )
    parser_cleanup_tags: Optional[list[str]] = Field(
        None, description="List of tags to clean from parsed output"
    )

    # LLM generation parameters
    temperature: Optional[float] = Field(
        None, description="Sampling temperature (0.0 to 2.0)"
    )
    max_tokens: Optional[int] = Field(None, description="Maximum tokens to generate")
    top_p: Optional[float] = Field(
        None, description="Nucleus sampling parameter (0.0 to 1.0)"
    )
    frequency_penalty: Optional[float] = Field(
        None, description="Frequency penalty (-2.0 to 2.0)"
    )
    presence_penalty: Optional[float] = Field(
        None, description="Presence penalty (-2.0 to 2.0)"
    )
    stop: Optional[Union[str, list[str]]] = Field(None, description="Stop sequences")
    seed: Optional[int] = Field(
        None, description="Random seed for reproducible outputs"
    )
    response_format: Optional[dict[str, Any]] = Field(
        None, description="Response format specification (e.g., JSON mode)"
    )
    stream: Optional[bool] = Field(None, description="Whether to stream responses")
    n: Optional[int] = Field(
        None,
        description="Number of completions to generate. When n > 1, the output column will contain a list of responses for each input sample",
    )
    logprobs: Optional[bool] = Field(
        None, description="Whether to return log probabilities"
    )
    top_logprobs: Optional[int] = Field(
        None, description="Number of top log probabilities to return"
    )
    user: Optional[str] = Field(None, description="End-user identifier")
    extra_headers: Optional[dict[str, str]] = Field(
        None, description="Additional headers to send with requests"
    )
    extra_body: Optional[dict[str, Any]] = Field(
        None, description="Additional parameters for the request body"
    )
    timeout: float = Field(120.0, description="Request timeout in seconds")
    max_retries: int = Field(6, description="Maximum number of retry attempts")

    # Additional provider-specific parameters
    llm_kwargs: dict[str, Any] = Field(
        default_factory=dict, description="Additional provider-specific parameters"
    )

    # Internal blocks - excluded from serialization
    prompt_builder: Optional[PromptBuilderBlock] = Field(None, exclude=True)
    llm_chat: Optional[LLMChatBlock] = Field(None, exclude=True)
    text_parser: Optional[TextParserBlock] = Field(None, exclude=True)
    filter_block: Optional[ColumnValueFilterBlock] = Field(None, exclude=True)

    @field_validator("input_cols")
    @classmethod
    def validate_input_cols(cls, v: Any) -> Any:
        """Validate that input columns are exactly ["document", "response"].

        Raises
        ------
        ValueError
            If ``v`` differs from the expected column list.
        """
        expected = ["document", "response"]
        if v != expected:
            raise ValueError(
                f"EvaluateFaithfulnessBlock expects input_cols={expected}, got {v}"
            )
        return v

    @field_validator("output_cols")
    @classmethod
    def validate_output_cols(cls, v: Any) -> Any:
        """Validate that output columns are exactly ["faithfulness_explanation", "faithfulness_judgment"].

        Raises
        ------
        ValueError
            If ``v`` differs from the expected column list.
        """
        expected = [
            "faithfulness_explanation",
            "faithfulness_judgment",
        ]
        if v != expected:
            raise ValueError(
                f"EvaluateFaithfulnessBlock expects output_cols={expected}, got {v}"
            )
        return v

    def model_post_init(self, __context: Any) -> None:
        """Initialize the internal blocks after Pydantic validation."""
        super().model_post_init(__context)

        # Create internal blocks
        self._create_internal_blocks()

        # Log initialization only when model is configured
        if self.model:
            logger.info(
                f"Initialized EvaluateFaithfulnessBlock '{self.block_name}' with model '{self.model}'",
                extra={
                    "block_name": self.block_name,
                    "model": self.model,
                    "async_mode": self.async_mode,
                    "filter_value": self.filter_value,
                },
            )

    def _create_internal_blocks(self) -> None:
        """Create and configure the four internal blocks.

        The intermediate column names ("eval_faithfulness_prompt" and
        "raw_eval_faithfulness") wire each block's output to the next
        block's input.
        """
        # 1. PromptBuilderBlock
        self.prompt_builder = PromptBuilderBlock(
            block_name=f"{self.block_name}_prompt_builder",
            input_cols=["document", "response"],
            output_cols=["eval_faithfulness_prompt"],
            prompt_config_path=self.prompt_config_path,
            format_as_messages=self.format_as_messages,
        )

        # 2. LLMChatBlock
        llm_kwargs = {
            "block_name": f"{self.block_name}_llm_chat",
            "input_cols": ["eval_faithfulness_prompt"],
            "output_cols": ["raw_eval_faithfulness"],
            "model": self.model,
            "api_base": self.api_base,
            "api_key": self.api_key,
            "async_mode": self.async_mode,
            "timeout": self.timeout,
            "max_retries": self.max_retries,
        }

        # Forward only the generation parameters that were explicitly set,
        # so unset ones never reach LLMChatBlock as explicit None values.
        optional_generation_params = (
            "temperature",
            "max_tokens",
            "top_p",
            "frequency_penalty",
            "presence_penalty",
            "stop",
            "seed",
            "response_format",
            "stream",
            "n",
            "logprobs",
            "top_logprobs",
            "user",
            "extra_headers",
            "extra_body",
        )
        for param_name in optional_generation_params:
            param_value = getattr(self, param_name)
            if param_value is not None:
                llm_kwargs[param_name] = param_value

        # Add any additional kwargs (applied last, so they take precedence)
        llm_kwargs.update(self.llm_kwargs)

        self.llm_chat = LLMChatBlock(**llm_kwargs)

        # 3. TextParserBlock
        text_parser_kwargs = {
            "block_name": f"{self.block_name}_text_parser",
            "input_cols": ["raw_eval_faithfulness"],
            "output_cols": ["faithfulness_explanation", "faithfulness_judgment"],
            "start_tags": self.start_tags,
            "end_tags": self.end_tags,
        }

        # Add optional TextParserBlock parameters if specified
        if self.parsing_pattern is not None:
            text_parser_kwargs["parsing_pattern"] = self.parsing_pattern
        if self.parser_cleanup_tags is not None:
            text_parser_kwargs["parser_cleanup_tags"] = self.parser_cleanup_tags

        self.text_parser = TextParserBlock(**text_parser_kwargs)

        # 4. ColumnValueFilterBlock
        filter_kwargs = {
            "block_name": f"{self.block_name}_filter",
            "input_cols": ["faithfulness_judgment"],
            "output_cols": [],  # Filter blocks don't create new columns
            "filter_value": self.filter_value,
            "operation": self.operation,
        }

        if self.convert_dtype is not None:
            filter_kwargs["convert_dtype"] = self.convert_dtype

        self.filter_block = ColumnValueFilterBlock(**filter_kwargs)

    def _reinitialize_client_manager(self) -> None:
        """Reinitialize the internal LLM chat block's client manager.

        This should be called after model configuration changes to ensure
        the internal LLM chat block uses the updated model configuration.
        Only ``model``, ``api_base`` and ``api_key`` are copied across.
        """
        if self.llm_chat and hasattr(self.llm_chat, "_reinitialize_client_manager"):
            # Update the internal LLM chat block's model config
            self.llm_chat.model = self.model
            self.llm_chat.api_base = self.api_base
            self.llm_chat.api_key = self.api_key
            # Reinitialize its client manager
            self.llm_chat._reinitialize_client_manager()

    def generate(self, samples: Dataset, **kwargs: Any) -> Dataset:
        """Generate faithfulness evaluation for all samples.

        This method chains the four internal blocks in sequence:
        1. Build faithfulness evaluation prompts
        2. Generate LLM responses
        3. Parse explanation and judgment
        4. Filter based on judgment

        Parameters
        ----------
        samples : Dataset
            Input dataset containing 'document' and 'response' columns.
        **kwargs : Any
            Additional keyword arguments passed to internal blocks.

        Returns
        -------
        Dataset
            Dataset with faithfulness evaluation results and filtering applied.

        Raises
        ------
        BlockValidationError
            If model is not configured before calling generate().
        """
        # Validate that model is configured
        if not self.model:
            # Local
            from ...utils.error_handling import BlockValidationError

            raise BlockValidationError(
                f"Model not configured for block '{self.block_name}'. "
                f"Call flow.set_model_config() before generating."
            )
        logger.info(
            f"Starting faithfulness evaluation for {len(samples)} samples",
            extra={
                "block_name": self.block_name,
                "model": self.model,
                "batch_size": len(samples),
            },
        )

        current_dataset = samples

        try:
            # Step 1: Build prompts
            logger.debug("Step 1: Building faithfulness evaluation prompts")
            current_dataset = self.prompt_builder.generate(current_dataset, **kwargs)

            # Step 2: Generate LLM responses
            logger.debug("Step 2: Generating LLM responses")
            current_dataset = self.llm_chat.generate(current_dataset, **kwargs)

            # Step 3: Parse responses
            logger.debug("Step 3: Parsing faithfulness evaluation responses")
            current_dataset = self.text_parser.generate(current_dataset, **kwargs)

            # Step 4: Filter based on judgment
            logger.debug("Step 4: Filtering based on faithfulness judgment")
            original_count = len(current_dataset)
            current_dataset = self.filter_block.generate(current_dataset, **kwargs)
            filtered_count = len(current_dataset)

            logger.info(
                f"Faithfulness evaluation completed: {original_count} → {filtered_count} samples "
                f"(filtered {original_count - filtered_count} samples)",
                extra={
                    "block_name": self.block_name,
                    "original_count": original_count,
                    "filtered_count": filtered_count,
                    # Guard against division by zero when nothing reached the filter.
                    "filter_rate": (original_count - filtered_count) / original_count
                    if original_count > 0
                    else 0,
                },
            )

            return current_dataset

        except Exception as e:
            # Log with block context, then let the original exception propagate.
            logger.error(
                f"Error during faithfulness evaluation: {e}",
                extra={
                    "block_name": self.block_name,
                    "model": self.model,
                    "error": str(e),
                },
            )
            raise

    def _with_placeholder_column(
        self, dataset: Dataset, column_name: str, value: Any
    ) -> Dataset:
        """Return ``dataset`` with ``column_name`` present.

        If the column already exists the dataset is returned unchanged;
        otherwise a new dataset is returned in which every row holds
        ``value`` in that column. The existing columns are kept as they are
        instead of being rebuilt row by row.
        """
        if column_name in dataset.column_names:
            return dataset
        return dataset.add_column(column_name, [value] * len(dataset))

    def _validate_custom(self, dataset: Dataset) -> None:
        """Custom validation for faithfulness evaluation.

        This method validates the entire chain of internal blocks by simulating
        the data flow through each block: after each block's own validation,
        the column that block would have produced is filled with placeholder
        values so the next block can be validated against it.

        Parameters
        ----------
        dataset : Dataset
            Dataset to validate; must contain 'document' and 'response'.

        Raises
        ------
        ValueError
            If required columns are missing, an internal block is not
            initialized, or any internal block's validation fails.
        """
        # Validate that required columns exist
        required_columns = ["document", "response"]
        missing_columns = [
            col for col in required_columns if col not in dataset.column_names
        ]
        if missing_columns:
            raise ValueError(
                f"EvaluateFaithfulnessBlock requires columns {required_columns}, "
                f"missing: {missing_columns}"
            )

        # Validate the entire chain of internal blocks
        if not all(
            [self.prompt_builder, self.llm_chat, self.text_parser, self.filter_block]
        ):
            raise ValueError(
                "All internal blocks must be initialized before validation"
            )

        # Simulate data flow through the chain to validate each block
        current_dataset = dataset

        try:
            # 1. Validate PromptBuilderBlock
            logger.debug("Validating prompt builder block")
            self.prompt_builder._validate_custom(current_dataset)

            # Simulate prompt builder output for next validation
            current_dataset = self._with_placeholder_column(
                current_dataset,
                "eval_faithfulness_prompt",
                [{"role": "user", "content": "test"}],
            )

            # 2. Validate LLMChatBlock
            logger.debug("Validating LLM chat block")
            self.llm_chat._validate_custom(current_dataset)

            # Simulate LLM chat output for next validation
            current_dataset = self._with_placeholder_column(
                current_dataset,
                "raw_eval_faithfulness",
                "[Start of Explanation]Test explanation[End of Explanation]\n[Start of Answer]YES[End of Answer]",
            )

            # 3. Validate TextParserBlock
            logger.debug("Validating text parser block")
            self.text_parser._validate_custom(current_dataset)

            # Simulate text parser output for final validation
            if "faithfulness_judgment" not in current_dataset.column_names:
                current_dataset = self._with_placeholder_column(
                    current_dataset, "faithfulness_explanation", "Test explanation"
                )
                current_dataset = self._with_placeholder_column(
                    current_dataset, "faithfulness_judgment", "YES"
                )

            # 4. Validate ColumnValueFilterBlock
            logger.debug("Validating filter block")
            self.filter_block._validate_custom(current_dataset)

            logger.debug("All internal blocks validated successfully")

        except Exception as e:
            logger.error(f"Validation failed in internal blocks: {e}")
            raise ValueError(f"Internal block validation failed: {e}") from e

    def get_internal_blocks_info(self) -> dict[str, Any]:
        """Get information about the internal blocks.

        Returns
        -------
        Dict[str, Any]
            Mapping with keys "prompt_builder", "llm_chat", "text_parser" and
            "filter"; each value is that block's ``get_info()`` result, or
            None if the block is not initialized.
        """
        return {
            "prompt_builder": self.prompt_builder.get_info()
            if self.prompt_builder
            else None,
            "llm_chat": self.llm_chat.get_info() if self.llm_chat else None,
            "text_parser": self.text_parser.get_info() if self.text_parser else None,
            "filter": self.filter_block.get_info() if self.filter_block else None,
        }

    def __repr__(self) -> str:
        """String representation of the block."""
        return (
            f"EvaluateFaithfulnessBlock(name='{self.block_name}', "
            f"model='{self.model}', filter_value='{self.filter_value}')"
        )
|