sdg-hub 0.1.4__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- sdg_hub/__init__.py +28 -1
- sdg_hub/_version.py +2 -2
- sdg_hub/core/__init__.py +22 -0
- sdg_hub/core/blocks/__init__.py +58 -0
- sdg_hub/core/blocks/base.py +313 -0
- sdg_hub/core/blocks/deprecated_blocks/__init__.py +29 -0
- sdg_hub/core/blocks/deprecated_blocks/combine_columns.py +93 -0
- sdg_hub/core/blocks/deprecated_blocks/duplicate_columns.py +88 -0
- sdg_hub/core/blocks/deprecated_blocks/filter_by_value.py +103 -0
- sdg_hub/core/blocks/deprecated_blocks/flatten_columns.py +94 -0
- sdg_hub/core/blocks/deprecated_blocks/llmblock.py +479 -0
- sdg_hub/core/blocks/deprecated_blocks/rename_columns.py +88 -0
- sdg_hub/core/blocks/deprecated_blocks/sample_populator.py +58 -0
- sdg_hub/core/blocks/deprecated_blocks/selector.py +97 -0
- sdg_hub/core/blocks/deprecated_blocks/set_to_majority_value.py +88 -0
- sdg_hub/core/blocks/evaluation/__init__.py +9 -0
- sdg_hub/core/blocks/evaluation/evaluate_faithfulness_block.py +564 -0
- sdg_hub/core/blocks/evaluation/evaluate_relevancy_block.py +564 -0
- sdg_hub/core/blocks/evaluation/verify_question_block.py +564 -0
- sdg_hub/core/blocks/filtering/__init__.py +12 -0
- sdg_hub/core/blocks/filtering/column_value_filter.py +188 -0
- sdg_hub/core/blocks/llm/__init__.py +25 -0
- sdg_hub/core/blocks/llm/client_manager.py +398 -0
- sdg_hub/core/blocks/llm/config.py +336 -0
- sdg_hub/core/blocks/llm/error_handler.py +368 -0
- sdg_hub/core/blocks/llm/llm_chat_block.py +542 -0
- sdg_hub/core/blocks/llm/prompt_builder_block.py +368 -0
- sdg_hub/core/blocks/llm/text_parser_block.py +310 -0
- sdg_hub/core/blocks/registry.py +331 -0
- sdg_hub/core/blocks/transform/__init__.py +23 -0
- sdg_hub/core/blocks/transform/duplicate_columns.py +88 -0
- sdg_hub/core/blocks/transform/index_based_mapper.py +225 -0
- sdg_hub/core/blocks/transform/melt_columns.py +126 -0
- sdg_hub/core/blocks/transform/rename_columns.py +69 -0
- sdg_hub/core/blocks/transform/text_concat.py +102 -0
- sdg_hub/core/blocks/transform/uniform_col_val_setter.py +101 -0
- sdg_hub/core/flow/__init__.py +20 -0
- sdg_hub/core/flow/base.py +980 -0
- sdg_hub/core/flow/metadata.py +344 -0
- sdg_hub/core/flow/migration.py +187 -0
- sdg_hub/core/flow/registry.py +330 -0
- sdg_hub/core/flow/validation.py +265 -0
- sdg_hub/{utils → core/utils}/__init__.py +6 -4
- sdg_hub/{utils → core/utils}/datautils.py +1 -3
- sdg_hub/core/utils/error_handling.py +208 -0
- sdg_hub/{utils → core/utils}/path_resolution.py +2 -2
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/atomic_facts.yaml +40 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/detailed_summary.yaml +13 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_faithfulness.yaml +64 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_question.yaml +29 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_relevancy.yaml +81 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/extractive_summary.yaml +13 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +191 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/generate_questions_responses.yaml +54 -0
- sdg_hub-0.2.0.dist-info/METADATA +218 -0
- sdg_hub-0.2.0.dist-info/RECORD +63 -0
- sdg_hub/blocks/__init__.py +0 -42
- sdg_hub/blocks/block.py +0 -96
- sdg_hub/blocks/llmblock.py +0 -375
- sdg_hub/blocks/openaichatblock.py +0 -556
- sdg_hub/blocks/utilblocks.py +0 -597
- sdg_hub/checkpointer.py +0 -139
- sdg_hub/configs/annotations/cot_reflection.yaml +0 -34
- sdg_hub/configs/annotations/detailed_annotations.yaml +0 -28
- sdg_hub/configs/annotations/detailed_description.yaml +0 -10
- sdg_hub/configs/annotations/detailed_description_icl.yaml +0 -32
- sdg_hub/configs/annotations/simple_annotations.yaml +0 -9
- sdg_hub/configs/knowledge/__init__.py +0 -0
- sdg_hub/configs/knowledge/atomic_facts.yaml +0 -46
- sdg_hub/configs/knowledge/auxilary_instructions.yaml +0 -35
- sdg_hub/configs/knowledge/detailed_summary.yaml +0 -18
- sdg_hub/configs/knowledge/evaluate_faithfulness.yaml +0 -68
- sdg_hub/configs/knowledge/evaluate_question.yaml +0 -38
- sdg_hub/configs/knowledge/evaluate_relevancy.yaml +0 -84
- sdg_hub/configs/knowledge/extractive_summary.yaml +0 -18
- sdg_hub/configs/knowledge/generate_code_questions_responses.yaml +0 -39
- sdg_hub/configs/knowledge/generate_questions.yaml +0 -82
- sdg_hub/configs/knowledge/generate_questions_responses.yaml +0 -56
- sdg_hub/configs/knowledge/generate_responses.yaml +0 -86
- sdg_hub/configs/knowledge/mcq_generation.yaml +0 -83
- sdg_hub/configs/knowledge/router.yaml +0 -12
- sdg_hub/configs/knowledge/simple_generate_qa.yaml +0 -34
- sdg_hub/configs/reasoning/__init__.py +0 -0
- sdg_hub/configs/reasoning/dynamic_cot.yaml +0 -40
- sdg_hub/configs/skills/__init__.py +0 -0
- sdg_hub/configs/skills/analyzer.yaml +0 -48
- sdg_hub/configs/skills/annotation.yaml +0 -36
- sdg_hub/configs/skills/contexts.yaml +0 -28
- sdg_hub/configs/skills/critic.yaml +0 -60
- sdg_hub/configs/skills/evaluate_freeform_pair.yaml +0 -111
- sdg_hub/configs/skills/evaluate_freeform_questions.yaml +0 -78
- sdg_hub/configs/skills/evaluate_grounded_pair.yaml +0 -119
- sdg_hub/configs/skills/evaluate_grounded_questions.yaml +0 -51
- sdg_hub/configs/skills/freeform_questions.yaml +0 -34
- sdg_hub/configs/skills/freeform_responses.yaml +0 -39
- sdg_hub/configs/skills/grounded_questions.yaml +0 -38
- sdg_hub/configs/skills/grounded_responses.yaml +0 -59
- sdg_hub/configs/skills/icl_examples/STEM.yaml +0 -56
- sdg_hub/configs/skills/icl_examples/__init__.py +0 -0
- sdg_hub/configs/skills/icl_examples/coding.yaml +0 -97
- sdg_hub/configs/skills/icl_examples/extraction.yaml +0 -36
- sdg_hub/configs/skills/icl_examples/humanities.yaml +0 -71
- sdg_hub/configs/skills/icl_examples/math.yaml +0 -85
- sdg_hub/configs/skills/icl_examples/reasoning.yaml +0 -30
- sdg_hub/configs/skills/icl_examples/roleplay.yaml +0 -45
- sdg_hub/configs/skills/icl_examples/writing.yaml +0 -80
- sdg_hub/configs/skills/judge.yaml +0 -53
- sdg_hub/configs/skills/planner.yaml +0 -67
- sdg_hub/configs/skills/respond.yaml +0 -8
- sdg_hub/configs/skills/revised_responder.yaml +0 -78
- sdg_hub/configs/skills/router.yaml +0 -59
- sdg_hub/configs/skills/simple_generate_qa_freeform.yaml +0 -27
- sdg_hub/configs/skills/simple_generate_qa_grounded.yaml +0 -31
- sdg_hub/flow.py +0 -477
- sdg_hub/flow_runner.py +0 -450
- sdg_hub/flows/generation/knowledge/mmlu_bench.yaml +0 -13
- sdg_hub/flows/generation/knowledge/simple_knowledge.yaml +0 -12
- sdg_hub/flows/generation/knowledge/synth_knowledge.yaml +0 -89
- sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +0 -136
- sdg_hub/flows/generation/skills/improve_responses.yaml +0 -103
- sdg_hub/flows/generation/skills/simple_freeform_skill.yaml +0 -12
- sdg_hub/flows/generation/skills/simple_grounded_skill.yaml +0 -12
- sdg_hub/flows/generation/skills/synth_grounded_skills.yaml +0 -80
- sdg_hub/flows/generation/skills/synth_skills.yaml +0 -59
- sdg_hub/pipeline.py +0 -121
- sdg_hub/prompts.py +0 -80
- sdg_hub/registry.py +0 -122
- sdg_hub/sdg.py +0 -206
- sdg_hub/utils/config_validation.py +0 -91
- sdg_hub/utils/error_handling.py +0 -94
- sdg_hub/utils/validation_result.py +0 -10
- sdg_hub-0.1.4.dist-info/METADATA +0 -190
- sdg_hub-0.1.4.dist-info/RECORD +0 -89
- sdg_hub/{logger_config.py → core/utils/logger_config.py} +1 -1
- /sdg_hub/{configs/__init__.py → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/README.md} +0 -0
- /sdg_hub/{configs/annotations → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab}/__init__.py +0 -0
- {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.0.dist-info}/WHEEL +0 -0
- {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.0.dist-info}/top_level.txt +0 -0
sdg_hub/core/blocks/evaluation/evaluate_relevancy_block.py
@@ -0,0 +1,564 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Composite block for relevancy evaluation of question-answer pairs.
+
+This module provides the EvaluateRelevancyBlock that encapsulates the complete
+relevancy evaluation workflow, combining prompt building, LLM chat, text parsing,
+and filtering into a single block for simplified configuration.
+"""
+
+# Standard
+from typing import Any, Optional, Union
+
+# Third Party
+from datasets import Dataset
+from pydantic import ConfigDict, Field, field_validator
+
+# Local
+from ...utils.logger_config import setup_logger
+from ..base import BaseBlock
+from ..filtering.column_value_filter import ColumnValueFilterBlock
+from ..llm.llm_chat_block import LLMChatBlock
+from ..llm.prompt_builder_block import PromptBuilderBlock
+from ..llm.text_parser_block import TextParserBlock
+from ..registry import BlockRegistry
+
+logger = setup_logger(__name__)
+
+
+@BlockRegistry.register(
+    "EvaluateRelevancyBlock",
+    "evaluation",
+    "Composite block for relevancy evaluation of question-answer pairs",
+)
+class EvaluateRelevancyBlock(BaseBlock):
+    """Composite block for relevancy evaluation workflow.
+
+    This block combines four separate blocks into a single cohesive evaluation block:
+    1. PromptBuilderBlock - builds evaluation prompt from question and response
+    2. LLMChatBlock - generates relevancy evaluation using LLM
+    3. TextParserBlock - parses feedback and score from raw output
+    4. ColumnValueFilterBlock - filters based on relevancy score
+
+    Parameters
+    ----------
+    block_name : str
+        Name of the block.
+    input_cols : List[str]
+        Input columns: ["question", "response"]
+    output_cols : List[str]
+        Output columns: ["relevancy_explanation", "relevancy_score"]
+    prompt_config_path : str
+        Path to YAML file containing the relevancy evaluation prompt template.
+    model : str
+        Model identifier in LiteLLM format (e.g., "hosted_vllm/meta-llama/Llama-3.3-70B-Instruct")
+    api_base : Optional[str]
+        Base URL for the API. Required for local models.
+    api_key : Optional[str]
+        API key for the provider. Falls back to environment variables.
+    filter_value : Union[str, int, float], optional
+        Value to filter on for relevancy score (default: 2.0)
+    operation : str, optional
+        Filter operation (default: "eq")
+    convert_dtype : Optional[str], optional
+        Data type conversion for filter column (default: "float")
+    async_mode : bool, optional
+        Whether to use async processing (default: True)
+    format_as_messages : bool, optional
+        Whether to format prompt as messages (default: True)
+    start_tags : List[str], optional
+        Start tags for parsing (default: ["[Start of Feedback]", "[Start of Score]"])
+    end_tags : List[str], optional
+        End tags for parsing (default: ["[End of Feedback]", "[End of Score]"])
+    parsing_pattern : Optional[str], optional
+        Regex pattern for custom parsing. If provided, takes precedence over tag-based parsing.
+    parser_cleanup_tags : Optional[List[str]], optional
+        List of tags to clean from parsed output.
+
+    ### LLM Generation Parameters ###
+    temperature : Optional[float], optional
+        Sampling temperature (0.0 to 2.0).
+    max_tokens : Optional[int], optional
+        Maximum tokens to generate.
+    top_p : Optional[float], optional
+        Nucleus sampling parameter (0.0 to 1.0).
+    frequency_penalty : Optional[float], optional
+        Frequency penalty (-2.0 to 2.0).
+    presence_penalty : Optional[float], optional
+        Presence penalty (-2.0 to 2.0).
+    stop : Optional[Union[str, List[str]]], optional
+        Stop sequences.
+    seed : Optional[int], optional
+        Random seed for reproducible outputs.
+    response_format : Optional[Dict[str, Any]], optional
+        Response format specification (e.g., JSON mode).
+    stream : Optional[bool], optional
+        Whether to stream responses.
+    n : Optional[int], optional
+        Number of completions to generate. When n > 1, the output column will contain
+        a list of responses for each input sample.
+    logprobs : Optional[bool], optional
+        Whether to return log probabilities.
+    top_logprobs : Optional[int], optional
+        Number of top log probabilities to return.
+    user : Optional[str], optional
+        End-user identifier.
+    extra_headers : Optional[Dict[str, str]], optional
+        Additional headers to send with requests.
+    extra_body : Optional[Dict[str, Any]], optional
+        Additional parameters for the request body.
+    timeout : float, optional
+        Request timeout in seconds (default: 120.0).
+    max_retries : int, optional
+        Maximum number of retry attempts (default: 6).
+    **kwargs : Any
+        Additional provider-specific parameters.
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    # Core configuration
+    prompt_config_path: str = Field(
+        ...,
+        description="Path to YAML file containing the relevancy evaluation prompt template",
+    )
+    model: Optional[str] = Field(None, description="Model identifier in LiteLLM format")
+    api_base: Optional[str] = Field(None, description="Base URL for the API")
+    api_key: Optional[str] = Field(
+        None,
+        description="API key for the provider. Falls back to environment variables.",
+    )
+
+    # Filter configuration
+    filter_value: Union[str, int, float] = Field(
+        2.0, description="Value to filter on for relevancy score"
+    )
+    operation: str = Field("eq", description="Filter operation")
+    convert_dtype: Optional[str] = Field(
+        "float", description="Data type conversion for filter column"
+    )
+
+    # Processing configuration
+    async_mode: bool = Field(True, description="Whether to use async processing")
+    format_as_messages: bool = Field(
+        True, description="Whether to format prompt as messages"
+    )
+
+    # Parser configuration
+    start_tags: list[str] = Field(
+        ["[Start of Feedback]", "[Start of Score]"],
+        description="Start tags for parsing feedback and score",
+    )
+    end_tags: list[str] = Field(
+        ["[End of Feedback]", "[End of Score]"],
+        description="End tags for parsing feedback and score",
+    )
+    parsing_pattern: Optional[str] = Field(
+        None,
+        description="Regex pattern for custom parsing. If provided, takes precedence over tag-based parsing",
+    )
+    parser_cleanup_tags: Optional[list[str]] = Field(
+        None, description="List of tags to clean from parsed output"
+    )
+
+    # LLM generation parameters
+    temperature: Optional[float] = Field(
+        None, description="Sampling temperature (0.0 to 2.0)"
+    )
+    max_tokens: Optional[int] = Field(None, description="Maximum tokens to generate")
+    top_p: Optional[float] = Field(
+        None, description="Nucleus sampling parameter (0.0 to 1.0)"
+    )
+    frequency_penalty: Optional[float] = Field(
+        None, description="Frequency penalty (-2.0 to 2.0)"
+    )
+    presence_penalty: Optional[float] = Field(
+        None, description="Presence penalty (-2.0 to 2.0)"
+    )
+    stop: Optional[Union[str, list[str]]] = Field(None, description="Stop sequences")
+    seed: Optional[int] = Field(
+        None, description="Random seed for reproducible outputs"
+    )
+    response_format: Optional[dict[str, Any]] = Field(
+        None, description="Response format specification (e.g., JSON mode)"
+    )
+    stream: Optional[bool] = Field(None, description="Whether to stream responses")
+    n: Optional[int] = Field(
+        None,
+        description="Number of completions to generate. When n > 1, the output column will contain a list of responses for each input sample",
+    )
+    logprobs: Optional[bool] = Field(
+        None, description="Whether to return log probabilities"
+    )
+    top_logprobs: Optional[int] = Field(
+        None, description="Number of top log probabilities to return"
+    )
+    user: Optional[str] = Field(None, description="End-user identifier")
+    extra_headers: Optional[dict[str, str]] = Field(
+        None, description="Additional headers to send with requests"
+    )
+    extra_body: Optional[dict[str, Any]] = Field(
+        None, description="Additional parameters for the request body"
+    )
+    timeout: float = Field(120.0, description="Request timeout in seconds")
+    max_retries: int = Field(6, description="Maximum number of retry attempts")
+
+    # Additional provider-specific parameters
+    llm_kwargs: dict[str, Any] = Field(
+        default_factory=dict, description="Additional provider-specific parameters"
+    )
+
+    # Internal blocks - excluded from serialization
+    prompt_builder: Optional[PromptBuilderBlock] = Field(None, exclude=True)
+    llm_chat: Optional[LLMChatBlock] = Field(None, exclude=True)
+    text_parser: Optional[TextParserBlock] = Field(None, exclude=True)
+    filter_block: Optional[ColumnValueFilterBlock] = Field(None, exclude=True)
+
+    @field_validator("input_cols")
+    @classmethod
+    def validate_input_cols(cls, v):
+        """Validate that input columns are exactly ["question", "response"]."""
+        expected = ["question", "response"]
+        if v != expected:
+            raise ValueError(
+                f"EvaluateRelevancyBlock expects input_cols={expected}, got {v}"
+            )
+        return v
+
+    @field_validator("output_cols")
+    @classmethod
+    def validate_output_cols(cls, v):
+        """Validate that output columns are exactly ["relevancy_explanation", "relevancy_score"]."""
+        expected = [
+            "relevancy_explanation",
+            "relevancy_score",
+        ]
+        if v != expected:
+            raise ValueError(
+                f"EvaluateRelevancyBlock expects output_cols={expected}, got {v}"
+            )
+        return v
+
+    def model_post_init(self, __context: Any) -> None:
+        """Initialize the internal blocks after Pydantic validation."""
+        super().model_post_init(__context)
+
+        # Create internal blocks
+        self._create_internal_blocks()
+
+        # Log initialization only when model is configured
+        if self.model:
+            logger.info(
+                f"Initialized EvaluateRelevancyBlock '{self.block_name}' with model '{self.model}'",
+                extra={
+                    "block_name": self.block_name,
+                    "model": self.model,
+                    "async_mode": self.async_mode,
+                    "filter_value": self.filter_value,
+                },
+            )
+
+    def _create_internal_blocks(self) -> None:
+        """Create and configure the internal blocks."""
+        # 1. PromptBuilderBlock
+        self.prompt_builder = PromptBuilderBlock(
+            block_name=f"{self.block_name}_prompt_builder",
+            input_cols=["question", "response"],
+            output_cols=["eval_relevancy_prompt"],
+            prompt_config_path=self.prompt_config_path,
+            format_as_messages=self.format_as_messages,
+        )
+
+        # 2. LLMChatBlock
+        llm_kwargs = {
+            "block_name": f"{self.block_name}_llm_chat",
+            "input_cols": ["eval_relevancy_prompt"],
+            "output_cols": ["raw_eval_relevancy"],
+            "model": self.model,
+            "api_base": self.api_base,
+            "api_key": self.api_key,
+            "async_mode": self.async_mode,
+            "timeout": self.timeout,
+            "max_retries": self.max_retries,
+        }
+
+        # Add generation parameters if specified
+        if self.temperature is not None:
+            llm_kwargs["temperature"] = self.temperature
+        if self.max_tokens is not None:
+            llm_kwargs["max_tokens"] = self.max_tokens
+        if self.top_p is not None:
+            llm_kwargs["top_p"] = self.top_p
+        if self.frequency_penalty is not None:
+            llm_kwargs["frequency_penalty"] = self.frequency_penalty
+        if self.presence_penalty is not None:
+            llm_kwargs["presence_penalty"] = self.presence_penalty
+        if self.stop is not None:
+            llm_kwargs["stop"] = self.stop
+        if self.seed is not None:
+            llm_kwargs["seed"] = self.seed
+        if self.response_format is not None:
+            llm_kwargs["response_format"] = self.response_format
+        if self.stream is not None:
+            llm_kwargs["stream"] = self.stream
+        if self.n is not None:
+            llm_kwargs["n"] = self.n
+        if self.logprobs is not None:
+            llm_kwargs["logprobs"] = self.logprobs
+        if self.top_logprobs is not None:
+            llm_kwargs["top_logprobs"] = self.top_logprobs
+        if self.user is not None:
+            llm_kwargs["user"] = self.user
+        if self.extra_headers is not None:
+            llm_kwargs["extra_headers"] = self.extra_headers
+        if self.extra_body is not None:
+            llm_kwargs["extra_body"] = self.extra_body
+
+        # Add any additional kwargs
+        llm_kwargs.update(self.llm_kwargs)
+
+        self.llm_chat = LLMChatBlock(**llm_kwargs)
+
+        # 3. TextParserBlock
+        text_parser_kwargs = {
+            "block_name": f"{self.block_name}_text_parser",
+            "input_cols": ["raw_eval_relevancy"],
+            "output_cols": ["relevancy_explanation", "relevancy_score"],
+            "start_tags": self.start_tags,
+            "end_tags": self.end_tags,
+        }
+
+        # Add optional TextParserBlock parameters if specified
+        if self.parsing_pattern is not None:
+            text_parser_kwargs["parsing_pattern"] = self.parsing_pattern
+        if self.parser_cleanup_tags is not None:
+            text_parser_kwargs["parser_cleanup_tags"] = self.parser_cleanup_tags
+
+        self.text_parser = TextParserBlock(**text_parser_kwargs)
+
+        # 4. ColumnValueFilterBlock
+        filter_kwargs = {
+            "block_name": f"{self.block_name}_filter",
+            "input_cols": ["relevancy_score"],
+            "output_cols": [],  # Filter blocks don't create new columns
+            "filter_value": self.filter_value,
+            "operation": self.operation,
+        }
+
+        if self.convert_dtype is not None:
+            filter_kwargs["convert_dtype"] = self.convert_dtype
+
+        self.filter_block = ColumnValueFilterBlock(**filter_kwargs)
+
+    def _reinitialize_client_manager(self) -> None:
+        """Reinitialize the internal LLM chat block's client manager.
+
+        This should be called after model configuration changes to ensure
+        the internal LLM chat block uses the updated model configuration.
+        """
+        if self.llm_chat and hasattr(self.llm_chat, "_reinitialize_client_manager"):
+            # Update the internal LLM chat block's model config
+            self.llm_chat.model = self.model
+            self.llm_chat.api_base = self.api_base
+            self.llm_chat.api_key = self.api_key
+            # Reinitialize its client manager
+            self.llm_chat._reinitialize_client_manager()
+
+    def generate(self, samples: Dataset, **kwargs: Any) -> Dataset:
+        """Generate relevancy evaluation for all samples.
+
+        This method chains the four internal blocks in sequence:
+        1. Build relevancy evaluation prompts
+        2. Generate LLM responses
+        3. Parse explanation and score
+        4. Filter based on score
+
+        Parameters
+        ----------
+        samples : Dataset
+            Input dataset containing 'question' and 'response' columns.
+        **kwargs : Any
+            Additional keyword arguments passed to internal blocks.
+
+        Returns
+        -------
+        Dataset
+            Dataset with relevancy evaluation results and filtering applied.
+
+        Raises
+        ------
+        BlockValidationError
+            If model is not configured before calling generate().
+        """
+        # Validate that model is configured
+        if not self.model:
+            # Local
+            from ...utils.error_handling import BlockValidationError
+
+            raise BlockValidationError(
+                f"Model not configured for block '{self.block_name}'. "
+                f"Call flow.set_model_config() before generating."
+            )
+        logger.info(
+            f"Starting relevancy evaluation for {len(samples)} samples",
+            extra={
+                "block_name": self.block_name,
+                "model": self.model,
+                "batch_size": len(samples),
+            },
+        )
+
+        current_dataset = samples
+
+        try:
+            # Step 1: Build prompts
+            logger.debug("Step 1: Building relevancy evaluation prompts")
+            current_dataset = self.prompt_builder.generate(current_dataset, **kwargs)
+
+            # Step 2: Generate LLM responses
+            logger.debug("Step 2: Generating LLM responses")
+            current_dataset = self.llm_chat.generate(current_dataset, **kwargs)
+
+            # Step 3: Parse responses
+            logger.debug("Step 3: Parsing relevancy evaluation responses")
+            current_dataset = self.text_parser.generate(current_dataset, **kwargs)
+
+            # Step 4: Filter based on score
+            logger.debug("Step 4: Filtering based on relevancy score")
+            original_count = len(current_dataset)
+            current_dataset = self.filter_block.generate(current_dataset, **kwargs)
+            filtered_count = len(current_dataset)
+
+            logger.info(
+                f"Relevancy evaluation completed: {original_count} → {filtered_count} samples "
+                f"(filtered {original_count - filtered_count} samples)",
+                extra={
+                    "block_name": self.block_name,
+                    "original_count": original_count,
+                    "filtered_count": filtered_count,
+                    "filter_rate": (original_count - filtered_count) / original_count
+                    if original_count > 0
+                    else 0,
+                },
+            )
+
+            return current_dataset
+
+        except Exception as e:
+            logger.error(
+                f"Error during relevancy evaluation: {e}",
+                extra={
+                    "block_name": self.block_name,
+                    "model": self.model,
+                    "error": str(e),
+                },
+            )
+            raise
+
+    def _validate_custom(self, dataset: Dataset) -> None:
+        """Custom validation for relevancy evaluation.
+
+        This method validates the entire chain of internal blocks by simulating
+        the data flow through each block to ensure they can all process the data correctly.
+        """
+        # Validate that required columns exist
+        required_columns = ["question", "response"]
+        missing_columns = [
+            col for col in required_columns if col not in dataset.column_names
+        ]
+        if missing_columns:
+            raise ValueError(
+                f"EvaluateRelevancyBlock requires columns {required_columns}, "
+                f"missing: {missing_columns}"
+            )
+
+        # Validate the entire chain of internal blocks
+        if not all(
+            [self.prompt_builder, self.llm_chat, self.text_parser, self.filter_block]
+        ):
+            raise ValueError(
+                "All internal blocks must be initialized before validation"
+            )
+
+        # Simulate data flow through the chain to validate each block
+        current_dataset = dataset
+
+        try:
+            # 1. Validate PromptBuilderBlock
+            logger.debug("Validating prompt builder block")
+            self.prompt_builder._validate_custom(current_dataset)
+
+            # Simulate prompt builder output for next validation
+            # Add the expected output column temporarily for validation
+            if "eval_relevancy_prompt" not in current_dataset.column_names:
+                # Create a temporary dataset with the expected column for validation
+                temp_data = []
+                for sample in current_dataset:
+                    temp_sample = dict(sample)
+                    temp_sample["eval_relevancy_prompt"] = [
+                        {"role": "user", "content": "test"}
+                    ]
+                    temp_data.append(temp_sample)
+                current_dataset = Dataset.from_list(temp_data)
+
+            # 2. Validate LLMChatBlock
+            logger.debug("Validating LLM chat block")
+            self.llm_chat._validate_custom(current_dataset)
+
+            # Simulate LLM chat output for next validation
+            if "raw_eval_relevancy" not in current_dataset.column_names:
+                temp_data = []
+                for sample in current_dataset:
+                    temp_sample = dict(sample)
+                    temp_sample["raw_eval_relevancy"] = (
+                        "[Start of Feedback]Test feedback[End of Feedback]\n[Start of Score]2.0[End of Score]"
+                    )
+                    temp_data.append(temp_sample)
+                current_dataset = Dataset.from_list(temp_data)
+
+            # 3. Validate TextParserBlock
+            logger.debug("Validating text parser block")
+            self.text_parser._validate_custom(current_dataset)
+
+            # Simulate text parser output for final validation
+            if "relevancy_score" not in current_dataset.column_names:
+                temp_data = []
+                for sample in current_dataset:
+                    temp_sample = dict(sample)
+                    temp_sample["relevancy_explanation"] = "Test feedback"
+                    temp_sample["relevancy_score"] = "2.0"
+                    temp_data.append(temp_sample)
+                current_dataset = Dataset.from_list(temp_data)
+
+            # 4. Validate ColumnValueFilterBlock
+            logger.debug("Validating filter block")
+            self.filter_block._validate_custom(current_dataset)
+
+            logger.debug("All internal blocks validated successfully")
+
+        except Exception as e:
+            logger.error(f"Validation failed in internal blocks: {e}")
+            raise ValueError(f"Internal block validation failed: {e}") from e
+
+    def get_internal_blocks_info(self) -> dict[str, Any]:
+        """Get information about the internal blocks.
+
+        Returns
+        -------
+        Dict[str, Any]
+            Information about each internal block.
+        """
+        return {
+            "prompt_builder": self.prompt_builder.get_info()
+            if self.prompt_builder
+            else None,
+            "llm_chat": self.llm_chat.get_info() if self.llm_chat else None,
+            "text_parser": self.text_parser.get_info() if self.text_parser else None,
+            "filter": self.filter_block.get_info() if self.filter_block else None,
+        }
+
+    def __repr__(self) -> str:
+        """String representation of the block."""
+        return (
+            f"EvaluateRelevancyBlock(name='{self.block_name}', "
+            f"model='{self.model}', filter_value='{self.filter_value}')"
+        )
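For orientation, a minimal usage sketch of the new composite block follows. It is not part of the diff: the import path mirrors the new file layout above, and the column names, defaults, and workflow come from the class docstring, but the prompt path, model endpoint, and sample data are placeholder assumptions.

```python
# Illustrative sketch only - values marked "placeholder" are assumptions, not shipped defaults.
from datasets import Dataset

from sdg_hub.core.blocks.evaluation.evaluate_relevancy_block import EvaluateRelevancyBlock

block = EvaluateRelevancyBlock(
    block_name="eval_relevancy",
    input_cols=["question", "response"],  # enforced by validate_input_cols
    output_cols=["relevancy_explanation", "relevancy_score"],  # enforced by validate_output_cols
    prompt_config_path="path/to/evaluate_relevancy.yaml",  # placeholder path
    model="hosted_vllm/meta-llama/Llama-3.3-70B-Instruct",  # example format from the docstring
    api_base="http://localhost:8000/v1",  # placeholder endpoint for a local model
    filter_value=2.0,  # keep only rows whose relevancy_score equals 2.0
    operation="eq",
)

samples = Dataset.from_list(
    [{"question": "What is sdg_hub?", "response": "A toolkit for synthetic data generation flows."}]
)

# Chains the four internal blocks: prompt building -> LLM chat -> parsing -> filtering.
result = block.generate(samples)
```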