sdg-hub 0.3.1__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sdg_hub/__init__.py +0 -2
- sdg_hub/_version.py +2 -2
- sdg_hub/core/__init__.py +1 -2
- sdg_hub/core/blocks/__init__.py +2 -4
- sdg_hub/core/blocks/base.py +61 -6
- sdg_hub/core/blocks/filtering/column_value_filter.py +3 -2
- sdg_hub/core/blocks/llm/__init__.py +2 -4
- sdg_hub/core/blocks/llm/llm_chat_block.py +251 -265
- sdg_hub/core/blocks/llm/llm_chat_with_parsing_retry_block.py +216 -98
- sdg_hub/core/blocks/llm/llm_parser_block.py +320 -0
- sdg_hub/core/blocks/llm/text_parser_block.py +53 -152
- sdg_hub/core/flow/__init__.py +3 -4
- sdg_hub/core/flow/base.py +11 -73
- sdg_hub/core/flow/metadata.py +1 -68
- sdg_hub/core/flow/registry.py +0 -1
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/flow.yaml +51 -12
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/doc_direct_qa/__init__.py +0 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/doc_direct_qa/flow.yaml +158 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/flow.yaml +51 -12
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/flow.yaml +14 -3
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +147 -28
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/README.md +0 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/__init__.py +0 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/atomic_facts_ja.yaml +41 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/detailed_summary_ja.yaml +14 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/extractive_summary_ja.yaml +14 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/flow.yaml +303 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/generate_questions_responses_ja.yaml +55 -0
- sdg_hub/flows/text_analysis/structured_insights/flow.yaml +28 -5
- {sdg_hub-0.3.1.dist-info → sdg_hub-0.4.1.dist-info}/METADATA +2 -1
- {sdg_hub-0.3.1.dist-info → sdg_hub-0.4.1.dist-info}/RECORD +34 -30
- sdg_hub/core/blocks/evaluation/__init__.py +0 -9
- sdg_hub/core/blocks/evaluation/evaluate_faithfulness_block.py +0 -323
- sdg_hub/core/blocks/evaluation/evaluate_relevancy_block.py +0 -323
- sdg_hub/core/blocks/evaluation/verify_question_block.py +0 -329
- sdg_hub/core/blocks/llm/client_manager.py +0 -472
- sdg_hub/core/blocks/llm/config.py +0 -337
- {sdg_hub-0.3.1.dist-info → sdg_hub-0.4.1.dist-info}/WHEEL +0 -0
- {sdg_hub-0.3.1.dist-info → sdg_hub-0.4.1.dist-info}/licenses/LICENSE +0 -0
- {sdg_hub-0.3.1.dist-info → sdg_hub-0.4.1.dist-info}/top_level.txt +0 -0
sdg_hub/core/flow/base.py
CHANGED
@@ -35,7 +35,7 @@ from ..utils.logger_config import setup_logger
|
|
35
35
|
from ..utils.path_resolution import resolve_path
|
36
36
|
from ..utils.yaml_utils import save_flow_yaml
|
37
37
|
from .checkpointer import FlowCheckpointer
|
38
|
-
from .metadata import DatasetRequirements, FlowMetadata, FlowParameter
|
38
|
+
from .metadata import DatasetRequirements, FlowMetadata
|
39
39
|
from .migration import FlowMigration
|
40
40
|
from .validation import FlowValidator
|
41
41
|
|
@@ -55,8 +55,6 @@ class Flow(BaseModel):
|
|
55
55
|
Ordered list of blocks to execute in the flow.
|
56
56
|
metadata : FlowMetadata
|
57
57
|
Flow metadata including name, version, author, etc.
|
58
|
-
parameters : Dict[str, FlowParameter]
|
59
|
-
Runtime parameters that can be overridden during execution.
|
60
58
|
"""
|
61
59
|
|
62
60
|
blocks: list[BaseBlock] = Field(
|
@@ -66,10 +64,6 @@ class Flow(BaseModel):
|
|
66
64
|
metadata: FlowMetadata = Field(
|
67
65
|
description="Flow metadata including name, version, author, etc."
|
68
66
|
)
|
69
|
-
parameters: dict[str, FlowParameter] = Field(
|
70
|
-
default_factory=dict,
|
71
|
-
description="Runtime parameters that can be overridden during execution",
|
72
|
-
)
|
73
67
|
|
74
68
|
model_config = ConfigDict(extra="forbid", arbitrary_types_allowed=True)
|
75
69
|
|
@@ -96,32 +90,6 @@ class Flow(BaseModel):
|
|
96
90
|
|
97
91
|
return v
|
98
92
|
|
99
|
-
@field_validator("parameters")
|
100
|
-
@classmethod
|
101
|
-
def validate_parameters(
|
102
|
-
cls, v: dict[str, FlowParameter]
|
103
|
-
) -> dict[str, FlowParameter]:
|
104
|
-
"""Validate parameter names and ensure they are FlowParameter instances."""
|
105
|
-
if not v:
|
106
|
-
return v
|
107
|
-
|
108
|
-
validated = {}
|
109
|
-
for param_name, param_value in v.items():
|
110
|
-
if not isinstance(param_name, str) or not param_name.strip():
|
111
|
-
raise ValueError(
|
112
|
-
f"Parameter name must be a non-empty string: {param_name}"
|
113
|
-
)
|
114
|
-
|
115
|
-
if not isinstance(param_value, FlowParameter):
|
116
|
-
raise ValueError(
|
117
|
-
f"Parameter '{param_name}' must be a FlowParameter instance, "
|
118
|
-
f"got: {type(param_value)}"
|
119
|
-
)
|
120
|
-
|
121
|
-
validated[param_name.strip()] = param_value
|
122
|
-
|
123
|
-
return validated
|
124
|
-
|
125
93
|
@model_validator(mode="after")
|
126
94
|
def validate_block_names_unique(self) -> "Flow":
|
127
95
|
"""Ensure all block names are unique within the flow."""
|
@@ -215,17 +183,6 @@ class Flow(BaseModel):
|
|
215
183
|
except Exception as exc:
|
216
184
|
raise FlowValidationError(f"Invalid metadata configuration: {exc}") from exc
|
217
185
|
|
218
|
-
# Extract and validate parameters
|
219
|
-
parameters = {}
|
220
|
-
params_dict = flow_config.get("parameters", {})
|
221
|
-
for param_name, param_config in params_dict.items():
|
222
|
-
try:
|
223
|
-
parameters[param_name] = FlowParameter(**param_config)
|
224
|
-
except Exception as exc:
|
225
|
-
raise FlowValidationError(
|
226
|
-
f"Invalid parameter '{param_name}': {exc}"
|
227
|
-
) from exc
|
228
|
-
|
229
186
|
# Create blocks with validation
|
230
187
|
blocks = []
|
231
188
|
block_configs = flow_config.get("blocks", [])
|
@@ -254,7 +211,7 @@ class Flow(BaseModel):
|
|
254
211
|
|
255
212
|
# Create and validate the flow
|
256
213
|
try:
|
257
|
-
flow = cls(blocks=blocks, metadata=metadata, parameters=parameters)
|
214
|
+
flow = cls(blocks=blocks, metadata=metadata)
|
258
215
|
# Persist generated id back to the YAML file (only on initial load)
|
259
216
|
# If the file had no metadata.id originally, update and rewrite
|
260
217
|
if not flow_config.get("metadata", {}).get("id"):
|
@@ -877,16 +834,19 @@ class Flow(BaseModel):
|
|
877
834
|
f"Block '{block.block_name}': {param_name} "
|
878
835
|
f"'{old_value}' -> '{param_value}'"
|
879
836
|
)
|
837
|
+
## check if allow extra
|
838
|
+
elif block.model_config["extra"] == "allow":
|
839
|
+
setattr(block, param_name, param_value)
|
840
|
+
logger.debug(
|
841
|
+
f"Block '{block.block_name}': {param_name} "
|
842
|
+
f"'{old_value}' -> '{param_value}'"
|
843
|
+
)
|
880
844
|
else:
|
881
845
|
logger.warning(
|
882
846
|
f"Block '{block.block_name}' ({block.__class__.__name__}) "
|
883
847
|
f"does not have attribute '{param_name}' - skipping"
|
884
848
|
)
|
885
849
|
|
886
|
-
# Reinitialize client manager for LLM blocks after updating config
|
887
|
-
if hasattr(block, "_reinitialize_client_manager"):
|
888
|
-
block._reinitialize_client_manager()
|
889
|
-
|
890
850
|
modified_count += 1
|
891
851
|
|
892
852
|
if modified_count > 0:
|
@@ -1222,17 +1182,12 @@ class Flow(BaseModel):
|
|
1222
1182
|
# Create new flow with added block
|
1223
1183
|
new_blocks = self.blocks + [block]
|
1224
1184
|
|
1225
|
-
return Flow(
|
1226
|
-
blocks=new_blocks, metadata=self.metadata, parameters=self.parameters
|
1227
|
-
)
|
1185
|
+
return Flow(blocks=new_blocks, metadata=self.metadata)
|
1228
1186
|
|
1229
1187
|
def get_info(self) -> dict[str, Any]:
|
1230
1188
|
"""Get information about the flow."""
|
1231
1189
|
return {
|
1232
1190
|
"metadata": self.metadata.model_dump(),
|
1233
|
-
"parameters": {
|
1234
|
-
name: param.model_dump() for name, param in self.parameters.items()
|
1235
|
-
},
|
1236
1191
|
"blocks": [
|
1237
1192
|
{
|
1238
1193
|
"block_type": block.__class__.__name__,
|
@@ -1336,8 +1291,7 @@ class Flow(BaseModel):
|
|
1336
1291
|
|
1337
1292
|
The summary contains:
|
1338
1293
|
1. Flow metadata (name, version, author, description)
|
1339
|
-
2.
|
1340
|
-
3. A table of all blocks with their input and output columns
|
1294
|
+
2. A table of all blocks with their input and output columns
|
1341
1295
|
|
1342
1296
|
Notes
|
1343
1297
|
-----
|
@@ -1371,17 +1325,6 @@ class Flow(BaseModel):
|
|
1371
1325
|
f"Description: [white]{self.metadata.description}[/white]"
|
1372
1326
|
)
|
1373
1327
|
|
1374
|
-
# Parameters section
|
1375
|
-
if self.parameters:
|
1376
|
-
params_branch = flow_tree.add(
|
1377
|
-
"[bold bright_yellow]Parameters[/bold bright_yellow]"
|
1378
|
-
)
|
1379
|
-
for name, param in self.parameters.items():
|
1380
|
-
param_info = f"[bright_cyan]{name}[/bright_cyan]: [white]{param.type_hint}[/white]"
|
1381
|
-
if param.default is not None:
|
1382
|
-
param_info += f" = [bright_white]{param.default}[/bright_white]"
|
1383
|
-
params_branch.add(param_info)
|
1384
|
-
|
1385
1328
|
# Blocks overview
|
1386
1329
|
flow_tree.add(
|
1387
1330
|
f"[bold bright_magenta]Blocks[/bold bright_magenta] ({len(self.blocks)} total)"
|
@@ -1443,11 +1386,6 @@ class Flow(BaseModel):
|
|
1443
1386
|
],
|
1444
1387
|
}
|
1445
1388
|
|
1446
|
-
if self.parameters:
|
1447
|
-
config["parameters"] = {
|
1448
|
-
name: param.model_dump() for name, param in self.parameters.items()
|
1449
|
-
}
|
1450
|
-
|
1451
1389
|
save_flow_yaml(output_path, config)
|
1452
1390
|
|
1453
1391
|
def __len__(self) -> int:
|
sdg_hub/core/flow/metadata.py
CHANGED
@@ -2,9 +2,8 @@
|
|
2
2
|
"""Flow metadata and parameter definitions."""
|
3
3
|
|
4
4
|
# Standard
|
5
|
-
from datetime import datetime
|
6
5
|
from enum import Enum
|
7
|
-
from typing import Any, Optional
|
6
|
+
from typing import Optional
|
8
7
|
|
9
8
|
# Third Party
|
10
9
|
from pydantic import BaseModel, Field, field_validator, model_validator
|
@@ -118,39 +117,6 @@ class RecommendedModels(BaseModel):
|
|
118
117
|
return None
|
119
118
|
|
120
119
|
|
121
|
-
class FlowParameter(BaseModel):
|
122
|
-
"""Represents a runtime parameter for a flow.
|
123
|
-
|
124
|
-
Attributes
|
125
|
-
----------
|
126
|
-
default : Any
|
127
|
-
Default value for the parameter.
|
128
|
-
description : str
|
129
|
-
Human-readable description of the parameter.
|
130
|
-
type_hint : str
|
131
|
-
Type hint as string (e.g., "float", "str").
|
132
|
-
required : bool
|
133
|
-
Whether this parameter is required at runtime.
|
134
|
-
constraints : Dict[str, Any]
|
135
|
-
Additional constraints for the parameter.
|
136
|
-
"""
|
137
|
-
|
138
|
-
default: Any = Field(..., description="Default value for the parameter")
|
139
|
-
description: str = Field(default="", description="Human-readable description")
|
140
|
-
type_hint: str = Field(default="Any", description="Type hint as string")
|
141
|
-
required: bool = Field(default=False, description="Whether parameter is required")
|
142
|
-
constraints: dict[str, Any] = Field(
|
143
|
-
default_factory=dict, description="Additional constraints for the parameter"
|
144
|
-
)
|
145
|
-
|
146
|
-
@model_validator(mode="after")
|
147
|
-
def validate_required_default(self) -> "FlowParameter":
|
148
|
-
"""Validate that required parameters have appropriate defaults."""
|
149
|
-
if self.required and self.default is None:
|
150
|
-
raise ValueError("Required parameters cannot have None as default")
|
151
|
-
return self
|
152
|
-
|
153
|
-
|
154
120
|
class DatasetRequirements(BaseModel):
|
155
121
|
"""Dataset requirements for flow execution.
|
156
122
|
|
@@ -255,20 +221,10 @@ class FlowMetadata(BaseModel):
|
|
255
221
|
Simplified recommended models structure with default, compatible, and experimental lists.
|
256
222
|
tags : List[str]
|
257
223
|
Tags for categorization and search.
|
258
|
-
created_at : str
|
259
|
-
Creation timestamp.
|
260
|
-
updated_at : str
|
261
|
-
Last update timestamp.
|
262
224
|
license : str
|
263
225
|
License identifier.
|
264
|
-
min_sdg_hub_version : str
|
265
|
-
Minimum required SDG Hub version.
|
266
226
|
dataset_requirements : Optional[DatasetRequirements]
|
267
227
|
Requirements for input datasets.
|
268
|
-
estimated_cost : str
|
269
|
-
Estimated cost tier for running the flow.
|
270
|
-
estimated_duration : str
|
271
|
-
Estimated duration for flow execution.
|
272
228
|
"""
|
273
229
|
|
274
230
|
name: str = Field(..., min_length=1, description="Human-readable name")
|
@@ -288,29 +244,10 @@ class FlowMetadata(BaseModel):
|
|
288
244
|
tags: list[str] = Field(
|
289
245
|
default_factory=list, description="Tags for categorization and search"
|
290
246
|
)
|
291
|
-
created_at: str = Field(
|
292
|
-
default_factory=lambda: datetime.now().isoformat(),
|
293
|
-
description="Creation timestamp",
|
294
|
-
)
|
295
|
-
updated_at: str = Field(
|
296
|
-
default_factory=lambda: datetime.now().isoformat(),
|
297
|
-
description="Last update timestamp",
|
298
|
-
)
|
299
247
|
license: str = Field(default="Apache-2.0", description="License identifier")
|
300
|
-
min_sdg_hub_version: str = Field(
|
301
|
-
default="", description="Minimum required SDG Hub version"
|
302
|
-
)
|
303
248
|
dataset_requirements: Optional[DatasetRequirements] = Field(
|
304
249
|
default=None, description="Requirements for input datasets"
|
305
250
|
)
|
306
|
-
estimated_cost: str = Field(
|
307
|
-
default="medium",
|
308
|
-
pattern="^(low|medium|high)$",
|
309
|
-
description="Estimated cost tier for running the flow",
|
310
|
-
)
|
311
|
-
estimated_duration: str = Field(
|
312
|
-
default="", description="Estimated duration for flow execution"
|
313
|
-
)
|
314
251
|
|
315
252
|
@field_validator("id")
|
316
253
|
@classmethod
|
@@ -352,10 +289,6 @@ class FlowMetadata(BaseModel):
|
|
352
289
|
# Validation is handled within RecommendedModels class
|
353
290
|
return v
|
354
291
|
|
355
|
-
def update_timestamp(self) -> None:
|
356
|
-
"""Update the updated_at timestamp."""
|
357
|
-
self.updated_at = datetime.now().isoformat()
|
358
|
-
|
359
292
|
@model_validator(mode="after")
|
360
293
|
def ensure_id(self) -> "FlowMetadata":
|
361
294
|
"""Ensure id is set.
|
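sdg_hub/core/flow/registry.py
CHANGED
[hunk not captured in this extract: +0 -1]
sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/flow.yaml
CHANGED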
@@ -17,7 +17,6 @@ metadata:
|
|
17
17
|
- qa-pairs
|
18
18
|
- detailed-summaries
|
19
19
|
license: Apache-2.0
|
20
|
-
min_sdg_hub_version: 0.2.0
|
21
20
|
dataset_requirements:
|
22
21
|
required_columns:
|
23
22
|
- document
|
@@ -61,10 +60,16 @@ blocks:
|
|
61
60
|
temperature: 0.7
|
62
61
|
n: 50
|
63
62
|
async_mode: true
|
63
|
+
- block_type: LLMParserBlock
|
64
|
+
block_config:
|
65
|
+
block_name: extract_detailed_summary
|
66
|
+
input_cols: raw_summary
|
67
|
+
extract_content: true
|
68
|
+
expand_lists: true
|
64
69
|
- block_type: TextParserBlock
|
65
70
|
block_config:
|
66
71
|
block_name: parse_detailed_summary
|
67
|
-
input_cols: raw_summary
|
72
|
+
input_cols: extract_detailed_summary_content
|
68
73
|
output_cols: summary
|
69
74
|
start_tags:
|
70
75
|
- ''
|
@@ -99,10 +104,16 @@ blocks:
|
|
99
104
|
temperature: 0.7
|
100
105
|
n: 1
|
101
106
|
async_mode: true
|
107
|
+
- block_type: LLMParserBlock
|
108
|
+
block_config:
|
109
|
+
block_name: extract_questions
|
110
|
+
input_cols: question_list
|
111
|
+
extract_content: true
|
112
|
+
expand_lists: true
|
102
113
|
- block_type: TextParserBlock
|
103
114
|
block_config:
|
104
115
|
block_name: parse_question_list
|
105
|
-
input_cols: question_list
|
116
|
+
input_cols: extract_questions_content
|
106
117
|
output_cols: question
|
107
118
|
start_tags:
|
108
119
|
- '[QUESTION]'
|
@@ -127,33 +138,61 @@ blocks:
|
|
127
138
|
temperature: 0.7
|
128
139
|
n: 1
|
129
140
|
async_mode: true
|
141
|
+
- block_type: LLMParserBlock
|
142
|
+
block_config:
|
143
|
+
block_name: extract_answers
|
144
|
+
input_cols: response_dict
|
145
|
+
extract_content: true
|
146
|
+
expand_lists: true
|
130
147
|
- block_type: TextParserBlock
|
131
148
|
block_config:
|
132
149
|
block_name: parse_response_dict
|
133
|
-
input_cols: response_dict
|
150
|
+
input_cols: extract_answers_content
|
134
151
|
output_cols: response
|
135
152
|
start_tags:
|
136
153
|
- ''
|
137
154
|
end_tags:
|
138
155
|
- ''
|
139
156
|
save_reasoning_content: true
|
140
|
-
- block_type: EvaluateFaithfulnessBlock
|
157
|
+
- block_type: PromptBuilderBlock
|
141
158
|
block_config:
|
142
|
-
block_name:
|
159
|
+
block_name: eval_faithful_prompt
|
143
160
|
input_cols:
|
144
161
|
- document
|
145
162
|
- response
|
146
|
-
output_cols:
|
147
|
-
- faithfulness_explanation
|
148
|
-
- faithfulness_judgment
|
163
|
+
output_cols: eval_faithful_prompt
|
149
164
|
prompt_config_path: ../../multi_summary_qa/instructlab/evaluate_faithfulness.yaml
|
150
|
-
filter_value: 'YES'
|
151
|
-
operation: eq
|
152
|
-
async_mode: true
|
153
165
|
format_as_messages: true
|
166
|
+
- block_type: LLMChatBlock
|
167
|
+
block_config:
|
168
|
+
block_name: eval_faithful_llm_chat
|
169
|
+
input_cols: eval_faithful_prompt
|
170
|
+
output_cols: eval_faithful_response_dict
|
171
|
+
n: 1
|
172
|
+
async_mode: true
|
173
|
+
- block_type: LLMParserBlock
|
174
|
+
block_config:
|
175
|
+
block_name: extract_eval_faithful
|
176
|
+
input_cols: eval_faithful_response_dict
|
177
|
+
extract_content: true
|
178
|
+
|
179
|
+
- block_type: TextParserBlock
|
180
|
+
block_config:
|
181
|
+
block_name: parse_eval_faithful
|
182
|
+
input_cols: extract_eval_faithful_content
|
183
|
+
output_cols:
|
184
|
+
- faithfulness_explanation
|
185
|
+
- faithfulness_judgment
|
154
186
|
start_tags:
|
155
187
|
- '[Start of Explanation]'
|
156
188
|
- '[Start of Answer]'
|
157
189
|
end_tags:
|
158
190
|
- '[End of Explanation]'
|
159
191
|
- '[End of Answer]'
|
192
|
+
- block_type: ColumnValueFilterBlock
|
193
|
+
block_config:
|
194
|
+
block_name: eval_faithful_filter
|
195
|
+
input_cols:
|
196
|
+
- faithfulness_judgment
|
197
|
+
filter_value: 'YES'
|
198
|
+
operation: eq
|
sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/doc_direct_qa/__init__.py
ADDED
File without changes
|
sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/doc_direct_qa/flow.yaml
ADDED
@@ -0,0 +1,158 @@
|
|
1
|
+
metadata:
|
2
|
+
name: Document Based Knowledge Tuning Dataset Generation Flow
|
3
|
+
description: Directly generates QA pairs from the raw document.
|
4
|
+
version: 2.0.0
|
5
|
+
author: SDG Hub Contributors
|
6
|
+
recommended_models:
|
7
|
+
default: openai/gpt-oss-120b
|
8
|
+
compatible:
|
9
|
+
- meta-llama/Llama-3.3-70B-Instruct
|
10
|
+
- microsoft/phi-4
|
11
|
+
- mistralai/Mixtral-8x7B-Instruct-v0.1
|
12
|
+
experimental: []
|
13
|
+
tags:
|
14
|
+
- knowledge-tuning
|
15
|
+
- document-internalization
|
16
|
+
- question-generation
|
17
|
+
- qa-pairs
|
18
|
+
- detailed-summaries
|
19
|
+
license: Apache-2.0
|
20
|
+
dataset_requirements:
|
21
|
+
required_columns:
|
22
|
+
- document
|
23
|
+
- document_outline
|
24
|
+
- domain
|
25
|
+
- icl_document
|
26
|
+
- icl_query_1
|
27
|
+
- icl_query_2
|
28
|
+
- icl_query_3
|
29
|
+
description: 'Input dataset should contain documents with text content and domain classification. Each document should be substantial enough for meaningful question generation (minimum 100 words recommended). The flow generates three types
|
30
|
+
of summaries: detailed (n=20), extractive (n=10), and key facts (n=50), each producing corresponding QA pairs designed to help LLMs internalize document knowledge for knowledge tuning.'
|
31
|
+
output_columns:
|
32
|
+
- question
|
33
|
+
- response
|
34
|
+
- raw_document
|
35
|
+
- faithfulness_explanation
|
36
|
+
- faithfulness_judgment
|
37
|
+
id: stellar-peak-605
|
38
|
+
blocks:
|
39
|
+
- block_type: DuplicateColumnsBlock
|
40
|
+
block_config:
|
41
|
+
block_name: duplicate_document_col
|
42
|
+
input_cols:
|
43
|
+
document: base_document
|
44
|
+
- block_type: PromptBuilderBlock
|
45
|
+
block_config:
|
46
|
+
block_name: question_generation_prompt
|
47
|
+
input_cols:
|
48
|
+
- domain
|
49
|
+
- document
|
50
|
+
- document_outline
|
51
|
+
- icl_document
|
52
|
+
- icl_query_1
|
53
|
+
- icl_query_2
|
54
|
+
- icl_query_3
|
55
|
+
output_cols: question_generation_prompt
|
56
|
+
prompt_config_path: ../generate_question_list.yaml
|
57
|
+
format_as_messages: true
|
58
|
+
- block_type: LLMChatBlock
|
59
|
+
block_config:
|
60
|
+
block_name: question_generation
|
61
|
+
input_cols: question_generation_prompt
|
62
|
+
output_cols: question_list
|
63
|
+
max_tokens: 256
|
64
|
+
temperature: 1.0
|
65
|
+
n: 1
|
66
|
+
async_mode: true
|
67
|
+
- block_type: LLMParserBlock
|
68
|
+
block_config:
|
69
|
+
block_name: extract_questions
|
70
|
+
input_cols: question_list
|
71
|
+
extract_content: true
|
72
|
+
expand_lists: true
|
73
|
+
- block_type: TextParserBlock
|
74
|
+
block_config:
|
75
|
+
block_name: parse_question_list
|
76
|
+
input_cols: extract_questions_content
|
77
|
+
output_cols: question
|
78
|
+
start_tags:
|
79
|
+
- '[QUESTION]'
|
80
|
+
end_tags:
|
81
|
+
- '[END]'
|
82
|
+
- block_type: PromptBuilderBlock
|
83
|
+
block_config:
|
84
|
+
block_name: answer_generation_prompt
|
85
|
+
input_cols:
|
86
|
+
- question
|
87
|
+
- document
|
88
|
+
- document_outline
|
89
|
+
output_cols: answer_generation_prompt
|
90
|
+
prompt_config_path: ../generate_answers.yaml
|
91
|
+
format_as_messages: true
|
92
|
+
- block_type: LLMChatBlock
|
93
|
+
block_config:
|
94
|
+
block_name: answer_generation
|
95
|
+
input_cols: answer_generation_prompt
|
96
|
+
output_cols: response_dict
|
97
|
+
max_tokens: 4096
|
98
|
+
temperature: 1.0
|
99
|
+
n: 1
|
100
|
+
async_mode: true
|
101
|
+
- block_type: LLMParserBlock
|
102
|
+
block_config:
|
103
|
+
block_name: extract_answer
|
104
|
+
input_cols: response_dict
|
105
|
+
extract_content: true
|
106
|
+
expand_lists: true
|
107
|
+
- block_type: TextParserBlock
|
108
|
+
block_config:
|
109
|
+
block_name: parse_response_dict
|
110
|
+
input_cols: extract_answer_content
|
111
|
+
output_cols: response
|
112
|
+
start_tags:
|
113
|
+
- ''
|
114
|
+
end_tags:
|
115
|
+
- ''
|
116
|
+
save_reasoning_content: true
|
117
|
+
- block_type: PromptBuilderBlock
|
118
|
+
block_config:
|
119
|
+
block_name: eval_faithful_prompt
|
120
|
+
input_cols:
|
121
|
+
- document
|
122
|
+
- response
|
123
|
+
output_cols: eval_faithful_prompt
|
124
|
+
prompt_config_path: ../../multi_summary_qa/instructlab/evaluate_faithfulness.yaml
|
125
|
+
format_as_messages: true
|
126
|
+
- block_type: LLMChatBlock
|
127
|
+
block_config:
|
128
|
+
block_name: eval_faithful_llm_chat
|
129
|
+
input_cols: eval_faithful_prompt
|
130
|
+
output_cols: eval_faithful_response_dict
|
131
|
+
n: 1
|
132
|
+
async_mode: true
|
133
|
+
- block_type: LLMParserBlock
|
134
|
+
block_config:
|
135
|
+
block_name: extract_eval_faithful
|
136
|
+
input_cols: eval_faithful_response_dict
|
137
|
+
extract_content: true
|
138
|
+
|
139
|
+
- block_type: TextParserBlock
|
140
|
+
block_config:
|
141
|
+
block_name: parse_eval_faithful
|
142
|
+
input_cols: extract_eval_faithful_content
|
143
|
+
output_cols:
|
144
|
+
- faithfulness_explanation
|
145
|
+
- faithfulness_judgment
|
146
|
+
start_tags:
|
147
|
+
- '[Start of Explanation]'
|
148
|
+
- '[Start of Answer]'
|
149
|
+
end_tags:
|
150
|
+
- '[End of Explanation]'
|
151
|
+
- '[End of Answer]'
|
152
|
+
- block_type: ColumnValueFilterBlock
|
153
|
+
block_config:
|
154
|
+
block_name: eval_faithful_filter
|
155
|
+
input_cols:
|
156
|
+
- faithfulness_judgment
|
157
|
+
filter_value: 'YES'
|
158
|
+
operation: eq
|
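sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/flow.yaml
CHANGED
@@ -19,7 +19,6 @@ metadata: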
|
|
19
19
|
- qa-pairs
|
20
20
|
- extractive-summaries
|
21
21
|
license: Apache-2.0
|
22
|
-
min_sdg_hub_version: 0.2.0
|
23
22
|
dataset_requirements:
|
24
23
|
required_columns:
|
25
24
|
- document
|
@@ -63,10 +62,16 @@ blocks:
|
|
63
62
|
temperature: 0.7
|
64
63
|
n: 50
|
65
64
|
async_mode: true
|
65
|
+
- block_type: LLMParserBlock
|
66
|
+
block_config:
|
67
|
+
block_name: extract_extractive_summary
|
68
|
+
input_cols: raw_summary
|
69
|
+
extract_content: true
|
70
|
+
expand_lists: true
|
66
71
|
- block_type: TextParserBlock
|
67
72
|
block_config:
|
68
73
|
block_name: parse_extractive_summary
|
69
|
-
input_cols: raw_summary
|
74
|
+
input_cols: extract_extractive_summary_content
|
70
75
|
output_cols: summary
|
71
76
|
start_tags:
|
72
77
|
- ''
|
@@ -101,10 +106,16 @@ blocks:
|
|
101
106
|
temperature: 0.7
|
102
107
|
n: 1
|
103
108
|
async_mode: true
|
109
|
+
- block_type: LLMParserBlock
|
110
|
+
block_config:
|
111
|
+
block_name: extract_questions
|
112
|
+
input_cols: question_list
|
113
|
+
extract_content: true
|
114
|
+
expand_lists: true
|
104
115
|
- block_type: TextParserBlock
|
105
116
|
block_config:
|
106
117
|
block_name: parse_question_list
|
107
|
-
input_cols: question_list
|
118
|
+
input_cols: extract_questions_content
|
108
119
|
output_cols: question
|
109
120
|
start_tags:
|
110
121
|
- '[QUESTION]'
|
@@ -129,33 +140,61 @@ blocks:
|
|
129
140
|
temperature: 0.7
|
130
141
|
n: 1
|
131
142
|
async_mode: true
|
143
|
+
- block_type: LLMParserBlock
|
144
|
+
block_config:
|
145
|
+
block_name: extract_answers
|
146
|
+
input_cols: response_dict
|
147
|
+
extract_content: true
|
148
|
+
expand_lists: true
|
132
149
|
- block_type: TextParserBlock
|
133
150
|
block_config:
|
134
151
|
block_name: parse_response_dict
|
135
|
-
input_cols: response_dict
|
152
|
+
input_cols: extract_answers_content
|
136
153
|
output_cols: response
|
137
154
|
start_tags:
|
138
155
|
- ''
|
139
156
|
end_tags:
|
140
157
|
- ''
|
141
158
|
save_reasoning_content: true
|
142
|
-
- block_type: EvaluateFaithfulnessBlock
|
159
|
+
- block_type: PromptBuilderBlock
|
143
160
|
block_config:
|
144
|
-
block_name:
|
161
|
+
block_name: eval_faithful_prompt
|
145
162
|
input_cols:
|
146
163
|
- document
|
147
164
|
- response
|
148
|
-
output_cols:
|
149
|
-
- faithfulness_explanation
|
150
|
-
- faithfulness_judgment
|
165
|
+
output_cols: eval_faithful_prompt
|
151
166
|
prompt_config_path: ../../multi_summary_qa/instructlab/evaluate_faithfulness.yaml
|
152
|
-
filter_value: 'YES'
|
153
|
-
operation: eq
|
154
|
-
async_mode: true
|
155
167
|
format_as_messages: true
|
168
|
+
- block_type: LLMChatBlock
|
169
|
+
block_config:
|
170
|
+
block_name: eval_faithful_llm_chat
|
171
|
+
input_cols: eval_faithful_prompt
|
172
|
+
output_cols: eval_faithful_response_dict
|
173
|
+
n: 1
|
174
|
+
async_mode: true
|
175
|
+
- block_type: LLMParserBlock
|
176
|
+
block_config:
|
177
|
+
block_name: extract_eval_faithful
|
178
|
+
input_cols: eval_faithful_response_dict
|
179
|
+
extract_content: true
|
180
|
+
|
181
|
+
- block_type: TextParserBlock
|
182
|
+
block_config:
|
183
|
+
block_name: parse_eval_faithful
|
184
|
+
input_cols: extract_eval_faithful_content
|
185
|
+
output_cols:
|
186
|
+
- faithfulness_explanation
|
187
|
+
- faithfulness_judgement
|
156
188
|
start_tags:
|
157
189
|
- '[Start of Explanation]'
|
158
190
|
- '[Start of Answer]'
|
159
191
|
end_tags:
|
160
192
|
- '[End of Explanation]'
|
161
193
|
- '[End of Answer]'
|
194
|
+
- block_type: ColumnValueFilterBlock
|
195
|
+
block_config:
|
196
|
+
block_name: eval_faithful_filter
|
197
|
+
input_cols:
|
198
|
+
- faithfulness_judgement
|
199
|
+
filter_value: 'YES'
|
200
|
+
operation: eq
|