sdg-hub 0.1.4__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sdg_hub/__init__.py +28 -1
- sdg_hub/_version.py +2 -2
- sdg_hub/core/__init__.py +22 -0
- sdg_hub/core/blocks/__init__.py +58 -0
- sdg_hub/core/blocks/base.py +313 -0
- sdg_hub/core/blocks/deprecated_blocks/__init__.py +29 -0
- sdg_hub/core/blocks/deprecated_blocks/combine_columns.py +93 -0
- sdg_hub/core/blocks/deprecated_blocks/duplicate_columns.py +88 -0
- sdg_hub/core/blocks/deprecated_blocks/filter_by_value.py +103 -0
- sdg_hub/core/blocks/deprecated_blocks/flatten_columns.py +94 -0
- sdg_hub/core/blocks/deprecated_blocks/llmblock.py +479 -0
- sdg_hub/core/blocks/deprecated_blocks/rename_columns.py +88 -0
- sdg_hub/core/blocks/deprecated_blocks/sample_populator.py +58 -0
- sdg_hub/core/blocks/deprecated_blocks/selector.py +97 -0
- sdg_hub/core/blocks/deprecated_blocks/set_to_majority_value.py +88 -0
- sdg_hub/core/blocks/evaluation/__init__.py +9 -0
- sdg_hub/core/blocks/evaluation/evaluate_faithfulness_block.py +564 -0
- sdg_hub/core/blocks/evaluation/evaluate_relevancy_block.py +564 -0
- sdg_hub/core/blocks/evaluation/verify_question_block.py +564 -0
- sdg_hub/core/blocks/filtering/__init__.py +12 -0
- sdg_hub/core/blocks/filtering/column_value_filter.py +188 -0
- sdg_hub/core/blocks/llm/__init__.py +27 -0
- sdg_hub/core/blocks/llm/client_manager.py +398 -0
- sdg_hub/core/blocks/llm/config.py +336 -0
- sdg_hub/core/blocks/llm/error_handler.py +368 -0
- sdg_hub/core/blocks/llm/llm_chat_block.py +542 -0
- sdg_hub/core/blocks/llm/llm_chat_with_parsing_retry_block.py +491 -0
- sdg_hub/core/blocks/llm/prompt_builder_block.py +368 -0
- sdg_hub/core/blocks/llm/text_parser_block.py +357 -0
- sdg_hub/core/blocks/registry.py +331 -0
- sdg_hub/core/blocks/transform/__init__.py +23 -0
- sdg_hub/core/blocks/transform/duplicate_columns.py +88 -0
- sdg_hub/core/blocks/transform/index_based_mapper.py +225 -0
- sdg_hub/core/blocks/transform/melt_columns.py +126 -0
- sdg_hub/core/blocks/transform/rename_columns.py +69 -0
- sdg_hub/core/blocks/transform/text_concat.py +102 -0
- sdg_hub/core/blocks/transform/uniform_col_val_setter.py +101 -0
- sdg_hub/core/flow/__init__.py +20 -0
- sdg_hub/core/flow/base.py +1209 -0
- sdg_hub/core/flow/checkpointer.py +333 -0
- sdg_hub/core/flow/metadata.py +389 -0
- sdg_hub/core/flow/migration.py +198 -0
- sdg_hub/core/flow/registry.py +393 -0
- sdg_hub/core/flow/validation.py +277 -0
- sdg_hub/{utils → core/utils}/__init__.py +7 -4
- sdg_hub/core/utils/datautils.py +63 -0
- sdg_hub/core/utils/error_handling.py +208 -0
- sdg_hub/core/utils/flow_id_words.yaml +231 -0
- sdg_hub/core/utils/flow_identifier.py +94 -0
- sdg_hub/{utils → core/utils}/path_resolution.py +2 -2
- sdg_hub/core/utils/yaml_utils.py +59 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/atomic_facts.yaml +40 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/detailed_summary.yaml +13 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_faithfulness.yaml +64 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_question.yaml +29 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_relevancy.yaml +81 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/extractive_summary.yaml +13 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +192 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/generate_questions_responses.yaml +54 -0
- sdg_hub-0.2.1.dist-info/METADATA +221 -0
- sdg_hub-0.2.1.dist-info/RECORD +68 -0
- sdg_hub/blocks/__init__.py +0 -42
- sdg_hub/blocks/block.py +0 -96
- sdg_hub/blocks/llmblock.py +0 -375
- sdg_hub/blocks/openaichatblock.py +0 -556
- sdg_hub/blocks/utilblocks.py +0 -597
- sdg_hub/checkpointer.py +0 -139
- sdg_hub/configs/annotations/cot_reflection.yaml +0 -34
- sdg_hub/configs/annotations/detailed_annotations.yaml +0 -28
- sdg_hub/configs/annotations/detailed_description.yaml +0 -10
- sdg_hub/configs/annotations/detailed_description_icl.yaml +0 -32
- sdg_hub/configs/annotations/simple_annotations.yaml +0 -9
- sdg_hub/configs/knowledge/__init__.py +0 -0
- sdg_hub/configs/knowledge/atomic_facts.yaml +0 -46
- sdg_hub/configs/knowledge/auxilary_instructions.yaml +0 -35
- sdg_hub/configs/knowledge/detailed_summary.yaml +0 -18
- sdg_hub/configs/knowledge/evaluate_faithfulness.yaml +0 -68
- sdg_hub/configs/knowledge/evaluate_question.yaml +0 -38
- sdg_hub/configs/knowledge/evaluate_relevancy.yaml +0 -84
- sdg_hub/configs/knowledge/extractive_summary.yaml +0 -18
- sdg_hub/configs/knowledge/generate_code_questions_responses.yaml +0 -39
- sdg_hub/configs/knowledge/generate_questions.yaml +0 -82
- sdg_hub/configs/knowledge/generate_questions_responses.yaml +0 -56
- sdg_hub/configs/knowledge/generate_responses.yaml +0 -86
- sdg_hub/configs/knowledge/mcq_generation.yaml +0 -83
- sdg_hub/configs/knowledge/router.yaml +0 -12
- sdg_hub/configs/knowledge/simple_generate_qa.yaml +0 -34
- sdg_hub/configs/reasoning/__init__.py +0 -0
- sdg_hub/configs/reasoning/dynamic_cot.yaml +0 -40
- sdg_hub/configs/skills/__init__.py +0 -0
- sdg_hub/configs/skills/analyzer.yaml +0 -48
- sdg_hub/configs/skills/annotation.yaml +0 -36
- sdg_hub/configs/skills/contexts.yaml +0 -28
- sdg_hub/configs/skills/critic.yaml +0 -60
- sdg_hub/configs/skills/evaluate_freeform_pair.yaml +0 -111
- sdg_hub/configs/skills/evaluate_freeform_questions.yaml +0 -78
- sdg_hub/configs/skills/evaluate_grounded_pair.yaml +0 -119
- sdg_hub/configs/skills/evaluate_grounded_questions.yaml +0 -51
- sdg_hub/configs/skills/freeform_questions.yaml +0 -34
- sdg_hub/configs/skills/freeform_responses.yaml +0 -39
- sdg_hub/configs/skills/grounded_questions.yaml +0 -38
- sdg_hub/configs/skills/grounded_responses.yaml +0 -59
- sdg_hub/configs/skills/icl_examples/STEM.yaml +0 -56
- sdg_hub/configs/skills/icl_examples/__init__.py +0 -0
- sdg_hub/configs/skills/icl_examples/coding.yaml +0 -97
- sdg_hub/configs/skills/icl_examples/extraction.yaml +0 -36
- sdg_hub/configs/skills/icl_examples/humanities.yaml +0 -71
- sdg_hub/configs/skills/icl_examples/math.yaml +0 -85
- sdg_hub/configs/skills/icl_examples/reasoning.yaml +0 -30
- sdg_hub/configs/skills/icl_examples/roleplay.yaml +0 -45
- sdg_hub/configs/skills/icl_examples/writing.yaml +0 -80
- sdg_hub/configs/skills/judge.yaml +0 -53
- sdg_hub/configs/skills/planner.yaml +0 -67
- sdg_hub/configs/skills/respond.yaml +0 -8
- sdg_hub/configs/skills/revised_responder.yaml +0 -78
- sdg_hub/configs/skills/router.yaml +0 -59
- sdg_hub/configs/skills/simple_generate_qa_freeform.yaml +0 -27
- sdg_hub/configs/skills/simple_generate_qa_grounded.yaml +0 -31
- sdg_hub/flow.py +0 -477
- sdg_hub/flow_runner.py +0 -450
- sdg_hub/flows/generation/knowledge/mmlu_bench.yaml +0 -13
- sdg_hub/flows/generation/knowledge/simple_knowledge.yaml +0 -12
- sdg_hub/flows/generation/knowledge/synth_knowledge.yaml +0 -89
- sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +0 -136
- sdg_hub/flows/generation/skills/improve_responses.yaml +0 -103
- sdg_hub/flows/generation/skills/simple_freeform_skill.yaml +0 -12
- sdg_hub/flows/generation/skills/simple_grounded_skill.yaml +0 -12
- sdg_hub/flows/generation/skills/synth_grounded_skills.yaml +0 -80
- sdg_hub/flows/generation/skills/synth_skills.yaml +0 -59
- sdg_hub/pipeline.py +0 -121
- sdg_hub/prompts.py +0 -80
- sdg_hub/registry.py +0 -122
- sdg_hub/sdg.py +0 -206
- sdg_hub/utils/config_validation.py +0 -91
- sdg_hub/utils/datautils.py +0 -14
- sdg_hub/utils/error_handling.py +0 -94
- sdg_hub/utils/validation_result.py +0 -10
- sdg_hub-0.1.4.dist-info/METADATA +0 -190
- sdg_hub-0.1.4.dist-info/RECORD +0 -89
- sdg_hub/{logger_config.py → core/utils/logger_config.py} +1 -1
- /sdg_hub/{configs/__init__.py → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/README.md} +0 -0
- /sdg_hub/{configs/annotations → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab}/__init__.py +0 -0
- {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.1.dist-info}/WHEEL +0 -0
- {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.1.dist-info}/licenses/LICENSE +0 -0
- {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,63 @@
|
|
1
|
+
# Third Party
|
2
|
+
from datasets import Dataset, concatenate_datasets
|
3
|
+
|
4
|
+
# Local
|
5
|
+
from .error_handling import FlowValidationError
|
6
|
+
|
7
|
+
|
8
|
+
def safe_concatenate_datasets(datasets: list):
    """Concatenate the usable entries of ``datasets``.

    Entries that are ``None`` or report ``num_rows`` of zero are dropped
    before concatenation.

    Parameters
    ----------
    datasets : list
        Candidate datasets; individual entries may be ``None``.

    Returns
    -------
    The result of ``concatenate_datasets`` over the remaining entries, or
    ``None`` when nothing remains after filtering.
    """
    non_empty = []
    for candidate in datasets:
        # Skip placeholders and datasets without any rows.
        if candidate is not None and candidate.num_rows > 0:
            non_empty.append(candidate)

    return concatenate_datasets(non_empty) if non_empty else None
|
16
|
+
|
17
|
+
|
18
|
+
def safe_concatenate_with_validation(
    datasets: list, context: str = "datasets"
) -> Dataset:
    """Concatenate datasets, reporting failures as ``FlowValidationError``.

    Parameters
    ----------
    datasets : list[Dataset]
        Datasets to concatenate. ``None`` entries and entries of length
        zero are ignored.
    context : str
        Short description of what is being concatenated; it is embedded in
        the error messages.

    Returns
    -------
    Dataset
        The single remaining dataset (returned as-is) when only one entry
        survives filtering, otherwise the concatenation of all survivors.

    Raises
    ------
    FlowValidationError
        If no entry survives filtering, or if ``concatenate_datasets``
        raises for any reason (reported as a schema mismatch together with
        the column names of every surviving dataset).
    """
    usable = []
    for candidate in datasets:
        if candidate is not None and len(candidate) > 0:
            usable.append(candidate)

    if len(usable) == 0:
        raise FlowValidationError(f"No valid datasets to concatenate in {context}")

    # Nothing to merge: hand back the lone dataset untouched.
    if len(usable) == 1:
        return usable[0]

    try:
        return concatenate_datasets(usable)
    except Exception as e:
        # Any concatenation failure is surfaced with per-dataset column
        # names so the caller can spot the incompatible input.
        schema_details = "\n".join(
            f"Dataset {i}: columns={ds.column_names}" for i, ds in enumerate(usable)
        )
        raise FlowValidationError(
            f"Schema mismatch when concatenating {context}. "
            f"All datasets must have compatible schemas (same columns/types). "
            f"Original error: {e}\n"
            f"Dataset schemas:\n{schema_details}"
        ) from e
|
@@ -0,0 +1,208 @@
|
|
1
|
+
"""Custom exception classes for SDG Hub error handling."""
|
2
|
+
|
3
|
+
# Standard
|
4
|
+
from typing import Optional
|
5
|
+
|
6
|
+
|
7
|
+
class SDGHubError(Exception):
    """Root of the SDG Hub exception hierarchy.

    The original ``message`` and optional ``details`` are kept as
    attributes; the text passed to ``Exception`` combines both.
    """

    def __init__(self, message: str, details: Optional[str] = None):
        """Store the message parts and build the combined exception text.

        Parameters
        ----------
        message : str
            The main error message.
        details : str, optional
            Additional details about the error. When truthy, it is appended
            to the message on a new line prefixed with ``Details:``.
        """
        self.message = message
        self.details = details
        # Falsy details (None or "") leave the message unchanged.
        super().__init__(f"{message}\nDetails: {details}" if details else message)
|
26
|
+
|
27
|
+
|
28
|
+
class FlowRunnerError(SDGHubError):
    """Common ancestor of every flow runner error."""
|
32
|
+
|
33
|
+
|
34
|
+
class DatasetLoadError(FlowRunnerError):
    """Signals that a dataset could not be loaded."""
|
38
|
+
|
39
|
+
|
40
|
+
class FlowConfigurationError(FlowRunnerError):
    """Signals an invalid flow configuration."""
|
44
|
+
|
45
|
+
|
46
|
+
class APIConnectionError(FlowRunnerError):
    """Signals a failed API connection."""
|
50
|
+
|
51
|
+
|
52
|
+
class DataGenerationError(FlowRunnerError):
    """Signals that data generation failed."""
|
56
|
+
|
57
|
+
|
58
|
+
class DataSaveError(FlowRunnerError):
    """Signals that generated data could not be saved."""
|
62
|
+
|
63
|
+
|
64
|
+
class BlockError(SDGHubError):
    """Common ancestor of every block-related error."""
|
68
|
+
|
69
|
+
|
70
|
+
class BlockConfigurationError(BlockError):
    """Signals an invalid block configuration."""
|
74
|
+
|
75
|
+
|
76
|
+
class BlockExecutionError(BlockError):
    """Signals that executing a block failed."""
|
80
|
+
|
81
|
+
|
82
|
+
class BlockValidationError(BlockError):
    """Common ancestor of every block validation error."""
|
86
|
+
|
87
|
+
|
88
|
+
class MissingColumnError(BlockValidationError):
    """Raised when required input columns are missing from dataset.

    The constructor arguments are kept on the instance as ``block_name``,
    ``missing_columns`` and ``available_columns``.
    """

    def __init__(
        self, block_name: str, missing_columns: list[str], available_columns: list[str]
    ):
        """Initialize MissingColumnError.

        Parameters
        ----------
        block_name : str
            Name of the block that failed validation.
        missing_columns : List[str]
            List of missing column names.
        available_columns : List[str]
            List of available column names in the dataset.
        """
        self.block_name = block_name
        self.missing_columns = missing_columns
        self.available_columns = available_columns

        message = (
            f"Block '{block_name}' missing required input columns: {missing_columns}"
        )
        details = f"Available columns: {available_columns}"

        super().__init__(message, details)

    def __reduce__(self):
        """Describe how to rebuild this exception for ``pickle`` and ``copy``.

        Returns
        -------
        tuple
            ``(class, constructor_args, instance_state)``.
        """
        # The default BaseException.__reduce__ replays ``self.args`` (the single
        # formatted message), which does not match this three-argument
        # constructor and fails with TypeError on unpickling/copying.
        return (
            type(self),
            (self.block_name, self.missing_columns, self.available_columns),
            self.__dict__,
        )
|
115
|
+
|
116
|
+
|
117
|
+
class EmptyDatasetError(BlockValidationError):
    """Raised when an empty dataset is provided to a block.

    The offending block's name is kept on the instance as ``block_name``.
    """

    def __init__(self, block_name: str):
        """Initialize EmptyDatasetError.

        Parameters
        ----------
        block_name : str
            Name of the block that received the empty dataset.
        """
        self.block_name = block_name

        message = f"Block '{block_name}' received an empty dataset"
        details = "Dataset must contain at least one sample for processing"

        super().__init__(message, details)

    def __reduce__(self):
        """Describe how to rebuild this exception for ``pickle`` and ``copy``.

        Returns
        -------
        tuple
            ``(class, constructor_args, instance_state)``.
        """
        # The default BaseException.__reduce__ replays ``self.args`` (the
        # formatted message) as the constructor argument, so a round trip
        # would treat the whole message as the block name and nest it inside
        # a second message. Replay the real block name instead.
        return (type(self), (self.block_name,), self.__dict__)
|
134
|
+
|
135
|
+
|
136
|
+
class OutputColumnCollisionError(BlockValidationError):
    """Raised when output columns would overwrite existing dataset columns.

    The constructor arguments are kept on the instance as ``block_name``,
    ``collision_columns`` and ``existing_columns``.
    """

    def __init__(
        self, block_name: str, collision_columns: list[str], existing_columns: list[str]
    ):
        """Initialize OutputColumnCollisionError.

        Parameters
        ----------
        block_name : str
            Name of the block that has column collisions.
        collision_columns : List[str]
            List of output columns that collide with existing columns.
        existing_columns : List[str]
            List of existing column names in the dataset.
        """
        self.block_name = block_name
        self.collision_columns = collision_columns
        self.existing_columns = existing_columns

        message = f"Block '{block_name}' output columns would overwrite existing data: {collision_columns}"
        details = f"Existing columns: {existing_columns}"

        super().__init__(message, details)

    def __reduce__(self):
        """Describe how to rebuild this exception for ``pickle`` and ``copy``.

        Returns
        -------
        tuple
            ``(class, constructor_args, instance_state)``.
        """
        # The default BaseException.__reduce__ replays ``self.args`` (the single
        # formatted message), which does not match this three-argument
        # constructor and fails with TypeError on unpickling/copying.
        return (
            type(self),
            (self.block_name, self.collision_columns, self.existing_columns),
            self.__dict__,
        )
|
161
|
+
|
162
|
+
|
163
|
+
class TemplateValidationError(BlockValidationError):
    """Raised when template validation fails due to missing variables.

    The constructor arguments are kept on the instance as ``block_name``,
    ``missing_variables`` and ``available_variables``.
    """

    def __init__(
        self,
        block_name: str,
        missing_variables: list[str],
        available_variables: list[str],
    ):
        """Initialize TemplateValidationError.

        Parameters
        ----------
        block_name : str
            Name of the block that failed template validation.
        missing_variables : List[str]
            List of missing template variable names.
        available_variables : List[str]
            List of available template variable names.
        """
        self.block_name = block_name
        self.missing_variables = missing_variables
        self.available_variables = available_variables

        message = f"Block '{block_name}' template validation failed - missing required variables: {missing_variables}"
        details = f"Available variables: {available_variables}"

        super().__init__(message, details)

    def __reduce__(self):
        """Describe how to rebuild this exception for ``pickle`` and ``copy``.

        Returns
        -------
        tuple
            ``(class, constructor_args, instance_state)``.
        """
        # The default BaseException.__reduce__ replays ``self.args`` (the single
        # formatted message), which does not match this three-argument
        # constructor and fails with TypeError on unpickling/copying.
        return (
            type(self),
            (self.block_name, self.missing_variables, self.available_variables),
            self.__dict__,
        )
|
191
|
+
|
192
|
+
|
193
|
+
class FlowError(SDGHubError):
    """Common ancestor of every flow-related error."""
|
197
|
+
|
198
|
+
|
199
|
+
class FlowValidationError(FlowError):
    """Signals that validating a flow failed."""
|
203
|
+
|
204
|
+
|
205
|
+
class FlowExecutionError(FlowError):
    """Signals that executing a flow failed."""
|
@@ -0,0 +1,231 @@
|
|
1
|
+
# Flow ID word lists for wandb-style deterministic generation
|
2
|
+
# Format: adjective-noun-number (e.g., "bright-river-123")
|
3
|
+
|
4
|
+
adjectives:
|
5
|
+
- able
|
6
|
+
- ancient
|
7
|
+
- autumn
|
8
|
+
- bold
|
9
|
+
- brave
|
10
|
+
- bright
|
11
|
+
- calm
|
12
|
+
- clean
|
13
|
+
- clever
|
14
|
+
- cool
|
15
|
+
- cosmic
|
16
|
+
- daily
|
17
|
+
- dark
|
18
|
+
- deep
|
19
|
+
- divine
|
20
|
+
- dry
|
21
|
+
- eager
|
22
|
+
- early
|
23
|
+
- earnest
|
24
|
+
- easy
|
25
|
+
- epic
|
26
|
+
- even
|
27
|
+
- exact
|
28
|
+
- fair
|
29
|
+
- fast
|
30
|
+
- fine
|
31
|
+
- firm
|
32
|
+
- first
|
33
|
+
- fresh
|
34
|
+
- full
|
35
|
+
- gentle
|
36
|
+
- glad
|
37
|
+
- golden
|
38
|
+
- good
|
39
|
+
- great
|
40
|
+
- green
|
41
|
+
- happy
|
42
|
+
- hard
|
43
|
+
- heavy
|
44
|
+
- high
|
45
|
+
- holy
|
46
|
+
- huge
|
47
|
+
- jolly
|
48
|
+
- keen
|
49
|
+
- kind
|
50
|
+
- large
|
51
|
+
- late
|
52
|
+
- light
|
53
|
+
- live
|
54
|
+
- long
|
55
|
+
- loud
|
56
|
+
- lucky
|
57
|
+
- major
|
58
|
+
- mild
|
59
|
+
- new
|
60
|
+
- nice
|
61
|
+
- noble
|
62
|
+
- old
|
63
|
+
- open
|
64
|
+
- plain
|
65
|
+
- proud
|
66
|
+
- pure
|
67
|
+
- quick
|
68
|
+
- quiet
|
69
|
+
- rapid
|
70
|
+
- rare
|
71
|
+
- real
|
72
|
+
- rich
|
73
|
+
- right
|
74
|
+
- rough
|
75
|
+
- round
|
76
|
+
- safe
|
77
|
+
- sharp
|
78
|
+
- short
|
79
|
+
- simple
|
80
|
+
- slow
|
81
|
+
- small
|
82
|
+
- smart
|
83
|
+
- smooth
|
84
|
+
- soft
|
85
|
+
- solid
|
86
|
+
- strong
|
87
|
+
- sure
|
88
|
+
- swift
|
89
|
+
- tall
|
90
|
+
- thick
|
91
|
+
- thin
|
92
|
+
- tiny
|
93
|
+
- vast
|
94
|
+
- warm
|
95
|
+
- weak
|
96
|
+
- whole
|
97
|
+
- wide
|
98
|
+
- wild
|
99
|
+
- wise
|
100
|
+
- young
|
101
|
+
- exalted
|
102
|
+
- legendary
|
103
|
+
- resilient
|
104
|
+
- vibrant
|
105
|
+
- stellar
|
106
|
+
- graceful
|
107
|
+
- radiant
|
108
|
+
- serene
|
109
|
+
- brilliant
|
110
|
+
- majestic
|
111
|
+
- elegant
|
112
|
+
|
113
|
+
nouns:
|
114
|
+
- abyss
|
115
|
+
- angel
|
116
|
+
- arrow
|
117
|
+
- atom
|
118
|
+
- ball
|
119
|
+
- band
|
120
|
+
- bark
|
121
|
+
- beam
|
122
|
+
- bear
|
123
|
+
- bell
|
124
|
+
- bird
|
125
|
+
- bloom
|
126
|
+
- blue
|
127
|
+
- boat
|
128
|
+
- bone
|
129
|
+
- book
|
130
|
+
- brook
|
131
|
+
- brush
|
132
|
+
- calm
|
133
|
+
- cave
|
134
|
+
- cell
|
135
|
+
- chant
|
136
|
+
- chord
|
137
|
+
- clay
|
138
|
+
- cliff
|
139
|
+
- cloud
|
140
|
+
- coal
|
141
|
+
- coast
|
142
|
+
- coin
|
143
|
+
- colt
|
144
|
+
- coral
|
145
|
+
- core
|
146
|
+
- creek
|
147
|
+
- crop
|
148
|
+
- crown
|
149
|
+
- cube
|
150
|
+
- dawn
|
151
|
+
- day
|
152
|
+
- dew
|
153
|
+
- disk
|
154
|
+
- dove
|
155
|
+
- dream
|
156
|
+
- drop
|
157
|
+
- dust
|
158
|
+
- eagle
|
159
|
+
- earth
|
160
|
+
- echo
|
161
|
+
- edge
|
162
|
+
- ember
|
163
|
+
- field
|
164
|
+
- fire
|
165
|
+
- fish
|
166
|
+
- flame
|
167
|
+
- flight
|
168
|
+
- flow
|
169
|
+
- foam
|
170
|
+
- fog
|
171
|
+
- forest
|
172
|
+
- frost
|
173
|
+
- glow
|
174
|
+
- gold
|
175
|
+
- grass
|
176
|
+
- grove
|
177
|
+
- haze
|
178
|
+
- heart
|
179
|
+
- hill
|
180
|
+
- ice
|
181
|
+
- iris
|
182
|
+
- jade
|
183
|
+
- lake
|
184
|
+
- land
|
185
|
+
- leaf
|
186
|
+
- light
|
187
|
+
- lion
|
188
|
+
- moon
|
189
|
+
- moss
|
190
|
+
- night
|
191
|
+
- oak
|
192
|
+
- ocean
|
193
|
+
- path
|
194
|
+
- peak
|
195
|
+
- pearl
|
196
|
+
- pine
|
197
|
+
- pond
|
198
|
+
- rain
|
199
|
+
- reef
|
200
|
+
- river
|
201
|
+
- rock
|
202
|
+
- rose
|
203
|
+
- sage
|
204
|
+
- sand
|
205
|
+
- sea
|
206
|
+
- shadow
|
207
|
+
- shore
|
208
|
+
- sky
|
209
|
+
- snow
|
210
|
+
- song
|
211
|
+
- star
|
212
|
+
- stone
|
213
|
+
- storm
|
214
|
+
- stream
|
215
|
+
- sun
|
216
|
+
- sunset
|
217
|
+
- surf
|
218
|
+
- tide
|
219
|
+
- tree
|
220
|
+
- vale
|
221
|
+
- wave
|
222
|
+
- wind
|
223
|
+
- wing
|
224
|
+
- wolf
|
225
|
+
- wood
|
226
|
+
- darkness
|
227
|
+
- meadow
|
228
|
+
- thunder
|
229
|
+
- crystal
|
230
|
+
- valley
|
231
|
+
- mountain
|
@@ -0,0 +1,94 @@
|
|
1
|
+
# Standard
|
2
|
+
from pathlib import Path
|
3
|
+
from typing import Dict, List
|
4
|
+
import hashlib
|
5
|
+
import random
|
6
|
+
|
7
|
+
# Third Party
|
8
|
+
import yaml
|
9
|
+
|
10
|
+
# Cache for loaded word lists to avoid repeated file I/O
|
11
|
+
_WORD_CACHE: Dict[str, List[str]] = {}
|
12
|
+
|
13
|
+
|
14
|
+
def _load_word_lists() -> Dict[str, List[str]]:
    """Load word lists from YAML configuration file.

    The lists are read from ``flow_id_words.yaml`` in this module's
    directory and stored in the module-level ``_WORD_CACHE``; once the cache
    is non-empty it is returned without touching the file again. If the file
    does not exist, a small built-in pair of lists is cached and returned
    instead.

    Returns:
        Dictionary containing 'adjectives' and 'nouns' lists

    Raises:
        yaml.YAMLError: If the YAML file is malformed
        KeyError: If the YAML document is not a mapping or lacks the
            'adjectives' or 'nouns' key
    """
    global _WORD_CACHE

    if _WORD_CACHE:
        return _WORD_CACHE

    # Get path to word list file relative to this module
    words_file = Path(__file__).parent / "flow_id_words.yaml"

    try:
        with open(words_file, "r", encoding="utf-8") as f:
            word_data = yaml.safe_load(f)
    except FileNotFoundError:
        # Fallback to minimal word lists if configuration file is not found
        _WORD_CACHE = {
            "adjectives": ["bright", "calm", "fast", "smart", "quick"],
            "nouns": ["river", "star", "cloud", "moon", "rock"],
        }
        return _WORD_CACHE
    except yaml.YAMLError as e:
        raise yaml.YAMLError(f"Error parsing word list YAML: {e}") from e

    # An empty file parses to None and a top-level list has no keys; report
    # both the same way as a missing key rather than as an opaque TypeError.
    if not isinstance(word_data, dict):
        raise KeyError("Missing required key in word list YAML: 'adjectives'")

    try:
        # The dict is built completely before assignment, so a missing key
        # leaves the cache untouched.
        _WORD_CACHE = {
            "adjectives": word_data["adjectives"],
            "nouns": word_data["nouns"],
        }
    except KeyError as e:
        raise KeyError(f"Missing required key in word list YAML: {e}") from e

    return _WORD_CACHE
|
55
|
+
|
56
|
+
|
57
|
+
def get_flow_identifier(name: str) -> str:
    """Derive a wandb-style identifier from a flow name.

    The identifier has the form "adjective-noun-number". It is produced by
    a random generator seeded from the SHA-256 digest of ``name``, so the
    same name and the same word lists always give the same identifier.

    Args:
        name: Flow name to generate identifier from

    Returns:
        A string in the format "adjective-noun-number" (e.g., "bright-river-123")

    Raises:
        yaml.YAMLError: If the word list YAML file is malformed
        KeyError: If the word list YAML file lacks a required key

    Note:
        A missing word list file does not raise; ``_load_word_lists`` falls
        back to a small built-in vocabulary in that case.
    """
    vocabulary = _load_word_lists()
    adjectives, nouns = vocabulary["adjectives"], vocabulary["nouns"]

    # The first 8 hex digits of the SHA-256 digest seed a private generator.
    digest_prefix = hashlib.sha256(name.encode()).hexdigest()[:8]
    picker = random.Random(int(digest_prefix, 16))

    # Draw order (adjective, noun, number) determines the result; keep it.
    adjective = picker.choice(adjectives)
    noun = picker.choice(nouns)
    number = picker.randint(1, 999)

    return f"{adjective}-{noun}-{number}"
|
@@ -7,11 +7,11 @@ search paths.
|
|
7
7
|
"""
|
8
8
|
|
9
9
|
# Standard
|
10
|
-
from typing import
|
10
|
+
from typing import Union
|
11
11
|
import os
|
12
12
|
|
13
13
|
|
14
|
-
def resolve_path(filename: str, search_dirs: Union[str,
|
14
|
+
def resolve_path(filename: str, search_dirs: Union[str, list[str]]) -> str:
|
15
15
|
"""Resolve a file path relative to one or more search directories.
|
16
16
|
|
17
17
|
Files are checked in the following order:
|
@@ -0,0 +1,59 @@
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
2
|
+
"""YAML utilities for flow configuration."""
|
3
|
+
|
4
|
+
# Standard
|
5
|
+
from pathlib import Path
|
6
|
+
from typing import Any, Dict
|
7
|
+
|
8
|
+
# Third Party
|
9
|
+
import yaml
|
10
|
+
|
11
|
+
# Local
|
12
|
+
from .logger_config import setup_logger
|
13
|
+
|
14
|
+
logger = setup_logger(__name__)
|
15
|
+
|
16
|
+
|
17
|
+
def save_flow_yaml(
    yaml_path: str,
    flow_config: Dict[str, Any],
    reason: str = "",
    sort_keys: bool = False,
    width: int = 240,
    indent: int = 2,
) -> None:
    """
    Write a flow configuration to a YAML file and log the write.

    The file is opened in write mode with UTF-8 encoding and the
    configuration is dumped in block style.

    Parameters
    ----------
    yaml_path : str
        Path to the YAML file to write.
    flow_config : Dict[str, Any]
        Flow configuration to save.
    reason : str, optional
        Reason for saving; when non-empty it is appended to the log message
        in parentheses.
    sort_keys : bool, optional
        Forwarded to ``yaml.dump`` as ``sort_keys``.
    width : int, optional
        Maximum line width for YAML output.
    indent : int, optional
        Indentation level for YAML output.
    """
    yaml_path = str(Path(yaml_path))  # Normalize path

    dump_options = {
        "default_flow_style": False,
        "sort_keys": sort_keys,
        "width": width,
        "indent": indent,
    }
    with open(yaml_path, "w", encoding="utf-8") as f:
        yaml.dump(flow_config, f, **dump_options)

    suffix = f" ({reason})" if reason else ""
    logger.debug(f"Saved flow configuration to YAML: {yaml_path}{suffix}")
|
sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/atomic_facts.yaml
ADDED
@@ -0,0 +1,40 @@
|
|
1
|
+
- role: system
|
2
|
+
content: You are an AI assistant knowledgeable about {{domain}} domain. Be accurate but concise in response.
|
3
|
+
|
4
|
+
- role: user
|
5
|
+
content: |
|
6
|
+
Please break down the following snippet from an article about {{domain}} into atomic facts.
|
7
|
+
|
8
|
+
1. Make sure each fact is grounded in the given text.
|
9
|
+
2. Include any necessary information needed to explain the fact or concept
|
10
|
+
3. The atomic facts should be as simple as possible; if a fact is a compound sentence, break it down one more time.
|
11
|
+
4. For clarity, avoid using pronouns like 'it', 'he', 'she', 'this', 'that' etc., and instead use the full names or titles.
|
12
|
+
5. Focus only on key concepts and facts. Skip any question or problems mentioned in the passage.
|
13
|
+
|
14
|
+
To help you understand the task, here is an example:
|
15
|
+
[Passage]
|
16
|
+
The tournament was contested by ten national teams, maintaining the same format used in 2019. After six weeks of round-robin matches, India, South Africa, Australia, and New Zealand finished as the top four and qualified for the knockout stage. In the knockout stage, India and Australia beat New Zealand and South Africa, respectively, to advance to the final, played on 19 November at the Narendra Modi Stadium in Ahmedabad. Australia won the final by six wickets, winning their sixth Cricket World Cup title.
|
17
|
+
[Facts]
|
18
|
+
1. The tournament was contested by ten national teams.
|
19
|
+
2. The tournament maintained the same format used in 2019.
|
20
|
+
3. The round-robin matches lasted for six weeks.
|
21
|
+
4. India finished as one of the top four teams.
|
22
|
+
5. South Africa finished as one of the top four teams.
|
23
|
+
6. Australia finished as one of the top four teams.
|
24
|
+
7. New Zealand finished as one of the top four teams.
|
25
|
+
8. India, South Africa, Australia, and New Zealand qualified for the knockout stage.
|
26
|
+
9. In the knockout stage, India beat New Zealand.
|
27
|
+
10. In the knockout stage, Australia beat South Africa.
|
28
|
+
11. India advanced to the final.
|
29
|
+
12. Australia advanced to the final.
|
30
|
+
13. The final was played on 19 November.
|
31
|
+
14. The final was held at the Narendra Modi Stadium in Ahmedabad.
|
32
|
+
15. Australia won the final by six wickets.
|
33
|
+
16. Australia won their sixth Cricket World Cup title.
|
34
|
+
[End]
|
35
|
+
|
36
|
+
Now it's your turn: break down the following snippet from an article about {{domain}} into atomic facts, following a similar style to the example above.
|
37
|
+
[Passage]
|
38
|
+
{{document_outline}}
|
39
|
+
{{document}}
|
40
|
+
[Facts]
|