sdg-hub 0.1.4__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (145) hide show
  1. sdg_hub/__init__.py +28 -1
  2. sdg_hub/_version.py +2 -2
  3. sdg_hub/core/__init__.py +22 -0
  4. sdg_hub/core/blocks/__init__.py +58 -0
  5. sdg_hub/core/blocks/base.py +313 -0
  6. sdg_hub/core/blocks/deprecated_blocks/__init__.py +29 -0
  7. sdg_hub/core/blocks/deprecated_blocks/combine_columns.py +93 -0
  8. sdg_hub/core/blocks/deprecated_blocks/duplicate_columns.py +88 -0
  9. sdg_hub/core/blocks/deprecated_blocks/filter_by_value.py +103 -0
  10. sdg_hub/core/blocks/deprecated_blocks/flatten_columns.py +94 -0
  11. sdg_hub/core/blocks/deprecated_blocks/llmblock.py +479 -0
  12. sdg_hub/core/blocks/deprecated_blocks/rename_columns.py +88 -0
  13. sdg_hub/core/blocks/deprecated_blocks/sample_populator.py +58 -0
  14. sdg_hub/core/blocks/deprecated_blocks/selector.py +97 -0
  15. sdg_hub/core/blocks/deprecated_blocks/set_to_majority_value.py +88 -0
  16. sdg_hub/core/blocks/evaluation/__init__.py +9 -0
  17. sdg_hub/core/blocks/evaluation/evaluate_faithfulness_block.py +564 -0
  18. sdg_hub/core/blocks/evaluation/evaluate_relevancy_block.py +564 -0
  19. sdg_hub/core/blocks/evaluation/verify_question_block.py +564 -0
  20. sdg_hub/core/blocks/filtering/__init__.py +12 -0
  21. sdg_hub/core/blocks/filtering/column_value_filter.py +188 -0
  22. sdg_hub/core/blocks/llm/__init__.py +27 -0
  23. sdg_hub/core/blocks/llm/client_manager.py +398 -0
  24. sdg_hub/core/blocks/llm/config.py +336 -0
  25. sdg_hub/core/blocks/llm/error_handler.py +368 -0
  26. sdg_hub/core/blocks/llm/llm_chat_block.py +542 -0
  27. sdg_hub/core/blocks/llm/llm_chat_with_parsing_retry_block.py +491 -0
  28. sdg_hub/core/blocks/llm/prompt_builder_block.py +368 -0
  29. sdg_hub/core/blocks/llm/text_parser_block.py +357 -0
  30. sdg_hub/core/blocks/registry.py +331 -0
  31. sdg_hub/core/blocks/transform/__init__.py +23 -0
  32. sdg_hub/core/blocks/transform/duplicate_columns.py +88 -0
  33. sdg_hub/core/blocks/transform/index_based_mapper.py +225 -0
  34. sdg_hub/core/blocks/transform/melt_columns.py +126 -0
  35. sdg_hub/core/blocks/transform/rename_columns.py +69 -0
  36. sdg_hub/core/blocks/transform/text_concat.py +102 -0
  37. sdg_hub/core/blocks/transform/uniform_col_val_setter.py +101 -0
  38. sdg_hub/core/flow/__init__.py +20 -0
  39. sdg_hub/core/flow/base.py +1209 -0
  40. sdg_hub/core/flow/checkpointer.py +333 -0
  41. sdg_hub/core/flow/metadata.py +389 -0
  42. sdg_hub/core/flow/migration.py +198 -0
  43. sdg_hub/core/flow/registry.py +393 -0
  44. sdg_hub/core/flow/validation.py +277 -0
  45. sdg_hub/{utils → core/utils}/__init__.py +7 -4
  46. sdg_hub/core/utils/datautils.py +63 -0
  47. sdg_hub/core/utils/error_handling.py +208 -0
  48. sdg_hub/core/utils/flow_id_words.yaml +231 -0
  49. sdg_hub/core/utils/flow_identifier.py +94 -0
  50. sdg_hub/{utils → core/utils}/path_resolution.py +2 -2
  51. sdg_hub/core/utils/yaml_utils.py +59 -0
  52. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/atomic_facts.yaml +40 -0
  53. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/detailed_summary.yaml +13 -0
  54. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_faithfulness.yaml +64 -0
  55. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_question.yaml +29 -0
  56. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_relevancy.yaml +81 -0
  57. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/extractive_summary.yaml +13 -0
  58. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +192 -0
  59. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/generate_questions_responses.yaml +54 -0
  60. sdg_hub-0.2.1.dist-info/METADATA +221 -0
  61. sdg_hub-0.2.1.dist-info/RECORD +68 -0
  62. sdg_hub/blocks/__init__.py +0 -42
  63. sdg_hub/blocks/block.py +0 -96
  64. sdg_hub/blocks/llmblock.py +0 -375
  65. sdg_hub/blocks/openaichatblock.py +0 -556
  66. sdg_hub/blocks/utilblocks.py +0 -597
  67. sdg_hub/checkpointer.py +0 -139
  68. sdg_hub/configs/annotations/cot_reflection.yaml +0 -34
  69. sdg_hub/configs/annotations/detailed_annotations.yaml +0 -28
  70. sdg_hub/configs/annotations/detailed_description.yaml +0 -10
  71. sdg_hub/configs/annotations/detailed_description_icl.yaml +0 -32
  72. sdg_hub/configs/annotations/simple_annotations.yaml +0 -9
  73. sdg_hub/configs/knowledge/__init__.py +0 -0
  74. sdg_hub/configs/knowledge/atomic_facts.yaml +0 -46
  75. sdg_hub/configs/knowledge/auxilary_instructions.yaml +0 -35
  76. sdg_hub/configs/knowledge/detailed_summary.yaml +0 -18
  77. sdg_hub/configs/knowledge/evaluate_faithfulness.yaml +0 -68
  78. sdg_hub/configs/knowledge/evaluate_question.yaml +0 -38
  79. sdg_hub/configs/knowledge/evaluate_relevancy.yaml +0 -84
  80. sdg_hub/configs/knowledge/extractive_summary.yaml +0 -18
  81. sdg_hub/configs/knowledge/generate_code_questions_responses.yaml +0 -39
  82. sdg_hub/configs/knowledge/generate_questions.yaml +0 -82
  83. sdg_hub/configs/knowledge/generate_questions_responses.yaml +0 -56
  84. sdg_hub/configs/knowledge/generate_responses.yaml +0 -86
  85. sdg_hub/configs/knowledge/mcq_generation.yaml +0 -83
  86. sdg_hub/configs/knowledge/router.yaml +0 -12
  87. sdg_hub/configs/knowledge/simple_generate_qa.yaml +0 -34
  88. sdg_hub/configs/reasoning/__init__.py +0 -0
  89. sdg_hub/configs/reasoning/dynamic_cot.yaml +0 -40
  90. sdg_hub/configs/skills/__init__.py +0 -0
  91. sdg_hub/configs/skills/analyzer.yaml +0 -48
  92. sdg_hub/configs/skills/annotation.yaml +0 -36
  93. sdg_hub/configs/skills/contexts.yaml +0 -28
  94. sdg_hub/configs/skills/critic.yaml +0 -60
  95. sdg_hub/configs/skills/evaluate_freeform_pair.yaml +0 -111
  96. sdg_hub/configs/skills/evaluate_freeform_questions.yaml +0 -78
  97. sdg_hub/configs/skills/evaluate_grounded_pair.yaml +0 -119
  98. sdg_hub/configs/skills/evaluate_grounded_questions.yaml +0 -51
  99. sdg_hub/configs/skills/freeform_questions.yaml +0 -34
  100. sdg_hub/configs/skills/freeform_responses.yaml +0 -39
  101. sdg_hub/configs/skills/grounded_questions.yaml +0 -38
  102. sdg_hub/configs/skills/grounded_responses.yaml +0 -59
  103. sdg_hub/configs/skills/icl_examples/STEM.yaml +0 -56
  104. sdg_hub/configs/skills/icl_examples/__init__.py +0 -0
  105. sdg_hub/configs/skills/icl_examples/coding.yaml +0 -97
  106. sdg_hub/configs/skills/icl_examples/extraction.yaml +0 -36
  107. sdg_hub/configs/skills/icl_examples/humanities.yaml +0 -71
  108. sdg_hub/configs/skills/icl_examples/math.yaml +0 -85
  109. sdg_hub/configs/skills/icl_examples/reasoning.yaml +0 -30
  110. sdg_hub/configs/skills/icl_examples/roleplay.yaml +0 -45
  111. sdg_hub/configs/skills/icl_examples/writing.yaml +0 -80
  112. sdg_hub/configs/skills/judge.yaml +0 -53
  113. sdg_hub/configs/skills/planner.yaml +0 -67
  114. sdg_hub/configs/skills/respond.yaml +0 -8
  115. sdg_hub/configs/skills/revised_responder.yaml +0 -78
  116. sdg_hub/configs/skills/router.yaml +0 -59
  117. sdg_hub/configs/skills/simple_generate_qa_freeform.yaml +0 -27
  118. sdg_hub/configs/skills/simple_generate_qa_grounded.yaml +0 -31
  119. sdg_hub/flow.py +0 -477
  120. sdg_hub/flow_runner.py +0 -450
  121. sdg_hub/flows/generation/knowledge/mmlu_bench.yaml +0 -13
  122. sdg_hub/flows/generation/knowledge/simple_knowledge.yaml +0 -12
  123. sdg_hub/flows/generation/knowledge/synth_knowledge.yaml +0 -89
  124. sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +0 -136
  125. sdg_hub/flows/generation/skills/improve_responses.yaml +0 -103
  126. sdg_hub/flows/generation/skills/simple_freeform_skill.yaml +0 -12
  127. sdg_hub/flows/generation/skills/simple_grounded_skill.yaml +0 -12
  128. sdg_hub/flows/generation/skills/synth_grounded_skills.yaml +0 -80
  129. sdg_hub/flows/generation/skills/synth_skills.yaml +0 -59
  130. sdg_hub/pipeline.py +0 -121
  131. sdg_hub/prompts.py +0 -80
  132. sdg_hub/registry.py +0 -122
  133. sdg_hub/sdg.py +0 -206
  134. sdg_hub/utils/config_validation.py +0 -91
  135. sdg_hub/utils/datautils.py +0 -14
  136. sdg_hub/utils/error_handling.py +0 -94
  137. sdg_hub/utils/validation_result.py +0 -10
  138. sdg_hub-0.1.4.dist-info/METADATA +0 -190
  139. sdg_hub-0.1.4.dist-info/RECORD +0 -89
  140. sdg_hub/{logger_config.py → core/utils/logger_config.py} +1 -1
  141. /sdg_hub/{configs/__init__.py → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/README.md} +0 -0
  142. /sdg_hub/{configs/annotations → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab}/__init__.py +0 -0
  143. {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.1.dist-info}/WHEEL +0 -0
  144. {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.1.dist-info}/licenses/LICENSE +0 -0
  145. {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,63 @@
1
+ # Third Party
2
+ from datasets import Dataset, concatenate_datasets
3
+
4
+ # Local
5
+ from .error_handling import FlowValidationError
6
+
7
+
8
def safe_concatenate_datasets(datasets: list):
    """Concatenate the usable entries of ``datasets``.

    Entries that are ``None`` or whose ``num_rows`` is zero are dropped before
    concatenation.

    Parameters
    ----------
    datasets : list
        Candidate datasets; individual entries may be ``None``.

    Returns
    -------
    The result of ``concatenate_datasets`` over the remaining entries, or
    ``None`` when no entry survives the filtering.
    """
    usable = []
    for candidate in datasets:
        if candidate is None:
            continue
        if candidate.num_rows > 0:
            usable.append(candidate)

    if usable:
        return concatenate_datasets(usable)
    return None
16
+
17
+
18
def safe_concatenate_with_validation(
    datasets: list, context: str = "datasets"
) -> Dataset:
    """Concatenate datasets, converting failures into ``FlowValidationError``.

    Parameters
    ----------
    datasets : list[Dataset]
        Datasets to concatenate. ``None`` entries and entries of length zero
        are ignored.
    context : str
        Short description of what is being concatenated; embedded in error
        messages.

    Returns
    -------
    Dataset
        The single surviving dataset when only one remains (returned as-is),
        otherwise the concatenation of all surviving datasets.

    Raises
    ------
    FlowValidationError
        If no dataset survives the filtering, or if ``concatenate_datasets``
        raises for any reason (reported as a schema mismatch, with each
        dataset's column names listed).
    """
    non_empty = []
    for candidate in datasets:
        if candidate is not None and len(candidate) > 0:
            non_empty.append(candidate)

    if not non_empty:
        raise FlowValidationError(f"No valid datasets to concatenate in {context}")

    # Nothing to merge: hand back the only dataset untouched.
    if len(non_empty) == 1:
        return non_empty[0]

    try:
        return concatenate_datasets(non_empty)
    except Exception as e:
        # Any concatenation failure is surfaced with per-dataset column names
        # so the caller can spot the incompatible input.
        schema_details = "\n".join(
            f"Dataset {i}: columns={ds.column_names}"
            for i, ds in enumerate(non_empty)
        )
        raise FlowValidationError(
            f"Schema mismatch when concatenating {context}. "
            f"All datasets must have compatible schemas (same columns/types). "
            f"Original error: {e}\n"
            f"Dataset schemas:\n{schema_details}"
        ) from e
@@ -0,0 +1,208 @@
1
+ """Custom exception classes for SDG Hub error handling."""
2
+
3
+ # Standard
4
+ from typing import Optional
5
+
6
+
7
class SDGHubError(Exception):
    """Root of the SDG Hub exception hierarchy.

    Attributes
    ----------
    message : str
        The main error message, exactly as passed in.
    details : str or None
        Optional extra information, exactly as passed in.
    """

    def __init__(self, message: str, details: Optional[str] = None):
        """Store the message parts and build the exception text.

        Parameters
        ----------
        message : str
            The main error message.
        details : str, optional
            Additional details about the error. When truthy, it is appended
            to the exception text on a new line prefixed with ``Details: ``.
        """
        self.message = message
        self.details = details
        rendered = f"{message}\nDetails: {details}" if details else message
        super().__init__(rendered)
26
+
27
+
28
class FlowRunnerError(SDGHubError):
    """Parent class for flow runner exceptions."""
32
+
33
+
34
class DatasetLoadError(FlowRunnerError):
    """Signals that a dataset could not be loaded."""
38
+
39
+
40
class FlowConfigurationError(FlowRunnerError):
    """Signals an invalid flow configuration."""
44
+
45
+
46
class APIConnectionError(FlowRunnerError):
    """Signals a failed API connection."""
50
+
51
+
52
class DataGenerationError(FlowRunnerError):
    """Signals a failure during data generation."""
56
+
57
+
58
class DataSaveError(FlowRunnerError):
    """Signals a failure while saving generated data."""
62
+
63
+
64
class BlockError(SDGHubError):
    """Parent class for block-related exceptions."""
68
+
69
+
70
class BlockConfigurationError(BlockError):
    """Signals an invalid block configuration."""
74
+
75
+
76
class BlockExecutionError(BlockError):
    """Signals a failure while a block was executing."""
80
+
81
+
82
class BlockValidationError(BlockError):
    """Parent class for block validation exceptions."""
86
+
87
+
88
class MissingColumnError(BlockValidationError):
    """Signals that required input columns are absent from the dataset.

    Attributes
    ----------
    block_name : str
        Name of the block that failed validation.
    missing_columns : list[str]
        The column names that were not found.
    available_columns : list[str]
        The column names present in the dataset.
    """

    def __init__(
        self, block_name: str, missing_columns: list[str], available_columns: list[str]
    ):
        """Record the validation context and build the error text.

        Parameters
        ----------
        block_name : str
            Name of the block that failed validation.
        missing_columns : list[str]
            Missing column names.
        available_columns : list[str]
            Column names available in the dataset; reported as details.
        """
        self.block_name = block_name
        self.missing_columns = missing_columns
        self.available_columns = available_columns

        super().__init__(
            f"Block '{block_name}' missing required input columns: {missing_columns}",
            f"Available columns: {available_columns}",
        )
115
+
116
+
117
class EmptyDatasetError(BlockValidationError):
    """Signals that a block was given an empty dataset.

    Attributes
    ----------
    block_name : str
        Name of the block that received the empty dataset.
    """

    def __init__(self, block_name: str):
        """Record the block name and build the error text.

        Parameters
        ----------
        block_name : str
            Name of the block that received the empty dataset.
        """
        self.block_name = block_name

        super().__init__(
            f"Block '{block_name}' received an empty dataset",
            "Dataset must contain at least one sample for processing",
        )
134
+
135
+
136
class OutputColumnCollisionError(BlockValidationError):
    """Signals that a block's output columns would overwrite existing columns.

    Attributes
    ----------
    block_name : str
        Name of the block with the colliding columns.
    collision_columns : list[str]
        Output columns that collide with existing ones.
    existing_columns : list[str]
        Column names already present in the dataset.
    """

    def __init__(
        self, block_name: str, collision_columns: list[str], existing_columns: list[str]
    ):
        """Record the collision context and build the error text.

        Parameters
        ----------
        block_name : str
            Name of the block that has column collisions.
        collision_columns : list[str]
            Output columns that collide with existing columns.
        existing_columns : list[str]
            Existing column names in the dataset; reported as details.
        """
        self.block_name = block_name
        self.collision_columns = collision_columns
        self.existing_columns = existing_columns

        super().__init__(
            f"Block '{block_name}' output columns would overwrite existing data: {collision_columns}",
            f"Existing columns: {existing_columns}",
        )
161
+
162
+
163
class TemplateValidationError(BlockValidationError):
    """Signals that a template is missing required variables.

    Attributes
    ----------
    block_name : str
        Name of the block that failed template validation.
    missing_variables : list[str]
        Template variable names that were not supplied.
    available_variables : list[str]
        Template variable names that were available.
    """

    def __init__(
        self,
        block_name: str,
        missing_variables: list[str],
        available_variables: list[str],
    ):
        """Record the validation context and build the error text.

        Parameters
        ----------
        block_name : str
            Name of the block that failed template validation.
        missing_variables : list[str]
            Missing template variable names.
        available_variables : list[str]
            Available template variable names; reported as details.
        """
        self.block_name = block_name
        self.missing_variables = missing_variables
        self.available_variables = available_variables

        super().__init__(
            f"Block '{block_name}' template validation failed - missing required variables: {missing_variables}",
            f"Available variables: {available_variables}",
        )
191
+
192
+
193
class FlowError(SDGHubError):
    """Parent class for flow-related exceptions."""
197
+
198
+
199
class FlowValidationError(FlowError):
    """Signals that flow validation failed."""
203
+
204
+
205
class FlowExecutionError(FlowError):
    """Signals that flow execution failed."""
@@ -0,0 +1,231 @@
1
+ # Flow ID word lists for wandb-style deterministic generation
2
+ # Format: adjective-noun-number (e.g., "bright-river-123")
3
+
4
+ adjectives:
5
+ - able
6
+ - ancient
7
+ - autumn
8
+ - bold
9
+ - brave
10
+ - bright
11
+ - calm
12
+ - clean
13
+ - clever
14
+ - cool
15
+ - cosmic
16
+ - daily
17
+ - dark
18
+ - deep
19
+ - divine
20
+ - dry
21
+ - eager
22
+ - early
23
+ - earnest
24
+ - easy
25
+ - epic
26
+ - even
27
+ - exact
28
+ - fair
29
+ - fast
30
+ - fine
31
+ - firm
32
+ - first
33
+ - fresh
34
+ - full
35
+ - gentle
36
+ - glad
37
+ - golden
38
+ - good
39
+ - great
40
+ - green
41
+ - happy
42
+ - hard
43
+ - heavy
44
+ - high
45
+ - holy
46
+ - huge
47
+ - jolly
48
+ - keen
49
+ - kind
50
+ - large
51
+ - late
52
+ - light
53
+ - live
54
+ - long
55
+ - loud
56
+ - lucky
57
+ - major
58
+ - mild
59
+ - new
60
+ - nice
61
+ - noble
62
+ - old
63
+ - open
64
+ - plain
65
+ - proud
66
+ - pure
67
+ - quick
68
+ - quiet
69
+ - rapid
70
+ - rare
71
+ - real
72
+ - rich
73
+ - right
74
+ - rough
75
+ - round
76
+ - safe
77
+ - sharp
78
+ - short
79
+ - simple
80
+ - slow
81
+ - small
82
+ - smart
83
+ - smooth
84
+ - soft
85
+ - solid
86
+ - strong
87
+ - sure
88
+ - swift
89
+ - tall
90
+ - thick
91
+ - thin
92
+ - tiny
93
+ - vast
94
+ - warm
95
+ - weak
96
+ - whole
97
+ - wide
98
+ - wild
99
+ - wise
100
+ - young
101
+ - exalted
102
+ - legendary
103
+ - resilient
104
+ - vibrant
105
+ - stellar
106
+ - graceful
107
+ - radiant
108
+ - serene
109
+ - brilliant
110
+ - majestic
111
+ - elegant
112
+
113
+ nouns:
114
+ - abyss
115
+ - angel
116
+ - arrow
117
+ - atom
118
+ - ball
119
+ - band
120
+ - bark
121
+ - beam
122
+ - bear
123
+ - bell
124
+ - bird
125
+ - bloom
126
+ - blue
127
+ - boat
128
+ - bone
129
+ - book
130
+ - brook
131
+ - brush
132
+ - calm
133
+ - cave
134
+ - cell
135
+ - chant
136
+ - chord
137
+ - clay
138
+ - cliff
139
+ - cloud
140
+ - coal
141
+ - coast
142
+ - coin
143
+ - colt
144
+ - coral
145
+ - core
146
+ - creek
147
+ - crop
148
+ - crown
149
+ - cube
150
+ - dawn
151
+ - day
152
+ - dew
153
+ - disk
154
+ - dove
155
+ - dream
156
+ - drop
157
+ - dust
158
+ - eagle
159
+ - earth
160
+ - echo
161
+ - edge
162
+ - ember
163
+ - field
164
+ - fire
165
+ - fish
166
+ - flame
167
+ - flight
168
+ - flow
169
+ - foam
170
+ - fog
171
+ - forest
172
+ - frost
173
+ - glow
174
+ - gold
175
+ - grass
176
+ - grove
177
+ - haze
178
+ - heart
179
+ - hill
180
+ - ice
181
+ - iris
182
+ - jade
183
+ - lake
184
+ - land
185
+ - leaf
186
+ - light
187
+ - lion
188
+ - moon
189
+ - moss
190
+ - night
191
+ - oak
192
+ - ocean
193
+ - path
194
+ - peak
195
+ - pearl
196
+ - pine
197
+ - pond
198
+ - rain
199
+ - reef
200
+ - river
201
+ - rock
202
+ - rose
203
+ - sage
204
+ - sand
205
+ - sea
206
+ - shadow
207
+ - shore
208
+ - sky
209
+ - snow
210
+ - song
211
+ - star
212
+ - stone
213
+ - storm
214
+ - stream
215
+ - sun
216
+ - sunset
217
+ - surf
218
+ - tide
219
+ - tree
220
+ - vale
221
+ - wave
222
+ - wind
223
+ - wing
224
+ - wolf
225
+ - wood
226
+ - darkness
227
+ - meadow
228
+ - thunder
229
+ - crystal
230
+ - valley
231
+ - mountain
@@ -0,0 +1,94 @@
1
+ # Standard
2
+ from pathlib import Path
3
+ from typing import Dict, List
4
+ import hashlib
5
+ import random
6
+
7
+ # Third Party
8
+ import yaml
9
+
10
+ # Cache for loaded word lists to avoid repeated file I/O.
+ # Starts empty; _load_word_lists() fills it with "adjectives" and "nouns"
+ # lists on its first call and returns the cached dict afterwards.
11
+ _WORD_CACHE: Dict[str, List[str]] = {}
12
+
13
+
14
def _load_word_lists() -> Dict[str, List[str]]:
    """Load (and cache) the adjective/noun word lists used for flow IDs.

    The lists are read from ``flow_id_words.yaml`` located next to this
    module. The result is stored in the module-level ``_WORD_CACHE`` so later
    calls return it without touching the file system. When the file does not
    exist, a small built-in fallback is cached and returned instead of
    raising.

    Returns:
        Dictionary containing 'adjectives' and 'nouns' lists

    Raises:
        yaml.YAMLError: If the YAML file is malformed
        KeyError: If the parsed document is not a mapping or lacks the
            'adjectives' or 'nouns' key
    """
    global _WORD_CACHE

    if _WORD_CACHE:
        return _WORD_CACHE

    # Get path to word list file relative to this module
    words_file = Path(__file__).parent / "flow_id_words.yaml"

    # Keep the try body limited to the statements that can raise I/O or
    # parse errors, so unrelated failures are never mislabelled.
    try:
        with open(words_file, "r", encoding="utf-8") as f:
            word_data = yaml.safe_load(f)
    except FileNotFoundError:
        # Fallback to minimal word lists if configuration file is not found
        _WORD_CACHE = {
            "adjectives": ["bright", "calm", "fast", "smart", "quick"],
            "nouns": ["river", "star", "cloud", "moon", "rock"],
        }
        return _WORD_CACHE
    except yaml.YAMLError as e:
        raise yaml.YAMLError(f"Error parsing word list YAML: {e}") from e

    # An empty file parses to None and a top-level list/scalar is not
    # subscriptable by key; report both the same way as a missing key rather
    # than letting an opaque TypeError escape.
    if not isinstance(word_data, dict):
        raise KeyError("Missing required key in word list YAML: 'adjectives'")

    try:
        loaded = {
            "adjectives": word_data["adjectives"],
            "nouns": word_data["nouns"],
        }
    except KeyError as e:
        raise KeyError(f"Missing required key in word list YAML: {e}") from e

    _WORD_CACHE = loaded
    return _WORD_CACHE
55
+
56
+
57
def get_flow_identifier(name: str) -> str:
    """Derive a wandb-style identifier from a flow name.

    The identifier has the form "adjective-noun-number". It is computed from
    a random generator seeded with the first 8 hex digits of the SHA-256
    digest of ``name``, so a given name maps to the same identifier as long
    as the loaded word lists are unchanged.

    Args:
        name: Flow name to generate identifier from

    Returns:
        A string in the format "adjective-noun-number", where the number is
        between 1 and 999 inclusive (e.g., "bright-river-123"; the example is
        illustrative, not an actual output)

    Raises:
        yaml.YAMLError: If the word list YAML file is malformed
        KeyError: If the word list YAML lacks a required key
    """
    vocabulary = _load_word_lists()

    # Seed a private generator from the name so the draws are repeatable and
    # the global random state is left untouched.
    digest = hashlib.sha256(name.encode()).hexdigest()
    rng = random.Random(int(digest[:8], 16))

    # Draw order matters for reproducibility: adjective, noun, then number.
    adjective = rng.choice(vocabulary["adjectives"])
    noun = rng.choice(vocabulary["nouns"])
    number = rng.randint(1, 999)

    return f"{adjective}-{noun}-{number}"
@@ -7,11 +7,11 @@ search paths.
7
7
  """
8
8
 
9
9
  # Standard
10
- from typing import List, Union
10
+ from typing import Union
11
11
  import os
12
12
 
13
13
 
14
- def resolve_path(filename: str, search_dirs: Union[str, List[str]]) -> str:
14
+ def resolve_path(filename: str, search_dirs: Union[str, list[str]]) -> str:
15
15
  """Resolve a file path relative to one or more search directories.
16
16
 
17
17
  Files are checked in the following order:
@@ -0,0 +1,59 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ """YAML utilities for flow configuration."""
3
+
4
+ # Standard
5
+ from pathlib import Path
6
+ from typing import Any, Dict
7
+
8
+ # Third Party
9
+ import yaml
10
+
11
+ # Local
12
+ from .logger_config import setup_logger
13
+
14
+ # Module-level logger, obtained from the setup_logger helper in .logger_config.
+ logger = setup_logger(__name__)
15
+
16
+
17
def save_flow_yaml(
    yaml_path: str,
    flow_config: Dict[str, Any],
    reason: str = "",
    sort_keys: bool = False,
    width: int = 240,
    indent: int = 2,
) -> None:
    """
    Write a flow configuration to a YAML file and log that it was saved.

    The file is opened in write mode with UTF-8 encoding and the
    configuration is dumped in block style (``default_flow_style=False``).

    Parameters
    ----------
    yaml_path : str
        Path to the YAML file to write.
    flow_config : Dict[str, Any]
        Flow configuration to save.
    reason : str, optional
        Reason for saving; when non-empty it is appended in parentheses to
        the debug log message.
    sort_keys : bool, optional
        Passed to ``yaml.dump`` as ``sort_keys``.
    width : int, optional
        Maximum line width for YAML output.
    indent : int, optional
        Indentation level for YAML output.
    """
    normalized_path = str(Path(yaml_path))  # Normalize path

    dump_options = {
        "default_flow_style": False,
        "sort_keys": sort_keys,
        "width": width,
        "indent": indent,
    }
    with open(normalized_path, "w", encoding="utf-8") as stream:
        yaml.dump(flow_config, stream, **dump_options)

    suffix = f" ({reason})" if reason else ""
    logger.debug(f"Saved flow configuration to YAML: {normalized_path}{suffix}")
@@ -0,0 +1,40 @@
1
+ - role: system
2
+ content: You are an AI assistant knowledgeable about {{domain}} domain. Be accurate but concise in response.
3
+
4
+ - role: user
5
+ content: |
6
+ Please break down the following snippet from an article about {{domain}} into atomic facts.
7
+
8
+ 1. Make sure each fact is grounded in the given text.
9
+ 2. Include any necessary information needed to explain the fact or concept
10
+ 3. The atomic facts should be as simple as possible; if a fact is a compound sentence, break it down one more time.
11
+ 4. For clarity, avoid using pronouns like 'it', 'he', 'she', 'this', 'that' etc., and instead use the full names or titles.
12
+ 5. Focus only on key concepts and facts. Skip any question or problems mentioned in the passage.
13
+
14
+ To help you understand the task, here is an example:
15
+ [Passage]
16
+ The tournament was contested by ten national teams, maintaining the same format used in 2019. After six weeks of round-robin matches, India, South Africa, Australia, and New Zealand finished as the top four and qualified for the knockout stage. In the knockout stage, India and Australia beat New Zealand and South Africa, respectively, to advance to the final, played on 19 November at the Narendra Modi Stadium in Ahmedabad. Australia won the final by six wickets, winning their sixth Cricket World Cup title.
17
+ [Facts]
18
+ 1. The tournament was contested by ten national teams.
19
+ 2. The tournament maintained the same format used in 2019.
20
+ 3. The round-robin matches lasted for six weeks.
21
+ 4. India finished as one of the top four teams.
22
+ 5. South Africa finished as one of the top four teams.
23
+ 6. Australia finished as one of the top four teams.
24
+ 7. New Zealand finished as one of the top four teams.
25
+ 8. India, South Africa, Australia, and New Zealand qualified for the knockout stage.
26
+ 9. In the knockout stage, India beat New Zealand.
27
+ 10. In the knockout stage, Australia beat South Africa.
28
+ 11. India advanced to the final.
29
+ 12. Australia advanced to the final.
30
+ 13. The final was played on 19 November.
31
+ 14. The final was held at the Narendra Modi Stadium in Ahmedabad.
32
+ 15. Australia won the final by six wickets.
33
+ 16. Australia won their sixth Cricket World Cup title.
34
+ [End]
35
+
36
+ Now it's your turn: break down the following snippet from an article about {{domain}} into atomic facts, following a similar style to the example above.
37
+ [Passage]
38
+ {{document_outline}}
39
+ {{document}}
40
+ [Facts]