sdg-hub 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. sdg_hub/_version.py +16 -3
  2. sdg_hub/core/blocks/deprecated_blocks/selector.py +1 -1
  3. sdg_hub/core/blocks/evaluation/evaluate_faithfulness_block.py +175 -416
  4. sdg_hub/core/blocks/evaluation/evaluate_relevancy_block.py +174 -415
  5. sdg_hub/core/blocks/evaluation/verify_question_block.py +180 -415
  6. sdg_hub/core/blocks/llm/__init__.py +2 -0
  7. sdg_hub/core/blocks/llm/client_manager.py +61 -24
  8. sdg_hub/core/blocks/llm/config.py +1 -0
  9. sdg_hub/core/blocks/llm/llm_chat_block.py +62 -7
  10. sdg_hub/core/blocks/llm/llm_chat_with_parsing_retry_block.py +653 -0
  11. sdg_hub/core/blocks/llm/text_parser_block.py +75 -30
  12. sdg_hub/core/blocks/registry.py +49 -35
  13. sdg_hub/core/blocks/transform/index_based_mapper.py +1 -1
  14. sdg_hub/core/flow/base.py +370 -20
  15. sdg_hub/core/flow/checkpointer.py +333 -0
  16. sdg_hub/core/flow/metadata.py +45 -0
  17. sdg_hub/core/flow/migration.py +12 -1
  18. sdg_hub/core/flow/registry.py +121 -58
  19. sdg_hub/core/flow/validation.py +12 -0
  20. sdg_hub/core/utils/__init__.py +2 -1
  21. sdg_hub/core/utils/datautils.py +81 -1
  22. sdg_hub/core/utils/flow_id_words.yaml +231 -0
  23. sdg_hub/core/utils/flow_identifier.py +94 -0
  24. sdg_hub/core/utils/yaml_utils.py +59 -0
  25. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +1 -7
  26. {sdg_hub-0.2.0.dist-info → sdg_hub-0.2.2.dist-info}/METADATA +59 -31
  27. {sdg_hub-0.2.0.dist-info → sdg_hub-0.2.2.dist-info}/RECORD +30 -25
  28. {sdg_hub-0.2.0.dist-info → sdg_hub-0.2.2.dist-info}/WHEEL +0 -0
  29. {sdg_hub-0.2.0.dist-info → sdg_hub-0.2.2.dist-info}/licenses/LICENSE +0 -0
  30. {sdg_hub-0.2.0.dist-info → sdg_hub-0.2.2.dist-info}/top_level.txt +0 -0
@@ -4,14 +4,17 @@
4
4
  # Standard
5
5
  from dataclasses import dataclass
6
6
  from pathlib import Path
7
- from typing import Optional
7
+ from typing import Dict, List, Optional
8
8
  import os
9
9
 
10
10
  # Third Party
11
+ from rich.console import Console
12
+ from rich.table import Table
11
13
  import yaml
12
14
 
13
15
  # Local
14
16
  from ..utils.logger_config import setup_logger
17
+ from ..utils.yaml_utils import save_flow_yaml
15
18
  from .metadata import FlowMetadata
16
19
 
17
20
  logger = setup_logger(__name__)
@@ -124,22 +127,36 @@ class FlowRegistry:
124
127
  metadata_dict = flow_config["metadata"]
125
128
  metadata = FlowMetadata(**metadata_dict)
126
129
 
127
- entry = FlowRegistryEntry(path=str(yaml_file), metadata=metadata)
130
+ # If id was generated, update the YAML
131
+ if metadata.id and "id" not in metadata_dict:
132
+ flow_config["metadata"]["id"] = metadata.id
133
+
134
+ save_flow_yaml(
135
+ yaml_file,
136
+ flow_config,
137
+ f"updated with generated id: {metadata.id}",
138
+ )
128
139
 
140
+ entry = FlowRegistryEntry(path=str(yaml_file), metadata=metadata)
129
141
  cls._entries[metadata.name] = entry
130
- logger.debug(f"Registered flow: {metadata.name} from {yaml_file}")
142
+ logger.debug(
143
+ f"Registered flow: {metadata.name} (id: {metadata.id}) from {yaml_file}"
144
+ )
131
145
 
132
146
  except Exception as exc:
133
147
  logger.debug(f"Skipped {yaml_file}: {exc}")
134
148
 
135
149
  @classmethod
136
- def get_flow_path(cls, flow_name: str) -> Optional[str]:
150
+ def get_flow_path(cls, flow_name_or_id: str) -> Optional[str]:
137
151
  """Get the path to a registered flow.
138
152
 
153
+ For backward compatibility, this function accepts either a flow id or flow_name.
154
+ Flow ID is preferred and should be used in new code.
155
+
139
156
  Parameters
140
157
  ----------
141
- flow_name : str
142
- Name of the flow to find.
158
+ flow_name_or_id : str
159
+ Either the flow id or flow_name to find.
143
160
 
144
161
  Returns
145
162
  -------
@@ -149,10 +166,64 @@ class FlowRegistry:
149
166
  cls._ensure_initialized()
150
167
  cls._discover_flows()
151
168
 
152
- if flow_name in cls._entries:
153
- return cls._entries[flow_name].path
169
+ # First try to find by id (preferred)
170
+ for entry in cls._entries.values():
171
+ if entry.metadata.id == flow_name_or_id:
172
+ return entry.path
173
+
174
+ # If not found, try by name (backward compatibility)
175
+ for entry in cls._entries.values():
176
+ if entry.metadata.name == flow_name_or_id:
177
+ logger.debug(
178
+ f"Found flow by name (deprecated): {flow_name_or_id}, use id: {entry.metadata.id} instead"
179
+ )
180
+ return entry.path
181
+
154
182
  return None
155
183
 
184
@classmethod
def get_flow_path_safe(cls, flow_name_or_id: str) -> str:
    """Get the path to a registered flow with better error handling.

    For backward compatibility, this function accepts either a flow id or flow_name.
    Flow ID is preferred and should be used in new code.

    Parameters
    ----------
    flow_name_or_id : str
        Either the flow id or flow_name to find.

    Returns
    -------
    str
        Path to the flow file.

    Raises
    ------
    ValueError
        If the flow is not found, with helpful suggestions.
    """
    # NOTE: get_flow_path() already calls _ensure_initialized() and
    # _discover_flows(), so repeating them here would run discovery twice
    # per lookup for no benefit.
    path = cls.get_flow_path(flow_name_or_id)
    if path is None:
        # Build an actionable error message listing every registered flow.
        available_flows = cls.list_flows()

        error_msg = f"Flow '{flow_name_or_id}' not found.\n"

        if available_flows:
            error_msg += "Available flows:\n"
            for flow in available_flows:
                error_msg += f"  - ID: '{flow['id']}', Name: '{flow['name']}'\n"
        else:
            error_msg += "No flows are currently registered. Try running FlowRegistry.discover_flows() first."

        raise ValueError(error_msg.strip())

    return path
226
+
156
227
  @classmethod
157
228
  def get_flow_metadata(cls, flow_name: str) -> Optional[FlowMetadata]:
158
229
  """Get metadata for a registered flow.
@@ -175,22 +246,26 @@ class FlowRegistry:
175
246
  return None
176
247
 
177
248
@classmethod
def list_flows(cls) -> List[Dict[str, str]]:
    """List all registered flows with their IDs.

    Returns
    -------
    List[Dict[str, str]]
        One dictionary per registered flow; each has an 'id' key and
        a 'name' key.
    """
    cls._ensure_initialized()
    cls._discover_flows()
    flows = []
    for entry in cls._entries.values():
        meta = entry.metadata
        flows.append({"id": meta.id, "name": meta.name})
    return flows
189
264
 
190
265
  @classmethod
191
266
  def search_flows(
192
267
  cls, tag: Optional[str] = None, author: Optional[str] = None
193
- ) -> list[str]:
268
+ ) -> List[Dict[str, str]]:
194
269
  """Search flows by criteria.
195
270
 
196
271
  Parameters
@@ -202,15 +277,17 @@ class FlowRegistry:
202
277
 
203
278
  Returns
204
279
  -------
205
- List[str]
206
- List of matching flow names.
280
+ List[Dict[str, str]]
281
+ List of matching flows. Each dictionary contains:
282
+ - id: Flow ID
283
+ - name: Flow name
207
284
  """
208
285
  cls._ensure_initialized()
209
286
  cls._discover_flows()
210
287
 
211
288
  matching_flows = []
212
289
 
213
- for name, entry in cls._entries.items():
290
+ for entry in cls._entries.values():
214
291
  metadata = entry.metadata
215
292
 
216
293
  # Filter by tag
@@ -221,25 +298,27 @@ class FlowRegistry:
221
298
  if author and author.lower() not in metadata.author.lower():
222
299
  continue
223
300
 
224
- matching_flows.append(name)
301
+ matching_flows.append({"id": metadata.id, "name": metadata.name})
225
302
 
226
303
  return matching_flows
227
304
 
228
305
  @classmethod
229
- def get_flows_by_category(cls) -> dict[str, list[str]]:
306
+ def get_flows_by_category(cls) -> Dict[str, List[Dict[str, str]]]:
230
307
  """Get flows organized by their primary tag.
231
308
 
232
309
  Returns
233
310
  -------
234
- Dict[str, List[str]]
235
- Dictionary mapping tags to flow names.
311
+ Dict[str, List[Dict[str, str]]]
312
+ Dictionary mapping tags to flow information. Each flow is represented by:
313
+ - id: Flow ID
314
+ - name: Flow name
236
315
  """
237
316
  cls._ensure_initialized()
238
317
  cls._discover_flows()
239
318
 
240
319
  categories = {}
241
320
 
242
- for name, entry in cls._entries.items():
321
+ for entry in cls._entries.values():
243
322
  metadata = entry.metadata
244
323
 
245
324
  # Use first tag as primary category, or "uncategorized"
@@ -248,21 +327,16 @@ class FlowRegistry:
248
327
  if category not in categories:
249
328
  categories[category] = []
250
329
 
251
- categories[category].append(name)
330
+ categories[category].append({"id": metadata.id, "name": metadata.name})
252
331
 
253
332
  return categories
254
333
 
255
334
  @classmethod
256
- def discover_flows(cls, show_all_columns: bool = False) -> None:
335
+ def discover_flows(cls) -> None:
257
336
  """Discover and display all flows in a formatted table.
258
337
 
259
338
  This is the main public API for flow discovery. It finds all flows
260
339
  in registered search paths and displays them in a beautiful Rich table.
261
-
262
- Parameters
263
- ----------
264
- show_all_columns : bool, optional
265
- Whether to show extended table with all columns, by default False
266
340
  """
267
341
  cls._ensure_initialized()
268
342
  cls._discover_flows()
@@ -276,11 +350,12 @@ class FlowRegistry:
276
350
 
277
351
  # Prepare data with fallbacks
278
352
  flow_data = []
279
- for name, entry in cls._entries.items():
353
+ for _, entry in cls._entries.items():
280
354
  metadata = entry.metadata
281
355
  flow_data.append(
282
356
  {
283
- "name": name,
357
+ "name": metadata.name,
358
+ "id": metadata.id,
284
359
  "author": metadata.author or "Unknown",
285
360
  "tags": ", ".join(metadata.tags) if metadata.tags else "-",
286
361
  "description": metadata.description or "No description",
@@ -290,41 +365,29 @@ class FlowRegistry:
290
365
  )
291
366
 
292
367
  # Sort by name for consistency
293
- flow_data.sort(key=lambda x: x["name"])
368
+ flow_data.sort(key=lambda x: x["id"])
294
369
 
295
370
  # Display Rich table
296
371
  # Third Party
297
- from rich.console import Console
298
- from rich.table import Table
299
372
 
300
373
  console = Console()
301
- table = Table(show_header=True, header_style="bold magenta")
302
-
303
- # Add columns
304
- table.add_column("Name", style="cyan", no_wrap=True)
305
- table.add_column("Author", style="green")
306
-
307
- if show_all_columns:
308
- table.add_column("Version", style="blue")
309
- table.add_column("Cost", style="yellow")
374
+ table = Table(show_header=True, header_style="bold bright_magenta")
310
375
 
311
- table.add_column("Tags", style="dim")
312
- table.add_column("Description")
376
+ # Add columns with better visibility colors
377
+ table.add_column("ID", style="bold bright_magenta", no_wrap=True)
378
+ table.add_column("Name", style="bold bright_cyan")
379
+ table.add_column("Author", style="bright_green")
380
+ table.add_column("Tags", style="yellow")
381
+ table.add_column("Description", style="white")
313
382
 
314
383
  # Add rows
315
384
  for flow in flow_data:
316
- if show_all_columns:
317
- table.add_row(
318
- flow["name"],
319
- flow["author"],
320
- flow["version"],
321
- flow["cost"],
322
- flow["tags"],
323
- flow["description"],
324
- )
325
- else:
326
- table.add_row(
327
- flow["name"], flow["author"], flow["tags"], flow["description"]
328
- )
385
+ table.add_row(
386
+ flow["id"],
387
+ flow["name"],
388
+ flow["author"],
389
+ flow["tags"],
390
+ flow["description"],
391
+ )
329
392
 
330
393
  console.print(table)
@@ -112,6 +112,18 @@ class FlowValidator:
112
112
  elif not isinstance(metadata["name"], str) or not metadata["name"].strip():
113
113
  errors.append("Metadata 'name' must be a non-empty string")
114
114
 
115
+ # Validate id if present
116
+ if "id" in metadata:
117
+ flow_id = metadata["id"]
118
+ if not isinstance(flow_id, str):
119
+ errors.append("Metadata: 'id' must be a string")
120
+ elif flow_id and not flow_id.islower():
121
+ errors.append("Metadata: 'id' must be lowercase")
122
+ elif flow_id and not flow_id.replace("-", "").isalnum():
123
+ errors.append(
124
+ "Metadata: 'id' must contain only alphanumeric characters and hyphens"
125
+ )
126
+
115
127
  # Validate optional fields
116
128
  string_fields = [
117
129
  "description",
@@ -1,6 +1,7 @@
1
1
  # SPDX-License-Identifier: Apache-2.0
2
2
 
3
3
  # Local
4
+ from .flow_identifier import get_flow_identifier
4
5
  from .path_resolution import resolve_path
5
6
 
6
7
 
@@ -9,4 +10,4 @@ class GenerateError(Exception):
9
10
  """An exception raised during generate step."""
10
11
 
11
12
 
12
- __all__ = ["GenerateError", "resolve_path"]
13
+ __all__ = ["GenerateError", "resolve_path", "get_flow_identifier"]
@@ -1,5 +1,8 @@
1
1
  # Third Party
2
- from datasets import concatenate_datasets
2
+ from datasets import Dataset, concatenate_datasets
3
+
4
+ # Local
5
+ from .error_handling import FlowValidationError
3
6
 
4
7
 
5
8
  def safe_concatenate_datasets(datasets: list):
@@ -10,3 +13,80 @@ def safe_concatenate_datasets(datasets: list):
10
13
  return None
11
14
 
12
15
  return concatenate_datasets(filtered_datasets)
16
+
17
+
18
+ def validate_no_duplicates(dataset: Dataset) -> None:
19
+ """
20
+ Validate that the input dataset contains only unique rows.
21
+
22
+ Uses pandas `.duplicated()` for efficient duplicate detection.
23
+ Raises FlowValidationError if duplicates are found, including a count
24
+ of the duplicate rows detected.
25
+
26
+ Parameters
27
+ ----------
28
+ dataset : Dataset
29
+ Input dataset to validate.
30
+
31
+ Raises
32
+ ------
33
+ FlowValidationError
34
+ If duplicate rows are detected in the dataset.
35
+ """
36
+ df = dataset.to_pandas()
37
+ duplicate_count = int(df.duplicated(keep="first").sum())
38
+
39
+ if duplicate_count > 0:
40
+ raise FlowValidationError(
41
+ f"Input dataset contains {duplicate_count} duplicate rows. "
42
+ f"SDG Hub operations require unique input rows. "
43
+ f"Please deduplicate your dataset before processing."
44
+ )
45
+
46
+
47
+ def safe_concatenate_with_validation(
48
+ datasets: list, context: str = "datasets"
49
+ ) -> Dataset:
50
+ """Safely concatenate datasets with schema validation and clear error messages.
51
+
52
+ Parameters
53
+ ----------
54
+ datasets : list[Dataset]
55
+ List of datasets to concatenate
56
+ context : str
57
+ Description of what's being concatenated for error messages
58
+
59
+ Returns
60
+ -------
61
+ Dataset
62
+ Concatenated dataset
63
+
64
+ Raises
65
+ ------
66
+ FlowValidationError
67
+ If schema mismatch prevents concatenation or no valid datasets
68
+ """
69
+ # Filter out None and empty datasets first
70
+ valid_datasets = [ds for ds in datasets if ds is not None and len(ds) > 0]
71
+
72
+ if not valid_datasets:
73
+ raise FlowValidationError(f"No valid datasets to concatenate in {context}")
74
+
75
+ if len(valid_datasets) == 1:
76
+ return valid_datasets[0]
77
+
78
+ try:
79
+ return concatenate_datasets(valid_datasets)
80
+ except Exception as e:
81
+ # Schema mismatch or other concatenation error
82
+ schema_info = []
83
+ for i, ds in enumerate(valid_datasets):
84
+ schema_info.append(f"Dataset {i}: columns={ds.column_names}")
85
+
86
+ schema_details = "\n".join(schema_info)
87
+ raise FlowValidationError(
88
+ f"Schema mismatch when concatenating {context}. "
89
+ f"All datasets must have compatible schemas (same columns/types). "
90
+ f"Original error: {e}\n"
91
+ f"Dataset schemas:\n{schema_details}"
92
+ ) from e
@@ -0,0 +1,231 @@
1
+ # Flow ID word lists for wandb-style deterministic generation
2
+ # Format: adjective-noun-number (e.g., "bright-river-123")
3
+
4
+ adjectives:
5
+ - able
6
+ - ancient
7
+ - autumn
8
+ - bold
9
+ - brave
10
+ - bright
11
+ - calm
12
+ - clean
13
+ - clever
14
+ - cool
15
+ - cosmic
16
+ - daily
17
+ - dark
18
+ - deep
19
+ - divine
20
+ - dry
21
+ - eager
22
+ - early
23
+ - earnest
24
+ - easy
25
+ - epic
26
+ - even
27
+ - exact
28
+ - fair
29
+ - fast
30
+ - fine
31
+ - firm
32
+ - first
33
+ - fresh
34
+ - full
35
+ - gentle
36
+ - glad
37
+ - golden
38
+ - good
39
+ - great
40
+ - green
41
+ - happy
42
+ - hard
43
+ - heavy
44
+ - high
45
+ - holy
46
+ - huge
47
+ - jolly
48
+ - keen
49
+ - kind
50
+ - large
51
+ - late
52
+ - light
53
+ - live
54
+ - long
55
+ - loud
56
+ - lucky
57
+ - major
58
+ - mild
59
+ - new
60
+ - nice
61
+ - noble
62
+ - old
63
+ - open
64
+ - plain
65
+ - proud
66
+ - pure
67
+ - quick
68
+ - quiet
69
+ - rapid
70
+ - rare
71
+ - real
72
+ - rich
73
+ - right
74
+ - rough
75
+ - round
76
+ - safe
77
+ - sharp
78
+ - short
79
+ - simple
80
+ - slow
81
+ - small
82
+ - smart
83
+ - smooth
84
+ - soft
85
+ - solid
86
+ - strong
87
+ - sure
88
+ - swift
89
+ - tall
90
+ - thick
91
+ - thin
92
+ - tiny
93
+ - vast
94
+ - warm
95
+ - weak
96
+ - whole
97
+ - wide
98
+ - wild
99
+ - wise
100
+ - young
101
+ - exalted
102
+ - legendary
103
+ - resilient
104
+ - vibrant
105
+ - stellar
106
+ - graceful
107
+ - radiant
108
+ - serene
109
+ - brilliant
110
+ - majestic
111
+ - elegant
112
+
113
+ nouns:
114
+ - abyss
115
+ - angel
116
+ - arrow
117
+ - atom
118
+ - ball
119
+ - band
120
+ - bark
121
+ - beam
122
+ - bear
123
+ - bell
124
+ - bird
125
+ - bloom
126
+ - blue
127
+ - boat
128
+ - bone
129
+ - book
130
+ - brook
131
+ - brush
132
+ - calm
133
+ - cave
134
+ - cell
135
+ - chant
136
+ - chord
137
+ - clay
138
+ - cliff
139
+ - cloud
140
+ - coal
141
+ - coast
142
+ - coin
143
+ - colt
144
+ - coral
145
+ - core
146
+ - creek
147
+ - crop
148
+ - crown
149
+ - cube
150
+ - dawn
151
+ - day
152
+ - dew
153
+ - disk
154
+ - dove
155
+ - dream
156
+ - drop
157
+ - dust
158
+ - eagle
159
+ - earth
160
+ - echo
161
+ - edge
162
+ - ember
163
+ - field
164
+ - fire
165
+ - fish
166
+ - flame
167
+ - flight
168
+ - flow
169
+ - foam
170
+ - fog
171
+ - forest
172
+ - frost
173
+ - glow
174
+ - gold
175
+ - grass
176
+ - grove
177
+ - haze
178
+ - heart
179
+ - hill
180
+ - ice
181
+ - iris
182
+ - jade
183
+ - lake
184
+ - land
185
+ - leaf
186
+ - light
187
+ - lion
188
+ - moon
189
+ - moss
190
+ - night
191
+ - oak
192
+ - ocean
193
+ - path
194
+ - peak
195
+ - pearl
196
+ - pine
197
+ - pond
198
+ - rain
199
+ - reef
200
+ - river
201
+ - rock
202
+ - rose
203
+ - sage
204
+ - sand
205
+ - sea
206
+ - shadow
207
+ - shore
208
+ - sky
209
+ - snow
210
+ - song
211
+ - star
212
+ - stone
213
+ - storm
214
+ - stream
215
+ - sun
216
+ - sunset
217
+ - surf
218
+ - tide
219
+ - tree
220
+ - vale
221
+ - wave
222
+ - wind
223
+ - wing
224
+ - wolf
225
+ - wood
226
+ - darkness
227
+ - meadow
228
+ - thunder
229
+ - crystal
230
+ - valley
231
+ - mountain