sdg-hub 0.1.0a4__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. sdg_hub/_version.py +2 -2
  2. sdg_hub/blocks/__init__.py +41 -5
  3. sdg_hub/blocks/block.py +58 -16
  4. sdg_hub/blocks/llmblock.py +121 -193
  5. sdg_hub/blocks/openaichatblock.py +556 -0
  6. sdg_hub/blocks/utilblocks.py +500 -43
  7. sdg_hub/checkpointer.py +139 -0
  8. sdg_hub/configs/annotations/detailed_annotations.yaml +28 -0
  9. sdg_hub/configs/annotations/simple_annotations.yaml +9 -0
  10. sdg_hub/configs/knowledge/atomic_facts.yaml +1 -0
  11. sdg_hub/configs/knowledge/detailed_summary.yaml +1 -0
  12. sdg_hub/configs/knowledge/extractive_summary.yaml +1 -0
  13. sdg_hub/configs/knowledge/generate_questions.yaml +82 -0
  14. sdg_hub/configs/knowledge/generate_responses.yaml +86 -0
  15. sdg_hub/configs/skills/contexts.yaml +18 -11
  16. sdg_hub/configs/skills/evaluate_freeform_pair.yaml +79 -12
  17. sdg_hub/configs/skills/evaluate_freeform_questions.yaml +60 -28
  18. sdg_hub/configs/skills/evaluate_grounded_pair.yaml +95 -30
  19. sdg_hub/configs/skills/freeform_questions.yaml +21 -16
  20. sdg_hub/configs/skills/freeform_responses.yaml +19 -25
  21. sdg_hub/configs/skills/router.yaml +53 -6
  22. sdg_hub/flow.py +366 -33
  23. sdg_hub/flow_runner.py +437 -0
  24. sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +21 -9
  25. sdg_hub/flows/generation/skills/{agentic_improve_skill.yaml → improve_responses.yaml} +26 -31
  26. sdg_hub/flows/generation/skills/synth_skills.yaml +4 -4
  27. sdg_hub/pipeline.py +67 -12
  28. sdg_hub/prompts.py +52 -0
  29. sdg_hub/sdg.py +128 -86
  30. sdg_hub/utils/__init__.py +5 -0
  31. sdg_hub/utils/config_validation.py +91 -0
  32. sdg_hub/utils/error_handling.py +94 -0
  33. sdg_hub/utils/path_resolution.py +62 -0
  34. sdg_hub/utils/validation_result.py +10 -0
  35. sdg_hub-0.1.2.dist-info/METADATA +190 -0
  36. sdg_hub-0.1.2.dist-info/RECORD +89 -0
  37. {sdg_hub-0.1.0a4.dist-info → sdg_hub-0.1.2.dist-info}/WHEEL +1 -1
  38. sdg_hub/blocks/filterblock.py +0 -76
  39. sdg_hub/blocks/iterblock.py +0 -31
  40. sdg_hub/blocks/rmblocks.py +0 -194
  41. sdg_hub/configs/annotations/simple.yaml +0 -10
  42. sdg_hub/configs/knowledge/data_recipe/default_recipe.yaml +0 -3
  43. sdg_hub/configs/skills/data_recipe/default_recipe.yaml +0 -6
  44. sdg_hub/flows/annotation/emotion/detailed_description.yaml +0 -19
  45. sdg_hub/flows/annotation/emotion/detailed_description_icl.yaml +0 -19
  46. sdg_hub/flows/annotation/emotion/simple.yaml +0 -19
  47. sdg_hub/utils/chunking.py +0 -73
  48. sdg_hub/utils/docprocessor.py +0 -357
  49. sdg_hub/utils/parse_and_convert.py +0 -392
  50. sdg_hub-0.1.0a4.dist-info/METADATA +0 -309
  51. sdg_hub-0.1.0a4.dist-info/RECORD +0 -90
  52. /sdg_hub/configs/{knowledge/data_recipe → reasoning}/__init__.py +0 -0
  53. /sdg_hub/configs/skills/{_G_.yaml → icl_examples/STEM.yaml} +0 -0
  54. /sdg_hub/configs/skills/{data_recipe → icl_examples}/__init__.py +0 -0
  55. /sdg_hub/configs/skills/{_A_.yaml → icl_examples/coding.yaml} +0 -0
  56. /sdg_hub/configs/skills/{_B_.yaml → icl_examples/extraction.yaml} +0 -0
  57. /sdg_hub/configs/skills/{_C_.yaml → icl_examples/humanities.yaml} +0 -0
  58. /sdg_hub/configs/skills/{_D_.yaml → icl_examples/math.yaml} +0 -0
  59. /sdg_hub/configs/skills/{_E_.yaml → icl_examples/reasoning.yaml} +0 -0
  60. /sdg_hub/configs/skills/{_F_.yaml → icl_examples/roleplay.yaml} +0 -0
  61. /sdg_hub/configs/skills/{_H_.yaml → icl_examples/writing.yaml} +0 -0
  62. {sdg_hub-0.1.0a4.dist-info → sdg_hub-0.1.2.dist-info}/licenses/LICENSE +0 -0
  63. {sdg_hub-0.1.0a4.dist-info → sdg_hub-0.1.2.dist-info}/top_level.txt +0 -0
sdg_hub/flow.py CHANGED
@@ -1,73 +1,321 @@
+ """
+ Flow module for managing data generation pipelines.
+
+ This module provides the core Flow class that handles both configuration loading and execution
+ of data generation blocks. The Flow class serves as the main interface for defining and running
+ data generation pipelines, supporting both direct usage with SDG and backward compatibility
+ through the deprecated Pipeline class.
+
+ Example:
+     >>> flow = Flow(llm_client)
+     >>> flow = flow.get_flow_from_file("path/to/flow.yaml")
+     >>> dataset = flow.generate(input_dataset)
+
+ Note:
+     This module is part of the SDG Hub package and is designed to work in conjunction
+     with the SDG class for distributed data generation.
+ """
+
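The added module docstring sketches the intended entry points. Expanded into runnable form it looks roughly like the following; this is a minimal sketch only: the OpenAI-compatible client setup, the local endpoint, and the seed column are illustrative assumptions and not part of this diff, while the flow path is one of the YAML files shipped in this release.

    # Hypothetical usage sketch based on the docstring example above.
    from datasets import Dataset
    from openai import OpenAI
    from sdg_hub.flow import Flow

    client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")  # assumed endpoint
    flow = Flow(client).get_flow_from_file("flows/generation/skills/synth_skills.yaml")
    dataset = flow.generate(Dataset.from_list([{"task_description": "..."}]))  # seed columns depend on the chosen flow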
  # SPDX-License-Identifier: Apache-2.0
  # Standard
  from abc import ABC
  from importlib import resources
- from typing import Optional
+ from typing import Any, Callable, Dict, List, Optional
  import operator
  import os

  # Third Party
+ from datasets import Dataset
+ from datasets.data_files import EmptyDatasetError
+ from jinja2 import Environment, meta
+ from rich.console import Console
+ from rich.table import Table
  import yaml

  # Local
+ from .blocks import * # needed to register blocks
+ from .logger_config import setup_logger
+ from .prompts import * # needed to register prompts
  from .registry import BlockRegistry, PromptRegistry
- from . import prompts
- from . import blocks
+ from .utils.config_validation import validate_prompt_config_schema
+ from .utils.path_resolution import resolve_path
+ from .utils.validation_result import ValidationResult
+
+ logger = setup_logger(__name__)


- OPERATOR_MAP = {
+ OPERATOR_MAP: Dict[str, Callable] = {
      "operator.eq": operator.eq,
      "operator.ge": operator.ge,
+     "operator.le": operator.le,
+     "operator.gt": operator.gt,
+     "operator.lt": operator.lt,
+     "operator.ne": operator.ne,
      "operator.contains": operator.contains,
  }

- CONVERT_DTYPE_MAP = {
+ CONVERT_DTYPE_MAP: Dict[str, Callable] = {
      "float": float,
      "int": int,
  }


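These two lookup tables translate the operator and dtype names that flow YAML files carry as plain strings into Python callables. A minimal sketch of the resolution, using only the keys defined above (how a block config supplies the strings is defined by the individual flow files):

    # Sketch: string-to-callable resolution via the module-level maps.
    from sdg_hub.flow import CONVERT_DTYPE_MAP, OPERATOR_MAP

    op = OPERATOR_MAP["operator.ge"]    # -> operator.ge
    cast = CONVERT_DTYPE_MAP["float"]   # -> float
    assert op(cast("2.5"), 2.0)         # 2.5 >= 2.0 -> True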
  class Flow(ABC):
+     """A class representing a data generation flow.
+
+     This class handles both configuration loading and execution of data generation
+     blocks. It can be used directly with SDG or through the deprecated Pipeline class.
+     """
+
      def __init__(
          self,
-         llm_client,
+         llm_client: Any,
          num_samples_to_generate: Optional[int] = None,
+         log_level: Optional[str] = None,
      ) -> None:
+         """
+         Initialize the Flow class.
+
+         Parameters
+         ----------
+         llm_client : Any
+             The LLM client to use for generation.
+         num_samples_to_generate : Optional[int], optional
+             Number of samples to generate, by default None
+         log_level : Optional[str], optional
+             Logging verbosity level, by default None
+
+         Attributes
+         ----------
+         llm_client : Any
+             The LLM client instance.
+         base_path : str
+             Base path for resource files.
+         registered_blocks : Dict[str, Any]
+             Registry of available blocks.
+         chained_blocks : Optional[List[Dict[str, Any]]]
+             List of block configurations.
+         num_samples_to_generate : Optional[int]
+             Number of samples to generate.
+
+         """
          self.llm_client = llm_client
-         self.num_samples_to_generate = num_samples_to_generate
          self.base_path = str(resources.files(__package__))
          self.registered_blocks = BlockRegistry.get_registry()
+         self.chained_blocks = None # Will be set by get_flow_from_file
+         self.num_samples_to_generate = num_samples_to_generate
+
+         # Logging verbosity level
+         self.log_level = log_level or os.getenv("SDG_HUB_LOG_LEVEL", "normal").lower()
+         self.console = Console() if self.log_level in ["verbose", "debug"] else None
+
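Verbosity comes either from the log_level argument or, when that is omitted, from the SDG_HUB_LOG_LEVEL environment variable read above; "verbose" and "debug" additionally enable the Rich tables printed by _log_block_info below. A small sketch, assuming `client` is the LLM client object from the earlier sketch:

    # Sketch: two equivalent ways to raise logging verbosity.
    import os

    os.environ["SDG_HUB_LOG_LEVEL"] = "verbose"  # read only when log_level is not passed
    flow = Flow(client)                          # or: Flow(client, log_level="verbose")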
+     def _log_block_info(
+         self, index: int, total: int, name: str, ds: Dataset, stage: str
+     ) -> None:
+         if self.log_level in ["verbose", "debug"] and self.console:
+             table = Table(
+                 title=f"{stage} Block {index + 1}/{total}: {name}", show_header=True
+             )
+             table.add_column("Metric", style="cyan", no_wrap=True)
+             table.add_column("Value", style="magenta")
+             table.add_row("Rows", str(len(ds)))
+             table.add_row("Columns", ", ".join(ds.column_names))
+             self.console.print(table)
+
+     def _getFilePath(self, dirs: List[str], filename: str) -> str:
+         """Find a named configuration file.
+
+         Files are checked in the following order:
+         1. Absolute path is always used
+         2. Checked relative to the directories in "dirs"
+         3. Relative to the current directory

-     def _getFilePath(self, dirs, filename):
+         Parameters
+         ----------
+         dirs : List[str]
+             Directories in which to search for the file.
+         filename : str
+             The path to the configuration file.
+
+         Returns
+         -------
+         str
+             Selected file path.
+         """
+         return resolve_path(filename, dirs)
+
+     def _drop_duplicates(self, dataset: Dataset, cols: List[str]) -> Dataset:
+         """Drop duplicates from the dataset based on the columns provided.
+
+         Parameters
+         ----------
+         dataset : Dataset
+             The input dataset.
+         cols : List[str]
+             Columns to consider for duplicate detection.
+
+         Returns
+         -------
+         Dataset
+             Dataset with duplicates removed.
          """
-         Find a named configuration file.
+         df = dataset.to_pandas()
+         df = df.drop_duplicates(subset=cols).reset_index(drop=True)
+         return Dataset.from_pandas(df)

-         Files are checked in the following order
-         - absulute path is always used
-         - checked relative to the directories in "dirs"
-         - relative the the current directory
+     def generate(self, dataset: Dataset) -> Dataset:
+         """Generate the dataset by running the pipeline steps.

-         Args:
-             dirs (list): Directories in which to search for "config_path"
-             config_path (str): The path to the configuration file.
+         Parameters
+         ----------
+         dataset : Dataset
+             The input dataset to process.

-         Returns:
-             Selected file path
+         Returns
+         -------
+         Dataset
+             The processed dataset.
+
+         Raises
+         ------
+         ValueError
+             If Flow has not been initialized with blocks.
+         EmptyDatasetError
+             If a block produces an empty dataset.
          """
-         if os.path.isabs(filename):
-             return filename
-         for d in dirs:
-             full_file_path = os.path.join(d, filename)
-             if os.path.isfile(full_file_path):
-                 return full_file_path
-         # If not found above then return the path unchanged i.e.
-         # assume the path is relative to the current directory
-         return filename
-
-     def get_flow_from_file(self, yaml_path: str) -> list:
-         yaml_path_relative_to_base = os.path.join(self.base_path, yaml_path)
-         if os.path.isfile(yaml_path_relative_to_base):
-             yaml_path = yaml_path_relative_to_base
+         if self.chained_blocks is None:
+             raise ValueError(
+                 "Flow has not been initialized with blocks. "
+                 "Call get_flow_from_file() first. "
+                 "Or pass a list of blocks to the Flow constructor."
+             )
+
+         for i, block_prop in enumerate(self.chained_blocks):
+             block_type = block_prop["block_type"]
+             block_config = block_prop["block_config"]
+             drop_columns = block_prop.get("drop_columns", [])
+             gen_kwargs = block_prop.get("gen_kwargs", {})
+             drop_duplicates_cols = block_prop.get("drop_duplicates", False)
+             block = block_type(**block_config)
+
+             name = block_config.get("block_name", f"block_{i}")
+
+             # Logging: always show basic progress unless in quiet mode
+             if self.log_level in ["normal", "verbose", "debug"]:
+                 logger.info(
+                     f"🔄 Running block {i + 1}/{len(self.chained_blocks)}: {name}"
+                 )
+
+             # Log dataset shape before block (verbose/debug)
+             self._log_block_info(i, len(self.chained_blocks), name, dataset, "Input")
+
+             if self.log_level == "debug":
+                 logger.debug(f"Input dataset (truncated): {dataset}")
+
+             dataset = block.generate(dataset, **gen_kwargs)
+
+             if len(dataset) == 0:
+                 raise EmptyDatasetError(
+                     f"Pipeline stopped: "
+                     f"Empty dataset after running block: "
+                     f"{block_config['block_name']}"
+                 )
+
+             drop_columns_in_ds = [e for e in drop_columns if e in dataset.column_names]
+             if drop_columns:
+                 dataset = dataset.remove_columns(drop_columns_in_ds)
+
+             if drop_duplicates_cols:
+                 dataset = self._drop_duplicates(dataset, cols=drop_duplicates_cols)
+
+             # Log dataset shape after block (verbose/debug)
+             self._log_block_info(i, len(self.chained_blocks), name, dataset, "Output")
+
+             if self.log_level == "debug":
+                 logger.debug(f"Output dataset (truncated): {dataset}")
+
+         return dataset
+
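generate() raises EmptyDatasetError (imported above from datasets.data_files) as soon as any block filters the dataset down to zero rows. A caller-side sketch, assuming `flow` and a `seed_dataset` as in the earlier sketch:

    # Sketch: surfacing the empty-dataset failure mode from generate().
    from datasets.data_files import EmptyDatasetError

    try:
        generated = flow.generate(seed_dataset)
    except EmptyDatasetError as err:
        print(f"A block emptied the dataset: {err}")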
+     def validate_config_files(self) -> "ValidationResult":
+         """
+         Validate all configuration file paths referenced in the flow blocks.
+
+         This method checks that all config files specified via `config_path` or `config_paths`
+         in each block:
+         - Exist on the filesystem
+         - Are readable by the current process
+         - Are valid YAML files (optional format check)
+
+         Returns
+         -------
+         ValidationResult
+             An object indicating whether all config files passed validation, along with a list
+             of error messages for any missing, unreadable, or invalid YAML files.
+
+         Notes
+         -----
+         This method is automatically called at the end of `get_flow_from_file()` to ensure
+         early detection of misconfigured blocks.
+         """
+         errors = []
+
+         def check_file(path: str, context: str):
+             if not os.path.isfile(path):
+                 errors.append(f"[{context}] File does not exist: {path}")
+             else:
+                 try:
+                     with open(path, "r", encoding="utf-8") as f:
+                         config_data = yaml.safe_load(f)
+                         _, validation_errors = validate_prompt_config_schema(
+                             config_data, path
+                         )
+
+                         if validation_errors:
+                             errors.extend(validation_errors)
+
+                 except PermissionError:
+                     errors.append(f"[{context}] File is not readable: {path}")
+                 except yaml.YAMLError as e:
+                     errors.append(f"[{context}] YAML load failed: {path} ({e})")
+
+         for i, block in enumerate(self.chained_blocks or []):
+             block_name = block["block_config"].get("block_name", f"block_{i}")
+
+             config_path = block["block_config"].get("config_path")
+             if config_path:
+                 check_file(config_path, f"{block_name}.config_path")
+
+             config_paths = block["block_config"].get("config_paths")
+             if isinstance(config_paths, list):
+                 for idx, path in enumerate(config_paths):
+                     check_file(path, f"{block_name}.config_paths[{idx}]")
+             elif isinstance(config_paths, dict):
+                 for key, path in config_paths.items():
+                     check_file(path, f"{block_name}.config_paths['{key}']")
+
+         return ValidationResult(valid=(len(errors) == 0), errors=errors)
+
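ValidationResult, as constructed in the return statement above, carries a boolean valid flag plus the collected error strings. A sketch of invoking the check directly; note that get_flow_from_file() already calls it and raises on failure:

    # Sketch: run config-file validation by hand and inspect the result.
    result = flow.validate_config_files()
    if not result.valid:
        for message in result.errors:
            print(message)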
+     def get_flow_from_file(self, yaml_path: str) -> "Flow":
+         """Load and initialize flow configuration from a YAML file.
+
+         Parameters
+         ----------
+         yaml_path : str
+             Path to the YAML configuration file.
+
+         Returns
+         -------
+         Flow
+             Self with initialized chained_blocks.
+
+         Raises
+         ------
+         FileNotFoundError
+             If the YAML file cannot be found.
+         KeyError
+             If a required block or prompt is not found in the registry.
+         """
+         yaml_path = resolve_path(yaml_path, self.base_path)
          yaml_dir = os.path.dirname(yaml_path)

          try:
@@ -141,4 +389,89 @@ class Flow(ABC):
                      block["block_config"]["convert_dtype"]
                  ]

-         return flow
+         # Store the chained blocks and return self
+         self.chained_blocks = flow
+
+         # Validate config files
+         result = self.validate_config_files()
+         if not result.valid:
+             raise ValueError("Invalid config files:\n\n".join(result.errors))
+
+         return self
+
+     def validate_flow(self, dataset: Dataset) -> "ValidationResult":
+         """
+         Validate that all required dataset columns are present before executing the flow.
+
+         This includes:
+         - Columns referenced in Jinja templates for LLM blocks
+         - Columns required by specific utility blocks (e.g. filter_column, choice_col, etc.)
+
+         Parameters
+         ----------
+         dataset : Dataset
+             The input dataset to validate against.
+
+         Returns
+         -------
+         ValidationResult
+             Whether the dataset has all required columns, and which ones are missing.
+         """
+         errors = []
+         all_columns = set(dataset.column_names)
+
+         for i, block in enumerate(self.chained_blocks or []):
+             name = block["block_config"].get("block_name", f"block_{i}")
+             block_type = block["block_type"]
+             config = block["block_config"]
+
+             # LLM Block: parse Jinja vars
+             cls_name = (
+                 block_type.__name__
+                 if isinstance(block_type, type)
+                 else block_type.__class__.__name__
+             )
+             logger.info(f"Validating block: {name} ({cls_name})")
+             if "LLM" in cls_name:
+                 config_path = config.get("config_path")
+                 if config_path and os.path.isfile(config_path):
+                     with open(config_path, "r", encoding="utf-8") as f:
+                         content = f.read()
+                     env = Environment()
+                     ast = env.parse(content)
+                     vars_found = meta.find_undeclared_variables(ast)
+                     for var in vars_found:
+                         if var not in all_columns:
+                             errors.append(
+                                 f"[{name}] Missing column for prompt var: '{var}'"
+                             )
+
+             # FilterByValueBlock
+             if "FilterByValueBlock" in str(block_type):
+                 col = config.get("filter_column")
+                 if col and col not in all_columns:
+                     errors.append(f"[{name}] Missing filter_column: '{col}'")
+
+             # SelectorBlock
+             if "SelectorBlock" in str(block_type):
+                 col = config.get("choice_col")
+                 if col and col not in all_columns:
+                     errors.append(f"[{name}] Missing choice_col: '{col}'")
+
+                 choice_map = config.get("choice_map", {})
+                 for col in choice_map.values():
+                     if col not in all_columns:
+                         errors.append(
+                             f"[{name}] choice_map references missing column: '{col}'"
+                         )
+
+             # CombineColumnsBlock
+             if "CombineColumnsBlock" in str(block_type):
+                 cols = config.get("columns", [])
+                 for col in cols:
+                     if col not in all_columns:
+                         errors.append(
+                             f"[{name}] CombineColumnsBlock requires column: '{col}'"
+                         )
+
+         return ValidationResult(valid=(len(errors) == 0), errors=errors)
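validate_flow() is the dataset-side counterpart to validate_config_files(): it checks a concrete seed dataset against the columns each configured block will ask for. A pre-flight sketch, assuming `flow` and `seed_dataset` from the earlier sketches:

    # Sketch: fail fast if the seed dataset lacks columns the flow needs.
    result = flow.validate_flow(seed_dataset)
    if not result.valid:
        raise ValueError("Seed dataset is missing columns:\n" + "\n".join(result.errors))
    generated = flow.generate(seed_dataset)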