sdg-hub 0.1.4__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139)
  1. sdg_hub/__init__.py +28 -1
  2. sdg_hub/_version.py +2 -2
  3. sdg_hub/core/__init__.py +22 -0
  4. sdg_hub/core/blocks/__init__.py +58 -0
  5. sdg_hub/core/blocks/base.py +313 -0
  6. sdg_hub/core/blocks/deprecated_blocks/__init__.py +29 -0
  7. sdg_hub/core/blocks/deprecated_blocks/combine_columns.py +93 -0
  8. sdg_hub/core/blocks/deprecated_blocks/duplicate_columns.py +88 -0
  9. sdg_hub/core/blocks/deprecated_blocks/filter_by_value.py +103 -0
  10. sdg_hub/core/blocks/deprecated_blocks/flatten_columns.py +94 -0
  11. sdg_hub/core/blocks/deprecated_blocks/llmblock.py +479 -0
  12. sdg_hub/core/blocks/deprecated_blocks/rename_columns.py +88 -0
  13. sdg_hub/core/blocks/deprecated_blocks/sample_populator.py +58 -0
  14. sdg_hub/core/blocks/deprecated_blocks/selector.py +97 -0
  15. sdg_hub/core/blocks/deprecated_blocks/set_to_majority_value.py +88 -0
  16. sdg_hub/core/blocks/evaluation/__init__.py +9 -0
  17. sdg_hub/core/blocks/evaluation/evaluate_faithfulness_block.py +564 -0
  18. sdg_hub/core/blocks/evaluation/evaluate_relevancy_block.py +564 -0
  19. sdg_hub/core/blocks/evaluation/verify_question_block.py +564 -0
  20. sdg_hub/core/blocks/filtering/__init__.py +12 -0
  21. sdg_hub/core/blocks/filtering/column_value_filter.py +188 -0
  22. sdg_hub/core/blocks/llm/__init__.py +25 -0
  23. sdg_hub/core/blocks/llm/client_manager.py +398 -0
  24. sdg_hub/core/blocks/llm/config.py +336 -0
  25. sdg_hub/core/blocks/llm/error_handler.py +368 -0
  26. sdg_hub/core/blocks/llm/llm_chat_block.py +542 -0
  27. sdg_hub/core/blocks/llm/prompt_builder_block.py +368 -0
  28. sdg_hub/core/blocks/llm/text_parser_block.py +310 -0
  29. sdg_hub/core/blocks/registry.py +331 -0
  30. sdg_hub/core/blocks/transform/__init__.py +23 -0
  31. sdg_hub/core/blocks/transform/duplicate_columns.py +88 -0
  32. sdg_hub/core/blocks/transform/index_based_mapper.py +225 -0
  33. sdg_hub/core/blocks/transform/melt_columns.py +126 -0
  34. sdg_hub/core/blocks/transform/rename_columns.py +69 -0
  35. sdg_hub/core/blocks/transform/text_concat.py +102 -0
  36. sdg_hub/core/blocks/transform/uniform_col_val_setter.py +101 -0
  37. sdg_hub/core/flow/__init__.py +20 -0
  38. sdg_hub/core/flow/base.py +980 -0
  39. sdg_hub/core/flow/metadata.py +344 -0
  40. sdg_hub/core/flow/migration.py +187 -0
  41. sdg_hub/core/flow/registry.py +330 -0
  42. sdg_hub/core/flow/validation.py +265 -0
  43. sdg_hub/{utils → core/utils}/__init__.py +6 -4
  44. sdg_hub/{utils → core/utils}/datautils.py +1 -3
  45. sdg_hub/core/utils/error_handling.py +208 -0
  46. sdg_hub/{utils → core/utils}/path_resolution.py +2 -2
  47. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/atomic_facts.yaml +40 -0
  48. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/detailed_summary.yaml +13 -0
  49. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_faithfulness.yaml +64 -0
  50. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_question.yaml +29 -0
  51. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_relevancy.yaml +81 -0
  52. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/extractive_summary.yaml +13 -0
  53. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +191 -0
  54. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/generate_questions_responses.yaml +54 -0
  55. sdg_hub-0.2.0.dist-info/METADATA +218 -0
  56. sdg_hub-0.2.0.dist-info/RECORD +63 -0
  57. sdg_hub/blocks/__init__.py +0 -42
  58. sdg_hub/blocks/block.py +0 -96
  59. sdg_hub/blocks/llmblock.py +0 -375
  60. sdg_hub/blocks/openaichatblock.py +0 -556
  61. sdg_hub/blocks/utilblocks.py +0 -597
  62. sdg_hub/checkpointer.py +0 -139
  63. sdg_hub/configs/annotations/cot_reflection.yaml +0 -34
  64. sdg_hub/configs/annotations/detailed_annotations.yaml +0 -28
  65. sdg_hub/configs/annotations/detailed_description.yaml +0 -10
  66. sdg_hub/configs/annotations/detailed_description_icl.yaml +0 -32
  67. sdg_hub/configs/annotations/simple_annotations.yaml +0 -9
  68. sdg_hub/configs/knowledge/__init__.py +0 -0
  69. sdg_hub/configs/knowledge/atomic_facts.yaml +0 -46
  70. sdg_hub/configs/knowledge/auxilary_instructions.yaml +0 -35
  71. sdg_hub/configs/knowledge/detailed_summary.yaml +0 -18
  72. sdg_hub/configs/knowledge/evaluate_faithfulness.yaml +0 -68
  73. sdg_hub/configs/knowledge/evaluate_question.yaml +0 -38
  74. sdg_hub/configs/knowledge/evaluate_relevancy.yaml +0 -84
  75. sdg_hub/configs/knowledge/extractive_summary.yaml +0 -18
  76. sdg_hub/configs/knowledge/generate_code_questions_responses.yaml +0 -39
  77. sdg_hub/configs/knowledge/generate_questions.yaml +0 -82
  78. sdg_hub/configs/knowledge/generate_questions_responses.yaml +0 -56
  79. sdg_hub/configs/knowledge/generate_responses.yaml +0 -86
  80. sdg_hub/configs/knowledge/mcq_generation.yaml +0 -83
  81. sdg_hub/configs/knowledge/router.yaml +0 -12
  82. sdg_hub/configs/knowledge/simple_generate_qa.yaml +0 -34
  83. sdg_hub/configs/reasoning/__init__.py +0 -0
  84. sdg_hub/configs/reasoning/dynamic_cot.yaml +0 -40
  85. sdg_hub/configs/skills/__init__.py +0 -0
  86. sdg_hub/configs/skills/analyzer.yaml +0 -48
  87. sdg_hub/configs/skills/annotation.yaml +0 -36
  88. sdg_hub/configs/skills/contexts.yaml +0 -28
  89. sdg_hub/configs/skills/critic.yaml +0 -60
  90. sdg_hub/configs/skills/evaluate_freeform_pair.yaml +0 -111
  91. sdg_hub/configs/skills/evaluate_freeform_questions.yaml +0 -78
  92. sdg_hub/configs/skills/evaluate_grounded_pair.yaml +0 -119
  93. sdg_hub/configs/skills/evaluate_grounded_questions.yaml +0 -51
  94. sdg_hub/configs/skills/freeform_questions.yaml +0 -34
  95. sdg_hub/configs/skills/freeform_responses.yaml +0 -39
  96. sdg_hub/configs/skills/grounded_questions.yaml +0 -38
  97. sdg_hub/configs/skills/grounded_responses.yaml +0 -59
  98. sdg_hub/configs/skills/icl_examples/STEM.yaml +0 -56
  99. sdg_hub/configs/skills/icl_examples/__init__.py +0 -0
  100. sdg_hub/configs/skills/icl_examples/coding.yaml +0 -97
  101. sdg_hub/configs/skills/icl_examples/extraction.yaml +0 -36
  102. sdg_hub/configs/skills/icl_examples/humanities.yaml +0 -71
  103. sdg_hub/configs/skills/icl_examples/math.yaml +0 -85
  104. sdg_hub/configs/skills/icl_examples/reasoning.yaml +0 -30
  105. sdg_hub/configs/skills/icl_examples/roleplay.yaml +0 -45
  106. sdg_hub/configs/skills/icl_examples/writing.yaml +0 -80
  107. sdg_hub/configs/skills/judge.yaml +0 -53
  108. sdg_hub/configs/skills/planner.yaml +0 -67
  109. sdg_hub/configs/skills/respond.yaml +0 -8
  110. sdg_hub/configs/skills/revised_responder.yaml +0 -78
  111. sdg_hub/configs/skills/router.yaml +0 -59
  112. sdg_hub/configs/skills/simple_generate_qa_freeform.yaml +0 -27
  113. sdg_hub/configs/skills/simple_generate_qa_grounded.yaml +0 -31
  114. sdg_hub/flow.py +0 -477
  115. sdg_hub/flow_runner.py +0 -450
  116. sdg_hub/flows/generation/knowledge/mmlu_bench.yaml +0 -13
  117. sdg_hub/flows/generation/knowledge/simple_knowledge.yaml +0 -12
  118. sdg_hub/flows/generation/knowledge/synth_knowledge.yaml +0 -89
  119. sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +0 -136
  120. sdg_hub/flows/generation/skills/improve_responses.yaml +0 -103
  121. sdg_hub/flows/generation/skills/simple_freeform_skill.yaml +0 -12
  122. sdg_hub/flows/generation/skills/simple_grounded_skill.yaml +0 -12
  123. sdg_hub/flows/generation/skills/synth_grounded_skills.yaml +0 -80
  124. sdg_hub/flows/generation/skills/synth_skills.yaml +0 -59
  125. sdg_hub/pipeline.py +0 -121
  126. sdg_hub/prompts.py +0 -80
  127. sdg_hub/registry.py +0 -122
  128. sdg_hub/sdg.py +0 -206
  129. sdg_hub/utils/config_validation.py +0 -91
  130. sdg_hub/utils/error_handling.py +0 -94
  131. sdg_hub/utils/validation_result.py +0 -10
  132. sdg_hub-0.1.4.dist-info/METADATA +0 -190
  133. sdg_hub-0.1.4.dist-info/RECORD +0 -89
  134. sdg_hub/{logger_config.py → core/utils/logger_config.py} +1 -1
  135. /sdg_hub/{configs/__init__.py → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/README.md} +0 -0
  136. /sdg_hub/{configs/annotations → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab}/__init__.py +0 -0
  137. {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.0.dist-info}/WHEEL +0 -0
  138. {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.0.dist-info}/licenses/LICENSE +0 -0
  139. {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.0.dist-info}/top_level.txt +0 -0
sdg_hub/flow_runner.py DELETED
@@ -1,450 +0,0 @@
1
- """Script for running data generation flows with configurable parameters."""
2
-
3
- # Standard
4
- from importlib import resources
5
- from typing import Optional
6
- import os
7
- import sys
8
- import traceback
9
-
10
- # Third Party
11
- from datasets import load_dataset
12
- from openai import OpenAI
13
- import click
14
- import yaml
15
-
16
- # First Party
17
- from sdg_hub.flow import Flow
18
- from sdg_hub.logger_config import setup_logger
19
- from sdg_hub.sdg import SDG
20
- from sdg_hub.utils.error_handling import (
21
- APIConnectionError,
22
- DataGenerationError,
23
- DataSaveError,
24
- DatasetLoadError,
25
- FlowConfigurationError,
26
- FlowRunnerError,
27
- )
28
- from sdg_hub.utils.path_resolution import resolve_path
29
-
30
- logger = setup_logger(__name__)
31
-
32
-
33
- def run_flow(
34
- ds_path: str,
35
- save_path: str,
36
- endpoint: str,
37
- flow_path: str,
38
- checkpoint_dir: str,
39
- batch_size: int = 8,
40
- num_workers: int = 32,
41
- save_freq: int = 2,
42
- debug: bool = False,
43
- dataset_start_index: int = 0,
44
- dataset_end_index: Optional[int] = None,
45
- api_key: Optional[str] = None,
46
- ) -> None:
47
- """Process the dataset using the specified configuration.
48
-
49
- Parameters
50
- ----------
51
- ds_path : str
52
- Path to the dataset file.
53
- save_path : str
54
- Path where the output will be saved.
55
- endpoint : str
56
- API endpoint for data processing.
57
- flow_path : str
58
- Path to the flow configuration file.
59
- checkpoint_dir : str
60
- Directory path for saving checkpoints.
61
- batch_size : int, optional
62
- Batch size for processing, by default 8.
63
- num_workers : int, optional
64
- Number of worker processes to use, by default 32.
65
- save_freq : int, optional
66
- Frequency (in batches) at which to save checkpoints, by default 2.
67
- debug : bool, optional
68
- If True, enables debug mode with a smaller dataset subset, by default False.
69
- dataset_start_index : int, optional
70
- Start index for dataset slicing, by default 0.
71
- dataset_end_index : Optional[int], optional
72
- End index for dataset slicing, by default None.
73
- api_key : Optional[str], optional
74
- API key for the remote endpoint. If not provided, will use OPENAI_API_KEY environment variable, by default None.
75
-
76
- Returns
77
- -------
78
- None
79
-
80
- Raises
81
- ------
82
- DatasetLoadError
83
- If the dataset cannot be loaded or processed.
84
- FlowConfigurationError
85
- If the flow configuration is invalid or cannot be loaded.
86
- APIConnectionError
87
- If connection to the API endpoint fails.
88
- DataGenerationError
89
- If data generation fails during processing.
90
- DataSaveError
91
- If saving the generated data fails.
92
- """
93
- logger.info(f"Generation configuration: {locals()}\n\n")
94
-
95
- try:
96
- # Load and validate dataset
97
- try:
98
- ds = load_dataset("json", data_files=ds_path, split="train")
99
- logger.info(
100
- f"Successfully loaded dataset from {ds_path} with {len(ds)} rows"
101
- )
102
- except Exception as e:
103
- raise DatasetLoadError(
104
- f"Failed to load dataset from '{ds_path}'. "
105
- f"Please check if the file exists and is a valid JSON file.",
106
- details=str(e),
107
- ) from e
108
-
109
- # Apply dataset slicing if specified
110
- try:
111
- if dataset_start_index is not None and dataset_end_index is not None:
112
- if dataset_start_index >= len(ds) or dataset_end_index > len(ds):
113
- raise DatasetLoadError(
114
- f"Dataset slice indices ({dataset_start_index}, {dataset_end_index}) "
115
- f"are out of bounds for dataset with {len(ds)} rows"
116
- )
117
- if dataset_start_index >= dataset_end_index:
118
- raise DatasetLoadError(
119
- f"Start index ({dataset_start_index}) must be less than end index ({dataset_end_index})"
120
- )
121
- ds = ds.select(range(dataset_start_index, dataset_end_index))
122
- logger.info(
123
- f"Dataset sliced from {dataset_start_index} to {dataset_end_index}"
124
- )
125
-
126
- if debug:
127
- if len(ds) < 30:
128
- logger.warning(
129
- f"Debug mode requested 30 samples but dataset only has {len(ds)} rows"
130
- )
131
- ds = ds.shuffle(seed=42).select(range(min(30, len(ds))))
132
- logger.info(
133
- f"Debug mode enabled. Using {len(ds)} samples from the dataset."
134
- )
135
- except DatasetLoadError:
136
- raise
137
- except Exception as e:
138
- raise DatasetLoadError(
139
- "Failed to process dataset slicing or debug mode.", details=str(e)
140
- ) from e
141
-
142
- # Validate API configuration
143
- openai_api_key = api_key or os.environ.get("OPENAI_API_KEY")
144
- if not openai_api_key or openai_api_key == "EMPTY":
145
- logger.warning("API key not provided and OPENAI_API_KEY not set or is 'EMPTY'. API calls may fail.")
146
-
147
- openai_api_base = endpoint
148
- if not openai_api_base:
149
- raise APIConnectionError("API endpoint cannot be empty")
150
-
151
- # Initialize OpenAI client
152
- try:
153
- client = OpenAI(
154
- api_key=openai_api_key or "EMPTY",
155
- base_url=openai_api_base,
156
- )
157
- # test connection with a model list
158
- models = client.models.list()
159
- logger.info(f"Initialized OpenAI client with endpoint: {openai_api_base}")
160
- logger.info(f"Available models: {[model.id for model in models.data]}")
161
- except Exception as e:
162
- raise APIConnectionError(
163
- f"Failed to initialize OpenAI client with endpoint '{openai_api_base}'. "
164
- f"Please check if the endpoint is valid and accessible.",
165
- details=str(e),
166
- ) from e
167
-
168
- # Load and validate flow configuration
169
- try:
170
- base_path = str(resources.files(__package__))
171
- flow_path = resolve_path(flow_path, [".", base_path])
172
- if not os.path.exists(flow_path):
173
- raise FlowConfigurationError(
174
- f"Flow configuration file not found: {flow_path}"
175
- )
176
-
177
- # Validate flow file is readable YAML
178
- try:
179
- with open(flow_path, "r", encoding="utf-8") as f:
180
- flow_config = yaml.safe_load(f)
181
- if not flow_config:
182
- raise FlowConfigurationError(
183
- f"Flow configuration file is empty: {flow_path}"
184
- )
185
- logger.info(f"Successfully loaded flow configuration from {flow_path}")
186
- except yaml.YAMLError as e:
187
- raise FlowConfigurationError(
188
- f"Flow configuration file '{flow_path}' contains invalid YAML.",
189
- details=str(e),
190
- ) from e
191
- except Exception as e:
192
- raise FlowConfigurationError(
193
- f"Failed to read flow configuration file '{flow_path}'.",
194
- details=str(e),
195
- ) from e
196
-
197
- flow = Flow(client).get_flow_from_file(flow_path)
198
- logger.info("Successfully initialized flow from configuration")
199
- except FlowConfigurationError:
200
- raise
201
- except Exception as e:
202
- raise FlowConfigurationError(
203
- f"Failed to create flow from configuration file '{flow_path}'. "
204
- f"Please check the flow configuration format and block definitions.",
205
- details=str(e),
206
- ) from e
207
-
208
- # Initialize SDG and generate data
209
- try:
210
- sdg = SDG(
211
- flows=[flow],
212
- num_workers=num_workers,
213
- batch_size=batch_size,
214
- save_freq=save_freq,
215
- )
216
- logger.info(
217
- f"Initialized SDG with {num_workers} workers, batch size {batch_size}"
218
- )
219
-
220
- # Ensure checkpoint directory exists if specified
221
- if checkpoint_dir and not os.path.exists(checkpoint_dir):
222
- os.makedirs(checkpoint_dir, exist_ok=True)
223
- logger.info(f"Created checkpoint directory: {checkpoint_dir}")
224
-
225
- generated_data = sdg.generate(ds, checkpoint_dir=checkpoint_dir)
226
-
227
- if generated_data is None or len(generated_data) == 0:
228
- raise DataGenerationError(
229
- "Data generation completed but no data was generated. "
230
- "This may indicate issues with the flow configuration or input data."
231
- )
232
-
233
- logger.info(f"Successfully generated {len(generated_data)} rows of data")
234
-
235
- except Exception as e:
236
- if isinstance(e, DataGenerationError):
237
- raise
238
- raise DataGenerationError(
239
- "Data generation failed during processing. This could be due to:"
240
- "\n- API connection issues with the endpoint"
241
- "\n- Invalid flow configuration or block parameters"
242
- "\n- Insufficient system resources (try reducing batch_size or num_workers)"
243
- "\n- Input data format incompatibility",
244
- details=f"Endpoint: {openai_api_base}, Error: {e}",
245
- ) from e
246
-
247
- # Save generated data
248
- try:
249
- # Adjust save path for dataset slicing
250
- final_save_path = save_path
251
- if dataset_end_index is not None and dataset_start_index is not None:
252
- final_save_path = save_path.replace(
253
- ".jsonl", f"_{dataset_start_index}_{dataset_end_index}.jsonl"
254
- )
255
-
256
- # Ensure save directory exists
257
- save_dir = os.path.dirname(final_save_path)
258
- if save_dir and not os.path.exists(save_dir):
259
- os.makedirs(save_dir, exist_ok=True)
260
- logger.info(f"Created save directory: {save_dir}")
261
-
262
- generated_data.to_json(final_save_path, orient="records", lines=True)
263
- logger.info(f"Data successfully saved to {final_save_path}")
264
-
265
- except Exception as e:
266
- raise DataSaveError(
267
- f"Failed to save generated data to '{final_save_path}'. "
268
- f"Please check write permissions and disk space.",
269
- details=str(e),
270
- ) from e
271
-
272
- except (
273
- DatasetLoadError,
274
- FlowConfigurationError,
275
- APIConnectionError,
276
- DataGenerationError,
277
- DataSaveError,
278
- ):
279
- # Re-raise our custom exceptions with their detailed messages
280
- raise
281
- except Exception as e:
282
- # Catch any unexpected errors
283
- logger.error(f"Unexpected error during flow execution: {e}")
284
- logger.error(f"Traceback: {traceback.format_exc()}")
285
- raise FlowRunnerError(
286
- "An unexpected error occurred during flow execution. "
287
- "Please check the logs for more details.",
288
- details=str(e),
289
- ) from e
290
-
291
-
292
- @click.command()
293
- @click.option(
294
- "--ds_path",
295
- type=click.Path(exists=True),
296
- required=True,
297
- help="Path to the dataset.",
298
- )
299
- @click.option(
300
- "--bs",
301
- type=int,
302
- default=8,
303
- show_default=True,
304
- help="Batch size for processing.",
305
- )
306
- @click.option(
307
- "--num_workers",
308
- type=int,
309
- default=32,
310
- show_default=True,
311
- help="Number of worker processes to use.",
312
- )
313
- @click.option(
314
- "--save_path",
315
- type=click.Path(),
316
- required=True,
317
- help="Path to save the output.",
318
- )
319
- @click.option(
320
- "--endpoint",
321
- type=str,
322
- required=True,
323
- help="API endpoint for data processing.",
324
- )
325
- @click.option(
326
- "--flow",
327
- type=click.Path(exists=True),
328
- required=True,
329
- help="Flow configuration for the process.",
330
- )
331
- @click.option(
332
- "--checkpoint_dir",
333
- type=click.Path(),
334
- required=True,
335
- help="Path to save checkpoints.",
336
- )
337
- @click.option(
338
- "--save_freq",
339
- type=int,
340
- default=2,
341
- show_default=True,
342
- help="Frequency to save checkpoints.",
343
- )
344
- @click.option(
345
- "--debug",
346
- is_flag=True,
347
- help="Enable debug mode with a smaller dataset subset.",
348
- )
349
- @click.option(
350
- "--dataset_start_index", type=int, default=0, help="Start index of the dataset."
351
- )
352
- @click.option(
353
- "--dataset_end_index", type=int, default=None, help="End index of the dataset."
354
- )
355
- @click.option(
356
- "--api_key",
357
- type=str,
358
- default=None,
359
- help="API key for the remote endpoint. If not provided, will use OPENAI_API_KEY environment variable.",
360
- )
361
- def main(
362
- ds_path: str,
363
- bs: int,
364
- num_workers: int,
365
- save_path: str,
366
- endpoint: str,
367
- flow: str,
368
- checkpoint_dir: str,
369
- save_freq: int,
370
- debug: bool,
371
- dataset_start_index: int,
372
- dataset_end_index: Optional[int],
373
- api_key: Optional[str],
374
- ) -> None:
375
- """CLI entry point for running data generation flows.
376
-
377
- Parameters
378
- ----------
379
- ds_path : str
380
- Path to the dataset file.
381
- bs : int
382
- Batch size for processing.
383
- num_workers : int
384
- Number of worker processes to use.
385
- save_path : str
386
- Path where the output will be saved.
387
- endpoint : str
388
- API endpoint for data processing.
389
- flow : str
390
- Path to the flow configuration file.
391
- checkpoint_dir : str
392
- Directory path for saving checkpoints.
393
- save_freq : int
394
- Frequency (in batches) at which to save checkpoints.
395
- debug : bool
396
- If True, enables debug mode with a smaller dataset subset.
397
- dataset_start_index : int
398
- Start index for dataset slicing.
399
- dataset_end_index : Optional[int]
400
- End index for dataset slicing.
401
- api_key : Optional[str]
402
- API key for the remote endpoint. If not provided, will use OPENAI_API_KEY environment variable.
403
-
404
- Returns
405
- -------
406
- None
407
- """
408
- try:
409
- run_flow(
410
- ds_path=ds_path,
411
- batch_size=bs,
412
- num_workers=num_workers,
413
- save_path=save_path,
414
- endpoint=endpoint,
415
- flow_path=flow,
416
- checkpoint_dir=checkpoint_dir,
417
- save_freq=save_freq,
418
- debug=debug,
419
- dataset_start_index=dataset_start_index,
420
- dataset_end_index=dataset_end_index,
421
- api_key=api_key,
422
- )
423
- except (
424
- DatasetLoadError,
425
- FlowConfigurationError,
426
- APIConnectionError,
427
- DataGenerationError,
428
- DataSaveError,
429
- FlowRunnerError,
430
- ) as e:
431
- logger.error(f"Flow execution failed: {e}")
432
- click.echo(f"Error: {e}", err=True)
433
- sys.exit(1)
434
- except KeyboardInterrupt:
435
- logger.info("Flow execution interrupted by user")
436
- click.echo("Flow execution interrupted by user", err=True)
437
- sys.exit(130) # Standard exit code for SIGINT
438
- except Exception as e:
439
- logger.error(f"Unexpected error: {e}")
440
- logger.error(f"Traceback: {traceback.format_exc()}")
441
- click.echo(
442
- f"Unexpected error occurred. Please check the logs for details. Error: {e}",
443
- err=True,
444
- )
445
- sys.exit(1)
446
-
447
-
448
- if __name__ == "__main__":
449
- # pylint: disable=no-value-for-parameter
450
- main()
sdg_hub/flows/generation/knowledge/mmlu_bench.yaml DELETED
@@ -1,13 +0,0 @@
1
- - block_type: LLMBlock
2
- block_config:
3
- block_name: gen_mmlu_knowledge
4
- config_path: configs/knowledge/mcq_generation.yaml
5
- model_id: meta-llama/Llama-3.3-70B-Instruct
6
- output_cols:
7
- - mmlubench_question
8
- - mmlubench_answer
9
- gen_kwargs:
10
- temperature: 0
11
- max_tokens: 2048
12
- drop_duplicates:
13
- - mmlubench_question
sdg_hub/flows/generation/knowledge/simple_knowledge.yaml DELETED
@@ -1,12 +0,0 @@
1
- - block_type: LLMBlock
2
- block_config:
3
- block_name: gen_knowledge
4
- config_path: configs/knowledge/simple_generate_qa.yaml
5
- model_id: meta-llama/Llama-3.3-70B-Instruct
6
- output_cols:
7
- - output
8
- gen_kwargs:
9
- temperature: 0.7
10
- max_tokens: 2048
11
- drop_duplicates:
12
- - output
sdg_hub/flows/generation/knowledge/synth_knowledge.yaml DELETED
@@ -1,89 +0,0 @@
1
- - block_type: LLMBlock
2
- block_config:
3
- block_name: gen_knowledge
4
- config_path: configs/knowledge/generate_questions_responses.yaml
5
- model_id: meta-llama/Llama-3.3-70B-Instruct
6
- output_cols:
7
- - question
8
- - response
9
- parser_kwargs:
10
- parser_name: custom
11
- parsing_pattern: "\\[(?:Question|QUESTION)\\]\\s*(.*?)\\s*\\[(?:Answer|ANSWER)\\]\\s*(.*?)\\s*(?=\\[(?:Question|QUESTION)\\]|$)"
12
- parser_cleanup_tags:
13
- - "[END]"
14
- gen_kwargs:
15
- max_tokens: 2048
16
- drop_duplicates:
17
- - question
18
-
19
- - block_type: LLMBlock
20
- block_config:
21
- block_name: eval_faithfulness_qa_pair
22
- config_path: configs/knowledge/evaluate_faithfulness.yaml
23
- model_id: meta-llama/Llama-3.3-70B-Instruct
24
- output_cols:
25
- - explanation
26
- - judgment
27
- gen_kwargs:
28
- max_tokens: 2048
29
-
30
- - block_type: FilterByValueBlock
31
- block_config:
32
- block_name: filter_faithfulness
33
- filter_column: judgment
34
- filter_value: "YES"
35
- operation: operator.eq
36
- batch_kwargs:
37
- num_procs: 8
38
- drop_columns:
39
- - judgment
40
- - explanation
41
-
42
- - block_type: LLMBlock
43
- block_config:
44
- block_name: eval_relevancy_qa_pair
45
- config_path: configs/knowledge/evaluate_relevancy.yaml
46
- model_id: meta-llama/Llama-3.3-70B-Instruct
47
- output_cols:
48
- - feedback
49
- - score
50
- gen_kwargs:
51
- max_tokens: 2048
52
-
53
- - block_type: FilterByValueBlock
54
- block_config:
55
- block_name: filter_relevancy
56
- filter_column: score
57
- filter_value: 2.0
58
- operation: operator.eq
59
- convert_dtype: float
60
- batch_kwargs:
61
- num_procs: 8
62
- drop_columns:
63
- - feedback
64
- - score
65
-
66
- - block_type: LLMBlock
67
- block_config:
68
- block_name: eval_verify_question
69
- config_path: configs/knowledge/evaluate_question.yaml
70
- model_id: meta-llama/Llama-3.3-70B-Instruct
71
- output_cols:
72
- - explanation
73
- - rating
74
- gen_kwargs:
75
- max_tokens: 2048
76
-
77
- - block_type: FilterByValueBlock
78
- block_config:
79
- block_name: filter_verify_question
80
- filter_column: rating
81
- filter_value: 1.0
82
- operation: operator.eq
83
- convert_dtype: float
84
- batch_kwargs:
85
- num_procs: 8
86
- drop_columns:
87
- - explanation
88
- - rating
89
- - __index_level_0__
sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml DELETED
@@ -1,136 +0,0 @@
1
- - block_type: DuplicateColumns
2
- block_config:
3
- block_name: duplicate_document_col
4
- columns_map:
5
- document: base_document
6
-
7
- - block_type: LLMBlock
8
- block_config:
9
- block_name: gen_detailed_summary
10
- config_path: configs/knowledge/detailed_summary.yaml
11
- model_id: meta-llama/Llama-3.3-70B-Instruct
12
- output_cols:
13
- - summary_detailed
14
- gen_kwargs:
15
- max_tokens: 2048
16
-
17
- - block_type: LLMBlock
18
- block_config:
19
- block_name: gen_atomic_facts
20
- config_path: configs/knowledge/atomic_facts.yaml
21
- model_id: meta-llama/Llama-3.3-70B-Instruct
22
- output_cols:
23
- - summary_atomic_facts
24
- gen_kwargs:
25
- max_tokens: 2048
26
-
27
- - block_type: LLMBlock
28
- block_config:
29
- block_name: gen_extractive_summary
30
- config_path: configs/knowledge/extractive_summary.yaml
31
- model_id: meta-llama/Llama-3.3-70B-Instruct
32
- output_cols:
33
- - summary_extractive
34
- gen_kwargs:
35
- max_tokens: 2048
36
-
37
- - block_type: FlattenColumnsBlock
38
- block_config:
39
- block_name: flatten_summary_columns
40
- var_cols:
41
- - summary_detailed
42
- - summary_extractive
43
- - summary_atomic_facts
44
- - base_document
45
- value_name: summary
46
- var_name: dataset_type
47
-
48
- - block_type: RenameColumns
49
- block_config:
50
- block_name: rename_to_document_column
51
- columns_map:
52
- document: raw_document
53
- summary: document
54
-
55
- - block_type: LLMBlock
56
- block_config:
57
- block_name: knowledge generation
58
- config_path: configs/knowledge/generate_questions_responses.yaml
59
- model_id: meta-llama/Llama-3.3-70B-Instruct
60
- output_cols:
61
- - question
62
- - response
63
- parser_kwargs:
64
- parser_name: custom
65
- parsing_pattern: "\\[(?:Question|QUESTION)\\]\\s*(.*?)\\s*\\[(?:Answer|ANSWER)\\]\\s*(.*?)\\s*(?=\\[(?:Question|QUESTION)\\]|$)"
66
- parser_cleanup_tags:
67
- - "[END]"
68
- gen_kwargs:
69
- temperature: 0.0
70
- max_tokens: 2048
71
-
72
- - block_type: LLMBlock
73
- block_config:
74
- block_name: eval_faithfulness_qa_pair
75
- config_path: configs/knowledge/evaluate_faithfulness.yaml
76
- model_id: meta-llama/Llama-3.3-70B-Instruct
77
- output_cols:
78
- - explanation
79
- - judgment
80
- gen_kwargs:
81
- max_tokens: 2048
82
-
83
- - block_type: FilterByValueBlock
84
- block_config:
85
- block_name: filter_faithfulness
86
- filter_column: judgment
87
- filter_value: "YES"
88
- operation: operator.eq
89
- drop_columns:
90
- - judgment
91
- - explanation
92
-
93
- - block_type: LLMBlock
94
- block_config:
95
- block_name: eval_relevancy_qa_pair
96
- config_path: configs/knowledge/evaluate_relevancy.yaml
97
- model_id: meta-llama/Llama-3.3-70B-Instruct
98
- output_cols:
99
- - feedback
100
- - score
101
- gen_kwargs:
102
- max_tokens: 2048
103
-
104
- - block_type: FilterByValueBlock
105
- block_config:
106
- block_name: filter_relevancy
107
- filter_column: score
108
- filter_value: 2.0
109
- operation: operator.eq
110
- convert_dtype: float
111
- drop_columns:
112
- - feedback
113
- - score
114
-
115
- - block_type: LLMBlock
116
- block_config:
117
- block_name: eval_verify_question
118
- config_path: configs/knowledge/evaluate_question.yaml
119
- model_id: meta-llama/Llama-3.3-70B-Instruct
120
- output_cols:
121
- - explanation
122
- - rating
123
- gen_kwargs:
124
- max_tokens: 2048
125
-
126
- - block_type: FilterByValueBlock
127
- block_config:
128
- block_name: filter_verify_question
129
- filter_column: rating
130
- filter_value: 1.0
131
- operation: operator.eq
132
- convert_dtype: float
133
- drop_columns:
134
- - explanation
135
- - rating
136
- - __index_level_0__