sdg-hub 0.1.3__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139)
  1. sdg_hub/__init__.py +28 -1
  2. sdg_hub/_version.py +2 -2
  3. sdg_hub/core/__init__.py +22 -0
  4. sdg_hub/core/blocks/__init__.py +58 -0
  5. sdg_hub/core/blocks/base.py +313 -0
  6. sdg_hub/core/blocks/deprecated_blocks/__init__.py +29 -0
  7. sdg_hub/core/blocks/deprecated_blocks/combine_columns.py +93 -0
  8. sdg_hub/core/blocks/deprecated_blocks/duplicate_columns.py +88 -0
  9. sdg_hub/core/blocks/deprecated_blocks/filter_by_value.py +103 -0
  10. sdg_hub/core/blocks/deprecated_blocks/flatten_columns.py +94 -0
  11. sdg_hub/core/blocks/deprecated_blocks/llmblock.py +479 -0
  12. sdg_hub/core/blocks/deprecated_blocks/rename_columns.py +88 -0
  13. sdg_hub/core/blocks/deprecated_blocks/sample_populator.py +58 -0
  14. sdg_hub/core/blocks/deprecated_blocks/selector.py +97 -0
  15. sdg_hub/core/blocks/deprecated_blocks/set_to_majority_value.py +88 -0
  16. sdg_hub/core/blocks/evaluation/__init__.py +9 -0
  17. sdg_hub/core/blocks/evaluation/evaluate_faithfulness_block.py +564 -0
  18. sdg_hub/core/blocks/evaluation/evaluate_relevancy_block.py +564 -0
  19. sdg_hub/core/blocks/evaluation/verify_question_block.py +564 -0
  20. sdg_hub/core/blocks/filtering/__init__.py +12 -0
  21. sdg_hub/core/blocks/filtering/column_value_filter.py +188 -0
  22. sdg_hub/core/blocks/llm/__init__.py +25 -0
  23. sdg_hub/core/blocks/llm/client_manager.py +398 -0
  24. sdg_hub/core/blocks/llm/config.py +336 -0
  25. sdg_hub/core/blocks/llm/error_handler.py +368 -0
  26. sdg_hub/core/blocks/llm/llm_chat_block.py +542 -0
  27. sdg_hub/core/blocks/llm/prompt_builder_block.py +368 -0
  28. sdg_hub/core/blocks/llm/text_parser_block.py +310 -0
  29. sdg_hub/core/blocks/registry.py +331 -0
  30. sdg_hub/core/blocks/transform/__init__.py +23 -0
  31. sdg_hub/core/blocks/transform/duplicate_columns.py +88 -0
  32. sdg_hub/core/blocks/transform/index_based_mapper.py +225 -0
  33. sdg_hub/core/blocks/transform/melt_columns.py +126 -0
  34. sdg_hub/core/blocks/transform/rename_columns.py +69 -0
  35. sdg_hub/core/blocks/transform/text_concat.py +102 -0
  36. sdg_hub/core/blocks/transform/uniform_col_val_setter.py +101 -0
  37. sdg_hub/core/flow/__init__.py +20 -0
  38. sdg_hub/core/flow/base.py +980 -0
  39. sdg_hub/core/flow/metadata.py +344 -0
  40. sdg_hub/core/flow/migration.py +187 -0
  41. sdg_hub/core/flow/registry.py +330 -0
  42. sdg_hub/core/flow/validation.py +265 -0
  43. sdg_hub/{utils → core/utils}/__init__.py +6 -4
  44. sdg_hub/{utils → core/utils}/datautils.py +1 -3
  45. sdg_hub/core/utils/error_handling.py +208 -0
  46. sdg_hub/{utils → core/utils}/path_resolution.py +2 -2
  47. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/atomic_facts.yaml +40 -0
  48. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/detailed_summary.yaml +13 -0
  49. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_faithfulness.yaml +64 -0
  50. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_question.yaml +29 -0
  51. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_relevancy.yaml +81 -0
  52. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/extractive_summary.yaml +13 -0
  53. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +191 -0
  54. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/generate_questions_responses.yaml +54 -0
  55. sdg_hub-0.2.0.dist-info/METADATA +218 -0
  56. sdg_hub-0.2.0.dist-info/RECORD +63 -0
  57. sdg_hub/blocks/__init__.py +0 -42
  58. sdg_hub/blocks/block.py +0 -96
  59. sdg_hub/blocks/llmblock.py +0 -375
  60. sdg_hub/blocks/openaichatblock.py +0 -556
  61. sdg_hub/blocks/utilblocks.py +0 -597
  62. sdg_hub/checkpointer.py +0 -139
  63. sdg_hub/configs/annotations/cot_reflection.yaml +0 -34
  64. sdg_hub/configs/annotations/detailed_annotations.yaml +0 -28
  65. sdg_hub/configs/annotations/detailed_description.yaml +0 -10
  66. sdg_hub/configs/annotations/detailed_description_icl.yaml +0 -32
  67. sdg_hub/configs/annotations/simple_annotations.yaml +0 -9
  68. sdg_hub/configs/knowledge/__init__.py +0 -0
  69. sdg_hub/configs/knowledge/atomic_facts.yaml +0 -46
  70. sdg_hub/configs/knowledge/auxilary_instructions.yaml +0 -35
  71. sdg_hub/configs/knowledge/detailed_summary.yaml +0 -18
  72. sdg_hub/configs/knowledge/evaluate_faithfulness.yaml +0 -68
  73. sdg_hub/configs/knowledge/evaluate_question.yaml +0 -38
  74. sdg_hub/configs/knowledge/evaluate_relevancy.yaml +0 -84
  75. sdg_hub/configs/knowledge/extractive_summary.yaml +0 -18
  76. sdg_hub/configs/knowledge/generate_code_questions_responses.yaml +0 -39
  77. sdg_hub/configs/knowledge/generate_questions.yaml +0 -82
  78. sdg_hub/configs/knowledge/generate_questions_responses.yaml +0 -56
  79. sdg_hub/configs/knowledge/generate_responses.yaml +0 -86
  80. sdg_hub/configs/knowledge/mcq_generation.yaml +0 -83
  81. sdg_hub/configs/knowledge/router.yaml +0 -12
  82. sdg_hub/configs/knowledge/simple_generate_qa.yaml +0 -34
  83. sdg_hub/configs/reasoning/__init__.py +0 -0
  84. sdg_hub/configs/reasoning/dynamic_cot.yaml +0 -40
  85. sdg_hub/configs/skills/__init__.py +0 -0
  86. sdg_hub/configs/skills/analyzer.yaml +0 -48
  87. sdg_hub/configs/skills/annotation.yaml +0 -36
  88. sdg_hub/configs/skills/contexts.yaml +0 -28
  89. sdg_hub/configs/skills/critic.yaml +0 -60
  90. sdg_hub/configs/skills/evaluate_freeform_pair.yaml +0 -111
  91. sdg_hub/configs/skills/evaluate_freeform_questions.yaml +0 -78
  92. sdg_hub/configs/skills/evaluate_grounded_pair.yaml +0 -119
  93. sdg_hub/configs/skills/evaluate_grounded_questions.yaml +0 -51
  94. sdg_hub/configs/skills/freeform_questions.yaml +0 -34
  95. sdg_hub/configs/skills/freeform_responses.yaml +0 -39
  96. sdg_hub/configs/skills/grounded_questions.yaml +0 -38
  97. sdg_hub/configs/skills/grounded_responses.yaml +0 -59
  98. sdg_hub/configs/skills/icl_examples/STEM.yaml +0 -56
  99. sdg_hub/configs/skills/icl_examples/__init__.py +0 -0
  100. sdg_hub/configs/skills/icl_examples/coding.yaml +0 -97
  101. sdg_hub/configs/skills/icl_examples/extraction.yaml +0 -36
  102. sdg_hub/configs/skills/icl_examples/humanities.yaml +0 -71
  103. sdg_hub/configs/skills/icl_examples/math.yaml +0 -85
  104. sdg_hub/configs/skills/icl_examples/reasoning.yaml +0 -30
  105. sdg_hub/configs/skills/icl_examples/roleplay.yaml +0 -45
  106. sdg_hub/configs/skills/icl_examples/writing.yaml +0 -80
  107. sdg_hub/configs/skills/judge.yaml +0 -53
  108. sdg_hub/configs/skills/planner.yaml +0 -67
  109. sdg_hub/configs/skills/respond.yaml +0 -8
  110. sdg_hub/configs/skills/revised_responder.yaml +0 -78
  111. sdg_hub/configs/skills/router.yaml +0 -59
  112. sdg_hub/configs/skills/simple_generate_qa_freeform.yaml +0 -27
  113. sdg_hub/configs/skills/simple_generate_qa_grounded.yaml +0 -31
  114. sdg_hub/flow.py +0 -477
  115. sdg_hub/flow_runner.py +0 -450
  116. sdg_hub/flows/generation/knowledge/mmlu_bench.yaml +0 -13
  117. sdg_hub/flows/generation/knowledge/simple_knowledge.yaml +0 -12
  118. sdg_hub/flows/generation/knowledge/synth_knowledge.yaml +0 -89
  119. sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +0 -148
  120. sdg_hub/flows/generation/skills/improve_responses.yaml +0 -103
  121. sdg_hub/flows/generation/skills/simple_freeform_skill.yaml +0 -12
  122. sdg_hub/flows/generation/skills/simple_grounded_skill.yaml +0 -12
  123. sdg_hub/flows/generation/skills/synth_grounded_skills.yaml +0 -80
  124. sdg_hub/flows/generation/skills/synth_skills.yaml +0 -59
  125. sdg_hub/pipeline.py +0 -121
  126. sdg_hub/prompts.py +0 -74
  127. sdg_hub/registry.py +0 -122
  128. sdg_hub/sdg.py +0 -206
  129. sdg_hub/utils/config_validation.py +0 -91
  130. sdg_hub/utils/error_handling.py +0 -94
  131. sdg_hub/utils/validation_result.py +0 -10
  132. sdg_hub-0.1.3.dist-info/METADATA +0 -190
  133. sdg_hub-0.1.3.dist-info/RECORD +0 -89
  134. sdg_hub/{logger_config.py → core/utils/logger_config.py} +1 -1
  135. /sdg_hub/{configs/__init__.py → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/README.md} +0 -0
  136. /sdg_hub/{configs/annotations → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab}/__init__.py +0 -0
  137. {sdg_hub-0.1.3.dist-info → sdg_hub-0.2.0.dist-info}/WHEEL +0 -0
  138. {sdg_hub-0.1.3.dist-info → sdg_hub-0.2.0.dist-info}/licenses/LICENSE +0 -0
  139. {sdg_hub-0.1.3.dist-info → sdg_hub-0.2.0.dist-info}/top_level.txt +0 -0
sdg_hub/flow_runner.py DELETED
@@ -1,450 +0,0 @@
1
- """Script for running data generation flows with configurable parameters."""
2
-
3
- # Standard
4
- from importlib import resources
5
- from typing import Optional
6
- import os
7
- import sys
8
- import traceback
9
-
10
- # Third Party
11
- from datasets import load_dataset
12
- from openai import OpenAI
13
- import click
14
- import yaml
15
-
16
- # First Party
17
- from sdg_hub.flow import Flow
18
- from sdg_hub.logger_config import setup_logger
19
- from sdg_hub.sdg import SDG
20
- from sdg_hub.utils.error_handling import (
21
- APIConnectionError,
22
- DataGenerationError,
23
- DataSaveError,
24
- DatasetLoadError,
25
- FlowConfigurationError,
26
- FlowRunnerError,
27
- )
28
- from sdg_hub.utils.path_resolution import resolve_path
29
-
30
- logger = setup_logger(__name__)
31
-
32
-
33
- def run_flow(
34
- ds_path: str,
35
- save_path: str,
36
- endpoint: str,
37
- flow_path: str,
38
- checkpoint_dir: str,
39
- batch_size: int = 8,
40
- num_workers: int = 32,
41
- save_freq: int = 2,
42
- debug: bool = False,
43
- dataset_start_index: int = 0,
44
- dataset_end_index: Optional[int] = None,
45
- api_key: Optional[str] = None,
46
- ) -> None:
47
- """Process the dataset using the specified configuration.
48
-
49
- Parameters
50
- ----------
51
- ds_path : str
52
- Path to the dataset file.
53
- save_path : str
54
- Path where the output will be saved.
55
- endpoint : str
56
- API endpoint for data processing.
57
- flow_path : str
58
- Path to the flow configuration file.
59
- checkpoint_dir : str
60
- Directory path for saving checkpoints.
61
- batch_size : int, optional
62
- Batch size for processing, by default 8.
63
- num_workers : int, optional
64
- Number of worker processes to use, by default 32.
65
- save_freq : int, optional
66
- Frequency (in batches) at which to save checkpoints, by default 2.
67
- debug : bool, optional
68
- If True, enables debug mode with a smaller dataset subset, by default False.
69
- dataset_start_index : int, optional
70
- Start index for dataset slicing, by default 0.
71
- dataset_end_index : Optional[int], optional
72
- End index for dataset slicing, by default None.
73
- api_key : Optional[str], optional
74
- API key for the remote endpoint. If not provided, will use OPENAI_API_KEY environment variable, by default None.
75
-
76
- Returns
77
- -------
78
- None
79
-
80
- Raises
81
- ------
82
- DatasetLoadError
83
- If the dataset cannot be loaded or processed.
84
- FlowConfigurationError
85
- If the flow configuration is invalid or cannot be loaded.
86
- APIConnectionError
87
- If connection to the API endpoint fails.
88
- DataGenerationError
89
- If data generation fails during processing.
90
- DataSaveError
91
- If saving the generated data fails.
92
- """
93
- logger.info(f"Generation configuration: {locals()}\n\n")
94
-
95
- try:
96
- # Load and validate dataset
97
- try:
98
- ds = load_dataset("json", data_files=ds_path, split="train")
99
- logger.info(
100
- f"Successfully loaded dataset from {ds_path} with {len(ds)} rows"
101
- )
102
- except Exception as e:
103
- raise DatasetLoadError(
104
- f"Failed to load dataset from '{ds_path}'. "
105
- f"Please check if the file exists and is a valid JSON file.",
106
- details=str(e),
107
- ) from e
108
-
109
- # Apply dataset slicing if specified
110
- try:
111
- if dataset_start_index is not None and dataset_end_index is not None:
112
- if dataset_start_index >= len(ds) or dataset_end_index > len(ds):
113
- raise DatasetLoadError(
114
- f"Dataset slice indices ({dataset_start_index}, {dataset_end_index}) "
115
- f"are out of bounds for dataset with {len(ds)} rows"
116
- )
117
- if dataset_start_index >= dataset_end_index:
118
- raise DatasetLoadError(
119
- f"Start index ({dataset_start_index}) must be less than end index ({dataset_end_index})"
120
- )
121
- ds = ds.select(range(dataset_start_index, dataset_end_index))
122
- logger.info(
123
- f"Dataset sliced from {dataset_start_index} to {dataset_end_index}"
124
- )
125
-
126
- if debug:
127
- if len(ds) < 30:
128
- logger.warning(
129
- f"Debug mode requested 30 samples but dataset only has {len(ds)} rows"
130
- )
131
- ds = ds.shuffle(seed=42).select(range(min(30, len(ds))))
132
- logger.info(
133
- f"Debug mode enabled. Using {len(ds)} samples from the dataset."
134
- )
135
- except DatasetLoadError:
136
- raise
137
- except Exception as e:
138
- raise DatasetLoadError(
139
- "Failed to process dataset slicing or debug mode.", details=str(e)
140
- ) from e
141
-
142
- # Validate API configuration
143
- openai_api_key = api_key or os.environ.get("OPENAI_API_KEY")
144
- if not openai_api_key or openai_api_key == "EMPTY":
145
- logger.warning("API key not provided and OPENAI_API_KEY not set or is 'EMPTY'. API calls may fail.")
146
-
147
- openai_api_base = endpoint
148
- if not openai_api_base:
149
- raise APIConnectionError("API endpoint cannot be empty")
150
-
151
- # Initialize OpenAI client
152
- try:
153
- client = OpenAI(
154
- api_key=openai_api_key or "EMPTY",
155
- base_url=openai_api_base,
156
- )
157
- # test connection with a model list
158
- models = client.models.list()
159
- logger.info(f"Initialized OpenAI client with endpoint: {openai_api_base}")
160
- logger.info(f"Available models: {[model.id for model in models.data]}")
161
- except Exception as e:
162
- raise APIConnectionError(
163
- f"Failed to initialize OpenAI client with endpoint '{openai_api_base}'. "
164
- f"Please check if the endpoint is valid and accessible.",
165
- details=str(e),
166
- ) from e
167
-
168
- # Load and validate flow configuration
169
- try:
170
- base_path = str(resources.files(__package__))
171
- flow_path = resolve_path(flow_path, [".", base_path])
172
- if not os.path.exists(flow_path):
173
- raise FlowConfigurationError(
174
- f"Flow configuration file not found: {flow_path}"
175
- )
176
-
177
- # Validate flow file is readable YAML
178
- try:
179
- with open(flow_path, "r", encoding="utf-8") as f:
180
- flow_config = yaml.safe_load(f)
181
- if not flow_config:
182
- raise FlowConfigurationError(
183
- f"Flow configuration file is empty: {flow_path}"
184
- )
185
- logger.info(f"Successfully loaded flow configuration from {flow_path}")
186
- except yaml.YAMLError as e:
187
- raise FlowConfigurationError(
188
- f"Flow configuration file '{flow_path}' contains invalid YAML.",
189
- details=str(e),
190
- ) from e
191
- except Exception as e:
192
- raise FlowConfigurationError(
193
- f"Failed to read flow configuration file '{flow_path}'.",
194
- details=str(e),
195
- ) from e
196
-
197
- flow = Flow(client).get_flow_from_file(flow_path)
198
- logger.info("Successfully initialized flow from configuration")
199
- except FlowConfigurationError:
200
- raise
201
- except Exception as e:
202
- raise FlowConfigurationError(
203
- f"Failed to create flow from configuration file '{flow_path}'. "
204
- f"Please check the flow configuration format and block definitions.",
205
- details=str(e),
206
- ) from e
207
-
208
- # Initialize SDG and generate data
209
- try:
210
- sdg = SDG(
211
- flows=[flow],
212
- num_workers=num_workers,
213
- batch_size=batch_size,
214
- save_freq=save_freq,
215
- )
216
- logger.info(
217
- f"Initialized SDG with {num_workers} workers, batch size {batch_size}"
218
- )
219
-
220
- # Ensure checkpoint directory exists if specified
221
- if checkpoint_dir and not os.path.exists(checkpoint_dir):
222
- os.makedirs(checkpoint_dir, exist_ok=True)
223
- logger.info(f"Created checkpoint directory: {checkpoint_dir}")
224
-
225
- generated_data = sdg.generate(ds, checkpoint_dir=checkpoint_dir)
226
-
227
- if generated_data is None or len(generated_data) == 0:
228
- raise DataGenerationError(
229
- "Data generation completed but no data was generated. "
230
- "This may indicate issues with the flow configuration or input data."
231
- )
232
-
233
- logger.info(f"Successfully generated {len(generated_data)} rows of data")
234
-
235
- except Exception as e:
236
- if isinstance(e, DataGenerationError):
237
- raise
238
- raise DataGenerationError(
239
- "Data generation failed during processing. This could be due to:"
240
- "\n- API connection issues with the endpoint"
241
- "\n- Invalid flow configuration or block parameters"
242
- "\n- Insufficient system resources (try reducing batch_size or num_workers)"
243
- "\n- Input data format incompatibility",
244
- details=f"Endpoint: {openai_api_base}, Error: {e}",
245
- ) from e
246
-
247
- # Save generated data
248
- try:
249
- # Adjust save path for dataset slicing
250
- final_save_path = save_path
251
- if dataset_end_index is not None and dataset_start_index is not None:
252
- final_save_path = save_path.replace(
253
- ".jsonl", f"_{dataset_start_index}_{dataset_end_index}.jsonl"
254
- )
255
-
256
- # Ensure save directory exists
257
- save_dir = os.path.dirname(final_save_path)
258
- if save_dir and not os.path.exists(save_dir):
259
- os.makedirs(save_dir, exist_ok=True)
260
- logger.info(f"Created save directory: {save_dir}")
261
-
262
- generated_data.to_json(final_save_path, orient="records", lines=True)
263
- logger.info(f"Data successfully saved to {final_save_path}")
264
-
265
- except Exception as e:
266
- raise DataSaveError(
267
- f"Failed to save generated data to '{final_save_path}'. "
268
- f"Please check write permissions and disk space.",
269
- details=str(e),
270
- ) from e
271
-
272
- except (
273
- DatasetLoadError,
274
- FlowConfigurationError,
275
- APIConnectionError,
276
- DataGenerationError,
277
- DataSaveError,
278
- ):
279
- # Re-raise our custom exceptions with their detailed messages
280
- raise
281
- except Exception as e:
282
- # Catch any unexpected errors
283
- logger.error(f"Unexpected error during flow execution: {e}")
284
- logger.error(f"Traceback: {traceback.format_exc()}")
285
- raise FlowRunnerError(
286
- "An unexpected error occurred during flow execution. "
287
- "Please check the logs for more details.",
288
- details=str(e),
289
- ) from e
290
-
291
-
292
- @click.command()
293
- @click.option(
294
- "--ds_path",
295
- type=click.Path(exists=True),
296
- required=True,
297
- help="Path to the dataset.",
298
- )
299
- @click.option(
300
- "--bs",
301
- type=int,
302
- default=8,
303
- show_default=True,
304
- help="Batch size for processing.",
305
- )
306
- @click.option(
307
- "--num_workers",
308
- type=int,
309
- default=32,
310
- show_default=True,
311
- help="Number of worker processes to use.",
312
- )
313
- @click.option(
314
- "--save_path",
315
- type=click.Path(),
316
- required=True,
317
- help="Path to save the output.",
318
- )
319
- @click.option(
320
- "--endpoint",
321
- type=str,
322
- required=True,
323
- help="API endpoint for data processing.",
324
- )
325
- @click.option(
326
- "--flow",
327
- type=click.Path(exists=True),
328
- required=True,
329
- help="Flow configuration for the process.",
330
- )
331
- @click.option(
332
- "--checkpoint_dir",
333
- type=click.Path(),
334
- required=True,
335
- help="Path to save checkpoints.",
336
- )
337
- @click.option(
338
- "--save_freq",
339
- type=int,
340
- default=2,
341
- show_default=True,
342
- help="Frequency to save checkpoints.",
343
- )
344
- @click.option(
345
- "--debug",
346
- is_flag=True,
347
- help="Enable debug mode with a smaller dataset subset.",
348
- )
349
- @click.option(
350
- "--dataset_start_index", type=int, default=0, help="Start index of the dataset."
351
- )
352
- @click.option(
353
- "--dataset_end_index", type=int, default=None, help="End index of the dataset."
354
- )
355
- @click.option(
356
- "--api_key",
357
- type=str,
358
- default=None,
359
- help="API key for the remote endpoint. If not provided, will use OPENAI_API_KEY environment variable.",
360
- )
361
- def main(
362
- ds_path: str,
363
- bs: int,
364
- num_workers: int,
365
- save_path: str,
366
- endpoint: str,
367
- flow: str,
368
- checkpoint_dir: str,
369
- save_freq: int,
370
- debug: bool,
371
- dataset_start_index: int,
372
- dataset_end_index: Optional[int],
373
- api_key: Optional[str],
374
- ) -> None:
375
- """CLI entry point for running data generation flows.
376
-
377
- Parameters
378
- ----------
379
- ds_path : str
380
- Path to the dataset file.
381
- bs : int
382
- Batch size for processing.
383
- num_workers : int
384
- Number of worker processes to use.
385
- save_path : str
386
- Path where the output will be saved.
387
- endpoint : str
388
- API endpoint for data processing.
389
- flow : str
390
- Path to the flow configuration file.
391
- checkpoint_dir : str
392
- Directory path for saving checkpoints.
393
- save_freq : int
394
- Frequency (in batches) at which to save checkpoints.
395
- debug : bool
396
- If True, enables debug mode with a smaller dataset subset.
397
- dataset_start_index : int
398
- Start index for dataset slicing.
399
- dataset_end_index : Optional[int]
400
- End index for dataset slicing.
401
- api_key : Optional[str]
402
- API key for the remote endpoint. If not provided, will use OPENAI_API_KEY environment variable.
403
-
404
- Returns
405
- -------
406
- None
407
- """
408
- try:
409
- run_flow(
410
- ds_path=ds_path,
411
- batch_size=bs,
412
- num_workers=num_workers,
413
- save_path=save_path,
414
- endpoint=endpoint,
415
- flow_path=flow,
416
- checkpoint_dir=checkpoint_dir,
417
- save_freq=save_freq,
418
- debug=debug,
419
- dataset_start_index=dataset_start_index,
420
- dataset_end_index=dataset_end_index,
421
- api_key=api_key,
422
- )
423
- except (
424
- DatasetLoadError,
425
- FlowConfigurationError,
426
- APIConnectionError,
427
- DataGenerationError,
428
- DataSaveError,
429
- FlowRunnerError,
430
- ) as e:
431
- logger.error(f"Flow execution failed: {e}")
432
- click.echo(f"Error: {e}", err=True)
433
- sys.exit(1)
434
- except KeyboardInterrupt:
435
- logger.info("Flow execution interrupted by user")
436
- click.echo("Flow execution interrupted by user", err=True)
437
- sys.exit(130) # Standard exit code for SIGINT
438
- except Exception as e:
439
- logger.error(f"Unexpected error: {e}")
440
- logger.error(f"Traceback: {traceback.format_exc()}")
441
- click.echo(
442
- f"Unexpected error occurred. Please check the logs for details. Error: {e}",
443
- err=True,
444
- )
445
- sys.exit(1)
446
-
447
-
448
- if __name__ == "__main__":
449
- # pylint: disable=no-value-for-parameter
450
- main()
sdg_hub/flows/generation/knowledge/mmlu_bench.yaml DELETED
@@ -1,13 +0,0 @@
1
- - block_type: LLMBlock
2
- block_config:
3
- block_name: gen_mmlu_knowledge
4
- config_path: configs/knowledge/mcq_generation.yaml
5
- model_id: meta-llama/Llama-3.3-70B-Instruct
6
- output_cols:
7
- - mmlubench_question
8
- - mmlubench_answer
9
- gen_kwargs:
10
- temperature: 0
11
- max_tokens: 2048
12
- drop_duplicates:
13
- - mmlubench_question
sdg_hub/flows/generation/knowledge/simple_knowledge.yaml DELETED
@@ -1,12 +0,0 @@
1
- - block_type: LLMBlock
2
- block_config:
3
- block_name: gen_knowledge
4
- config_path: configs/knowledge/simple_generate_qa.yaml
5
- model_id: meta-llama/Llama-3.3-70B-Instruct
6
- output_cols:
7
- - output
8
- gen_kwargs:
9
- temperature: 0.7
10
- max_tokens: 2048
11
- drop_duplicates:
12
- - output
sdg_hub/flows/generation/knowledge/synth_knowledge.yaml DELETED
@@ -1,89 +0,0 @@
1
- - block_type: LLMBlock
2
- block_config:
3
- block_name: gen_knowledge
4
- config_path: configs/knowledge/generate_questions_responses.yaml
5
- model_id: meta-llama/Llama-3.3-70B-Instruct
6
- output_cols:
7
- - question
8
- - response
9
- parser_kwargs:
10
- parser_name: custom
11
- parsing_pattern: "\\[(?:Question|QUESTION)\\]\\s*(.*?)\\s*\\[(?:Answer|ANSWER)\\]\\s*(.*?)\\s*(?=\\[(?:Question|QUESTION)\\]|$)"
12
- parser_cleanup_tags:
13
- - "[END]"
14
- gen_kwargs:
15
- max_tokens: 2048
16
- drop_duplicates:
17
- - question
18
-
19
- - block_type: LLMBlock
20
- block_config:
21
- block_name: eval_faithfulness_qa_pair
22
- config_path: configs/knowledge/evaluate_faithfulness.yaml
23
- model_id: meta-llama/Llama-3.3-70B-Instruct
24
- output_cols:
25
- - explanation
26
- - judgment
27
- gen_kwargs:
28
- max_tokens: 2048
29
-
30
- - block_type: FilterByValueBlock
31
- block_config:
32
- block_name: filter_faithfulness
33
- filter_column: judgment
34
- filter_value: "YES"
35
- operation: operator.eq
36
- batch_kwargs:
37
- num_procs: 8
38
- drop_columns:
39
- - judgment
40
- - explanation
41
-
42
- - block_type: LLMBlock
43
- block_config:
44
- block_name: eval_relevancy_qa_pair
45
- config_path: configs/knowledge/evaluate_relevancy.yaml
46
- model_id: meta-llama/Llama-3.3-70B-Instruct
47
- output_cols:
48
- - feedback
49
- - score
50
- gen_kwargs:
51
- max_tokens: 2048
52
-
53
- - block_type: FilterByValueBlock
54
- block_config:
55
- block_name: filter_relevancy
56
- filter_column: score
57
- filter_value: 2.0
58
- operation: operator.eq
59
- convert_dtype: float
60
- batch_kwargs:
61
- num_procs: 8
62
- drop_columns:
63
- - feedback
64
- - score
65
-
66
- - block_type: LLMBlock
67
- block_config:
68
- block_name: eval_verify_question
69
- config_path: configs/knowledge/evaluate_question.yaml
70
- model_id: meta-llama/Llama-3.3-70B-Instruct
71
- output_cols:
72
- - explanation
73
- - rating
74
- gen_kwargs:
75
- max_tokens: 2048
76
-
77
- - block_type: FilterByValueBlock
78
- block_config:
79
- block_name: filter_verify_question
80
- filter_column: rating
81
- filter_value: 1.0
82
- operation: operator.eq
83
- convert_dtype: float
84
- batch_kwargs:
85
- num_procs: 8
86
- drop_columns:
87
- - explanation
88
- - rating
89
- - __index_level_0__
sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml DELETED
@@ -1,148 +0,0 @@
1
- - block_type: DuplicateColumns
2
- block_config:
3
- block_name: duplicate_document_col
4
- columns_map:
5
- document: base_document
6
-
7
- - block_type: LLMBlock
8
- block_config:
9
- block_name: gen_detailed_summary
10
- config_path: configs/knowledge/detailed_summary.yaml
11
- model_id: meta-llama/Llama-3.3-70B-Instruct
12
- output_cols:
13
- - summary_detailed
14
- gen_kwargs:
15
- max_tokens: 4096
16
- temperature: 0.7
17
- n: 50
18
-
19
- - block_type: LLMBlock
20
- block_config:
21
- block_name: gen_atomic_facts
22
- config_path: configs/knowledge/atomic_facts.yaml
23
- model_id: meta-llama/Llama-3.3-70B-Instruct
24
- output_cols:
25
- - summary_atomic_facts
26
- gen_kwargs:
27
- max_tokens: 4096
28
- temperature: 0.7
29
-
30
- - block_type: LLMBlock
31
- block_config:
32
- block_name: gen_extractive_summary
33
- config_path: configs/knowledge/extractive_summary.yaml
34
- model_id: meta-llama/Llama-3.3-70B-Instruct
35
- output_cols:
36
- - summary_extractive
37
- gen_kwargs:
38
- max_tokens: 4096
39
- temperature: 0.7
40
-
41
- - block_type: FlattenColumnsBlock
42
- block_config:
43
- block_name: flatten_summary_columns
44
- var_cols:
45
- - summary_detailed
46
- - summary_extractive
47
- - summary_atomic_facts
48
- - base_document
49
- value_name: summary
50
- var_name: dataset_type
51
-
52
- - block_type: RenameColumns
53
- block_config:
54
- block_name: rename_to_document_column
55
- columns_map:
56
- document: raw_document
57
- summary: document
58
-
59
- - block_type: LLMBlock
60
- block_config:
61
- block_name: knowledge generation
62
- config_path: configs/knowledge/generate_questions.yaml
63
- model_id: meta-llama/Llama-3.3-70B-Instruct
64
- output_cols:
65
- - question
66
- parser_kwargs:
67
- parser_name: custom
68
- parsing_pattern: "\\[(?:Question|QUESTION)\\]\\s*(.*?)\\s*(?=\\[(?:Question|QUESTION)\\]|$)"
69
- gen_kwargs:
70
- temperature: 0.7
71
- max_tokens: 100
72
-
73
- - block_type: LLMBlock
74
- block_config:
75
- block_name: knowledge generation
76
- config_path: configs/knowledge/generate_responses.yaml
77
- model_id: meta-llama/Llama-3.3-70B-Instruct
78
- output_cols:
79
- - response
80
- gen_kwargs:
81
- temperature: 0.7
82
- max_tokens: 2048
83
-
84
- - block_type: LLMBlock
85
- block_config:
86
- block_name: eval_faithfulness_qa_pair
87
- config_path: configs/knowledge/evaluate_faithfulness.yaml
88
- model_id: meta-llama/Llama-3.3-70B-Instruct
89
- output_cols:
90
- - explanation
91
- - judgment
92
- gen_kwargs:
93
- max_tokens: 2048
94
-
95
- - block_type: FilterByValueBlock
96
- block_config:
97
- block_name: filter_faithfulness
98
- filter_column: judgment
99
- filter_value: "YES"
100
- operation: operator.eq
101
- drop_columns:
102
- - judgment
103
- - explanation
104
-
105
- - block_type: LLMBlock
106
- block_config:
107
- block_name: eval_relevancy_qa_pair
108
- config_path: configs/knowledge/evaluate_relevancy.yaml
109
- model_id: meta-llama/Llama-3.3-70B-Instruct
110
- output_cols:
111
- - feedback
112
- - score
113
- gen_kwargs:
114
- max_tokens: 2048
115
-
116
- - block_type: FilterByValueBlock
117
- block_config:
118
- block_name: filter_relevancy
119
- filter_column: score
120
- filter_value: 2.0
121
- operation: operator.eq
122
- convert_dtype: float
123
- drop_columns:
124
- - feedback
125
- - score
126
-
127
- - block_type: LLMBlock
128
- block_config:
129
- block_name: eval_verify_question
130
- config_path: configs/knowledge/evaluate_question.yaml
131
- model_id: meta-llama/Llama-3.3-70B-Instruct
132
- output_cols:
133
- - explanation
134
- - rating
135
- gen_kwargs:
136
- max_tokens: 2048
137
-
138
- - block_type: FilterByValueBlock
139
- block_config:
140
- block_name: filter_verify_question
141
- filter_column: rating
142
- filter_value: 1.0
143
- operation: operator.eq
144
- convert_dtype: float
145
- drop_columns:
146
- - explanation
147
- - rating
148
- - __index_level_0__