sdg-hub 0.1.0a4__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. sdg_hub/_version.py +2 -2
  2. sdg_hub/blocks/__init__.py +41 -5
  3. sdg_hub/blocks/block.py +58 -16
  4. sdg_hub/blocks/llmblock.py +121 -193
  5. sdg_hub/blocks/openaichatblock.py +556 -0
  6. sdg_hub/blocks/utilblocks.py +500 -43
  7. sdg_hub/checkpointer.py +139 -0
  8. sdg_hub/configs/annotations/detailed_annotations.yaml +28 -0
  9. sdg_hub/configs/annotations/simple_annotations.yaml +9 -0
  10. sdg_hub/configs/knowledge/atomic_facts.yaml +1 -0
  11. sdg_hub/configs/knowledge/detailed_summary.yaml +1 -0
  12. sdg_hub/configs/knowledge/extractive_summary.yaml +1 -0
  13. sdg_hub/configs/knowledge/generate_questions.yaml +82 -0
  14. sdg_hub/configs/knowledge/generate_responses.yaml +86 -0
  15. sdg_hub/configs/skills/contexts.yaml +18 -11
  16. sdg_hub/configs/skills/evaluate_freeform_pair.yaml +79 -12
  17. sdg_hub/configs/skills/evaluate_freeform_questions.yaml +60 -28
  18. sdg_hub/configs/skills/evaluate_grounded_pair.yaml +95 -30
  19. sdg_hub/configs/skills/freeform_questions.yaml +21 -16
  20. sdg_hub/configs/skills/freeform_responses.yaml +19 -25
  21. sdg_hub/configs/skills/router.yaml +53 -6
  22. sdg_hub/flow.py +366 -33
  23. sdg_hub/flow_runner.py +437 -0
  24. sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +21 -9
  25. sdg_hub/flows/generation/skills/{agentic_improve_skill.yaml → improve_responses.yaml} +26 -31
  26. sdg_hub/flows/generation/skills/synth_skills.yaml +4 -4
  27. sdg_hub/pipeline.py +67 -12
  28. sdg_hub/prompts.py +52 -0
  29. sdg_hub/sdg.py +128 -86
  30. sdg_hub/utils/__init__.py +5 -0
  31. sdg_hub/utils/config_validation.py +91 -0
  32. sdg_hub/utils/error_handling.py +94 -0
  33. sdg_hub/utils/path_resolution.py +62 -0
  34. sdg_hub/utils/validation_result.py +10 -0
  35. sdg_hub-0.1.2.dist-info/METADATA +190 -0
  36. sdg_hub-0.1.2.dist-info/RECORD +89 -0
  37. {sdg_hub-0.1.0a4.dist-info → sdg_hub-0.1.2.dist-info}/WHEEL +1 -1
  38. sdg_hub/blocks/filterblock.py +0 -76
  39. sdg_hub/blocks/iterblock.py +0 -31
  40. sdg_hub/blocks/rmblocks.py +0 -194
  41. sdg_hub/configs/annotations/simple.yaml +0 -10
  42. sdg_hub/configs/knowledge/data_recipe/default_recipe.yaml +0 -3
  43. sdg_hub/configs/skills/data_recipe/default_recipe.yaml +0 -6
  44. sdg_hub/flows/annotation/emotion/detailed_description.yaml +0 -19
  45. sdg_hub/flows/annotation/emotion/detailed_description_icl.yaml +0 -19
  46. sdg_hub/flows/annotation/emotion/simple.yaml +0 -19
  47. sdg_hub/utils/chunking.py +0 -73
  48. sdg_hub/utils/docprocessor.py +0 -357
  49. sdg_hub/utils/parse_and_convert.py +0 -392
  50. sdg_hub-0.1.0a4.dist-info/METADATA +0 -309
  51. sdg_hub-0.1.0a4.dist-info/RECORD +0 -90
  52. /sdg_hub/configs/{knowledge/data_recipe → reasoning}/__init__.py +0 -0
  53. /sdg_hub/configs/skills/{_G_.yaml → icl_examples/STEM.yaml} +0 -0
  54. /sdg_hub/configs/skills/{data_recipe → icl_examples}/__init__.py +0 -0
  55. /sdg_hub/configs/skills/{_A_.yaml → icl_examples/coding.yaml} +0 -0
  56. /sdg_hub/configs/skills/{_B_.yaml → icl_examples/extraction.yaml} +0 -0
  57. /sdg_hub/configs/skills/{_C_.yaml → icl_examples/humanities.yaml} +0 -0
  58. /sdg_hub/configs/skills/{_D_.yaml → icl_examples/math.yaml} +0 -0
  59. /sdg_hub/configs/skills/{_E_.yaml → icl_examples/reasoning.yaml} +0 -0
  60. /sdg_hub/configs/skills/{_F_.yaml → icl_examples/roleplay.yaml} +0 -0
  61. /sdg_hub/configs/skills/{_H_.yaml → icl_examples/writing.yaml} +0 -0
  62. {sdg_hub-0.1.0a4.dist-info → sdg_hub-0.1.2.dist-info}/licenses/LICENSE +0 -0
  63. {sdg_hub-0.1.0a4.dist-info → sdg_hub-0.1.2.dist-info}/top_level.txt +0 -0
sdg_hub/flow_runner.py ADDED
@@ -0,0 +1,437 @@
+"""Script for running data generation flows with configurable parameters."""
+
+# Standard
+from importlib import resources
+from typing import Optional
+import os
+import sys
+import traceback
+
+# Third Party
+from datasets import load_dataset
+from openai import OpenAI
+import click
+import yaml
+
+# First Party
+from sdg_hub.flow import Flow
+from sdg_hub.logger_config import setup_logger
+from sdg_hub.sdg import SDG
+from sdg_hub.utils.error_handling import (
+    APIConnectionError,
+    DataGenerationError,
+    DataSaveError,
+    DatasetLoadError,
+    FlowConfigurationError,
+    FlowRunnerError,
+)
+from sdg_hub.utils.path_resolution import resolve_path
+
+logger = setup_logger(__name__)
+
+
+def run_flow(
+    ds_path: str,
+    save_path: str,
+    endpoint: str,
+    flow_path: str,
+    checkpoint_dir: str,
+    batch_size: int = 8,
+    num_workers: int = 32,
+    save_freq: int = 2,
+    debug: bool = False,
+    dataset_start_index: int = 0,
+    dataset_end_index: Optional[int] = None,
+) -> None:
+    """Process the dataset using the specified configuration.
+
+    Parameters
+    ----------
+    ds_path : str
+        Path to the dataset file.
+    save_path : str
+        Path where the output will be saved.
+    endpoint : str
+        API endpoint for data processing.
+    flow_path : str
+        Path to the flow configuration file.
+    checkpoint_dir : str
+        Directory path for saving checkpoints.
+    batch_size : int, optional
+        Batch size for processing, by default 8.
+    num_workers : int, optional
+        Number of worker processes to use, by default 32.
+    save_freq : int, optional
+        Frequency (in batches) at which to save checkpoints, by default 2.
+    debug : bool, optional
+        If True, enables debug mode with a smaller dataset subset, by default False.
+    dataset_start_index : int, optional
+        Start index for dataset slicing, by default 0.
+    dataset_end_index : Optional[int], optional
+        End index for dataset slicing, by default None.
+
+    Returns
+    -------
+    None
+
+    Raises
+    ------
+    DatasetLoadError
+        If the dataset cannot be loaded or processed.
+    FlowConfigurationError
+        If the flow configuration is invalid or cannot be loaded.
+    APIConnectionError
+        If connection to the API endpoint fails.
+    DataGenerationError
+        If data generation fails during processing.
+    DataSaveError
+        If saving the generated data fails.
+    """
+    logger.info(f"Generation configuration: {locals()}\n\n")
+
+    try:
+        # Load and validate dataset
+        try:
+            ds = load_dataset("json", data_files=ds_path, split="train")
+            logger.info(
+                f"Successfully loaded dataset from {ds_path} with {len(ds)} rows"
+            )
+        except Exception as e:
+            raise DatasetLoadError(
+                f"Failed to load dataset from '{ds_path}'. "
+                f"Please check if the file exists and is a valid JSON file.",
+                details=str(e),
+            ) from e
+
+        # Apply dataset slicing if specified
+        try:
+            if dataset_start_index is not None and dataset_end_index is not None:
+                if dataset_start_index >= len(ds) or dataset_end_index > len(ds):
+                    raise DatasetLoadError(
+                        f"Dataset slice indices ({dataset_start_index}, {dataset_end_index}) "
+                        f"are out of bounds for dataset with {len(ds)} rows"
+                    )
+                if dataset_start_index >= dataset_end_index:
+                    raise DatasetLoadError(
+                        f"Start index ({dataset_start_index}) must be less than end index ({dataset_end_index})"
+                    )
+                ds = ds.select(range(dataset_start_index, dataset_end_index))
+                logger.info(
+                    f"Dataset sliced from {dataset_start_index} to {dataset_end_index}"
+                )
+
+            if debug:
+                if len(ds) < 30:
+                    logger.warning(
+                        f"Debug mode requested 30 samples but dataset only has {len(ds)} rows"
+                    )
+                ds = ds.shuffle(seed=42).select(range(min(30, len(ds))))
+                logger.info(
+                    f"Debug mode enabled. Using {len(ds)} samples from the dataset."
+                )
+        except DatasetLoadError:
+            raise
+        except Exception as e:
+            raise DatasetLoadError(
+                "Failed to process dataset slicing or debug mode.", details=str(e)
+            ) from e
+
+        # Validate API configuration
+        openai_api_key = os.environ.get("OPENAI_API_KEY")
+        if not openai_api_key or openai_api_key == "EMPTY":
+            logger.warning("OPENAI_API_KEY not set or is 'EMPTY'. API calls may fail.")
+
+        openai_api_base = endpoint
+        if not openai_api_base:
+            raise APIConnectionError("API endpoint cannot be empty")
+
+        # Initialize OpenAI client
+        try:
+            client = OpenAI(
+                api_key=openai_api_key or "EMPTY",
+                base_url=openai_api_base,
+            )
+            # test connection with a model list
+            models = client.models.list()
+            logger.info(f"Initialized OpenAI client with endpoint: {openai_api_base}")
+            logger.info(f"Available models: {[model.id for model in models.data]}")
+        except Exception as e:
+            raise APIConnectionError(
+                f"Failed to initialize OpenAI client with endpoint '{openai_api_base}'. "
+                f"Please check if the endpoint is valid and accessible.",
+                details=str(e),
+            ) from e
+
+        # Load and validate flow configuration
+        try:
+            base_path = str(resources.files(__package__))
+            flow_path = resolve_path(flow_path, [".", base_path])
+            if not os.path.exists(flow_path):
+                raise FlowConfigurationError(
+                    f"Flow configuration file not found: {flow_path}"
+                )
+
+            # Validate flow file is readable YAML
+            try:
+                with open(flow_path, "r", encoding="utf-8") as f:
+                    flow_config = yaml.safe_load(f)
+                if not flow_config:
+                    raise FlowConfigurationError(
+                        f"Flow configuration file is empty: {flow_path}"
+                    )
+                logger.info(f"Successfully loaded flow configuration from {flow_path}")
+            except yaml.YAMLError as e:
+                raise FlowConfigurationError(
+                    f"Flow configuration file '{flow_path}' contains invalid YAML.",
+                    details=str(e),
+                ) from e
+            except Exception as e:
+                raise FlowConfigurationError(
+                    f"Failed to read flow configuration file '{flow_path}'.",
+                    details=str(e),
+                ) from e
+
+            flow = Flow(client).get_flow_from_file(flow_path)
+            logger.info("Successfully initialized flow from configuration")
+        except FlowConfigurationError:
+            raise
+        except Exception as e:
+            raise FlowConfigurationError(
+                f"Failed to create flow from configuration file '{flow_path}'. "
+                f"Please check the flow configuration format and block definitions.",
+                details=str(e),
+            ) from e
+
+        # Initialize SDG and generate data
+        try:
+            sdg = SDG(
+                flows=[flow],
+                num_workers=num_workers,
+                batch_size=batch_size,
+                save_freq=save_freq,
+            )
+            logger.info(
+                f"Initialized SDG with {num_workers} workers, batch size {batch_size}"
+            )
+
+            # Ensure checkpoint directory exists if specified
+            if checkpoint_dir and not os.path.exists(checkpoint_dir):
+                os.makedirs(checkpoint_dir, exist_ok=True)
+                logger.info(f"Created checkpoint directory: {checkpoint_dir}")
+
+            generated_data = sdg.generate(ds, checkpoint_dir=checkpoint_dir)
+
+            if generated_data is None or len(generated_data) == 0:
+                raise DataGenerationError(
+                    "Data generation completed but no data was generated. "
+                    "This may indicate issues with the flow configuration or input data."
+                )
+
+            logger.info(f"Successfully generated {len(generated_data)} rows of data")
+
+        except Exception as e:
+            if isinstance(e, DataGenerationError):
+                raise
+            raise DataGenerationError(
+                "Data generation failed during processing. This could be due to:"
+                "\n- API connection issues with the endpoint"
+                "\n- Invalid flow configuration or block parameters"
+                "\n- Insufficient system resources (try reducing batch_size or num_workers)"
+                "\n- Input data format incompatibility",
+                details=f"Endpoint: {openai_api_base}, Error: {e}",
+            ) from e
+
+        # Save generated data
+        try:
+            # Adjust save path for dataset slicing
+            final_save_path = save_path
+            if dataset_end_index is not None and dataset_start_index is not None:
+                final_save_path = save_path.replace(
+                    ".jsonl", f"_{dataset_start_index}_{dataset_end_index}.jsonl"
+                )
+
+            # Ensure save directory exists
+            save_dir = os.path.dirname(final_save_path)
+            if save_dir and not os.path.exists(save_dir):
+                os.makedirs(save_dir, exist_ok=True)
+                logger.info(f"Created save directory: {save_dir}")
+
+            generated_data.to_json(final_save_path, orient="records", lines=True)
+            logger.info(f"Data successfully saved to {final_save_path}")
+
+        except Exception as e:
+            raise DataSaveError(
+                f"Failed to save generated data to '{final_save_path}'. "
+                f"Please check write permissions and disk space.",
+                details=str(e),
+            ) from e
+
+    except (
+        DatasetLoadError,
+        FlowConfigurationError,
+        APIConnectionError,
+        DataGenerationError,
+        DataSaveError,
+    ):
+        # Re-raise our custom exceptions with their detailed messages
+        raise
+    except Exception as e:
+        # Catch any unexpected errors
+        logger.error(f"Unexpected error during flow execution: {e}")
+        logger.error(f"Traceback: {traceback.format_exc()}")
+        raise FlowRunnerError(
+            "An unexpected error occurred during flow execution. "
+            "Please check the logs for more details.",
+            details=str(e),
+        ) from e
+
+
+@click.command()
+@click.option(
+    "--ds_path",
+    type=click.Path(exists=True),
+    required=True,
+    help="Path to the dataset.",
+)
+@click.option(
+    "--bs",
+    type=int,
+    default=8,
+    show_default=True,
+    help="Batch size for processing.",
+)
+@click.option(
+    "--num_workers",
+    type=int,
+    default=32,
+    show_default=True,
+    help="Number of worker processes to use.",
+)
+@click.option(
+    "--save_path",
+    type=click.Path(),
+    required=True,
+    help="Path to save the output.",
+)
+@click.option(
+    "--endpoint",
+    type=str,
+    required=True,
+    help="API endpoint for data processing.",
+)
+@click.option(
+    "--flow",
+    type=click.Path(exists=True),
+    required=True,
+    help="Flow configuration for the process.",
+)
+@click.option(
+    "--checkpoint_dir",
+    type=click.Path(),
+    required=True,
+    help="Path to save checkpoints.",
+)
+@click.option(
+    "--save_freq",
+    type=int,
+    default=2,
+    show_default=True,
+    help="Frequency to save checkpoints.",
+)
+@click.option(
+    "--debug",
+    is_flag=True,
+    help="Enable debug mode with a smaller dataset subset.",
+)
+@click.option(
+    "--dataset_start_index", type=int, default=0, help="Start index of the dataset."
+)
+@click.option(
+    "--dataset_end_index", type=int, default=None, help="End index of the dataset."
+)
+def main(
+    ds_path: str,
+    bs: int,
+    num_workers: int,
+    save_path: str,
+    endpoint: str,
+    flow: str,
+    checkpoint_dir: str,
+    save_freq: int,
+    debug: bool,
+    dataset_start_index: int,
+    dataset_end_index: Optional[int],
+) -> None:
+    """CLI entry point for running data generation flows.
+
+    Parameters
+    ----------
+    ds_path : str
+        Path to the dataset file.
+    bs : int
+        Batch size for processing.
+    num_workers : int
+        Number of worker processes to use.
+    save_path : str
+        Path where the output will be saved.
+    endpoint : str
+        API endpoint for data processing.
+    flow : str
+        Path to the flow configuration file.
+    checkpoint_dir : str
+        Directory path for saving checkpoints.
+    save_freq : int
+        Frequency (in batches) at which to save checkpoints.
+    debug : bool
+        If True, enables debug mode with a smaller dataset subset.
+    dataset_start_index : int
+        Start index for dataset slicing.
+    dataset_end_index : Optional[int]
+        End index for dataset slicing.
+
+    Returns
+    -------
+    None
+    """
+    try:
+        run_flow(
+            ds_path=ds_path,
+            batch_size=bs,
+            num_workers=num_workers,
+            save_path=save_path,
+            endpoint=endpoint,
+            flow_path=flow,
+            checkpoint_dir=checkpoint_dir,
+            save_freq=save_freq,
+            debug=debug,
+            dataset_start_index=dataset_start_index,
+            dataset_end_index=dataset_end_index,
+        )
+    except (
+        DatasetLoadError,
+        FlowConfigurationError,
+        APIConnectionError,
+        DataGenerationError,
+        DataSaveError,
+        FlowRunnerError,
+    ) as e:
+        logger.error(f"Flow execution failed: {e}")
+        click.echo(f"Error: {e}", err=True)
+        sys.exit(1)
+    except KeyboardInterrupt:
+        logger.info("Flow execution interrupted by user")
+        click.echo("Flow execution interrupted by user", err=True)
+        sys.exit(130)  # Standard exit code for SIGINT
+    except Exception as e:
+        logger.error(f"Unexpected error: {e}")
+        logger.error(f"Traceback: {traceback.format_exc()}")
+        click.echo(
+            f"Unexpected error occurred. Please check the logs for details. Error: {e}",
+            err=True,
+        )
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    # pylint: disable=no-value-for-parameter
+    main()
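A minimal usage sketch for the new runner, calling run_flow directly with the signature shown above. The dataset path, flow path, endpoint, and slice bounds are illustrative placeholders rather than values shipped with the package, and the dummy OPENAI_API_KEY assumes an OpenAI-compatible server (such as vLLM) that does not enforce authentication:

import os

from sdg_hub.flow_runner import run_flow

# Placeholder key for local servers that do not check credentials (assumption).
os.environ["OPENAI_API_KEY"] = "EMPTY"

run_flow(
    ds_path="seed_data.jsonl",            # hypothetical JSON-lines input dataset
    save_path="generated.jsonl",          # slice suffix is appended automatically
    endpoint="http://localhost:8000/v1",  # any OpenAI-compatible endpoint
    flow_path="flows/generation/skills/synth_skills.yaml",
    checkpoint_dir="checkpoints",
    batch_size=8,
    num_workers=32,
    dataset_start_index=0,
    dataset_end_index=100,  # rows [0, 100) are saved to generated_0_100.jsonl
)

The same run is available from the shell through the click entry point (python -m sdg_hub.flow_runner), where --bs feeds batch_size and --flow feeds flow_path.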
sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml CHANGED
@@ -12,7 +12,9 @@
     output_cols:
       - summary_detailed
   gen_kwargs:
-    max_tokens: 2048
+    max_tokens: 4096
+    temperature: 0.7
+    n: 50
 
 - block_type: LLMBlock
   block_config:
@@ -22,7 +24,8 @@
     output_cols:
       - summary_atomic_facts
   gen_kwargs:
-    max_tokens: 2048
+    max_tokens: 4096
+    temperature: 0.7
 
 - block_type: LLMBlock
   block_config:
@@ -32,7 +35,8 @@
     output_cols:
       - summary_extractive
   gen_kwargs:
-    max_tokens: 2048
+    max_tokens: 4096
+    temperature: 0.7
 
 - block_type: FlattenColumnsBlock
   block_config:
@@ -55,18 +59,26 @@
 - block_type: LLMBlock
   block_config:
     block_name: knowledge generation
-    config_path: configs/knowledge/generate_questions_responses.yaml
+    config_path: configs/knowledge/generate_questions.yaml
     model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
     output_cols:
       - question
-      - response
     parser_kwargs:
       parser_name: custom
-      parsing_pattern: "\\[(?:Question|QUESTION)\\]\\s*(.*?)\\s*\\[(?:Answer|ANSWER)\\]\\s*(.*?)\\s*(?=\\[(?:Question|QUESTION)\\]|$)"
-      parser_cleanup_tags:
-        - "[END]"
+      parsing_pattern: "\\[(?:Question|QUESTION)\\]\\s*(.*?)\\s*(?=\\[(?:Question|QUESTION)\\]|$)"
+  gen_kwargs:
+    temperature: 0.7
+    max_tokens: 100
+
+- block_type: LLMBlock
+  block_config:
+    block_name: knowledge generation
+    config_path: configs/knowledge/generate_responses.yaml
+    model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
+    output_cols:
+      - response
   gen_kwargs:
-    temperature: 0.0
+    temperature: 0.7
     max_tokens: 2048
 
 - block_type: LLMBlock
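Two changes stand out above: the detailed-summary block now samples many completions per document (n: 50 at temperature 0.7), and question generation is split from response generation, with the question block's custom parser capturing only [QUESTION] spans. A quick sketch of what the new question-only parsing_pattern extracts; the sample completion is invented, and applying re.DOTALL is an assumption about how sdg_hub's custom parser uses the regex:

import re

# Pattern copied from the flow above: capture the text after a [Question]/[QUESTION]
# tag, up to the next tag or the end of the model output.
PATTERN = r"\[(?:Question|QUESTION)\]\s*(.*?)\s*(?=\[(?:Question|QUESTION)\]|$)"

completion = (
    "[QUESTION] What does the detailed summary block produce?\n"
    "[QUESTION] Why raise max_tokens from 2048 to 4096?"
)

print(re.findall(PATTERN, completion, flags=re.DOTALL))
# ['What does the detailed summary block produce?',
#  'Why raise max_tokens from 2048 to 4096?']

With the [Answer] half of the old pattern gone, the parser_cleanup_tags entry for "[END]" goes away along with the combined question-answer prompt it cleaned up.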
sdg_hub/flows/generation/skills/{agentic_improve_skill.yaml → improve_responses.yaml} RENAMED
@@ -2,34 +2,34 @@
   block_config:
     block_name: router
     config_path: configs/skills/router.yaml
-    model_id: skill-classifier-v3-clm
+    model_id: meta-llama/Llama-3.3-70B-Instruct
    output_cols:
      - route
   gen_kwargs:
     temperature: 0
-    max_tokens: 1
+    max_tokens: 5
     extra_body:
-      allowed_token_ids:
-        - 32001
-        - 32002
-        - 32003
-        - 32004
-        - 32005
-        - 32006
-        - 32007
-        - 32008
+      guided_choice:
+        - "coding"
+        - "extraction"
+        - "humanities"
+        - "math"
+        - "reasoning"
+        - "roleplay"
+        - "STEM"
+        - "writing"
 - block_type: SamplePopulatorBlock
   block_config:
     block_name: icl_populator
     config_paths:
-      - configs/skills/_A_.yaml
-      - configs/skills/_B_.yaml
-      - configs/skills/_C_.yaml
-      - configs/skills/_D_.yaml
-      - configs/skills/_E_.yaml
-      - configs/skills/_F_.yaml
-      - configs/skills/_G_.yaml
-      - configs/skills/_H_.yaml
+      - configs/skills/icl_examples/coding.yaml
+      - configs/skills/icl_examples/extraction.yaml
+      - configs/skills/icl_examples/humanities.yaml
+      - configs/skills/icl_examples/math.yaml
+      - configs/skills/icl_examples/reasoning.yaml
+      - configs/skills/icl_examples/roleplay.yaml
+      - configs/skills/icl_examples/STEM.yaml
+      - configs/skills/icl_examples/writing.yaml
     column_name: route
   batch_kwargs:
     num_procs: 8
@@ -37,8 +37,7 @@
   block_config:
     block_name: analyzer
     config_path: configs/skills/analyzer.yaml
-    model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
-    model_prompt: <s> [INST] {prompt} [/INST]
+    model_id: meta-llama/Llama-3.3-70B-Instruct
     output_cols:
       - analysis
       - rubric
@@ -46,24 +45,21 @@
   block_config:
     block_name: critic
     config_path: configs/skills/critic.yaml
-    model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
-    model_prompt: <s> [INST] {prompt} [/INST]
+    model_id: meta-llama/Llama-3.3-70B-Instruct
     output_cols:
       - critique
 - block_type: LLMBlock
   block_config:
     block_name: planner
     config_path: configs/skills/planner.yaml
-    model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
-    model_prompt: <s> [INST] {prompt} [/INST]
+    model_id: meta-llama/Llama-3.3-70B-Instruct
     output_cols:
       - plan
 - block_type: LLMBlock
   block_config:
     block_name: revised_responder
     config_path: configs/skills/revised_responder.yaml
-    model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
-    model_prompt: <s> [INST] {prompt} [/INST]
+    model_id: meta-llama/Llama-3.3-70B-Instruct
     output_cols:
       - revised_response
   drop_columns:
@@ -78,8 +74,7 @@
   block_config:
     block_name: judge
     config_path: configs/skills/judge.yaml
-    model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
-    model_prompt: <s> [INST] {prompt} [/INST]
+    model_id: meta-llama/Llama-3.3-70B-Instruct
     output_cols:
       - judgement
       - verdict
@@ -100,9 +95,9 @@
       Assistant A: "response"
       Assistant B: "revised_response"
     choice_col: verdict
-    output_col: chosen_reponse
+    output_col: chosen_response
   batch_kwargs:
     num_procs: 8
   drop_columns:
     - judgemnent
-    - verdict
+    - verdict
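The router rework replaces a tokenizer-specific constraint (allowed_token_ids, which only made sense against the vocabulary of the retired skill-classifier-v3-clm checkpoint) with server-side guided decoding: extra_body.guided_choice asks the backend to emit exactly one of the eight route labels, so any model behind an OpenAI-compatible endpoint that supports guided decoding (vLLM, for example) can act as the router. A hedged sketch of the equivalent raw API call; the endpoint and prompt are illustrative:

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="meta-llama/Llama-3.3-70B-Instruct",
    messages=[
        {"role": "user", "content": "Route this task: 'Write a limerick about CI failures.'"}
    ],
    temperature=0,
    max_tokens=5,
    # extra_body is forwarded verbatim to the server; backends with guided
    # decoding constrain the completion to one of these labels.
    extra_body={
        "guided_choice": [
            "coding", "extraction", "humanities", "math",
            "reasoning", "roleplay", "STEM", "writing",
        ]
    },
)
print(response.choices[0].message.content)  # e.g. "writing"

Because the labels match the renamed icl_examples/*.yaml files, the icl_populator block can key its config_paths directly off the returned route column.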
sdg_hub/flows/generation/skills/synth_skills.yaml CHANGED
@@ -2,7 +2,7 @@
   block_config:
     block_name: gen_questions
     config_path: configs/skills/freeform_questions.yaml
-    model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
+    model_id: meta-llama/Llama-3.3-70B-Instruct
     output_cols:
       - question
   batch_kwargs:
@@ -13,7 +13,7 @@
   block_config:
     block_name: eval_questions
     config_path: configs/skills/evaluate_freeform_questions.yaml
-    model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
+    model_id: meta-llama/Llama-3.3-70B-Instruct
     output_cols:
       - evaluation
       - score
@@ -34,14 +34,14 @@
   block_config:
     block_name: gen_responses
     config_path: configs/skills/freeform_responses.yaml
-    model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
+    model_id: meta-llama/Llama-3.3-70B-Instruct
     output_cols:
       - response
 - block_type: LLMBlock
   block_config:
     block_name: evaluate_qa_pair
     config_path: configs/skills/evaluate_freeform_pair.yaml
-    model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
+    model_id: meta-llama/Llama-3.3-70B-Instruct
     output_cols:
       - evaluation
       - score