sdg-hub 0.1.0a3__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between these versions as they appear in their respective public registries.
Files changed (59)
  1. sdg_hub/_version.py +2 -2
  2. sdg_hub/blocks/__init__.py +35 -5
  3. sdg_hub/blocks/block.py +58 -16
  4. sdg_hub/blocks/llmblock.py +149 -204
  5. sdg_hub/blocks/utilblocks.py +500 -43
  6. sdg_hub/checkpointer.py +139 -0
  7. sdg_hub/configs/annotations/detailed_annotations.yaml +28 -0
  8. sdg_hub/configs/annotations/simple_annotations.yaml +9 -0
  9. sdg_hub/configs/knowledge/atomic_facts.yaml +1 -0
  10. sdg_hub/configs/knowledge/detailed_summary.yaml +1 -0
  11. sdg_hub/configs/knowledge/extractive_summary.yaml +1 -0
  12. sdg_hub/configs/knowledge/generate_questions.yaml +82 -0
  13. sdg_hub/configs/knowledge/generate_responses.yaml +86 -0
  14. sdg_hub/configs/skills/contexts.yaml +18 -11
  15. sdg_hub/configs/skills/evaluate_freeform_pair.yaml +79 -12
  16. sdg_hub/configs/skills/evaluate_freeform_questions.yaml +60 -28
  17. sdg_hub/configs/skills/evaluate_grounded_pair.yaml +95 -30
  18. sdg_hub/configs/skills/freeform_questions.yaml +21 -16
  19. sdg_hub/configs/skills/freeform_responses.yaml +19 -25
  20. sdg_hub/configs/skills/router.yaml +53 -6
  21. sdg_hub/flow.py +351 -21
  22. sdg_hub/flow_runner.py +216 -0
  23. sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +26 -9
  24. sdg_hub/flows/generation/skills/{agentic_improve_skill.yaml → improve_responses.yaml} +26 -31
  25. sdg_hub/flows/generation/skills/synth_skills.yaml +4 -4
  26. sdg_hub/pipeline.py +67 -12
  27. sdg_hub/prompts.py +26 -0
  28. sdg_hub/sdg.py +128 -86
  29. sdg_hub/utils/config_validation.py +91 -0
  30. sdg_hub/utils/validation_result.py +10 -0
  31. sdg_hub-0.1.1.dist-info/METADATA +190 -0
  32. sdg_hub-0.1.1.dist-info/RECORD +86 -0
  33. {sdg_hub-0.1.0a3.dist-info → sdg_hub-0.1.1.dist-info}/WHEEL +1 -1
  34. sdg_hub/blocks/filterblock.py +0 -76
  35. sdg_hub/blocks/iterblock.py +0 -31
  36. sdg_hub/blocks/rmblocks.py +0 -194
  37. sdg_hub/configs/annotations/simple.yaml +0 -10
  38. sdg_hub/configs/knowledge/data_recipe/default_recipe.yaml +0 -3
  39. sdg_hub/configs/skills/data_recipe/default_recipe.yaml +0 -6
  40. sdg_hub/flows/annotation/emotion/detailed_description.yaml +0 -19
  41. sdg_hub/flows/annotation/emotion/detailed_description_icl.yaml +0 -19
  42. sdg_hub/flows/annotation/emotion/simple.yaml +0 -19
  43. sdg_hub/utils/chunking.py +0 -73
  44. sdg_hub/utils/docprocessor.py +0 -357
  45. sdg_hub/utils/parse_and_convert.py +0 -392
  46. sdg_hub-0.1.0a3.dist-info/METADATA +0 -154
  47. sdg_hub-0.1.0a3.dist-info/RECORD +0 -90
  48. /sdg_hub/configs/{knowledge/data_recipe → reasoning}/__init__.py +0 -0
  49. /sdg_hub/configs/skills/{_G_.yaml → icl_examples/STEM.yaml} +0 -0
  50. /sdg_hub/configs/skills/{data_recipe → icl_examples}/__init__.py +0 -0
  51. /sdg_hub/configs/skills/{_A_.yaml → icl_examples/coding.yaml} +0 -0
  52. /sdg_hub/configs/skills/{_B_.yaml → icl_examples/extraction.yaml} +0 -0
  53. /sdg_hub/configs/skills/{_C_.yaml → icl_examples/humanities.yaml} +0 -0
  54. /sdg_hub/configs/skills/{_D_.yaml → icl_examples/math.yaml} +0 -0
  55. /sdg_hub/configs/skills/{_E_.yaml → icl_examples/reasoning.yaml} +0 -0
  56. /sdg_hub/configs/skills/{_F_.yaml → icl_examples/roleplay.yaml} +0 -0
  57. /sdg_hub/configs/skills/{_H_.yaml → icl_examples/writing.yaml} +0 -0
  58. {sdg_hub-0.1.0a3.dist-info → sdg_hub-0.1.1.dist-info}/licenses/LICENSE +0 -0
  59. {sdg_hub-0.1.0a3.dist-info → sdg_hub-0.1.1.dist-info}/top_level.txt +0 -0
@@ -1,392 +0,0 @@
- # SPDX-License-Identifier: Apache-2.0
-
- # Standard
- from enum import Enum
- from typing import Any
- import json
- import os
- import random
- import re
- import uuid
-
- # Third Party
- from datasets import Dataset
- import yaml
-
- # First Party
- # pylint: disable=ungrouped-imports
- from sdg_hub import utils
- from sdg_hub.logger_config import setup_logger
- from .datautils import safe_concatenate_datasets
-
- logger = setup_logger(__name__)
-
-
- class TaxonomyType(Enum):
-     KNOWLEDGE = "knowledge"
-     SKILL = "skill"
-
-
- def _unescape(s):
-     return bytes(s, "utf-8").decode("utf-8").strip()
-
-
- # This is a hack because the simple workflow returns a q/a pair as a single output.
- # We could possibly try to ask for them separately, but it would cost twice the inference
- # API calls. All of this is because the smallest models we use on small environments
- # for testing and demos weren't good enough to follow the strict formatting instructions used
- # in the full pipeline.
- def _get_question(synth_example: dict):
-     if "question" in synth_example:
-         return synth_example["question"]
-
-     if not synth_example.get("output"):
-         raise utils.GenerateException(
-             f"Error: output not found in synth_example: {synth_example}"
-         )
-
-     parts = synth_example["output"].split("?", 1)
-     if len(parts) != 2:
-         logger.warning(f"Failed to split generated q&a: {synth_example['output']}")
-     return parts[0].strip() + "?" if len(parts) == 2 else ""
-
-
- # This is also a hack. See the comment above _get_question.
- def _get_response(synth_example: dict):
-     if "response" in synth_example:
-         return synth_example["response"]
-
-     if "output" not in synth_example:
-         raise utils.GenerateException(
-             f"Error: output not found in synth_example: {synth_example}"
-         )
-
-     parts = synth_example["output"].split("?", 1)
-     if len(parts) != 2:
-         logger.warning(f"Failed to split generated q&a: {synth_example['output']}")
-     return parts[1].strip() if len(parts) == 2 else parts[0].strip()
-
-
- def _convert_to_hack_fmt(sample: dict, sys_prompt: str):
-     """
-     Convert a sample dictionary to contain 'system', 'user', and 'assistant' columns.
-
-     Note: We should remove this function in the future when we resolve this issue and
-     standardize the format to messages.
-     """
-     # Create user query message
-     user_query = _unescape(_get_question(sample))
-     response = _unescape(_get_response(sample))
-     if "context" in sample:
-         user_query = f"{sample['context']}\n\n{user_query}"
-
-     sample["id"] = str(uuid.uuid4())
-     sample["system"] = sys_prompt
-     sample["user"] = user_query
-     sample["assistant"] = response
-
-     return sample
-
-
- def _convert_to_messages(sample: dict, sys_prompt: str):
-     """
-     Convert a sample dictionary to contain 'messages'
-     and 'metadata' columns required for training.
-     """
-     # Create user query message
-     user_query = _unescape(_get_question(sample))
-     response = _unescape(_get_response(sample))
-
-     sample["id"] = str(uuid.uuid4())
-     sample["messages"] = [
-         {"content": sys_prompt, "role": "system"},
-         {"content": user_query, "role": "user"},
-         {"content": response, "role": "assistant"},
-     ]
-
-     return sample
-
-
- def create_auxiliary_dataset(generated_dataset: Dataset):
-     if "dataset_type" not in generated_dataset.column_names:
-         return None
-
-     # get module path of the current file
-     module_dir = os.path.dirname(os.path.abspath(__file__))
-     aux_inst_path = os.path.join(module_dir, "../configs/knowledge/auxilary_instructions.yaml")
-     if os.path.isfile(aux_inst_path):
-         with open(aux_inst_path, "r", encoding="utf-8") as fp:
-             auxiliary_inst = yaml.safe_load(fp)
-     else:
-         logger.error(f"auxiliary instructions file not found at {aux_inst_path}")
-         return None
-     auxiliary_ds = generated_dataset.filter(
-         lambda x: x["dataset_type"] != "base_document"
-     )
-     unique_document_auxiliary = auxiliary_ds.to_pandas().drop_duplicates(
-         subset=["document"]
-     )
-     unique_document_auxiliary = Dataset.from_pandas(unique_document_auxiliary)
-     unique_document_auxiliary = unique_document_auxiliary.remove_columns(
-         [
-             col
-             for col in unique_document_auxiliary.column_names
-             if col
-             not in [
-                 "raw_document",
-                 "document_outline",
-                 "domain",
-                 "dataset_type",
-                 "document",
-             ]
-         ]
-     )
-     unique_document_auxiliary = unique_document_auxiliary.rename_columns(
-         {"raw_document": "context", "document": "response"}
-     )
-
-     def __create_auxiliary_ds(rec):
-         instruction = random.choice(auxiliary_inst[rec["dataset_type"]])
-         messages = [
-             {"role": "user", "content": f"{rec['context']}\n\n{instruction}"},
-             {"role": "assistant", "content": rec["response"]},
-         ]
-         metadata = json.dumps(
-             {
-                 "dataset_type": rec["dataset_type"],
-                 "raw_document": rec["context"],
-                 "dataset": f"document_{rec['dataset_type']}",
-                 "domain": rec["domain"],
-             }
-         )
-         return {"messages": messages, "metadata": metadata, "id": str(uuid.uuid4())}
-
-     unique_document_auxiliary = unique_document_auxiliary.map(
-         __create_auxiliary_ds, remove_columns=unique_document_auxiliary.column_names
-     )
-     return unique_document_auxiliary
-
-
- def generate_knowledge_qa_dataset(
-     generated_dataset: Dataset, keep_context_separate=False
- ):
-     def __create_qa_row(rec):
-         context = rec["document"]
-         instruction = rec["question"]
-         response = rec["response"]
-         metadata = {
-             "sdg_document": rec["document"],
-             "domain": rec["domain"],
-             "dataset": "document_knowledge_qa",
-         }
-         if "raw_document" in rec and "dataset_type" in rec:
-             metadata.update(
-                 {
-                     "raw_document": rec["raw_document"],
-                     "dataset_type": rec["dataset_type"],
-                 }
-             )
-         metadata = json.dumps(metadata)
-         if keep_context_separate:
-             messages = [
-                 {"role": "user", "content": f"{instruction}"},
-                 {"role": "assistant", "content": response},
-             ]
-             return {
-                 "messages": messages,
-                 "metadata": metadata,
-                 "id": str(uuid.uuid4()),
-                 "context": context,
-             }
-         else:
-             messages = [
-                 {"role": "user", "content": f"{context}\n\n{instruction}"},
-                 {"role": "assistant", "content": response},
-             ]
-
-             return {"messages": messages, "metadata": metadata, "id": str(uuid.uuid4())}
-
-     knowledge_ds = generated_dataset.map(
-         __create_qa_row, remove_columns=generated_dataset.column_names
-     )
-     return knowledge_ds
-
-
- def build_raft_dataset(ds: Dataset, p, num_doc_in_context=4):
-     all_context = list(set(ds["context"]))
-
-     def _pick_documents(rec, p):
-         answer_document = rec["context"]
-         selected_docs = [e for e in all_context if e != answer_document]
-         if len(selected_docs) > 0:
-             if len(selected_docs) < num_doc_in_context:
-                 logger.info(
-                     f"Number of unique documents is {len(selected_docs)}, which is less than {num_doc_in_context}. Using all the documents in the RAFT context."
-                 )
-             if random.uniform(0, 1) < p:
-                 # golden/answer + distractor documents
-                 docs = (
-                     random.sample(selected_docs, k=num_doc_in_context - 1) + [answer_document]
-                     if len(selected_docs) >= (num_doc_in_context - 1)
-                     else selected_docs + [answer_document]
-                 )
-             else:
-                 # distractor documents
-                 docs = (
-                     random.sample(selected_docs, k=num_doc_in_context)
-                     if len(selected_docs) >= num_doc_in_context
-                     else selected_docs
-                 )
-         else:
-             logger.info("Only 1 unique document found. Turning off RAFT styling")
-             docs = [answer_document]
-
-         random.shuffle(docs)
-
-         docs = "\n".join(f"Document:\n{e}\n\n" for e in docs)
-         user_idx, user_msg = [
-             (idx, rec_msg)
-             for idx, rec_msg in enumerate(rec["messages"])
-             if rec_msg["role"] == "user"
-         ][0]
-         user_inst = user_msg["content"]
-         rec["messages"][user_idx]["content"] = f"{docs}\n\n{user_inst}"
-         metadata = json.loads(rec["metadata"])
-         metadata["dataset"] += f"_raft_p{p}"
-         rec["metadata"] = json.dumps(metadata)
-         return rec
-
-     ds = ds.map(_pick_documents, fn_kwargs={"p": p}, remove_columns=["context"])
-     return ds
-
-
- def _conv_pretrain(rec):
-     rec["messages"] = [
-         {
-             "role": "pretraining",
-             "content": f"<|user|>\n{rec['messages'][0]['content']}\n<|assistant|>\n{rec['messages'][1]['content']}",
-         }
-     ]
-     return rec
-
-
- def create_knowledge_regular_ds(generated_dataset: Dataset):
-     # Phase 1.0
-     knowledge_ds = generate_knowledge_qa_dataset(
-         generated_dataset, keep_context_separate=True
-     )
-     knowledge_ds = build_raft_dataset(knowledge_ds, p=0.4)
-
-     auxiliary_dataset = create_auxiliary_dataset(generated_dataset)
-     if auxiliary_dataset is not None:
-         transformed_data = safe_concatenate_datasets([knowledge_ds, auxiliary_dataset])
-     else:
-         transformed_data = knowledge_ds
-     return transformed_data
-
-
- def create_knowledge_pretraining_ds(generated_dataset: Dataset):
-     # Phase 0.7
-     knowledge_ds = generate_knowledge_qa_dataset(
-         generated_dataset, keep_context_separate=False
-     )
-     knowledge_ds = knowledge_ds.map(_conv_pretrain)
-
-     auxiliary_dataset = create_auxiliary_dataset(generated_dataset)
-     if auxiliary_dataset is not None:
-         auxiliary_dataset = auxiliary_dataset.map(_conv_pretrain)
-         transformed_data = safe_concatenate_datasets([knowledge_ds, auxiliary_dataset])
-     else:
-         transformed_data = knowledge_ds
-     return transformed_data
-
-
- def post_process_mcq(ds: Dataset, is_mmlu_eval: bool = False) -> Dataset:
-     """Filter out badly generated data and add a dataset type column.
-
-     Args:
-         ds (Dataset): MCQ dataset generated by the mmlu pipeline
-         is_mmlu_eval (bool, optional): whether to format the dataset for MMLU-style evaluation. Defaults to False.
-
-     Returns:
-         Dataset: filtered HF dataset with the new column
-     """
-     ds = ds.filter(lambda x: ")" in x["mmlubench_answer"])
-     ds = ds.filter(lambda x: "A)" in x["mmlubench_question"])
-     ds = ds.add_column("dataset_type", ["mcq_qa"] * ds.num_rows)
-     if is_mmlu_eval:
-         return format_mmlu_style(ds)
-     return ds
-
-
- def extract_options(text: str) -> list[Any]:
-     """Extract MCQ options from a question with a regex.
-
-     Args:
-         text (str): question with options/mcq choices
-
-     Returns:
-         list[Any]: options under the question that match the pattern.
-     """
-     # Use a regular expression to find patterns and capture the text after the letter and parenthesis
-     pattern = r"\b[A-Z]\) (.+)"
-     matches = re.findall(pattern, text)
-     return matches
-
-
- def format_mmlu_style(ds: Dataset) -> Dataset:
-     """Format the dataset according to the lm-harness MMLU requirements.
-
-     Args:
-         ds (Dataset): input dataset
-
-     Returns:
-         Dataset: formatted HF dataset
-     """
-     ds = ds.map(
-         lambda x: {"answer": x["mmlubench_answer"][: x["mmlubench_answer"].index(")")]}
-     )
-     ds = ds.map(lambda x: {"choices": extract_options(x["mmlubench_question"])})
-     ds = ds.map(
-         lambda x: {
-             "question": x["mmlubench_question"][
-                 : x["mmlubench_question"].index("A)")
-             ].strip()
-         }
-     )
-     ds = ds.rename_columns({"domain": "subject"})
-     ds = ds.filter(lambda x: x["choices"])
-     ds = ds.filter(lambda x: len(x["choices"]) == 4)
-     ds = ds.filter(lambda x: x["answer"] in ["A", "B", "C", "D"])
-     ds = ds.class_encode_column("answer")
-     return ds
-
-
- def create_mmlu_evaluation_dataset(generate_mcq_dataset: Dataset) -> Dataset:
-     """Filter and format an MCQ dataset so it is compatible with lm-harness MMLU-style evaluation.
-
-     Args:
-         generate_mcq_dataset (Dataset): SDG-generated MCQ dataset
-
-     Returns:
-         Dataset: MMLU MCQ dataset
-     """
-     mmlu_dataset = post_process_mcq(generate_mcq_dataset, is_mmlu_eval=True)
-     return mmlu_dataset
-
-
- def create_mmlu_evaluation_yaml(task_name, eval_data_file_path, yaml_file_path):
-     """Prepare the task YAML used by lm-eval-harness to evaluate knowledge with an MMLU-style metric."""
-     task_yaml = {
-         "task": task_name,
-         "dataset_kwargs": {"data_files": {"test": eval_data_file_path}},
-         "include": "_default_mmlu_pr_template_yaml",
-         "group": "mmlu_pr",
-     }
-     with open(yaml_file_path, "w", encoding="utf-8") as yaml_file:
-         yaml.dump(task_yaml, yaml_file, default_flow_style=False)
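
For reference, the question/answer recovery hack deleted above reduces to splitting a single generated string on its first "?". A minimal sketch of that heuristic (`split_qa` is an illustrative name, not part of the package):

```python
def split_qa(output: str) -> tuple[str, str]:
    """Split a combined q/a string into (question, response) on the first '?'."""
    parts = output.split("?", 1)
    if len(parts) != 2:
        # Mirrors the removed helpers' fallback when no '?' is present:
        # _get_question returned "" and _get_response returned the whole text.
        return "", parts[0].strip()
    return parts[0].strip() + "?", parts[1].strip()

print(split_qa("What is SDG? A framework for synthetic data generation."))
# -> ('What is SDG?', 'A framework for synthetic data generation.')
```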
@@ -1,154 +0,0 @@
- Metadata-Version: 2.4
- Name: sdg_hub
- Version: 0.1.0a3
- Summary: Synthetic Data Generation
- Author-email: Red Hat AI Innovation <abhandwa@redhat.com>
- License: Apache-2.0
- Project-URL: homepage, https://ai-innovation.team/
- Project-URL: source, https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub
- Project-URL: issues, https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/issues
- Classifier: Development Status :: 3 - Alpha
- Classifier: Environment :: Console
- Classifier: License :: OSI Approved :: Apache Software License
- Classifier: License :: OSI Approved :: MIT License
- Classifier: Operating System :: MacOS :: MacOS X
- Classifier: Operating System :: POSIX :: Linux
- Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
- Classifier: Programming Language :: Python :: 3
- Classifier: Programming Language :: Python :: 3.9
- Classifier: Programming Language :: Python :: 3.10
- Classifier: Programming Language :: Python :: 3.11
- Classifier: Programming Language :: Python :: 3.12
- Classifier: Programming Language :: Python :: Implementation :: CPython
- Requires-Python: >=3.9
- Description-Content-Type: text/markdown
- License-File: LICENSE
- Requires-Dist: click<9.0.0,>=8.1.7
- Requires-Dist: datasets<4.0.0,>=2.18.0
- Requires-Dist: httpx<1.0.0,>=0.25.0
- Requires-Dist: jinja2
- Requires-Dist: langchain-text-splitters
- Requires-Dist: openai<2.0.0,>=1.13.3
- Requires-Dist: rich
- Requires-Dist: tenacity!=8.4.0,>=8.3.0
- Requires-Dist: tqdm<5.0.0,>=4.66.2
- Dynamic: license-file
-
- # Synthetic Data Generation for LLMs
-
- The SDG Framework is a modular, scalable, and efficient solution for creating synthetic data generation workflows in a "no-code" manner. At its core, this framework is designed to simplify data creation for LLMs, allowing users to chain computational units and build powerful pipelines for generating data and processing tasks.
-
-
-
- ## Core Design Principles
-
- The framework is built around the following principles:
-
- 1. **Modular Design**: Highly composable blocks form the building units of the framework, allowing users to build workflows effortlessly.
- 2. **No-Code Workflow Creation**: Specify workflows using simple YAML configuration files.
- 3. **Scalability and Performance**: Optimized for handling large-scale workflows with millions of records.
-
- ---
-
- ## Framework Architecture
-
- ![overview](assets/imgs/overview.png)
-
- ### Blocks: The Fundamental Unit
-
- At the heart of the framework is the **Block**. Each block is a self-contained computational unit that performs specific tasks, such as:
-
- - Making LLM calls
- - Performing data transformations
- - Applying filters
-
- Blocks are designed to be:
- - **Modular**: Reusable across multiple pipelines.
- - **Composable**: Easily chained together to create workflows.
-
- These blocks are implemented in the [src/sdg_hub/blocks](src/sdg_hub/blocks) directory.
-
- ### Pipelines: Higher-Level Abstraction
-
- Blocks can be chained together to form a **Pipeline**. Pipelines enable:
- - Linear or recursive chaining of blocks.
- - Execution of complex workflows by chaining multiple pipelines together.
-
- ### SDG Workflow: Full Workflow Automation
-
- Pipelines are further orchestrated into **SDG Workflows**, enabling seamless end-to-end processing. Invoking `sdg_hub.generate` triggers one or more pipelines that process data through all the configured blocks.
-
- ---
-
- ### YAML-Based Workflow: The Flow
-
- The YAML configuration file, known as the **Flow**, is central to defining data generation workflows in the SDG Framework. A Flow describes how blocks and pipelines are orchestrated to process and generate data efficiently. By leveraging YAML, users can create highly customizable and modular workflows without writing any code.
-
- #### Key Features of a Flow
-
- 1. **Modular Design**:
-    - Flows are composed of blocks, which can be chained together into pipelines.
-    - Each block performs a specific task, such as generating, filtering, or transforming data.
-
- 2. **Reusability**:
-    - Blocks and configurations defined in a Flow can be reused across different workflows.
-    - YAML makes it easy to tweak or extend workflows without significant changes.
-
- 3. **Ease of Configuration**:
-    - Users can specify block types, configurations, and data processing details in a simple and intuitive manner.
-
- ---
-
- ### Sample Flow
-
- Here is an example of a Flow configuration:
-
- ```yaml
- - block_type: LLMBlock
-   block_config:
-     block_name: gen_questions
-     config_path: configs/skills/freeform_questions.yaml
-     model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
-     output_cols:
-       - question
-     batch_kwargs:
-       num_samples: 30
-   drop_duplicates:
-     - question
- - block_type: FilterByValueBlock
-   block_config:
-     block_name: filter_questions
-     filter_column: score
-     filter_value: 1.0
-     operation: operator.eq
-     convert_dtype: float
-     batch_kwargs:
-       num_procs: 8
-   drop_columns:
-     - evaluation
-     - score
-     - num_samples
- - block_type: LLMBlock
-   block_config:
-     block_name: gen_responses
-     config_path: configs/skills/freeform_responses.yaml
-     model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
-     output_cols:
-       - response
- ```
-
- ### Dataflow and Storage
-
- - **Data Representation**: Dataflow between blocks and pipelines is handled using **Hugging Face Datasets**, which are based on Arrow tables. This provides:
-   - Native parallelization capabilities (e.g., maps, filters).
-   - Support for efficient data transformations.
-
- - **Data Checkpoints**: Intermediate caches of generated data. Checkpoints allow users to:
-   - Resume workflows from the last successful state if interrupted.
-   - Improve reliability for long-running workflows.
-
- ---
-
- ## Examples
-
- For sample use cases and implementation examples, please refer to the [examples](examples) directory. This directory contains various examples demonstrating different workflows and use cases of the SDG Framework.
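
The removed README's sample Flow is plain YAML, so it parses into a list of block definitions with any YAML loader. A minimal sketch with PyYAML, using an abbreviated flow (this is not the package's own loader):

```python
import yaml

# Abbreviated version of the sample Flow above: a Flow file is just a
# YAML list of block definitions.
flow_yaml = """
- block_type: LLMBlock
  block_config:
    block_name: gen_questions
    output_cols:
      - question
- block_type: FilterByValueBlock
  block_config:
    block_name: filter_questions
    filter_column: score
    filter_value: 1.0
    operation: operator.eq
"""

flow = yaml.safe_load(flow_yaml)
for block in flow:
    print(block["block_type"], "->", block["block_config"]["block_name"])
# LLMBlock -> gen_questions
# FilterByValueBlock -> filter_questions
```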
@@ -1,90 +0,0 @@
- sdg_hub/__init__.py,sha256=5Wa6onDndPvG4iwnjq2jK747t3-7XKdQn2WfHfq1sFc,67
- sdg_hub/_version.py,sha256=wrhrM1UZdxROWn7XOHbbPZa5jOBzV8tlSBMw233huBg,513
- sdg_hub/flow.py,sha256=3b97fMei1rWuQWeNfv-xyHKUbcMaf-d_b9Xms9J3BCQ,5425
- sdg_hub/logger_config.py,sha256=7uHEJVRfym1c4n95DOKHelLXqAus8uHsZYmzLsEjqpo,422
- sdg_hub/pipeline.py,sha256=u24ccryfy_nOSvsrWiynNmq1rOmOOkw1L5-TqJvuRSo,2339
- sdg_hub/prompts.py,sha256=dOiC9CsNbMt5Km9PnwyuW0v9zUs3cVXE5jZYwtXZTwc,1957
- sdg_hub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- sdg_hub/registry.py,sha256=Sc_HNxo4n0pgWMiEDd_sLjxaSXAMZFiHJIhQKqjywwk,3772
- sdg_hub/sdg.py,sha256=SXXnDGA3MpYlNpsw4XyImL97l0pXiF5P9jrDkZNlDJc,6492
- sdg_hub/blocks/__init__.py,sha256=OwPWofuBBWG7n0nYAXNtFXdq4rPf7FyvKkPfjUBlqec,130
- sdg_hub/blocks/block.py,sha256=ObJp8JaAhQ3lQK6SOYoqHPc7b2hBZMhOXEmIap_qa1k,1788
- sdg_hub/blocks/filterblock.py,sha256=leH0k3stcRzdCWoy8kI2hFruGJ0VUemeA4QBW1eQcdQ,2650
- sdg_hub/blocks/iterblock.py,sha256=7UZnK_JyQfbMhVNVzZ79TtEtADLuosI0z62LhoP63s4,958
- sdg_hub/blocks/llmblock.py,sha256=Jy5vWvcMpXphtv4JEc9Nyjs7lgcoF-Yp0gYx4d_Iopc,16156
- sdg_hub/blocks/rmblocks.py,sha256=nw0p1LytHO7Dmc8RGfJ5uajDQWM93-oNoYrzhaY2QEY,6222
- sdg_hub/blocks/utilblocks.py,sha256=nAehqcDKiDE5W3REGApytYAXztRm9AW65cAy95Ufb8U,4926
- sdg_hub/configs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- sdg_hub/configs/annotations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- sdg_hub/configs/annotations/cot_reflection.yaml,sha256=60EdsTe1y7GoUIAWYSGfMa3EKI3oLZKCvDuKU7wHgQU,1737
- sdg_hub/configs/annotations/detailed_description.yaml,sha256=FsGbQMBxf1MAOi0nhrQ4icxcwYMlRura_ji9Pmeh1AA,192
- sdg_hub/configs/annotations/detailed_description_icl.yaml,sha256=NDdwo5EShnYZjm1Fn80sZTAwfnwpPigixP2hvJ8--cU,679
- sdg_hub/configs/annotations/simple.yaml,sha256=C89QyC4DGJqdsr6mW3iqfUcAOj5rMZZSesmMdFoACuM,199
- sdg_hub/configs/knowledge/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- sdg_hub/configs/knowledge/atomic_facts.yaml,sha256=9icyigsMooyBR_nEwWgj9eBAnuc3kMZMNnEy6AxFSKU,2430
- sdg_hub/configs/knowledge/auxilary_instructions.yaml,sha256=aCgIjvNacdC2ZHThEvhZKvwORK6KqErVvVYQYQrIDLE,2034
- sdg_hub/configs/knowledge/detailed_summary.yaml,sha256=PBymlZljkzN8kbo5DgmNsSM_Xb76SZifuS5Yl-x4Uy4,365
- sdg_hub/configs/knowledge/evaluate_faithfulness.yaml,sha256=iuvx5vNNm_jzHlmcKF83StaDYezRz2vQn3JUHM-TMdQ,3054
- sdg_hub/configs/knowledge/evaluate_question.yaml,sha256=02mikEAJCUEkREBo7KxPY9H6iTUHQN-4cRkn2XMlVQ8,1915
- sdg_hub/configs/knowledge/evaluate_relevancy.yaml,sha256=ASh8A1HAYO1h1tQRrwGnkUmK1n-WDKLdfW_LbSW1ipQ,3690
- sdg_hub/configs/knowledge/extractive_summary.yaml,sha256=06Z9lDiZUsQEURhpwWUVXA3wYO3bRaC0aNoGCpo3-44,376
- sdg_hub/configs/knowledge/generate_code_questions_responses.yaml,sha256=cIus2JYMYDvxHFVSU9QVa-1IK5KoChb3rCU2b4b9UmI,908
- sdg_hub/configs/knowledge/generate_questions_responses.yaml,sha256=H9nb_5xGP7k6HtC3VboXqpiI5kQ9Xp3vjhXH3YIFesk,2525
- sdg_hub/configs/knowledge/mcq_generation.yaml,sha256=d4VKegnVIexwCn0e2AJs-0DC6XdLyUBGaCsQVwzICUE,3152
- sdg_hub/configs/knowledge/router.yaml,sha256=9m_cX3xl808Vwrcq2PACyX45QFPkrV2nVYIY8x10JBU,119
- sdg_hub/configs/knowledge/simple_generate_qa.yaml,sha256=OsuZP9SxQeUhTsHdhUO10mnjJ1u_6xekW5IQucFpRco,1565
- sdg_hub/configs/knowledge/data_recipe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- sdg_hub/configs/knowledge/data_recipe/default_recipe.yaml,sha256=mB4uQifuS9F5ewKtxwd93XM5yZTZfSqiXxKhdT8bYT8,232
- sdg_hub/configs/reasoning/dynamic_cot.yaml,sha256=6XY_mFpB_oKFQ7U2CmHTqkJRGVHgOvpNmIDfhksYW6o,2641
- sdg_hub/configs/skills/_A_.yaml,sha256=a5m-pUcV9xUb54gQ5U3vsU1RBXzOmsfX0CjTW7U62zo,5240
- sdg_hub/configs/skills/_B_.yaml,sha256=P751l6NvFRkINWz-bX5jgnd_if2bl3d_NlhGI7g81xw,4654
- sdg_hub/configs/skills/_C_.yaml,sha256=tZyiJ4Q3gG4uuoDXw6g__lX3ySEUaRZW2GhW1ustwaM,11370
- sdg_hub/configs/skills/_D_.yaml,sha256=hNq-QudlXrg9CWLpJdrZ4v3vifGTWhyp2gcfwPdR3_o,6776
- sdg_hub/configs/skills/_E_.yaml,sha256=eesIlH9SO07TVF20gy18MZrcDzLhSmynd_F_lvg0oQg,4335
- sdg_hub/configs/skills/_F_.yaml,sha256=LYEyA7wv7QWQscUNQr0K_lotNoWSfuoAEncx3PCRYIs,6997
- sdg_hub/configs/skills/_G_.yaml,sha256=5dcLC5jXOEeDasBkTunnHYrlddI3HcHYnEAXZcrd0ds,8412
- sdg_hub/configs/skills/_H_.yaml,sha256=El-57IjZ5IvdcmCHyHvX_M2RFFkEos572220be8ecrQ,11335
- sdg_hub/configs/skills/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- sdg_hub/configs/skills/analyzer.yaml,sha256=QBtyjaU6HBZqzNOmev_W4_scn_hH7Rfxd2xL_LcPLho,2261
- sdg_hub/configs/skills/annotation.yaml,sha256=k5nJ357kUr0Uvq7Hkt3Ey22UbgSjgSjIomjHFfjaQnY,916
- sdg_hub/configs/skills/contexts.yaml,sha256=xSFB6_VmNkEixmqv0RKo6_9CI1i5SD4zvwyJtAIc0vk,1206
- sdg_hub/configs/skills/critic.yaml,sha256=Dr7anOKa7Xx1oDonXzsCfXwKIl4hUTArx2Sb_rgpLQI,1808
- sdg_hub/configs/skills/evaluate_freeform_pair.yaml,sha256=waszWejwK8hkNw7xl70H4FIIOAt9SL4R4ufZVkTvl-c,3026
- sdg_hub/configs/skills/evaluate_freeform_questions.yaml,sha256=peyoumtMh_OAQJxPN02Yb3M4gP_2B8czVgbRYC4Np94,2116
- sdg_hub/configs/skills/evaluate_grounded_pair.yaml,sha256=PzkuY491f9-jDwFy3Xm_y4A6ebIcpAaJ6FtGiAYLVWg,3181
- sdg_hub/configs/skills/evaluate_grounded_questions.yaml,sha256=9yr97azFhMdOfYp11BFtDSIhhP4wjQMOxYZnKWKlCPU,3115
- sdg_hub/configs/skills/freeform_questions.yaml,sha256=5mkwtJDKuFz0U8W8HTMXvYV8mXZaWyN1IDf2cLHO7gg,1512
- sdg_hub/configs/skills/freeform_responses.yaml,sha256=_BLeR2DDOHpXHn2TYMwD4deGLW2Ae2kgeJuEEu3qJGU,1492
- sdg_hub/configs/skills/grounded_questions.yaml,sha256=t6pKjt5Fp_ThZueB7JBrUKuQLQY_At-Y9O67OtrIXMo,1898
- sdg_hub/configs/skills/grounded_responses.yaml,sha256=kVOeBp3BjKCFKG2qConXIQVVPI1EgcKJgKn6DFAkl1s,1860
- sdg_hub/configs/skills/judge.yaml,sha256=FxnJA_wdmyMyMqGEZDAT8hc2itO845mGDNXgpmV2EUU,3203
- sdg_hub/configs/skills/planner.yaml,sha256=yNF6t0EnmwYt1EV9Y3-vkmPcbOQRtvoLr8MITuiUw_A,2086
- sdg_hub/configs/skills/respond.yaml,sha256=K1Q5X5_Q1k60hNDbHDjMYBzxbyOIEEHTQcXW6qQ4Ve0,108
- sdg_hub/configs/skills/revised_responder.yaml,sha256=rjypOJbhZV9PuOD9YhlYgymxOJV8Zdzzz54x6Fxn2bY,2875
- sdg_hub/configs/skills/router.yaml,sha256=CIfea7uIycwGO4cC5a_cBH_OBFZ0F1grO8TE2VCEpAI,121
- sdg_hub/configs/skills/simple_generate_qa_freeform.yaml,sha256=j8cJtEKSvtA__rE08iU6oz2XnfIgj0HiLVL8-6RhK3c,1431
- sdg_hub/configs/skills/simple_generate_qa_grounded.yaml,sha256=tvX9EN5TArFesOOqpdN3hb-IHe7O82a2twQd-gzyCgw,1500
- sdg_hub/configs/skills/data_recipe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- sdg_hub/configs/skills/data_recipe/default_recipe.yaml,sha256=z1432g7gqKOan57vr72gk9QTC7p7xNkxGsaJMeO_yDY,296
- sdg_hub/flows/annotation/emotion/detailed_description.yaml,sha256=TmiTDFo3jPbXjQIgmm-QJg66nPqKMxmMYFa1clm3AjY,422
- sdg_hub/flows/annotation/emotion/detailed_description_icl.yaml,sha256=1Vk3iKMn1HJX7AIthS8Z2pd0y6WTQ6qWXT-w8J2MggE,426
- sdg_hub/flows/annotation/emotion/simple.yaml,sha256=eX7I8IngXoOklaDgWuJU2X12QLZ5qPAo5WMcI1qadDo,408
- sdg_hub/flows/generation/knowledge/mmlu_bench.yaml,sha256=Rueuxr_n1zabE_nGqOgUfh5hqVmEONRka9NLiZANSew,346
- sdg_hub/flows/generation/knowledge/simple_knowledge.yaml,sha256=o4uyfs1nDiECcNROdsvHKiM46NYvQufo9dF4XSGpY54,298
- sdg_hub/flows/generation/knowledge/synth_knowledge.yaml,sha256=ZTZvevfwDQSKUwPcv1i5IzIchsRHSEN03eTefedQmU8,2172
- sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml,sha256=aVnHkp0DkeuVgWdZ2eUQf5-uzI8tPYOkrs27yoF8m5g,3393
- sdg_hub/flows/generation/skills/agentic_improve_skill.yaml,sha256=XvdvUsn-mi4TmLwCbQ-5sYQqvaxfSTTNXqSfxzO0RXo,2750
- sdg_hub/flows/generation/skills/simple_freeform_skill.yaml,sha256=iVEomFH1E52JA7KLmTIwkS1PnzxUJVPMgbK2O-m80As,309
- sdg_hub/flows/generation/skills/simple_grounded_skill.yaml,sha256=LTLxqdgbLIKSJonuIRHhcRSpit1EawwNvytWzXWXe2E,309
- sdg_hub/flows/generation/skills/synth_grounded_skills.yaml,sha256=91Dm--agpmbm02hIVnFhEndjppKsQEWXDbckR9GAzKM,2045
- sdg_hub/flows/generation/skills/synth_skills.yaml,sha256=PhUP2iBo4RkeFafSW-qxh4WmX_ZTfGi0UAmwN_XSTqs,1504
- sdg_hub/utils/__init__.py,sha256=UEo-9qPt5iVKBIRvgZhOI0SoIBO6zeBxOuLvUQXaM3g,185
- sdg_hub/utils/chunking.py,sha256=VSPQ8dSFI5LF4sefcI0tzWG0Vc1rM_FSMTO6xg_iFzA,2556
- sdg_hub/utils/datautils.py,sha256=0t_SZ_UXBKl8uL6rVp3SUh8YKRbzKlh2oO5gr2cKyEw,389
- sdg_hub/utils/docprocessor.py,sha256=Z4J2DfLhRxMCeIeMKttwi-FdivmPqI-hjEwq6-Ub35c,12485
- sdg_hub/utils/parse_and_convert.py,sha256=I27FdS-H2mSoZ07SsKZmNYM2F_Cg7GHTBXD7YNgASNw,13443
- sdg_hub-0.1.0a3.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- sdg_hub-0.1.0a3.dist-info/METADATA,sha256=vUusH0jLACOcoxvTL-e5dAPfhoTV--zgs_MJ-6IYQfQ,5847
- sdg_hub-0.1.0a3.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
- sdg_hub-0.1.0a3.dist-info/top_level.txt,sha256=TqI7d-HE1n6zkXFkU0nF3A1Ct0P0pBaqI675uFokhx4,8
- sdg_hub-0.1.0a3.dist-info/RECORD,,
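
Each RECORD entry above has the form `path,sha256=<hash>,size`, where the hash is the urlsafe base64 encoding of the file's raw SHA-256 digest with the `=` padding stripped, per the wheel spec. A sketch of recomputing such a hash (the path is illustrative, assuming an unpacked wheel):

```python
import base64
import hashlib

def record_hash(path: str) -> str:
    """Compute the sha256=... value a wheel RECORD stores for a file."""
    with open(path, "rb") as f:
        digest = hashlib.sha256(f.read()).digest()
    # urlsafe base64 without '=' padding, matching the entries above
    return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

# Hypothetical usage:
# print(record_hash("sdg_hub/_version.py"))  # -> sha256=wrhrM1UZ...
```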