sdg-hub 0.1.0a2.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sdg_hub/__init__.py +4 -0
- sdg_hub/_version.py +21 -0
- sdg_hub/blocks/__init__.py +6 -0
- sdg_hub/blocks/block.py +54 -0
- sdg_hub/blocks/filterblock.py +76 -0
- sdg_hub/blocks/iterblock.py +31 -0
- sdg_hub/blocks/llmblock.py +430 -0
- sdg_hub/blocks/rmblocks.py +194 -0
- sdg_hub/blocks/utilblocks.py +140 -0
- sdg_hub/configs/__init__.py +0 -0
- sdg_hub/configs/annotations/__init__.py +0 -0
- sdg_hub/configs/annotations/cot_reflection.yaml +34 -0
- sdg_hub/configs/annotations/detailed_description.yaml +10 -0
- sdg_hub/configs/annotations/detailed_description_icl.yaml +32 -0
- sdg_hub/configs/annotations/simple.yaml +10 -0
- sdg_hub/configs/knowledge/__init__.py +0 -0
- sdg_hub/configs/knowledge/atomic_facts.yaml +45 -0
- sdg_hub/configs/knowledge/auxilary_instructions.yaml +35 -0
- sdg_hub/configs/knowledge/data_recipe/__init__.py +0 -0
- sdg_hub/configs/knowledge/data_recipe/default_recipe.yaml +3 -0
- sdg_hub/configs/knowledge/detailed_summary.yaml +17 -0
- sdg_hub/configs/knowledge/evaluate_faithfulness.yaml +68 -0
- sdg_hub/configs/knowledge/evaluate_question.yaml +38 -0
- sdg_hub/configs/knowledge/evaluate_relevancy.yaml +85 -0
- sdg_hub/configs/knowledge/extractive_summary.yaml +17 -0
- sdg_hub/configs/knowledge/generate_code_questions_responses.yaml +39 -0
- sdg_hub/configs/knowledge/generate_questions_responses.yaml +56 -0
- sdg_hub/configs/knowledge/mcq_generation.yaml +83 -0
- sdg_hub/configs/knowledge/router.yaml +12 -0
- sdg_hub/configs/knowledge/simple_generate_qa.yaml +34 -0
- sdg_hub/configs/reasoning/dynamic_cot.yaml +40 -0
- sdg_hub/configs/skills/_A_.yaml +97 -0
- sdg_hub/configs/skills/_B_.yaml +36 -0
- sdg_hub/configs/skills/_C_.yaml +71 -0
- sdg_hub/configs/skills/_D_.yaml +85 -0
- sdg_hub/configs/skills/_E_.yaml +30 -0
- sdg_hub/configs/skills/_F_.yaml +45 -0
- sdg_hub/configs/skills/_G_.yaml +56 -0
- sdg_hub/configs/skills/_H_.yaml +80 -0
- sdg_hub/configs/skills/__init__.py +0 -0
- sdg_hub/configs/skills/analyzer.yaml +48 -0
- sdg_hub/configs/skills/annotation.yaml +36 -0
- sdg_hub/configs/skills/contexts.yaml +21 -0
- sdg_hub/configs/skills/critic.yaml +60 -0
- sdg_hub/configs/skills/data_recipe/__init__.py +0 -0
- sdg_hub/configs/skills/data_recipe/default_recipe.yaml +6 -0
- sdg_hub/configs/skills/evaluate_freeform_pair.yaml +44 -0
- sdg_hub/configs/skills/evaluate_freeform_questions.yaml +46 -0
- sdg_hub/configs/skills/evaluate_grounded_pair.yaml +54 -0
- sdg_hub/configs/skills/evaluate_grounded_questions.yaml +51 -0
- sdg_hub/configs/skills/freeform_questions.yaml +29 -0
- sdg_hub/configs/skills/freeform_responses.yaml +45 -0
- sdg_hub/configs/skills/grounded_questions.yaml +38 -0
- sdg_hub/configs/skills/grounded_responses.yaml +59 -0
- sdg_hub/configs/skills/judge.yaml +53 -0
- sdg_hub/configs/skills/planner.yaml +67 -0
- sdg_hub/configs/skills/respond.yaml +8 -0
- sdg_hub/configs/skills/revised_responder.yaml +78 -0
- sdg_hub/configs/skills/router.yaml +12 -0
- sdg_hub/configs/skills/simple_generate_qa_freeform.yaml +27 -0
- sdg_hub/configs/skills/simple_generate_qa_grounded.yaml +31 -0
- sdg_hub/flow.py +127 -0
- sdg_hub/flows/annotation/emotion/detailed_description.yaml +19 -0
- sdg_hub/flows/annotation/emotion/detailed_description_icl.yaml +19 -0
- sdg_hub/flows/annotation/emotion/simple.yaml +19 -0
- sdg_hub/flows/generation/knowledge/mmlu_bench.yaml +13 -0
- sdg_hub/flows/generation/knowledge/simple_knowledge.yaml +12 -0
- sdg_hub/flows/generation/knowledge/synth_knowledge.yaml +89 -0
- sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +136 -0
- sdg_hub/flows/generation/skills/agentic_improve_skill.yaml +108 -0
- sdg_hub/flows/generation/skills/simple_freeform_skill.yaml +12 -0
- sdg_hub/flows/generation/skills/simple_grounded_skill.yaml +12 -0
- sdg_hub/flows/generation/skills/synth_grounded_skills.yaml +80 -0
- sdg_hub/flows/generation/skills/synth_skills.yaml +59 -0
- sdg_hub/logger_config.py +20 -0
- sdg_hub/pipeline.py +66 -0
- sdg_hub/prompts.py +17 -0
- sdg_hub/py.typed +0 -0
- sdg_hub/registry.py +122 -0
- sdg_hub/sdg.py +164 -0
- sdg_hub/utils/__init__.py +5 -0
- sdg_hub/utils/chunking.py +73 -0
- sdg_hub/utils/datamixing.py +123 -0
- sdg_hub/utils/datautils.py +14 -0
- sdg_hub/utils/docprocessor.py +357 -0
- sdg_hub/utils/json.py +48 -0
- sdg_hub/utils/models.py +31 -0
- sdg_hub/utils/parse_and_convert.py +392 -0
- sdg_hub/utils/taxonomy.py +489 -0
- sdg_hub-0.1.0a2.dev0.dist-info/METADATA +154 -0
- sdg_hub-0.1.0a2.dev0.dist-info/RECORD +94 -0
- sdg_hub-0.1.0a2.dev0.dist-info/WHEEL +5 -0
- sdg_hub-0.1.0a2.dev0.dist-info/licenses/LICENSE +201 -0
- sdg_hub-0.1.0a2.dev0.dist-info/top_level.txt +1 -0
sdg_hub/utils/taxonomy.py
@@ -0,0 +1,489 @@
# SPDX-License-Identifier: Apache-2.0

# Standard
from functools import cache
from pathlib import Path
from typing import Any, Dict, List, Mapping, Optional, Union
import glob
import json
import logging
import os
import re
import subprocess
import tempfile

# Third Party
import git
import gitdb
import yaml

# First Party
from sdg_hub import utils
from sdg_hub.utils import chunking

logger = logging.getLogger(__name__)

DEFAULT_YAML_RULES = """\
extends: relaxed

rules:
  line-length:
    max: 120
"""


class TaxonomyReadingException(Exception):
    """An exception raised during reading of the taxonomy."""


TAXONOMY_FOLDERS: List[str] = ["compositional_skills", "knowledge"]
"""Taxonomy folders which are also the schema names"""


def _istaxonomyfile(fn):
    path = Path(fn)
    if path.suffix == ".yaml" and path.parts[0] in TAXONOMY_FOLDERS:
        return True
    return False


def _get_taxonomy_diff(repo="taxonomy", base="origin/main"):
    repo = git.Repo(repo)
    untracked_files = [u for u in repo.untracked_files if _istaxonomyfile(u)]

    branches = [b.name for b in repo.branches]

    head_commit = None
    if "/" in base:
        re_git_branch = re.compile(f"remotes/{base}$", re.MULTILINE)
    elif base in branches:
        re_git_branch = re.compile(f"{base}$", re.MULTILINE)
    else:
        try:
            head_commit = repo.commit(base)
        except gitdb.exc.BadName as e:
            raise SystemExit(
                yaml.YAMLError(
                    f'Couldn\'t find the taxonomy git ref "{base}" from the current HEAD'
                )
            ) from e

    # Move backwards from HEAD until we find the first commit that is part of base
    # then we can take our diff from there
    current_commit = repo.commit("HEAD")
    while not head_commit:
        branches = repo.git.branch("-a", "--contains", current_commit.hexsha)
        if re_git_branch.findall(branches):
            head_commit = current_commit
            break
        try:
            current_commit = current_commit.parents[0]
        except IndexError as e:
            raise SystemExit(
                yaml.YAMLError(
                    f'Couldn\'t find the taxonomy base branch "{base}" from the current HEAD'
                )
            ) from e

    modified_files = [
        d.b_path
        for d in head_commit.diff(None)
        if not d.deleted_file and _istaxonomyfile(d.b_path)
    ]

    updated_taxonomy_files = list(set(untracked_files + modified_files))
    return updated_taxonomy_files


def _get_documents(
    source: Dict[str, Union[str, List[str]]],
    skip_checkout: bool = False,
) -> List[str]:
    """
    Retrieve the content of files from a Git repository.

    Args:
        source (dict): Source info containing repository URL, commit hash, and list of file patterns.

    Returns:
        List[str]: List of document contents.
    """
    repo_url = source.get("repo")
    commit_hash = source.get("commit")
    file_patterns = source.get("patterns", [])
    with tempfile.TemporaryDirectory() as temp_dir:
        try:
            repo = git.Repo.clone_from(repo_url, temp_dir)
            if not skip_checkout:
                repo.git.checkout(commit_hash)

            file_contents = []

            logger.debug("Processing files...")
            for pattern in file_patterns:
                for file_path in glob.glob(os.path.join(repo.working_dir, pattern)):
                    if os.path.isfile(file_path) and file_path.endswith(".md"):
                        with open(file_path, "r", encoding="utf-8") as file:
                            file_contents.append(file.read())

            if file_contents:
                return file_contents
            raise SystemExit("Couldn't find knowledge documents")
        except (OSError, git.exc.GitCommandError, FileNotFoundError) as e:
            raise e


@cache
def _load_schema(path: "importlib.resources.abc.Traversable") -> "referencing.Resource":
    """Load the schema from the path into a Resource object.

    Args:
        path (Traversable): Path to the schema to be loaded.

    Raises:
        NoSuchResource: If the resource cannot be loaded.

    Returns:
        Resource: A Resource containing the requested schema.
    """
    # pylint: disable=C0415
    # Third Party
    from referencing import Resource
    from referencing.exceptions import NoSuchResource
    from referencing.jsonschema import DRAFT202012

    try:
        contents = json.loads(path.read_text(encoding="utf-8"))
        resource = Resource.from_contents(
            contents=contents, default_specification=DRAFT202012
        )
    except Exception as e:
        raise NoSuchResource(ref=str(path)) from e
    return resource


def _validate_yaml(contents: Mapping[str, Any], taxonomy_path: Path) -> int:
    """Validate the parsed yaml document using the taxonomy path to
    determine the proper schema.

    Args:
        contents (Mapping): The parsed yaml document to validate against the schema.
        taxonomy_path (Path): Relative path of the taxonomy yaml document where the
            first element is the schema to use.

    Returns:
        int: The number of errors found during validation.
            Messages for each error have been logged.
    """
    # pylint: disable=C0415
    # Standard
    from importlib import resources

    # Third Party
    from jsonschema.protocols import Validator
    from jsonschema.validators import validator_for
    from referencing import Registry, Resource
    from referencing.exceptions import NoSuchResource
    from referencing.typing import URI

    errors = 0
    version = _get_version(contents)
    schemas_path = resources.files("instructlab.schema").joinpath(f"v{version}")

    def retrieve(uri: URI) -> Resource:
        path = schemas_path.joinpath(uri)
        return _load_schema(path)

    schema_name = taxonomy_path.parts[0]
    if schema_name not in TAXONOMY_FOLDERS:
        schema_name = "knowledge" if "document" in contents else "compositional_skills"
        logger.info(
            f"Cannot determine schema name from path {taxonomy_path}. Using {schema_name} schema."
        )

    try:
        schema_resource = retrieve(f"{schema_name}.json")
        schema = schema_resource.contents
        validator_cls = validator_for(schema)
        validator: Validator = validator_cls(
            schema, registry=Registry(retrieve=retrieve)
        )

        for validation_error in validator.iter_errors(contents):
            errors += 1
            yaml_path = validation_error.json_path[1:]
            if not yaml_path:
                yaml_path = "."
            if validation_error.validator == "minItems":
                # Special handling for minItems which can have a long message for seed_examples
                message = (
                    f"Value must have at least {validation_error.validator_value} items"
                )
            else:
                message = validation_error.message[-200:]
            logger.error(
                f"Validation error in {taxonomy_path}: [{yaml_path}] {message}"
            )
    except NoSuchResource as e:
        cause = e.__cause__ if e.__cause__ is not None else e
        errors += 1
        logger.error(f"Cannot load schema file {e.ref}. {cause}")

    return errors


def _get_version(contents: Mapping) -> int:
    version = contents.get("version", 1)
    if not isinstance(version, int):
        # schema validation will complain about the type
        try:
            version = int(version)
        except ValueError:
            version = 1  # fallback to version 1
    return version


# pylint: disable=broad-exception-caught
def _read_taxonomy_file(file_path: str, yaml_rules: Optional[str] = None):
    seed_instruction_data = []
    warnings = 0
    errors = 0
    file_path = Path(file_path).resolve()
    # file should end with ".yaml" explicitly
    if file_path.suffix != ".yaml":
        logger.warning(
            f"Skipping {file_path}! Use lowercase '.yaml' extension instead."
        )
        warnings += 1
        return None, warnings, errors
    for i in range(len(file_path.parts) - 1, -1, -1):
        if file_path.parts[i] in TAXONOMY_FOLDERS:
            taxonomy_path = Path(*file_path.parts[i:])
            break
    else:
        taxonomy_path = file_path
    # read file if extension is correct
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            contents = yaml.safe_load(file)
        if not contents:
            logger.warning(f"Skipping {file_path} because it is empty!")
            warnings += 1
            return None, warnings, errors
        if not isinstance(contents, Mapping):
            logger.error(
                f"{file_path} is not valid. The top-level element is not an object with key-value pairs."
            )
            errors += 1
            return None, warnings, errors

        # do general YAML linting if specified
        version = _get_version(contents)
        if version > 1:  # no linting for version 1 yaml
            if yaml_rules is not None:
                is_file = os.path.isfile(yaml_rules)
                if is_file:
                    logger.debug(f"Using YAML rules from {yaml_rules}")
                    yamllint_cmd = [
                        "yamllint",
                        "-f",
                        "parsable",
                        "-c",
                        yaml_rules,
                        file_path,
                        "-s",
                    ]
                else:
                    logger.debug(f"Cannot find {yaml_rules}. Using default rules.")
                    yamllint_cmd = [
                        "yamllint",
                        "-f",
                        "parsable",
                        "-d",
                        DEFAULT_YAML_RULES,
                        file_path,
                        "-s",
                    ]
            else:
                yamllint_cmd = [
                    "yamllint",
                    "-f",
                    "parsable",
                    "-d",
                    DEFAULT_YAML_RULES,
                    file_path,
                    "-s",
                ]
            try:
                subprocess.check_output(yamllint_cmd, text=True)
            except subprocess.SubprocessError as e:
                lint_messages = [f"Problems found in file {file_path}"]
                parsed_output = e.output.splitlines()
                for p in parsed_output:
                    errors += 1
                    delim = str(file_path) + ":"
                    parsed_p = p.split(delim)[1]
                    lint_messages.append(parsed_p)
                logger.error("\n".join(lint_messages))
                return None, warnings, errors

        # validation_errors = _validate_yaml(contents, taxonomy_path)
        # if validation_errors:
        #     errors += validation_errors
        #     return None, warnings, errors

        # get seed instruction data
        tax_path = "->".join(taxonomy_path.parent.parts)
        task_description = contents.get("task_description", None)
        domain = contents.get("domain")
        documents = contents.get("document")
        if documents:
            documents = _get_documents(source=documents)
            logger.debug("Content from git repo fetched")

        for seed_example in contents.get("seed_examples"):
            context = seed_example.get("context", "")
            if "questions_and_answers" in seed_example:
                question_answer_list = seed_example.get("questions_and_answers")
                seed_instruction_data.append(
                    {
                        "questions_and_answers": question_answer_list,
                        "input": context,
                        "taxonomy_path": tax_path,
                        "document": documents,
                        "domain": domain,
                        "document_outline": contents.get("document_outline"),
                    }
                )
            else:
                question = seed_example.get("question")
                answer = seed_example.get("answer")

                seed_instruction_data.append(
                    {
                        "instruction": question,
                        "input": context,
                        "output": answer,
                        "taxonomy_path": tax_path,
                        "task_description": task_description,
                        "document": documents,
                        "domain": domain,
                    }
                )
    except Exception as e:
        errors += 1
        raise TaxonomyReadingException(f"Exception {e} raised in {file_path}") from e

    return seed_instruction_data, warnings, errors


def read_taxonomy(taxonomy, taxonomy_base, yaml_rules):
    seed_instruction_data = []
    is_file = os.path.isfile(taxonomy)
    if is_file:  # taxonomy is file
        seed_instruction_data, warnings, errors = _read_taxonomy_file(
            taxonomy, yaml_rules
        )
        if warnings:
            logger.warning(
                f"{warnings} warnings (see above) due to taxonomy file not (fully) usable."
            )
        if errors:
            raise SystemExit(yaml.YAMLError("Taxonomy file with errors! Exiting."))
    else:  # taxonomy is dir
        # Gather the new or changed YAMLs using git diff
        updated_taxonomy_files = _get_taxonomy_diff(taxonomy, taxonomy_base)
        total_errors = 0
        total_warnings = 0
        if updated_taxonomy_files:
            logger.debug("Found new taxonomy files:")
            for e in updated_taxonomy_files:
                logger.debug(f"* {e}")
        for f in updated_taxonomy_files:
            file_path = os.path.join(taxonomy, f)
            data, warnings, errors = _read_taxonomy_file(file_path, yaml_rules)
            total_warnings += warnings
            total_errors += errors
            if data:
                seed_instruction_data.extend(data)
        if total_warnings:
            logger.warning(
                f"{total_warnings} warnings (see above) due to taxonomy files that were not (fully) usable."
            )
        if total_errors:
            raise SystemExit(
                yaml.YAMLError(f"{total_errors} taxonomy files with errors! Exiting.")
            )
    return seed_instruction_data


def read_taxonomy_leaf_nodes(taxonomy, taxonomy_base, yaml_rules):
    seed_instruction_data = read_taxonomy(taxonomy, taxonomy_base, yaml_rules)

    # Transform into a more convenient format to feed into our updated SDG library
    leaf_nodes = {}
    for seed in seed_instruction_data:
        node = leaf_nodes.setdefault(seed["taxonomy_path"], [])
        node.append(seed)
        leaf_nodes[seed["taxonomy_path"]] = node

    return leaf_nodes


def _knowledge_leaf_node_to_samples(leaf_node, server_ctx_size, chunk_word_count):
    samples = []
    # document is the same for the whole leaf node
    chunks = (
        chunking.chunk_document(
            documents=leaf_node[0]["document"],
            server_ctx_size=server_ctx_size,
            chunk_word_count=chunk_word_count,
        )
        if leaf_node[0].get("document")
        else []
    )

    # domain is the same for the whole leaf node
    domain = leaf_node[0].get("domain")

    for chunk in chunks:
        # pylint: disable=consider-using-enumerate
        for icl_ in leaf_node:
            icl_query = {
                f"icl_query_{idx+1}": val["question"]
                for idx, val in enumerate(icl_["questions_and_answers"])
            }
            icl_resp = {
                f"icl_response_{idx+1}": val["answer"]
                for idx, val in enumerate(icl_["questions_and_answers"])
            }
            samples_row = {
                "icl_document": icl_["input"],
                "document": chunk,
                "document_outline": icl_["document_outline"],
                "domain": domain,
            }
            samples_row.update(icl_query)
            samples_row.update(icl_resp)
            samples.append(samples_row)

    return samples


def _skill_leaf_node_to_samples(leaf_node):
    samples = []

    # pylint: disable=consider-using-enumerate
    for i in range(len(leaf_node)):
        samples.append({})
        samples[-1]["task_description"] = leaf_node[i]["task_description"]
        if leaf_node[i].get("input"):
            samples[-1]["seed_context"] = leaf_node[i]["input"]
        samples[-1]["seed_question"] = leaf_node[i]["instruction"]
        samples[-1]["seed_response"] = leaf_node[i]["output"]

    return samples


def leaf_node_to_samples(leaf_node, server_ctx_size, chunk_word_count):
    if not leaf_node:
        return []
    if leaf_node[0].get("document"):
        return _knowledge_leaf_node_to_samples(
            leaf_node, server_ctx_size, chunk_word_count
        )
    return _skill_leaf_node_to_samples(leaf_node)
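For orientation, the public entry points of this module compose as follows. This is a minimal usage sketch, not code from the package; the taxonomy path and tuning values are placeholders.

```python
# Minimal usage sketch for sdg_hub/utils/taxonomy.py; paths and sizes are
# placeholders. read_taxonomy_leaf_nodes groups seed examples by taxonomy
# path, and leaf_node_to_samples flattens each group into SDG-ready rows.
from sdg_hub.utils.taxonomy import leaf_node_to_samples, read_taxonomy_leaf_nodes

leaf_nodes = read_taxonomy_leaf_nodes(
    taxonomy="taxonomy",          # a taxonomy YAML file or a git checkout of the tree
    taxonomy_base="origin/main",  # diff base used to find new/changed YAMLs
    yaml_rules=None,              # optional yamllint config path
)
for path, leaf_node in leaf_nodes.items():
    samples = leaf_node_to_samples(
        leaf_node, server_ctx_size=4096, chunk_word_count=1000
    )
    print(path, len(samples))
```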
sdg_hub-0.1.0a2.dev0.dist-info/METADATA
@@ -0,0 +1,154 @@
Metadata-Version: 2.4
Name: sdg_hub
Version: 0.1.0a2.dev0
Summary: Synthetic Data Generation
Author-email: Red Hat AI Innovation <abhandwa@redhat.com>
License: Apache-2.0
Project-URL: homepage, https://ai-innovation.team/
Project-URL: source, https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub
Project-URL: issues, https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/issues
Classifier: Development Status :: 3 - Alpha
Classifier: Environment :: Console
Classifier: License :: OSI Approved :: Apache Software License
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: MacOS :: MacOS X
Classifier: Operating System :: POSIX :: Linux
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: Implementation :: CPython
Requires-Python: >=3.9
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: click<9.0.0,>=8.1.7
Requires-Dist: datasets<4.0.0,>=2.18.0
Requires-Dist: httpx<1.0.0,>=0.25.0
Requires-Dist: jinja2
Requires-Dist: langchain-text-splitters
Requires-Dist: openai<2.0.0,>=1.13.3
Requires-Dist: rich
Requires-Dist: tenacity!=8.4.0,>=8.3.0
Requires-Dist: tqdm<5.0.0,>=4.66.2
Dynamic: license-file

# Synthetic Data Generation for LLMs

The SDG Framework is a modular, scalable, and efficient solution for creating synthetic data generation workflows in a "no-code" manner. At its core, the framework is designed to simplify data creation for LLMs: users chain computational units into powerful pipelines for generating and processing data.

## Core Design Principles

The framework is built around the following principles:

1. **Modular Design**: Highly composable blocks form the building units of the framework, allowing users to build workflows effortlessly.
2. **No-Code Workflow Creation**: Specify workflows using simple YAML configuration files.
3. **Scalability and Performance**: Optimized for handling large-scale workflows with millions of records.

---

## Framework Architecture

![overview](assets/imgs/overview.png)

### Blocks: The Fundamental Unit

At the heart of the framework is the **Block**. Each block is a self-contained computational unit that performs a specific task, such as:

- Making LLM calls
- Performing data transformations
- Applying filters

Blocks are designed to be:
- **Modular**: Reusable across multiple pipelines.
- **Composable**: Easily chained together to create workflows.

These blocks are implemented in the [src/sdg_hub/blocks](src/sdg_hub/blocks) directory.
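For intuition, here is a minimal sketch of what a custom block might look like. The class name and the `generate(dataset) -> dataset` contract are assumptions for illustration only; the real base class lives in `sdg_hub/blocks/block.py` and is not shown in this diff.

```python
# Illustrative sketch of a block, assuming a generate(dataset) -> dataset
# contract. Consult sdg_hub/blocks/block.py for the actual interface.
from datasets import Dataset


class DropEmptyRowsBlock:  # hypothetical example block
    def __init__(self, block_name: str, column: str) -> None:
        self.block_name = block_name
        self.column = column

    def generate(self, samples: Dataset) -> Dataset:
        # Keep only rows whose target column is present and non-empty.
        return samples.filter(lambda row: bool(row.get(self.column)))
```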

### Pipelines: Higher-Level Abstraction

Blocks can be chained together to form a **Pipeline**. Pipelines enable:
- Linear or recursive chaining of blocks.
- Execution of complex workflows by chaining multiple pipelines together.
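
Conceptually, a pipeline is just the sequential application of blocks to a dataset. A hedged sketch of that chaining (the real orchestration lives in `sdg_hub/pipeline.py`, which may differ):

```python
# Illustrative only: sequential chaining of blocks over a Hugging Face
# Dataset, where each block consumes the output of the previous one.
from datasets import Dataset


def run_pipeline(blocks, dataset: Dataset) -> Dataset:
    for block in blocks:
        dataset = block.generate(dataset)
    return dataset
```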

### SDG Workflow: Full Workflow Automation

Pipelines are further orchestrated into **SDG Workflows**, enabling seamless end-to-end processing. Invoking `sdg_hub.generate` triggers one or more pipelines that process data through all the configured blocks.
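
A hypothetical invocation might look like the following; the exact signature of `sdg_hub.generate` is not part of this diff, so treat the argument names below as placeholders (only the flow file path is a real artifact shipped in the wheel):

```python
# Hypothetical invocation; argument names are placeholders, not the
# documented signature of sdg_hub.generate.
import sdg_hub

synthetic_data = sdg_hub.generate(
    flow_path="flows/generation/skills/synth_skills.yaml",  # a Flow shipped with the package
    seed_data="seed_data.jsonl",                            # placeholder input path
)
```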

---

### YAML-Based Workflow: The Flow

The YAML configuration file, known as the **Flow**, is central to defining data generation workflows in the SDG Framework. A Flow describes how blocks and pipelines are orchestrated to process and generate data efficiently. By leveraging YAML, users can create highly customizable and modular workflows without writing any code.

#### Key Features of a Flow

1. **Modular Design**:
   - Flows are composed of blocks, which can be chained together into pipelines.
   - Each block performs a specific task, such as generating, filtering, or transforming data.

2. **Reusability**:
   - Blocks and configurations defined in a Flow can be reused across different workflows.
   - YAML makes it easy to tweak or extend workflows without significant changes.

3. **Ease of Configuration**:
   - Users can specify block types, configurations, and data processing details in a simple and intuitive manner.

---

### Sample Flow

Here is an example of a Flow configuration:

```yaml
- block_type: LLMBlock
  block_config:
    block_name: gen_questions
    config_path: configs/skills/freeform_questions.yaml
    model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
    output_cols:
      - question
    batch_kwargs:
      num_samples: 30
  drop_duplicates:
    - question
- block_type: FilterByValueBlock
  block_config:
    block_name: filter_questions
    filter_column: score
    filter_value: 1.0
    operation: operator.eq
    convert_dtype: float
    batch_kwargs:
      num_procs: 8
  drop_columns:
    - evaluation
    - score
    - num_samples
- block_type: LLMBlock
  block_config:
    block_name: gen_responses
    config_path: configs/skills/freeform_responses.yaml
    model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
    output_cols:
      - response
```
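
Because a Flow is plain YAML, it can be inspected without any framework machinery. A minimal sketch using PyYAML, based on the keys in the sample above (sdg_hub's own loader in `sdg_hub/flow.py` may do more, e.g. resolving `config_path` entries):

```python
# A Flow file is an ordinary YAML list of block definitions, so PyYAML
# alone is enough to inspect its structure.
import yaml

with open("flow.yaml", "r", encoding="utf-8") as f:  # placeholder path
    flow = yaml.safe_load(f)

for block in flow:
    print(block["block_type"], block["block_config"]["block_name"])
```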

### Dataflow and Storage

- **Data Representation**: Dataflow between blocks and pipelines is handled using **Hugging Face Datasets**, which are based on Arrow tables. This provides:
  - Native parallelization capabilities (e.g., maps, filters).
  - Support for efficient data transformations.

- **Data Checkpoints**: Intermediate caches of generated data. Checkpoints allow users to:
  - Resume workflows from the last successful state if interrupted.
  - Improve reliability for long-running workflows.
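
Since data flows between blocks as Hugging Face Datasets, the standard `datasets` primitives apply at any stage. A small self-contained example:

```python
# Data moves between blocks as Arrow-backed Hugging Face Datasets, so the
# usual map/filter primitives are available for transformations.
from datasets import Dataset

ds = Dataset.from_list(
    [
        {"question": "What is SDG?", "score": 1.0},
        {"question": "  padded  ", "score": 0.2},
    ]
)
ds = ds.filter(lambda row: row["score"] >= 1.0)            # drop low-scoring rows
ds = ds.map(lambda row: {"question": row["question"].strip()})  # normalize text
print(len(ds), ds.column_names)
```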

---

## Examples

For sample use cases and implementation examples, see the [examples](examples) directory, which demonstrates a range of workflows built with the SDG Framework.