sdg-hub 0.1.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94)
  1. sdg_hub/__init__.py +4 -0
  2. sdg_hub/_version.py +21 -0
  3. sdg_hub/blocks/__init__.py +6 -0
  4. sdg_hub/blocks/block.py +54 -0
  5. sdg_hub/blocks/filterblock.py +76 -0
  6. sdg_hub/blocks/iterblock.py +31 -0
  7. sdg_hub/blocks/llmblock.py +430 -0
  8. sdg_hub/blocks/rmblocks.py +194 -0
  9. sdg_hub/blocks/utilblocks.py +140 -0
  10. sdg_hub/configs/__init__.py +0 -0
  11. sdg_hub/configs/annotations/__init__.py +0 -0
  12. sdg_hub/configs/annotations/cot_reflection.yaml +34 -0
  13. sdg_hub/configs/annotations/detailed_description.yaml +10 -0
  14. sdg_hub/configs/annotations/detailed_description_icl.yaml +32 -0
  15. sdg_hub/configs/annotations/simple.yaml +10 -0
  16. sdg_hub/configs/knowledge/__init__.py +0 -0
  17. sdg_hub/configs/knowledge/atomic_facts.yaml +45 -0
  18. sdg_hub/configs/knowledge/auxilary_instructions.yaml +35 -0
  19. sdg_hub/configs/knowledge/data_recipe/__init__.py +0 -0
  20. sdg_hub/configs/knowledge/data_recipe/default_recipe.yaml +3 -0
  21. sdg_hub/configs/knowledge/detailed_summary.yaml +17 -0
  22. sdg_hub/configs/knowledge/evaluate_faithfulness.yaml +68 -0
  23. sdg_hub/configs/knowledge/evaluate_question.yaml +38 -0
  24. sdg_hub/configs/knowledge/evaluate_relevancy.yaml +85 -0
  25. sdg_hub/configs/knowledge/extractive_summary.yaml +17 -0
  26. sdg_hub/configs/knowledge/generate_code_questions_responses.yaml +39 -0
  27. sdg_hub/configs/knowledge/generate_questions_responses.yaml +56 -0
  28. sdg_hub/configs/knowledge/mcq_generation.yaml +83 -0
  29. sdg_hub/configs/knowledge/router.yaml +12 -0
  30. sdg_hub/configs/knowledge/simple_generate_qa.yaml +34 -0
  31. sdg_hub/configs/reasoning/dynamic_cot.yaml +40 -0
  32. sdg_hub/configs/skills/_A_.yaml +97 -0
  33. sdg_hub/configs/skills/_B_.yaml +36 -0
  34. sdg_hub/configs/skills/_C_.yaml +71 -0
  35. sdg_hub/configs/skills/_D_.yaml +85 -0
  36. sdg_hub/configs/skills/_E_.yaml +30 -0
  37. sdg_hub/configs/skills/_F_.yaml +45 -0
  38. sdg_hub/configs/skills/_G_.yaml +56 -0
  39. sdg_hub/configs/skills/_H_.yaml +80 -0
  40. sdg_hub/configs/skills/__init__.py +0 -0
  41. sdg_hub/configs/skills/analyzer.yaml +48 -0
  42. sdg_hub/configs/skills/annotation.yaml +36 -0
  43. sdg_hub/configs/skills/contexts.yaml +21 -0
  44. sdg_hub/configs/skills/critic.yaml +60 -0
  45. sdg_hub/configs/skills/data_recipe/__init__.py +0 -0
  46. sdg_hub/configs/skills/data_recipe/default_recipe.yaml +6 -0
  47. sdg_hub/configs/skills/evaluate_freeform_pair.yaml +44 -0
  48. sdg_hub/configs/skills/evaluate_freeform_questions.yaml +46 -0
  49. sdg_hub/configs/skills/evaluate_grounded_pair.yaml +54 -0
  50. sdg_hub/configs/skills/evaluate_grounded_questions.yaml +51 -0
  51. sdg_hub/configs/skills/freeform_questions.yaml +29 -0
  52. sdg_hub/configs/skills/freeform_responses.yaml +45 -0
  53. sdg_hub/configs/skills/grounded_questions.yaml +38 -0
  54. sdg_hub/configs/skills/grounded_responses.yaml +59 -0
  55. sdg_hub/configs/skills/judge.yaml +53 -0
  56. sdg_hub/configs/skills/planner.yaml +67 -0
  57. sdg_hub/configs/skills/respond.yaml +8 -0
  58. sdg_hub/configs/skills/revised_responder.yaml +78 -0
  59. sdg_hub/configs/skills/router.yaml +12 -0
  60. sdg_hub/configs/skills/simple_generate_qa_freeform.yaml +27 -0
  61. sdg_hub/configs/skills/simple_generate_qa_grounded.yaml +31 -0
  62. sdg_hub/flow.py +127 -0
  63. sdg_hub/flows/annotation/emotion/detailed_description.yaml +19 -0
  64. sdg_hub/flows/annotation/emotion/detailed_description_icl.yaml +19 -0
  65. sdg_hub/flows/annotation/emotion/simple.yaml +19 -0
  66. sdg_hub/flows/generation/knowledge/mmlu_bench.yaml +13 -0
  67. sdg_hub/flows/generation/knowledge/simple_knowledge.yaml +12 -0
  68. sdg_hub/flows/generation/knowledge/synth_knowledge.yaml +89 -0
  69. sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +136 -0
  70. sdg_hub/flows/generation/skills/agentic_improve_skill.yaml +108 -0
  71. sdg_hub/flows/generation/skills/simple_freeform_skill.yaml +12 -0
  72. sdg_hub/flows/generation/skills/simple_grounded_skill.yaml +12 -0
  73. sdg_hub/flows/generation/skills/synth_grounded_skills.yaml +80 -0
  74. sdg_hub/flows/generation/skills/synth_skills.yaml +59 -0
  75. sdg_hub/logger_config.py +20 -0
  76. sdg_hub/pipeline.py +66 -0
  77. sdg_hub/prompts.py +17 -0
  78. sdg_hub/py.typed +0 -0
  79. sdg_hub/registry.py +122 -0
  80. sdg_hub/sdg.py +164 -0
  81. sdg_hub/utils/__init__.py +5 -0
  82. sdg_hub/utils/chunking.py +73 -0
  83. sdg_hub/utils/datamixing.py +123 -0
  84. sdg_hub/utils/datautils.py +14 -0
  85. sdg_hub/utils/docprocessor.py +357 -0
  86. sdg_hub/utils/json.py +48 -0
  87. sdg_hub/utils/models.py +31 -0
  88. sdg_hub/utils/parse_and_convert.py +392 -0
  89. sdg_hub/utils/taxonomy.py +489 -0
  90. sdg_hub-0.1.0a1.dist-info/METADATA +154 -0
  91. sdg_hub-0.1.0a1.dist-info/RECORD +94 -0
  92. sdg_hub-0.1.0a1.dist-info/WHEEL +5 -0
  93. sdg_hub-0.1.0a1.dist-info/licenses/LICENSE +201 -0
  94. sdg_hub-0.1.0a1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,489 @@
+ # SPDX-License-Identifier: Apache-2.0
+
+ # Standard
+ from functools import cache
+ from pathlib import Path
+ from typing import Any, Dict, List, Mapping, Optional, Union
+ import glob
+ import json
+ import logging
+ import os
+ import re
+ import subprocess
+ import tempfile
+
+ # Third Party
+ import git
+ import gitdb
+ import yaml
+
+ # First Party
+ from sdg_hub import utils
+ from sdg_hub.utils import chunking
+
+ logger = logging.getLogger(__name__)
+
+ DEFAULT_YAML_RULES = """\
+ extends: relaxed
+
+ rules:
+   line-length:
+     max: 120
+ """
+
+
+ class TaxonomyReadingException(Exception):
+     """An exception raised during reading of the taxonomy."""
+
+
+ TAXONOMY_FOLDERS: List[str] = ["compositional_skills", "knowledge"]
+ """Taxonomy folders which are also the schema names"""
+
+
+ def _istaxonomyfile(fn):
+     """Return True if fn is a YAML file under one of the TAXONOMY_FOLDERS."""
+     path = Path(fn)
+     if path.suffix == ".yaml" and path.parts[0] in TAXONOMY_FOLDERS:
+         return True
+     return False
+
+
+ def _get_taxonomy_diff(repo="taxonomy", base="origin/main"):
+     """Return the list of new or modified taxonomy files relative to the base ref."""
+     repo = git.Repo(repo)
+     untracked_files = [u for u in repo.untracked_files if _istaxonomyfile(u)]
+
+     branches = [b.name for b in repo.branches]
+
+     head_commit = None
+     if "/" in base:
+         re_git_branch = re.compile(f"remotes/{base}$", re.MULTILINE)
+     elif base in branches:
+         re_git_branch = re.compile(f"{base}$", re.MULTILINE)
+     else:
+         try:
+             head_commit = repo.commit(base)
+         except gitdb.exc.BadName as e:
+             raise SystemExit(
+                 yaml.YAMLError(
+                     f'Couldn\'t find the taxonomy git ref "{base}" from the current HEAD'
+                 )
+             ) from e
+
+     # Move backwards from HEAD until we find the first commit that is part of base,
+     # then we can take our diff from there.
+     current_commit = repo.commit("HEAD")
+     while not head_commit:
+         branches = repo.git.branch("-a", "--contains", current_commit.hexsha)
+         if re_git_branch.findall(branches):
+             head_commit = current_commit
+             break
+         try:
+             current_commit = current_commit.parents[0]
+         except IndexError as e:
+             raise SystemExit(
+                 yaml.YAMLError(
+                     f'Couldn\'t find the taxonomy base branch "{base}" from the current HEAD'
+                 )
+             ) from e
+
+     modified_files = [
+         d.b_path
+         for d in head_commit.diff(None)
+         if not d.deleted_file and _istaxonomyfile(d.b_path)
+     ]
+
+     updated_taxonomy_files = list(set(untracked_files + modified_files))
+     return updated_taxonomy_files
+
+ def _get_documents(
+     source: Dict[str, Union[str, List[str]]],
+     skip_checkout: bool = False,
+ ) -> List[str]:
+     """
+     Retrieve the content of files from a Git repository.
+
+     Args:
+         source (dict): Source info containing repository URL, commit hash, and list of file patterns.
+         skip_checkout (bool): If True, read from the default branch instead of checking out the commit.
+
+     Returns:
+         List[str]: List of document contents.
+     """
+     repo_url = source.get("repo")
+     commit_hash = source.get("commit")
+     file_patterns = source.get("patterns", [])
+     with tempfile.TemporaryDirectory() as temp_dir:
+         try:
+             repo = git.Repo.clone_from(repo_url, temp_dir)
+             if not skip_checkout:
+                 repo.git.checkout(commit_hash)
+
+             file_contents = []
+
+             logger.debug("Processing files...")
+             for pattern in file_patterns:
+                 for file_path in glob.glob(os.path.join(repo.working_dir, pattern)):
+                     if os.path.isfile(file_path) and file_path.endswith(".md"):
+                         with open(file_path, "r", encoding="utf-8") as file:
+                             file_contents.append(file.read())
+
+             if file_contents:
+                 return file_contents
+             raise SystemExit("Couldn't find knowledge documents")
+         except (OSError, git.exc.GitCommandError, FileNotFoundError) as e:
+             raise e
+
+
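For reference, the `source` mapping consumed by `_get_documents` mirrors the `document` section of a knowledge `qna.yaml`: the keys `repo`, `commit`, and `patterns` come straight from the code above, while the values in this sketch are placeholders.

```python
# Illustrative only: the repo URL, commit, and pattern are placeholders.
source = {
    "repo": "https://github.com/example-org/knowledge-docs.git",  # hypothetical repo
    "commit": "abc1234",        # checked out unless skip_checkout=True
    "patterns": ["docs/*.md"],  # globbed against the clone; only .md files are read
}

documents = _get_documents(source=source)  # -> list of markdown file contents
```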
+ @cache
+ def _load_schema(path: "importlib.resources.abc.Traversable") -> "referencing.Resource":
+     """Load the schema from the path into a Resource object.
+
+     Args:
+         path (Traversable): Path to the schema to be loaded.
+
+     Raises:
+         NoSuchResource: If the resource cannot be loaded.
+
+     Returns:
+         Resource: A Resource containing the requested schema.
+     """
+     # pylint: disable=C0415
+     # Third Party
+     from referencing import Resource
+     from referencing.exceptions import NoSuchResource
+     from referencing.jsonschema import DRAFT202012
+
+     try:
+         contents = json.loads(path.read_text(encoding="utf-8"))
+         resource = Resource.from_contents(
+             contents=contents, default_specification=DRAFT202012
+         )
+     except Exception as e:
+         raise NoSuchResource(ref=str(path)) from e
+     return resource
+
+
+ def _validate_yaml(contents: Mapping[str, Any], taxonomy_path: Path) -> int:
+     """Validate the parsed yaml document using the taxonomy path to
+     determine the proper schema.
+
+     Args:
+         contents (Mapping): The parsed yaml document to validate against the schema.
+         taxonomy_path (Path): Relative path of the taxonomy yaml document where the
+             first element is the schema to use.
+
+     Returns:
+         int: The number of errors found during validation.
+             Messages for each error have been logged.
+     """
+     # pylint: disable=C0415
+     # Standard
+     from importlib import resources
+
+     # Third Party
+     from jsonschema.protocols import Validator
+     from jsonschema.validators import validator_for
+     from referencing import Registry, Resource
+     from referencing.exceptions import NoSuchResource
+     from referencing.typing import URI
+
+     errors = 0
+     version = _get_version(contents)
+     schemas_path = resources.files("instructlab.schema").joinpath(f"v{version}")
+
+     def retrieve(uri: URI) -> Resource:
+         path = schemas_path.joinpath(uri)
+         return _load_schema(path)
+
+     schema_name = taxonomy_path.parts[0]
+     if schema_name not in TAXONOMY_FOLDERS:
+         schema_name = "knowledge" if "document" in contents else "compositional_skills"
+         logger.info(
+             f"Cannot determine schema name from path {taxonomy_path}. Using {schema_name} schema."
+         )
+
+     try:
+         schema_resource = retrieve(f"{schema_name}.json")
+         schema = schema_resource.contents
+         validator_cls = validator_for(schema)
+         validator: Validator = validator_cls(
+             schema, registry=Registry(retrieve=retrieve)
+         )
+
+         for validation_error in validator.iter_errors(contents):
+             errors += 1
+             yaml_path = validation_error.json_path[1:]
+             if not yaml_path:
+                 yaml_path = "."
+             if validation_error.validator == "minItems":
+                 # Special handling for minItems which can have a long message for seed_examples
+                 message = (
+                     f"Value must have at least {validation_error.validator_value} items"
+                 )
+             else:
+                 message = validation_error.message[-200:]
+             logger.error(
+                 f"Validation error in {taxonomy_path}: [{yaml_path}] {message}"
+             )
+     except NoSuchResource as e:
+         cause = e.__cause__ if e.__cause__ is not None else e
+         errors += 1
+         logger.error(f"Cannot load schema file {e.ref}. {cause}")
+
+     return errors
+
+
+ def _get_version(contents: Mapping) -> int:
+     """Return the schema version of the parsed yaml document, defaulting to 1."""
+     version = contents.get("version", 1)
+     if not isinstance(version, int):
+         # schema validation will complain about the type
+         try:
+             version = int(version)
+         except ValueError:
+             version = 1  # fallback to version 1
+     return version
+
+
+ # pylint: disable=broad-exception-caught
+ def _read_taxonomy_file(file_path: str, yaml_rules: Optional[str] = None):
+     """Parse a single taxonomy qna.yaml file into seed instruction data.
+
+     Returns a (seed_instruction_data, warnings, errors) tuple; the data is
+     None when the file was skipped or invalid.
+     """
+     seed_instruction_data = []
+     warnings = 0
+     errors = 0
+     file_path = Path(file_path).resolve()
+     # file should end with ".yaml" explicitly
+     if file_path.suffix != ".yaml":
+         logger.warning(
+             f"Skipping {file_path}! Use lowercase '.yaml' extension instead."
+         )
+         warnings += 1
+         return None, warnings, errors
+     for i in range(len(file_path.parts) - 1, -1, -1):
+         if file_path.parts[i] in TAXONOMY_FOLDERS:
+             taxonomy_path = Path(*file_path.parts[i:])
+             break
+     else:
+         taxonomy_path = file_path
+     # read file if extension is correct
+     try:
+         with open(file_path, "r", encoding="utf-8") as file:
+             contents = yaml.safe_load(file)
+             if not contents:
+                 logger.warning(f"Skipping {file_path} because it is empty!")
+                 warnings += 1
+                 return None, warnings, errors
+             if not isinstance(contents, Mapping):
+                 logger.error(
+                     f"{file_path} is not valid. The top-level element is not an object with key-value pairs."
+                 )
+                 errors += 1
+                 return None, warnings, errors
+
+             # do general YAML linting if specified
+             version = _get_version(contents)
+             if version > 1:  # no linting for version 1 yaml
+                 if yaml_rules is not None:
+                     is_file = os.path.isfile(yaml_rules)
+                     if is_file:
+                         logger.debug(f"Using YAML rules from {yaml_rules}")
+                         yamllint_cmd = [
+                             "yamllint",
+                             "-f",
+                             "parsable",
+                             "-c",
+                             yaml_rules,
+                             file_path,
+                             "-s",
+                         ]
+                     else:
+                         logger.debug(f"Cannot find {yaml_rules}. Using default rules.")
+                         yamllint_cmd = [
+                             "yamllint",
+                             "-f",
+                             "parsable",
+                             "-d",
+                             DEFAULT_YAML_RULES,
+                             file_path,
+                             "-s",
+                         ]
+                 else:
+                     yamllint_cmd = [
+                         "yamllint",
+                         "-f",
+                         "parsable",
+                         "-d",
+                         DEFAULT_YAML_RULES,
+                         file_path,
+                         "-s",
+                     ]
+                 try:
+                     subprocess.check_output(yamllint_cmd, text=True)
+                 except subprocess.SubprocessError as e:
+                     lint_messages = [f"Problems found in file {file_path}"]
+                     parsed_output = e.output.splitlines()
+                     for p in parsed_output:
+                         errors += 1
+                         delim = str(file_path) + ":"
+                         parsed_p = p.split(delim)[1]
+                         lint_messages.append(parsed_p)
+                     logger.error("\n".join(lint_messages))
+                     return None, warnings, errors
+
+             # validation_errors = _validate_yaml(contents, taxonomy_path)
+             # if validation_errors:
+             #     errors += validation_errors
+             #     return None, warnings, errors
+
+             # get seed instruction data
+             tax_path = "->".join(taxonomy_path.parent.parts)
+             task_description = contents.get("task_description", None)
+             domain = contents.get("domain")
+             documents = contents.get("document")
+             if documents:
+                 documents = _get_documents(source=documents)
+                 logger.debug("Content from git repo fetched")
+
+             for seed_example in contents.get("seed_examples"):
+                 context = seed_example.get("context", "")
+                 if "questions_and_answers" in seed_example:
+                     question_answer_list = seed_example.get("questions_and_answers")
+                     seed_instruction_data.append(
+                         {
+                             "questions_and_answers": question_answer_list,
+                             "input": context,
+                             "taxonomy_path": tax_path,
+                             "document": documents,
+                             "domain": domain,
+                             "document_outline": contents.get("document_outline"),
+                         }
+                     )
+                 else:
+                     question = seed_example.get("question")
+                     answer = seed_example.get("answer")
+
+                     seed_instruction_data.append(
+                         {
+                             "instruction": question,
+                             "input": context,
+                             "output": answer,
+                             "taxonomy_path": tax_path,
+                             "task_description": task_description,
+                             "document": documents,
+                             "domain": domain,
+                         }
+                     )
+     except Exception as e:
+         errors += 1
+         raise TaxonomyReadingException(f"Exception {e} raised in {file_path}") from e
+
+     return seed_instruction_data, warnings, errors
+
+
+ def read_taxonomy(taxonomy, taxonomy_base, yaml_rules):
+     """Read seed instruction data from a taxonomy file or directory."""
+     seed_instruction_data = []
+     is_file = os.path.isfile(taxonomy)
+     if is_file:  # taxonomy is file
+         seed_instruction_data, warnings, errors = _read_taxonomy_file(
+             taxonomy, yaml_rules
+         )
+         if warnings:
+             logger.warning(
+                 f"{warnings} warnings (see above) due to taxonomy file not (fully) usable."
+             )
+         if errors:
+             raise SystemExit(yaml.YAMLError("Taxonomy file with errors! Exiting."))
+     else:  # taxonomy is dir
+         # Gather the new or changed YAMLs using git diff
+         updated_taxonomy_files = _get_taxonomy_diff(taxonomy, taxonomy_base)
+         total_errors = 0
+         total_warnings = 0
+         if updated_taxonomy_files:
+             logger.debug("Found new taxonomy files:")
+             for e in updated_taxonomy_files:
+                 logger.debug(f"* {e}")
+         for f in updated_taxonomy_files:
+             file_path = os.path.join(taxonomy, f)
+             data, warnings, errors = _read_taxonomy_file(file_path, yaml_rules)
+             total_warnings += warnings
+             total_errors += errors
+             if data:
+                 seed_instruction_data.extend(data)
+         if total_warnings:
+             logger.warning(
+                 f"{total_warnings} warnings (see above) due to taxonomy files that were not (fully) usable."
+             )
+         if total_errors:
+             raise SystemExit(
+                 yaml.YAMLError(f"{total_errors} taxonomy files with errors! Exiting.")
+             )
+     return seed_instruction_data
+
+
+ def read_taxonomy_leaf_nodes(taxonomy, taxonomy_base, yaml_rules):
+     """Read the taxonomy and group seed examples by their taxonomy path (leaf node)."""
+     seed_instruction_data = read_taxonomy(taxonomy, taxonomy_base, yaml_rules)
+
+     # Transform into a more convenient format to feed into our updated SDG library
+     leaf_nodes = {}
+     for seed in seed_instruction_data:
+         node = leaf_nodes.setdefault(seed["taxonomy_path"], [])
+         node.append(seed)
+         leaf_nodes[seed["taxonomy_path"]] = node
+
+     return leaf_nodes
+
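A minimal usage sketch for the two readers above, with their signatures taken from the code; the repository path, base ref, and printed output are illustrative:

```python
# Illustrative usage; "taxonomy" is a local clone of a taxonomy git repo.
from sdg_hub.utils.taxonomy import read_taxonomy_leaf_nodes

leaf_nodes = read_taxonomy_leaf_nodes(
    taxonomy="taxonomy",          # directory: diffed against taxonomy_base
    taxonomy_base="origin/main",  # only new/changed qna.yaml files are read
    yaml_rules=None,              # lint version>1 files with DEFAULT_YAML_RULES
)
for path, seeds in leaf_nodes.items():
    print(path, len(seeds))       # e.g. "knowledge->science->astronomy 5"
```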
+
+
+ def _knowledge_leaf_node_to_samples(leaf_node, server_ctx_size, chunk_word_count):
+     samples = []
+     # document is the same for the whole leaf node
+     chunks = (
+         chunking.chunk_document(
+             documents=leaf_node[0]["document"],
+             server_ctx_size=server_ctx_size,
+             chunk_word_count=chunk_word_count,
+         )
+         if leaf_node[0].get("document")
+         else []
+     )
+
+     # domain is the same for the whole leaf node
+     domain = leaf_node[0].get("domain")
+
+     for chunk in chunks:
+         # pylint: disable=consider-using-enumerate
+         for icl_ in leaf_node:
+             icl_query = {
+                 f"icl_query_{idx+1}": val["question"]
+                 for idx, val in enumerate(icl_["questions_and_answers"])
+             }
+             icl_resp = {
+                 f"icl_response_{idx+1}": val["answer"]
+                 for idx, val in enumerate(icl_["questions_and_answers"])
+             }
+             samples_row = {
+                 "icl_document": icl_["input"],
+                 "document": chunk,
+                 "document_outline": icl_["document_outline"],
+                 "domain": domain,
+             }
+             samples_row.update(icl_query)
+             samples_row.update(icl_resp)
+             samples.append(samples_row)
+
+     return samples
+
+
+ def _skill_leaf_node_to_samples(leaf_node):
+     samples = []
+
+     # pylint: disable=consider-using-enumerate
+     for i in range(len(leaf_node)):
+         samples.append({})
+         samples[-1]["task_description"] = leaf_node[i]["task_description"]
+         if leaf_node[i].get("input"):
+             samples[-1]["seed_context"] = leaf_node[i]["input"]
+         samples[-1]["seed_question"] = leaf_node[i]["instruction"]
+         samples[-1]["seed_response"] = leaf_node[i]["output"]
+
+     return samples
+
+
+ def leaf_node_to_samples(leaf_node, server_ctx_size, chunk_word_count):
+     if not leaf_node:
+         return []
+     if leaf_node[0].get("document"):
+         return _knowledge_leaf_node_to_samples(
+             leaf_node, server_ctx_size, chunk_word_count
+         )
+     return _skill_leaf_node_to_samples(leaf_node)
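To show the shapes involved, a small sketch of `leaf_node_to_samples` on a skill leaf node; the dict layout follows `_read_taxonomy_file` above, but every field value is invented for illustration:

```python
# Invented example data, shaped like the skill dicts built in _read_taxonomy_file.
leaf_node = [
    {
        "task_description": "Write short poems",
        "instruction": "Write a haiku about autumn.",
        "input": "",       # no seed context for this freeform skill
        "output": "Crisp leaves drift down...",
        "taxonomy_path": "compositional_skills->writing->poetry",
        "document": None,  # no document => skill path, not knowledge path
        "domain": None,
    }
]

samples = leaf_node_to_samples(leaf_node, server_ctx_size=4096, chunk_word_count=1000)
print(samples[0]["seed_question"])  # "Write a haiku about autumn."
```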
@@ -0,0 +1,154 @@
+ Metadata-Version: 2.4
+ Name: sdg_hub
+ Version: 0.1.0a1
+ Summary: Synthetic Data Generation
+ Author-email: Red Hat AI Innovation <abhandwa@redhat.com>
+ License: Apache-2.0
+ Project-URL: homepage, https://ai-innovation.team/
+ Project-URL: source, https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub
+ Project-URL: issues, https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/issues
+ Classifier: Development Status :: 3 - Alpha
+ Classifier: Environment :: Console
+ Classifier: License :: OSI Approved :: Apache Software License
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Operating System :: MacOS :: MacOS X
+ Classifier: Operating System :: POSIX :: Linux
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: Implementation :: CPython
+ Requires-Python: >=3.9
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: click<9.0.0,>=8.1.7
+ Requires-Dist: datasets<4.0.0,>=2.18.0
+ Requires-Dist: httpx<1.0.0,>=0.25.0
+ Requires-Dist: jinja2
+ Requires-Dist: langchain-text-splitters
+ Requires-Dist: openai<2.0.0,>=1.13.3
+ Requires-Dist: rich
+ Requires-Dist: tenacity!=8.4.0,>=8.3.0
+ Requires-Dist: tqdm<5.0.0,>=4.66.2
+ Dynamic: license-file
+
+ # Synthetic Data Generation for LLMs
+
+ The SDG Framework is a modular, scalable, and efficient solution for creating synthetic data generation workflows in a "no-code" manner. At its core, this framework is designed to simplify data creation for LLMs, allowing users to chain computational units into powerful pipelines for generating and processing data.
+
+ ## Core Design Principles
+
+ The framework is built around the following principles:
+
+ 1. **Modular Design**: Highly composable blocks form the building units of the framework, allowing users to build workflows effortlessly.
+ 2. **No-Code Workflow Creation**: Specify workflows using simple YAML configuration files.
+ 3. **Scalability and Performance**: Optimized for handling large-scale workflows with millions of records.
+
+ ---
+
+ ## Framework Architecture
+
+ ![overview](assets/imgs/overview.png)
+
+ ### Blocks: The Fundamental Unit
+
+ At the heart of the framework is the **Block**. Each block is a self-contained computational unit that performs a specific task, such as:
+
+ - Making LLM calls
+ - Performing data transformations
+ - Applying filters
+
+ Blocks are designed to be:
+ - **Modular**: Reusable across multiple pipelines.
+ - **Composable**: Easily chained together to create workflows.
+
+ These blocks are implemented in the [src/sdg_hub/blocks](src/sdg_hub/blocks) directory.
+
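As a concrete illustration, a custom block might look like the sketch below. It assumes the base class in `sdg_hub/blocks/block.py` takes a block name and exposes a `generate` hook over a Hugging Face `Dataset`; treat that contract as an assumption and check the source before relying on it.

```python
# Sketch only: the Block constructor/generate contract is assumed here,
# not taken verbatim from sdg_hub; see src/sdg_hub/blocks/block.py.
from datasets import Dataset
from sdg_hub.blocks import Block


class WordCountBlock(Block):
    """Toy block: annotates each sample with the word count of one column."""

    def __init__(self, block_name: str, column: str) -> None:
        super().__init__(block_name)
        self.column = column

    def generate(self, samples: Dataset) -> Dataset:
        # Dataset.map gives an Arrow-backed, parallelizable transform.
        return samples.map(lambda s: {"word_count": len(s[self.column].split())})
```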
+ ### Pipelines: Higher-Level Abstraction
+
+ Blocks can be chained together to form a **Pipeline**. Pipelines enable:
+ - Linear or recursive chaining of blocks.
+ - Execution of complex workflows by chaining multiple pipelines together.
+
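Conceptually, a pipeline is function composition over datasets: each block consumes the previous block's output. A hand-rolled sketch of that dataflow, reusing the hypothetical `WordCountBlock` above (the real `Pipeline` class in `sdg_hub/pipeline.py` layers configuration and orchestration on top):

```python
# Hand-rolled chaining to illustrate the dataflow; this is not the real
# Pipeline implementation, just the composition pattern it embodies.
from datasets import Dataset


def run_chain(blocks, dataset: Dataset) -> Dataset:
    for block in blocks:
        dataset = block.generate(dataset)  # each block returns a new Dataset
    return dataset


seeds = Dataset.from_list([{"text": "synthetic data generation"}])
result = run_chain([WordCountBlock("count_words", column="text")], seeds)
print(result[0]["word_count"])  # 3
```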
+ ### SDG Workflow: Full Workflow Automation
+
+ Pipelines are further orchestrated into **SDG Workflows**, enabling seamless end-to-end processing. Invoking `sdg_hub.generate` triggers one or more pipelines that process data through all the configured blocks.
+
+ ---
+
+ ### YAML-Based Workflow: The Flow
+
+ The YAML configuration file, known as the **Flow**, is central to defining data generation workflows in the SDG Framework. A Flow describes how blocks and pipelines are orchestrated to process and generate data efficiently. By leveraging YAML, users can create highly customizable and modular workflows without writing any code.
+
+ #### Key Features of a Flow
+
+ 1. **Modular Design**:
+    - Flows are composed of blocks, which can be chained together into pipelines.
+    - Each block performs a specific task, such as generating, filtering, or transforming data.
+
+ 2. **Reusability**:
+    - Blocks and configurations defined in a Flow can be reused across different workflows.
+    - YAML makes it easy to tweak or extend workflows without significant changes.
+
+ 3. **Ease of Configuration**:
+    - Users can specify block types, configurations, and data processing details in a simple and intuitive manner.
+
+ ---
+
+ ### Sample Flow
+
+ Here is an example of a Flow configuration:
+
+ ```yaml
+ - block_type: LLMBlock
+   block_config:
+     block_name: gen_questions
+     config_path: configs/skills/freeform_questions.yaml
+     model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
+     output_cols:
+       - question
+     batch_kwargs:
+       num_samples: 30
+   drop_duplicates:
+     - question
+ - block_type: FilterByValueBlock
+   block_config:
+     block_name: filter_questions
+     filter_column: score
+     filter_value: 1.0
+     operation: operator.eq
+     convert_dtype: float
+     batch_kwargs:
+       num_procs: 8
+   drop_columns:
+     - evaluation
+     - score
+     - num_samples
+ - block_type: LLMBlock
+   block_config:
+     block_name: gen_responses
+     config_path: configs/skills/freeform_responses.yaml
+     model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
+     output_cols:
+       - response
+ ```
+
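Once a Flow like the one above is saved, wiring it up end to end plausibly looks like the sketch below, inferred from the modules shipped in this wheel (`flow.py`, `pipeline.py`, `sdg.py`). Every constructor signature here is an assumption to verify against those sources:

```python
# Assumed wiring of Flow -> Pipeline -> SDG; verify each signature against
# sdg_hub/flow.py, sdg_hub/pipeline.py, and sdg_hub/sdg.py in this wheel.
from datasets import Dataset
from openai import OpenAI

from sdg_hub.flow import Flow
from sdg_hub.pipeline import Pipeline
from sdg_hub.sdg import SDG

# Any OpenAI-compatible endpoint; URL and key are placeholders.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

flow = Flow(client).get_flow_from_file("flows/generation/skills/synth_skills.yaml")
sdg = SDG([Pipeline(flow)])

seeds = Dataset.from_list(
    [{"task_description": "...", "seed_question": "...", "seed_response": "..."}]
)
generated = sdg.generate(seeds)
```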
+ ### Dataflow and Storage
+
+ - **Data Representation**: Dataflow between blocks and pipelines is handled using **Hugging Face Datasets**, which are based on Arrow tables (see the sketch after this list). This provides:
+   - Native parallelization capabilities (e.g., maps, filters).
+   - Support for efficient data transformations.
+
+ - **Data Checkpoints**: Intermediate caches of generated data. Checkpoints allow users to:
+   - Resume workflows from the last successful state if interrupted.
+   - Improve reliability for long-running workflows.
+
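Because each block consumes and returns a `datasets.Dataset`, intermediate state can be cached and reloaded with the standard Datasets API. A minimal sketch (the checkpoint path is illustrative):

```python
# Arrow-backed dataflow plus a simple on-disk checkpoint; path is illustrative.
from datasets import Dataset, load_from_disk

ds = Dataset.from_list([{"question": "Q1"}, {"question": "Q2"}])
ds = ds.map(lambda s: {"length": len(s["question"])})  # parallelizable map
ds = ds.filter(lambda s: s["length"] > 0)              # efficient Arrow filter

ds.save_to_disk("checkpoints/after_filtering")           # intermediate cache
resumed = load_from_disk("checkpoints/after_filtering")  # resume if interrupted
```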
+ ---
+
+ ## Examples
+
+ For sample use cases and implementation examples, please refer to the [examples](examples) directory. This directory contains various examples demonstrating different workflows and use cases of the SDG Framework.