sdg-hub 0.1.0a4__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. sdg_hub/_version.py +2 -2
  2. sdg_hub/blocks/__init__.py +41 -5
  3. sdg_hub/blocks/block.py +58 -16
  4. sdg_hub/blocks/llmblock.py +121 -193
  5. sdg_hub/blocks/openaichatblock.py +556 -0
  6. sdg_hub/blocks/utilblocks.py +500 -43
  7. sdg_hub/checkpointer.py +139 -0
  8. sdg_hub/configs/annotations/detailed_annotations.yaml +28 -0
  9. sdg_hub/configs/annotations/simple_annotations.yaml +9 -0
  10. sdg_hub/configs/knowledge/atomic_facts.yaml +1 -0
  11. sdg_hub/configs/knowledge/detailed_summary.yaml +1 -0
  12. sdg_hub/configs/knowledge/extractive_summary.yaml +1 -0
  13. sdg_hub/configs/knowledge/generate_questions.yaml +82 -0
  14. sdg_hub/configs/knowledge/generate_responses.yaml +86 -0
  15. sdg_hub/configs/skills/contexts.yaml +18 -11
  16. sdg_hub/configs/skills/evaluate_freeform_pair.yaml +79 -12
  17. sdg_hub/configs/skills/evaluate_freeform_questions.yaml +60 -28
  18. sdg_hub/configs/skills/evaluate_grounded_pair.yaml +95 -30
  19. sdg_hub/configs/skills/freeform_questions.yaml +21 -16
  20. sdg_hub/configs/skills/freeform_responses.yaml +19 -25
  21. sdg_hub/configs/skills/router.yaml +53 -6
  22. sdg_hub/flow.py +366 -33
  23. sdg_hub/flow_runner.py +437 -0
  24. sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +21 -9
  25. sdg_hub/flows/generation/skills/{agentic_improve_skill.yaml → improve_responses.yaml} +26 -31
  26. sdg_hub/flows/generation/skills/synth_skills.yaml +4 -4
  27. sdg_hub/pipeline.py +67 -12
  28. sdg_hub/prompts.py +52 -0
  29. sdg_hub/sdg.py +128 -86
  30. sdg_hub/utils/__init__.py +5 -0
  31. sdg_hub/utils/config_validation.py +91 -0
  32. sdg_hub/utils/error_handling.py +94 -0
  33. sdg_hub/utils/path_resolution.py +62 -0
  34. sdg_hub/utils/validation_result.py +10 -0
  35. sdg_hub-0.1.2.dist-info/METADATA +190 -0
  36. sdg_hub-0.1.2.dist-info/RECORD +89 -0
  37. {sdg_hub-0.1.0a4.dist-info → sdg_hub-0.1.2.dist-info}/WHEEL +1 -1
  38. sdg_hub/blocks/filterblock.py +0 -76
  39. sdg_hub/blocks/iterblock.py +0 -31
  40. sdg_hub/blocks/rmblocks.py +0 -194
  41. sdg_hub/configs/annotations/simple.yaml +0 -10
  42. sdg_hub/configs/knowledge/data_recipe/default_recipe.yaml +0 -3
  43. sdg_hub/configs/skills/data_recipe/default_recipe.yaml +0 -6
  44. sdg_hub/flows/annotation/emotion/detailed_description.yaml +0 -19
  45. sdg_hub/flows/annotation/emotion/detailed_description_icl.yaml +0 -19
  46. sdg_hub/flows/annotation/emotion/simple.yaml +0 -19
  47. sdg_hub/utils/chunking.py +0 -73
  48. sdg_hub/utils/docprocessor.py +0 -357
  49. sdg_hub/utils/parse_and_convert.py +0 -392
  50. sdg_hub-0.1.0a4.dist-info/METADATA +0 -309
  51. sdg_hub-0.1.0a4.dist-info/RECORD +0 -90
  52. /sdg_hub/configs/{knowledge/data_recipe → reasoning}/__init__.py +0 -0
  53. /sdg_hub/configs/skills/{_G_.yaml → icl_examples/STEM.yaml} +0 -0
  54. /sdg_hub/configs/skills/{data_recipe → icl_examples}/__init__.py +0 -0
  55. /sdg_hub/configs/skills/{_A_.yaml → icl_examples/coding.yaml} +0 -0
  56. /sdg_hub/configs/skills/{_B_.yaml → icl_examples/extraction.yaml} +0 -0
  57. /sdg_hub/configs/skills/{_C_.yaml → icl_examples/humanities.yaml} +0 -0
  58. /sdg_hub/configs/skills/{_D_.yaml → icl_examples/math.yaml} +0 -0
  59. /sdg_hub/configs/skills/{_E_.yaml → icl_examples/reasoning.yaml} +0 -0
  60. /sdg_hub/configs/skills/{_F_.yaml → icl_examples/roleplay.yaml} +0 -0
  61. /sdg_hub/configs/skills/{_H_.yaml → icl_examples/writing.yaml} +0 -0
  62. {sdg_hub-0.1.0a4.dist-info → sdg_hub-0.1.2.dist-info}/licenses/LICENSE +0 -0
  63. {sdg_hub-0.1.0a4.dist-info → sdg_hub-0.1.2.dist-info}/top_level.txt +0 -0
sdg_hub/pipeline.py CHANGED
@@ -1,6 +1,17 @@
+"""
+Deprecated Pipeline class for data generation pipelines.
+
+Use the Flow class directly for new code.
+"""
+
 # SPDX-License-Identifier: Apache-2.0
+# Standard
+import warnings
+from typing import List, Dict, Any
+
 # Third Party
 from datasets import Dataset
+from datasets.data_files import EmptyDatasetError
 
 # Local
 from .logger_config import setup_logger
@@ -8,31 +19,75 @@ from .logger_config import setup_logger
 logger = setup_logger(__name__)
 
 
-class EmptyDatasetError(Exception):
-    pass
+class Pipeline:
+    """A class representing a data generation pipeline.
 
+    This class is deprecated and will be removed in a future version.
+    Use the Flow class directly instead.
 
-class Pipeline:
-    def __init__(self, chained_blocks: list) -> None:
+    Parameters
+    ----------
+    chained_blocks : List[Dict[str, Any]]
+        List of block configurations to execute in sequence.
+
+    Attributes
+    ----------
+    chained_blocks : List[Dict[str, Any]]
+        List of block configurations to execute in sequence.
+    """
+
+    def __init__(self, chained_blocks: List[Dict[str, Any]]) -> None:
         """
         Initialize the Pipeline class with a configuration dictionary.
-        config_dict: the run config py or yaml loaded into a dictionary
+
+        DEPRECATED: This class is deprecated and will be removed in a future version.
+        Use the Flow class directly instead.
         """
+        warnings.warn(
+            "Pipeline class is deprecated and will be removed in a future version. "
+            "Use Flow class directly instead of wrapping it with Pipeline.",
+            DeprecationWarning,
+            stacklevel=2
+        )
         # pipeline config is the run configuration that consists of the pipeline steps
         self.chained_blocks = chained_blocks
 
-    def _drop_duplicates(self, dataset, cols):
-        """
-        Drop duplicates from the dataset based on the columns provided.
+    def _drop_duplicates(self, dataset: Dataset, cols: List[str]) -> Dataset:
+        """Drop duplicates from the dataset based on the columns provided.
+
+        Parameters
+        ----------
+        dataset : Dataset
+            The input dataset.
+        cols : List[str]
+            Columns to consider for duplicate detection.
+
+        Returns
+        -------
+        Dataset
+            Dataset with duplicates removed.
         """
         df = dataset.to_pandas()
         df = df.drop_duplicates(subset=cols).reset_index(drop=True)
         return Dataset.from_pandas(df)
 
-    def generate(self, dataset) -> Dataset:
-        """
-        Generate the dataset by running the pipeline steps.
-        dataset: the input dataset
+    def generate(self, dataset: Dataset) -> Dataset:
+        """Generate the dataset by running the pipeline steps.
+
+        Parameters
+        ----------
+        dataset : Dataset
+            The input dataset to process.
+
+        Returns
+        -------
+        Dataset
+            The processed dataset.
+
+        Raises
+        ------
+        EmptyDatasetError
+            If a block produces an empty dataset.
         """
         for block_prop in self.chained_blocks:
             block_type = block_prop["block_type"]
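
The deprecation shim keeps Pipeline functional but warns on construction. A minimal sketch of what callers now see, assuming sdg-hub 0.1.2 is installed (the empty block list is illustrative only):

    # Observing the DeprecationWarning added to Pipeline.__init__
    import warnings

    from sdg_hub.pipeline import Pipeline

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        Pipeline(chained_blocks=[])  # any block-config list triggers the warning
    assert any(issubclass(w.category, DeprecationWarning) for w in caught)

New code should build a Flow and call flow.generate(dataset) directly.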
sdg_hub/prompts.py CHANGED
@@ -20,3 +20,55 @@ def mistral_chat_template():
 @PromptRegistry.register("meta-llama/Llama-3.3")
 def meta_llama_chat_template():
     return """{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n"""
+
+
+@PromptRegistry.register("microsoft/phi-4")
+def microsoft_phi_chat_template():
+    return """{% for message in messages %}{% if (message['role'] == 'system') %}{{'<|im_start|>system<|im_sep|>' + message['content'] + '<|im_end|>'}}{% elif (message['role'] == 'user') %}{{'<|im_start|>user<|im_sep|>' + message['content'] + '<|im_end|>'}}{% elif (message['role'] == 'assistant') %}{{'<|im_start|>assistant<|im_sep|>' + message['content'] + '<|im_end|>'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant<|im_sep|>' }}{% endif %}"""
+
+@PromptRegistry.register("nvidia/Llama-3_3-Nemotron-Super-49B-v1")
+def nemotron_chat_template():
+    """
+    Format chat messages for the Nemotron model, including a system prompt and structured message headers.
+
+    The template starts with a system message containing "detailed thinking on", then iterates over messages, wrapping each with start and end header tokens and an end-of-text token. For assistant messages containing a `</think>` tag, only the content after this tag is included. Optionally appends an assistant prompt if generation is requested.
+    """
+    return """{{- bos_token }}
+{{- "<|start_header_id|>system<|end_header_id|>\n\n" }}detailed thinking on{{- "<|eot_id|>" }}
+{%- for message in messages %}
+{%- if message['role'] == 'assistant' and '</think>' in message['content'] %}
+{%- set content = message['content'].split('</think>')[-1].lstrip() %}
+{%- else %}
+{%- set content = message['content'] %}
+{%- endif %}
+{{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + content | trim + '<|eot_id|>' }}
+{%- endfor %}
+{%- if add_generation_prompt %}
+{{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
+{%- endif %}"""
+
+
+@PromptRegistry.register("Qwen/Qwen2.5")
+def qwen_2_5_chat_template():
+    """
+    Formats chat messages into the prompt structure required by the Qwen 2.5 model family, supporting system messages, tool descriptions, function call instructions, and role-based message formatting.
+
+    If tools are provided, includes tool signatures and instructions for function calls in the system prompt. User, assistant, and tool messages are wrapped with special tokens, and assistant tool calls are serialized as JSON within XML tags. Optionally appends a generation prompt for the assistant.
+    """
+    return """{%- if tools %}\n    {{- \'<|im_start|>system\\n\' }}\n    {%- if messages[0][\'role\'] == \'system\' %}\n        {{- messages[0][\'content\'] }}\n    {%- else %}\n        {{- \'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.\' }}\n    {%- endif %}\n    {{- "\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>" }}\n    {%- for tool in tools %}\n        {{- "\\n" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- "\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\"name\\": <function-name>, \\"arguments\\": <args-json-object>}\\n</tool_call><|im_end|>\\n" }}\n{%- else %}\n    {%- if messages[0][\'role\'] == \'system\' %}\n        {{- \'<|im_start|>system\\n\' + messages[0][\'content\'] + \'<|im_end|>\\n\' }}\n    {%- else %}\n        {{- \'<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n\' }}\n    {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}\n        {{- \'<|im_start|>\' + message.role + \'\\n\' + message.content + \'<|im_end|>\' + \'\\n\' }}\n    {%- elif message.role == "assistant" %}\n        {{- \'<|im_start|>\' + message.role }}\n        {%- if message.content %}\n            {{- \'\\n\' + message.content }}\n        {%- endif %}\n        {%- for tool_call in message.tool_calls %}\n            {%- if tool_call.function is defined %}\n                {%- set tool_call = tool_call.function %}\n            {%- endif %}\n            {{- \'\\n<tool_call>\\n{"name": "\' }}\n            {{- tool_call.name }}\n            {{- \'", "arguments": \' }}\n            {{- tool_call.arguments | tojson }}\n            {{- \'}\\n</tool_call>\' }}\n        {%- endfor %}\n        {{- \'<|im_end|>\\n\' }}\n    {%- elif message.role == "tool" %}\n        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}\n            {{- \'<|im_start|>user\' }}\n        {%- endif %}\n        {{- \'\\n<tool_response>\\n\' }}\n        {{- message.content }}\n        {{- \'\\n</tool_response>\' }}\n        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}\n            {{- \'<|im_end|>\\n\' }}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- \'<|im_start|>assistant\\n\' }}\n{%- endif %}\n"""
+
+
+@PromptRegistry.register("Qwen/Qwen3")
+def qwen_3_chat_template():
+    """
+    Formats chat messages for the Qwen 3 model family, supporting multi-step tool usage, reasoning content, and special XML tags for tool calls and responses.
+
+    This template handles system messages, user and assistant roles, and tool interactions. When tools are provided, it outputs their signatures and instructions for function calls. It tracks the last user query to determine where to insert assistant reasoning content within `<think>` tags. Assistant tool calls are serialized as JSON within `<tool_call>` tags, and tool responses are grouped inside `<tool_response>` tags. Optionally, a generation prompt and empty reasoning block can be added.

+    Parameters:
+        tools (optional): List of tool signature objects to be included in the prompt.
+        messages: List of message objects, each with a role and content, and optionally tool_calls or reasoning_content.
+        add_generation_prompt (optional): If true, appends an assistant prompt for generation.
+        enable_thinking (optional): If false, inserts an empty reasoning block in the assistant prompt.
+    """
+    return """{%- if tools %}\n    {{- \'<|im_start|>system\\n\' }}\n    {%- if messages[0].role == \'system\' %}\n        {{- messages[0].content + \'\\n\\n\' }}\n    {%- endif %}\n    {{- "# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>" }}\n    {%- for tool in tools %}\n        {{- "\\n" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- "\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\"name\\": <function-name>, \\"arguments\\": <args-json-object>}\\n</tool_call><|im_end|>\\n" }}\n{%- else %}\n    {%- if messages[0].role == \'system\' %}\n        {{- \'<|im_start|>system\\n\' + messages[0].content + \'<|im_end|>\\n\' }}\n    {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n    {%- set index = (messages|length - 1) - loop.index0 %}\n    {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith(\'<tool_response>\') and message.content.endswith(\'</tool_response>\')) %}\n        {%- set ns.multi_step_tool = false %}\n        {%- set ns.last_query_index = index %}\n    {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n    {%- if message.content is string %}\n        {%- set content = message.content %}\n    {%- else %}\n        {%- set content = \'\' %}\n    {%- endif %}\n    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}\n        {{- \'<|im_start|>\' + message.role + \'\\n\' + content + \'<|im_end|>\' + \'\\n\' }}\n    {%- elif message.role == "assistant" %}\n        {%- set reasoning_content = \'\' %}\n        {%- if message.reasoning_content is string %}\n            {%- set reasoning_content = message.reasoning_content %}\n        {%- else %}\n            {%- if \'</think>\' in content %}\n                {%- set reasoning_content = content.split(\'</think>\')[0].rstrip(\'\\n\').split(\'<think>\')[-1].lstrip(\'\\n\') %}\n                {%- set content = content.split(\'</think>\')[-1].lstrip(\'\\n\') %}\n            {%- endif %}\n        {%- endif %}\n        {%- if loop.index0 > ns.last_query_index %}\n            {%- if loop.last or (not loop.last and reasoning_content) %}\n                {{- \'<|im_start|>\' + message.role + \'\\n<think>\\n\' + reasoning_content.strip(\'\\n\') + \'\\n</think>\\n\\n\' + content.lstrip(\'\\n\') }}\n            {%- else %}\n                {{- \'<|im_start|>\' + message.role + \'\\n\' + content }}\n            {%- endif %}\n        {%- else %}\n            {{- \'<|im_start|>\' + message.role + \'\\n\' + content }}\n        {%- endif %}\n        {%- if message.tool_calls %}\n            {%- for tool_call in message.tool_calls %}\n                {%- if (loop.first and content) or (not loop.first) %}\n                    {{- \'\\n\' }}\n                {%- endif %}\n                {%- if tool_call.function %}\n                    {%- set tool_call = tool_call.function %}\n                {%- endif %}\n                {{- \'<tool_call>\\n{"name": "\' }}\n                {{- tool_call.name }}\n                {{- \'", "arguments": \' }}\n                {%- if tool_call.arguments is string %}\n                    {{- tool_call.arguments }}\n                {%- else %}\n                    {{- tool_call.arguments | tojson }}\n                {%- endif %}\n                {{- \'}\\n</tool_call>\' }}\n            {%- endfor %}\n        {%- endif %}\n        {{- \'<|im_end|>\\n\' }}\n    {%- elif message.role == "tool" %}\n        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}\n            {{- \'<|im_start|>user\' }}\n        {%- endif %}\n        {{- \'\\n<tool_response>\\n\' }}\n        {{- content }}\n        {{- \'\\n</tool_response>\' }}\n        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}\n            {{- \'<|im_end|>\\n\' }}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- \'<|im_start|>assistant\\n\' }}\n    {%- if enable_thinking is defined and enable_thinking is false %}\n        {{- \'<think>\\n\\n</think>\\n\\n\' }}\n    {%- endif %}\n{%- endif %}"""
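
Each registered function returns a raw Jinja2 template string keyed by a model-family name. A minimal rendering sketch using the new microsoft/phi-4 template, assuming jinja2 is installed (how sdg-hub looks templates up from PromptRegistry internally is not part of this diff):

    # Rendering the newly registered phi-4 chat template by hand
    from jinja2 import Template

    from sdg_hub.prompts import microsoft_phi_chat_template

    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Summarize synthetic data generation."},
    ]
    prompt = Template(microsoft_phi_chat_template()).render(
        messages=messages, add_generation_prompt=True
    )
    # prompt ends with "<|im_start|>assistant<|im_sep|>", ready for completion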
sdg_hub/sdg.py CHANGED
@@ -1,35 +1,83 @@
 # SPDX-License-Identifier: Apache-2.0
+
+"""Synthetic Data Generator (SDG) module for managing data generation flows."""
+
 # Standard
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from typing import List
+from typing import List, Optional, Tuple
 import traceback
-import uuid
 
 # Third Party
-from datasets import Dataset, load_dataset
-from datasets.data_files import EmptyDatasetError
+from datasets import Dataset
 from tqdm import tqdm
 
 # Local
+from .checkpointer import Checkpointer
+from .flow import Flow
 from .logger_config import setup_logger
-from .pipeline import Pipeline
 from .utils.datautils import safe_concatenate_datasets
 
-
 logger = setup_logger(__name__)
 
 
 class SDG:
+    """Synthetic Data Generator class.
+
+    This class manages the generation of synthetic data using one or more
+    data generation flows.
+
+    Parameters
+    ----------
+    flows : List[Flow]
+        List of flows to execute.
+    num_workers : int, optional
+        Number of worker threads to use, by default 1
+    batch_size : Optional[int], optional
+        Size of batches to process, by default None
+    save_freq : Optional[int], optional
+        Frequency of checkpoint saves, by default None
+
+    Attributes
+    ----------
+    flows : List[Flow]
+        List of flows to execute.
+    num_workers : int
+        Number of worker threads to use.
+    batch_size : Optional[int]
+        Size of batches to process.
+    save_freq : Optional[int]
+        Frequency of checkpoint saves.
+    """
+
     def __init__(
-        self, pipelines: List[Pipeline], num_workers=1, batch_size=None, save_freq=None
+        self,
+        flows: List[Flow],
+        num_workers: int = 1,
+        batch_size: Optional[int] = None,
+        save_freq: Optional[int] = None,
     ) -> None:
-        self.pipelines = pipelines
+        self.flows = flows
         self.num_workers = num_workers
         self.batch_size = batch_size
         self.save_freq = save_freq
 
-    def _split_dataset(self, dataset: Dataset, batch_size: int) -> List[Dataset]:
-        """Split the dataset into smaller batches."""
+    def _split_dataset(
+        self, dataset: Dataset, batch_size: int
+    ) -> List[Tuple[int, int]]:
+        """Split the dataset into smaller batches.
+
+        Parameters
+        ----------
+        dataset : Dataset
+            The dataset to split.
+        batch_size : int
+            Size of each batch.
+
+        Returns
+        -------
+        List[Tuple[int, int]]
+            List of (start, end) indices for each batch.
+        """
         total_size = len(dataset)
         num_batches = (total_size + batch_size - 1) // batch_size
 
@@ -40,94 +88,87 @@ class SDG:
 
         return batches
 
-    def _get_missing_data(self, seed_data, generated_data):
-        # Get the common columns between the two datasets
-        common_columns = list(
-            set(seed_data.column_names) & set(generated_data.column_names)
-        )
-
-        # Extract the relevant data based on common columns
-        seed_data_common = seed_data.select_columns(common_columns)
-        generated_data_common = generated_data.select_columns(common_columns)
-
-        # Convert to Pandas DataFrames for easier comparison
-        seed_df = seed_data_common.to_pandas()
-        generated_df = generated_data_common.to_pandas()
-
-        # Identify missing rows
-        missing_df = seed_df[
-            ~seed_df.apply(tuple, 1).isin(generated_df.apply(tuple, 1))
-        ]
-
-        # Convert back to Dataset
-        missing_data = Dataset.from_pandas(missing_df, preserve_index=False)
-
-        return missing_data
-
-    def _save_intermediate_checkpoint(self, dataset, checkpoint_dir):
-        checkpoint_id = uuid.uuid4().hex
-        checkpoint_file = f"{checkpoint_dir}/data_checkpoint_{checkpoint_id}.jsonl"
-        logger.info(f"Saving checkpoint to {checkpoint_file}")
-        dataset.to_json(checkpoint_file, orient="records", lines=True)
-
     @staticmethod
-    def _generate_data(pipelines, input_split, ds, i=None):
+    def _generate_data(
+        flows: List[Flow],
+        input_split: Tuple[int, int],
+        ds: Dataset,
+        i: Optional[int] = None,
+    ) -> Optional[Dataset]:
+        """Generate data for a single split using the provided flows.
+
+        Parameters
+        ----------
+        flows : List[Flow]
+            List of flows to execute.
+        input_split : Tuple[int, int]
+            (start, end) indices for the current split.
+        ds : Dataset
+            The full input dataset.
+        i : Optional[int], optional
+            Split index for logging, by default None
+
+        Returns
+        -------
+        Optional[Dataset]
+            Generated dataset for the split, or None if generation failed.
+        """
         logger.info(f"Processing split {i}")
         input_split = ds.select(range(input_split[0], input_split[1]))
         try:
-            for pipeline in pipelines:
-                input_split = pipeline.generate(input_split)
+            for flow in flows:
+                input_split = flow.generate(input_split)
             return input_split
         except Exception as e:
             logger.error(f"Error processing split {i}: {e}")
             traceback.print_exc()
             return None
 
-    def generate(self, dataset: Dataset, checkpoint_dir=None) -> Dataset:
-        # check if checkpoint_dir exists
-        pre_generated_data = []
-        if checkpoint_dir is not None:
-            try:
-                # check if there are any existing checkpoints
-                pre_generated_data = load_dataset(
-                    "json", data_dir=checkpoint_dir, split="train"
-                )
-                logger.info(
-                    f"Loading existing checkpoints from {checkpoint_dir}, with {pre_generated_data.num_rows} rows"
-                )
-                seed_data = self._get_missing_data(dataset, pre_generated_data)
-                if seed_data.num_rows == 0:
-                    logger.info(
-                        f"All seed data has been generated, no missing rows found, returning data from {checkpoint_dir}"
-                    )
-                    return pre_generated_data
-                logger.info(f"Found {seed_data.num_rows} missing rows in the dataset")
-
-            except EmptyDatasetError:
-                logger.info(
-                    f"No existing checkpoints found in {checkpoint_dir}, generating from scratch"
-                )
-                seed_data = dataset
-
-        else:
-            seed_data = dataset
+    def generate(
+        self, dataset: Dataset, checkpoint_dir: Optional[str] = None
+    ) -> Dataset:
+        """Generate synthetic data using the configured flows.
+
+        Parameters
+        ----------
+        dataset : Dataset
+            The input dataset to process.
+        checkpoint_dir : Optional[str], optional
+            Directory to save checkpoints, by default None
+
+        Returns
+        -------
+        Dataset
+            The generated dataset.
+
+        Notes
+        -----
+        If checkpoint_dir is provided, the generation process can be resumed
+        from the last checkpoint in case of interruption.
+        """
+        # Initialize checkpointer
+        checkpointer = Checkpointer(checkpoint_dir, self.save_freq)
+
+        # Load existing checkpoints and determine missing data
+        seed_data, pre_generated_data = checkpointer.load_existing_data(dataset)
+
+        # If all data has been generated, return the pre-generated data
+        if seed_data.num_rows == 0 and pre_generated_data is not None:
+            return pre_generated_data
 
         if not self.batch_size:
             # If batch size is not provided, generate the dataset in a single pass
             generated_dataset = seed_data
-            # generated_data is initialized with seed_data, and it gets updated with each pipeline
-            for pipeline in self.pipelines:
-                generated_dataset = pipeline.generate(seed_data)
+            # generated_data is initialized with seed_data, and it gets updated with each flow
+            for flow in self.flows:
+                generated_dataset = flow.generate(generated_dataset)
             return generated_dataset
-
+
         logger.info("Splitting the dataset into smaller batches")
-        input_splits = (
-            self._split_dataset(seed_data, self.batch_size)
-            if self.batch_size
-            else [seed_data]
-        )
+        input_splits = self._split_dataset(seed_data, self.batch_size)
         logger.info(
-            f"Generating dataset with {len(input_splits)} splits, batch size {self.batch_size}, and {self.num_workers} workers"
+            f"Generating dataset with {len(input_splits)} splits, "
+            f"batch size {self.batch_size}, and {self.num_workers} workers"
         )
 
         generated_data = [pre_generated_data] if pre_generated_data else []
@@ -136,7 +177,7 @@ class SDG:
         with ThreadPoolExecutor(max_workers=self.num_workers) as executor:
             futures = [
                 executor.submit(
-                    self._generate_data, self.pipelines, input_split, seed_data, i
+                    self._generate_data, self.flows, input_split, seed_data, i
                 )
                 for i, input_split in enumerate(input_splits)
             ]
@@ -147,16 +188,17 @@ class SDG:
                 if generated_data_split:
                     generated_data.append(generated_data_split)
                     logger.info(f"Finished future processing split {i} \n\n")
-                    if self.save_freq and (i + 1) % self.save_freq == 0:
+
+                    # Use checkpointer to handle intermediate saves
+                    if checkpointer.should_save_checkpoint(i):
                         # Save only the new splits since the last checkpoint
                         new_splits = generated_data[last_saved_split_index : i + 1]
                         checkpoint_dataset = safe_concatenate_datasets(new_splits)
                         # check if checkpoint_dataset is not None
                         if checkpoint_dataset:
-                            self._save_intermediate_checkpoint(
-                                checkpoint_dataset, checkpoint_dir
+                            checkpointer.save_intermediate_checkpoint(
+                                checkpoint_dataset
                             )
-
                         last_saved_split_index = i + 1
 
         generated_dataset = safe_concatenate_datasets(generated_data)
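
_split_dataset now returns (start, end) index pairs instead of materialized sub-datasets; each worker slices the shared dataset itself with ds.select(range(start, end)). The loop that builds the pairs falls outside the hunks above, so the min-capped end index below is an assumption consistent with the docstring; a standalone sketch of the ceil-division batching:

    # Batching arithmetic documented for SDG._split_dataset
    def split_indices(total_size: int, batch_size: int) -> list[tuple[int, int]]:
        num_batches = (total_size + batch_size - 1) // batch_size  # ceil division
        return [
            (i * batch_size, min((i + 1) * batch_size, total_size))
            for i in range(num_batches)
        ]

    assert split_indices(10, 4) == [(0, 4), (4, 8), (8, 10)]

A typical driver is then sdg = SDG(flows=[my_flow], num_workers=4, batch_size=100, save_freq=2) followed by sdg.generate(dataset, checkpoint_dir="checkpoints/"); Flow construction itself is defined in sdg_hub/flow.py, outside this excerpt.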
sdg_hub/utils/__init__.py CHANGED
@@ -3,3 +3,8 @@
 # This is part of the public API, and used by instructlab
 class GenerateException(Exception):
     """An exception raised during generate step."""
+
+
+from .path_resolution import resolve_path
+
+__all__ = ["GenerateException", "resolve_path"]
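
With the re-export in place, both public names resolve from the package root; a one-line check (assuming sdg-hub 0.1.2 is installed):

    from sdg_hub.utils import GenerateException, resolve_path  # both listed in __all__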
sdg_hub/utils/config_validation.py ADDED
@@ -0,0 +1,91 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Configuration validation utilities for SDG Hub.
+
+This module provides functions to validate configuration files used by blocks,
+ensuring they meet the required schema and contain all necessary fields.
+"""
+
+# Standard
+from typing import Any, Dict, List
+
+# Local
+from ..logger_config import setup_logger
+
+logger = setup_logger(__name__)
+
+
+def validate_prompt_config_schema(
+    config: Dict[str, Any], config_path: str
+) -> tuple[bool, List[str]]:
+    """Validate that a prompt configuration file has the required schema fields.
+
+    For prompt template configs, 'system' and 'generation' are required fields.
+    Other fields like 'introduction', 'principles', 'examples', 'start_tags', 'end_tags' are optional.
+
+    Parameters
+    ----------
+    config : Dict[str, Any]
+        The loaded configuration dictionary.
+    config_path : str
+        The path to the configuration file (for error reporting).
+
+    Returns
+    -------
+    tuple[bool, List[str]]
+        A tuple containing:
+        - bool: True if schema is valid, False otherwise
+        - List[str]: List of validation error messages (empty if valid)
+    """
+    required_fields = ["system", "generation"]
+    errors = []
+
+    # Ensure config is a dictionary
+    if not isinstance(config, dict):
+        errors.append(f"Configuration must be a dictionary, got {type(config).__name__}")
+        return False, errors
+
+    # Check for missing required fields
+    missing_fields = [field for field in required_fields if field not in config]
+    if missing_fields:
+        errors.append(f"Missing required fields: {missing_fields}")
+
+    # Check for empty or null required fields and validate they are strings
+    for field in required_fields:
+        if field in config:
+            value = config[field]
+            if value is None:
+                errors.append(f"Required field '{field}' is null")
+            elif not isinstance(value, str):
+                errors.append(f"Required field '{field}' must be a string, got {type(value).__name__}")
+            elif not value.strip():
+                errors.append(f"Required field '{field}' is empty")
+
+    # Check optional string fields are strings when present
+    string_fields = ["introduction", "principles", "examples"]
+    for field in string_fields:
+        if field in config:
+            value = config[field]
+            if value is not None and not isinstance(value, str):
+                errors.append(f"Field '{field}' must be a string, got {type(value).__name__}")
+
+    # Check start_tags and end_tags are lists of strings when present
+    tag_fields = ["start_tags", "end_tags"]
+    for field in tag_fields:
+        if field in config:
+            value = config[field]
+            if value is not None:
+                if not isinstance(value, list):
+                    errors.append(f"Field '{field}' must be a list, got {type(value).__name__}")
+                else:
+                    for i, tag in enumerate(value):
+                        if not isinstance(tag, str):
+                            errors.append(f"Field '{field}[{i}]' must be a string, got {type(tag).__name__}")
+
+    # Log validation results
+    if errors:
+        for error in errors:
+            logger.error(f"Config validation failed for {config_path}: {error}")
+        return False, errors
+
+    logger.debug(f"Config validation passed for {config_path}")
+    return True, []
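
The validator returns a (bool, errors) pair and logs every failure against the supplied path. A minimal usage sketch, assuming sdg-hub 0.1.2 is installed (config contents and paths are illustrative):

    from sdg_hub.utils.config_validation import validate_prompt_config_schema

    good = {"system": "You are a helpful assistant.", "generation": "Answer the question."}
    ok, errors = validate_prompt_config_schema(good, "configs/example.yaml")
    assert ok and errors == []

    bad = {"system": ""}  # 'generation' missing, 'system' empty
    ok, errors = validate_prompt_config_schema(bad, "configs/bad.yaml")
    assert not ok
    # errors == ["Missing required fields: ['generation']",
    #            "Required field 'system' is empty"]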
sdg_hub/utils/error_handling.py ADDED
@@ -0,0 +1,94 @@
+"""Custom exception classes for SDG Hub error handling."""
+
+
+class SDGHubError(Exception):
+    """Base exception class for all SDG Hub errors."""
+
+    def __init__(self, message: str, details: str = None):
+        """Initialize SDGHubError.
+
+        Parameters
+        ----------
+        message : str
+            The main error message.
+        details : str, optional
+            Additional details about the error.
+        """
+        self.message = message
+        self.details = details
+        full_message = message
+        if details:
+            full_message = f"{message}\nDetails: {details}"
+        super().__init__(full_message)
+
+
+class FlowRunnerError(SDGHubError):
+    """Base exception class for flow runner errors."""
+
+    pass
+
+
+class DatasetLoadError(FlowRunnerError):
+    """Raised when dataset loading fails."""
+
+    pass
+
+
+class FlowConfigurationError(FlowRunnerError):
+    """Raised when flow configuration is invalid."""
+
+    pass
+
+
+class APIConnectionError(FlowRunnerError):
+    """Raised when API connection fails."""
+
+    pass
+
+
+class DataGenerationError(FlowRunnerError):
+    """Raised when data generation fails."""
+
+    pass
+
+
+class DataSaveError(FlowRunnerError):
+    """Raised when saving generated data fails."""
+
+    pass
+
+
+class BlockError(SDGHubError):
+    """Base exception class for block-related errors."""
+
+    pass
+
+
+class BlockConfigurationError(BlockError):
+    """Raised when block configuration is invalid."""
+
+    pass
+
+
+class BlockExecutionError(BlockError):
+    """Raised when block execution fails."""
+
+    pass
+
+
+class FlowError(SDGHubError):
+    """Base exception class for flow-related errors."""
+
+    pass
+
+
+class FlowValidationError(FlowError):
+    """Raised when flow validation fails."""
+
+    pass
+
+
+class FlowExecutionError(FlowError):
+    """Raised when flow execution fails."""
+
+    pass
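
Every class in the new hierarchy derives from SDGHubError, which folds the optional details string into the rendered message, so callers can catch the base class alone. A minimal sketch (the message text is illustrative):

    from sdg_hub.utils.error_handling import BlockExecutionError, SDGHubError

    try:
        raise BlockExecutionError(
            "LLMBlock failed to produce output",
            details="upstream request timed out",
        )
    except SDGHubError as err:
        print(err)          # message plus "Details: upstream request timed out"
        print(err.message)  # original message, without the details suffix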