sdg-hub 0.1.1__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sdg_hub/_version.py +2 -2
- sdg_hub/blocks/__init__.py +6 -0
- sdg_hub/blocks/openaichatblock.py +556 -0
- sdg_hub/flow.py +21 -18
- sdg_hub/flow_runner.py +273 -52
- sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +0 -5
- sdg_hub/prompts.py +31 -0
- sdg_hub/utils/__init__.py +5 -0
- sdg_hub/utils/error_handling.py +94 -0
- sdg_hub/utils/path_resolution.py +62 -0
- {sdg_hub-0.1.1.dist-info → sdg_hub-0.1.2.dist-info}/METADATA +1 -1
- {sdg_hub-0.1.1.dist-info → sdg_hub-0.1.2.dist-info}/RECORD +15 -12
- {sdg_hub-0.1.1.dist-info → sdg_hub-0.1.2.dist-info}/WHEEL +0 -0
- {sdg_hub-0.1.1.dist-info → sdg_hub-0.1.2.dist-info}/licenses/LICENSE +0 -0
- {sdg_hub-0.1.1.dist-info → sdg_hub-0.1.2.dist-info}/top_level.txt +0 -0
sdg_hub/flow_runner.py
CHANGED
@@ -1,18 +1,31 @@
|
|
1
1
|
"""Script for running data generation flows with configurable parameters."""
|
2
2
|
|
3
3
|
# Standard
|
4
|
+
from importlib import resources
|
5
|
+
from typing import Optional
|
4
6
|
import os
|
7
|
+
import sys
|
8
|
+
import traceback
|
5
9
|
|
6
10
|
# Third Party
|
7
11
|
from datasets import load_dataset
|
8
12
|
from openai import OpenAI
|
9
13
|
import click
|
14
|
+
import yaml
|
10
15
|
|
11
16
|
# First Party
|
12
17
|
from sdg_hub.flow import Flow
|
13
18
|
from sdg_hub.logger_config import setup_logger
|
14
19
|
from sdg_hub.sdg import SDG
|
15
|
-
|
20
|
+
from sdg_hub.utils.error_handling import (
|
21
|
+
APIConnectionError,
|
22
|
+
DataGenerationError,
|
23
|
+
DataSaveError,
|
24
|
+
DatasetLoadError,
|
25
|
+
FlowConfigurationError,
|
26
|
+
FlowRunnerError,
|
27
|
+
)
|
28
|
+
from sdg_hub.utils.path_resolution import resolve_path
|
16
29
|
|
17
30
|
logger = setup_logger(__name__)
|
18
31
|
|
@@ -28,7 +41,7 @@ def run_flow(
|
|
28
41
|
save_freq: int = 2,
|
29
42
|
debug: bool = False,
|
30
43
|
dataset_start_index: int = 0,
|
31
|
-
dataset_end_index: int = None,
|
44
|
+
dataset_end_index: Optional[int] = None,
|
32
45
|
) -> None:
|
33
46
|
"""Process the dataset using the specified configuration.
|
34
47
|
|
@@ -52,6 +65,10 @@ def run_flow(
|
|
52
65
|
Frequency (in batches) at which to save checkpoints, by default 2.
|
53
66
|
debug : bool, optional
|
54
67
|
If True, enables debug mode with a smaller dataset subset, by default False.
|
68
|
+
dataset_start_index : int, optional
|
69
|
+
Start index for dataset slicing, by default 0.
|
70
|
+
dataset_end_index : Optional[int], optional
|
71
|
+
End index for dataset slicing, by default None.
|
55
72
|
|
56
73
|
Returns
|
57
74
|
-------
|
@@ -59,42 +76,214 @@ def run_flow(
|
|
59
76
|
|
60
77
|
Raises
|
61
78
|
------
|
62
|
-
|
63
|
-
If the
|
79
|
+
DatasetLoadError
|
80
|
+
If the dataset cannot be loaded or processed.
|
81
|
+
FlowConfigurationError
|
82
|
+
If the flow configuration is invalid or cannot be loaded.
|
83
|
+
APIConnectionError
|
84
|
+
If connection to the API endpoint fails.
|
85
|
+
DataGenerationError
|
86
|
+
If data generation fails during processing.
|
87
|
+
DataSaveError
|
88
|
+
If saving the generated data fails.
|
64
89
|
"""
|
65
90
|
logger.info(f"Generation configuration: {locals()}\n\n")
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
91
|
+
|
92
|
+
try:
|
93
|
+
# Load and validate dataset
|
94
|
+
try:
|
95
|
+
ds = load_dataset("json", data_files=ds_path, split="train")
|
96
|
+
logger.info(
|
97
|
+
f"Successfully loaded dataset from {ds_path} with {len(ds)} rows"
|
98
|
+
)
|
99
|
+
except Exception as e:
|
100
|
+
raise DatasetLoadError(
|
101
|
+
f"Failed to load dataset from '{ds_path}'. "
|
102
|
+
f"Please check if the file exists and is a valid JSON file.",
|
103
|
+
details=str(e),
|
104
|
+
) from e
|
105
|
+
|
106
|
+
# Apply dataset slicing if specified
|
107
|
+
try:
|
108
|
+
if dataset_start_index is not None and dataset_end_index is not None:
|
109
|
+
if dataset_start_index >= len(ds) or dataset_end_index > len(ds):
|
110
|
+
raise DatasetLoadError(
|
111
|
+
f"Dataset slice indices ({dataset_start_index}, {dataset_end_index}) "
|
112
|
+
f"are out of bounds for dataset with {len(ds)} rows"
|
113
|
+
)
|
114
|
+
if dataset_start_index >= dataset_end_index:
|
115
|
+
raise DatasetLoadError(
|
116
|
+
f"Start index ({dataset_start_index}) must be less than end index ({dataset_end_index})"
|
117
|
+
)
|
118
|
+
ds = ds.select(range(dataset_start_index, dataset_end_index))
|
119
|
+
logger.info(
|
120
|
+
f"Dataset sliced from {dataset_start_index} to {dataset_end_index}"
|
121
|
+
)
|
122
|
+
|
123
|
+
if debug:
|
124
|
+
if len(ds) < 30:
|
125
|
+
logger.warning(
|
126
|
+
f"Debug mode requested 30 samples but dataset only has {len(ds)} rows"
|
127
|
+
)
|
128
|
+
ds = ds.shuffle(seed=42).select(range(min(30, len(ds))))
|
129
|
+
logger.info(
|
130
|
+
f"Debug mode enabled. Using {len(ds)} samples from the dataset."
|
131
|
+
)
|
132
|
+
except DatasetLoadError:
|
133
|
+
raise
|
134
|
+
except Exception as e:
|
135
|
+
raise DatasetLoadError(
|
136
|
+
"Failed to process dataset slicing or debug mode.", details=str(e)
|
137
|
+
) from e
|
138
|
+
|
139
|
+
# Validate API configuration
|
140
|
+
openai_api_key = os.environ.get("OPENAI_API_KEY")
|
141
|
+
if not openai_api_key or openai_api_key == "EMPTY":
|
142
|
+
logger.warning("OPENAI_API_KEY not set or is 'EMPTY'. API calls may fail.")
|
143
|
+
|
144
|
+
openai_api_base = endpoint
|
145
|
+
if not openai_api_base:
|
146
|
+
raise APIConnectionError("API endpoint cannot be empty")
|
147
|
+
|
148
|
+
# Initialize OpenAI client
|
149
|
+
try:
|
150
|
+
client = OpenAI(
|
151
|
+
api_key=openai_api_key or "EMPTY",
|
152
|
+
base_url=openai_api_base,
|
153
|
+
)
|
154
|
+
# test connection with a model list
|
155
|
+
models = client.models.list()
|
156
|
+
logger.info(f"Initialized OpenAI client with endpoint: {openai_api_base}")
|
157
|
+
logger.info(f"Available models: {[model.id for model in models.data]}")
|
158
|
+
except Exception as e:
|
159
|
+
raise APIConnectionError(
|
160
|
+
f"Failed to initialize OpenAI client with endpoint '{openai_api_base}'. "
|
161
|
+
f"Please check if the endpoint is valid and accessible.",
|
162
|
+
details=str(e),
|
163
|
+
) from e
|
164
|
+
|
165
|
+
# Load and validate flow configuration
|
166
|
+
try:
|
167
|
+
base_path = str(resources.files(__package__))
|
168
|
+
flow_path = resolve_path(flow_path, [".", base_path])
|
169
|
+
if not os.path.exists(flow_path):
|
170
|
+
raise FlowConfigurationError(
|
171
|
+
f"Flow configuration file not found: {flow_path}"
|
172
|
+
)
|
173
|
+
|
174
|
+
# Validate flow file is readable YAML
|
175
|
+
try:
|
176
|
+
with open(flow_path, "r", encoding="utf-8") as f:
|
177
|
+
flow_config = yaml.safe_load(f)
|
178
|
+
if not flow_config:
|
179
|
+
raise FlowConfigurationError(
|
180
|
+
f"Flow configuration file is empty: {flow_path}"
|
181
|
+
)
|
182
|
+
logger.info(f"Successfully loaded flow configuration from {flow_path}")
|
183
|
+
except yaml.YAMLError as e:
|
184
|
+
raise FlowConfigurationError(
|
185
|
+
f"Flow configuration file '{flow_path}' contains invalid YAML.",
|
186
|
+
details=str(e),
|
187
|
+
) from e
|
188
|
+
except Exception as e:
|
189
|
+
raise FlowConfigurationError(
|
190
|
+
f"Failed to read flow configuration file '{flow_path}'.",
|
191
|
+
details=str(e),
|
192
|
+
) from e
|
193
|
+
|
194
|
+
flow = Flow(client).get_flow_from_file(flow_path)
|
195
|
+
logger.info("Successfully initialized flow from configuration")
|
196
|
+
except FlowConfigurationError:
|
197
|
+
raise
|
198
|
+
except Exception as e:
|
199
|
+
raise FlowConfigurationError(
|
200
|
+
f"Failed to create flow from configuration file '{flow_path}'. "
|
201
|
+
f"Please check the flow configuration format and block definitions.",
|
202
|
+
details=str(e),
|
203
|
+
) from e
|
204
|
+
|
205
|
+
# Initialize SDG and generate data
|
206
|
+
try:
|
207
|
+
sdg = SDG(
|
208
|
+
flows=[flow],
|
209
|
+
num_workers=num_workers,
|
210
|
+
batch_size=batch_size,
|
211
|
+
save_freq=save_freq,
|
212
|
+
)
|
213
|
+
logger.info(
|
214
|
+
f"Initialized SDG with {num_workers} workers, batch size {batch_size}"
|
215
|
+
)
|
216
|
+
|
217
|
+
# Ensure checkpoint directory exists if specified
|
218
|
+
if checkpoint_dir and not os.path.exists(checkpoint_dir):
|
219
|
+
os.makedirs(checkpoint_dir, exist_ok=True)
|
220
|
+
logger.info(f"Created checkpoint directory: {checkpoint_dir}")
|
221
|
+
|
222
|
+
generated_data = sdg.generate(ds, checkpoint_dir=checkpoint_dir)
|
223
|
+
|
224
|
+
if generated_data is None or len(generated_data) == 0:
|
225
|
+
raise DataGenerationError(
|
226
|
+
"Data generation completed but no data was generated. "
|
227
|
+
"This may indicate issues with the flow configuration or input data."
|
228
|
+
)
|
229
|
+
|
230
|
+
logger.info(f"Successfully generated {len(generated_data)} rows of data")
|
231
|
+
|
232
|
+
except Exception as e:
|
233
|
+
if isinstance(e, DataGenerationError):
|
234
|
+
raise
|
235
|
+
raise DataGenerationError(
|
236
|
+
"Data generation failed during processing. This could be due to:"
|
237
|
+
"\n- API connection issues with the endpoint"
|
238
|
+
"\n- Invalid flow configuration or block parameters"
|
239
|
+
"\n- Insufficient system resources (try reducing batch_size or num_workers)"
|
240
|
+
"\n- Input data format incompatibility",
|
241
|
+
details=f"Endpoint: {openai_api_base}, Error: {e}",
|
242
|
+
) from e
|
243
|
+
|
244
|
+
# Save generated data
|
245
|
+
try:
|
246
|
+
# Adjust save path for dataset slicing
|
247
|
+
final_save_path = save_path
|
248
|
+
if dataset_end_index is not None and dataset_start_index is not None:
|
249
|
+
final_save_path = save_path.replace(
|
250
|
+
".jsonl", f"_{dataset_start_index}_{dataset_end_index}.jsonl"
|
251
|
+
)
|
252
|
+
|
253
|
+
# Ensure save directory exists
|
254
|
+
save_dir = os.path.dirname(final_save_path)
|
255
|
+
if save_dir and not os.path.exists(save_dir):
|
256
|
+
os.makedirs(save_dir, exist_ok=True)
|
257
|
+
logger.info(f"Created save directory: {save_dir}")
|
258
|
+
|
259
|
+
generated_data.to_json(final_save_path, orient="records", lines=True)
|
260
|
+
logger.info(f"Data successfully saved to {final_save_path}")
|
261
|
+
|
262
|
+
except Exception as e:
|
263
|
+
raise DataSaveError(
|
264
|
+
f"Failed to save generated data to '{final_save_path}'. "
|
265
|
+
f"Please check write permissions and disk space.",
|
266
|
+
details=str(e),
|
267
|
+
) from e
|
268
|
+
|
269
|
+
except (
|
270
|
+
DatasetLoadError,
|
271
|
+
FlowConfigurationError,
|
272
|
+
APIConnectionError,
|
273
|
+
DataGenerationError,
|
274
|
+
DataSaveError,
|
275
|
+
):
|
276
|
+
# Re-raise our custom exceptions with their detailed messages
|
277
|
+
raise
|
278
|
+
except Exception as e:
|
279
|
+
# Catch any unexpected errors
|
280
|
+
logger.error(f"Unexpected error during flow execution: {e}")
|
281
|
+
logger.error(f"Traceback: {traceback.format_exc()}")
|
282
|
+
raise FlowRunnerError(
|
283
|
+
"An unexpected error occurred during flow execution. "
|
284
|
+
"Please check the logs for more details.",
|
285
|
+
details=str(e),
|
286
|
+
) from e
|
98
287
|
|
99
288
|
|
100
289
|
@click.command()
|
@@ -154,8 +343,12 @@ def run_flow(
|
|
154
343
|
is_flag=True,
|
155
344
|
help="Enable debug mode with a smaller dataset subset.",
|
156
345
|
)
|
157
|
-
@click.option(
|
158
|
-
|
346
|
+
@click.option(
|
347
|
+
"--dataset_start_index", type=int, default=0, help="Start index of the dataset."
|
348
|
+
)
|
349
|
+
@click.option(
|
350
|
+
"--dataset_end_index", type=int, default=None, help="End index of the dataset."
|
351
|
+
)
|
159
352
|
def main(
|
160
353
|
ds_path: str,
|
161
354
|
bs: int,
|
@@ -167,7 +360,7 @@ def main(
|
|
167
360
|
save_freq: int,
|
168
361
|
debug: bool,
|
169
362
|
dataset_start_index: int,
|
170
|
-
dataset_end_index: int,
|
363
|
+
dataset_end_index: Optional[int],
|
171
364
|
) -> None:
|
172
365
|
"""CLI entry point for running data generation flows.
|
173
366
|
|
@@ -191,24 +384,52 @@ def main(
|
|
191
384
|
Frequency (in batches) at which to save checkpoints.
|
192
385
|
debug : bool
|
193
386
|
If True, enables debug mode with a smaller dataset subset.
|
387
|
+
dataset_start_index : int
|
388
|
+
Start index for dataset slicing.
|
389
|
+
dataset_end_index : Optional[int]
|
390
|
+
End index for dataset slicing.
|
194
391
|
|
195
392
|
Returns
|
196
393
|
-------
|
197
394
|
None
|
198
395
|
"""
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
396
|
+
try:
|
397
|
+
run_flow(
|
398
|
+
ds_path=ds_path,
|
399
|
+
batch_size=bs,
|
400
|
+
num_workers=num_workers,
|
401
|
+
save_path=save_path,
|
402
|
+
endpoint=endpoint,
|
403
|
+
flow_path=flow,
|
404
|
+
checkpoint_dir=checkpoint_dir,
|
405
|
+
save_freq=save_freq,
|
406
|
+
debug=debug,
|
407
|
+
dataset_start_index=dataset_start_index,
|
408
|
+
dataset_end_index=dataset_end_index,
|
409
|
+
)
|
410
|
+
except (
|
411
|
+
DatasetLoadError,
|
412
|
+
FlowConfigurationError,
|
413
|
+
APIConnectionError,
|
414
|
+
DataGenerationError,
|
415
|
+
DataSaveError,
|
416
|
+
FlowRunnerError,
|
417
|
+
) as e:
|
418
|
+
logger.error(f"Flow execution failed: {e}")
|
419
|
+
click.echo(f"Error: {e}", err=True)
|
420
|
+
sys.exit(1)
|
421
|
+
except KeyboardInterrupt:
|
422
|
+
logger.info("Flow execution interrupted by user")
|
423
|
+
click.echo("Flow execution interrupted by user", err=True)
|
424
|
+
sys.exit(130) # Standard exit code for SIGINT
|
425
|
+
except Exception as e:
|
426
|
+
logger.error(f"Unexpected error: {e}")
|
427
|
+
logger.error(f"Traceback: {traceback.format_exc()}")
|
428
|
+
click.echo(
|
429
|
+
f"Unexpected error occurred. Please check the logs for details. Error: {e}",
|
430
|
+
err=True,
|
431
|
+
)
|
432
|
+
sys.exit(1)
|
212
433
|
|
213
434
|
|
214
435
|
if __name__ == "__main__":
|
@@ -14,7 +14,6 @@
|
|
14
14
|
gen_kwargs:
|
15
15
|
max_tokens: 4096
|
16
16
|
temperature: 0.7
|
17
|
-
seed: 7452
|
18
17
|
n: 50
|
19
18
|
|
20
19
|
- block_type: LLMBlock
|
@@ -27,7 +26,6 @@
|
|
27
26
|
gen_kwargs:
|
28
27
|
max_tokens: 4096
|
29
28
|
temperature: 0.7
|
30
|
-
seed: 7452
|
31
29
|
|
32
30
|
- block_type: LLMBlock
|
33
31
|
block_config:
|
@@ -39,7 +37,6 @@
|
|
39
37
|
gen_kwargs:
|
40
38
|
max_tokens: 4096
|
41
39
|
temperature: 0.7
|
42
|
-
seed: 7452
|
43
40
|
|
44
41
|
- block_type: FlattenColumnsBlock
|
45
42
|
block_config:
|
@@ -72,7 +69,6 @@
|
|
72
69
|
gen_kwargs:
|
73
70
|
temperature: 0.7
|
74
71
|
max_tokens: 100
|
75
|
-
seed: 7452
|
76
72
|
|
77
73
|
- block_type: LLMBlock
|
78
74
|
block_config:
|
@@ -84,7 +80,6 @@
|
|
84
80
|
gen_kwargs:
|
85
81
|
temperature: 0.7
|
86
82
|
max_tokens: 2048
|
87
|
-
seed: 7452
|
88
83
|
|
89
84
|
- block_type: LLMBlock
|
90
85
|
block_config:
|
sdg_hub/prompts.py
CHANGED
@@ -28,6 +28,11 @@ def microsoft_phi_chat_template():
|
|
28
28
|
|
29
29
|
@PromptRegistry.register("nvidia/Llama-3_3-Nemotron-Super-49B-v1")
|
30
30
|
def nemotron_chat_template():
|
31
|
+
"""
|
32
|
+
Format chat messages for the Nemotron model, including a system prompt and structured message headers.
|
33
|
+
|
34
|
+
The template starts with a system message containing "detailed thinking on", then iterates over messages, wrapping each with start and end header tokens and an end-of-text token. For assistant messages containing a `</think>` tag, only the content after this tag is included. Optionally appends an assistant prompt if generation is requested.
|
35
|
+
"""
|
31
36
|
return """{{- bos_token }}
|
32
37
|
{{- "<|start_header_id|>system<|end_header_id|>\n\n" }}detailed thinking on{{- "<|eot_id|>" }}
|
33
38
|
{%- for message in messages %}
|
@@ -41,3 +46,29 @@ def nemotron_chat_template():
|
|
41
46
|
{%- if add_generation_prompt %}
|
42
47
|
{{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
|
43
48
|
{%- endif %}"""
|
49
|
+
|
50
|
+
|
51
|
+
@PromptRegistry.register("Qwen/Qwen2.5")
|
52
|
+
def qwen_2_5_chat_template():
|
53
|
+
"""
|
54
|
+
Formats chat messages into the prompt structure required by the Qwen 2.5 model family, supporting system messages, tool descriptions, function call instructions, and role-based message formatting.
|
55
|
+
|
56
|
+
If tools are provided, includes tool signatures and instructions for function calls in the system prompt. User, assistant, and tool messages are wrapped with special tokens, and assistant tool calls are serialized as JSON within XML tags. Optionally appends a generation prompt for the assistant.
|
57
|
+
"""
|
58
|
+
return """{%- if tools %}\n {{- \'<|im_start|>system\\n\' }}\n {%- if messages[0][\'role\'] == \'system\' %}\n {{- messages[0][\'content\'] }}\n {%- else %}\n {{- \'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.\' }}\n {%- endif %}\n {{- "\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>" }}\n {%- for tool in tools %}\n {{- "\\n" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- "\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\"name\\": <function-name>, \\"arguments\\": <args-json-object>}\\n</tool_call><|im_end|>\\n" }}\n{%- else %}\n {%- if messages[0][\'role\'] == \'system\' %}\n {{- \'<|im_start|>system\\n\' + messages[0][\'content\'] + \'<|im_end|>\\n\' }}\n {%- else %}\n {{- \'<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. 
You are a helpful assistant.<|im_end|>\\n\' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}\n {{- \'<|im_start|>\' + message.role + \'\\n\' + message.content + \'<|im_end|>\' + \'\\n\' }}\n {%- elif message.role == "assistant" %}\n {{- \'<|im_start|>\' + message.role }}\n {%- if message.content %}\n {{- \'\\n\' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- \'\\n<tool_call>\\n{"name": "\' }}\n {{- tool_call.name }}\n {{- \'", "arguments": \' }}\n {{- tool_call.arguments | tojson }}\n {{- \'}\\n</tool_call>\' }}\n {%- endfor %}\n {{- \'<|im_end|>\\n\' }}\n {%- elif message.role == "tool" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}\n {{- \'<|im_start|>user\' }}\n {%- endif %}\n {{- \'\\n<tool_response>\\n\' }}\n {{- message.content }}\n {{- \'\\n</tool_response>\' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}\n {{- \'<|im_end|>\\n\' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- \'<|im_start|>assistant\\n\' }}\n{%- endif %}\n"""
|
59
|
+
|
60
|
+
|
61
|
+
@PromptRegistry.register("Qwen/Qwen3")
|
62
|
+
def qwen_3_chat_template():
|
63
|
+
"""
|
64
|
+
Formats chat messages for the Qwen 3 model family, supporting multi-step tool usage, reasoning content, and special XML tags for tool calls and responses.
|
65
|
+
|
66
|
+
This template handles system messages, user and assistant roles, and tool interactions. When tools are provided, it outputs their signatures and instructions for function calls. It tracks the last user query to determine where to insert assistant reasoning content within `<think>` tags. Assistant tool calls are serialized as JSON within `<tool_call>` tags, and tool responses are grouped inside `<tool_response>` tags. Optionally, a generation prompt and empty reasoning block can be added.
|
67
|
+
|
68
|
+
Parameters:
|
69
|
+
tools (optional): List of tool signature objects to be included in the prompt.
|
70
|
+
messages: List of message objects, each with a role and content, and optionally tool_calls or reasoning_content.
|
71
|
+
add_generation_prompt (optional): If true, appends an assistant prompt for generation.
|
72
|
+
enable_thinking (optional): If false, inserts an empty reasoning block in the assistant prompt.
|
73
|
+
"""
|
74
|
+
return """{%- if tools %}\n {{- \'<|im_start|>system\\n\' }}\n {%- if messages[0].role == \'system\' %}\n {{- messages[0].content + \'\\n\\n\' }}\n {%- endif %}\n {{- "# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>" }}\n {%- for tool in tools %}\n {{- "\\n" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- "\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\"name\\": <function-name>, \\"arguments\\": <args-json-object>}\\n</tool_call><|im_end|>\\n" }}\n{%- else %}\n {%- if messages[0].role == \'system\' %}\n {{- \'<|im_start|>system\\n\' + messages[0].content + \'<|im_end|>\\n\' }}\n {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n {%- set index = (messages|length - 1) - loop.index0 %}\n {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith(\'<tool_response>\') and message.content.endswith(\'</tool_response>\')) %}\n {%- set ns.multi_step_tool = false %}\n {%- set ns.last_query_index = index %}\n {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n {%- if message.content is string %}\n {%- set content = message.content %}\n {%- else %}\n {%- set content = \'\' %}\n {%- endif %}\n {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}\n {{- \'<|im_start|>\' + message.role + \'\\n\' + content + \'<|im_end|>\' + \'\\n\' }}\n {%- elif message.role == "assistant" %}\n {%- set reasoning_content = \'\' %}\n {%- if message.reasoning_content is string %}\n {%- set reasoning_content = message.reasoning_content %}\n {%- else %}\n {%- if \'</think>\' in content %}\n {%- set reasoning_content = 
content.split(\'</think>\')[0].rstrip(\'\\n\').split(\'<think>\')[-1].lstrip(\'\\n\') %}\n {%- set content = content.split(\'</think>\')[-1].lstrip(\'\\n\') %}\n {%- endif %}\n {%- endif %}\n {%- if loop.index0 > ns.last_query_index %}\n {%- if loop.last or (not loop.last and reasoning_content) %}\n {{- \'<|im_start|>\' + message.role + \'\\n<think>\\n\' + reasoning_content.strip(\'\\n\') + \'\\n</think>\\n\\n\' + content.lstrip(\'\\n\') }}\n {%- else %}\n {{- \'<|im_start|>\' + message.role + \'\\n\' + content }}\n {%- endif %}\n {%- else %}\n {{- \'<|im_start|>\' + message.role + \'\\n\' + content }}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and content) or (not loop.first) %}\n {{- \'\\n\' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- \'<tool_call>\\n{"name": "\' }}\n {{- tool_call.name }}\n {{- \'", "arguments": \' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- \'}\\n</tool_call>\' }}\n {%- endfor %}\n {%- endif %}\n {{- \'<|im_end|>\\n\' }}\n {%- elif message.role == "tool" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}\n {{- \'<|im_start|>user\' }}\n {%- endif %}\n {{- \'\\n<tool_response>\\n\' }}\n {{- content }}\n {{- \'\\n</tool_response>\' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}\n {{- \'<|im_end|>\\n\' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- \'<|im_start|>assistant\\n\' }}\n {%- if enable_thinking is defined and enable_thinking is false %}\n {{- \'<think>\\n\\n</think>\\n\\n\' }}\n {%- endif %}\n{%- endif %}"""
|
sdg_hub/utils/__init__.py
CHANGED
@@ -0,0 +1,94 @@
|
|
1
|
+
"""Custom exception classes for SDG Hub error handling."""
|
2
|
+
|
3
|
+
|
4
|
+
class SDGHubError(Exception):
    """Root of the SDG Hub exception hierarchy.

    The text of the exception is the message on its own, or the message
    followed by a ``Details: ...`` line when non-empty details are supplied.
    """

    def __init__(self, message: str, details: str = None):
        """Build the error and keep its parts available as attributes.

        Parameters
        ----------
        message : str
            The main error message.
        details : str, optional
            Extra context appended on a second line; omitted from the text
            when falsy (``None`` or an empty string).
        """
        # Only add the "Details" line when there is something to show.
        rendered = f"{message}\nDetails: {details}" if details else message
        super().__init__(rendered)
        # Keep the raw parts so handlers can inspect them separately.
        self.message = message
        self.details = details
|
23
|
+
|
24
|
+
|
25
|
+
class FlowRunnerError(SDGHubError):
    """Parent class for errors reported by the flow runner."""
|
29
|
+
|
30
|
+
|
31
|
+
class DatasetLoadError(FlowRunnerError):
    """Signals that a dataset could not be loaded."""
|
35
|
+
|
36
|
+
|
37
|
+
class FlowConfigurationError(FlowRunnerError):
    """Signals an invalid flow configuration."""
|
41
|
+
|
42
|
+
|
43
|
+
class APIConnectionError(FlowRunnerError):
    """Signals a failed connection to the API."""
|
47
|
+
|
48
|
+
|
49
|
+
class DataGenerationError(FlowRunnerError):
    """Signals a failure while generating data."""
|
53
|
+
|
54
|
+
|
55
|
+
class DataSaveError(FlowRunnerError):
    """Signals a failure while saving generated data."""
|
59
|
+
|
60
|
+
|
61
|
+
class BlockError(SDGHubError):
    """Parent class for errors related to blocks."""
|
65
|
+
|
66
|
+
|
67
|
+
class BlockConfigurationError(BlockError):
    """Signals an invalid block configuration."""
|
71
|
+
|
72
|
+
|
73
|
+
class BlockExecutionError(BlockError):
    """Signals a failure while executing a block."""
|
77
|
+
|
78
|
+
|
79
|
+
class FlowError(SDGHubError):
    """Parent class for errors related to flows."""
|
83
|
+
|
84
|
+
|
85
|
+
class FlowValidationError(FlowError):
    """Signals that flow validation failed."""
|
89
|
+
|
90
|
+
|
91
|
+
class FlowExecutionError(FlowError):
    """Signals that flow execution failed."""
|
@@ -0,0 +1,62 @@
|
|
1
|
+
"""
|
2
|
+
Path resolution utilities for SDG Hub.
|
3
|
+
|
4
|
+
This module provides utilities for resolving file paths relative to one or more
|
5
|
+
base directories, with support for both single directory and multiple directory
|
6
|
+
search paths.
|
7
|
+
"""
|
8
|
+
|
9
|
+
# Standard
|
10
|
+
from typing import List, Union
|
11
|
+
import os
|
12
|
+
|
13
|
+
|
14
|
+
def resolve_path(filename: str, search_dirs: Union[str, List[str]]) -> str:
    """Resolve a file path relative to one or more search directories.

    Resolution order:

    1. An absolute ``filename`` is returned as-is.
    2. Otherwise each directory in ``search_dirs`` is tried in order and the
       first ``directory/filename`` that is an existing regular file wins.
    3. If nothing matches, ``filename`` itself is returned, leaving it to the
       caller to interpret it (typically relative to the current directory).

    Parameters
    ----------
    filename : str
        The path to the file to resolve. Path-like objects (for example
        ``pathlib.Path``) are accepted and converted to ``str``.
    search_dirs : Union[str, List[str]]
        Directory or list of directories in which to search for the file.
        A single directory may be given as a ``str`` or a path-like object;
        list entries may likewise be path-like.

    Returns
    -------
    str
        Resolved file path.

    Examples
    --------
    With ``/path2/config.yaml`` present and ``/path1/config.yaml`` absent,
    ``resolve_path("config.yaml", ["/path1", "/path2"])`` returns
    ``'/path2/config.yaml'``. ``resolve_path("/abs/file.yaml", ["/path1"])``
    returns ``'/abs/file.yaml'`` whether or not the file exists.
    """
    # Normalise path-like input up front so every branch returns a plain str.
    filename = os.fspath(filename)

    # Absolute paths bypass the search entirely.
    if os.path.isabs(filename):
        return filename

    # A lone str or path-like object is ONE directory. Without the PathLike
    # check a pathlib.Path would reach the loop below and fail, because Path
    # objects are not iterable.
    if isinstance(search_dirs, (str, os.PathLike)):
        search_dirs = [search_dirs]

    # First existing regular file wins; directories with the same name are
    # ignored because only os.path.isfile matches count.
    for directory in search_dirs:
        candidate = os.path.join(os.fspath(directory), filename)
        if os.path.isfile(candidate):
            return candidate

    # Not found anywhere: hand the name back unchanged.
    return filename
|