DeepFabric 4.4.0 (deepfabric-4.4.0-py3-none-any.whl)
This diff shows the contents of a publicly available package version as released to one of the supported registries. It is provided for informational purposes only and reflects the files as they appear in the public registry.
- deepfabric/__init__.py +70 -0
- deepfabric/__main__.py +6 -0
- deepfabric/auth.py +382 -0
- deepfabric/builders.py +303 -0
- deepfabric/builders_agent.py +1304 -0
- deepfabric/cli.py +1288 -0
- deepfabric/config.py +899 -0
- deepfabric/config_manager.py +251 -0
- deepfabric/constants.py +94 -0
- deepfabric/dataset_manager.py +534 -0
- deepfabric/error_codes.py +581 -0
- deepfabric/evaluation/__init__.py +47 -0
- deepfabric/evaluation/backends/__init__.py +32 -0
- deepfabric/evaluation/backends/ollama_backend.py +137 -0
- deepfabric/evaluation/backends/tool_call_parsers.py +409 -0
- deepfabric/evaluation/backends/transformers_backend.py +326 -0
- deepfabric/evaluation/evaluator.py +845 -0
- deepfabric/evaluation/evaluators/__init__.py +13 -0
- deepfabric/evaluation/evaluators/base.py +104 -0
- deepfabric/evaluation/evaluators/builtin/__init__.py +5 -0
- deepfabric/evaluation/evaluators/builtin/tool_calling.py +93 -0
- deepfabric/evaluation/evaluators/registry.py +66 -0
- deepfabric/evaluation/inference.py +155 -0
- deepfabric/evaluation/metrics.py +397 -0
- deepfabric/evaluation/parser.py +304 -0
- deepfabric/evaluation/reporters/__init__.py +13 -0
- deepfabric/evaluation/reporters/base.py +56 -0
- deepfabric/evaluation/reporters/cloud_reporter.py +195 -0
- deepfabric/evaluation/reporters/file_reporter.py +61 -0
- deepfabric/evaluation/reporters/multi_reporter.py +56 -0
- deepfabric/exceptions.py +67 -0
- deepfabric/factory.py +26 -0
- deepfabric/generator.py +1084 -0
- deepfabric/graph.py +545 -0
- deepfabric/hf_hub.py +214 -0
- deepfabric/kaggle_hub.py +219 -0
- deepfabric/llm/__init__.py +41 -0
- deepfabric/llm/api_key_verifier.py +534 -0
- deepfabric/llm/client.py +1206 -0
- deepfabric/llm/errors.py +105 -0
- deepfabric/llm/rate_limit_config.py +262 -0
- deepfabric/llm/rate_limit_detector.py +278 -0
- deepfabric/llm/retry_handler.py +270 -0
- deepfabric/metrics.py +212 -0
- deepfabric/progress.py +262 -0
- deepfabric/prompts.py +290 -0
- deepfabric/schemas.py +1000 -0
- deepfabric/spin/__init__.py +6 -0
- deepfabric/spin/client.py +263 -0
- deepfabric/spin/models.py +26 -0
- deepfabric/stream_simulator.py +90 -0
- deepfabric/tools/__init__.py +5 -0
- deepfabric/tools/defaults.py +85 -0
- deepfabric/tools/loader.py +87 -0
- deepfabric/tools/mcp_client.py +677 -0
- deepfabric/topic_manager.py +303 -0
- deepfabric/topic_model.py +20 -0
- deepfabric/training/__init__.py +35 -0
- deepfabric/training/api_key_prompt.py +302 -0
- deepfabric/training/callback.py +363 -0
- deepfabric/training/metrics_sender.py +301 -0
- deepfabric/tree.py +438 -0
- deepfabric/tui.py +1267 -0
- deepfabric/update_checker.py +166 -0
- deepfabric/utils.py +150 -0
- deepfabric/validation.py +143 -0
- deepfabric-4.4.0.dist-info/METADATA +702 -0
- deepfabric-4.4.0.dist-info/RECORD +71 -0
- deepfabric-4.4.0.dist-info/WHEEL +4 -0
- deepfabric-4.4.0.dist-info/entry_points.txt +2 -0
- deepfabric-4.4.0.dist-info/licenses/LICENSE +201 -0
deepfabric/dataset_manager.py
@@ -0,0 +1,534 @@
import asyncio
import contextlib
import json
import os
import traceback

from collections.abc import AsyncIterator
from datetime import datetime, timezone
from typing import TYPE_CHECKING, Any

from datasets import Dataset as HFDataset
from rich.layout import Layout
from rich.live import Live

from .config import DeepFabricConfig
from .config_manager import DEFAULT_MODEL
from .exceptions import ConfigurationError
from .generator import DataSetGenerator
from .progress import ProgressReporter
from .tui import STREAM_PANEL_WIDTH, get_dataset_tui, get_tui
from .utils import ensure_not_running_loop

# Lazy/defensive access to TUI settings to avoid early import issues
def _get_tui_settings():
    try:
        from .tui import get_tui_settings as _gts  # noqa: PLC0415

        return _gts()
    except Exception:

        class _S:
            mode = "rich"

        return _S()


def _get_preview_lines() -> int:
    try:
        from .tui import get_preview_lines as _gpl  # noqa: PLC0415

        return _gpl()
    except Exception:
        return 16


if TYPE_CHECKING:
    from .topic_model import TopicModel

# Constants for debug output
DEBUG_MAX_FAILURES_TO_SHOW = 10

async def handle_dataset_events_async(
    generator: AsyncIterator[dict | HFDataset], engine=None, debug: bool = False
) -> HFDataset | None:
    """Handle dataset generation with TUI progress and streaming feedback."""
    tui = get_dataset_tui()
    footer_prog = None
    task = None
    live = None
    simple_task = None

    final_result: HFDataset | None = None
    try:
        async for event in generator:
            if isinstance(event, dict) and "event" in event:
                if event["event"] == "generation_start":
                    settings = _get_tui_settings()
                    # Build header and params panels for layout
                    header_panel, params_panel = tui.build_generation_panels(
                        event["model_name"], event["num_steps"], event["batch_size"]
                    )
                    # Capture context for the run
                    tui.root_topic_prompt = event.get("root_topic_prompt")
                    tui.topic_model_type = event.get("topic_model_type")

                    if settings.mode == "rich":
                        # Initialize status tracking
                        tui.init_status(
                            total_steps=event["num_steps"],
                            total_samples=event["total_samples"],
                        )

                        # Build layout with footer card
                        layout = Layout(name="root")
                        layout.split(Layout(name="main"), Layout(name="footer", size=3))
                        left = Layout(name="left", ratio=3)
                        right = Layout(name="right", ratio=2)
                        right.minimum_size = STREAM_PANEL_WIDTH
                        # Right column: status on top, streaming preview fills remaining space
                        right.split(
                            Layout(name="status", size=8),
                            Layout(name="preview"),
                        )
                        left.split(
                            Layout(name="header", size=4),
                            Layout(name="params", size=6),
                            Layout(name="context", size=5),
                            Layout(name="events"),
                        )
                        left["header"].update(header_panel)
                        left["params"].update(params_panel)
                        left["context"].update(tui._context_panel())
                        left["events"].update(tui.tui.build_events_panel([], title="Events"))
                        right["status"].update(tui._status_panel())
                        right["preview"].update(
                            tui.tui.build_stream_panel("Waiting for LLM output...")
                        )
                        layout["main"].split_row(left, right)

                        # Footer run status
                        footer_prog = tui.tui.create_footer(layout, title="Run Status")
                        task = footer_prog.add_task(
                            "Generating dataset samples", total=event["total_samples"]
                        )

                        # Use alternate screen to avoid scroll trails; leave a clean terminal
                        live = Live(
                            layout,
                            console=tui.console,
                            refresh_per_second=15,
                            screen=True,
                        )
                        tui.live_display = live  # Give TUI reference to update it
                        tui.live_layout = layout  # Allow TUI to update panes
                        live.start()
                    else:
                        # Simple/headless mode: print and proceed without Live
                        tui.show_generation_header(
                            event["model_name"], event["num_steps"], event["batch_size"]
                        )
                        simple_task = {"count": 0, "total": event["total_samples"]}
                elif event["event"] == "step_complete":
                    samples_generated = event.get("samples_generated", 0)
                    if footer_prog and task is not None:
                        if samples_generated > 0:
                            with contextlib.suppress(Exception):
                                footer_prog.update(task, advance=samples_generated)
                            tui.log_event(f"✓ Generated +{samples_generated} samples")
                        # Update status totals
                        tui.status_step_complete(
                            samples_generated, int(event.get("failed_in_step", 0))
                        )
                    elif isinstance(simple_task, dict):
                        simple_task["count"] += samples_generated
                        failed_in_step = int(event.get("failed_in_step", 0))
                        retry_summary = tui.get_step_retry_summary()

                        # Build step summary message
                        step_msg = f"Step {event.get('step')}: +{samples_generated}"
                        if failed_in_step > 0:
                            step_msg += f" (-{failed_in_step} failed)"
                        step_msg += f" (total {simple_task['count']}/{simple_task['total']})"

                        # Display with appropriate style based on failures
                        if failed_in_step > 0:
                            tui.warning(step_msg)
                        else:
                            tui.info(step_msg)

                        # Show retry summary if there were retries
                        if retry_summary:
                            tui.console.print(f" [dim]{retry_summary}[/dim]")

                        # Clear retries for next step
                        tui.clear_step_retries()
                elif event["event"] == "step_start":
                    # Keep status panel in sync
                    step = int(event.get("step", 0))
                    total = int(event.get("total_steps", 0))
                    tui.status_step_start(step, total)

                elif event["event"] == "generation_complete":
                    if live:
                        live.stop()
                        tui.console.print()  # Add blank line after live display
                    tui.success(f"Successfully generated {event['total_samples']} samples")
                    tui.log_event(
                        f"Done • total={event['total_samples']} failed={event['failed_samples']}"
                    )
                    if event["failed_samples"] > 0:
                        tui.warning(f"Failed to generate {event['failed_samples']} samples")

                    # Show detailed failure information in debug mode
                    if debug and engine and hasattr(engine, "failed_samples"):
                        get_tui().error("\n🔍 Debug: Dataset generation failures:")
                        for idx, failure in enumerate(
                            engine.failed_samples[:DEBUG_MAX_FAILURES_TO_SHOW], 1
                        ):
                            get_tui().error(f" [{idx}] {failure}")
                        if len(engine.failed_samples) > DEBUG_MAX_FAILURES_TO_SHOW:
                            remaining = len(engine.failed_samples) - DEBUG_MAX_FAILURES_TO_SHOW
                            get_tui().error(f" ... and {remaining} more failures")

            elif isinstance(event, HFDataset):
                final_result = event
            else:
                # Handle unexpected non-dict, non-Dataset events
                get_tui().warning(f"Unexpected event type: {type(event)}")
    except Exception as e:
        if live:
            live.stop()
        if debug:
            get_tui().error(f"🔍 Debug: Full traceback:\n{traceback.format_exc()}")
        get_tui().error(f"Dataset generation failed: {str(e)}")
        raise

    return final_result

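For reference, handle_dataset_events_async expects its generator to interleave progress-event dicts with a final datasets.Dataset. A minimal sketch of that contract, using a hypothetical stub generator (not part of the package) whose event names and keys mirror the ones the handler reads above:

# Sketch only: a stub event stream illustrating the protocol consumed by
# handle_dataset_events_async. The stub generator is hypothetical.
import asyncio
from datasets import Dataset

async def stub_events():
    yield {
        "event": "generation_start",
        "model_name": "stub-model",
        "num_steps": 1,
        "batch_size": 1,
        "total_samples": 1,
    }
    yield {"event": "step_start", "step": 1, "total_steps": 1}
    yield {"event": "step_complete", "step": 1, "samples_generated": 1, "failed_in_step": 0}
    yield {"event": "generation_complete", "total_samples": 1, "failed_samples": 0}
    # The final non-dict item becomes the handler's return value
    yield Dataset.from_list([{"messages": [{"role": "user", "content": "hi"}]}])

# dataset = asyncio.run(handle_dataset_events_async(stub_events()))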
def handle_dataset_events(generator, engine=None, debug: bool = False) -> HFDataset | None:
    """Synchronous wrapper for async dataset event handling."""
    ensure_not_running_loop("handle_dataset_events")
    return asyncio.run(handle_dataset_events_async(generator, engine=engine, debug=debug))


def create_dataset(
    engine: DataSetGenerator,
    topic_model: "TopicModel",
    config: DeepFabricConfig,
    num_samples: int | None = None,
    batch_size: int | None = None,
    include_system_message: bool | None = None,
    provider: str | None = None,  # noqa: ARG001
    model: str | None = None,
    generation_overrides: dict | None = None,
    debug: bool = False,
) -> HFDataset:
    """
    Create dataset using the data engine and topic model.

    Args:
        engine: DataSetGenerator instance
        topic_model: TopicModel (Tree or Graph) to use for generation
        config: DeepFabricConfig object
        num_samples: Override for number of samples
        batch_size: Override for batch size
        include_system_message: Override for including system message
        provider: Override for LLM provider
        model: Override for model name
        generation_overrides: Additional generation parameter overrides

    Returns:
        Generated HuggingFace Dataset object

    Raises:
        ConfigurationError: If dataset generation fails
    """
    ensure_not_running_loop("create_dataset")
    return asyncio.run(
        create_dataset_async(
            engine=engine,
            topic_model=topic_model,
            config=config,
            num_samples=num_samples,
            batch_size=batch_size,
            include_system_message=include_system_message,
            provider=provider,
            model=model,
            generation_overrides=generation_overrides,
            debug=debug,
        )
    )

async def create_dataset_async(
    engine: DataSetGenerator,
    topic_model: "TopicModel",
    config: DeepFabricConfig,
    num_samples: int | None = None,
    batch_size: int | None = None,
    include_system_message: bool | None = None,
    provider: str | None = None,  # noqa: ARG001
    model: str | None = None,
    generation_overrides: dict | None = None,
    debug: bool = False,
) -> HFDataset:
    output_config = config.get_output_config()

    final_num_samples = num_samples or output_config["num_samples"]
    final_batch_size = batch_size or output_config["batch_size"]

    generation_params = config.get_generation_params(**(generation_overrides or {}))
    final_model = model or generation_params.get("model_name", DEFAULT_MODEL)

    # Create progress reporter and attach TUI as observer for streaming feedback
    progress_reporter = ProgressReporter()
    tui = get_dataset_tui()
    progress_reporter.attach(tui)

    # Attach progress reporter to engine
    engine.progress_reporter = progress_reporter

    try:
        generator = engine.create_data_with_events_async(
            num_steps=final_num_samples,
            batch_size=final_batch_size,
            topic_model=topic_model,
            model_name=final_model,
            sys_msg=include_system_message,
            num_example_demonstrations=output_config.get("num_example_demonstrations") or 3,
        )
        dataset = await handle_dataset_events_async(generator, engine=engine, debug=debug)
    except Exception as e:  # noqa: BLE001
        raise ConfigurationError(f"Error creating dataset: {str(e)}") from e

    if dataset is None:
        raise ConfigurationError("Dataset generation failed - no dataset returned")

    return dataset

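Both entry points drive the same pipeline; which one to call depends on whether an event loop is already running. A hedged usage sketch, assuming engine, topic_model, and config objects were constructed elsewhere (their construction lives in other deepfabric modules, not shown here):

# Sketch: choosing between the sync facade and the async coroutine.
# engine, topic_model, and config are assumed to exist already.
dataset = create_dataset(engine, topic_model, config, num_samples=100)

# Inside an already-running event loop, ensure_not_running_loop would reject
# the sync call; await the async variant instead:
# dataset = await create_dataset_async(engine, topic_model, config, num_samples=100)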
def _upload_to_service(
    service_name: str,
    dataset_path: str,
    config: dict,
    credential_check_func,
    uploader_import_func,
    uploader_args_func,
    push_args_func,
    tui,
) -> None:
    """Generic function to upload dataset to any configured service."""
    try:
        tui.info(f"Uploading dataset to {service_name}...")

        # Check credentials
        credentials = credential_check_func()
        if not credentials:
            return

        # Import uploader class
        uploader_class = uploader_import_func()

        # Create uploader instance
        uploader_args = uploader_args_func(credentials)
        uploader = (
            uploader_class(*uploader_args)
            if isinstance(uploader_args, tuple)
            else uploader_class(**uploader_args)
        )

        # Prepare push arguments
        push_args = push_args_func(config, dataset_path)

        # Upload dataset
        result = uploader.push_to_hub(**push_args)

        if result["status"] == "success":
            tui.success(result["message"])
        else:
            tui.warning(f"{service_name} upload failed: {result['message']}")

    except Exception as e:
        tui.warning(f"Error uploading to {service_name}: {str(e)}")

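_upload_to_service parameterizes the upload flow with four callables (credential check, lazy uploader import, constructor args, push args), so adding a destination only means supplying those hooks, as the two concrete wrappers below do. A hypothetical sketch for a third service; ExampleUploader, example_hub, and EXAMPLE_TOKEN are stand-ins and do not exist in the package:

# Hypothetical: wiring a new destination into _upload_to_service.
def _upload_to_example(dataset_path: str, example_config: dict, tui) -> None:
    def check_credentials():
        return os.getenv("EXAMPLE_TOKEN")  # returning None skips the upload

    def import_uploader():
        from .example_hub import ExampleUploader  # hypothetical module

        return ExampleUploader

    def get_uploader_args(credentials):
        return (credentials,)

    def get_push_args(config, dataset_path):
        # push_to_hub(**push_args) must return {"status": ..., "message": ...}
        return {"repo": config["repository"], "jsonl_file_path": dataset_path}

    _upload_to_service(
        "Example Hub",
        dataset_path,
        example_config,
        check_credentials,
        import_uploader,
        get_uploader_args,
        get_push_args,
        tui,
    )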
def _upload_to_huggingface(dataset_path: str, hf_config: dict, tui) -> None:
    """Upload dataset to HuggingFace Hub if configured."""

    def check_credentials():
        token = os.getenv("HF_TOKEN")
        if not token:
            tui.warning("HF_TOKEN not set. Skipping HuggingFace upload.")
            return None
        return token

    def import_uploader():
        from .hf_hub import HFUploader  # noqa: PLC0415

        return HFUploader

    def get_uploader_args(credentials):
        return (credentials,)  # HFUploader takes token as single argument

    def get_push_args(config, dataset_path):
        return {
            "hf_dataset_repo": config["repository"],
            "jsonl_file_path": dataset_path,
            "tags": config.get("tags", []),
        }

    _upload_to_service(
        "HuggingFace Hub",
        dataset_path,
        hf_config,
        check_credentials,
        import_uploader,
        get_uploader_args,
        get_push_args,
        tui,
    )

def _upload_to_kaggle(dataset_path: str, kaggle_config: dict, tui) -> None:
    """Upload dataset to Kaggle if configured."""

    def check_credentials():
        username = os.getenv("KAGGLE_USERNAME")
        key = os.getenv("KAGGLE_KEY")
        if not username or not key:
            tui.warning("KAGGLE_USERNAME or KAGGLE_KEY not set. Skipping Kaggle upload.")
            return None
        return (username, key)

    def import_uploader():
        from .kaggle_hub import KaggleUploader  # noqa: PLC0415

        return KaggleUploader

    def get_uploader_args(credentials):
        return credentials  # KaggleUploader takes username, key as tuple

    def get_push_args(config, dataset_path):
        return {
            "dataset_handle": config["handle"],
            "jsonl_file_path": dataset_path,
            "tags": config.get("tags", []),
            "version_notes": config.get("version_notes"),
            "description": config.get("description"),
        }

    _upload_to_service(
        "Kaggle",
        dataset_path,
        kaggle_config,
        check_credentials,
        import_uploader,
        get_uploader_args,
        get_push_args,
        tui,
    )

def _strip_nulls(obj: Any) -> Any:
    """Recursively strip null values from nested dicts and lists.

    HuggingFace Dataset's Arrow schema injects null for missing fields across rows.
    This function removes those nulls for clean JSON output.
    """
    if isinstance(obj, dict):
        return {k: _strip_nulls(v) for k, v in obj.items() if v is not None}
    if isinstance(obj, list):
        return [_strip_nulls(item) for item in obj]
    return obj

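The effect of _strip_nulls on a typical Arrow-padded row, as a small behavior sketch (the row contents are illustrative):

# None-valued dict keys vanish at any depth; list items are recursed into
# but not filtered (only dict values are dropped).
row = {
    "question": "2+2?",
    "answer": "4",
    "tools": None,
    "meta": {"source": "stub", "score": None},
}
assert _strip_nulls(row) == {"question": "2+2?", "answer": "4", "meta": {"source": "stub"}}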
def _save_jsonl_without_nulls(dataset: HFDataset, save_path: str) -> None:
    """Save HF Dataset to JSONL, stripping null values injected by Arrow schema."""
    with open(save_path, "w") as f:
        for row in dataset:
            cleaned = _strip_nulls(dict(row))
            f.write(json.dumps(cleaned, separators=(",", ":")) + "\n")

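The nulls exist because the datasets library unifies every row under a single Arrow schema, padding fields a row lacks with None. A quick illustration of why the stripping pass is needed:

# Dataset.from_list merges the two rows' schemas, so each row gains the
# other's field with a None value.
from datasets import Dataset

ds = Dataset.from_list([{"a": 1}, {"b": 2}])
print(ds[0])  # {'a': 1, 'b': None}
print(ds[1])  # {'a': None, 'b': 2}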
def _save_failed_samples(save_path: str, failed_samples: list, tui) -> None:
    """Save failed samples to a timestamped file alongside the main dataset.

    Args:
        save_path: Path to the main dataset file (e.g., "my-dataset.jsonl")
        failed_samples: List of failed samples - can be dicts with 'error' and 'raw_content' keys,
            or plain strings/other types for legacy compatibility
        tui: TUI instance for output
    """
    # Generate timestamped filename: my-dataset.jsonl -> my-dataset_failures_20231130_143022.jsonl
    base_path = save_path.rsplit(".", 1)[0] if "." in save_path else save_path
    timestamp = datetime.now(tz=timezone.utc).strftime("%Y%m%d_%H%M%S")
    failures_path = f"{base_path}_failures_{timestamp}.jsonl"

    try:
        with open(failures_path, "w") as f:
            for idx, failure in enumerate(failed_samples):
                # Structure each failure as a JSON object with metadata
                failure_record = {
                    "index": idx,
                    "timestamp": datetime.now(tz=timezone.utc).isoformat(),
                }
                if isinstance(failure, dict):
                    # New format: dict with 'error' and optionally 'raw_content'
                    failure_record["error"] = failure.get("error", str(failure))
                    if "raw_content" in failure:
                        failure_record["raw_content"] = failure["raw_content"]
                else:
                    # Legacy format: plain string or other type
                    failure_record["error"] = str(failure)
                f.write(json.dumps(failure_record) + "\n")
        tui.warning(f"Failed samples saved to: {failures_path} ({len(failed_samples)} failures)")
    except Exception as e:
        tui.error(f"Could not save failed samples: {str(e)}")

def save_dataset(
    dataset: HFDataset,
    save_path: str,
    config: DeepFabricConfig | None = None,
    engine: DataSetGenerator | None = None,
) -> None:
    """
    Save dataset to file.

    Args:
        dataset: HuggingFace Dataset object to save
        save_path: Path where to save the dataset
        config: Optional configuration for upload settings
        engine: Optional DataSetGenerator to save failed samples from

    Raises:
        ConfigurationError: If saving fails
    """
    tui = get_tui()
    try:
        # Save the dataset as JSONL, stripping null values injected by HF Dataset
        # HuggingFace Dataset's Arrow schema adds null for missing fields across rows,
        # but we want clean output without null values for optional fields
        _save_jsonl_without_nulls(dataset, save_path)
        tui.success(f"Dataset saved to: {save_path}")

        # Save failed samples if engine has any
        if engine and engine.failed_samples:
            _save_failed_samples(save_path, engine.failed_samples, tui)

        # Handle automatic uploads if configured
        if config:
            # HuggingFace upload
            if config.huggingface:
                _upload_to_huggingface(save_path, config.get_huggingface_config(), tui)

            # Kaggle upload
            if config.kaggle:
                _upload_to_kaggle(save_path, config.get_kaggle_config(), tui)

    except Exception as e:
        raise ConfigurationError(f"Error saving dataset: {str(e)}") from e
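Putting the pieces together, a hedged end-to-end sketch; construction of engine, topic_model, and config is defined in other deepfabric modules and assumed here:

# Sketch: generate a dataset and persist it, including configured uploads.
dataset = create_dataset(engine, topic_model, config)
save_dataset(dataset, "my-dataset.jsonl", config=config, engine=engine)
# Writes my-dataset.jsonl, plus my-dataset_failures_<timestamp>.jsonl when
# engine.failed_samples is non-empty, then uploads to HuggingFace Hub and/or
# Kaggle if the config enables them.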