DeepFabric 4.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71) hide show
  1. deepfabric/__init__.py +70 -0
  2. deepfabric/__main__.py +6 -0
  3. deepfabric/auth.py +382 -0
  4. deepfabric/builders.py +303 -0
  5. deepfabric/builders_agent.py +1304 -0
  6. deepfabric/cli.py +1288 -0
  7. deepfabric/config.py +899 -0
  8. deepfabric/config_manager.py +251 -0
  9. deepfabric/constants.py +94 -0
  10. deepfabric/dataset_manager.py +534 -0
  11. deepfabric/error_codes.py +581 -0
  12. deepfabric/evaluation/__init__.py +47 -0
  13. deepfabric/evaluation/backends/__init__.py +32 -0
  14. deepfabric/evaluation/backends/ollama_backend.py +137 -0
  15. deepfabric/evaluation/backends/tool_call_parsers.py +409 -0
  16. deepfabric/evaluation/backends/transformers_backend.py +326 -0
  17. deepfabric/evaluation/evaluator.py +845 -0
  18. deepfabric/evaluation/evaluators/__init__.py +13 -0
  19. deepfabric/evaluation/evaluators/base.py +104 -0
  20. deepfabric/evaluation/evaluators/builtin/__init__.py +5 -0
  21. deepfabric/evaluation/evaluators/builtin/tool_calling.py +93 -0
  22. deepfabric/evaluation/evaluators/registry.py +66 -0
  23. deepfabric/evaluation/inference.py +155 -0
  24. deepfabric/evaluation/metrics.py +397 -0
  25. deepfabric/evaluation/parser.py +304 -0
  26. deepfabric/evaluation/reporters/__init__.py +13 -0
  27. deepfabric/evaluation/reporters/base.py +56 -0
  28. deepfabric/evaluation/reporters/cloud_reporter.py +195 -0
  29. deepfabric/evaluation/reporters/file_reporter.py +61 -0
  30. deepfabric/evaluation/reporters/multi_reporter.py +56 -0
  31. deepfabric/exceptions.py +67 -0
  32. deepfabric/factory.py +26 -0
  33. deepfabric/generator.py +1084 -0
  34. deepfabric/graph.py +545 -0
  35. deepfabric/hf_hub.py +214 -0
  36. deepfabric/kaggle_hub.py +219 -0
  37. deepfabric/llm/__init__.py +41 -0
  38. deepfabric/llm/api_key_verifier.py +534 -0
  39. deepfabric/llm/client.py +1206 -0
  40. deepfabric/llm/errors.py +105 -0
  41. deepfabric/llm/rate_limit_config.py +262 -0
  42. deepfabric/llm/rate_limit_detector.py +278 -0
  43. deepfabric/llm/retry_handler.py +270 -0
  44. deepfabric/metrics.py +212 -0
  45. deepfabric/progress.py +262 -0
  46. deepfabric/prompts.py +290 -0
  47. deepfabric/schemas.py +1000 -0
  48. deepfabric/spin/__init__.py +6 -0
  49. deepfabric/spin/client.py +263 -0
  50. deepfabric/spin/models.py +26 -0
  51. deepfabric/stream_simulator.py +90 -0
  52. deepfabric/tools/__init__.py +5 -0
  53. deepfabric/tools/defaults.py +85 -0
  54. deepfabric/tools/loader.py +87 -0
  55. deepfabric/tools/mcp_client.py +677 -0
  56. deepfabric/topic_manager.py +303 -0
  57. deepfabric/topic_model.py +20 -0
  58. deepfabric/training/__init__.py +35 -0
  59. deepfabric/training/api_key_prompt.py +302 -0
  60. deepfabric/training/callback.py +363 -0
  61. deepfabric/training/metrics_sender.py +301 -0
  62. deepfabric/tree.py +438 -0
  63. deepfabric/tui.py +1267 -0
  64. deepfabric/update_checker.py +166 -0
  65. deepfabric/utils.py +150 -0
  66. deepfabric/validation.py +143 -0
  67. deepfabric-4.4.0.dist-info/METADATA +702 -0
  68. deepfabric-4.4.0.dist-info/RECORD +71 -0
  69. deepfabric-4.4.0.dist-info/WHEEL +4 -0
  70. deepfabric-4.4.0.dist-info/entry_points.txt +2 -0
  71. deepfabric-4.4.0.dist-info/licenses/LICENSE +201 -0
@@ -0,0 +1,702 @@
1
+ Metadata-Version: 2.4
2
+ Name: DeepFabric
3
+ Version: 4.4.0
4
+ Summary: Curate High Quality Datasets, Train, Evaluate and Ship
5
+ Author-email: Luke Hinds <luke@alwaysfurther.ai>
6
+ License-File: LICENSE
7
+ Requires-Python: >=3.10
8
+ Requires-Dist: accelerate>=0.20.0
9
+ Requires-Dist: anthropic>=0.75.0
10
+ Requires-Dist: click>=8.1.7
11
+ Requires-Dist: componentize-py>=0.19.3
12
+ Requires-Dist: datasets<5.0,>=3.0
13
+ Requires-Dist: google-api-core>=2.0.0
14
+ Requires-Dist: google-genai>=1.56.0
15
+ Requires-Dist: huggingface-hub==0.36.0
16
+ Requires-Dist: kagglehub>=0.3.0
17
+ Requires-Dist: mermaid-py>=0.8.0
18
+ Requires-Dist: ollama>=0.6.1
19
+ Requires-Dist: openai>=1.107.2
20
+ Requires-Dist: outlines==1.2.9
21
+ Requires-Dist: packaging>=25.0
22
+ Requires-Dist: peft>=0.7.0
23
+ Requires-Dist: posthog>=3.0.0
24
+ Requires-Dist: protobuf>=3.20.0
25
+ Requires-Dist: pydantic>=2.0.0
26
+ Requires-Dist: pyyaml>=6.0.1
27
+ Requires-Dist: rich>=13.0.0
28
+ Requires-Dist: sentencepiece>=0.1.99
29
+ Requires-Dist: spin-sdk>=3.4.1
30
+ Requires-Dist: torch>=2.4.0
31
+ Requires-Dist: transformers>=4.57.1
32
+ Provides-Extra: dev
33
+ Requires-Dist: bandit>=1.7.10; extra == 'dev'
34
+ Requires-Dist: mermaid-py>=0.2.0; extra == 'dev'
35
+ Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
36
+ Requires-Dist: pytest-mock>=3.10.0; extra == 'dev'
37
+ Requires-Dist: pytest>=7.0.0; extra == 'dev'
38
+ Requires-Dist: requests-mock>=1.11.0; extra == 'dev'
39
+ Requires-Dist: ruff>=0.1.0; extra == 'dev'
40
+ Provides-Extra: docs
41
+ Requires-Dist: mkdocs-material>=9.0.0; extra == 'docs'
42
+ Requires-Dist: mkdocstrings[python]>=0.30.0; extra == 'docs'
43
+ Description-Content-Type: text/markdown
44
+
45
+ <div align="center">
46
+ <picture>
47
+ <source media="(prefers-color-scheme: dark)" srcset="./assets/logo-light.png" />
48
+ <img alt="DeepFabric logo" src="./assets/logo-light-hols.png" style="width:40%;max-width:40%;height:auto;display:block;margin:0 auto;" />
49
+ </picture>
50
+ <h3>Training Model Behavior in Agentic Systems</h3>
51
+
52
+ <!-- CTA Buttons -->
53
+ <p>
54
+ <a href="https://github.com/always-further/deepfabric/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22">
55
+ <img src="https://img.shields.io/badge/Contribute-Good%20First%20Issues-green?style=for-the-badge&logo=github" alt="Good First Issues"/>
56
+ </a>
57
+ &nbsp;
58
+ <a href="https://discord.gg/pPcjYzGvbS">
59
+ <img src="https://img.shields.io/badge/Chat-Join%20Discord-7289da?style=for-the-badge&logo=discord&logoColor=white" alt="Join Discord"/>
60
+ </a>
61
+ </p>
62
+
63
+ <!-- Badges -->
64
+ <p>
65
+ <a href="https://opensource.org/licenses/Apache-2.0">
66
+ <img src="https://img.shields.io/badge/License-Apache%202.0-blue.svg" alt="License"/>
67
+ </a>
68
+ <a href="https://github.com/always-further/deepfabric/actions/workflows/test.yml">
69
+ <img src="https://github.com/always-further/deepfabric/actions/workflows/test.yml/badge.svg" alt="CI Status"/>
70
+ </a>
71
+ <a href="https://pypi.org/project/deepfabric/">
72
+ <img src="https://img.shields.io/pypi/v/deepfabric.svg" alt="PyPI Version"/>
73
+ </a>
74
+ <a href="https://pepy.tech/project/deepfabric">
75
+ <img src="https://static.pepy.tech/badge/deepfabric" alt="Downloads"/>
76
+ </a>
77
+ <a href="https://discord.gg/pPcjYzGvbS">
78
+ <img src="https://img.shields.io/discord/1384081906773131274?color=7289da&label=Discord&logo=discord&logoColor=white" alt="Discord"/>
79
+ </a>
80
+ </p>
81
+ </div>
82
+
83
+ **DeepFabric** generates synthetic training data for language models and agent evaluations. By combining reasoning traces with tool-calling patterns, it creates high-quality, domain-specific datasets that teach models to think, plan, and act effectively, call tools correctly, and conform to strict schema structures.
84
+
85
+ What sets DeepFabric apart from other dataset generation tools is its ability to ensure high diversity yet domain-anchored relevance through unique topic graph generation algorithms. This guides sample creation to cover all necessary subtopics while avoiding redundancy, which is where other tools often fall short, resulting in model overfitting.
86
+
87
+ <img src="/assets/df-demo.gif" width="100%" height="100%"/>
88
+
89
+ Constrained decoding and response validation, along with real tool executions within isolated WebAssembly environments, ensure that generated samples strictly adhere to structured schemas, variable constraints, and execution correctness, so datasets have the exact syntax and structure required by model training pipelines. Tool definitions can either be imported directly from MCP (Model Context Protocol) server schemas and automatically mocked as real-life interfaces, or drawn from a standard set of common tools (`list_files()`, `read_file()`, etc.).
90
+
91
+ Once your dataset is generated, it can be automatically uploaded to Hugging Face and directly imported into popular training frameworks like TRL, Unsloth, and Axolotl.
92
+
93
+ Post-training, DeepFabric's built-in evaluation engine assesses model performance, whereby models prove their capabilities on unseen tasks derived from training splits—covering evaluation-only questions, answers, and tool traces.
94
+
95
+ ## Quickstart
96
+
97
+ DeepFabric can be used in several ways, as a library, CLI tool, or via YAML configuration. Here's a quick example using the CLI:
98
+
99
+ ```bash
100
+ pip install deepfabric
101
+ ```
102
+
103
+ ```bash
104
+ export OPENAI_API_KEY="your-api-key"
105
+
106
+ deepfabric generate \
107
+ --topic-prompt "Python programming fundamentals" \
108
+ --generation-system-prompt "You are a Python expert" \
109
+ --mode graph \
110
+ --depth 3 \
111
+ --degree 3 \
112
+ --num-samples 27 \
113
+ --batch-size 3 \
114
+ --provider openai \
115
+ --model gpt-4o \
116
+ --output-save-as dataset.jsonl
117
+ ```
118
+
119
+ This generates a topic graph and creates 27 unique nodes, then generates 27 training samples saved to `dataset.jsonl`, giving you 100% topic coverage.
120
+
121
+ ## Configuration
122
+
123
+ DeepFabric also uses YAML configuration with three main sections and optional shared LLM defaults:
124
+
125
+ ```yaml
126
+ # Optional: Shared LLM defaults (inherited by topics and generation)
127
+ llm:
128
+ provider: "openai"
129
+ model: "gpt-4o"
130
+ temperature: 0.7
131
+
132
+ # TOPICS: Generate the topic tree/graph
133
+ topics:
134
+ prompt: "Building production-ready REST APIs with Python"
135
+ mode: tree # tree | graph
136
+ depth: 3
137
+ degree: 3
138
+ save_as: "topics.jsonl"
139
+ # Optional: Override shared LLM settings
140
+ llm:
141
+ model: "gpt-4o-mini" # Use cheaper model for topics
142
+
143
+ # GENERATION: Create training samples from topics
144
+ generation:
145
+ system_prompt: |
146
+ You are an expert Python backend developer and technical educator.
147
+ Create practical, production-ready code examples with clear explanations.
148
+ Include error handling, type hints, and follow PEP 8 conventions.
149
+
150
+ # Additional instructions for sample generation
151
+ instructions: |
152
+ Focus on real-world scenarios developers encounter daily.
153
+ Include both happy path and edge case handling.
154
+ Provide context on when and why to use specific patterns.
155
+
156
+ conversation:
157
+ type: chain_of_thought # basic | chain_of_thought
158
+ reasoning_style: agent # freetext | agent (for chain_of_thought)
159
+ agent_mode: single_turn # single_turn | multi_turn (for agent)
160
+
161
+ # Tool configuration (required for agent modes)
162
+ tools:
163
+ spin_endpoint: "http://localhost:3000" # Spin service for tool execution
164
+ available: # Filter to specific tools (empty = all VFS tools)
165
+ - read_file
166
+ - write_file
167
+ - list_files
168
+ max_per_query: 3 # Maximum tools per query
169
+ max_agent_steps: 5 # Max ReAct reasoning iterations
170
+
171
+ max_retries: 3 # Retries for failed generations
172
+ sample_retries: 2 # Retries for validation failures
173
+ max_tokens: 2000 # Max tokens per generation
174
+
175
+ # Optional: Override shared LLM settings
176
+ llm:
177
+ temperature: 0.3 # Lower temp for consistent code
178
+
179
+ # OUTPUT: Final dataset configuration
180
+ output:
181
+ # System prompt that goes INTO the training data
182
+ # This is what the trained model will see as its system message
183
+ system_prompt: |
184
+ You are a helpful Python programming assistant specialized in REST API
185
+ development. You provide clear, production-ready code with explanations.
186
+ Always consider security, error handling, and best practices.
187
+
188
+ include_system_message: true # Whether to include system message in output
189
+ num_samples: 4 # Total training samples to generate
190
+ batch_size: 3 # Parallel generation batch size
191
+ save_as: "api-dataset.jsonl"
192
+
193
+ # Optional: Upload to Hugging Face
194
+ huggingface:
195
+ repository: "your-username/api-dataset-training-name"
196
+ tags: ["python", "programming"]
197
+ ```
198
+
199
+ Run with:
200
+
201
+ ```bash
202
+ deepfabric generate config.yaml
203
+ ```
204
+
205
+ ## Generate, Train, Evaluate
206
+
207
+ DeepFabric returns standard HuggingFace datasets, making it easy to integrate with any training framework.
208
+
209
+ ### 1. Generate Dataset
210
+
211
+ ```bash
212
+ deepfabric generate config.yaml --output-save-as dataset.jsonl
213
+ ```
214
+
215
+ Or upload to HuggingFace Hub:
216
+
217
+ ```bash
218
+ deepfabric upload dataset.jsonl --repo your-username/my-dataset
219
+ ```
220
+
221
+ ### 2. Load and Split for Training
222
+
223
+ ```python
224
+ from datasets import load_dataset
225
+ from transformers import AutoTokenizer
226
+
227
+ # Load from Hub
228
+ dataset = load_dataset("alwaysfurther/deepfabric-generic-tools", split="train")
229
+
230
+ # Split into train/eval
231
+ splits = dataset.train_test_split(test_size=0.1, seed=42)
232
+ train_ds = splits["train"]
233
+ eval_ds = splits["test"]
234
+
235
+ # Format using your tokenizer
236
+ tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct")
237
+
238
+ def format_example(example):
239
+ messages = [{k: v for k, v in msg.items() if v is not None}
240
+ for msg in example["messages"]]
241
+ return {"text": tokenizer.apply_chat_template(messages, tokenize=False)}
242
+
243
+ formatted_train = train_ds.map(format_example)
244
+ ```
245
+
246
+ ### 3. Train with TRL or Unsloth
247
+
248
+ ```python
249
+ from trl import SFTTrainer, SFTConfig
250
+
251
+ trainer = SFTTrainer(
252
+ model=model,
253
+ tokenizer=tokenizer,
254
+ train_dataset=formatted_train,
255
+ args=SFTConfig(output_dir="./output", num_train_epochs=3),
256
+ )
257
+ trainer.train()
258
+ ```
259
+
260
+ ### 4. Evaluate Your Model
261
+
262
+ ```python
263
+ from deepfabric.evaluation import Evaluator, EvaluatorConfig, InferenceConfig
264
+
265
+ config = EvaluatorConfig(
266
+ inference_config=InferenceConfig(
267
+ model_path="./output/checkpoint-final", # Local path or HF Hub ID
268
+ backend="transformers",
269
+ ),
270
+ )
271
+
272
+ evaluator = Evaluator(config)
273
+ results = evaluator.evaluate(dataset=eval_ds) # Pass HF Dataset directly
274
+
275
+ print(f"Tool Selection Accuracy: {results.metrics.tool_selection_accuracy:.2%}")
276
+ print(f"Parameter Accuracy: {results.metrics.parameter_accuracy:.2%}")
277
+ print(f"Overall Score: {results.metrics.overall_score:.2%}")
278
+ ```
279
+
280
+ ## Evaluation
281
+
282
+ DeepFabric provides a comprehensive evaluation system to measure how well your fine-tuned models perform on tool-calling tasks.
283
+
284
+ ### Basic Evaluation
285
+
286
+ ```python
287
+ from datasets import load_dataset
288
+ from deepfabric.evaluation import Evaluator, EvaluatorConfig, InferenceConfig
289
+
290
+ # Load your evaluation dataset
291
+ dataset = load_dataset("your-username/your-dataset", split="test")
292
+
293
+ # Configure the evaluator
294
+ config = EvaluatorConfig(
295
+ inference_config=InferenceConfig(
296
+ model_path="./output/checkpoint-final", # Local path or HF Hub ID
297
+ backend="transformers", # "transformers" or "ollama"
298
+ temperature=0.1, # Low temp for deterministic outputs
299
+ max_tokens=2048,
300
+ ),
301
+ max_samples=100, # Limit samples for quick testing (None for all)
302
+ save_predictions=True, # Save individual predictions
303
+ output_path="eval_results.json",
304
+ )
305
+
306
+ # Run evaluation
307
+ evaluator = Evaluator(config)
308
+ results = evaluator.evaluate(dataset=dataset)
309
+
310
+ # Print summary
311
+ evaluator.print_summary(results.metrics)
312
+
313
+ # Cleanup GPU memory
314
+ evaluator.cleanup()
315
+ ```
316
+
317
+ ### Evaluation with LoRA Adapters
318
+
319
+ ```python
320
+ from deepfabric.evaluation import Evaluator, EvaluatorConfig, InferenceConfig
321
+
322
+ config = EvaluatorConfig(
323
+ inference_config=InferenceConfig(
324
+ model_path="Qwen/Qwen2.5-7B-Instruct", # Base model
325
+ adapter_path="./output/lora-adapter", # LoRA adapter path
326
+ backend="transformers",
327
+ use_unsloth=True, # Use Unsloth for adapters trained with Unsloth
328
+ load_in_4bit=True, # 4-bit quantization
329
+ max_seq_length=2048,
330
+ ),
331
+ )
332
+
333
+ evaluator = Evaluator(config)
334
+ results = evaluator.evaluate(dataset=eval_dataset)
335
+ ```
336
+
337
+ ### Understanding Evaluation Metrics
338
+
339
+ The evaluator computes several metrics for tool-calling tasks:
340
+
341
+ ```python
342
+ results = evaluator.evaluate(dataset=eval_dataset)
343
+ metrics = results.metrics
344
+
345
+ # Core metrics
346
+ print(f"Samples Evaluated: {metrics.samples_evaluated}")
347
+ print(f"Samples Processed: {metrics.samples_processed}")
348
+ print(f"Processing Errors: {metrics.processing_errors}")
349
+
350
+ # Tool-calling metrics
351
+ print(f"Tool Selection Accuracy: {metrics.tool_selection_accuracy:.2%}")
352
+ print(f"Parameter Accuracy: {metrics.parameter_accuracy:.2%}")
353
+ print(f"Execution Success Rate: {metrics.execution_success_rate:.2%}")
354
+ print(f"Response Quality: {metrics.response_quality:.2%}")
355
+ print(f"Overall Score: {metrics.overall_score:.2%}")
356
+ ```
357
+
358
+ | Metric | Description |
359
+ |--------|-------------|
360
+ | `tool_selection_accuracy` | How often the model selects the correct tool |
361
+ | `parameter_accuracy` | How often tool parameters match expected values |
362
+ | `execution_success_rate` | Rate of valid, executable tool calls |
363
+ | `response_quality` | Quality score for non-tool responses |
364
+ | `overall_score` | Weighted combination of all metrics |
365
+
366
+ ### Accessing Individual Predictions
367
+
368
+ ```python
369
+ results = evaluator.evaluate(dataset=eval_dataset)
370
+
371
+ # Iterate through individual sample evaluations
372
+ for pred in results.predictions:
373
+ print(f"Sample {pred.sample_id}:")
374
+ print(f" Query: {pred.query}")
375
+ print(f" Expected Tool: {pred.expected_tool}")
376
+ print(f" Predicted Tool: {pred.predicted_tool}")
377
+ print(f" Tool Correct: {pred.tool_selection_correct}")
378
+ print(f" Params Correct: {pred.parameters_correct}")
379
+ if pred.error:
380
+ print(f" Error: {pred.error}")
381
+ ```
382
+
383
+ ### Evaluation from JSONL File
384
+
385
+ ```python
386
+ from deepfabric.evaluation import Evaluator, EvaluatorConfig, InferenceConfig
387
+
388
+ config = EvaluatorConfig(
389
+ dataset_path="eval_dataset.jsonl", # Load from file instead
390
+ inference_config=InferenceConfig(
391
+ model_path="./my-model",
392
+ backend="transformers",
393
+ ),
394
+ output_path="results.json",
395
+ )
396
+
397
+ evaluator = Evaluator(config)
398
+ results = evaluator.evaluate() # No dataset argument needed
399
+ ```
400
+
401
+ ### Using Ollama Backend
402
+
403
+ ```python
404
+ from deepfabric.evaluation import Evaluator, EvaluatorConfig, InferenceConfig
405
+
406
+ config = EvaluatorConfig(
407
+ inference_config=InferenceConfig(
408
+ model_path="llama3.2:latest", # Ollama model name
409
+ backend="ollama",
410
+ temperature=0.1,
411
+ ),
412
+ )
413
+
414
+ evaluator = Evaluator(config)
415
+ results = evaluator.evaluate(dataset=eval_dataset)
416
+ ```
417
+
418
+ ## Training Metrics
419
+
420
+ DeepFabric provides a training callback that automatically logs metrics to the DeepFabric cloud during model training. This enables real-time monitoring and tracking of training runs.
421
+
422
+ ### Basic Usage with HuggingFace Trainer
423
+
424
+ ```python
425
+ from transformers import Trainer, TrainingArguments
426
+ from deepfabric import DeepFabricCallback
427
+
428
+ # Set up training arguments
429
+ training_args = TrainingArguments(
430
+ output_dir="./output",
431
+ num_train_epochs=3,
432
+ per_device_train_batch_size=4,
433
+ logging_steps=10,
434
+ )
435
+
436
+ # Create trainer
437
+ trainer = Trainer(
438
+ model=model,
439
+ args=training_args,
440
+ train_dataset=train_dataset,
441
+ eval_dataset=eval_dataset,
442
+ )
443
+
444
+ # Add DeepFabric callback for metrics logging
445
+ trainer.add_callback(DeepFabricCallback(trainer))
446
+
447
+ # Train - metrics are automatically logged
448
+ trainer.train()
449
+ ```
450
+
451
+ ### Usage with TRL SFTTrainer
452
+
453
+ ```python
454
+ from trl import SFTTrainer, SFTConfig
455
+ from deepfabric import DeepFabricCallback
456
+
457
+ trainer = SFTTrainer(
458
+ model=model,
459
+ tokenizer=tokenizer,
460
+ train_dataset=train_dataset,
461
+ args=SFTConfig(
462
+ output_dir="./output",
463
+ num_train_epochs=3,
464
+ logging_steps=10,
465
+ ),
466
+ )
467
+
468
+ # Add callback - works with any Trainer-compatible class
469
+ trainer.add_callback(DeepFabricCallback(trainer))
470
+ trainer.train()
471
+ ```
472
+
473
+ ### Configuration Options
474
+
475
+ ```python
476
+ from deepfabric import DeepFabricCallback
477
+
478
+ callback = DeepFabricCallback(
479
+ trainer=trainer, # Optional: Trainer instance
480
+ api_key="your-api-key", # Or set DEEPFABRIC_API_KEY env var
481
+ endpoint="https://api.deepfabric.ai", # Custom endpoint (optional)
482
+ enabled=True, # Disable to skip logging
483
+ )
484
+ ```
485
+
486
+ ### Environment Variables
487
+
488
+ ```bash
489
+ # API key for authentication
490
+ export DEEPFABRIC_API_KEY="your-api-key"
491
+
492
+ # Custom API endpoint (optional)
493
+ export DEEPFABRIC_API_URL="https://api.deepfabric.ai"
494
+ ```
495
+
496
+ ### Logged Metrics
497
+
498
+ The callback automatically captures and logs:
499
+
500
+ | Metric Type | Examples |
501
+ |-------------|----------|
502
+ | Training | `loss`, `learning_rate`, `epoch`, `global_step` |
503
+ | Throughput | `train_runtime`, `train_samples_per_second` |
504
+ | Evaluation | `eval_loss`, `eval_accuracy` (when evaluation is run) |
505
+ | TRL-specific | `rewards/chosen`, `rewards/rejected`, `kl_divergence` |
506
+ | Checkpoints | Checkpoint save events with step numbers |
507
+
508
+ ### Callback Events
509
+
510
+ ```python
511
+ # The callback hooks into these Trainer events:
512
+ # - on_train_begin: Logs run start with training configuration
513
+ # - on_log: Logs training metrics (loss, lr, etc.)
514
+ # - on_evaluate: Logs evaluation metrics
515
+ # - on_save: Logs checkpoint events
516
+ # - on_train_end: Logs run completion and flushes pending metrics
517
+ ```
518
+
519
+ ### Non-Blocking Design
520
+
521
+ The callback uses a background thread to send metrics asynchronously, ensuring training is never blocked by network operations:
522
+
523
+ ```python
524
+ from deepfabric.training import MetricsSender
525
+
526
+ # Direct access to sender for advanced use cases
527
+ sender = MetricsSender(
528
+ endpoint="https://api.deepfabric.ai",
529
+ api_key="your-key",
530
+ batch_size=10, # Batch metrics before sending
531
+ flush_interval=5.0, # Auto-flush every 5 seconds
532
+ max_queue_size=1000, # Queue capacity
533
+ )
534
+
535
+ # Manually send metrics
536
+ sender.send_metrics({"custom_metric": 0.95, "step": 100})
537
+
538
+ # Flush pending metrics (blocking)
539
+ sender.flush(timeout=30.0)
540
+
541
+ # Check sender statistics
542
+ print(sender.stats)
543
+ # {'metrics_sent': 150, 'metrics_dropped': 0, 'send_errors': 0, 'queue_size': 0}
544
+ ```
545
+
546
+ ### Interactive API Key Prompt
547
+
548
+ When running in an interactive environment (Jupyter notebook, terminal) without an API key configured, the callback will prompt for authentication:
549
+
550
+ ```python
551
+ from deepfabric import DeepFabricCallback
552
+
553
+ # If DEEPFABRIC_API_KEY is not set, prompts for login
554
+ callback = DeepFabricCallback(trainer)
555
+ # > DeepFabric API key not found. Log in to enable cloud metrics.
556
+ # > Visit: https://app.deepfabric.ai/signup
557
+ ```
558
+
559
+ ### Disabling Metrics Logging
560
+
561
+ ```python
562
+ # Disable via constructor
563
+ callback = DeepFabricCallback(trainer, enabled=False)
564
+
565
+ # Or set API key to None
566
+ callback = DeepFabricCallback(trainer, api_key=None)
567
+
568
+ # Or don't set DEEPFABRIC_API_KEY environment variable
569
+ ```
570
+
571
+ ## Providers
572
+
573
+ | Provider | Local/Cloud | Best For |
574
+ |----------|-------------|----------|
575
+ | OpenAI | Cloud | High quality, complex tasks |
576
+ | Anthropic | Cloud | Nuanced reasoning |
577
+ | Google Gemini | Cloud | Cost-effective at scale |
578
+ | Ollama | Local | Privacy, unlimited generation |
579
+ | OpenRouter | Cloud | Flexible model choice |
580
+
581
+ ## Tool Tracing with Spin
582
+
583
+ DeepFabric supports **real tool execution** during dataset generation using the [Spin Framework](https://www.fermyon.com/spin). Instead of simulating tool outputs, tools actually execute in isolated WebAssembly sandboxes, producing authentic training data.
584
+
585
+ ### Why Real Execution Matters
586
+
587
+ Traditional synthetic data generators simulate tool outputs, which creates unrealistic training data:
588
+
589
+ ```
590
+ # Simulated (problematic)
591
+ Agent: read_file("config.json")
592
+ Result: {"setting": "value"} # LLM hallucinated this content
593
+ ```
594
+
595
+ With Spin integration, tools execute against real state:
596
+
597
+ ```
598
+ # Real execution (accurate)
599
+ Agent: read_file("config.json")
600
+ Result: FileNotFound # Actual filesystem state
601
+ Agent: write_file("config.json", "{...}")
602
+ Result: Written 42 bytes # Real operation
603
+ ```
604
+
605
+ ### ReAct-Style Execution
606
+
607
+ DeepFabric uses a ReAct (Reason-Act-Observe) loop for tool calling. The agent observes real results before deciding the next action:
608
+
609
+ ```
610
+ Step 1: Agent thinks "I should check if config exists"
611
+ -> Calls read_file("config.json")
612
+ -> Observes: FileNotFound
613
+
614
+ Step 2: Agent thinks "Config doesn't exist, I'll create it"
615
+ -> Calls write_file("config.json", content)
616
+ -> Observes: Success
617
+ ```
618
+
619
+ This produces training data where decisions are based on actual observations, not hallucinated assumptions.
620
+
621
+ ### Configuration
622
+
623
+ Enable tool tracing in your YAML config:
624
+
625
+ ```yaml
626
+ generation:
627
+ conversation:
628
+ type: chain_of_thought
629
+ reasoning_style: agent
630
+ agent_mode: single_turn
631
+
632
+ tools:
633
+ spin_endpoint: "http://localhost:3000" # Spin service URL
634
+ available: # Filter to specific tools
635
+ - read_file
636
+ - write_file
637
+ - list_files
638
+ max_agent_steps: 5 # Max ReAct iterations
639
+
640
+ # Optional: Seed initial state for scenarios
641
+ scenario_seed:
642
+ files:
643
+ "config.json": '{"debug": true}'
644
+ ```
645
+
646
+ ### Built-in VFS Tools
647
+
648
+ DeepFabric includes a virtual filesystem (VFS) component with these tools:
649
+
650
+ | Tool | Description |
651
+ |------|-------------|
652
+ | `read_file` | Read content from a file |
653
+ | `write_file` | Write content to a file |
654
+ | `list_files` | List all files in the session |
655
+ | `delete_file` | Delete a file |
656
+
657
+ Each session gets an isolated filesystem - changes don't persist between samples.
658
+
659
+ ### Running Spin Locally
660
+
661
+ ```bash
662
+ cd tools-sdk
663
+ spin build
664
+ spin up
665
+ ```
666
+
667
+ The Spin service runs at `http://localhost:3000` by default.
668
+
669
+ ### Adding Custom Tools
670
+
671
+ You can extend DeepFabric with custom tools written in Python, JavaScript, Go, or Rust. See [tool-traces.md](./tool-traces.md) for detailed documentation on:
672
+
673
+ - Creating custom Spin components
674
+ - Tool definition schemas
675
+ - Multi-language examples
676
+ - Containerization and deployment
677
+
678
+ ## Resources
679
+
680
+ - [Documentation](https://always-further.github.io/deepfabric/)
681
+ - [Examples](./examples/README.md)
682
+ - [Tool Tracing Guide](./tool-traces.md)
683
+ - [Discord](https://discord.gg/pPcjYzGvbS)
684
+ - [Issues](https://github.com/always-further/deepfabric/issues)
685
+
686
+ ## Development
687
+
688
+ ```bash
689
+ git clone https://github.com/always-further/deepfabric
690
+ cd deepfabric
691
+ uv sync --all-extras
692
+ make test
693
+ ```
694
+
695
+ ## Analytics
696
+
697
+ We collect anonymous usage metrics to improve DeepFabric. No personal data, prompts, or API keys are collected.
698
+
699
+ ```bash
700
+ # Disable analytics
701
+ export ANONYMIZED_TELEMETRY=False
702
+ ```