deepfabric-4.4.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepfabric/__init__.py +70 -0
- deepfabric/__main__.py +6 -0
- deepfabric/auth.py +382 -0
- deepfabric/builders.py +303 -0
- deepfabric/builders_agent.py +1304 -0
- deepfabric/cli.py +1288 -0
- deepfabric/config.py +899 -0
- deepfabric/config_manager.py +251 -0
- deepfabric/constants.py +94 -0
- deepfabric/dataset_manager.py +534 -0
- deepfabric/error_codes.py +581 -0
- deepfabric/evaluation/__init__.py +47 -0
- deepfabric/evaluation/backends/__init__.py +32 -0
- deepfabric/evaluation/backends/ollama_backend.py +137 -0
- deepfabric/evaluation/backends/tool_call_parsers.py +409 -0
- deepfabric/evaluation/backends/transformers_backend.py +326 -0
- deepfabric/evaluation/evaluator.py +845 -0
- deepfabric/evaluation/evaluators/__init__.py +13 -0
- deepfabric/evaluation/evaluators/base.py +104 -0
- deepfabric/evaluation/evaluators/builtin/__init__.py +5 -0
- deepfabric/evaluation/evaluators/builtin/tool_calling.py +93 -0
- deepfabric/evaluation/evaluators/registry.py +66 -0
- deepfabric/evaluation/inference.py +155 -0
- deepfabric/evaluation/metrics.py +397 -0
- deepfabric/evaluation/parser.py +304 -0
- deepfabric/evaluation/reporters/__init__.py +13 -0
- deepfabric/evaluation/reporters/base.py +56 -0
- deepfabric/evaluation/reporters/cloud_reporter.py +195 -0
- deepfabric/evaluation/reporters/file_reporter.py +61 -0
- deepfabric/evaluation/reporters/multi_reporter.py +56 -0
- deepfabric/exceptions.py +67 -0
- deepfabric/factory.py +26 -0
- deepfabric/generator.py +1084 -0
- deepfabric/graph.py +545 -0
- deepfabric/hf_hub.py +214 -0
- deepfabric/kaggle_hub.py +219 -0
- deepfabric/llm/__init__.py +41 -0
- deepfabric/llm/api_key_verifier.py +534 -0
- deepfabric/llm/client.py +1206 -0
- deepfabric/llm/errors.py +105 -0
- deepfabric/llm/rate_limit_config.py +262 -0
- deepfabric/llm/rate_limit_detector.py +278 -0
- deepfabric/llm/retry_handler.py +270 -0
- deepfabric/metrics.py +212 -0
- deepfabric/progress.py +262 -0
- deepfabric/prompts.py +290 -0
- deepfabric/schemas.py +1000 -0
- deepfabric/spin/__init__.py +6 -0
- deepfabric/spin/client.py +263 -0
- deepfabric/spin/models.py +26 -0
- deepfabric/stream_simulator.py +90 -0
- deepfabric/tools/__init__.py +5 -0
- deepfabric/tools/defaults.py +85 -0
- deepfabric/tools/loader.py +87 -0
- deepfabric/tools/mcp_client.py +677 -0
- deepfabric/topic_manager.py +303 -0
- deepfabric/topic_model.py +20 -0
- deepfabric/training/__init__.py +35 -0
- deepfabric/training/api_key_prompt.py +302 -0
- deepfabric/training/callback.py +363 -0
- deepfabric/training/metrics_sender.py +301 -0
- deepfabric/tree.py +438 -0
- deepfabric/tui.py +1267 -0
- deepfabric/update_checker.py +166 -0
- deepfabric/utils.py +150 -0
- deepfabric/validation.py +143 -0
- deepfabric-4.4.0.dist-info/METADATA +702 -0
- deepfabric-4.4.0.dist-info/RECORD +71 -0
- deepfabric-4.4.0.dist-info/WHEEL +4 -0
- deepfabric-4.4.0.dist-info/entry_points.txt +2 -0
- deepfabric-4.4.0.dist-info/licenses/LICENSE +201 -0
@@ -0,0 +1,702 @@
Metadata-Version: 2.4
Name: DeepFabric
Version: 4.4.0
Summary: Curate High Quality Datasets, Train, Evaluate and Ship
Author-email: Luke Hinds <luke@alwaysfurther.ai>
License-File: LICENSE
Requires-Python: >=3.10
Requires-Dist: accelerate>=0.20.0
Requires-Dist: anthropic>=0.75.0
Requires-Dist: click>=8.1.7
Requires-Dist: componentize-py>=0.19.3
Requires-Dist: datasets<5.0,>=3.0
Requires-Dist: google-api-core>=2.0.0
Requires-Dist: google-genai>=1.56.0
Requires-Dist: huggingface-hub==0.36.0
Requires-Dist: kagglehub>=0.3.0
Requires-Dist: mermaid-py>=0.8.0
Requires-Dist: ollama>=0.6.1
Requires-Dist: openai>=1.107.2
Requires-Dist: outlines==1.2.9
Requires-Dist: packaging>=25.0
Requires-Dist: peft>=0.7.0
Requires-Dist: posthog>=3.0.0
Requires-Dist: protobuf>=3.20.0
Requires-Dist: pydantic>=2.0.0
Requires-Dist: pyyaml>=6.0.1
Requires-Dist: rich>=13.0.0
Requires-Dist: sentencepiece>=0.1.99
Requires-Dist: spin-sdk>=3.4.1
Requires-Dist: torch>=2.4.0
Requires-Dist: transformers>=4.57.1
Provides-Extra: dev
Requires-Dist: bandit>=1.7.10; extra == 'dev'
Requires-Dist: mermaid-py>=0.2.0; extra == 'dev'
Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
Requires-Dist: pytest-mock>=3.10.0; extra == 'dev'
Requires-Dist: pytest>=7.0.0; extra == 'dev'
Requires-Dist: requests-mock>=1.11.0; extra == 'dev'
Requires-Dist: ruff>=0.1.0; extra == 'dev'
Provides-Extra: docs
Requires-Dist: mkdocs-material>=9.0.0; extra == 'docs'
Requires-Dist: mkdocstrings[python]>=0.30.0; extra == 'docs'
Description-Content-Type: text/markdown

<div align="center">
  <picture>
    <source media="(prefers-color-scheme: dark)" srcset="./assets/logo-light.png" />
    <img alt="DeepFabric logo" src="./assets/logo-light-hols.png" style="width:40%;max-width:40%;height:auto;display:block;margin:0 auto;" />
  </picture>
  <h3>Training Model Behavior in Agentic Systems</h3>

  <!-- CTA Buttons -->
  <p>
    <a href="https://github.com/always-further/deepfabric/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22">
      <img src="https://img.shields.io/badge/Contribute-Good%20First%20Issues-green?style=for-the-badge&logo=github" alt="Good First Issues"/>
    </a>

    <a href="https://discord.gg/pPcjYzGvbS">
      <img src="https://img.shields.io/badge/Chat-Join%20Discord-7289da?style=for-the-badge&logo=discord&logoColor=white" alt="Join Discord"/>
    </a>
  </p>

  <!-- Badges -->
  <p>
    <a href="https://opensource.org/licenses/Apache-2.0">
      <img src="https://img.shields.io/badge/License-Apache%202.0-blue.svg" alt="License"/>
    </a>
    <a href="https://github.com/always-further/deepfabric/actions/workflows/test.yml">
      <img src="https://github.com/always-further/deepfabric/actions/workflows/test.yml/badge.svg" alt="CI Status"/>
    </a>
    <a href="https://pypi.org/project/deepfabric/">
      <img src="https://img.shields.io/pypi/v/deepfabric.svg" alt="PyPI Version"/>
    </a>
    <a href="https://pepy.tech/project/deepfabric">
      <img src="https://static.pepy.tech/badge/deepfabric" alt="Downloads"/>
    </a>
    <a href="https://discord.gg/pPcjYzGvbS">
      <img src="https://img.shields.io/discord/1384081906773131274?color=7289da&label=Discord&logo=discord&logoColor=white" alt="Discord"/>
    </a>
  </p>
</div>

**DeepFabric** generates synthetic training data for language models and agent evaluations. By combining reasoning traces with tool-calling patterns, it creates high-quality, domain-specific datasets that teach models to think, plan, and act effectively, call tools correctly, and conform to strict schema structures.

What sets DeepFabric apart from other dataset generation tools is its unique topic graph generation algorithm, which ensures high diversity while keeping samples anchored to the target domain. The graph guides sample creation to cover all necessary subtopics while avoiding redundancy, an area where other tools often fall short, leading to model overfitting.

<img src="/assets/df-demo.gif" width="100%" height="100%"/>

Constrained decoding and response validation, along with real tool execution inside isolated WebAssembly environments, ensure that generated samples strictly adhere to structured schemas, variable constraints, and execution correctness, so datasets have the exact syntax and structure required by model training pipelines. Tool definitions can be imported directly from MCP (Model Context Protocol) server schemas and automatically mocked as realistic interfaces, or taken from a standard set of common tools (`list_files()`, `read_file()`, etc.).
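For a sense of what a schema-valid sample looks like, the sketch below shows a single chat-format record with one tool call and its observed result. The field names (`messages`, `tool_calls`, and so on) follow the common chat/tool-calling convention and are illustrative only; the authoritative schema is the one DeepFabric emits.

```python
import json

# Illustrative only: the exact schema is defined by DeepFabric's output format,
# but a tool-calling sample in chat format generally has this shape.
sample = {
    "messages": [
        {"role": "system", "content": "You are a helpful assistant with file tools."},
        {"role": "user", "content": "What settings are in config.json?"},
        {"role": "assistant", "content": None,
         "tool_calls": [{"name": "read_file", "arguments": {"path": "config.json"}}]},
        {"role": "tool", "content": '{"debug": true}'},
        {"role": "assistant", "content": "config.json enables debug mode."},
    ]
}

# One JSON object per line is the usual .jsonl convention.
print(json.dumps(sample))
```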

Once your dataset is generated, it can be automatically uploaded to Hugging Face and directly imported into popular training frameworks like TRL, Unsloth, and Axolotl.

Post-training, DeepFabric's built-in evaluation engine assesses model performance on unseen tasks derived from the training splits, covering evaluation-only questions, answers, and tool traces.

## Quickstart

DeepFabric can be used in several ways: as a library, as a CLI tool, or via YAML configuration. Here's a quick example using the CLI:

```bash
pip install deepfabric
```

```bash
export OPENAI_API_KEY="your-api-key"

deepfabric generate \
  --topic-prompt "Python programming fundamentals" \
  --generation-system-prompt "You are a Python expert" \
  --mode graph \
  --depth 3 \
  --degree 3 \
  --num-samples 9 \
  --batch-size 3 \
  --provider openai \
  --model gpt-4o \
  --output-save-as dataset.jsonl
```

This builds a topic graph of 27 unique nodes, then generates training samples across those topics and saves them to `dataset.jsonl`, giving you broad topic coverage.
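If you want to inspect the result before training, the generated file is standard JSON Lines and loads directly with the `datasets` library. A small sketch (the `messages` field name matches the chat-format samples shown later in this README):

```python
from datasets import load_dataset

# dataset.jsonl is plain JSON Lines, so the generic "json" loader works.
ds = load_dataset("json", data_files="dataset.jsonl", split="train")

print(ds)                     # column names and number of rows
print(ds[0]["messages"][:2])  # peek at the first sample's opening messages
```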

## Configuration

DeepFabric also uses YAML configuration with three main sections and optional shared LLM defaults:

```yaml
# Optional: Shared LLM defaults (inherited by topics and generation)
llm:
  provider: "openai"
  model: "gpt-4o"
  temperature: 0.7

# TOPICS: Generate the topic tree/graph
topics:
  prompt: "Building production-ready REST APIs with Python"
  mode: tree  # tree | graph
  depth: 3
  degree: 3
  save_as: "topics.jsonl"
  # Optional: Override shared LLM settings
  llm:
    model: "gpt-4o-mini"  # Use cheaper model for topics

# GENERATION: Create training samples from topics
generation:
  system_prompt: |
    You are an expert Python backend developer and technical educator.
    Create practical, production-ready code examples with clear explanations.
    Include error handling, type hints, and follow PEP 8 conventions.

  # Additional instructions for sample generation
  instructions: |
    Focus on real-world scenarios developers encounter daily.
    Include both happy path and edge case handling.
    Provide context on when and why to use specific patterns.

  conversation:
    type: chain_of_thought       # basic | chain_of_thought
    reasoning_style: agent       # freetext | agent (for chain_of_thought)
    agent_mode: single_turn      # single_turn | multi_turn (for agent)

  # Tool configuration (required for agent modes)
  tools:
    spin_endpoint: "http://localhost:3000"  # Spin service for tool execution
    available:                              # Filter to specific tools (empty = all VFS tools)
      - read_file
      - write_file
      - list_files
    max_per_query: 3       # Maximum tools per query
    max_agent_steps: 5     # Max ReAct reasoning iterations

  max_retries: 3           # Retries for failed generations
  sample_retries: 2        # Retries for validation failures
  max_tokens: 2000         # Max tokens per generation

  # Optional: Override shared LLM settings
  llm:
    temperature: 0.3       # Lower temp for consistent code

# OUTPUT: Final dataset configuration
output:
  # System prompt that goes INTO the training data
  # This is what the trained model will see as its system message
  system_prompt: |
    You are a helpful Python programming assistant specialized in REST API
    development. You provide clear, production-ready code with explanations.
    Always consider security, error handling, and best practices.

  include_system_message: true  # Whether to include system message in output
  num_samples: 4                # Total training samples to generate
  batch_size: 3                 # Parallel generation batch size
  save_as: "api-dataset.jsonl"

# Optional: Upload to Hugging Face
huggingface:
  repository: "your-username/api-dataset-training-name"
  tags: ["python", "programming"]
```

Run with:

```bash
deepfabric generate config.yaml
```

## Generate, Train, Evaluate

DeepFabric returns standard HuggingFace datasets, making it easy to integrate with any training framework.

### 1. Generate Dataset

```bash
deepfabric generate config.yaml --output-save-as dataset.jsonl
```

Or upload to HuggingFace Hub:

```bash
deepfabric upload dataset.jsonl --repo your-username/my-dataset
```

### 2. Load and Split for Training

```python
from datasets import load_dataset
from transformers import AutoTokenizer

# Load from Hub
dataset = load_dataset("alwaysfurther/deepfabric-generic-tools", split="train")

# Split into train/eval
splits = dataset.train_test_split(test_size=0.1, seed=42)
train_ds = splits["train"]
eval_ds = splits["test"]

# Format using your tokenizer
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct")

def format_example(example):
    messages = [{k: v for k, v in msg.items() if v is not None}
                for msg in example["messages"]]
    return {"text": tokenizer.apply_chat_template(messages, tokenize=False)}

formatted_train = train_ds.map(format_example)
```

### 3. Train with TRL or Unsloth

```python
from trl import SFTTrainer, SFTConfig

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=formatted_train,
    args=SFTConfig(output_dir="./output", num_train_epochs=3),
)
trainer.train()
```

### 4. Evaluate Your Model

```python
from deepfabric.evaluation import Evaluator, EvaluatorConfig, InferenceConfig

config = EvaluatorConfig(
    inference_config=InferenceConfig(
        model_path="./output/checkpoint-final",  # Local path or HF Hub ID
        backend="transformers",
    ),
)

evaluator = Evaluator(config)
results = evaluator.evaluate(dataset=eval_ds)  # Pass HF Dataset directly

print(f"Tool Selection Accuracy: {results.metrics.tool_selection_accuracy:.2%}")
print(f"Parameter Accuracy: {results.metrics.parameter_accuracy:.2%}")
print(f"Overall Score: {results.metrics.overall_score:.2%}")
```

## Evaluation

DeepFabric provides a comprehensive evaluation system to measure how well your fine-tuned models perform on tool-calling tasks.

### Basic Evaluation

```python
from datasets import load_dataset
from deepfabric.evaluation import Evaluator, EvaluatorConfig, InferenceConfig

# Load your evaluation dataset
dataset = load_dataset("your-username/your-dataset", split="test")

# Configure the evaluator
config = EvaluatorConfig(
    inference_config=InferenceConfig(
        model_path="./output/checkpoint-final",  # Local path or HF Hub ID
        backend="transformers",                  # "transformers" or "ollama"
        temperature=0.1,                         # Low temp for deterministic outputs
        max_tokens=2048,
    ),
    max_samples=100,        # Limit samples for quick testing (None for all)
    save_predictions=True,  # Save individual predictions
    output_path="eval_results.json",
)

# Run evaluation
evaluator = Evaluator(config)
results = evaluator.evaluate(dataset=dataset)

# Print summary
evaluator.print_summary(results.metrics)

# Cleanup GPU memory
evaluator.cleanup()
```

### Evaluation with LoRA Adapters

```python
from deepfabric.evaluation import Evaluator, EvaluatorConfig, InferenceConfig

config = EvaluatorConfig(
    inference_config=InferenceConfig(
        model_path="Qwen/Qwen2.5-7B-Instruct",  # Base model
        adapter_path="./output/lora-adapter",   # LoRA adapter path
        backend="transformers",
        use_unsloth=True,     # Use Unsloth for adapters trained with Unsloth
        load_in_4bit=True,    # 4-bit quantization
        max_seq_length=2048,
    ),
)

evaluator = Evaluator(config)
results = evaluator.evaluate(dataset=eval_dataset)
```

### Understanding Evaluation Metrics

The evaluator computes several metrics for tool-calling tasks:

```python
results = evaluator.evaluate(dataset=eval_dataset)
metrics = results.metrics

# Core metrics
print(f"Samples Evaluated: {metrics.samples_evaluated}")
print(f"Samples Processed: {metrics.samples_processed}")
print(f"Processing Errors: {metrics.processing_errors}")

# Tool-calling metrics
print(f"Tool Selection Accuracy: {metrics.tool_selection_accuracy:.2%}")
print(f"Parameter Accuracy: {metrics.parameter_accuracy:.2%}")
print(f"Execution Success Rate: {metrics.execution_success_rate:.2%}")
print(f"Response Quality: {metrics.response_quality:.2%}")
print(f"Overall Score: {metrics.overall_score:.2%}")
```

| Metric | Description |
|--------|-------------|
| `tool_selection_accuracy` | How often the model selects the correct tool |
| `parameter_accuracy` | How often tool parameters match expected values |
| `execution_success_rate` | Rate of valid, executable tool calls |
| `response_quality` | Quality score for non-tool responses |
| `overall_score` | Weighted combination of all metrics |
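
As a rough illustration of what "weighted combination" means here, the sketch below blends the four component metrics with example weights. The weights and the helper function are assumptions made for this README; DeepFabric's evaluator applies its own weighting internally.

```python
# Hypothetical illustration of a weighted overall score; the actual weights
# used by DeepFabric's evaluator may differ.
WEIGHTS = {
    "tool_selection_accuracy": 0.4,
    "parameter_accuracy": 0.3,
    "execution_success_rate": 0.2,
    "response_quality": 0.1,
}

def weighted_overall_score(component_scores: dict[str, float]) -> float:
    """Combine per-metric scores (0.0-1.0) into a single weighted score."""
    return sum(WEIGHTS[name] * component_scores[name] for name in WEIGHTS)

print(weighted_overall_score({
    "tool_selection_accuracy": 0.92,
    "parameter_accuracy": 0.85,
    "execution_success_rate": 0.90,
    "response_quality": 0.80,
}))  # 0.883
```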

### Accessing Individual Predictions

```python
results = evaluator.evaluate(dataset=eval_dataset)

# Iterate through individual sample evaluations
for pred in results.predictions:
    print(f"Sample {pred.sample_id}:")
    print(f" Query: {pred.query}")
    print(f" Expected Tool: {pred.expected_tool}")
    print(f" Predicted Tool: {pred.predicted_tool}")
    print(f" Tool Correct: {pred.tool_selection_correct}")
    print(f" Params Correct: {pred.parameters_correct}")
    if pred.error:
        print(f" Error: {pred.error}")
```

### Evaluation from JSONL File

```python
from deepfabric.evaluation import Evaluator, EvaluatorConfig, InferenceConfig

config = EvaluatorConfig(
    dataset_path="eval_dataset.jsonl",  # Load from file instead
    inference_config=InferenceConfig(
        model_path="./my-model",
        backend="transformers",
    ),
    output_path="results.json",
)

evaluator = Evaluator(config)
results = evaluator.evaluate()  # No dataset argument needed
```

### Using Ollama Backend

```python
from deepfabric.evaluation import Evaluator, EvaluatorConfig, InferenceConfig

config = EvaluatorConfig(
    inference_config=InferenceConfig(
        model_path="llama3.2:latest",  # Ollama model name
        backend="ollama",
        temperature=0.1,
    ),
)

evaluator = Evaluator(config)
results = evaluator.evaluate(dataset=eval_dataset)
```

## Training Metrics

DeepFabric provides a training callback that automatically logs metrics to the DeepFabric cloud during model training. This enables real-time monitoring and tracking of training runs.

### Basic Usage with HuggingFace Trainer

```python
from transformers import Trainer, TrainingArguments
from deepfabric import DeepFabricCallback

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./output",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    logging_steps=10,
)

# Create trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Add DeepFabric callback for metrics logging
trainer.add_callback(DeepFabricCallback(trainer))

# Train - metrics are automatically logged
trainer.train()
```

### Usage with TRL SFTTrainer

```python
from trl import SFTTrainer, SFTConfig
from deepfabric import DeepFabricCallback

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    args=SFTConfig(
        output_dir="./output",
        num_train_epochs=3,
        logging_steps=10,
    ),
)

# Add callback - works with any Trainer-compatible class
trainer.add_callback(DeepFabricCallback(trainer))
trainer.train()
```

### Configuration Options

```python
from deepfabric import DeepFabricCallback

callback = DeepFabricCallback(
    trainer=trainer,                       # Optional: Trainer instance
    api_key="your-api-key",                # Or set DEEPFABRIC_API_KEY env var
    endpoint="https://api.deepfabric.ai",  # Custom endpoint (optional)
    enabled=True,                          # Disable to skip logging
)
```

### Environment Variables

```bash
# API key for authentication
export DEEPFABRIC_API_KEY="your-api-key"

# Custom API endpoint (optional)
export DEEPFABRIC_API_URL="https://api.deepfabric.ai"
```

### Logged Metrics

The callback automatically captures and logs:

| Metric Type | Examples |
|-------------|----------|
| Training | `loss`, `learning_rate`, `epoch`, `global_step` |
| Throughput | `train_runtime`, `train_samples_per_second` |
| Evaluation | `eval_loss`, `eval_accuracy` (when evaluation is run) |
| TRL-specific | `rewards/chosen`, `rewards/rejected`, `kl_divergence` |
| Checkpoints | Checkpoint save events with step numbers |

### Callback Events

```python
# The callback hooks into these Trainer events:
# - on_train_begin: Logs run start with training configuration
# - on_log: Logs training metrics (loss, lr, etc.)
# - on_evaluate: Logs evaluation metrics
# - on_save: Logs checkpoint events
# - on_train_end: Logs run completion and flushes pending metrics
```

### Non-Blocking Design

The callback uses a background thread to send metrics asynchronously, ensuring training is never blocked by network operations:

```python
from deepfabric.training import MetricsSender

# Direct access to sender for advanced use cases
sender = MetricsSender(
    endpoint="https://api.deepfabric.ai",
    api_key="your-key",
    batch_size=10,        # Batch metrics before sending
    flush_interval=5.0,   # Auto-flush every 5 seconds
    max_queue_size=1000,  # Queue capacity
)

# Manually send metrics
sender.send_metrics({"custom_metric": 0.95, "step": 100})

# Flush pending metrics (blocking)
sender.flush(timeout=30.0)

# Check sender statistics
print(sender.stats)
# {'metrics_sent': 150, 'metrics_dropped': 0, 'send_errors': 0, 'queue_size': 0}
```

### Interactive API Key Prompt

When running in an interactive environment (Jupyter notebook, terminal) without an API key configured, the callback will prompt for authentication:

```python
from deepfabric import DeepFabricCallback

# If DEEPFABRIC_API_KEY is not set, prompts for login
callback = DeepFabricCallback(trainer)
# > DeepFabric API key not found. Log in to enable cloud metrics.
# > Visit: https://app.deepfabric.ai/signup
```

### Disabling Metrics Logging

```python
# Disable via constructor
callback = DeepFabricCallback(trainer, enabled=False)

# Or set API key to None
callback = DeepFabricCallback(trainer, api_key=None)

# Or don't set DEEPFABRIC_API_KEY environment variable
```

## Providers

| Provider | Local/Cloud | Best For |
|----------|-------------|----------|
| OpenAI | Cloud | High quality, complex tasks |
| Anthropic | Cloud | Nuanced reasoning |
| Google Gemini | Cloud | Cost-effective at scale |
| Ollama | Local | Privacy, unlimited generation |
| OpenRouter | Cloud | Flexible model choice |
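
Cloud providers are configured through their usual API-key environment variables; Ollama only needs a local server running. The variable names below follow each provider's standard SDK conventions rather than anything DeepFabric-specific, so double-check them against your provider's documentation:

```bash
# Standard provider credentials (names follow each provider's SDK conventions)
export OPENAI_API_KEY="sk-..."
export ANTHROPIC_API_KEY="sk-ant-..."
export GEMINI_API_KEY="..."           # google-genai also reads GOOGLE_API_KEY
export OPENROUTER_API_KEY="sk-or-..."

# Ollama: no key needed, just a running local server
ollama serve
```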

## Tool Tracing with Spin

DeepFabric supports **real tool execution** during dataset generation using the [Spin Framework](https://www.fermyon.com/spin). Instead of simulating tool outputs, tools actually execute in isolated WebAssembly sandboxes, producing authentic training data.

### Why Real Execution Matters

Traditional synthetic data generators simulate tool outputs, which creates unrealistic training data:

```
# Simulated (problematic)
Agent: read_file("config.json")
Result: {"setting": "value"}   # LLM hallucinated this content
```

With Spin integration, tools execute against real state:

```
# Real execution (accurate)
Agent: read_file("config.json")
Result: FileNotFound           # Actual filesystem state
Agent: write_file("config.json", "{...}")
Result: Written 42 bytes       # Real operation
```

### ReAct-Style Execution

DeepFabric uses a ReAct (Reason-Act-Observe) loop for tool calling. The agent observes real results before deciding the next action:

```
Step 1: Agent thinks "I should check if config exists"
        -> Calls read_file("config.json")
        -> Observes: FileNotFound

Step 2: Agent thinks "Config doesn't exist, I'll create it"
        -> Calls write_file("config.json", content)
        -> Observes: Success
```

This produces training data where decisions are based on actual observations, not hallucinated assumptions.
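
Conceptually, the loop looks like the sketch below. This is a generic illustration of the Reason-Act-Observe pattern, not DeepFabric's internal implementation; `llm_decide_next_action` and `execute_tool` are hypothetical stand-ins for the model call and the Spin tool endpoint.

```python
# Generic ReAct-style loop (illustrative, with stub functions so it runs as-is).
def llm_decide_next_action(history):
    """Stand-in for the model call: returns a tool call or a final answer."""
    if not any(step["observation"] == "Success" for step in history):
        return {"tool": "write_file", "args": {"path": "config.json", "content": "{}"}}
    return {"final_answer": "config.json created."}

def execute_tool(tool, args):
    """Stand-in for the Spin tool endpoint; returns the real observation."""
    return "Success"

def react_loop(max_agent_steps=5):
    history = []
    for _ in range(max_agent_steps):
        action = llm_decide_next_action(history)
        if "final_answer" in action:        # the agent decides it is done
            return action["final_answer"], history
        observation = execute_tool(action["tool"], action["args"])
        history.append({"action": action, "observation": observation})  # observe before the next step
    return None, history

answer, trace = react_loop()
print(answer, trace)
```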

### Configuration

Enable tool tracing in your YAML config:

```yaml
generation:
  conversation:
    type: chain_of_thought
    reasoning_style: agent
    agent_mode: single_turn

  tools:
    spin_endpoint: "http://localhost:3000"  # Spin service URL
    available:                              # Filter to specific tools
      - read_file
      - write_file
      - list_files
    max_agent_steps: 5                      # Max ReAct iterations

    # Optional: Seed initial state for scenarios
    scenario_seed:
      files:
        "config.json": '{"debug": true}'
```

### Built-in VFS Tools

DeepFabric includes a virtual filesystem (VFS) component with these tools:

| Tool | Description |
|------|-------------|
| `read_file` | Read content from a file |
| `write_file` | Write content to a file |
| `list_files` | List all files in the session |
| `delete_file` | Delete a file |

Each session gets an isolated filesystem - changes don't persist between samples.
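
The isolation model is easiest to picture as one throwaway in-memory file map per sample. The sketch below is purely conceptual (DeepFabric's VFS actually lives inside the Spin WebAssembly component), but it captures the contract: tools within one session share state, and nothing carries over to the next sample.

```python
# Conceptual model of per-session VFS isolation (not DeepFabric's actual implementation).
class SessionVFS:
    def __init__(self, seed_files=None):
        self.files = dict(seed_files or {})  # fresh state for every session

    def write_file(self, path, content):
        self.files[path] = content
        return f"Written {len(content)} bytes"

    def read_file(self, path):
        return self.files.get(path, "FileNotFound")

    def list_files(self):
        return sorted(self.files)

    def delete_file(self, path):
        return "Deleted" if self.files.pop(path, None) is not None else "FileNotFound"

sample_a = SessionVFS(seed_files={"config.json": '{"debug": true}'})
sample_a.write_file("notes.txt", "hello")

sample_b = SessionVFS()                 # a new sample starts clean
print(sample_b.read_file("notes.txt"))  # FileNotFound
```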

### Running Spin Locally

```bash
cd tools-sdk
spin build
spin up
```

The Spin service runs at `http://localhost:3000` by default.

### Adding Custom Tools

You can extend DeepFabric with custom tools written in Python, JavaScript, Go, or Rust; a rough sketch of a tool definition follows the list below. See [tool-traces.md](./tool-traces.md) for detailed documentation on:

- Creating custom Spin components
- Tool definition schemas
- Multi-language examples
- Containerization and deployment
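
As a hypothetical illustration of what a tool definition typically carries (a name, a description, and JSON-Schema-style parameters), here is a sketch; DeepFabric's exact format is specified in [tool-traces.md](./tool-traces.md).

```python
# Hypothetical tool definition in the common JSON-Schema style; see
# tool-traces.md for the authoritative DeepFabric/Spin format.
get_weather_tool = {
    "name": "get_weather",
    "description": "Return the current weather for a city.",
    "parameters": {
        "type": "object",
        "properties": {
            "city": {"type": "string", "description": "City name, e.g. 'Berlin'"},
            "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
        },
        "required": ["city"],
    },
}
```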

## Resources

- [Documentation](https://always-further.github.io/deepfabric/)
- [Examples](./examples/README.md)
- [Tool Tracing Guide](./tool-traces.md)
- [Discord](https://discord.gg/pPcjYzGvbS)
- [Issues](https://github.com/always-further/deepfabric/issues)

## Development

```bash
git clone https://github.com/always-further/deepfabric
cd deepfabric
uv sync --all-extras
make test
```

## Analytics

We collect anonymous usage metrics to improve DeepFabric. No personal data, prompts, or API keys are collected.

```bash
# Disable analytics
export ANONYMIZED_TELEMETRY=False
```