DeepFabric 4.5.1__py3-none-any.whl → 4.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -51,6 +51,7 @@ class DeepFabricCallback:
51
51
  trainer: Any | None = None,
52
52
  api_key: str | None = None,
53
53
  endpoint: str | None = None,
54
+ pipeline_id: str | None = None,
54
55
  enabled: bool = True,
55
56
  ):
56
57
  """Initialize the DeepFabric callback.
@@ -60,11 +61,14 @@ class DeepFabricCallback:
60
61
  api_key: DeepFabric API key (falls back to DEEPFABRIC_API_KEY env var,
61
62
  then prompts in interactive environments)
62
63
  endpoint: API endpoint URL (falls back to DEEPFABRIC_API_URL env var)
64
+ pipeline_id: Pipeline ID to associate training with (falls back to
65
+ DEEPFABRIC_PIPELINE_ID env var or pipeline_id.txt file)
63
66
  enabled: Whether logging is enabled (default: True)
64
67
  """
65
68
  # Get API key from arg, env, or prompt
66
69
  self.api_key = api_key or get_api_key()
67
70
  self.endpoint = endpoint or os.getenv("DEEPFABRIC_API_URL", "https://api.deepfabric.ai")
71
+ self.pipeline_id = pipeline_id or self._get_pipeline_id()
68
72
  self.run_id = str(uuid.uuid4())
69
73
  self.enabled = enabled and self.api_key is not None
70
74
 
@@ -75,14 +79,26 @@ class DeepFabricCallback:
75
79
  self.sender = MetricsSender(
76
80
  endpoint=self.endpoint,
77
81
  api_key=self.api_key if self.enabled else None,
82
+ pipeline_id=self.pipeline_id,
78
83
  )
79
84
 
80
85
  self._run_started = False
81
86
  self._model_name: str | None = None
82
87
  self._training_args_logged = False
88
+ self._start_time: datetime | None = None
83
89
 
84
90
  if self.enabled:
85
- logger.debug(f"DeepFabric callback initialized (run_id={self.run_id})")
91
+ if self.pipeline_id:
92
+ logger.debug(
93
+ f"DeepFabric callback initialized (run_id={self.run_id}, "
94
+ f"pipeline_id={self.pipeline_id})"
95
+ )
96
+ else:
97
+ logger.warning(
98
+ "DeepFabric callback initialized but no pipeline_id set. "
99
+ "Metrics will not be sent. Set DEEPFABRIC_PIPELINE_ID env var "
100
+ "or create pipeline_id.txt file."
101
+ )
86
102
  else:
87
103
  logger.debug("DeepFabric callback disabled (no API key)")
88
104
 
@@ -101,6 +117,7 @@ class DeepFabricCallback:
101
117
  return
102
118
 
103
119
  self._run_started = True
120
+ self._start_time = datetime.now(timezone.utc)
104
121
 
105
122
  # Extract model name from various sources
106
123
  model = kwargs.get("model")
@@ -121,6 +138,7 @@ class DeepFabricCallback:
121
138
  "num_train_epochs": state.num_train_epochs,
122
139
  "is_world_process_zero": getattr(state, "is_world_process_zero", True),
123
140
  },
141
+ "started_at": self._start_time.isoformat(),
124
142
  }
125
143
  )
126
144
 
@@ -204,6 +222,8 @@ class DeepFabricCallback:
204
222
  if not self.enabled or not self._run_started:
205
223
  return
206
224
 
225
+ completed_at = datetime.now(timezone.utc)
226
+
207
227
  self.sender.send_run_end(
208
228
  {
209
229
  "run_id": self.run_id,
@@ -212,6 +232,7 @@ class DeepFabricCallback:
212
232
  "total_flos": getattr(state, "total_flos", None),
213
233
  "best_metric": getattr(state, "best_metric", None),
214
234
  "best_model_checkpoint": getattr(state, "best_model_checkpoint", None),
235
+ "completed_at": completed_at.isoformat(),
215
236
  }
216
237
  )
217
238
 
@@ -246,6 +267,27 @@ class DeepFabricCallback:
246
267
  }
247
268
  )
248
269
 
270
+ def _get_pipeline_id(self) -> str | None:
271
+ """Get pipeline ID from environment or file.
272
+
273
+ Returns:
274
+ Pipeline ID or None
275
+ """
276
+ # Try environment variable first
277
+ pipeline_id = os.getenv("DEEPFABRIC_PIPELINE_ID", "")
278
+ if pipeline_id:
279
+ return pipeline_id
280
+
281
+ # Try pipeline_id.txt file
282
+ pipeline_file = "pipeline_id.txt"
283
+ if os.path.exists(pipeline_file):
284
+ with open(pipeline_file) as f:
285
+ pipeline_id = f.read().strip()
286
+ if pipeline_id:
287
+ return pipeline_id
288
+
289
+ return None
290
+
249
291
  def _extract_model_name(self, args: TrainingArguments, model: Any | None) -> str | None:
250
292
  """Extract model name from various sources.
251
293
 
@@ -35,6 +35,7 @@ class MetricsSender:
35
35
  self,
36
36
  endpoint: str,
37
37
  api_key: str | None,
38
+ pipeline_id: str | None = None,
38
39
  batch_size: int = 10,
39
40
  flush_interval: float = 5.0,
40
41
  max_queue_size: int = 1000,
@@ -45,6 +46,7 @@ class MetricsSender:
45
46
  Args:
46
47
  endpoint: Base URL for the DeepFabric API
47
48
  api_key: API key for authentication (None disables sending)
49
+ pipeline_id: Pipeline ID to associate training runs with (required)
48
50
  batch_size: Number of metrics to batch before sending
49
51
  flush_interval: Seconds between automatic flushes
50
52
  max_queue_size: Maximum queue size (overflow drops metrics)
@@ -52,12 +54,14 @@ class MetricsSender:
52
54
  """
53
55
  self.endpoint = endpoint.rstrip("/")
54
56
  self.api_key = api_key
57
+ self.pipeline_id = pipeline_id
55
58
  self.batch_size = batch_size
56
59
  self.flush_interval = flush_interval
57
60
  self.timeout = timeout
58
61
 
59
62
  self._queue: queue.Queue[dict[str, Any]] = queue.Queue(maxsize=max_queue_size)
60
63
  self._stop_event = threading.Event()
64
+ self._flush_event = threading.Event()
61
65
  self._enabled = api_key is not None
62
66
 
63
67
  # Start background sender thread
@@ -177,19 +181,25 @@ class MetricsSender:
177
181
  should_flush = (
178
182
  len(batch) >= self.batch_size
179
183
  or (time.monotonic() - last_flush) >= self.flush_interval
184
+ or self._flush_event.is_set()
180
185
  )
181
186
 
182
187
  if should_flush:
183
188
  self._flush_batch(batch)
184
189
  batch = []
185
190
  last_flush = time.monotonic()
191
+ self._flush_event.clear()
186
192
 
187
193
  except queue.Empty:
188
- # Timeout - flush if we have pending items
189
- if batch and (time.monotonic() - last_flush) >= self.flush_interval:
194
+ # Timeout - flush if we have pending items or flush requested
195
+ if batch and (
196
+ (time.monotonic() - last_flush) >= self.flush_interval
197
+ or self._flush_event.is_set()
198
+ ):
190
199
  self._flush_batch(batch)
191
200
  batch = []
192
201
  last_flush = time.monotonic()
202
+ self._flush_event.clear()
193
203
 
194
204
  # On shutdown, drain the queue and flush everything
195
205
  while not self._queue.empty():
@@ -209,21 +219,34 @@ class MetricsSender:
209
219
  if not batch or not self._enabled:
210
220
  return
211
221
 
222
+ if not self.pipeline_id:
223
+ logger.debug("No pipeline_id set, skipping metrics send")
224
+ return
225
+
212
226
  # Separate events and metrics
213
- events = [item for item in batch if item["type"] != "metrics"]
227
+ run_start_events = [item for item in batch if item["type"] == "run_start"]
228
+ run_end_events = [item for item in batch if item["type"] == "run_end"]
214
229
  metrics = [item["data"] for item in batch if item["type"] == "metrics"]
215
230
 
216
- # Send events first (run_start, run_end)
217
- for event in events:
218
- self._send_to_api(
219
- endpoint=f"{self.endpoint}/v1/training/runs",
220
- payload={"event_type": event["type"], **event["data"]},
221
- )
231
+ # Build query string with pipeline_id
232
+ query = f"?pipeline_id={self.pipeline_id}"
233
+
234
+ def send_run_events(events: list[dict[str, Any]]) -> None:
235
+ """Send run start/end events."""
236
+ for event in events:
237
+ self._send_to_api(
238
+ endpoint=f"{self.endpoint}/api/v1/training/runs{query}",
239
+ payload={"event_type": event["type"], **event["data"]},
240
+ )
241
+
242
+ # Send run events, ensuring start events are processed before end events
243
+ send_run_events(run_start_events)
244
+ send_run_events(run_end_events)
222
245
 
223
246
  # Send metrics batch
224
247
  if metrics:
225
248
  self._send_to_api(
226
- endpoint=f"{self.endpoint}/v1/training/metrics",
249
+ endpoint=f"{self.endpoint}/api/v1/training/metrics{query}",
227
250
  payload={"metrics": metrics},
228
251
  )
229
252
  self._metrics_sent += len(metrics)
@@ -252,22 +275,27 @@ class MetricsSender:
252
275
 
253
276
  if not response.ok:
254
277
  self._send_errors += 1
255
- logger.debug(f"API request failed: {response.status_code} {response.text[:100]}")
278
+ logger.warning(
279
+ "API error: %s %s (endpoint: %s)",
280
+ response.status_code,
281
+ response.text[:200],
282
+ endpoint,
283
+ )
256
284
  return False
257
285
 
258
286
  except requests.exceptions.Timeout:
259
287
  self._send_errors += 1
260
- logger.debug("API request timed out")
288
+ logger.warning("Request timed out: %s", endpoint)
261
289
  return False
262
290
 
263
- except requests.exceptions.ConnectionError:
291
+ except requests.exceptions.ConnectionError as e:
264
292
  self._send_errors += 1
265
- logger.debug("API connection error")
293
+ logger.warning("Connection error: %s (endpoint: %s)", e, endpoint)
266
294
  return False
267
295
 
268
296
  except requests.exceptions.RequestException as e:
269
297
  self._send_errors += 1
270
- logger.debug(f"API request error: {e}")
298
+ logger.warning("Request error: %s (endpoint: %s)", e, endpoint)
271
299
  return False
272
300
 
273
301
  else:
@@ -282,8 +310,14 @@ class MetricsSender:
282
310
  if not self._enabled:
283
311
  return
284
312
 
313
+ # Signal the background thread to flush its current batch
314
+ self._flush_event.set()
315
+
285
316
  start = time.monotonic()
286
- while not self._queue.empty() and (time.monotonic() - start) < timeout:
317
+ # Wait for queue to empty and flush event to be cleared (indicates batch was sent)
318
+ while (time.monotonic() - start) < timeout:
319
+ if self._queue.empty() and not self._flush_event.is_set():
320
+ break
287
321
  time.sleep(0.1)
288
322
 
289
323
  def shutdown(self) -> None:
deepfabric/tui.py CHANGED
@@ -41,14 +41,22 @@ class TopicBuildingMixin:
41
41
 
42
42
  Subclasses must have these attributes:
43
43
  - tui: DeepFabricTUI instance
44
+ - live_display: Live | None
44
45
  - live_layout: Layout | None
45
46
  - events_log: deque
46
47
  """
47
48
 
48
49
  tui: "DeepFabricTUI"
50
+ live_display: "Live | None"
49
51
  live_layout: "Layout | None"
50
52
  events_log: "deque"
51
53
 
54
+ def stop_live(self) -> None:
55
+ """Stop the Live display if it's running."""
56
+ if self.live_display:
57
+ self.live_display.stop()
58
+ self.live_display = None
59
+
52
60
  def _refresh_left(self) -> None:
53
61
  """Update events panel in left column."""
54
62
  if self.live_layout is not None:
@@ -910,7 +918,7 @@ class DatasetGenerationTUI(StreamObserver):
910
918
  # Map conversation types to friendly names
911
919
  type_map = {
912
920
  "basic": "Basic Q&A",
913
- "chain_of_thought": "Chain of Thought",
921
+ "cot": "Chain of Thought",
914
922
  "single_turn_agent": "Single-Turn Agent (Tool Calling)",
915
923
  "multi_turn_agent": "Multi-Turn Agent (Tool Calling)",
916
924
  }
deepfabric/utils.py CHANGED
@@ -1,6 +1,7 @@
1
1
  import ast
2
2
  import asyncio
3
3
  import json
4
+ import os
4
5
  import re
5
6
 
6
7
  VALIDATION_ERROR_INDICATORS = [
@@ -147,4 +148,17 @@ def read_topic_tree_from_jsonl(file_path: str) -> list[dict]:
147
148
  with open(file_path) as file:
148
149
  for line in file:
149
150
  topic_tree.append(json.loads(line.strip()))
151
+
150
152
  return topic_tree
153
+
154
+
155
+ def get_bool_env(key: str, default: bool = False) -> bool:
156
+ """Get a boolean environment variable.
157
+
158
+ Supports: '1', 'true', 'yes', 'on' (case-insensitive) as True.
159
+ Everything else is False unless default is True and key is missing.
160
+ """
161
+ val = os.getenv(key)
162
+ if val is None:
163
+ return default
164
+ return val.lower() in ("1", "true", "yes", "on")
deepfabric/validation.py CHANGED
@@ -79,7 +79,7 @@ def validate_path_requirements(
79
79
  for steps, batch in optimal_combinations[:3]: # Show top 3
80
80
  total_samples = steps * batch
81
81
  recommendations.append(
82
- f" --num-steps {steps} --batch-size {batch} (generates {total_samples} samples)"
82
+ f" --num-samples {steps} --batch-size {batch} (generates {total_samples} samples)"
83
83
  )
84
84
 
85
85
  recommendations.extend(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: DeepFabric
3
- Version: 4.5.1
3
+ Version: 4.7.0
4
4
  Summary: Curate High Quality Datasets, Train, Evaluate and Ship
5
5
  Author-email: Luke Hinds <luke@alwaysfurther.ai>
6
6
  License-File: LICENSE
@@ -29,10 +29,12 @@ Requires-Dist: sentencepiece>=0.1.99
29
29
  Requires-Dist: spin-sdk>=3.4.1
30
30
  Requires-Dist: torch>=2.4.0
31
31
  Requires-Dist: transformers>=4.57.1
32
+ Requires-Dist: trl>=0.26.2
32
33
  Provides-Extra: dev
33
34
  Requires-Dist: bandit>=1.7.10; extra == 'dev'
34
35
  Requires-Dist: mermaid-py>=0.2.0; extra == 'dev'
35
36
  Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
37
+ Requires-Dist: pytest-httpx>=0.30.0; extra == 'dev'
36
38
  Requires-Dist: pytest-mock>=3.10.0; extra == 'dev'
37
39
  Requires-Dist: pytest>=7.0.0; extra == 'dev'
38
40
  Requires-Dist: requests-mock>=1.11.0; extra == 'dev'
@@ -45,7 +47,7 @@ Description-Content-Type: text/markdown
45
47
  <div align="center">
46
48
  <picture>
47
49
  <source media="(prefers-color-scheme: dark)" srcset="./assets/logo-light.png" />
48
- <img alt="DeepFabric logo" src="./assets/logo-light-hols.png" style="width:40%;max-width:40%;height:auto;display:block;margin:0 auto;" />
50
+ <img alt="DeepFabric logo" src="./assets/logo-light.png" style="width:40%;max-width:40%;height:auto;display:block;margin:0 auto;" />
49
51
  </picture>
50
52
  <h3>Training Model Behavior in Agentic Systems</h3>
51
53
 
@@ -123,7 +125,15 @@ This generates a topic graph and creates 27 unique nodes, then generates 27 trai
123
125
 
124
126
  ## Configuration
125
127
 
126
- DeepFabric also uses YAML configuration with three main sections and optional shared LLM defaults:
128
+ DeepFabric also uses YAML configuration with three main sections and optional shared LLM defaults:
129
+
130
+ > [!NOTE]
131
+ > The following uses mocked tool execution, so will require a running Spin service, which we provide in a Docker image:
132
+ ```bash
133
+ docker run -d -p 3000:3000 ghcr.io/always-further/deepfabric/tools-sdk:latest
134
+ ```
135
+
136
+ Save the following as `config.yaml`:
127
137
 
128
138
  ```yaml
129
139
  # Optional: Shared LLM defaults (inherited by topics and generation)
@@ -146,34 +156,74 @@ topics:
146
156
  # GENERATION: Create training samples from topics
147
157
  generation:
148
158
  system_prompt: |
149
- You are an expert Python backend developer and technical educator.
159
+ You are an expert Python backend developer specializing in REST API design.
150
160
  Create practical, production-ready code examples with clear explanations.
151
161
  Include error handling, type hints, and follow PEP 8 conventions.
162
+ Use the following tools to read, write, and list files in the virtual filesystem:
163
+ - read_file
164
+ - write_file
165
+ - list_files
152
166
 
153
167
  # Additional instructions for sample generation
154
168
  instructions: |
155
- Focus on real-world scenarios developers encounter daily.
169
+ Focus on real-world scenarios developers encounter daily when building REST APIs with Python.
156
170
  Include both happy path and edge case handling.
157
- Provide context on when and why to use specific patterns.
171
+ Provide context on when and why to use specific patterns or libraries.
172
+ Ensure code is modular, testable, and maintainable.
158
173
 
159
174
  conversation:
160
- type: chain_of_thought # basic | chain_of_thought
161
- reasoning_style: agent # freetext | agent (for chain_of_thought)
175
+ type: cot # basic | cot
176
+ reasoning_style: agent # freetext | agent (for cot)
162
177
  agent_mode: single_turn # single_turn | multi_turn (for agent)
163
178
 
164
179
  # Tool configuration (required for agent modes)
165
180
  tools:
166
181
  spin_endpoint: "http://localhost:3000" # Spin service for tool execution
167
- available: # Filter to specific tools (empty = all VFS tools)
168
- - read_file
169
- - write_file
170
- - list_files
182
+ components: # Map component name to tool names
183
+ builtin: # Routes to /vfs/execute
184
+ - read_file
185
+ - write_file
186
+ - list_files
171
187
  max_per_query: 3 # Maximum tools per query
172
188
  max_agent_steps: 5 # Max ReAct reasoning iterations
173
189
 
174
- max_retries: 3 # Retries for failed generations
175
- sample_retries: 2 # Retries for validation failures
176
- max_tokens: 2000 # Max tokens per generation
190
+ # Optional: Seed initial files into the spin before generation, used for tool calling
191
+ scenario_seed:
192
+ files:
193
+ "Dockerfile": |
194
+ FROM python:3.13
195
+ WORKDIR /usr/local/app
196
+
197
+ # Install the application dependencies
198
+ COPY requirements.txt ./
199
+ RUN pip install --no-cache-dir -r requirements.txt
200
+
201
+ # Copy in the source code
202
+ COPY src ./src
203
+ EXPOSE 8080
204
+
205
+ # Setup an app user so the container doesn't run as the root user
206
+ RUN useradd app
207
+ USER app
208
+
209
+ CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8080"]
210
+ "main.py": |
211
+ def greet(name):
212
+ return f"Hello, {name}!"
213
+
214
+ if __name__ == "__main__":
215
+ print(greet("World"))
216
+ "config.json": |
217
+ {
218
+ "version": "1.0.0",
219
+ "debug": true,
220
+ "max_retries": 3
221
+ }
222
+
223
+ # Generation control and retry settings
224
+ max_retries: 3 # Retries for failed generations
225
+ sample_retries: 2 # Retries for validation failures
226
+ max_tokens: 2000 # Max tokens per generation
177
227
 
178
228
  # Optional: Override shared LLM settings
179
229
  llm:
@@ -193,13 +243,13 @@ output:
193
243
  batch_size: 3 # Parallel generation batch size
194
244
  save_as: "api-dataset.jsonl"
195
245
 
196
- # Optional: Upload to Hugging Face
197
- huggingface:
198
- repository: "your-username/api-dataset-training-name"
199
- tags: ["python", "programming"]
246
+ # Optional: Upload to Hugging Face
247
+ huggingface:
248
+ repository: "your-username/api-dataset-training-name"
249
+ tags: ["python", "programming"]
200
250
  ```
201
251
 
202
- Run with:
252
+ Run generation by passing the `config.yaml` to the CLI:
203
253
 
204
254
  ```bash
205
255
  deepfabric generate config.yaml
@@ -209,6 +259,14 @@ deepfabric generate config.yaml
209
259
 
210
260
  DeepFabric returns standard HuggingFace datasets, making it easy to integrate with any training framework.
211
261
 
262
+ ### Colab Notebooks:
263
+
264
+ A quick way of seeing DeepFabric in action is via our notebooks in the [notebooks/](./notebooks/) folder or on Google Colab:
265
+
266
+ **Qwen4b Blender MCP**:
267
+
268
+ [![Qwen4b Blender MCP](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1EG1V40v5xkJKLf6Ra6W4378vYqlZNVWqb)
269
+
212
270
  ### 1. Generate Dataset
213
271
 
214
272
  ```bash
@@ -327,7 +385,6 @@ config = EvaluatorConfig(
327
385
  model_path="Qwen/Qwen2.5-7B-Instruct", # Base model
328
386
  adapter_path="./output/lora-adapter", # LoRA adapter path
329
387
  backend="transformers",
330
- use_unsloth=True, # Use Unsloth for adapters trained with Unsloth
331
388
  load_in_4bit=True, # 4-bit quantization
332
389
  max_seq_length=2048,
333
390
  ),
@@ -418,159 +475,6 @@ evaluator = Evaluator(config)
418
475
  results = evaluator.evaluate(dataset=eval_dataset)
419
476
  ```
420
477
 
421
- ## Training Metrics
422
-
423
- DeepFabric provides a training callback that automatically logs metrics to the DeepFabric cloud during model training. This enables real-time monitoring and tracking of training runs.
424
-
425
- ### Basic Usage with HuggingFace Trainer
426
-
427
- ```python
428
- from transformers import Trainer, TrainingArguments
429
- from deepfabric import DeepFabricCallback
430
-
431
- # Set up training arguments
432
- training_args = TrainingArguments(
433
- output_dir="./output",
434
- num_train_epochs=3,
435
- per_device_train_batch_size=4,
436
- logging_steps=10,
437
- )
438
-
439
- # Create trainer
440
- trainer = Trainer(
441
- model=model,
442
- args=training_args,
443
- train_dataset=train_dataset,
444
- eval_dataset=eval_dataset,
445
- )
446
-
447
- # Add DeepFabric callback for metrics logging
448
- trainer.add_callback(DeepFabricCallback(trainer))
449
-
450
- # Train - metrics are automatically logged
451
- trainer.train()
452
- ```
453
-
454
- ### Usage with TRL SFTTrainer
455
-
456
- ```python
457
- from trl import SFTTrainer, SFTConfig
458
- from deepfabric import DeepFabricCallback
459
-
460
- trainer = SFTTrainer(
461
- model=model,
462
- tokenizer=tokenizer,
463
- train_dataset=train_dataset,
464
- args=SFTConfig(
465
- output_dir="./output",
466
- num_train_epochs=3,
467
- logging_steps=10,
468
- ),
469
- )
470
-
471
- # Add callback - works with any Trainer-compatible class
472
- trainer.add_callback(DeepFabricCallback(trainer))
473
- trainer.train()
474
- ```
475
-
476
- ### Configuration Options
477
-
478
- ```python
479
- from deepfabric import DeepFabricCallback
480
-
481
- callback = DeepFabricCallback(
482
- trainer=trainer, # Optional: Trainer instance
483
- api_key="your-api-key", # Or set DEEPFABRIC_API_KEY env var
484
- endpoint="https://api.deepfabric.ai", # Custom endpoint (optional)
485
- enabled=True, # Disable to skip logging
486
- )
487
- ```
488
-
489
- ### Environment Variables
490
-
491
- ```bash
492
- # API key for authentication
493
- export DEEPFABRIC_API_KEY="your-api-key"
494
-
495
- # Custom API endpoint (optional)
496
- export DEEPFABRIC_API_URL="https://api.deepfabric.ai"
497
- ```
498
-
499
- ### Logged Metrics
500
-
501
- The callback automatically captures and logs:
502
-
503
- | Metric Type | Examples |
504
- |-------------|----------|
505
- | Training | `loss`, `learning_rate`, `epoch`, `global_step` |
506
- | Throughput | `train_runtime`, `train_samples_per_second` |
507
- | Evaluation | `eval_loss`, `eval_accuracy` (when evaluation is run) |
508
- | TRL-specific | `rewards/chosen`, `rewards/rejected`, `kl_divergence` |
509
- | Checkpoints | Checkpoint save events with step numbers |
510
-
511
- ### Callback Events
512
-
513
- ```python
514
- # The callback hooks into these Trainer events:
515
- # - on_train_begin: Logs run start with training configuration
516
- # - on_log: Logs training metrics (loss, lr, etc.)
517
- # - on_evaluate: Logs evaluation metrics
518
- # - on_save: Logs checkpoint events
519
- # - on_train_end: Logs run completion and flushes pending metrics
520
- ```
521
-
522
- ### Non-Blocking Design
523
-
524
- The callback uses a background thread to send metrics asynchronously, ensuring training is never blocked by network operations:
525
-
526
- ```python
527
- from deepfabric.training import MetricsSender
528
-
529
- # Direct access to sender for advanced use cases
530
- sender = MetricsSender(
531
- endpoint="https://api.deepfabric.ai",
532
- api_key="your-key",
533
- batch_size=10, # Batch metrics before sending
534
- flush_interval=5.0, # Auto-flush every 5 seconds
535
- max_queue_size=1000, # Queue capacity
536
- )
537
-
538
- # Manually send metrics
539
- sender.send_metrics({"custom_metric": 0.95, "step": 100})
540
-
541
- # Flush pending metrics (blocking)
542
- sender.flush(timeout=30.0)
543
-
544
- # Check sender statistics
545
- print(sender.stats)
546
- # {'metrics_sent': 150, 'metrics_dropped': 0, 'send_errors': 0, 'queue_size': 0}
547
- ```
548
-
549
- ### Interactive API Key Prompt
550
-
551
- When running in an interactive environment (Jupyter notebook, terminal) without an API key configured, the callback will prompt for authentication:
552
-
553
- ```python
554
- from deepfabric import DeepFabricCallback
555
-
556
- # If DEEPFABRIC_API_KEY is not set, prompts for login
557
- callback = DeepFabricCallback(trainer)
558
- # > DeepFabric API key not found. Log in to enable cloud metrics.
559
- # > Visit: https://app.deepfabric.ai/signup
560
- ```
561
-
562
- ### Disabling Metrics Logging
563
-
564
- ```python
565
- # Disable via constructor
566
- callback = DeepFabricCallback(trainer, enabled=False)
567
-
568
- # Or set API key to None
569
- callback = DeepFabricCallback(trainer, api_key=None)
570
-
571
- # Or don't set DEEPFABRIC_API_KEY environment variable
572
- ```
573
-
574
478
  ## Providers
575
479
 
576
480
  | Provider | Local/Cloud | Best For |
@@ -628,7 +532,7 @@ Enable tool tracing in your YAML config:
628
532
  ```yaml
629
533
  generation:
630
534
  conversation:
631
- type: chain_of_thought
535
+ type: cot
632
536
  reasoning_style: agent
633
537
  agent_mode: single_turn
634
538