deepfabric-4.4.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. deepfabric/__init__.py +70 -0
  2. deepfabric/__main__.py +6 -0
  3. deepfabric/auth.py +382 -0
  4. deepfabric/builders.py +303 -0
  5. deepfabric/builders_agent.py +1304 -0
  6. deepfabric/cli.py +1288 -0
  7. deepfabric/config.py +899 -0
  8. deepfabric/config_manager.py +251 -0
  9. deepfabric/constants.py +94 -0
  10. deepfabric/dataset_manager.py +534 -0
  11. deepfabric/error_codes.py +581 -0
  12. deepfabric/evaluation/__init__.py +47 -0
  13. deepfabric/evaluation/backends/__init__.py +32 -0
  14. deepfabric/evaluation/backends/ollama_backend.py +137 -0
  15. deepfabric/evaluation/backends/tool_call_parsers.py +409 -0
  16. deepfabric/evaluation/backends/transformers_backend.py +326 -0
  17. deepfabric/evaluation/evaluator.py +845 -0
  18. deepfabric/evaluation/evaluators/__init__.py +13 -0
  19. deepfabric/evaluation/evaluators/base.py +104 -0
  20. deepfabric/evaluation/evaluators/builtin/__init__.py +5 -0
  21. deepfabric/evaluation/evaluators/builtin/tool_calling.py +93 -0
  22. deepfabric/evaluation/evaluators/registry.py +66 -0
  23. deepfabric/evaluation/inference.py +155 -0
  24. deepfabric/evaluation/metrics.py +397 -0
  25. deepfabric/evaluation/parser.py +304 -0
  26. deepfabric/evaluation/reporters/__init__.py +13 -0
  27. deepfabric/evaluation/reporters/base.py +56 -0
  28. deepfabric/evaluation/reporters/cloud_reporter.py +195 -0
  29. deepfabric/evaluation/reporters/file_reporter.py +61 -0
  30. deepfabric/evaluation/reporters/multi_reporter.py +56 -0
  31. deepfabric/exceptions.py +67 -0
  32. deepfabric/factory.py +26 -0
  33. deepfabric/generator.py +1084 -0
  34. deepfabric/graph.py +545 -0
  35. deepfabric/hf_hub.py +214 -0
  36. deepfabric/kaggle_hub.py +219 -0
  37. deepfabric/llm/__init__.py +41 -0
  38. deepfabric/llm/api_key_verifier.py +534 -0
  39. deepfabric/llm/client.py +1206 -0
  40. deepfabric/llm/errors.py +105 -0
  41. deepfabric/llm/rate_limit_config.py +262 -0
  42. deepfabric/llm/rate_limit_detector.py +278 -0
  43. deepfabric/llm/retry_handler.py +270 -0
  44. deepfabric/metrics.py +212 -0
  45. deepfabric/progress.py +262 -0
  46. deepfabric/prompts.py +290 -0
  47. deepfabric/schemas.py +1000 -0
  48. deepfabric/spin/__init__.py +6 -0
  49. deepfabric/spin/client.py +263 -0
  50. deepfabric/spin/models.py +26 -0
  51. deepfabric/stream_simulator.py +90 -0
  52. deepfabric/tools/__init__.py +5 -0
  53. deepfabric/tools/defaults.py +85 -0
  54. deepfabric/tools/loader.py +87 -0
  55. deepfabric/tools/mcp_client.py +677 -0
  56. deepfabric/topic_manager.py +303 -0
  57. deepfabric/topic_model.py +20 -0
  58. deepfabric/training/__init__.py +35 -0
  59. deepfabric/training/api_key_prompt.py +302 -0
  60. deepfabric/training/callback.py +363 -0
  61. deepfabric/training/metrics_sender.py +301 -0
  62. deepfabric/tree.py +438 -0
  63. deepfabric/tui.py +1267 -0
  64. deepfabric/update_checker.py +166 -0
  65. deepfabric/utils.py +150 -0
  66. deepfabric/validation.py +143 -0
  67. deepfabric-4.4.0.dist-info/METADATA +702 -0
  68. deepfabric-4.4.0.dist-info/RECORD +71 -0
  69. deepfabric-4.4.0.dist-info/WHEEL +4 -0
  70. deepfabric-4.4.0.dist-info/entry_points.txt +2 -0
  71. deepfabric-4.4.0.dist-info/licenses/LICENSE +201 -0
deepfabric/training/callback.py
@@ -0,0 +1,363 @@
+"""DeepFabric TrainerCallback for automatic metrics logging."""
+
+from __future__ import annotations
+
+import logging
+import os
+import uuid
+
+from datetime import datetime, timezone
+from typing import TYPE_CHECKING, Any
+
+from .api_key_prompt import get_api_key
+from .metrics_sender import MetricsSender
+
+if TYPE_CHECKING:
+    from transformers import TrainerControl, TrainerState
+    from transformers.training_args import TrainingArguments
+
+logger = logging.getLogger(__name__)
+
+
+class DeepFabricCallback:
+    """Callback that sends training metrics to DeepFabric SaaS.
+
+    This callback integrates with HuggingFace Trainer and TRL trainers to
+    automatically log training metrics (loss, learning rate, epoch, global step,
+    throughput, TRL-specific metrics, and custom metrics) to the DeepFabric
+    backend.
+
+    The callback is designed to be non-blocking and gracefully handles failures
+    without impacting training.
+
+    Example:
+        from deepfabric.training import DeepFabricCallback
+
+        trainer = Trainer(
+            model=model,
+            args=training_args,
+            train_dataset=train_dataset,
+        )
+        trainer.add_callback(DeepFabricCallback(trainer))
+        trainer.train()
+
+    Environment Variables:
+        DEEPFABRIC_API_KEY: API key (alternative to constructor arg)
+        DEEPFABRIC_API_URL: Backend URL (default: https://api.deepfabric.ai)
+    """
+
+    def __init__(
+        self,
+        trainer: Any | None = None,
+        api_key: str | None = None,
+        endpoint: str | None = None,
+        enabled: bool = True,
+    ):
+        """Initialize the DeepFabric callback.
+
+        Args:
+            trainer: Optional Trainer instance to extract model info from
+            api_key: DeepFabric API key (falls back to DEEPFABRIC_API_KEY env var,
+                then prompts in interactive environments)
+            endpoint: API endpoint URL (falls back to DEEPFABRIC_API_URL env var)
+            enabled: Whether logging is enabled (default: True)
+        """
+        # Get API key from arg, env, or prompt
+        self.api_key = api_key or get_api_key()
+        self.endpoint = endpoint or os.getenv("DEEPFABRIC_API_URL", "https://api.deepfabric.ai")
+        self.run_id = str(uuid.uuid4())
+        self.enabled = enabled and self.api_key is not None
+
+        # Store trainer reference for model extraction
+        self._trainer = trainer
+
+        # Initialize sender (handles None api_key gracefully)
+        self.sender = MetricsSender(
+            endpoint=self.endpoint,
+            api_key=self.api_key if self.enabled else None,
+        )
+
+        self._run_started = False
+        self._model_name: str | None = None
+        self._training_args_logged = False
+
+        if self.enabled:
+            logger.debug(f"DeepFabric callback initialized (run_id={self.run_id})")
+        else:
+            logger.debug("DeepFabric callback disabled (no API key)")
+
+    def on_train_begin(
+        self,
+        args: TrainingArguments,
+        state: TrainerState,
+        control: TrainerControl,  # noqa: ARG002
+        **kwargs: Any,
+    ) -> None:
+        """Called at the beginning of training.
+
+        Sends run start event with training configuration.
+        """
+        if not self.enabled or self._run_started:
+            return
+
+        self._run_started = True
+
+        # Extract model name from various sources
+        model = kwargs.get("model")
+        if model is None and self._trainer is not None:
+            model = getattr(self._trainer, "model", None)
+        self._model_name = self._extract_model_name(args, model)
+
+        # Build training args dict (safe extraction)
+        training_config = self._extract_training_args(args)
+
+        self.sender.send_run_start(
+            {
+                "run_id": self.run_id,
+                "model_name": self._model_name,
+                "training_config": training_config,
+                "state": {
+                    "max_steps": state.max_steps,
+                    "num_train_epochs": state.num_train_epochs,
+                    "is_world_process_zero": getattr(state, "is_world_process_zero", True),
+                },
+            }
+        )
+
+    def on_log(
+        self,
+        args: TrainingArguments,  # noqa: ARG002
+        state: TrainerState,
+        control: TrainerControl,  # noqa: ARG002
+        logs: dict[str, float] | None = None,
+        **kwargs: Any,  # noqa: ARG002
+    ) -> None:
+        """Called when metrics are logged.
+
+        Sends all logged metrics to DeepFabric (loss, learning_rate, epoch,
+        global_step, throughput, TRL metrics, custom metrics, etc.).
+        """
+        if not self.enabled or logs is None:
+            return
+
+        # Keep numeric values and, for metadata, string values; drop None and
+        # other types
+        filtered_logs = {}
+        for key, value in logs.items():
+            if value is not None:
+                if isinstance(value, int | float):
+                    filtered_logs[key] = value
+                elif isinstance(value, str):
+                    # Keep string values for metadata
+                    filtered_logs[key] = value
+
+        if not filtered_logs:
+            return
+
+        payload = {
+            "run_id": self.run_id,
+            "global_step": state.global_step,
+            "epoch": state.epoch,
+            "timestamp": datetime.now(timezone.utc).isoformat(),
+            "type": "log",
+            "metrics": filtered_logs,
+        }
+
+        self.sender.send_metrics(payload)
+
+    def on_evaluate(
+        self,
+        args: TrainingArguments,  # noqa: ARG002
+        state: TrainerState,
+        control: TrainerControl,  # noqa: ARG002
+        metrics: dict[str, float] | None = None,
+        **kwargs: Any,  # noqa: ARG002
+    ) -> None:
+        """Called after evaluation.
+
+        Sends evaluation metrics to DeepFabric.
+        """
+        if not self.enabled or metrics is None:
+            return
+
+        payload = {
+            "run_id": self.run_id,
+            "global_step": state.global_step,
+            "epoch": state.epoch,
+            "timestamp": datetime.now(timezone.utc).isoformat(),
+            "type": "eval",
+            "metrics": metrics,
+        }
+
+        self.sender.send_metrics(payload)
+
+    def on_train_end(
+        self,
+        args: TrainingArguments,  # noqa: ARG002
+        state: TrainerState,
+        control: TrainerControl,  # noqa: ARG002
+        **kwargs: Any,  # noqa: ARG002
+    ) -> None:
+        """Called at the end of training.
+
+        Sends run end event and flushes pending metrics.
+        """
+        if not self.enabled or not self._run_started:
+            return
+
+        self.sender.send_run_end(
+            {
+                "run_id": self.run_id,
+                "final_step": state.global_step,
+                "final_epoch": state.epoch,
+                "total_flos": getattr(state, "total_flos", None),
+                "best_metric": getattr(state, "best_metric", None),
+                "best_model_checkpoint": getattr(state, "best_model_checkpoint", None),
+            }
+        )
+
+        # Flush remaining metrics
+        self.sender.flush(timeout=30.0)
+
+        logger.debug(f"DeepFabric run completed: {self.sender.stats}")
+
+    def on_save(
+        self,
+        args: TrainingArguments,  # noqa: ARG002
+        state: TrainerState,
+        control: TrainerControl,  # noqa: ARG002
+        **kwargs: Any,  # noqa: ARG002
+    ) -> None:
+        """Called when a checkpoint is saved.
+
+        Optionally logs checkpoint events.
+        """
+        if not self.enabled:
+            return
+
+        # Log checkpoint event
+        self.sender.send_metrics(
+            {
+                "run_id": self.run_id,
+                "global_step": state.global_step,
+                "epoch": state.epoch,
+                "timestamp": datetime.now(timezone.utc).isoformat(),
+                "type": "checkpoint",
+                "metrics": {"checkpoint_step": state.global_step},
+            }
+        )
+
+    def _extract_model_name(self, args: TrainingArguments, model: Any | None) -> str | None:
+        """Extract model name from various sources.
+
+        Args:
+            args: Training arguments
+            model: Model instance (may be None)
+
+        Returns:
+            Model name or None
+        """
+        # Try args first
+        if hasattr(args, "model_name_or_path"):
+            return args.model_name_or_path
+
+        # Try model config
+        if model is not None:
+            if hasattr(model, "config") and hasattr(model.config, "name_or_path"):
+                return model.config.name_or_path
+            if hasattr(model, "name_or_path"):
+                return model.name_or_path
+
+        # Try output_dir as fallback
+        if hasattr(args, "output_dir"):
+            return os.path.basename(args.output_dir)
+
+        return None
+
+    def _extract_training_args(self, args: TrainingArguments) -> dict[str, Any]:
+        """Extract training arguments for logging.
+
+        Args:
+            args: Training arguments
+
+        Returns:
+            Dictionary of training configuration
+        """
+        config = {}
+
+        # Core training args
+        safe_attrs = [
+            "num_train_epochs",
+            "max_steps",
+            "per_device_train_batch_size",
+            "per_device_eval_batch_size",
+            "gradient_accumulation_steps",
+            "learning_rate",
+            "weight_decay",
+            "adam_beta1",
+            "adam_beta2",
+            "adam_epsilon",
+            "max_grad_norm",
+            "warmup_steps",
+            "warmup_ratio",
+            "lr_scheduler_type",
+            "logging_steps",
+            "eval_steps",
+            "save_steps",
+            "seed",
+            "fp16",
+            "bf16",
+            "gradient_checkpointing",
+            "deepspeed",
+            "local_rank",
+            "dataloader_num_workers",
+        ]
+
+        for attr in safe_attrs:
+            if hasattr(args, attr):
+                value = getattr(args, attr)
+                # Convert enums to strings
+                if hasattr(value, "value"):
+                    value = value.value
+                config[attr] = value
+
+        return config
+
+
+# Make it compatible with transformers TrainerCallback protocol
+# by ensuring it has all required methods (even as no-ops)
+def _ensure_trainer_callback_compatibility():
+    """Ensure DeepFabricCallback has all TrainerCallback methods."""
+    # These methods are optional but good to have for completeness
+    # Include all methods that transformers might call on callbacks
+    optional_methods = [
+        "on_step_begin",
+        "on_step_end",
+        "on_substep_end",
+        "on_epoch_begin",
+        "on_epoch_end",
+        "on_prediction_step",
+        "on_init_end",
+        # Newer transformers versions
+        "on_pre_optimizer_step",
+        "on_optimizer_step",
+        "on_post_optimizer_step",
+        "on_pre_scheduler_step",
+        "on_scheduler_step",
+        "on_post_scheduler_step",
+    ]
+
+    def _make_noop(name):
+        """Create a no-op method that returns control unchanged."""
+
+        def noop(self, args, state, control, **kwargs):  # noqa: ARG001
+            return control
+
+        noop.__name__ = name
+        return noop
+
+    for method in optional_methods:
+        if not hasattr(DeepFabricCallback, method):
+            setattr(DeepFabricCallback, method, _make_noop(method))
+
+
+_ensure_trainer_callback_compatibility()
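Because _ensure_trainer_callback_compatibility() patches the no-op hooks in at import time, DeepFabricCallback satisfies the TrainerCallback protocol without inheriting from it. A minimal sketch of the disabled path follows; it is an illustration, not part of the package, and assumes a non-interactive environment with no DEEPFABRIC_API_KEY set, so that get_api_key() returns None:

    from deepfabric.training import DeepFabricCallback

    # With no API key available, the callback disables itself and every hook
    # becomes a cheap no-op, so it is safe to leave wired into a Trainer.
    cb = DeepFabricCallback()
    assert cb.enabled is False

    # The compatibility shim guarantees the hooks transformers may invoke exist.
    assert hasattr(cb, "on_step_end")
    assert hasattr(cb, "on_epoch_begin")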
deepfabric/training/metrics_sender.py
@@ -0,0 +1,301 @@
+"""Non-blocking async metrics sender for training metrics."""
+
+from __future__ import annotations
+
+import atexit
+import logging
+import queue
+import threading
+import time
+
+from datetime import datetime, timezone
+from typing import Any
+
+import requests
+
+logger = logging.getLogger(__name__)
+
+
+class MetricsSender:
+    """Non-blocking metrics sender with background thread.
+
+    Queues metrics and sends them in batches via a background thread to avoid
+    blocking training. Gracefully handles network errors and queue overflow.
+
+    Example:
+        sender = MetricsSender(
+            endpoint="https://api.deepfabric.ai",
+            api_key="your-api-key",
+        )
+        sender.send_metrics({"loss": 2.5, "step": 100})
+        sender.flush()  # Ensure all metrics are sent
+    """
+
+    def __init__(
+        self,
+        endpoint: str,
+        api_key: str | None,
+        batch_size: int = 10,
+        flush_interval: float = 5.0,
+        max_queue_size: int = 1000,
+        timeout: float = 10.0,
+    ):
+        """Initialize the metrics sender.
+
+        Args:
+            endpoint: Base URL for the DeepFabric API
+            api_key: API key for authentication (None disables sending)
+            batch_size: Number of metrics to batch before sending
+            flush_interval: Seconds between automatic flushes
+            max_queue_size: Maximum queue size (overflow drops metrics)
+            timeout: HTTP request timeout in seconds
+        """
+        self.endpoint = endpoint.rstrip("/")
+        self.api_key = api_key
+        self.batch_size = batch_size
+        self.flush_interval = flush_interval
+        self.timeout = timeout
+
+        self._queue: queue.Queue[dict[str, Any]] = queue.Queue(maxsize=max_queue_size)
+        self._stop_event = threading.Event()
+        self._enabled = api_key is not None
+
+        # Start background sender thread
+        if self._enabled:
+            self._thread = threading.Thread(
+                target=self._sender_loop,
+                daemon=True,
+                name="deepfabric-metrics-sender",
+            )
+            self._thread.start()
+            atexit.register(self.shutdown)
+        else:
+            self._thread = None
+
+        self._send_errors = 0
+        self._metrics_sent = 0
+        self._metrics_dropped = 0
+
+    @property
+    def enabled(self) -> bool:
+        """Whether the sender is enabled (has API key)."""
+        return self._enabled
+
+    @property
+    def stats(self) -> dict[str, int]:
+        """Get sender statistics."""
+        return {
+            "metrics_sent": self._metrics_sent,
+            "metrics_dropped": self._metrics_dropped,
+            "send_errors": self._send_errors,
+            "queue_size": self._queue.qsize(),
+        }
+
+    def send_metrics(self, metrics: dict[str, Any]) -> bool:
+        """Queue metrics for async sending (non-blocking).
+
+        Args:
+            metrics: Dictionary of metric names to values
+
+        Returns:
+            True if queued successfully, False if dropped
+        """
+        if not self._enabled:
+            return False
+
+        # Add timestamp if not present
+        if "timestamp" not in metrics:
+            metrics["timestamp"] = datetime.now(timezone.utc).isoformat()
+
+        try:
+            self._queue.put_nowait({"type": "metrics", "data": metrics})
+        except queue.Full:
+            self._metrics_dropped += 1
+            logger.debug("Metrics queue full, dropping metrics")
+            return False
+        else:
+            return True
+
+    def send_run_start(self, metadata: dict[str, Any]) -> bool:
+        """Send run start event.
+
+        Args:
+            metadata: Run metadata (model_name, training_args, etc.)
+
+        Returns:
+            True if queued successfully
+        """
+        return self._send_event("run_start", metadata)
+
+    def send_run_end(self, metadata: dict[str, Any]) -> bool:
+        """Send run end event.
+
+        Args:
+            metadata: Run end metadata (final_step, final_epoch, etc.)
+
+        Returns:
+            True if queued successfully
+        """
+        return self._send_event("run_end", metadata)
+
+    def _send_event(self, event_type: str, data: dict[str, Any]) -> bool:
+        """Queue an event for sending.
+
+        Args:
+            event_type: Type of event (run_start, run_end, etc.)
+            data: Event data
+
+        Returns:
+            True if queued successfully
+        """
+        if not self._enabled:
+            return False
+
+        if "timestamp" not in data:
+            data["timestamp"] = datetime.now(timezone.utc).isoformat()
+
+        try:
+            self._queue.put_nowait({"type": event_type, "data": data})
+        except queue.Full:
+            logger.debug(f"Queue full, dropping {event_type} event")
+            return False
+        else:
+            return True
+
+    def _sender_loop(self) -> None:
+        """Background thread that batches and sends metrics."""
+        batch: list[dict[str, Any]] = []
+        last_flush = time.monotonic()
+
+        while not self._stop_event.is_set():
+            try:
+                # Wait for item with timeout
+                item = self._queue.get(timeout=min(1.0, self.flush_interval))
+                batch.append(item)
+
+                # Check if we should flush
+                should_flush = (
+                    len(batch) >= self.batch_size
+                    or (time.monotonic() - last_flush) >= self.flush_interval
+                )
+
+                if should_flush:
+                    self._flush_batch(batch)
+                    batch = []
+                    last_flush = time.monotonic()
+
+            except queue.Empty:
+                # Timeout - flush if we have pending items
+                if batch and (time.monotonic() - last_flush) >= self.flush_interval:
+                    self._flush_batch(batch)
+                    batch = []
+                    last_flush = time.monotonic()
+
+        # On shutdown, drain the queue and flush everything
+        while not self._queue.empty():
+            try:
+                batch.append(self._queue.get_nowait())
+            except queue.Empty:
+                break
+        if batch:
+            self._flush_batch(batch)
+
+    def _flush_batch(self, batch: list[dict[str, Any]]) -> None:
+        """Send batch of metrics to API.
+
+        Args:
+            batch: List of queued items to send
+        """
+        if not batch or not self._enabled:
+            return
+
+        # Separate events and metrics
+        events = [item for item in batch if item["type"] != "metrics"]
+        metrics = [item["data"] for item in batch if item["type"] == "metrics"]
+
+        # Send events first (run_start, run_end)
+        for event in events:
+            self._send_to_api(
+                endpoint=f"{self.endpoint}/v1/training/runs",
+                payload={"event_type": event["type"], **event["data"]},
+            )
+
+        # Send metrics batch
+        if metrics:
+            self._send_to_api(
+                endpoint=f"{self.endpoint}/v1/training/metrics",
+                payload={"metrics": metrics},
+            )
+            self._metrics_sent += len(metrics)
+
+    def _send_to_api(self, endpoint: str, payload: dict[str, Any]) -> bool:
+        """Send payload to API endpoint.
+
+        Args:
+            endpoint: Full API endpoint URL
+            payload: JSON payload to send
+
+        Returns:
+            True if sent successfully
+        """
+        try:
+            response = requests.post(
+                endpoint,
+                json=payload,
+                headers={
+                    "Authorization": f"Bearer {self.api_key}",
+                    "Content-Type": "application/json",
+                    "User-Agent": "deepfabric-training/1.0",
+                },
+                timeout=self.timeout,
+            )
+
+            if not response.ok:
+                self._send_errors += 1
+                logger.debug(f"API request failed: {response.status_code} {response.text[:100]}")
+                return False
+
+        except requests.exceptions.Timeout:
+            self._send_errors += 1
+            logger.debug("API request timed out")
+            return False
+
+        except requests.exceptions.ConnectionError:
+            self._send_errors += 1
+            logger.debug("API connection error")
+            return False
+
+        except requests.exceptions.RequestException as e:
+            self._send_errors += 1
+            logger.debug(f"API request error: {e}")
+            return False
+
+        else:
+            return True
+
+    def flush(self, timeout: float = 30.0) -> None:
+        """Flush all pending metrics (blocking).
+
+        Args:
+            timeout: Maximum time to wait for flush
+        """
+        if not self._enabled:
+            return
+
+        start = time.monotonic()
+        while not self._queue.empty() and (time.monotonic() - start) < timeout:
+            time.sleep(0.1)
+
+    def shutdown(self) -> None:
+        """Stop the sender thread and flush remaining metrics."""
+        if not self._enabled or self._thread is None:
+            return
+
+        self._stop_event.set()
+        self._thread.join(timeout=5.0)
+
+        # Log final stats
+        logger.debug(
+            f"MetricsSender shutdown: sent={self._metrics_sent}, "
+            f"dropped={self._metrics_dropped}, errors={self._send_errors}"
+        )
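MetricsSender has no dependency on transformers, so it can also back a hand-rolled training loop. A minimal usage sketch, not taken from the package: the endpoint and key below are placeholders, and the payload shape merely mirrors what DeepFabricCallback.on_log sends.

    from deepfabric.training.metrics_sender import MetricsSender

    sender = MetricsSender(
        endpoint="https://api.deepfabric.ai",  # placeholder; DEEPFABRIC_API_URL in practice
        api_key="df-example-key",              # placeholder; None would disable sending
    )

    for step in range(100):
        loss = 1.0 / (step + 1)  # stand-in for a real training step
        sender.send_metrics({"type": "log", "global_step": step, "metrics": {"loss": loss}})

    sender.flush(timeout=30.0)  # block (up to the timeout) until the queue drains
    print(sender.stats)         # e.g. {'metrics_sent': 100, 'metrics_dropped': 0, ...}
    sender.shutdown()           # also registered via atexit when enabled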