deepfabric-4.4.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepfabric/__init__.py +70 -0
- deepfabric/__main__.py +6 -0
- deepfabric/auth.py +382 -0
- deepfabric/builders.py +303 -0
- deepfabric/builders_agent.py +1304 -0
- deepfabric/cli.py +1288 -0
- deepfabric/config.py +899 -0
- deepfabric/config_manager.py +251 -0
- deepfabric/constants.py +94 -0
- deepfabric/dataset_manager.py +534 -0
- deepfabric/error_codes.py +581 -0
- deepfabric/evaluation/__init__.py +47 -0
- deepfabric/evaluation/backends/__init__.py +32 -0
- deepfabric/evaluation/backends/ollama_backend.py +137 -0
- deepfabric/evaluation/backends/tool_call_parsers.py +409 -0
- deepfabric/evaluation/backends/transformers_backend.py +326 -0
- deepfabric/evaluation/evaluator.py +845 -0
- deepfabric/evaluation/evaluators/__init__.py +13 -0
- deepfabric/evaluation/evaluators/base.py +104 -0
- deepfabric/evaluation/evaluators/builtin/__init__.py +5 -0
- deepfabric/evaluation/evaluators/builtin/tool_calling.py +93 -0
- deepfabric/evaluation/evaluators/registry.py +66 -0
- deepfabric/evaluation/inference.py +155 -0
- deepfabric/evaluation/metrics.py +397 -0
- deepfabric/evaluation/parser.py +304 -0
- deepfabric/evaluation/reporters/__init__.py +13 -0
- deepfabric/evaluation/reporters/base.py +56 -0
- deepfabric/evaluation/reporters/cloud_reporter.py +195 -0
- deepfabric/evaluation/reporters/file_reporter.py +61 -0
- deepfabric/evaluation/reporters/multi_reporter.py +56 -0
- deepfabric/exceptions.py +67 -0
- deepfabric/factory.py +26 -0
- deepfabric/generator.py +1084 -0
- deepfabric/graph.py +545 -0
- deepfabric/hf_hub.py +214 -0
- deepfabric/kaggle_hub.py +219 -0
- deepfabric/llm/__init__.py +41 -0
- deepfabric/llm/api_key_verifier.py +534 -0
- deepfabric/llm/client.py +1206 -0
- deepfabric/llm/errors.py +105 -0
- deepfabric/llm/rate_limit_config.py +262 -0
- deepfabric/llm/rate_limit_detector.py +278 -0
- deepfabric/llm/retry_handler.py +270 -0
- deepfabric/metrics.py +212 -0
- deepfabric/progress.py +262 -0
- deepfabric/prompts.py +290 -0
- deepfabric/schemas.py +1000 -0
- deepfabric/spin/__init__.py +6 -0
- deepfabric/spin/client.py +263 -0
- deepfabric/spin/models.py +26 -0
- deepfabric/stream_simulator.py +90 -0
- deepfabric/tools/__init__.py +5 -0
- deepfabric/tools/defaults.py +85 -0
- deepfabric/tools/loader.py +87 -0
- deepfabric/tools/mcp_client.py +677 -0
- deepfabric/topic_manager.py +303 -0
- deepfabric/topic_model.py +20 -0
- deepfabric/training/__init__.py +35 -0
- deepfabric/training/api_key_prompt.py +302 -0
- deepfabric/training/callback.py +363 -0
- deepfabric/training/metrics_sender.py +301 -0
- deepfabric/tree.py +438 -0
- deepfabric/tui.py +1267 -0
- deepfabric/update_checker.py +166 -0
- deepfabric/utils.py +150 -0
- deepfabric/validation.py +143 -0
- deepfabric-4.4.0.dist-info/METADATA +702 -0
- deepfabric-4.4.0.dist-info/RECORD +71 -0
- deepfabric-4.4.0.dist-info/WHEEL +4 -0
- deepfabric-4.4.0.dist-info/entry_points.txt +2 -0
- deepfabric-4.4.0.dist-info/licenses/LICENSE +201 -0
deepfabric/training/callback.py
@@ -0,0 +1,363 @@
"""DeepFabric TrainerCallback for automatic metrics logging."""

from __future__ import annotations

import logging
import os
import uuid

from datetime import datetime, timezone
from typing import TYPE_CHECKING, Any

from .api_key_prompt import get_api_key
from .metrics_sender import MetricsSender

if TYPE_CHECKING:
    from transformers import TrainerControl, TrainerState
    from transformers.training_args import TrainingArguments

logger = logging.getLogger(__name__)


class DeepFabricCallback:
    """Callback that sends training metrics to DeepFabric SaaS.

    This callback integrates with HuggingFace Trainer and TRL trainers to
    automatically log training metrics (loss, learning rate, epoch, global step,
    throughput, TRL-specific metrics, and custom metrics) to the DeepFabric
    backend.

    The callback is designed to be non-blocking and gracefully handles failures
    without impacting training.

    Example:
        from deepfabric.training import DeepFabricCallback

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
        )
        trainer.add_callback(DeepFabricCallback(trainer))
        trainer.train()

    Environment Variables:
        DEEPFABRIC_API_KEY: API key (alternative to constructor arg)
        DEEPFABRIC_API_URL: Backend URL (default: https://api.deepfabric.ai)
    """

    def __init__(
        self,
        trainer: Any | None = None,
        api_key: str | None = None,
        endpoint: str | None = None,
        enabled: bool = True,
    ):
        """Initialize the DeepFabric callback.

        Args:
            trainer: Optional Trainer instance to extract model info from
            api_key: DeepFabric API key (falls back to DEEPFABRIC_API_KEY env var,
                then prompts in interactive environments)
            endpoint: API endpoint URL (falls back to DEEPFABRIC_API_URL env var)
            enabled: Whether logging is enabled (default: True)
        """
        # Get API key from arg, env, or prompt
        self.api_key = api_key or get_api_key()
        self.endpoint = endpoint or os.getenv("DEEPFABRIC_API_URL", "https://api.deepfabric.ai")
        self.run_id = str(uuid.uuid4())
        self.enabled = enabled and self.api_key is not None

        # Store trainer reference for model extraction
        self._trainer = trainer

        # Initialize sender (handles None api_key gracefully)
        self.sender = MetricsSender(
            endpoint=self.endpoint,
            api_key=self.api_key if self.enabled else None,
        )

        self._run_started = False
        self._model_name: str | None = None
        self._training_args_logged = False

        if self.enabled:
            logger.debug(f"DeepFabric callback initialized (run_id={self.run_id})")
        else:
            logger.debug("DeepFabric callback disabled (no API key)")

    def on_train_begin(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,  # noqa: ARG002
        **kwargs: Any,
    ) -> None:
        """Called at the beginning of training.

        Sends run start event with training configuration.
        """
        if not self.enabled or self._run_started:
            return

        self._run_started = True

        # Extract model name from various sources
        model = kwargs.get("model")
        if model is None and self._trainer is not None:
            model = getattr(self._trainer, "model", None)
        self._model_name = self._extract_model_name(args, model)

        # Build training args dict (safe extraction)
        training_config = self._extract_training_args(args)

        self.sender.send_run_start(
            {
                "run_id": self.run_id,
                "model_name": self._model_name,
                "training_config": training_config,
                "state": {
                    "max_steps": state.max_steps,
                    "num_train_epochs": state.num_train_epochs,
                    "is_world_process_zero": getattr(state, "is_world_process_zero", True),
                },
            }
        )

    def on_log(
        self,
        args: TrainingArguments,  # noqa: ARG002
        state: TrainerState,
        control: TrainerControl,  # noqa: ARG002
        logs: dict[str, float] | None = None,
        **kwargs: Any,  # noqa: ARG002
    ) -> None:
        """Called when metrics are logged.

        Sends all logged metrics to DeepFabric (loss, learning_rate, epoch,
        global_step, throughput, TRL metrics, custom metrics, etc.).
        """
        if not self.enabled or logs is None:
            return

        # Filter out None values and non-numeric values
        filtered_logs = {}
        for key, value in logs.items():
            if value is not None:
                if isinstance(value, int | float):
                    filtered_logs[key] = value
                elif isinstance(value, str):
                    # Keep string values for metadata
                    filtered_logs[key] = value

        if not filtered_logs:
            return

        payload = {
            "run_id": self.run_id,
            "global_step": state.global_step,
            "epoch": state.epoch,
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "type": "log",
            "metrics": filtered_logs,
        }

        self.sender.send_metrics(payload)

    def on_evaluate(
        self,
        args: TrainingArguments,  # noqa: ARG002
        state: TrainerState,
        control: TrainerControl,  # noqa: ARG002
        metrics: dict[str, float] | None = None,
        **kwargs: Any,  # noqa: ARG002
    ) -> None:
        """Called after evaluation.

        Sends evaluation metrics to DeepFabric.
        """
        if not self.enabled or metrics is None:
            return

        payload = {
            "run_id": self.run_id,
            "global_step": state.global_step,
            "epoch": state.epoch,
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "type": "eval",
            "metrics": metrics,
        }

        self.sender.send_metrics(payload)

    def on_train_end(
        self,
        args: TrainingArguments,  # noqa: ARG002
        state: TrainerState,
        control: TrainerControl,  # noqa: ARG002
        **kwargs: Any,  # noqa: ARG002
    ) -> None:
        """Called at the end of training.

        Sends run end event and flushes pending metrics.
        """
        if not self.enabled or not self._run_started:
            return

        self.sender.send_run_end(
            {
                "run_id": self.run_id,
                "final_step": state.global_step,
                "final_epoch": state.epoch,
                "total_flos": getattr(state, "total_flos", None),
                "best_metric": getattr(state, "best_metric", None),
                "best_model_checkpoint": getattr(state, "best_model_checkpoint", None),
            }
        )

        # Flush remaining metrics
        self.sender.flush(timeout=30.0)

        logger.debug(f"DeepFabric run completed: {self.sender.stats}")

    def on_save(
        self,
        args: TrainingArguments,  # noqa: ARG002
        state: TrainerState,
        control: TrainerControl,  # noqa: ARG002
        **kwargs: Any,  # noqa: ARG002
    ) -> None:
        """Called when a checkpoint is saved.

        Optionally logs checkpoint events.
        """
        if not self.enabled:
            return

        # Log checkpoint event
        self.sender.send_metrics(
            {
                "run_id": self.run_id,
                "global_step": state.global_step,
                "epoch": state.epoch,
                "timestamp": datetime.now(timezone.utc).isoformat(),
                "type": "checkpoint",
                "metrics": {"checkpoint_step": state.global_step},
            }
        )

    def _extract_model_name(self, args: TrainingArguments, model: Any | None) -> str | None:
        """Extract model name from various sources.

        Args:
            args: Training arguments
            model: Model instance (may be None)

        Returns:
            Model name or None
        """
        # Try args first
        if hasattr(args, "model_name_or_path"):
            return args.model_name_or_path

        # Try model config
        if model is not None:
            if hasattr(model, "config") and hasattr(model.config, "name_or_path"):
                return model.config.name_or_path
            if hasattr(model, "name_or_path"):
                return model.name_or_path

        # Try output_dir as fallback
        if hasattr(args, "output_dir"):
            return os.path.basename(args.output_dir)

        return None

    def _extract_training_args(self, args: TrainingArguments) -> dict[str, Any]:
        """Extract training arguments for logging.

        Args:
            args: Training arguments

        Returns:
            Dictionary of training configuration
        """
        config = {}

        # Core training args
        safe_attrs = [
            "num_train_epochs",
            "max_steps",
            "per_device_train_batch_size",
            "per_device_eval_batch_size",
            "gradient_accumulation_steps",
            "learning_rate",
            "weight_decay",
            "adam_beta1",
            "adam_beta2",
            "adam_epsilon",
            "max_grad_norm",
            "warmup_steps",
            "warmup_ratio",
            "lr_scheduler_type",
            "logging_steps",
            "eval_steps",
            "save_steps",
            "seed",
            "fp16",
            "bf16",
            "gradient_checkpointing",
            "deepspeed",
            "local_rank",
            "dataloader_num_workers",
        ]

        for attr in safe_attrs:
            if hasattr(args, attr):
                value = getattr(args, attr)
                # Convert enums to strings
                if hasattr(value, "value"):
                    value = value.value
                config[attr] = value

        return config


# Make it compatible with transformers TrainerCallback protocol
# by ensuring it has all required methods (even as no-ops)
def _ensure_trainer_callback_compatibility():
    """Ensure DeepFabricCallback has all TrainerCallback methods."""
    # These methods are optional but good to have for completeness
    # Include all methods that transformers might call on callbacks
    optional_methods = [
        "on_step_begin",
        "on_step_end",
        "on_substep_end",
        "on_epoch_begin",
        "on_epoch_end",
        "on_prediction_step",
        "on_init_end",
        # Newer transformers versions
        "on_pre_optimizer_step",
        "on_optimizer_step",
        "on_post_optimizer_step",
        "on_pre_scheduler_step",
        "on_scheduler_step",
        "on_post_scheduler_step",
    ]

    def _make_noop(name):
        """Create a no-op method that returns control unchanged."""

        def noop(self, args, state, control, **kwargs):  # noqa: ARG001
            return control

        noop.__name__ = name
        return noop

    for method in optional_methods:
        if not hasattr(DeepFabricCallback, method):
            setattr(DeepFabricCallback, method, _make_noop(method))


_ensure_trainer_callback_compatibility()
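For orientation, a minimal, self-contained usage sketch of the callback above, assuming the documented DEEPFABRIC_API_KEY environment variable and the public HuggingFace transformers/datasets APIs; the tiny model, placeholder key, and toy dataset are illustrative choices, not values taken from the package.

import os

from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

from deepfabric.training import DeepFabricCallback

# Placeholder key for illustration; in practice the key comes from your environment.
os.environ.setdefault("DEEPFABRIC_API_KEY", "df-placeholder")

tokenizer = AutoTokenizer.from_pretrained("sshleifer/tiny-gpt2")
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained("sshleifer/tiny-gpt2")

# Toy dataset purely to make the sketch runnable end to end.
texts = ["hello world", "deepfabric logs training metrics"] * 8
train_dataset = Dataset.from_dict({"text": texts}).map(
    lambda ex: tokenizer(ex["text"], truncation=True, max_length=32),
    remove_columns=["text"],
)

args = TrainingArguments(output_dir="./out", num_train_epochs=1, logging_steps=1, report_to=[])
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
trainer.add_callback(DeepFabricCallback(trainer))  # run start, per-step metrics, and run end are sent in the background
trainer.train()

Because the sender is non-blocking, a missing or invalid key only disables or degrades metric delivery; training itself proceeds unaffected.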
deepfabric/training/metrics_sender.py
@@ -0,0 +1,301 @@
"""Non-blocking async metrics sender for training metrics."""

from __future__ import annotations

import atexit
import logging
import queue
import threading
import time

from datetime import datetime, timezone
from typing import Any

import requests

logger = logging.getLogger(__name__)


class MetricsSender:
    """Non-blocking metrics sender with background thread.

    Queues metrics and sends them in batches via a background thread to avoid
    blocking training. Gracefully handles network errors and queue overflow.

    Example:
        sender = MetricsSender(
            endpoint="https://api.deepfabric.ai",
            api_key="your-api-key",
        )
        sender.send_metrics({"loss": 2.5, "step": 100})
        sender.flush()  # Ensure all metrics are sent
    """

    def __init__(
        self,
        endpoint: str,
        api_key: str | None,
        batch_size: int = 10,
        flush_interval: float = 5.0,
        max_queue_size: int = 1000,
        timeout: float = 10.0,
    ):
        """Initialize the metrics sender.

        Args:
            endpoint: Base URL for the DeepFabric API
            api_key: API key for authentication (None disables sending)
            batch_size: Number of metrics to batch before sending
            flush_interval: Seconds between automatic flushes
            max_queue_size: Maximum queue size (overflow drops metrics)
            timeout: HTTP request timeout in seconds
        """
        self.endpoint = endpoint.rstrip("/")
        self.api_key = api_key
        self.batch_size = batch_size
        self.flush_interval = flush_interval
        self.timeout = timeout

        self._queue: queue.Queue[dict[str, Any]] = queue.Queue(maxsize=max_queue_size)
        self._stop_event = threading.Event()
        self._enabled = api_key is not None

        # Start background sender thread
        if self._enabled:
            self._thread = threading.Thread(
                target=self._sender_loop,
                daemon=True,
                name="deepfabric-metrics-sender",
            )
            self._thread.start()
            atexit.register(self.shutdown)
        else:
            self._thread = None

        self._send_errors = 0
        self._metrics_sent = 0
        self._metrics_dropped = 0

    @property
    def enabled(self) -> bool:
        """Whether the sender is enabled (has API key)."""
        return self._enabled

    @property
    def stats(self) -> dict[str, int]:
        """Get sender statistics."""
        return {
            "metrics_sent": self._metrics_sent,
            "metrics_dropped": self._metrics_dropped,
            "send_errors": self._send_errors,
            "queue_size": self._queue.qsize(),
        }

    def send_metrics(self, metrics: dict[str, Any]) -> bool:
        """Queue metrics for async sending (non-blocking).

        Args:
            metrics: Dictionary of metric names to values

        Returns:
            True if queued successfully, False if dropped
        """
        if not self._enabled:
            return False

        # Add timestamp if not present
        if "timestamp" not in metrics:
            metrics["timestamp"] = datetime.now(timezone.utc).isoformat()

        try:
            self._queue.put_nowait({"type": "metrics", "data": metrics})
        except queue.Full:
            self._metrics_dropped += 1
            logger.debug("Metrics queue full, dropping metrics")
            return False
        else:
            return True

    def send_run_start(self, metadata: dict[str, Any]) -> bool:
        """Send run start event.

        Args:
            metadata: Run metadata (model_name, training_args, etc.)

        Returns:
            True if queued successfully
        """
        return self._send_event("run_start", metadata)

    def send_run_end(self, metadata: dict[str, Any]) -> bool:
        """Send run end event.

        Args:
            metadata: Run end metadata (final_step, final_epoch, etc.)

        Returns:
            True if queued successfully
        """
        return self._send_event("run_end", metadata)

    def _send_event(self, event_type: str, data: dict[str, Any]) -> bool:
        """Queue an event for sending.

        Args:
            event_type: Type of event (run_start, run_end, etc.)
            data: Event data

        Returns:
            True if queued successfully
        """
        if not self._enabled:
            return False

        if "timestamp" not in data:
            data["timestamp"] = datetime.now(timezone.utc).isoformat()

        try:
            self._queue.put_nowait({"type": event_type, "data": data})
        except queue.Full:
            logger.debug(f"Queue full, dropping {event_type} event")
            return False
        else:
            return True

    def _sender_loop(self) -> None:
        """Background thread that batches and sends metrics."""
        batch: list[dict[str, Any]] = []
        last_flush = time.monotonic()

        while not self._stop_event.is_set():
            try:
                # Wait for item with timeout
                item = self._queue.get(timeout=min(1.0, self.flush_interval))
                batch.append(item)

                # Check if we should flush
                should_flush = (
                    len(batch) >= self.batch_size
                    or (time.monotonic() - last_flush) >= self.flush_interval
                )

                if should_flush:
                    self._flush_batch(batch)
                    batch = []
                    last_flush = time.monotonic()

            except queue.Empty:
                # Timeout - flush if we have pending items
                if batch and (time.monotonic() - last_flush) >= self.flush_interval:
                    self._flush_batch(batch)
                    batch = []
                    last_flush = time.monotonic()

        # On shutdown, drain the queue and flush everything
        while not self._queue.empty():
            try:
                batch.append(self._queue.get_nowait())
            except queue.Empty:
                break
        if batch:
            self._flush_batch(batch)

    def _flush_batch(self, batch: list[dict[str, Any]]) -> None:
        """Send batch of metrics to API.

        Args:
            batch: List of queued items to send
        """
        if not batch or not self._enabled:
            return

        # Separate events and metrics
        events = [item for item in batch if item["type"] != "metrics"]
        metrics = [item["data"] for item in batch if item["type"] == "metrics"]

        # Send events first (run_start, run_end)
        for event in events:
            self._send_to_api(
                endpoint=f"{self.endpoint}/v1/training/runs",
                payload={"event_type": event["type"], **event["data"]},
            )

        # Send metrics batch
        if metrics:
            self._send_to_api(
                endpoint=f"{self.endpoint}/v1/training/metrics",
                payload={"metrics": metrics},
            )
            self._metrics_sent += len(metrics)

    def _send_to_api(self, endpoint: str, payload: dict[str, Any]) -> bool:
        """Send payload to API endpoint.

        Args:
            endpoint: Full API endpoint URL
            payload: JSON payload to send

        Returns:
            True if sent successfully
        """
        try:
            response = requests.post(
                endpoint,
                json=payload,
                headers={
                    "Authorization": f"Bearer {self.api_key}",
                    "Content-Type": "application/json",
                    "User-Agent": "deepfabric-training/1.0",
                },
                timeout=self.timeout,
            )

            if not response.ok:
                self._send_errors += 1
                logger.debug(f"API request failed: {response.status_code} {response.text[:100]}")
                return False

        except requests.exceptions.Timeout:
            self._send_errors += 1
            logger.debug("API request timed out")
            return False

        except requests.exceptions.ConnectionError:
            self._send_errors += 1
            logger.debug("API connection error")
            return False

        except requests.exceptions.RequestException as e:
            self._send_errors += 1
            logger.debug(f"API request error: {e}")
            return False

        else:
            return True

    def flush(self, timeout: float = 30.0) -> None:
        """Flush all pending metrics (blocking).

        Args:
            timeout: Maximum time to wait for flush
        """
        if not self._enabled:
            return

        start = time.monotonic()
        while not self._queue.empty() and (time.monotonic() - start) < timeout:
            time.sleep(0.1)

    def shutdown(self) -> None:
        """Stop the sender thread and flush remaining metrics."""
        if not self._enabled or self._thread is None:
            return

        self._stop_event.set()
        self._thread.join(timeout=5.0)

        # Log final stats
        logger.debug(
            f"MetricsSender shutdown: sent={self._metrics_sent}, "
            f"dropped={self._metrics_dropped}, errors={self._send_errors}"
        )
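And a standalone lifecycle sketch for MetricsSender as defined above, mirroring the payload shape that DeepFabricCallback builds; the endpoint, key, and metric values are placeholders, and failed HTTP requests are only counted in the sender's statistics rather than raised.

import uuid

from deepfabric.training.metrics_sender import MetricsSender

sender = MetricsSender(
    endpoint="https://api.deepfabric.ai",  # default backend URL used by the callback module
    api_key="df-placeholder",              # placeholder; passing None disables sending entirely
    batch_size=5,
    flush_interval=2.0,
)

run_id = str(uuid.uuid4())
sender.send_run_start({"run_id": run_id, "model_name": "demo-model"})

for step in range(1, 11):
    # Each call only enqueues; the background thread batches and POSTs asynchronously.
    sender.send_metrics(
        {"run_id": run_id, "global_step": step, "type": "log", "metrics": {"loss": 2.0 / step}}
    )

sender.send_run_end({"run_id": run_id, "final_step": 10})
sender.flush(timeout=10.0)  # block until the queue drains or the timeout elapses
print(sender.stats)         # counts of metrics sent, dropped, and send errors
sender.shutdown()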