truss 0.11.13rc2__py3-none-any.whl → 0.11.13rc3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of truss might be problematic. Click here for more details.
- truss/templates/shared/lazy_data_resolver.py +136 -2
- {truss-0.11.13rc2.dist-info → truss-0.11.13rc3.dist-info}/METADATA +1 -1
- {truss-0.11.13rc2.dist-info → truss-0.11.13rc3.dist-info}/RECORD +6 -6
- {truss-0.11.13rc2.dist-info → truss-0.11.13rc3.dist-info}/WHEEL +0 -0
- {truss-0.11.13rc2.dist-info → truss-0.11.13rc3.dist-info}/entry_points.txt +0 -0
- {truss-0.11.13rc2.dist-info → truss-0.11.13rc3.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import atexit
|
|
2
2
|
import json
|
|
3
3
|
import logging
|
|
4
|
+
import os
|
|
4
5
|
import time
|
|
5
6
|
from dataclasses import dataclass
|
|
6
7
|
from functools import lru_cache
|
|
@@ -8,6 +9,14 @@ from pathlib import Path
|
|
|
8
9
|
from threading import Lock, Thread
|
|
9
10
|
from typing import Optional, Union
|
|
10
11
|
|
|
12
|
+
try:
|
|
13
|
+
from prometheus_client import Counter, Gauge, Histogram
|
|
14
|
+
|
|
15
|
+
PROMETHEUS_AVAILABLE = True
|
|
16
|
+
except ImportError:
|
|
17
|
+
PROMETHEUS_AVAILABLE = False
|
|
18
|
+
METRICS_REGISTERED = False
|
|
19
|
+
|
|
11
20
|
|
|
12
21
|
@dataclass(frozen=True)
|
|
13
22
|
class FileDownloadMetric:
|
|
@@ -61,6 +70,121 @@ class TrussTransferStats:
|
|
|
61
70
|
except Exception:
|
|
62
71
|
return None
|
|
63
72
|
|
|
73
|
+
def publish_to_prometheus(self):
|
|
74
|
+
"""Publish transfer stats to Prometheus metrics. Only runs once."""
|
|
75
|
+
if not PROMETHEUS_AVAILABLE:
|
|
76
|
+
return
|
|
77
|
+
global METRICS_REGISTERED
|
|
78
|
+
if not METRICS_REGISTERED:
|
|
79
|
+
# Ensure metrics are only registered once
|
|
80
|
+
METRICS_REGISTERED = True
|
|
81
|
+
|
|
82
|
+
# Define metrics with model_cache label
|
|
83
|
+
manifest_size_gauge = Gauge(
|
|
84
|
+
"model_cache_manifest_size_bytes", "Total manifest size in bytes"
|
|
85
|
+
)
|
|
86
|
+
download_time_histogram = Histogram(
|
|
87
|
+
"model_cache_download_time_seconds",
|
|
88
|
+
"Total download time in seconds",
|
|
89
|
+
buckets=[
|
|
90
|
+
2**i
|
|
91
|
+
for i in range(-3, 11) # = [0.125, .. 2048] seconds
|
|
92
|
+
],
|
|
93
|
+
)
|
|
94
|
+
download_speed_gauge = Gauge(
|
|
95
|
+
"model_cache_download_speed_mbps", "Aggregated download speed in MB/s"
|
|
96
|
+
)
|
|
97
|
+
|
|
98
|
+
# File download metrics (aggregated)
|
|
99
|
+
files_downloaded_counter = Counter(
|
|
100
|
+
"model_cache_files_downloaded_total", "Total number of files downloaded"
|
|
101
|
+
)
|
|
102
|
+
total_file_size_counter = Counter(
|
|
103
|
+
"model_cache_file_size_bytes_total",
|
|
104
|
+
"Total size of downloaded files in bytes",
|
|
105
|
+
)
|
|
106
|
+
file_download_time_histogram = Histogram(
|
|
107
|
+
"model_cache_file_download_time_seconds",
|
|
108
|
+
"File download time distribution",
|
|
109
|
+
buckets=[
|
|
110
|
+
2**i
|
|
111
|
+
for i in range(-3, 11) # = [0.125, .. 2048] seconds
|
|
112
|
+
],
|
|
113
|
+
)
|
|
114
|
+
file_download_speed_histogram = Histogram(
|
|
115
|
+
"model_cache_file_download_speed_mbps",
|
|
116
|
+
"File download speed distribution",
|
|
117
|
+
buckets=[
|
|
118
|
+
2**i
|
|
119
|
+
for i in range(-1, 12) # = [0.5, .. 4096] MB/s
|
|
120
|
+
],
|
|
121
|
+
)
|
|
122
|
+
|
|
123
|
+
# B10FS specific metrics
|
|
124
|
+
b10fs_enabled_gauge = Gauge(
|
|
125
|
+
"model_cache_b10fs_enabled", "Whether B10FS is enabled"
|
|
126
|
+
)
|
|
127
|
+
b10fs_decision_gauge = Gauge(
|
|
128
|
+
"model_cache_b10fs_decision_to_use", "Whether B10FS was chosen for use"
|
|
129
|
+
)
|
|
130
|
+
b10fs_read_speed_gauge = Gauge(
|
|
131
|
+
"model_cache_b10fs_read_speed_mbps", "B10FS read speed in Mbps"
|
|
132
|
+
)
|
|
133
|
+
b10fs_hot_files_gauge = Gauge(
|
|
134
|
+
"model_cache_b10fs_hot_starts_files", "Number of hot start files"
|
|
135
|
+
)
|
|
136
|
+
b10fs_hot_bytes_gauge = Gauge(
|
|
137
|
+
"model_cache_b10fs_hot_starts_bytes", "Number of hot start bytes"
|
|
138
|
+
)
|
|
139
|
+
b10fs_cold_files_gauge = Gauge(
|
|
140
|
+
"model_cache_b10fs_cold_starts_files", "Number of cold start files"
|
|
141
|
+
)
|
|
142
|
+
b10fs_cold_bytes_gauge = Gauge(
|
|
143
|
+
"model_cache_b10fs_cold_starts_bytes", "Number of cold start bytes"
|
|
144
|
+
)
|
|
145
|
+
|
|
146
|
+
# Transfer success metric
|
|
147
|
+
transfer_success_counter = Counter(
|
|
148
|
+
"model_cache_transfer_success_total",
|
|
149
|
+
"Total successful transfers",
|
|
150
|
+
["success"],
|
|
151
|
+
)
|
|
152
|
+
|
|
153
|
+
# Set main transfer metrics
|
|
154
|
+
manifest_size_gauge.set(self.total_manifest_size_bytes)
|
|
155
|
+
download_time_histogram.observe(self.total_download_time_secs)
|
|
156
|
+
|
|
157
|
+
if self.total_aggregated_mb_s is not None:
|
|
158
|
+
download_speed_gauge.set(self.total_aggregated_mb_s)
|
|
159
|
+
|
|
160
|
+
# Aggregate file download metrics
|
|
161
|
+
total_files = len(self.file_downloads)
|
|
162
|
+
total_file_bytes = sum(fd.file_size_bytes for fd in self.file_downloads)
|
|
163
|
+
|
|
164
|
+
files_downloaded_counter.inc(total_files)
|
|
165
|
+
total_file_size_counter.inc(total_file_bytes)
|
|
166
|
+
|
|
167
|
+
# Record individual file metrics for distribution
|
|
168
|
+
for fd in self.file_downloads:
|
|
169
|
+
if fd.file_size_bytes > 1 * 1024 * 1024: # Only log files larger than 1MB
|
|
170
|
+
file_download_time_histogram.observe(fd.download_time_secs)
|
|
171
|
+
file_download_speed_histogram.observe(fd.download_speed_mb_s)
|
|
172
|
+
|
|
173
|
+
# B10FS metrics
|
|
174
|
+
b10fs_enabled_gauge.set(1 if self.b10fs_enabled else 0)
|
|
175
|
+
b10fs_decision_gauge.set(1 if self.b10fs_decision_to_use else 0)
|
|
176
|
+
|
|
177
|
+
if self.b10fs_read_speed_mbps is not None:
|
|
178
|
+
b10fs_read_speed_gauge.set(self.b10fs_read_speed_mbps)
|
|
179
|
+
|
|
180
|
+
b10fs_hot_files_gauge.set(self.b10fs_hot_starts_files)
|
|
181
|
+
b10fs_hot_bytes_gauge.set(self.b10fs_hot_starts_bytes)
|
|
182
|
+
b10fs_cold_files_gauge.set(self.b10fs_cold_starts_files)
|
|
183
|
+
b10fs_cold_bytes_gauge.set(self.b10fs_cold_starts_bytes)
|
|
184
|
+
|
|
185
|
+
# Success metric
|
|
186
|
+
transfer_success_counter.labels(success=str(self.success)).inc()
|
|
187
|
+
|
|
64
188
|
|
|
65
189
|
LAZY_DATA_RESOLVER_PATH = [
|
|
66
190
|
# synced with pub static LAZY_DATA_RESOLVER_PATHS: &[&str]
|
|
@@ -185,6 +309,9 @@ class LazyDataResolverV2:
|
|
|
185
309
|
|
|
186
310
|
"""
|
|
187
311
|
start_lock = time.time()
|
|
312
|
+
publish_stats = (
|
|
313
|
+
log_stats and not self._is_collected_by_user
|
|
314
|
+
) # only publish results once per resolver
|
|
188
315
|
self._is_collected_by_user = issue_collect or self._is_collected_by_user
|
|
189
316
|
with self._lock:
|
|
190
317
|
result = self._fetch()
|
|
@@ -196,13 +323,20 @@ class LazyDataResolverV2:
|
|
|
196
323
|
# TODO: instrument the stats, which are written to /tmp/truss_transfer_stats.json
|
|
197
324
|
# also add fetch time, and blocking time
|
|
198
325
|
# TrussTransferStats
|
|
326
|
+
fetch_t = time.time() - self._start_time
|
|
327
|
+
start_lock_t = time.time() - start_lock
|
|
199
328
|
stats = TrussTransferStats.from_json_file(
|
|
200
329
|
Path("/tmp/truss_transfer_stats.json")
|
|
201
330
|
)
|
|
202
|
-
if stats:
|
|
331
|
+
if stats and publish_stats:
|
|
203
332
|
self.logger.info(f"model_cache: {stats}")
|
|
333
|
+
# Publish stats to Prometheus
|
|
334
|
+
if (
|
|
335
|
+
os.getenv("TRUSS_MODEL_CACHE_PROMETHEUS", "0") == "1"
|
|
336
|
+
): # Hide behind feature flag for core-product to enable.
|
|
337
|
+
stats.publish_to_prometheus()
|
|
204
338
|
self.logger.info(
|
|
205
|
-
f"model_cache: Fetch took {
|
|
339
|
+
f"model_cache: Fetch took {fetch_t:.2f} seconds, of which {start_lock_t:.2f} seconds were spent blocking."
|
|
206
340
|
)
|
|
207
341
|
return result
|
|
208
342
|
|
|
@@ -107,7 +107,7 @@ truss/templates/server/common/tracing.py,sha256=XSTXNoRtV8vXwveJoX3H32go0JKnLmzn
|
|
|
107
107
|
truss/templates/server/common/patches/whisper/patch.py,sha256=kDECQ-wmEpeAZFhUTQP457ofueeMsm7DgNy9tqinhJQ,2383
|
|
108
108
|
truss/templates/shared/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
109
109
|
truss/templates/shared/dynamic_config_resolver.py,sha256=75s42NFhQI5jL7BqlJH_UkuQS7ptbtFh13f2nh6X5Wo,920
|
|
110
|
-
truss/templates/shared/lazy_data_resolver.py,sha256=
|
|
110
|
+
truss/templates/shared/lazy_data_resolver.py,sha256=czfggu9DZ_qDnE2MxOdE2R8aZyJe2G1Cd-PL0AUGx-I,13561
|
|
111
111
|
truss/templates/shared/log_config.py,sha256=l9udyu4VKHZePlfK9LQEd5TOUUodPuehypsXRSUL4Ac,5411
|
|
112
112
|
truss/templates/shared/secrets_resolver.py,sha256=3prDe3Q06NTmUEe7KCW-W4TD1CzGck9lpDG789209z4,2110
|
|
113
113
|
truss/templates/shared/serialization.py,sha256=_WC_2PPkRi-MdTwxwjG8LKQptnHi4sANfpOlKWevqWc,3736
|
|
@@ -370,8 +370,8 @@ truss_train/deployment.py,sha256=lWWANSuzBWu2M4oK4qD7n-oVR1JKdmw2Pn5BJQHg-Ck,307
|
|
|
370
370
|
truss_train/loader.py,sha256=0o66EjBaHc2YY4syxxHVR4ordJWs13lNXnKjKq2wq0U,1630
|
|
371
371
|
truss_train/public_api.py,sha256=9N_NstiUlmBuLUwH_fNG_1x7OhGCytZLNvqKXBlStrM,1220
|
|
372
372
|
truss_train/restore_from_checkpoint.py,sha256=8hdPm-WSgkt74HDPjvCjZMBpvA9MwtoYsxVjOoa7BaM,1176
|
|
373
|
-
truss-0.11.
|
|
374
|
-
truss-0.11.
|
|
375
|
-
truss-0.11.
|
|
376
|
-
truss-0.11.
|
|
377
|
-
truss-0.11.
|
|
373
|
+
truss-0.11.13rc3.dist-info/METADATA,sha256=cPpD-bEoveXxM_dTFQAJPui0DFO940QYrTTWlO7aivc,6681
|
|
374
|
+
truss-0.11.13rc3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
375
|
+
truss-0.11.13rc3.dist-info/entry_points.txt,sha256=-MwKfHHQHQ6j0HqIgvxrz3CehCmczDLTD-OsRHnjjuU,130
|
|
376
|
+
truss-0.11.13rc3.dist-info/licenses/LICENSE,sha256=FTqGzu85i-uw1Gi8E_o0oD60bH9yQ_XIGtZbA1QUYiw,1064
|
|
377
|
+
truss-0.11.13rc3.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|