truss 0.11.13rc2__py3-none-any.whl → 0.11.13rc3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of truss might be problematic. Click here for more details.

@@ -1,6 +1,7 @@
1
1
  import atexit
2
2
  import json
3
3
  import logging
4
+ import os
4
5
  import time
5
6
  from dataclasses import dataclass
6
7
  from functools import lru_cache
@@ -8,6 +9,14 @@ from pathlib import Path
8
9
  from threading import Lock, Thread
9
10
  from typing import Optional, Union
10
11
 
12
+ try:
13
+ from prometheus_client import Counter, Gauge, Histogram
14
+
15
+ PROMETHEUS_AVAILABLE = True
16
+ except ImportError:
17
+ PROMETHEUS_AVAILABLE = False
18
+ METRICS_REGISTERED = False
19
+
11
20
 
12
21
  @dataclass(frozen=True)
13
22
  class FileDownloadMetric:
@@ -61,6 +70,121 @@ class TrussTransferStats:
61
70
  except Exception:
62
71
  return None
63
72
 
73
+ def publish_to_prometheus(self):
74
+ """Publish transfer stats to Prometheus metrics. Only runs once."""
75
+ if not PROMETHEUS_AVAILABLE:
76
+ return
77
+ global METRICS_REGISTERED
78
+ if not METRICS_REGISTERED:
79
+ # Ensure metrics are only registered once
80
+ METRICS_REGISTERED = True
81
+
82
+ # Define metrics with model_cache label
83
+ manifest_size_gauge = Gauge(
84
+ "model_cache_manifest_size_bytes", "Total manifest size in bytes"
85
+ )
86
+ download_time_histogram = Histogram(
87
+ "model_cache_download_time_seconds",
88
+ "Total download time in seconds",
89
+ buckets=[
90
+ 2**i
91
+ for i in range(-3, 11) # = [0.125, .. 1024] seconds
92
+ ],
93
+ )
94
+ download_speed_gauge = Gauge(
95
+ "model_cache_download_speed_mbps", "Aggregated download speed in MB/s"
96
+ )
97
+
98
+ # File download metrics (aggregated)
99
+ files_downloaded_counter = Counter(
100
+ "model_cache_files_downloaded_total", "Total number of files downloaded"
101
+ )
102
+ total_file_size_counter = Counter(
103
+ "model_cache_file_size_bytes_total",
104
+ "Total size of downloaded files in bytes",
105
+ )
106
+ file_download_time_histogram = Histogram(
107
+ "model_cache_file_download_time_seconds",
108
+ "File download time distribution",
109
+ buckets=[
110
+ 2**i
111
+ for i in range(-3, 11) # = [0.125, .. 1024] seconds
112
+ ],
113
+ )
114
+ file_download_speed_histogram = Histogram(
115
+ "model_cache_file_download_speed_mbps",
116
+ "File download speed distribution",
117
+ buckets=[
118
+ 2**i
119
+ for i in range(-1, 12) # = [0.5, .. 2048] MB/s
120
+ ],
121
+ )
122
+
123
+ # B10FS specific metrics
124
+ b10fs_enabled_gauge = Gauge(
125
+ "model_cache_b10fs_enabled", "Whether B10FS is enabled"
126
+ )
127
+ b10fs_decision_gauge = Gauge(
128
+ "model_cache_b10fs_decision_to_use", "Whether B10FS was chosen for use"
129
+ )
130
+ b10fs_read_speed_gauge = Gauge(
131
+ "model_cache_b10fs_read_speed_mbps", "B10FS read speed in Mbps"
132
+ )
133
+ b10fs_hot_files_gauge = Gauge(
134
+ "model_cache_b10fs_hot_starts_files", "Number of hot start files"
135
+ )
136
+ b10fs_hot_bytes_gauge = Gauge(
137
+ "model_cache_b10fs_hot_starts_bytes", "Number of hot start bytes"
138
+ )
139
+ b10fs_cold_files_gauge = Gauge(
140
+ "model_cache_b10fs_cold_starts_files", "Number of cold start files"
141
+ )
142
+ b10fs_cold_bytes_gauge = Gauge(
143
+ "model_cache_b10fs_cold_starts_bytes", "Number of cold start bytes"
144
+ )
145
+
146
+ # Transfer success metric
147
+ transfer_success_counter = Counter(
148
+ "model_cache_transfer_success_total",
149
+ "Total successful transfers",
150
+ ["success"],
151
+ )
152
+
153
+ # Set main transfer metrics
154
+ manifest_size_gauge.set(self.total_manifest_size_bytes)
155
+ download_time_histogram.observe(self.total_download_time_secs)
156
+
157
+ if self.total_aggregated_mb_s is not None:
158
+ download_speed_gauge.set(self.total_aggregated_mb_s)
159
+
160
+ # Aggregate file download metrics
161
+ total_files = len(self.file_downloads)
162
+ total_file_bytes = sum(fd.file_size_bytes for fd in self.file_downloads)
163
+
164
+ files_downloaded_counter.inc(total_files)
165
+ total_file_size_counter.inc(total_file_bytes)
166
+
167
+ # Record individual file metrics for distribution
168
+ for fd in self.file_downloads:
169
+ if fd.file_size_bytes > 1 * 1024 * 1024: # Only log files larger than 1MB
170
+ file_download_time_histogram.observe(fd.download_time_secs)
171
+ file_download_speed_histogram.observe(fd.download_speed_mb_s)
172
+
173
+ # B10FS metrics
174
+ b10fs_enabled_gauge.set(1 if self.b10fs_enabled else 0)
175
+ b10fs_decision_gauge.set(1 if self.b10fs_decision_to_use else 0)
176
+
177
+ if self.b10fs_read_speed_mbps is not None:
178
+ b10fs_read_speed_gauge.set(self.b10fs_read_speed_mbps)
179
+
180
+ b10fs_hot_files_gauge.set(self.b10fs_hot_starts_files)
181
+ b10fs_hot_bytes_gauge.set(self.b10fs_hot_starts_bytes)
182
+ b10fs_cold_files_gauge.set(self.b10fs_cold_starts_files)
183
+ b10fs_cold_bytes_gauge.set(self.b10fs_cold_starts_bytes)
184
+
185
+ # Success metric
186
+ transfer_success_counter.labels(success=str(self.success)).inc()
187
+
64
188
 
65
189
  LAZY_DATA_RESOLVER_PATH = [
66
190
  # synced with pub static LAZY_DATA_RESOLVER_PATHS: &[&str]
@@ -185,6 +309,9 @@ class LazyDataResolverV2:
185
309
 
186
310
  """
187
311
  start_lock = time.time()
312
+ publish_stats = (
313
+ log_stats and not self._is_collected_by_user
314
+ ) # only publish results once per resolver
188
315
  self._is_collected_by_user = issue_collect or self._is_collected_by_user
189
316
  with self._lock:
190
317
  result = self._fetch()
@@ -196,13 +323,20 @@ class LazyDataResolverV2:
196
323
  # TODO: instrument the stats, which are written to /tmp/truss_transfer_stats.json
197
324
  # also add fetch time, and blocking time
198
325
  # TrussTransferStats
326
+ fetch_t = time.time() - self._start_time
327
+ start_lock_t = time.time() - start_lock
199
328
  stats = TrussTransferStats.from_json_file(
200
329
  Path("/tmp/truss_transfer_stats.json")
201
330
  )
202
- if stats is None:
331
+ if stats and publish_stats:
203
332
  self.logger.info(f"model_cache: {stats}")
333
+ # Publish stats to Prometheus
334
+ if (
335
+ os.getenv("TRUSS_MODEL_CACHE_PROMETHEUS", "0") == "1"
336
+ ): # Hide behind feature flag for core-product to enable.
337
+ stats.publish_to_prometheus()
204
338
  self.logger.info(
205
- f"model_cache: Fetch took {time.time() - self._start_time:.2f} seconds, of which {time.time() - start_lock:.2f} seconds were spent blocking."
339
+ f"model_cache: Fetch took {fetch_t:.2f} seconds, of which {start_lock_t:.2f} seconds were spent blocking."
206
340
  )
207
341
  return result
208
342
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: truss
3
- Version: 0.11.13rc2
3
+ Version: 0.11.13rc3
4
4
  Summary: A seamless bridge from model development to model delivery
5
5
  Project-URL: Repository, https://github.com/basetenlabs/truss
6
6
  Project-URL: Homepage, https://truss.baseten.co
@@ -107,7 +107,7 @@ truss/templates/server/common/tracing.py,sha256=XSTXNoRtV8vXwveJoX3H32go0JKnLmzn
107
107
  truss/templates/server/common/patches/whisper/patch.py,sha256=kDECQ-wmEpeAZFhUTQP457ofueeMsm7DgNy9tqinhJQ,2383
108
108
  truss/templates/shared/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
109
109
  truss/templates/shared/dynamic_config_resolver.py,sha256=75s42NFhQI5jL7BqlJH_UkuQS7ptbtFh13f2nh6X5Wo,920
110
- truss/templates/shared/lazy_data_resolver.py,sha256=eOq7Fgr9QkAWpsxyDWkeZic1Z2S4Mt-drB1A7zNEkYE,8368
110
+ truss/templates/shared/lazy_data_resolver.py,sha256=czfggu9DZ_qDnE2MxOdE2R8aZyJe2G1Cd-PL0AUGx-I,13561
111
111
  truss/templates/shared/log_config.py,sha256=l9udyu4VKHZePlfK9LQEd5TOUUodPuehypsXRSUL4Ac,5411
112
112
  truss/templates/shared/secrets_resolver.py,sha256=3prDe3Q06NTmUEe7KCW-W4TD1CzGck9lpDG789209z4,2110
113
113
  truss/templates/shared/serialization.py,sha256=_WC_2PPkRi-MdTwxwjG8LKQptnHi4sANfpOlKWevqWc,3736
@@ -370,8 +370,8 @@ truss_train/deployment.py,sha256=lWWANSuzBWu2M4oK4qD7n-oVR1JKdmw2Pn5BJQHg-Ck,307
370
370
  truss_train/loader.py,sha256=0o66EjBaHc2YY4syxxHVR4ordJWs13lNXnKjKq2wq0U,1630
371
371
  truss_train/public_api.py,sha256=9N_NstiUlmBuLUwH_fNG_1x7OhGCytZLNvqKXBlStrM,1220
372
372
  truss_train/restore_from_checkpoint.py,sha256=8hdPm-WSgkt74HDPjvCjZMBpvA9MwtoYsxVjOoa7BaM,1176
373
- truss-0.11.13rc2.dist-info/METADATA,sha256=GCadCR-s-rOXbH__XXvO5wFJEp-sPcFjpRC1tlcRBwo,6681
374
- truss-0.11.13rc2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
375
- truss-0.11.13rc2.dist-info/entry_points.txt,sha256=-MwKfHHQHQ6j0HqIgvxrz3CehCmczDLTD-OsRHnjjuU,130
376
- truss-0.11.13rc2.dist-info/licenses/LICENSE,sha256=FTqGzu85i-uw1Gi8E_o0oD60bH9yQ_XIGtZbA1QUYiw,1064
377
- truss-0.11.13rc2.dist-info/RECORD,,
373
+ truss-0.11.13rc3.dist-info/METADATA,sha256=cPpD-bEoveXxM_dTFQAJPui0DFO940QYrTTWlO7aivc,6681
374
+ truss-0.11.13rc3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
375
+ truss-0.11.13rc3.dist-info/entry_points.txt,sha256=-MwKfHHQHQ6j0HqIgvxrz3CehCmczDLTD-OsRHnjjuU,130
376
+ truss-0.11.13rc3.dist-info/licenses/LICENSE,sha256=FTqGzu85i-uw1Gi8E_o0oD60bH9yQ_XIGtZbA1QUYiw,1064
377
+ truss-0.11.13rc3.dist-info/RECORD,,