mlrun-1.7.0rc28-py3-none-any.whl → mlrun-1.7.0rc55-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of mlrun has been flagged as potentially problematic; see the release details for more information.
- mlrun/__main__.py +4 -2
- mlrun/alerts/alert.py +75 -8
- mlrun/artifacts/base.py +1 -0
- mlrun/artifacts/manager.py +9 -2
- mlrun/common/constants.py +4 -1
- mlrun/common/db/sql_session.py +3 -2
- mlrun/common/formatters/__init__.py +1 -0
- mlrun/common/formatters/artifact.py +1 -0
- mlrun/{model_monitoring/application.py → common/formatters/feature_set.py} +20 -6
- mlrun/common/formatters/run.py +3 -0
- mlrun/common/helpers.py +0 -1
- mlrun/common/schemas/__init__.py +3 -1
- mlrun/common/schemas/alert.py +15 -12
- mlrun/common/schemas/api_gateway.py +6 -6
- mlrun/common/schemas/auth.py +5 -0
- mlrun/common/schemas/client_spec.py +0 -1
- mlrun/common/schemas/common.py +7 -4
- mlrun/common/schemas/frontend_spec.py +7 -0
- mlrun/common/schemas/function.py +7 -0
- mlrun/common/schemas/model_monitoring/__init__.py +4 -3
- mlrun/common/schemas/model_monitoring/constants.py +41 -26
- mlrun/common/schemas/model_monitoring/model_endpoints.py +23 -47
- mlrun/common/schemas/notification.py +69 -12
- mlrun/common/schemas/project.py +45 -12
- mlrun/common/schemas/workflow.py +10 -2
- mlrun/common/types.py +1 -0
- mlrun/config.py +91 -35
- mlrun/data_types/data_types.py +6 -1
- mlrun/data_types/spark.py +2 -2
- mlrun/data_types/to_pandas.py +57 -25
- mlrun/datastore/__init__.py +1 -0
- mlrun/datastore/alibaba_oss.py +3 -2
- mlrun/datastore/azure_blob.py +125 -37
- mlrun/datastore/base.py +42 -21
- mlrun/datastore/datastore.py +4 -2
- mlrun/datastore/datastore_profile.py +1 -1
- mlrun/datastore/dbfs_store.py +3 -7
- mlrun/datastore/filestore.py +1 -3
- mlrun/datastore/google_cloud_storage.py +85 -29
- mlrun/datastore/inmem.py +4 -1
- mlrun/datastore/redis.py +1 -0
- mlrun/datastore/s3.py +25 -12
- mlrun/datastore/sources.py +76 -4
- mlrun/datastore/spark_utils.py +30 -0
- mlrun/datastore/storeytargets.py +151 -0
- mlrun/datastore/targets.py +102 -131
- mlrun/datastore/v3io.py +1 -0
- mlrun/db/base.py +15 -6
- mlrun/db/httpdb.py +57 -28
- mlrun/db/nopdb.py +29 -5
- mlrun/errors.py +20 -3
- mlrun/execution.py +46 -5
- mlrun/feature_store/api.py +25 -1
- mlrun/feature_store/common.py +6 -11
- mlrun/feature_store/feature_vector.py +3 -1
- mlrun/feature_store/retrieval/job.py +4 -1
- mlrun/feature_store/retrieval/spark_merger.py +10 -39
- mlrun/feature_store/steps.py +8 -0
- mlrun/frameworks/_common/plan.py +3 -3
- mlrun/frameworks/_ml_common/plan.py +1 -1
- mlrun/frameworks/parallel_coordinates.py +2 -3
- mlrun/frameworks/sklearn/mlrun_interface.py +13 -3
- mlrun/k8s_utils.py +48 -2
- mlrun/launcher/client.py +6 -6
- mlrun/launcher/local.py +2 -2
- mlrun/model.py +215 -34
- mlrun/model_monitoring/api.py +38 -24
- mlrun/model_monitoring/applications/__init__.py +1 -2
- mlrun/model_monitoring/applications/_application_steps.py +60 -29
- mlrun/model_monitoring/applications/base.py +2 -174
- mlrun/model_monitoring/applications/context.py +197 -70
- mlrun/model_monitoring/applications/evidently_base.py +11 -85
- mlrun/model_monitoring/applications/histogram_data_drift.py +21 -16
- mlrun/model_monitoring/applications/results.py +4 -4
- mlrun/model_monitoring/controller.py +110 -282
- mlrun/model_monitoring/db/stores/__init__.py +8 -3
- mlrun/model_monitoring/db/stores/base/store.py +3 -0
- mlrun/model_monitoring/db/stores/sqldb/models/base.py +9 -7
- mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +18 -3
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +43 -23
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +48 -35
- mlrun/model_monitoring/db/tsdb/__init__.py +7 -2
- mlrun/model_monitoring/db/tsdb/base.py +147 -15
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +94 -55
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +0 -3
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +144 -38
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +44 -3
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +246 -57
- mlrun/model_monitoring/helpers.py +70 -50
- mlrun/model_monitoring/stream_processing.py +96 -195
- mlrun/model_monitoring/writer.py +13 -5
- mlrun/package/packagers/default_packager.py +2 -2
- mlrun/projects/operations.py +16 -8
- mlrun/projects/pipelines.py +126 -115
- mlrun/projects/project.py +286 -129
- mlrun/render.py +3 -3
- mlrun/run.py +38 -19
- mlrun/runtimes/__init__.py +19 -8
- mlrun/runtimes/base.py +4 -1
- mlrun/runtimes/daskjob.py +1 -1
- mlrun/runtimes/funcdoc.py +1 -1
- mlrun/runtimes/kubejob.py +6 -6
- mlrun/runtimes/local.py +12 -5
- mlrun/runtimes/nuclio/api_gateway.py +68 -8
- mlrun/runtimes/nuclio/application/application.py +307 -70
- mlrun/runtimes/nuclio/function.py +63 -14
- mlrun/runtimes/nuclio/serving.py +10 -10
- mlrun/runtimes/pod.py +25 -19
- mlrun/runtimes/remotesparkjob.py +2 -5
- mlrun/runtimes/sparkjob/spark3job.py +16 -17
- mlrun/runtimes/utils.py +34 -0
- mlrun/serving/routers.py +2 -5
- mlrun/serving/server.py +37 -19
- mlrun/serving/states.py +30 -3
- mlrun/serving/v2_serving.py +44 -35
- mlrun/track/trackers/mlflow_tracker.py +5 -0
- mlrun/utils/async_http.py +1 -1
- mlrun/utils/db.py +18 -0
- mlrun/utils/helpers.py +150 -36
- mlrun/utils/http.py +1 -1
- mlrun/utils/notifications/notification/__init__.py +0 -1
- mlrun/utils/notifications/notification/webhook.py +8 -1
- mlrun/utils/notifications/notification_pusher.py +1 -1
- mlrun/utils/v3io_clients.py +2 -2
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/METADATA +153 -66
- {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/RECORD +131 -134
- {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/WHEEL +1 -1
- mlrun/feature_store/retrieval/conversion.py +0 -271
- mlrun/model_monitoring/controller_handler.py +0 -37
- mlrun/model_monitoring/evidently_application.py +0 -20
- mlrun/model_monitoring/prometheus.py +0 -216
- {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/LICENSE +0 -0
- {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/top_level.txt +0 -0
mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py

@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from datetime import datetime
+from datetime import datetime, timezone
 from io import StringIO
 from typing import Literal, Optional, Union
 
@@ -24,6 +24,7 @@ import mlrun.common.model_monitoring
 import mlrun.common.schemas.model_monitoring as mm_schemas
 import mlrun.feature_store.steps
 import mlrun.utils.v3io_clients
+from mlrun.common.schemas import EventFieldType
 from mlrun.model_monitoring.db import TSDBConnector
 from mlrun.model_monitoring.helpers import get_invocations_fqn
 from mlrun.utils import logger
@@ -33,7 +34,7 @@ _TSDB_RATE = "1/s"
 _CONTAINER = "users"
 
 
-def _is_no_schema_error(exc: v3io_frames.
+def _is_no_schema_error(exc: v3io_frames.Error) -> bool:
     """
     In case of a nonexistent TSDB table - a `v3io_frames.ReadError` error is raised.
     Check if the error message contains the relevant string to verify the cause.
@@ -64,14 +65,17 @@ class V3IOTSDBConnector(TSDBConnector):
         self.container = container
 
         self.v3io_framesd = v3io_framesd or mlrun.mlconf.v3io_framesd
-        self._frames_client: v3io_frames.client.ClientBase =
-            self._get_v3io_frames_client(self.container)
-        )
-
+        self._frames_client: Optional[v3io_frames.client.ClientBase] = None
         self._init_tables_path()
+        self._create_table = create_table
 
-
-
+    @property
+    def frames_client(self) -> v3io_frames.client.ClientBase:
+        if not self._frames_client:
+            self._frames_client = self._get_v3io_frames_client(self.container)
+            if self._create_table:
+                self.create_tables()
+        return self._frames_client
 
     def _init_tables_path(self):
         self.tables = {}
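
The constructor change above makes the frames client lazy: nothing connects to v3io-frames until `frames_client` is first read, and table creation is deferred to that same moment. A minimal, generic sketch of this lazy-initialization pattern (the class and helper names below are illustrative, not mlrun APIs):

from typing import Optional


class LazyClientHolder:
    """Illustrative only: defer building an expensive client until first use."""

    def __init__(self, container: str, create_table: bool = False) -> None:
        self._container = container
        self._create_table = create_table
        self._client: Optional[dict] = None  # nothing is connected yet

    @property
    def client(self) -> dict:
        # Build the client (and optionally its tables) only on first access.
        if self._client is None:
            self._client = self._connect(self._container)
            if self._create_table:
                self._create_tables()
        return self._client

    def _connect(self, container: str) -> dict:
        return {"container": container}  # stand-in for a real frames client

    def _create_tables(self) -> None:
        pass  # stand-in for V3IOTSDBConnector.create_tables()


holder = LazyClientHolder("users", create_table=True)
print(holder.client)  # the "connection" happens here, not in __init__

This keeps cheap operations, such as resolving table paths, from paying the connection cost.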
@@ -89,6 +93,19 @@ class V3IOTSDBConnector(TSDBConnector):
         )
         self.tables[mm_schemas.V3IOTSDBTables.EVENTS] = events_path
 
+        errors_table_full_path = mlrun.mlconf.get_model_monitoring_file_target_path(
+            project=self.project,
+            kind=mm_schemas.FileTargetKind.ERRORS,
+        )
+        (
+            _,
+            _,
+            errors_path,
+        ) = mlrun.common.model_monitoring.helpers.parse_model_endpoint_store_prefix(
+            errors_table_full_path
+        )
+        self.tables[mm_schemas.V3IOTSDBTables.ERRORS] = errors_path
+
         monitoring_application_full_path = (
             mlrun.mlconf.get_model_monitoring_file_target_path(
                 project=self.project,
@@ -138,7 +155,7 @@ class V3IOTSDBConnector(TSDBConnector):
         for table_name in application_tables:
             logger.info("Creating table in V3IO TSDB", table_name=table_name)
             table = self.tables[table_name]
-            self.
+            self.frames_client.create(
                 backend=_TSDB_BE,
                 table=table,
                 if_exists=v3io_frames.IGNORE,
@@ -148,8 +165,9 @@ class V3IOTSDBConnector(TSDBConnector):
     def apply_monitoring_stream_steps(
         self,
         graph,
-        tsdb_batching_max_events: int =
-        tsdb_batching_timeout_secs: int =
+        tsdb_batching_max_events: int = 1000,
+        tsdb_batching_timeout_secs: int = 30,
+        sample_window: int = 10,
     ):
         """
         Apply TSDB steps on the provided monitoring graph. Throughout these steps, the graph stores live data of
@@ -171,7 +189,10 @@ class V3IOTSDBConnector(TSDBConnector):
             time_col=mm_schemas.EventFieldType.TIMESTAMP,
             container=self.container,
             v3io_frames=self.v3io_framesd,
-            columns=[
+            columns=[
+                mm_schemas.EventFieldType.LATENCY,
+                mm_schemas.EventFieldType.LAST_REQUEST_TIMESTAMP,
+            ],
             index_cols=[
                 mm_schemas.EventFieldType.ENDPOINT_ID,
             ],
@@ -182,17 +203,23 @@ class V3IOTSDBConnector(TSDBConnector):
             key=mm_schemas.EventFieldType.ENDPOINT_ID,
         )
 
+        # Emits the event in window size of events based on sample_window size (10 by default)
+        graph.add_step(
+            "storey.steps.SampleWindow",
+            name="sample",
+            after="Rename",
+            window_size=sample_window,
+            key=EventFieldType.ENDPOINT_ID,
+        )
+
         # Before writing data to TSDB, create dictionary of 2-3 dictionaries that contains
         # stats and details about the events
 
-
-
-
-
-
-        )
-
-        apply_process_before_tsdb()
+        graph.add_step(
+            "mlrun.model_monitoring.db.tsdb.v3io.stream_graph_steps.ProcessBeforeTSDB",
+            name="ProcessBeforeTSDB",
+            after="sample",
+        )
 
         # Unpacked keys from each dictionary and write to TSDB target
         def apply_filter_and_unpacked_keys(name, keys):
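
The new `SampleWindow` step above forwards only a fraction of the events per endpoint before they reach the TSDB targets, bounding the write rate. A rough, self-contained sketch of per-key down-sampling in plain Python (this is not the storey implementation, and which event of each window storey emits is not shown in this diff):

from collections import defaultdict
from typing import Iterable, Iterator


def sample_per_key(events: Iterable[dict], window_size: int = 10, key: str = "endpoint_id") -> Iterator[dict]:
    """Yield one event out of every `window_size` events, counted separately per key."""
    counters = defaultdict(int)
    for event in events:
        counters[event[key]] += 1
        # Keep the first event of each window; drop the rest.
        if counters[event[key]] % window_size == 1:
            yield event


events = [{"endpoint_id": "ep-1", "latency": i} for i in range(25)]
print(len(list(sample_per_key(events))))  # 3 of the 25 events are forwarded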
@@ -255,6 +282,40 @@ class V3IOTSDBConnector(TSDBConnector):
         apply_storey_filter()
         apply_tsdb_target(name="tsdb3", after="FilterNotNone")
 
+    def handle_model_error(
+        self,
+        graph,
+        tsdb_batching_max_events: int = 1000,
+        tsdb_batching_timeout_secs: int = 30,
+        **kwargs,
+    ) -> None:
+        graph.add_step(
+            "mlrun.model_monitoring.db.tsdb.v3io.stream_graph_steps.ErrorExtractor",
+            name="error_extractor",
+            after="ForwardError",
+        )
+
+        graph.add_step(
+            "storey.TSDBTarget",
+            name="tsdb_error",
+            after="error_extractor",
+            path=f"{self.container}/{self.tables[mm_schemas.FileTargetKind.ERRORS]}",
+            rate="1/s",
+            time_col=mm_schemas.EventFieldType.TIMESTAMP,
+            container=self.container,
+            v3io_frames=self.v3io_framesd,
+            columns=[
+                mm_schemas.EventFieldType.MODEL_ERROR,
+                mm_schemas.EventFieldType.ERROR_COUNT,
+            ],
+            index_cols=[
+                mm_schemas.EventFieldType.ENDPOINT_ID,
+            ],
+            max_events=tsdb_batching_max_events,
+            flush_after_seconds=tsdb_batching_timeout_secs,
+            key=mm_schemas.EventFieldType.ENDPOINT_ID,
+        )
+
     def write_application_event(
         self,
         event: dict,
@@ -277,12 +338,14 @@ class V3IOTSDBConnector(TSDBConnector):
         elif kind == mm_schemas.WriterEventKind.RESULT:
             table = self.tables[mm_schemas.V3IOTSDBTables.APP_RESULTS]
             index_cols = index_cols_base + [mm_schemas.ResultData.RESULT_NAME]
-
+            event.pop(mm_schemas.ResultData.CURRENT_STATS, None)
+            # TODO: remove this when extra data is supported (ML-7460)
+            event.pop(mm_schemas.ResultData.RESULT_EXTRA_DATA, None)
         else:
             raise ValueError(f"Invalid {kind = }")
 
         try:
-            self.
+            self.frames_client.write(
                 backend=_TSDB_BE,
                 table=table,
                 dfs=pd.DataFrame.from_records([event]),
@@ -309,7 +372,7 @@ class V3IOTSDBConnector(TSDBConnector):
         tables = mm_schemas.V3IOTSDBTables.list()
         for table_to_delete in tables:
             try:
-                self.
+                self.frames_client.delete(backend=_TSDB_BE, table=table_to_delete)
             except v3io_frames.DeleteError as e:
                 logger.warning(
                     f"Failed to delete TSDB table '{table}'",
@@ -425,7 +488,7 @@ class V3IOTSDBConnector(TSDBConnector):
         aggregators = ",".join(agg_funcs) if agg_funcs else None
         table_path = self.tables[table]
         try:
-            df = self.
+            df = self.frames_client.read(
                 backend=_TSDB_BE,
                 table=table_path,
                 start=start,
@@ -437,7 +500,7 @@ class V3IOTSDBConnector(TSDBConnector):
                 step=sliding_window_step,
                 **kwargs,
             )
-        except v3io_frames.
+        except v3io_frames.Error as err:
             if _is_no_schema_error(err):
                 return pd.DataFrame()
             else:
@@ -504,10 +567,16 @@ class V3IOTSDBConnector(TSDBConnector):
         if type == "metrics":
            table_path = self.tables[mm_schemas.V3IOTSDBTables.METRICS]
            name = mm_schemas.MetricData.METRIC_NAME
+            columns = [mm_schemas.MetricData.METRIC_VALUE]
            df_handler = self.df_to_metrics_values
         elif type == "results":
            table_path = self.tables[mm_schemas.V3IOTSDBTables.APP_RESULTS]
            name = mm_schemas.ResultData.RESULT_NAME
+            columns = [
+                mm_schemas.ResultData.RESULT_VALUE,
+                mm_schemas.ResultData.RESULT_STATUS,
+                mm_schemas.ResultData.RESULT_KIND,
+            ]
            df_handler = self.df_to_results_values
         else:
            raise ValueError(f"Invalid {type = }")
@@ -517,11 +586,12 @@ class V3IOTSDBConnector(TSDBConnector):
             metric_and_app_names=[(metric.app, metric.name) for metric in metrics],
             table_path=table_path,
             name=name,
+            columns=columns,
         )
 
         logger.debug("Querying V3IO TSDB", query=query)
 
-        df: pd.DataFrame = self.
+        df: pd.DataFrame = self.frames_client.read(
             backend=_TSDB_BE,
             start=start,
             end=end,
@@ -599,7 +669,6 @@ class V3IOTSDBConnector(TSDBConnector):
             end=end,
             columns=[mm_schemas.EventFieldType.LATENCY],
             filter_query=f"endpoint_id=='{endpoint_id}'",
-            interval=aggregation_window,
             agg_funcs=agg_funcs,
             sliding_window_step=aggregation_window,
         )
@@ -628,33 +697,153 @@ class V3IOTSDBConnector(TSDBConnector):
             ), # pyright: ignore[reportArgumentType]
         )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    def get_last_request(
+        self,
+        endpoint_ids: Union[str, list[str]],
+        start: Union[datetime, str] = "0",
+        end: Union[datetime, str] = "now",
+    ) -> pd.DataFrame:
+        endpoint_ids = (
+            endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
+        )
+        df = self._get_records(
+            table=mm_schemas.FileTargetKind.PREDICTIONS,
+            start=start,
+            end=end,
+            filter_query=f"endpoint_id IN({str(endpoint_ids)[1:-1]})",
+            agg_funcs=["last"],
+        )
+        if not df.empty:
+            df.rename(
+                columns={
+                    f"last({mm_schemas.EventFieldType.LAST_REQUEST_TIMESTAMP})": mm_schemas.EventFieldType.LAST_REQUEST,
+                    f"last({mm_schemas.EventFieldType.LATENCY})": f"last_{mm_schemas.EventFieldType.LATENCY}",
+                },
+                inplace=True,
+            )
+            df[mm_schemas.EventFieldType.LAST_REQUEST] = df[
+                mm_schemas.EventFieldType.LAST_REQUEST
+            ].map(
+                lambda last_request: datetime.fromtimestamp(
+                    last_request, tz=timezone.utc
+                )
+            )
+
+        return df.reset_index(drop=True)
+
+    def get_drift_status(
+        self,
+        endpoint_ids: Union[str, list[str]],
+        start: Union[datetime, str] = "now-24h",
+        end: Union[datetime, str] = "now",
+    ) -> pd.DataFrame:
+        endpoint_ids = (
+            endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
+        )
+        df = self._get_records(
+            table=mm_schemas.V3IOTSDBTables.APP_RESULTS,
+            start=start,
+            end=end,
+            columns=[mm_schemas.ResultData.RESULT_STATUS],
+            filter_query=f"endpoint_id IN({str(endpoint_ids)[1:-1]})",
+            agg_funcs=["max"],
+            group_by="endpoint_id",
+        )
+        if not df.empty:
+            df.columns = [
+                col[len("max(") : -1] if "max(" in col else col for col in df.columns
+            ]
+        return df.reset_index(drop=True)
+
+    def get_metrics_metadata(
+        self,
+        endpoint_id: str,
+        start: Union[datetime, str] = "0",
+        end: Union[datetime, str] = "now",
+    ) -> pd.DataFrame:
+        df = self._get_records(
+            table=mm_schemas.V3IOTSDBTables.METRICS,
+            start=start,
+            end=end,
+            columns=[mm_schemas.MetricData.METRIC_VALUE],
+            filter_query=f"endpoint_id=='{endpoint_id}'",
+            agg_funcs=["last"],
+        )
+        if not df.empty:
+            df.drop(
+                columns=[f"last({mm_schemas.MetricData.METRIC_VALUE})"], inplace=True
+            )
+        return df.reset_index(drop=True)
+
+    def get_results_metadata(
+        self,
+        endpoint_id: str,
+        start: Union[datetime, str] = "0",
+        end: Union[datetime, str] = "now",
+    ) -> pd.DataFrame:
+        df = self._get_records(
+            table=mm_schemas.V3IOTSDBTables.APP_RESULTS,
+            start=start,
+            end=end,
+            columns=[
+                mm_schemas.ResultData.RESULT_KIND,
+            ],
+            filter_query=f"endpoint_id=='{endpoint_id}'",
+            agg_funcs=["last"],
+        )
+        if not df.empty:
+            df.rename(
+                columns={
+                    f"last({mm_schemas.ResultData.RESULT_KIND})": mm_schemas.ResultData.RESULT_KIND
+                },
+                inplace=True,
+            )
+        return df.reset_index(drop=True)
+
+    def get_error_count(
+        self,
+        endpoint_ids: Union[str, list[str]],
+        start: Union[datetime, str] = "0",
+        end: Union[datetime, str] = "now",
+    ) -> pd.DataFrame:
+        endpoint_ids = (
+            endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
+        )
+        df = self._get_records(
+            table=mm_schemas.FileTargetKind.ERRORS,
+            start=start,
+            end=end,
+            columns=[mm_schemas.EventFieldType.ERROR_COUNT],
+            filter_query=f"endpoint_id IN({str(endpoint_ids)[1:-1]})",
+            agg_funcs=["count"],
+        )
+        if not df.empty:
+            df.rename(
+                columns={
+                    f"count({mm_schemas.EventFieldType.ERROR_COUNT})": mm_schemas.EventFieldType.ERROR_COUNT
+                },
+                inplace=True,
+            )
+            df.dropna(inplace=True)
+        return df.reset_index(drop=True)
+
+    def get_avg_latency(
+        self,
+        endpoint_ids: Union[str, list[str]],
+        start: Union[datetime, str] = "0",
+        end: Union[datetime, str] = "now",
+    ) -> pd.DataFrame:
+        endpoint_ids = (
+            endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
+        )
+        df = self._get_records(
+            table=mm_schemas.FileTargetKind.PREDICTIONS,
+            start=start,
+            end=end,
+            columns=[mm_schemas.EventFieldType.LATENCY],
+            filter_query=f"endpoint_id IN({str(endpoint_ids)[1:-1]})",
+            agg_funcs=["avg"],
+        )
+        if not df.empty:
+            df.dropna(inplace=True)
+        return df.reset_index(drop=True)
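
The added methods above expose endpoint health queries as plain pandas DataFrames. A hedged usage sketch; the project name and endpoint IDs are placeholders, and the constructor arguments are an assumption based on this diff rather than a documented API:

# Illustrative only; assumes a configured V3IO environment and existing endpoints.
from mlrun.model_monitoring.db.tsdb.v3io.v3io_connector import V3IOTSDBConnector

connector = V3IOTSDBConnector(project="my-project")  # constructor args assumed from this diff

endpoint_ids = ["ep-111", "ep-222"]  # placeholder endpoint UIDs
last_requests = connector.get_last_request(endpoint_ids)  # last seen request per endpoint (UTC)
drift = connector.get_drift_status(endpoint_ids)          # worst result status, last 24h by default
errors = connector.get_error_count(endpoint_ids)          # count of recorded model errors
latency = connector.get_avg_latency(endpoint_ids)         # average latency over the queried range

for df in (last_requests, drift, errors, latency):
    print(df.head())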

mlrun/model_monitoring/helpers.py

@@ -18,25 +18,23 @@ import typing
 import numpy as np
 import pandas as pd
 
+if typing.TYPE_CHECKING:
+    from mlrun.db.base import RunDBInterface
+    from mlrun.projects import MlrunProject
+
 import mlrun
+import mlrun.artifacts
 import mlrun.common.model_monitoring.helpers
-import mlrun.common.schemas
-
-
-)
+import mlrun.common.schemas.model_monitoring.constants as mm_constants
+import mlrun.data_types.infer
+import mlrun.model_monitoring
 from mlrun.common.schemas.model_monitoring.model_endpoints import (
     ModelEndpointMonitoringMetric,
-    ModelEndpointMonitoringMetricType,
     _compose_full_name,
 )
 from mlrun.model_monitoring.model_endpoint import ModelEndpoint
 from mlrun.utils import logger
 
-if typing.TYPE_CHECKING:
-    from mlrun.db.base import RunDBInterface
-    from mlrun.projects import MlrunProject
-import mlrun.common.schemas.model_monitoring.constants as mm_constants
-
 
 class _BatchDict(typing.TypedDict):
     minutes: int
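
The reordered imports above keep `RunDBInterface` and `MlrunProject` under a `typing.TYPE_CHECKING` guard, so they are visible to type checkers without being imported at runtime. A generic illustration of that guard (the module name is a placeholder):

from __future__ import annotations

import typing

if typing.TYPE_CHECKING:
    # Seen only by static type checkers; never imported at runtime,
    # which keeps heavy or circular imports out of the import graph.
    from some_heavy_module import HeavyClient  # placeholder module


def use_client(client: HeavyClient) -> None:
    # With deferred annotations the name does not need to exist at runtime.
    print(type(client).__name__)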
@@ -45,33 +43,32 @@ class _BatchDict(typing.TypedDict):
 
 
 def get_stream_path(
-    project: str
+    project: str,
     function_name: str = mm_constants.MonitoringFunctionNames.STREAM,
+    stream_uri: typing.Optional[str] = None,
 ) -> str:
     """
     Get stream path from the project secret. If wasn't set, take it from the system configurations
 
     :param project: Project name.
-    :param function_name:
+    :param function_name: Application name. Default is model_monitoring_stream.
+    :param stream_uri: Stream URI. If provided, it will be used instead of the one from the project secret.
 
     :return: Monitoring stream path to the relevant application.
     """
 
-    stream_uri = mlrun.get_secret_or_env(
-
+    stream_uri = stream_uri or mlrun.get_secret_or_env(
+        mm_constants.ProjectSecretKeys.STREAM_PATH
     )
 
     if not stream_uri or stream_uri == "v3io":
-        # TODO : remove the first part of this condition in 1.9.0
         stream_uri = mlrun.mlconf.get_model_monitoring_file_target_path(
             project=project,
-            kind=
+            kind=mm_constants.FileTargetKind.STREAM,
             target="online",
             function_name=function_name,
         )
 
-    if isinstance(stream_uri, list):  # ML-6043 - user side gets only the new stream uri
-        stream_uri = stream_uri[1]  # get new stream path, under projects
     return mlrun.common.model_monitoring.helpers.parse_monitoring_stream_path(
         stream_uri=stream_uri, project=project, function_name=function_name
     )
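
With the new optional `stream_uri` argument, callers can bypass the secret and environment lookup entirely. A hedged usage sketch; the project name and the Kafka URI are placeholders, not values taken from this diff:

from mlrun.model_monitoring.helpers import get_stream_path

# Resolve from the project secret / system configuration (previous behaviour).
default_path = get_stream_path(project="my-project")

# Pass an explicit stream URI and skip the secret lookup (new in this version).
kafka_path = get_stream_path(
    project="my-project",
    stream_uri="kafka://broker:9092?topic=monitoring-stream",  # placeholder URI
)
print(default_path, kafka_path)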
@@ -79,7 +76,7 @@ def get_stream_path(
 
 def get_monitoring_parquet_path(
     project: "MlrunProject",
-    kind: str =
+    kind: str = mm_constants.FileTargetKind.PARQUET,
 ) -> str:
     """Get model monitoring parquet target for the current project and kind. The parquet target path is based on the
     project artifact path. If project artifact path is not defined, the parquet target path will be based on MLRun
@@ -111,12 +108,9 @@ def get_connection_string(secret_provider: typing.Callable[[str], str] = None) -
 
     """
 
-    return (
-
-
-            secret_provider=secret_provider,
-        )
-        or mlrun.mlconf.model_endpoint_monitoring.endpoint_store_connection
+    return mlrun.get_secret_or_env(
+        key=mm_constants.ProjectSecretKeys.ENDPOINT_STORE_CONNECTION,
+        secret_provider=secret_provider,
     )
 
 
@@ -129,12 +123,9 @@ def get_tsdb_connection_string(
     :return: Valid TSDB connection string.
     """
 
-    return (
-
-
-            secret_provider=secret_provider,
-        )
-        or mlrun.mlconf.model_endpoint_monitoring.tsdb_connection
+    return mlrun.get_secret_or_env(
+        key=mm_constants.ProjectSecretKeys.TSDB_CONNECTION,
+        secret_provider=secret_provider,
     )
 
 
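
Both connection-string helpers now resolve their value solely through `mlrun.get_secret_or_env`, dropping the fallback to the `mlrun.mlconf` settings. A hedged sketch of that lookup with a toy secret provider (the dictionary and its value are placeholders):

import mlrun
import mlrun.common.schemas.model_monitoring.constants as mm_constants

# A toy secret provider standing in for real project secrets.
fake_secrets = {mm_constants.ProjectSecretKeys.TSDB_CONNECTION: "v3io"}  # placeholder value

value = mlrun.get_secret_or_env(
    mm_constants.ProjectSecretKeys.TSDB_CONNECTION,
    secret_provider=fake_secrets.get,  # falls back to the environment when this returns nothing
)
print(value)  # -> "v3io"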
@@ -184,7 +175,7 @@ def _get_monitoring_time_window_from_controller_run(
 def update_model_endpoint_last_request(
     project: str,
     model_endpoint: ModelEndpoint,
-    current_request: datetime,
+    current_request: datetime.datetime,
     db: "RunDBInterface",
 ) -> None:
     """
@@ -195,7 +186,8 @@ def update_model_endpoint_last_request(
     :param current_request: current request time
     :param db: DB interface.
     """
-
+    is_model_server_endpoint = model_endpoint.spec.stream_path != ""
+    if is_model_server_endpoint:
         current_request = current_request.isoformat()
         logger.info(
             "Update model endpoint last request time (EP with serving)",
@@ -207,14 +199,15 @@ def update_model_endpoint_last_request(
         db.patch_model_endpoint(
             project=project,
             endpoint_id=model_endpoint.metadata.uid,
-            attributes={EventFieldType.LAST_REQUEST: current_request},
+            attributes={mm_constants.EventFieldType.LAST_REQUEST: current_request},
         )
-    else:
+    else:  # model endpoint without any serving function - close the window "manually"
         try:
             time_window = _get_monitoring_time_window_from_controller_run(project, db)
         except mlrun.errors.MLRunNotFoundError:
-            logger.
-                "Not bumping model endpoint last request time - the monitoring controller isn't deployed yet"
+            logger.warn(
+                "Not bumping model endpoint last request time - the monitoring controller isn't deployed yet.\n"
+                "Call `project.enable_model_monitoring()` first."
             )
             return
 
@@ -236,7 +229,7 @@ def update_model_endpoint_last_request(
         db.patch_model_endpoint(
             project=project,
             endpoint_id=model_endpoint.metadata.uid,
-            attributes={EventFieldType.LAST_REQUEST: bumped_last_request},
+            attributes={mm_constants.EventFieldType.LAST_REQUEST: bumped_last_request},
         )
 
 
@@ -256,12 +249,11 @@ def calculate_inputs_statistics(
 
     # Use `DFDataInfer` to calculate the statistics over the inputs:
     inputs_statistics = mlrun.data_types.infer.DFDataInfer.get_stats(
-        df=inputs,
-        options=mlrun.data_types.infer.InferOptions.Histogram,
+        df=inputs, options=mlrun.data_types.infer.InferOptions.Histogram
     )
 
     # Recalculate the histograms over the bins that are set in the sample-set of the end point:
-    for feature in inputs_statistics
+    for feature in list(inputs_statistics):
         if feature in sample_set_statistics:
             counts, bins = np.histogram(
                 inputs[feature].to_numpy(),
@@ -271,13 +263,9 @@ def calculate_inputs_statistics(
                 counts.tolist(),
                 bins.tolist(),
             ]
-
-            #
-
-                mlrun.common.model_monitoring.helpers.Histogram(
-                    inputs_statistics[feature]["hist"]
-                )
-            )
+        else:
+            # If the feature is not in the sample set and doesn't have a histogram, remove it from the statistics:
+            inputs_statistics.pop(feature)
 
     return inputs_statistics
 
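
The rewritten loop above recomputes each feature's histogram over the bin edges recorded for the endpoint's sample set, so live inputs and the reference data share the same bins, and features missing from the sample set are now dropped from the statistics. A small NumPy illustration of recomputing counts over fixed reference bins (the values are made up):

import numpy as np

# Bin edges captured from the endpoint's sample set (reference data) - illustrative values.
reference_bins = np.array([0.0, 1.0, 2.0, 3.0, 4.0])

# Fresh inputs observed at serving time.
live_values = np.array([0.2, 0.9, 1.5, 2.7, 3.3, 3.9])

# Recompute the live histogram over the *reference* bins so both distributions are comparable.
counts, bins = np.histogram(live_values, bins=reference_bins)
print(counts.tolist(), bins.tolist())  # [2, 1, 1, 2] over the shared edges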
@@ -312,7 +300,7 @@ def get_invocations_fqn(project: str) -> str:
         project=project,
         app=mm_constants.SpecialApps.MLRUN_INFRA,
         name=mm_constants.PredictionsQueryConstants.INVOCATIONS,
-        type=ModelEndpointMonitoringMetricType.METRIC,
+        type=mm_constants.ModelEndpointMonitoringMetricType.METRIC,
     )
 
 
@@ -326,7 +314,39 @@ def get_invocations_metric(project: str) -> ModelEndpointMonitoringMetric:
     return ModelEndpointMonitoringMetric(
         project=project,
         app=mm_constants.SpecialApps.MLRUN_INFRA,
-        type=ModelEndpointMonitoringMetricType.METRIC,
+        type=mm_constants.ModelEndpointMonitoringMetricType.METRIC,
         name=mm_constants.PredictionsQueryConstants.INVOCATIONS,
         full_name=get_invocations_fqn(project),
     )
+
+
+def enrich_model_endpoint_with_model_uri(
+    model_endpoint: ModelEndpoint,
+    model_obj: mlrun.artifacts.ModelArtifact,
+):
+    """
+    Enrich the model endpoint object with the model uri from the model object. We will use a unique reference
+    to the model object that includes the project, db_key, iter, and tree.
+    In addition, we verify that the model object is of type `ModelArtifact`.
+
+    :param model_endpoint: An object representing the model endpoint that will be enriched with the model uri.
+    :param model_obj: An object representing the model artifact.
+
+    :raise: `MLRunInvalidArgumentError` if the model object is not of type `ModelArtifact`.
+    """
+    mlrun.utils.helpers.verify_field_of_type(
+        field_name="model_endpoint.spec.model_uri",
+        field_value=model_obj,
+        expected_type=mlrun.artifacts.ModelArtifact,
+    )
+
+    # Update model_uri with a unique reference to handle future changes
+    model_artifact_uri = mlrun.utils.helpers.generate_artifact_uri(
+        project=model_endpoint.metadata.project,
+        key=model_obj.db_key,
+        iter=model_obj.iter,
+        tree=model_obj.tree,
+    )
+    model_endpoint.spec.model_uri = mlrun.datastore.get_store_uri(
+        kind=mlrun.utils.helpers.StorePrefix.Model, uri=model_artifact_uri
+    )
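
A hedged sketch of how the new `enrich_model_endpoint_with_model_uri` helper might be used after logging a model; the project name, model file, and endpoint construction are placeholders, and the surrounding workflow is an assumption rather than part of this diff:

import mlrun
from mlrun.model_monitoring.helpers import enrich_model_endpoint_with_model_uri
from mlrun.model_monitoring.model_endpoint import ModelEndpoint

project = mlrun.get_or_create_project("my-project")  # placeholder project
model_obj = project.log_model("my-model", model_file="model.pkl")  # placeholder model file

endpoint = ModelEndpoint()  # schematic: real usage fills in more metadata/spec fields
endpoint.metadata.project = project.name

# Point the endpoint at a unique, versioned store URI for the logged model.
enrich_model_endpoint_with_model_uri(model_endpoint=endpoint, model_obj=model_obj)
print(endpoint.spec.model_uri)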