mlrun 1.7.0rc28__py3-none-any.whl → 1.7.0rc55__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of mlrun might be problematic.
- mlrun/__main__.py +4 -2
- mlrun/alerts/alert.py +75 -8
- mlrun/artifacts/base.py +1 -0
- mlrun/artifacts/manager.py +9 -2
- mlrun/common/constants.py +4 -1
- mlrun/common/db/sql_session.py +3 -2
- mlrun/common/formatters/__init__.py +1 -0
- mlrun/common/formatters/artifact.py +1 -0
- mlrun/{model_monitoring/application.py → common/formatters/feature_set.py} +20 -6
- mlrun/common/formatters/run.py +3 -0
- mlrun/common/helpers.py +0 -1
- mlrun/common/schemas/__init__.py +3 -1
- mlrun/common/schemas/alert.py +15 -12
- mlrun/common/schemas/api_gateway.py +6 -6
- mlrun/common/schemas/auth.py +5 -0
- mlrun/common/schemas/client_spec.py +0 -1
- mlrun/common/schemas/common.py +7 -4
- mlrun/common/schemas/frontend_spec.py +7 -0
- mlrun/common/schemas/function.py +7 -0
- mlrun/common/schemas/model_monitoring/__init__.py +4 -3
- mlrun/common/schemas/model_monitoring/constants.py +41 -26
- mlrun/common/schemas/model_monitoring/model_endpoints.py +23 -47
- mlrun/common/schemas/notification.py +69 -12
- mlrun/common/schemas/project.py +45 -12
- mlrun/common/schemas/workflow.py +10 -2
- mlrun/common/types.py +1 -0
- mlrun/config.py +91 -35
- mlrun/data_types/data_types.py +6 -1
- mlrun/data_types/spark.py +2 -2
- mlrun/data_types/to_pandas.py +57 -25
- mlrun/datastore/__init__.py +1 -0
- mlrun/datastore/alibaba_oss.py +3 -2
- mlrun/datastore/azure_blob.py +125 -37
- mlrun/datastore/base.py +42 -21
- mlrun/datastore/datastore.py +4 -2
- mlrun/datastore/datastore_profile.py +1 -1
- mlrun/datastore/dbfs_store.py +3 -7
- mlrun/datastore/filestore.py +1 -3
- mlrun/datastore/google_cloud_storage.py +85 -29
- mlrun/datastore/inmem.py +4 -1
- mlrun/datastore/redis.py +1 -0
- mlrun/datastore/s3.py +25 -12
- mlrun/datastore/sources.py +76 -4
- mlrun/datastore/spark_utils.py +30 -0
- mlrun/datastore/storeytargets.py +151 -0
- mlrun/datastore/targets.py +102 -131
- mlrun/datastore/v3io.py +1 -0
- mlrun/db/base.py +15 -6
- mlrun/db/httpdb.py +57 -28
- mlrun/db/nopdb.py +29 -5
- mlrun/errors.py +20 -3
- mlrun/execution.py +46 -5
- mlrun/feature_store/api.py +25 -1
- mlrun/feature_store/common.py +6 -11
- mlrun/feature_store/feature_vector.py +3 -1
- mlrun/feature_store/retrieval/job.py +4 -1
- mlrun/feature_store/retrieval/spark_merger.py +10 -39
- mlrun/feature_store/steps.py +8 -0
- mlrun/frameworks/_common/plan.py +3 -3
- mlrun/frameworks/_ml_common/plan.py +1 -1
- mlrun/frameworks/parallel_coordinates.py +2 -3
- mlrun/frameworks/sklearn/mlrun_interface.py +13 -3
- mlrun/k8s_utils.py +48 -2
- mlrun/launcher/client.py +6 -6
- mlrun/launcher/local.py +2 -2
- mlrun/model.py +215 -34
- mlrun/model_monitoring/api.py +38 -24
- mlrun/model_monitoring/applications/__init__.py +1 -2
- mlrun/model_monitoring/applications/_application_steps.py +60 -29
- mlrun/model_monitoring/applications/base.py +2 -174
- mlrun/model_monitoring/applications/context.py +197 -70
- mlrun/model_monitoring/applications/evidently_base.py +11 -85
- mlrun/model_monitoring/applications/histogram_data_drift.py +21 -16
- mlrun/model_monitoring/applications/results.py +4 -4
- mlrun/model_monitoring/controller.py +110 -282
- mlrun/model_monitoring/db/stores/__init__.py +8 -3
- mlrun/model_monitoring/db/stores/base/store.py +3 -0
- mlrun/model_monitoring/db/stores/sqldb/models/base.py +9 -7
- mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +18 -3
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +43 -23
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +48 -35
- mlrun/model_monitoring/db/tsdb/__init__.py +7 -2
- mlrun/model_monitoring/db/tsdb/base.py +147 -15
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +94 -55
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +0 -3
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +144 -38
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +44 -3
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +246 -57
- mlrun/model_monitoring/helpers.py +70 -50
- mlrun/model_monitoring/stream_processing.py +96 -195
- mlrun/model_monitoring/writer.py +13 -5
- mlrun/package/packagers/default_packager.py +2 -2
- mlrun/projects/operations.py +16 -8
- mlrun/projects/pipelines.py +126 -115
- mlrun/projects/project.py +286 -129
- mlrun/render.py +3 -3
- mlrun/run.py +38 -19
- mlrun/runtimes/__init__.py +19 -8
- mlrun/runtimes/base.py +4 -1
- mlrun/runtimes/daskjob.py +1 -1
- mlrun/runtimes/funcdoc.py +1 -1
- mlrun/runtimes/kubejob.py +6 -6
- mlrun/runtimes/local.py +12 -5
- mlrun/runtimes/nuclio/api_gateway.py +68 -8
- mlrun/runtimes/nuclio/application/application.py +307 -70
- mlrun/runtimes/nuclio/function.py +63 -14
- mlrun/runtimes/nuclio/serving.py +10 -10
- mlrun/runtimes/pod.py +25 -19
- mlrun/runtimes/remotesparkjob.py +2 -5
- mlrun/runtimes/sparkjob/spark3job.py +16 -17
- mlrun/runtimes/utils.py +34 -0
- mlrun/serving/routers.py +2 -5
- mlrun/serving/server.py +37 -19
- mlrun/serving/states.py +30 -3
- mlrun/serving/v2_serving.py +44 -35
- mlrun/track/trackers/mlflow_tracker.py +5 -0
- mlrun/utils/async_http.py +1 -1
- mlrun/utils/db.py +18 -0
- mlrun/utils/helpers.py +150 -36
- mlrun/utils/http.py +1 -1
- mlrun/utils/notifications/notification/__init__.py +0 -1
- mlrun/utils/notifications/notification/webhook.py +8 -1
- mlrun/utils/notifications/notification_pusher.py +1 -1
- mlrun/utils/v3io_clients.py +2 -2
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/METADATA +153 -66
- {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/RECORD +131 -134
- {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/WHEEL +1 -1
- mlrun/feature_store/retrieval/conversion.py +0 -271
- mlrun/model_monitoring/controller_handler.py +0 -37
- mlrun/model_monitoring/evidently_application.py +0 -20
- mlrun/model_monitoring/prometheus.py +0 -216
- {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/LICENSE +0 -0
- {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/top_level.txt +0 -0
mlrun/model_monitoring/controller.py

@@ -18,24 +18,19 @@ import json
 import os
 import re
 from collections.abc import Iterator
-from typing import
+from typing import NamedTuple, Optional, Union, cast

 import nuclio

 import mlrun
 import mlrun.common.schemas.model_monitoring.constants as mm_constants
 import mlrun.data_types.infer
-import mlrun.feature_store as fstore
 import mlrun.model_monitoring.db.stores
-from mlrun.common.model_monitoring.helpers import FeatureStats, pad_features_hist
 from mlrun.datastore import get_stream_pusher
-from mlrun.datastore.targets import ParquetTarget
 from mlrun.errors import err_to_str
 from mlrun.model_monitoring.helpers import (
     _BatchDict,
     batch_dict2timedelta,
-    calculate_inputs_statistics,
-    get_monitoring_parquet_path,
     get_stream_path,
 )
 from mlrun.utils import datetime_now, logger
@@ -218,7 +213,7 @@ class _BatchWindowGenerator:
         # If the endpoint does not have a stream, `last_updated` should be
         # the minimum between the current time and the last updated time.
         # This compensates for the bumping mechanism - see
-        # `
+        # `update_model_endpoint_last_request`.
         last_updated = min(int(datetime_now().timestamp()), last_updated)
         logger.debug(
             "The endpoint does not have a stream", last_updated=last_updated
@@ -273,26 +268,14 @@ class MonitoringApplicationController:
     Note that the MonitoringApplicationController object requires access keys along with valid project configurations.
     """

-    def __init__(
-        self,
-
-        project
-    ):
-        """
-        Initialize Monitoring Application Processor object.
+    def __init__(self) -> None:
+        """Initialize Monitoring Application Controller"""
+        self.project = cast(str, mlrun.mlconf.default_project)
+        self.project_obj = mlrun.load_project(name=self.project, url=self.project)

-
-        :param project: Project name.
-        """
-        self.context = mlrun_context
-        self.project = project
-        self.project_obj = mlrun.get_or_create_project(project)
+        logger.debug(f"Initializing {self.__class__.__name__}", project=self.project)

-        logger.debug(
-            f"Initializing {self.__class__.__name__}", project=project
-        )
-
-        self.db = mlrun.model_monitoring.get_store_object(project=project)
+        self.db = mlrun.model_monitoring.get_store_object(project=self.project)

         self._batch_window_generator = _BatchWindowGenerator(
             batch_dict=json.loads(
@@ -303,15 +286,9 @@ class MonitoringApplicationController:
         )

         self.model_monitoring_access_key = self._get_model_monitoring_access_key()
-        self.
-        self.
-            kind=mm_constants.FileTargetKind.APPS_PARQUET,
+        self.tsdb_connector = mlrun.model_monitoring.get_tsdb_connector(
+            project=self.project
         )
-        self.storage_options = None
-        if not mlrun.mlconf.is_ce_mode():
-            self._initialize_v3io_configurations()
-        elif self.parquet_directory.startswith("s3://"):
-            self.storage_options = mlrun.mlconf.get_s3_storage_options()

     @staticmethod
     def _get_model_monitoring_access_key() -> Optional[str]:
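The two hunks above drop the constructor arguments and the parquet/V3IO plumbing: the project now comes from mlrun.mlconf.default_project and per-endpoint data access goes through a TSDB connector. A minimal sketch of constructing the refactored controller, based only on the new __init__ shown here (the project name is a placeholder):

    import mlrun
    from mlrun.model_monitoring.controller import MonitoringApplicationController

    # The controller no longer takes arguments; it reads the default project
    # from the global config, so that must point at an existing project first.
    mlrun.mlconf.default_project = "my-project"  # placeholder name

    # Loads the project, the endpoint store, the batch windows, and the
    # TSDB connector internally.
    controller = MonitoringApplicationController()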
@@ -321,98 +298,85 @@ class MonitoringApplicationController:
         access_key = mlrun.mlconf.get_v3io_access_key()
         return access_key

-    def
-        self.v3io_framesd = mlrun.mlconf.v3io_framesd
-        self.v3io_api = mlrun.mlconf.v3io_api
-        self.storage_options = dict(
-            v3io_access_key=self.model_monitoring_access_key, v3io_api=self.v3io_api
-        )
-
-    def run(self, event: nuclio.Event):
+    def run(self) -> None:
         """
-        Main method for run all the relevant monitoring applications on each endpoint
-
-
+        Main method for run all the relevant monitoring applications on each endpoint.
+        This method handles the following:
+        1. List model endpoints
+        2. List applications
+        3. Check model monitoring windows
+        4. Send data to applications
+        5. Delete old parquets
         """
         logger.info("Start running monitoring controller")
         try:
             applications_names = []
-            endpoints = self.db.list_model_endpoints()
+            endpoints = self.db.list_model_endpoints(include_stats=True)
             if not endpoints:
-                logger.info(
-                    "No model endpoints found", project=self.project
-                )
+                logger.info("No model endpoints found", project=self.project)
                 return
             monitoring_functions = self.project_obj.list_model_monitoring_functions()
             if monitoring_functions:
-                # Gets only application in ready state
                 applications_names = list(
-                    {
-                        app.metadata.name
-                        for app in monitoring_functions
-                        if (
-                            app.status.state == "ready"
-                            # workaround for the default app, as its `status.state` is `None`
-                            or app.metadata.name
-                            == mm_constants.HistogramDataDriftApplicationConstants.NAME
-                        )
-                    }
+                    {app.metadata.name for app in monitoring_functions}
                 )
+            # if monitoring_functions: - TODO : ML-7700
+            #     # Gets only application in ready state
+            #     applications_names = list(
+            #         {
+            #             app.metadata.name
+            #             for app in monitoring_functions
+            #             if (
+            #                 app.status.state == "ready"
+            #                 # workaround for the default app, as its `status.state` is `None`
+            #                 or app.metadata.name
+            #                 == mm_constants.HistogramDataDriftApplicationConstants.NAME
+            #             )
+            #         }
+            #     )
             if not applications_names:
-                logger.info(
-                    "No monitoring functions found", project=self.project
-                )
+                logger.info("No monitoring functions found", project=self.project)
                 return
-
+            logger.info(
                 "Starting to iterate over the applications",
                 applications=applications_names,
             )

         except Exception as e:
-
+            logger.error(
                 "Failed to list endpoints and monitoring applications",
                 exc=err_to_str(e),
             )
             return
         # Initialize a process pool that will be used to run each endpoint applications on a dedicated process
-
+        with concurrent.futures.ThreadPoolExecutor(
             max_workers=min(len(endpoints), 10),
-        )
-
-        for endpoint in endpoints:
-            if (
-                endpoint[mm_constants.EventFieldType.ACTIVE]
-                and endpoint[mm_constants.EventFieldType.MONITORING_MODE]
-                == mm_constants.ModelMonitoringMode.enabled.value
-            ):
-                # Skip router endpoint:
+        ) as pool:
+            for endpoint in endpoints:
                 if (
-
-
+                    endpoint[mm_constants.EventFieldType.ACTIVE]
+                    and endpoint[mm_constants.EventFieldType.MONITORING_MODE]
+                    == mm_constants.ModelMonitoringMode.enabled.value
                 ):
-                    #
-
-
+                    # Skip router endpoint:
+                    if (
+                        int(endpoint[mm_constants.EventFieldType.ENDPOINT_TYPE])
+                        == mm_constants.EndpointType.ROUTER
+                    ):
+                        # Router endpoint has no feature stats
+                        logger.info(
+                            f"{endpoint[mm_constants.EventFieldType.UID]} is router, skipping"
+                        )
+                        continue
+                    pool.submit(
+                        MonitoringApplicationController.model_endpoint_process,
+                        endpoint=endpoint,
+                        applications_names=applications_names,
+                        batch_window_generator=self._batch_window_generator,
+                        project=self.project,
+                        model_monitoring_access_key=self.model_monitoring_access_key,
+                        tsdb_connector=self.tsdb_connector,
                     )
-                    continue
-                future = pool.submit(
-                    MonitoringApplicationController.model_endpoint_process,
-                    endpoint=endpoint,
-                    applications_names=applications_names,
-                    batch_window_generator=self._batch_window_generator,
-                    project=self.project,
-                    parquet_directory=self.parquet_directory,
-                    storage_options=self.storage_options,
-                    model_monitoring_access_key=self.model_monitoring_access_key,
-                )
-                futures.append(future)
-
-            for future in concurrent.futures.as_completed(futures):
-                result = future.result()
-                if result:
-                    self.context.log_results(result)
-
-            self._delete_old_parquet(endpoints=endpoints)

     @classmethod
     def model_endpoint_process(
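The rewritten run() swaps the old futures list plus as_completed() result collection for fire-and-forget submits into a bounded ThreadPoolExecutor, whose with block joins all workers on exit. A standalone sketch of that pattern, with a stand-in for model_endpoint_process:

    import concurrent.futures

    def process(endpoint_id: str) -> None:
        # Stand-in for MonitoringApplicationController.model_endpoint_process.
        print(f"processing {endpoint_id}")

    endpoints = ["ep-1", "ep-2", "ep-3"]
    with concurrent.futures.ThreadPoolExecutor(
        max_workers=min(len(endpoints), 10),
    ) as pool:
        for endpoint_id in endpoints:
            # Fire-and-forget: no futures are collected; the `with` block
            # still waits for all submitted tasks before exiting.
            pool.submit(process, endpoint_id)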
@@ -421,10 +385,9 @@ class MonitoringApplicationController:
         applications_names: list[str],
         batch_window_generator: _BatchWindowGenerator,
         project: str,
-        parquet_directory: str,
-        storage_options: dict,
         model_monitoring_access_key: str,
-
+        tsdb_connector: mlrun.model_monitoring.db.tsdb.TSDBConnector,
+    ) -> None:
         """
         Process a model endpoint and trigger the monitoring applications. This function running on different process
         for each endpoint. In addition, this function will generate a parquet file that includes the relevant data
@@ -434,18 +397,13 @@ class MonitoringApplicationController:
         :param applications_names: (list[str]) List of application names to push results to.
         :param batch_window_generator: (_BatchWindowGenerator) An object that generates _BatchWindow objects.
         :param project: (str) Project name.
-        :param parquet_directory: (str) Directory to store application parquet files
-        :param storage_options: (dict) Storage options for writing ParquetTarget.
         :param model_monitoring_access_key: (str) Access key to apply the model monitoring process.
-
+        :param tsdb_connector: (mlrun.model_monitoring.db.tsdb.TSDBConnector) TSDB connector
         """
         endpoint_id = endpoint[mm_constants.EventFieldType.UID]
-
+        # if false the endpoint represent batch infer step.
+        has_stream = endpoint[mm_constants.EventFieldType.STREAM_PATH] != ""
         try:
-            m_fs = fstore.get_feature_set(
-                endpoint[mm_constants.EventFieldType.FEATURE_SET_URI]
-            )
-
             for application in applications_names:
                 batch_window = batch_window_generator.get_batch_window(
                     project=project,
@@ -453,168 +411,75 @@ class MonitoringApplicationController:
                     application=application,
                     first_request=endpoint[mm_constants.EventFieldType.FIRST_REQUEST],
                     last_request=endpoint[mm_constants.EventFieldType.LAST_REQUEST],
-                    has_stream=
+                    has_stream=has_stream,
                 )

                 for start_infer_time, end_infer_time in batch_window.get_intervals():
-
-
-
-
-
+                    prediction_metric = tsdb_connector.read_predictions(
+                        endpoint_id=endpoint_id,
+                        start=start_infer_time,
+                        end=end_infer_time,
+                    )
+                    if not prediction_metric.data and has_stream:
+                        logger.info(
+                            "No data found for the given interval",
+                            start=start_infer_time,
+                            end=end_infer_time,
                             endpoint_id=endpoint_id,
+                        )
+                    else:
+                        logger.info(
+                            "Data found for the given interval",
+                            start=start_infer_time,
+                            end=end_infer_time,
+                            endpoint_id=endpoint_id,
+                        )
+                        cls._push_to_applications(
                             start_infer_time=start_infer_time,
                             end_infer_time=end_infer_time,
-
-
-
-
-
-                    df = offline_response.to_dataframe()
-                    parquet_target_path = offline_response.vector.get_target_path()
-
-                    if len(df) == 0:
-                        logger.info(
-                            "During this time window, the endpoint has not received any data",
-                            endpoint=endpoint[mm_constants.EventFieldType.UID],
-                            start_time=start_infer_time,
-                            end_time=end_infer_time,
-                        )
-                        continue
-
-                    except FileNotFoundError:
-                        logger.warn(
-                            "No parquets were written yet",
-                            endpoint=endpoint[mm_constants.EventFieldType.UID],
+                            endpoint_id=endpoint_id,
+                            project=project,
+                            applications_names=[application],
+                            model_monitoring_access_key=model_monitoring_access_key,
                         )
-                        continue
-
-                    # Get the timestamp of the latest request:
-                    latest_request = df[mm_constants.EventFieldType.TIMESTAMP].iloc[-1]
-
-                    # Get the feature stats from the model endpoint for reference data
-                    feature_stats = json.loads(
-                        endpoint[mm_constants.EventFieldType.FEATURE_STATS]
-                    )
-
-                    # Pad the original feature stats to accommodate current
-                    # data out of the original range (unless already padded)
-                    pad_features_hist(FeatureStats(feature_stats))
-
-                    # Get the current stats:
-                    current_stats = calculate_inputs_statistics(
-                        sample_set_statistics=feature_stats, inputs=df
-                    )
-                    # end - TODO : delete in 1.9.0 (V1 app deprecation)
-                    cls._push_to_applications(
-                        current_stats=current_stats,
-                        feature_stats=feature_stats,
-                        start_infer_time=start_infer_time,
-                        end_infer_time=end_infer_time,
-                        endpoint_id=endpoint_id,
-                        latest_request=latest_request,
-                        project=project,
-                        applications_names=[application],
-                        model_monitoring_access_key=model_monitoring_access_key,
-                        parquet_target_path=parquet_target_path,
-                    )
-                    start_times.add(start_infer_time)
         except Exception:
             logger.exception(
                 "Encountered an exception",
                 endpoint_id=endpoint[mm_constants.EventFieldType.UID],
             )

-        if start_times:
-            return {endpoint_id: [str(t) for t in sorted(list(start_times))]}
-
-    def _delete_old_parquet(self, endpoints: list[dict[str, Any]], days: int = 1):
-        """
-        Delete application parquets older than the argument days.
-
-        :param endpoints: A list of dictionaries of model endpoints records.
-        """
-        if self.parquet_directory.startswith("v3io:///"):
-            # create fs with access to the user side (under projects)
-            store, _, _ = mlrun.store_manager.get_or_create_store(
-                self.parquet_directory,
-                {"V3IO_ACCESS_KEY": self.model_monitoring_access_key},
-            )
-            fs = store.filesystem
-
-            # calculate time threshold (keep only files from the last 24 hours)
-            time_to_keep = (
-                datetime.datetime.now(tz=datetime.timezone.utc)
-                - datetime.timedelta(days=days)
-            ).timestamp()
-
-            for endpoint in endpoints:
-                try:
-                    apps_parquet_directories = fs.listdir(
-                        path=f"{self.parquet_directory}"
-                        f"/key={endpoint[mm_constants.EventFieldType.UID]}"
-                    )
-                    for directory in apps_parquet_directories:
-                        if directory["mtime"] < time_to_keep:
-                            # Delete files
-                            fs.rm(path=directory["name"], recursive=True)
-                            # Delete directory
-                            fs.rmdir(path=directory["name"])
-                except FileNotFoundError:
-                    logger.info(
-                        "Application parquet directory is empty, "
-                        "probably parquets have not yet been created for this app",
-                        endpoint=endpoint[mm_constants.EventFieldType.UID],
-                        path=f"{self.parquet_directory}"
-                        f"/key={endpoint[mm_constants.EventFieldType.UID]}",
-                    )
-
     @staticmethod
     def _push_to_applications(
-
-
-
-
-
-
-        project,
-        applications_names,
-        model_monitoring_access_key,
-        parquet_target_path,
+        start_infer_time: datetime.datetime,
+        end_infer_time: datetime.datetime,
+        endpoint_id: str,
+        project: str,
+        applications_names: list[str],
+        model_monitoring_access_key: str,
     ):
         """
         Pushes data to multiple stream applications.

-        :param
-        :param
-        :param
-        :param
-        :param
-        :param
-        :param project: mlrun Project name.
-        :param applications_names: List of application names to which data will be pushed.
+        :param start_infer_time: The beginning of the infer interval window.
+        :param end_infer_time: The end of the infer interval window.
+        :param endpoint_id: Identifier for the model endpoint.
+        :param project: mlrun Project name.
+        :param applications_names: List of application names to which data will be pushed.
+        :param model_monitoring_access_key: Access key to apply the model monitoring process.

         """
-
         data = {
-            mm_constants.ApplicationEvent.CURRENT_STATS: json.dumps(current_stats),
-            mm_constants.ApplicationEvent.FEATURE_STATS: json.dumps(feature_stats),
-            mm_constants.ApplicationEvent.SAMPLE_PARQUET_PATH: parquet_target_path,
             mm_constants.ApplicationEvent.START_INFER_TIME: start_infer_time.isoformat(
                 sep=" ", timespec="microseconds"
             ),
             mm_constants.ApplicationEvent.END_INFER_TIME: end_infer_time.isoformat(
                 sep=" ", timespec="microseconds"
             ),
-            mm_constants.ApplicationEvent.LAST_REQUEST: latest_request.isoformat(
-                sep=" ", timespec="microseconds"
-            ),
             mm_constants.ApplicationEvent.ENDPOINT_ID: endpoint_id,
             mm_constants.ApplicationEvent.OUTPUT_STREAM_URI: get_stream_path(
                 project=project,
                 function_name=mm_constants.MonitoringFunctionNames.WRITER,
             ),
-            mm_constants.ApplicationEvent.MLRUN_CONTEXT: {},  # TODO : for future use by ad-hoc batch infer
         }
         for app_name in applications_names:
             data.update({mm_constants.ApplicationEvent.APPLICATION_NAME: app_name})
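Instead of materializing a sample DataFrame through the feature store and writing per-application parquets, the controller now asks the TSDB whether the endpoint saw predictions in each window and pushes only the window metadata to the applications. A sketch of the query step under the same assumptions as the hunk (project name and endpoint UID are placeholders):

    import datetime
    import mlrun.model_monitoring

    end = datetime.datetime.now(tz=datetime.timezone.utc)
    start = end - datetime.timedelta(hours=1)

    tsdb_connector = mlrun.model_monitoring.get_tsdb_connector(project="my-project")
    prediction_metric = tsdb_connector.read_predictions(
        endpoint_id="endpoint-uid",  # placeholder UID
        start=start,
        end=end,
    )
    if not prediction_metric.data:
        # Mirrors the controller's "No data found for the given interval" branch.
        print("no predictions in this window")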
@@ -627,49 +492,12 @@ class MonitoringApplicationController:
             [data]
         )

-    @staticmethod
-    def _get_sample_df(
-        feature_set: mlrun.common.schemas.FeatureSet,
-        endpoint_id: str,
-        start_infer_time: datetime.datetime,
-        end_infer_time: datetime.datetime,
-        parquet_directory: str,
-        storage_options: dict,
-        application_name: str,
-    ) -> mlrun.feature_store.OfflineVectorResponse:
-        """
-        Retrieves a sample DataFrame of the current input according to the provided infer interval window.
-
-        :param feature_set: The main feature set.
-        :param endpoint_id: Identifier for the model endpoint.
-        :param start_infer_time: The beginning of the infer interval window.
-        :param end_infer_time: The end of the infer interval window.
-        :param parquet_directory: Directory where Parquet files are stored.
-        :param storage_options: Storage options for accessing the data.
-        :param application_name: Current application name.

-
+def handler(context: nuclio.Context, event: nuclio.Event) -> None:
+    """
+    Run model monitoring application processor

-
-
-
-
-            features=features,
-            with_indexes=True,
-        )
-        vector.metadata.tag = application_name
-        vector.feature_set_objects = {feature_set.metadata.name: feature_set}
-
-        # get offline features based on application start and end time.
-        # store the result parquet by partitioning by controller end processing time
-        offline_response = vector.get_offline_features(
-            start_time=start_infer_time,
-            end_time=end_infer_time,
-            timestamp_for_filtering=mm_constants.EventFieldType.TIMESTAMP,
-            target=ParquetTarget(
-                path=parquet_directory
-                + f"/key={endpoint_id}/{int(start_infer_time.timestamp())}/{application_name}.parquet",
-                storage_options=storage_options,
-            ),
-        )
-        return offline_response
+    :param context: the Nuclio context
+    :param event: trigger event
+    """
+    MonitoringApplicationController().run()
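The separate mlrun/model_monitoring/controller_handler.py module is gone (see the removed file in the list above); its Nuclio entrypoint now lives in controller.py as the module-level handler shown in this hunk. Per the hunk, the entrypoint ignores its arguments, so its local equivalent reduces to:

    from mlrun.model_monitoring.controller import MonitoringApplicationController

    # Equivalent of handler(context, event) on each Nuclio trigger invocation.
    MonitoringApplicationController().run()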
mlrun/model_monitoring/db/stores/__init__.py

@@ -63,7 +63,7 @@ class ObjectStoreFactory(enum.Enum):
         :param value: Provided enum (invalid) value.
         """
         valid_values = list(cls.__members__.keys())
-        raise mlrun.errors.
+        raise mlrun.errors.MLRunInvalidMMStoreTypeError(
             f"{value} is not a valid endpoint store, please choose a valid value: %{valid_values}."
         )

@@ -100,7 +100,9 @@ def get_store_object(
    :param store_connection_string: Optional explicit connection string of the store.

    :return: `StoreBase` object. Using this object, the user can apply different operations such as write, update, get
-
+             and delete a model endpoint record.
+    :raise: `MLRunInvalidMMStoreTypeError` if the user didn't provide store connection
+            or the provided store connection is invalid.
    """

    store_connection_string = (
@@ -121,7 +123,10 @@ def get_store_object(
             mlrun.common.schemas.model_monitoring.ModelEndpointTarget.V3IO_NOSQL
         )
     else:
-
+        raise mlrun.errors.MLRunInvalidMMStoreTypeError(
+            "You must provide a valid store connection by using "
+            "set_model_monitoring_credentials API."
+        )
     # Get store type value from ObjectStoreFactory enum class
     store_type_fact = ObjectStoreFactory(store_type)

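A missing or invalid store connection now fails fast with MLRunInvalidMMStoreTypeError instead of proceeding with an undefined store. A hedged sketch of satisfying the check through the API named in the error message; the keyword arguments below reflect the 1.7-era signature and the connection values are placeholders:

    import mlrun

    project = mlrun.get_or_create_project("my-project")  # placeholder name
    # "v3io" is one supported value; a SQL connection string is another (assumption).
    project.set_model_monitoring_credentials(
        endpoint_store_connection="v3io",
        tsdb_connection="v3io",
    )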
mlrun/model_monitoring/db/stores/base/store.py

@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 import json
 import typing
 from abc import ABC, abstractmethod
@@ -94,6 +95,7 @@ class StoreBase(ABC):
         labels: list[str] = None,
         top_level: bool = None,
         uids: list = None,
+        include_stats: bool = None,
     ) -> list[dict[str, typing.Any]]:
         """
         Returns a list of model endpoint dictionaries, supports filtering by model, function, labels or top level.
@@ -107,6 +109,7 @@ class StoreBase(ABC):
                              key (i.e. "key").
        :param top_level: If True will return only routers and endpoint that are NOT children of any router.
        :param uids: List of model endpoint unique ids to include in the result.
+       :param include_stats: If True, will include model endpoint statistics in the result.

        :return: A list of model endpoint dictionaries.
        """
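Together with the list_model_endpoints(include_stats=True) call in the controller hunk above, this adds an opt-in way to fetch endpoint statistics in the same query. A sketch, assuming the store object comes from get_store_object and that endpoint records are dicts keyed by a "uid" field:

    import mlrun.model_monitoring

    store = mlrun.model_monitoring.get_store_object(project="my-project")  # placeholder name
    endpoints = store.list_model_endpoints(include_stats=True)
    for endpoint in endpoints:
        # Each record is a dict; "uid" is assumed to be the endpoint id key.
        print(endpoint["uid"])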
mlrun/model_monitoring/db/stores/sqldb/models/base.py

@@ -11,8 +11,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 from sqlalchemy import (
-
+    DATETIME,
+    TIMESTAMP,  # TODO: migrate to DATETIME, see ML-6921
     Boolean,
     Column,
     Float,
@@ -90,11 +92,11 @@ class ModelEndpointsBaseTable(BaseModel):
     metrics = Column(EventFieldType.METRICS, Text)
     first_request = Column(
         EventFieldType.FIRST_REQUEST,
-        TIMESTAMP(timezone=True),
+        TIMESTAMP(timezone=True),  # TODO: migrate to DATETIME, see ML-6921
     )
     last_request = Column(
         EventFieldType.LAST_REQUEST,
-        TIMESTAMP(timezone=True),
+        TIMESTAMP(timezone=True),  # TODO: migrate to DATETIME, see ML-6921
     )


@@ -122,11 +124,11 @@ class ApplicationResultBaseTable(BaseModel):

     start_infer_time = Column(
         WriterEvent.START_INFER_TIME,
-
+        DATETIME(timezone=True),
     )
     end_infer_time = Column(
         WriterEvent.END_INFER_TIME,
-
+        DATETIME(timezone=True),
     )

     result_status = Column(ResultData.RESULT_STATUS, String(10))
@@ -152,11 +154,11 @@ class ApplicationMetricsBaseTable(BaseModel):
     )
     start_infer_time = Column(
         WriterEvent.START_INFER_TIME,
-
+        DATETIME(timezone=True),
     )
     end_infer_time = Column(
         WriterEvent.END_INFER_TIME,
-
+        DATETIME(timezone=True),
     )
     metric_name = Column(
         MetricData.METRIC_NAME,
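The new infer-time columns use SQLAlchemy's DATETIME rather than TIMESTAMP, and the remaining TIMESTAMP columns carry a TODO to migrate (ML-6921). On MySQL this matters because TIMESTAMP values are converted through the session time zone and the type's range ends in 2038, while DATETIME has neither limitation; that is the usual motivation for such a migration. A standalone sketch of the new column style (table and extra columns are illustrative):

    from sqlalchemy import DATETIME, Column, Integer
    from sqlalchemy.orm import declarative_base

    Base = declarative_base()

    class AppResultDemo(Base):
        __tablename__ = "app_result_demo"  # illustrative name
        id = Column(Integer, primary_key=True)
        # DATETIME(timezone=True) renders as a plain DATETIME on MySQL,
        # avoiding TIMESTAMP's session-timezone conversion and 2038 limit.
        start_infer_time = Column(DATETIME(timezone=True))
        end_infer_time = Column(DATETIME(timezone=True))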