mlrun 1.6.0rc20__py3-none-any.whl → 1.6.0rc22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic.
- mlrun/artifacts/base.py +6 -6
- mlrun/artifacts/dataset.py +15 -8
- mlrun/artifacts/manager.py +6 -3
- mlrun/artifacts/model.py +2 -2
- mlrun/artifacts/plots.py +8 -8
- mlrun/config.py +1 -1
- mlrun/data_types/to_pandas.py +1 -1
- mlrun/datastore/azure_blob.py +12 -16
- mlrun/datastore/base.py +32 -10
- mlrun/datastore/datastore_profile.py +4 -4
- mlrun/datastore/dbfs_store.py +12 -11
- mlrun/datastore/filestore.py +2 -1
- mlrun/datastore/google_cloud_storage.py +11 -10
- mlrun/datastore/redis.py +2 -1
- mlrun/datastore/s3.py +12 -15
- mlrun/datastore/sources.py +16 -11
- mlrun/datastore/targets.py +2 -13
- mlrun/datastore/v3io.py +18 -20
- mlrun/db/httpdb.py +76 -7
- mlrun/errors.py +4 -0
- mlrun/execution.py +13 -4
- mlrun/feature_store/api.py +3 -4
- mlrun/launcher/base.py +4 -4
- mlrun/lists.py +0 -6
- mlrun/model.py +8 -1
- mlrun/model_monitoring/api.py +9 -31
- mlrun/model_monitoring/batch.py +14 -13
- mlrun/model_monitoring/controller.py +100 -70
- mlrun/model_monitoring/controller_handler.py +1 -3
- mlrun/model_monitoring/helpers.py +65 -20
- mlrun/model_monitoring/stream_processing.py +0 -3
- mlrun/projects/operations.py +1 -1
- mlrun/projects/project.py +10 -4
- mlrun/runtimes/base.py +6 -1
- mlrun/runtimes/constants.py +11 -0
- mlrun/runtimes/databricks_job/databricks_runtime.py +7 -9
- mlrun/runtimes/kubejob.py +1 -1
- mlrun/runtimes/local.py +64 -53
- mlrun/runtimes/serving.py +8 -1
- mlrun/serving/routers.py +7 -20
- mlrun/serving/server.py +4 -14
- mlrun/serving/utils.py +0 -3
- mlrun/utils/helpers.py +10 -2
- mlrun/utils/logger.py +5 -5
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.6.0rc20.dist-info → mlrun-1.6.0rc22.dist-info}/METADATA +5 -3
- {mlrun-1.6.0rc20.dist-info → mlrun-1.6.0rc22.dist-info}/RECORD +51 -51
- {mlrun-1.6.0rc20.dist-info → mlrun-1.6.0rc22.dist-info}/LICENSE +0 -0
- {mlrun-1.6.0rc20.dist-info → mlrun-1.6.0rc22.dist-info}/WHEEL +0 -0
- {mlrun-1.6.0rc20.dist-info → mlrun-1.6.0rc22.dist-info}/entry_points.txt +0 -0
- {mlrun-1.6.0rc20.dist-info → mlrun-1.6.0rc22.dist-info}/top_level.txt +0 -0
mlrun/model_monitoring/controller.py
CHANGED

@@ -17,7 +17,7 @@ import datetime
 import json
 import os
 import re
-from typing import Any, Iterator,
+from typing import Any, Iterator, NamedTuple, Optional, Union, cast
 
 from v3io.dataplane.response import HttpResponseError
 
@@ -29,11 +29,21 @@ from mlrun.common.model_monitoring.helpers import FeatureStats, pad_features_his
 from mlrun.datastore import get_stream_pusher
 from mlrun.datastore.targets import ParquetTarget
 from mlrun.model_monitoring.batch import calculate_inputs_statistics
-from mlrun.model_monitoring.helpers import
-
+from mlrun.model_monitoring.helpers import (
+    _BatchDict,
+    batch_dict2timedelta,
+    get_monitoring_parquet_path,
+    get_stream_path,
+)
+from mlrun.utils import create_logger, datetime_now, logger
 from mlrun.utils.v3io_clients import get_v3io_client
 
 
+class _Interval(NamedTuple):
+    start: datetime.datetime
+    end: datetime.datetime
+
+
 class _BatchWindow:
     V3IO_CONTAINER_FORMAT = "users/pipelines/{project}/monitoring-schedules/functions"
 
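Note: the `_Interval` NamedTuple added above gives each generated window named `start`/`end` fields while still unpacking like a plain tuple, which is how the controller consumes it later (`for start_infer_time, end_infer_time in batch_window.get_intervals()`). A minimal sketch of that behaviour, with made-up timestamps:

import datetime
from typing import NamedTuple

class Interval(NamedTuple):
    start: datetime.datetime
    end: datetime.datetime

# Hypothetical 10-minute window, purely for illustration.
window = Interval(
    datetime.datetime(2024, 1, 1, 0, 0, tzinfo=datetime.timezone.utc),
    datetime.datetime(2024, 1, 1, 0, 10, tzinfo=datetime.timezone.utc),
)
start, end = window  # tuple-style unpacking, as in the controller loop
assert end - start == datetime.timedelta(minutes=10)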
@@ -55,11 +65,15 @@ class _BatchWindow:
         self._endpoint = endpoint
         self._application = application
         self._first_request = first_request
-        self._kv_storage = get_v3io_client(
+        self._kv_storage = get_v3io_client(
+            endpoint=mlrun.mlconf.v3io_api,
+            # Avoid noisy warning logs before the KV table is created
+            logger=create_logger(name="v3io_client", level="error"),
+        ).kv
         self._v3io_container = self.V3IO_CONTAINER_FORMAT.format(project=project)
-        self._start = self._get_last_analyzed()
         self._stop = last_updated
         self._step = timedelta_seconds
+        self._start = self._get_last_analyzed()
 
     def _get_last_analyzed(self) -> Optional[int]:
         try:
@@ -69,15 +83,26 @@ class _BatchWindow:
                 key=self._application,
             )
         except HttpResponseError as err:
-            logger.
-                "
-                "as this is probably the first time this
-                "Using the
+            logger.info(
+                "No last analyzed time was found for this endpoint and "
+                "application, as this is probably the first time this "
+                "application is running. Using the latest between first "
+                "request time or last update time minus one day instead",
                 endpoint=self._endpoint,
                 application=self._application,
                 first_request=self._first_request,
-
+                last_updated=self._stop,
             )
+            logger.debug("Error while getting last analyzed time", err=err)
+            if self._first_request and self._stop:
+                # TODO : Change the timedelta according to the policy.
+                first_period_in_seconds = max(
+                    int(datetime.timedelta(days=1).total_seconds()), self._step
+                )  # max between one day and the base period
+                return max(
+                    self._first_request,
+                    self._stop - first_period_in_seconds,
+                )
             return self._first_request
 
         last_analyzed = data.output.item[mm_constants.SchedulingKeys.LAST_ANALYZED]
@@ -105,20 +130,29 @@ class _BatchWindow:
 
     def get_intervals(
         self,
-    ) -> Iterator[
+    ) -> Iterator[_Interval]:
         """Generate the batch interval time ranges."""
         if self._start is not None and self._stop is not None:
             entered = False
-
+            # Iterate timestamp from start until timestamp <= stop - step
+            # so that the last interval will end at (timestamp + step) <= stop.
+            # Add 1 to stop - step to get <= and not <.
+            for timestamp in range(
+                self._start, self._stop - self._step + 1, self._step
+            ):
                 entered = True
-                start_time = datetime.datetime.
-
-
+                start_time = datetime.datetime.fromtimestamp(
+                    timestamp, tz=datetime.timezone.utc
+                )
+                end_time = datetime.datetime.fromtimestamp(
+                    timestamp + self._step, tz=datetime.timezone.utc
+                )
+                yield _Interval(start_time, end_time)
                 self._update_last_analyzed(timestamp + self._step)
             if not entered:
                 logger.info(
                     "All the data is set, but no complete intervals were found. "
-                    "Wait for last_updated to be updated
+                    "Wait for last_updated to be updated",
                     endpoint=self._endpoint,
                     application=self._application,
                     start=self._start,
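For context, the rewritten `get_intervals` walks the epoch-second range in `step`-sized hops so the last yielded window still ends at or before `stop`; the `+ 1` makes the final `timestamp + step == stop` case inclusive. A standalone sketch of that arithmetic with made-up numbers (not taken from the package):

import datetime

def iter_windows(start: int, stop: int, step: int):
    # Yield (start, end) UTC datetimes for every complete step-sized window in [start, stop].
    for ts in range(start, stop - step + 1, step):
        yield (
            datetime.datetime.fromtimestamp(ts, tz=datetime.timezone.utc),
            datetime.datetime.fromtimestamp(ts + step, tz=datetime.timezone.utc),
        )

# A 1-hour step over a 2.5-hour range: only two complete windows fit,
# and the trailing half hour is left for the next controller run.
windows = list(iter_windows(start=1_700_000_000, stop=1_700_009_000, step=3600))
assert len(windows) == 2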
@@ -127,8 +161,8 @@
                 )
         else:
             logger.warn(
-                "The first request time is not
-                "No intervals will be generated
+                "The first request time is not found for this endpoint. "
+                "No intervals will be generated",
                 endpoint=self._endpoint,
                 application=self._application,
                 start=self._start,
@@ -165,38 +199,44 @@ class _BatchWindowGenerator:
             self._batch_dict[pair_list[0]] = float(pair_list[1])
 
     def _get_timedelta(self) -> int:
-        """Get the timedelta from
-        self._batch_dict = cast(dict[str, int], self._batch_dict)
-        minutes, hours, days = (
-            self._batch_dict[mm_constants.EventFieldType.MINUTES],
-            self._batch_dict[mm_constants.EventFieldType.HOURS],
-            self._batch_dict[mm_constants.EventFieldType.DAYS],
-        )
+        """Get the timedelta in seconds from the batch dictionary"""
         return int(
-
+            batch_dict2timedelta(cast(_BatchDict, self._batch_dict)).total_seconds()
         )
 
     @classmethod
-    def _get_last_updated_time(
+    def _get_last_updated_time(
+        cls, last_request: Optional[str], has_stream: bool
+    ) -> Optional[int]:
         """
         Get the last updated time of a model endpoint.
         """
         if not last_request:
             return None
-
+        last_updated = int(
             cls._date_string2timestamp(last_request)
             - cast(
                 float,
                 mlrun.mlconf.model_endpoint_monitoring.parquet_batching_timeout_secs,
             )
         )
+        if not has_stream:
+            # If the endpoint does not have a stream, `last_updated` should be
+            # the minimum between the current time and the last updated time.
+            # This compensates for the bumping mechanism - see
+            # `bump_model_endpoint_last_request`.
+            last_updated = min(int(datetime_now().timestamp()), last_updated)
+            logger.debug(
+                "The endpoint does not have a stream", last_updated=last_updated
+            )
+        return last_updated
 
     @classmethod
     def _normalize_first_request(
         cls, first_request: Optional[str], endpoint: str
     ) -> Optional[int]:
         if not first_request:
-            logger.
+            logger.debug(
                 "There is no first request time for this endpoint.",
                 endpoint=endpoint,
                 first_request=first_request,
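The `has_stream` handling above first pushes the window back by the parquet batching timeout and then, for endpoints without a stream, clamps the result to the current time so the bumped `last_request` cannot place the window in the future. A rough sketch of that calculation; the 600-second timeout is an assumed value, not the shipped default:

import datetime

def last_updated_seconds(
    last_request: datetime.datetime, timeout_secs: float, has_stream: bool
) -> int:
    # Push the window back by the batching timeout so late parquets are still covered.
    last_updated = int(last_request.timestamp() - timeout_secs)
    if not has_stream:
        # Stream-less endpoints get their last_request bumped artificially,
        # so never report a point later than "now".
        now = int(datetime.datetime.now(tz=datetime.timezone.utc).timestamp())
        last_updated = min(now, last_updated)
    return last_updated

ts = last_updated_seconds(
    datetime.datetime.now(datetime.timezone.utc), timeout_secs=600.0, has_stream=False
)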
@@ -215,6 +255,7 @@ class _BatchWindowGenerator:
         application: str,
         first_request: Optional[str],
         last_request: Optional[str],
+        has_stream: bool,
     ) -> _BatchWindow:
         """
         Get the batch window for a specific endpoint and application.
@@ -226,7 +267,7 @@ class _BatchWindowGenerator:
             endpoint=endpoint,
             application=application,
             timedelta_seconds=self._timedelta,
-            last_updated=self._get_last_updated_time(last_request),
+            last_updated=self._get_last_updated_time(last_request, has_stream),
             first_request=self._normalize_first_request(first_request, endpoint),
         )
 
@@ -251,20 +292,12 @@ class MonitoringApplicationController:
         """
         self.context = context
         self.project = project
+        self.project_obj = mlrun.get_or_create_project(project)
 
-        logger.
-            "Initializing MonitoringApplicationController",
-            project=project,
-        )
-
-        # Get a runtime database
+        context.logger.debug(f"Initializing {self.__class__.__name__}", project=project)
 
         self.db = mlrun.model_monitoring.get_model_endpoint_store(project=project)
 
-        # If an error occurs, it will be raised using the following argument
-        self.endpoints_exceptions = {}
-
-        # The batch window
         self._batch_window_generator = _BatchWindowGenerator(
             batch_dict=context.parameters[
                 mm_constants.EventFieldType.BATCH_INTERVALS_DICT
@@ -277,7 +310,7 @@ class MonitoringApplicationController:
         )
         self.model_monitoring_access_key = self._get_model_monitoring_access_key()
         self.parquet_directory = get_monitoring_parquet_path(
-
+            self.project_obj,
             kind=mm_constants.FileTargetKind.APPS_PARQUET,
         )
         self.storage_options = None
@@ -303,21 +336,23 @@ class MonitoringApplicationController:
 
     def run(self):
         """
-        Main method for run all the relevant monitoring
+        Main method for run all the relevant monitoring applications on each endpoint
         """
         try:
             endpoints = self.db.list_model_endpoints(uids=self.model_endpoints)
-
-
-
-
-
+            monitoring_functions = self.project_obj.list_model_monitoring_functions()
+            if monitoring_functions:
+                applications_names = list(
+                    {app.metadata.name for app in monitoring_functions}
+                )
             else:
-                logger.info(
+                self.context.logger.info(
+                    "No monitoring functions found", project=self.project
+                )
                 applications_names = []
 
         except Exception as e:
-            logger.error("Failed to list endpoints", exc=e)
+            self.context.logger.error("Failed to list endpoints", exc=e)
             return
         if endpoints and applications_names:
             # Initialize a process pool that will be used to run each endpoint applications on a dedicated process
@@ -354,9 +389,7 @@ class MonitoringApplicationController:
                     futures.append(future)
 
                 for future in concurrent.futures.as_completed(futures):
-
-                    if res:
-                        self.endpoints_exceptions[res[0]] = res[1]
+                    future.result()
 
             self._delete_old_parquet(endpoints=endpoints)
 
@@ -370,7 +403,7 @@ class MonitoringApplicationController:
         parquet_directory: str,
         storage_options: dict,
         model_monitoring_access_key: str,
-    ) ->
+    ) -> None:
         """
         Process a model endpoint and trigger the monitoring applications. This function running on different process
         for each endpoint. In addition, this function will generate a parquet file that includes the relevant data
@@ -405,6 +438,7 @@ class MonitoringApplicationController:
                 application=application,
                 first_request=endpoint[mm_constants.EventFieldType.FIRST_REQUEST],
                 last_request=endpoint[mm_constants.EventFieldType.LAST_REQUEST],
+                has_stream=endpoint[mm_constants.EventFieldType.STREAM_PATH] != "",
             )
 
             for start_infer_time, end_infer_time in batch_window.get_intervals():
@@ -424,22 +458,18 @@ class MonitoringApplicationController:
                     parquet_target_path = offline_response.vector.get_target_path()
 
                     if len(df) == 0:
-                        logger.
-                            "
-                            featureset_name=m_fs.metadata.name,
+                        logger.info(
+                            "During this time window, the endpoint has not received any data",
                             endpoint=endpoint[mm_constants.EventFieldType.UID],
-                            min_required_events=mlrun.mlconf.model_endpoint_monitoring.parquet_batching_max_events,
                             start_time=start_infer_time,
                             end_time=end_infer_time,
                         )
                         continue
 
-                # Continue if not enough events provided since the deployment of the model endpoint
                 except FileNotFoundError:
                     logger.warn(
-                        "
+                        "No parquets were written yet",
                         endpoint=endpoint[mm_constants.EventFieldType.UID],
-                        min_required_events=mlrun.mlconf.model_endpoint_monitoring.parquet_batching_max_events,
                     )
                     continue
 
@@ -473,12 +503,11 @@ class MonitoringApplicationController:
                         model_monitoring_access_key=model_monitoring_access_key,
                         parquet_target_path=parquet_target_path,
                     )
-            except Exception
-                logger.
+            except Exception:
+                logger.exception(
                     "Encountered an exception",
                     endpoint_id=endpoint[mm_constants.EventFieldType.UID],
                 )
-            return endpoint_id, e
 
     def _delete_old_parquet(self, endpoints: list[dict[str, Any]], days: int = 1):
         """
@@ -492,12 +521,14 @@ class MonitoringApplicationController:
             self.parquet_directory,
             {"V3IO_ACCESS_KEY": self.model_monitoring_access_key},
         )
-        fs = store.
+        fs = store.filesystem
 
         # calculate time threshold (keep only files from the last 24 hours)
-        time_to_keep =
-
-
+        time_to_keep = (
+            datetime.datetime.now(tz=datetime.timezone.utc)
+            - datetime.timedelta(days=days)
+        ).timestamp()
+
         for endpoint in endpoints:
             try:
                 apps_parquet_directories = fs.listdir(
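The retention logic above keeps only application parquets newer than `days` back: anything whose modification time falls below the computed `time_to_keep` epoch value becomes a deletion candidate. The same threshold check in isolation, with hypothetical directory entries:

import datetime

def is_stale(mtime_epoch: float, days: int = 1) -> bool:
    # Files older than the cutoff (now minus `days`) are eligible for deletion.
    time_to_keep = (
        datetime.datetime.now(tz=datetime.timezone.utc) - datetime.timedelta(days=days)
    ).timestamp()
    return mtime_epoch < time_to_keep

entries = [
    ("key=ep1/old.parquet", 1_600_000_000.0),  # hypothetical old file
    ("key=ep1/new.parquet", datetime.datetime.now(tz=datetime.timezone.utc).timestamp()),
]
stale = [path for path, mtime in entries if is_stale(mtime)]
assert stale == ["key=ep1/old.parquet"]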
@@ -611,14 +642,13 @@ class MonitoringApplicationController:
 
         # get offline features based on application start and end time.
         # store the result parquet by partitioning by controller end processing time
-        offline_response =
-            feature_vector=vector,
+        offline_response = vector.get_offline_features(
             start_time=start_infer_time,
             end_time=end_infer_time,
             timestamp_for_filtering=mm_constants.EventFieldType.TIMESTAMP,
             target=ParquetTarget(
                 path=parquet_directory
-                + f"/key={endpoint_id}/{start_infer_time.
+                + f"/key={endpoint_id}/{int(start_infer_time.timestamp())}/{application_name}.parquet",
                 storage_options=storage_options,
             ),
         )

mlrun/model_monitoring/controller_handler.py
CHANGED

@@ -16,7 +16,7 @@ import mlrun
 from mlrun.model_monitoring.controller import MonitoringApplicationController
 
 
-def handler(context: mlrun.run.MLClientCtx):
+def handler(context: mlrun.run.MLClientCtx) -> None:
     """
     Run model monitoring application processor
 
@@ -27,5 +27,3 @@ def handler(context: mlrun.run.MLClientCtx):
         project=context.project,
     )
     monitor_app_controller.run()
-    if monitor_app_controller.endpoints_exceptions:
-        context.logger.error(monitor_app_controller.endpoints_exceptions)

mlrun/model_monitoring/helpers.py
CHANGED

@@ -12,20 +12,33 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
 import datetime
 import typing
 
 import mlrun
 import mlrun.common.model_monitoring.helpers
 import mlrun.common.schemas
-from mlrun.common.schemas.model_monitoring import
-
+from mlrun.common.schemas.model_monitoring import (
+    EventFieldType,
+    MonitoringFunctionNames,
+)
+from mlrun.errors import MLRunValueError
 from mlrun.model_monitoring.model_endpoint import ModelEndpoint
 from mlrun.utils import logger
 
 if typing.TYPE_CHECKING:
     from mlrun.db.base import RunDBInterface
+    from mlrun.projects import MlrunProject
+
+
+class _BatchDict(typing.TypedDict):
+    minutes: int
+    hours: int
+    days: int
+
+
+class _MLRunNoRunsFoundError(Exception):
+    pass
 
 
 def get_stream_path(project: str = None, application_name: str = None):
@@ -55,24 +68,22 @@ def get_stream_path(project: str = None, application_name: str = None):
 
 
 def get_monitoring_parquet_path(
-    project:
+    project: "MlrunProject",
     kind: str = mlrun.common.schemas.model_monitoring.FileTargetKind.PARQUET,
 ) -> str:
     """Get model monitoring parquet target for the current project and kind. The parquet target path is based on the
     project artifact path. If project artifact path is not defined, the parquet target path will be based on MLRun
     artifact path.
 
-    :param project: Project
+    :param project: Project object.
     :param kind: indicate the kind of the parquet path, can be either stream_parquet or stream_controller_parquet
 
     :return: Monitoring parquet target path.
     """
-
-    project_obj = mlrun.get_or_create_project(name=project)
-    artifact_path = project_obj.spec.artifact_path
+    artifact_path = project.spec.artifact_path
     # Generate monitoring parquet path value
     parquet_path = mlrun.mlconf.get_model_monitoring_file_target_path(
-        project=project,
+        project=project.name,
         kind=kind,
         target="offline",
         artifact_path=artifact_path,
|
|
|
99
110
|
)
|
|
100
111
|
|
|
101
112
|
|
|
113
|
+
def batch_dict2timedelta(batch_dict: _BatchDict) -> datetime.timedelta:
|
|
114
|
+
"""
|
|
115
|
+
Convert a batch dictionary to timedelta.
|
|
116
|
+
|
|
117
|
+
:param batch_dict: Batch dict.
|
|
118
|
+
|
|
119
|
+
:return: Timedelta.
|
|
120
|
+
"""
|
|
121
|
+
return datetime.timedelta(**batch_dict)
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def _get_monitoring_time_window_from_controller_run(
|
|
125
|
+
project: str, db: "RunDBInterface"
|
|
126
|
+
) -> datetime.timedelta:
|
|
127
|
+
"""
|
|
128
|
+
Get timedelta for the controller to run.
|
|
129
|
+
|
|
130
|
+
:param project: Project name.
|
|
131
|
+
:param db: DB interface.
|
|
132
|
+
|
|
133
|
+
:return: Timedelta for the controller to run.
|
|
134
|
+
"""
|
|
135
|
+
run_name = MonitoringFunctionNames.APPLICATION_CONTROLLER
|
|
136
|
+
runs = db.list_runs(project=project, name=run_name, sort=True)
|
|
137
|
+
if not runs:
|
|
138
|
+
raise _MLRunNoRunsFoundError(f"No {run_name} runs were found")
|
|
139
|
+
last_run = runs[0]
|
|
140
|
+
try:
|
|
141
|
+
batch_dict = last_run["spec"]["parameters"]["batch_intervals_dict"]
|
|
142
|
+
except KeyError:
|
|
143
|
+
raise MLRunValueError(
|
|
144
|
+
f"Could not find `batch_intervals_dict` in {run_name} run"
|
|
145
|
+
)
|
|
146
|
+
return batch_dict2timedelta(batch_dict)
|
|
147
|
+
|
|
148
|
+
|
|
102
149
|
def bump_model_endpoint_last_request(
|
|
103
150
|
project: str,
|
|
104
151
|
model_endpoint: ModelEndpoint,
|
|
105
152
|
db: "RunDBInterface",
|
|
106
|
-
minutes_delta: int = 10, # TODO: move to config - should be the same as `batch_interval`
|
|
107
|
-
seconds_delta: int = 1,
|
|
108
153
|
) -> None:
|
|
109
154
|
"""
|
|
110
155
|
Update the last request field of the model endpoint to be after the current last request time.
|
|
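The new `batch_dict2timedelta` helper simply splats the `minutes`/`hours`/`days` keys into `datetime.timedelta`, so a batch dict maps directly onto a window length. A quick illustration mirroring `_BatchDict`; the ten-minute dict is an assumed example, not a shipped default:

import datetime
from typing import TypedDict

class BatchDict(TypedDict):  # mirrors the private _BatchDict above
    minutes: int
    hours: int
    days: int

def batch_dict2timedelta(batch_dict: BatchDict) -> datetime.timedelta:
    return datetime.timedelta(**batch_dict)

window = batch_dict2timedelta({"minutes": 10, "hours": 0, "days": 0})
assert window.total_seconds() == 600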
@@ -112,10 +157,6 @@ def bump_model_endpoint_last_request(
     :param project: Project name.
     :param model_endpoint: Model endpoint object.
     :param db: DB interface.
-    :param minutes_delta: Minutes delta to add to the last request time.
-    :param seconds_delta: Seconds delta to add to the last request time. This is mainly to ensure that the last
-                          request time is strongly greater than the previous one (with respect to the window time)
-                          after adding the minutes delta.
     """
     if not model_endpoint.status.last_request:
         logger.error(
@@ -123,14 +164,18 @@ def bump_model_endpoint_last_request(
             project=project,
             endpoint_id=model_endpoint.metadata.uid,
         )
-        raise
+        raise MLRunValueError("Model endpoint last request time is empty")
+    try:
+        time_window = _get_monitoring_time_window_from_controller_run(project, db)
+    except _MLRunNoRunsFoundError:
+        logger.debug(
+            "Not bumping model endpoint last request time - no controller runs were found"
+        )
+        return
 
     bumped_last_request = (
         datetime.datetime.fromisoformat(model_endpoint.status.last_request)
-        +
-            minutes=minutes_delta,
-            seconds=seconds_delta,
-        )
+        + time_window
         + datetime.timedelta(
             seconds=mlrun.mlconf.model_endpoint_monitoring.parquet_batching_timeout_secs
         )
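With the fixed per-call deltas removed, the bump is now derived from the controller's own batch interval plus the parquet batching timeout. The resulting timestamp arithmetic, sketched with assumed values (a 10-minute window and a 600-second timeout):

import datetime

last_request = datetime.datetime.fromisoformat("2024-01-01T00:00:00+00:00")
time_window = datetime.timedelta(minutes=10)  # from the controller run's batch_intervals_dict
timeout = datetime.timedelta(seconds=600)     # parquet_batching_timeout_secs (assumed value)

bumped_last_request = last_request + time_window + timeout
assert bumped_last_request.isoformat() == "2024-01-01T00:20:00+00:00"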

mlrun/model_monitoring/stream_processing.py
CHANGED

@@ -528,9 +528,6 @@ class ProcessBeforeTSDB(mlrun.feature_store.steps.MapClass):
 
         # Getting event timestamp and endpoint_id
         base_event = {k: event[k] for k in base_fields}
-        base_event[EventFieldType.TIMESTAMP] = datetime.datetime.fromisoformat(
-            base_event[EventFieldType.TIMESTAMP]
-        )
 
         # base_metrics includes the stats about the average latency and the amount of predictions over time
         base_metrics = {

mlrun/projects/operations.py
CHANGED
@@ -274,7 +274,7 @@ def build_function(
     if not overwrite_build_params:
         # TODO: change overwrite_build_params default to True in 1.8.0
         warnings.warn(
-            "The `overwrite_build_params` parameter default will change from 'False' to 'True in 1.8.0.",
+            "The `overwrite_build_params` parameter default will change from 'False' to 'True' in 1.8.0.",
             mlrun.utils.OverwriteBuildParamsWarning,
         )
 
mlrun/projects/project.py
CHANGED
@@ -2167,7 +2167,7 @@ class MlrunProject(ModelObj):
         self.spec.remove_function(name)
 
     def remove_model_monitoring_function(self, name):
-        """remove the specified model-monitoring-app function from the project
+        """remove the specified model-monitoring-app function from the project and from the db
 
         :param name: name of the model-monitoring-app function (under the project)
         """
@@ -2177,6 +2177,7 @@ class MlrunProject(ModelObj):
             == mm_constants.ModelMonitoringAppLabel.VAL
         ):
             self.remove_function(name=name)
+            mlrun.db.get_run_db().delete_function(name=name.lower())
             logger.info(f"{name} function has been removed from {self.name} project")
         else:
             raise logger.error(
@@ -2753,6 +2754,11 @@ class MlrunProject(ModelObj):
         project_file_path = path.join(
             self.spec.context, self.spec.subpath or "", "project.yaml"
         )
+        if filepath and "://" in str(filepath) and not archive_code:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "URLs are only applicable to archives"
+            )
+
         project_dir = pathlib.Path(project_file_path).parent
         project_dir.mkdir(parents=True, exist_ok=True)
         with open(project_file_path, "w") as fp:
@@ -3011,7 +3017,7 @@ class MlrunProject(ModelObj):
         if not overwrite_build_params:
             # TODO: change overwrite_build_params default to True in 1.8.0
             warnings.warn(
-                "The `overwrite_build_params` parameter default will change from 'False' to 'True in 1.8.0.",
+                "The `overwrite_build_params` parameter default will change from 'False' to 'True' in 1.8.0.",
                 mlrun.utils.OverwriteBuildParamsWarning,
             )
             default_image_name = mlrun.mlconf.default_project_image_name.format(
@@ -3097,7 +3103,7 @@ class MlrunProject(ModelObj):
         if not overwrite_build_params:
             # TODO: change overwrite_build_params default to True in 1.8.0
             warnings.warn(
-                "The `overwrite_build_params` parameter default will change from 'False' to 'True in 1.8.0.",
+                "The `overwrite_build_params` parameter default will change from 'False' to 'True' in 1.8.0.",
                 mlrun.utils.OverwriteBuildParamsWarning,
             )
 
@@ -3402,7 +3408,7 @@ class MlrunProject(ModelObj):
         :param state: List only runs whose state is specified.
         :param sort: Whether to sort the result according to their start time. Otherwise, results will be
             returned by their internal order in the DB (order will not be guaranteed).
-        :param last: Deprecated - currently not used.
+        :param last: Deprecated - currently not used (will be removed in 1.8.0).
         :param iter: If ``True`` return runs from all iterations. Otherwise, return only runs whose ``iter`` is 0.
         :param start_time_from: Filter by run start time in ``[start_time_from, start_time_to]``.
         :param start_time_to: Filter by run start time in ``[start_time_from, start_time_to]``.
mlrun/runtimes/base.py
CHANGED
@@ -550,7 +550,12 @@ class BaseRuntime(ModelObj):
         if err:
             updates["status.error"] = err_to_str(err)
 
-        elif
+        elif (
+            not was_none
+            and last_state != mlrun.runtimes.constants.RunStates.completed
+            and last_state
+            not in mlrun.runtimes.constants.RunStates.error_and_abortion_states()
+        ):
             try:
                 runtime_cls = mlrun.runtimes.get_runtime_class(kind)
                 updates = runtime_cls._get_run_completion_updates(resp)
mlrun/runtimes/constants.py
CHANGED
@@ -165,6 +165,17 @@ class RunStates(object):
         RunStates.aborted,
     ]
 
+    @staticmethod
+    def abortion_states():
+        return [
+            RunStates.aborted,
+            RunStates.aborting,
+        ]
+
+    @staticmethod
+    def error_and_abortion_states():
+        return list(set(RunStates.error_states()) | set(RunStates.abortion_states()))
+
     @staticmethod
     def non_terminal_states():
         return list(set(RunStates.all()) - set(RunStates.terminal_states()))
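The new `error_and_abortion_states` helper is just the set union of the existing error states and the new abortion states, deduplicated and returned as a list. A standalone sketch of that composition; the state strings here are assumed for illustration:

ERROR_STATES = ["error", "aborted"]        # assumed contents of RunStates.error_states()
ABORTION_STATES = ["aborted", "aborting"]  # mirrors the new abortion_states()

def error_and_abortion_states() -> list:
    # The union removes the overlapping "aborted" entry so each state appears once.
    return list(set(ERROR_STATES) | set(ABORTION_STATES))

assert sorted(error_and_abortion_states()) == ["aborted", "aborting", "error"]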

mlrun/runtimes/databricks_job/databricks_runtime.py
CHANGED

@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import os
 from ast import FunctionDef, parse, unparse
 from base64 import b64decode, b64encode
 from typing import Callable, Dict, List, Optional, Union
@@ -197,14 +196,13 @@ class DatabricksRuntime(kubejob.KubejobRuntime):
             if value:
                 task_parameters[key] = value  # in order to handle reruns.
         runspec.spec.parameters["task_parameters"] = task_parameters
-
-
-
-
-        )
-
-
-        wrap_code = b64encode(wrap_code.encode("utf-8")).decode("utf-8")
+        wrap_code = b"""
+from mlrun.runtimes.databricks_job import databricks_wrapper
+
+def run_mlrun_databricks_job(context,task_parameters: dict, **kwargs):
+    databricks_wrapper.run_mlrun_databricks_job(context, task_parameters, **kwargs)
+"""
+        wrap_code = b64encode(wrap_code).decode("utf-8")
         self.spec.build.functionSourceCode = wrap_code
         runspec.spec.handler = "run_mlrun_databricks_job"
 
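The Databricks change above replaces the previously assembled wrapper source with a fixed bytes literal that is base64-encoded into `functionSourceCode`. A sketch of that round trip, reusing the wrapper body from the hunk:

from base64 import b64decode, b64encode

wrap_code = b"""
from mlrun.runtimes.databricks_job import databricks_wrapper

def run_mlrun_databricks_job(context,task_parameters: dict, **kwargs):
    databricks_wrapper.run_mlrun_databricks_job(context, task_parameters, **kwargs)
"""

encoded = b64encode(wrap_code).decode("utf-8")  # what gets stored on spec.build.functionSourceCode
assert b64decode(encoded) == wrap_code          # the original source is recoverable on the other side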