mlrun 1.10.0rc16__py3-none-any.whl → 1.10.0rc18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlrun/common/constants.py +2 -0
- mlrun/common/formatters/artifact.py +1 -0
- mlrun/common/schemas/__init__.py +1 -0
- mlrun/common/schemas/model_monitoring/__init__.py +1 -0
- mlrun/common/schemas/model_monitoring/constants.py +33 -6
- mlrun/common/schemas/serving.py +3 -0
- mlrun/common/schemas/workflow.py +1 -0
- mlrun/config.py +15 -5
- mlrun/datastore/datastore.py +4 -4
- mlrun/datastore/datastore_profile.py +26 -0
- mlrun/datastore/model_provider/huggingface_provider.py +183 -0
- mlrun/datastore/model_provider/model_provider.py +6 -1
- mlrun/datastore/model_provider/openai_provider.py +24 -12
- mlrun/datastore/utils.py +6 -0
- mlrun/db/base.py +1 -0
- mlrun/db/httpdb.py +4 -0
- mlrun/model_monitoring/api.py +5 -3
- mlrun/model_monitoring/applications/base.py +107 -28
- mlrun/model_monitoring/applications/results.py +4 -7
- mlrun/model_monitoring/controller.py +175 -121
- mlrun/model_monitoring/stream_processing.py +29 -2
- mlrun/projects/project.py +7 -2
- mlrun/run.py +3 -1
- mlrun/serving/server.py +98 -11
- mlrun/serving/states.py +8 -19
- mlrun/serving/system_steps.py +20 -10
- mlrun/utils/helpers.py +6 -1
- mlrun/utils/logger.py +3 -1
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.10.0rc16.dist-info → mlrun-1.10.0rc18.dist-info}/METADATA +2 -2
- {mlrun-1.10.0rc16.dist-info → mlrun-1.10.0rc18.dist-info}/RECORD +35 -34
- {mlrun-1.10.0rc16.dist-info → mlrun-1.10.0rc18.dist-info}/WHEEL +0 -0
- {mlrun-1.10.0rc16.dist-info → mlrun-1.10.0rc18.dist-info}/entry_points.txt +0 -0
- {mlrun-1.10.0rc16.dist-info → mlrun-1.10.0rc18.dist-info}/licenses/LICENSE +0 -0
- {mlrun-1.10.0rc16.dist-info → mlrun-1.10.0rc18.dist-info}/top_level.txt +0 -0
mlrun/model_monitoring/controller.py
CHANGED

```diff
@@ -12,13 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import collections
 import concurrent.futures
 import datetime
 import json
 import os
 import traceback
-from collections import OrderedDict
 from collections.abc import Iterator
 from contextlib import AbstractContextManager
 from types import TracebackType
@@ -29,20 +27,17 @@ import pandas as pd
 
 import mlrun
 import mlrun.common.schemas.model_monitoring.constants as mm_constants
-import mlrun.feature_store as fstore
 import mlrun.model_monitoring
 import mlrun.model_monitoring.db._schedules as schedules
 import mlrun.model_monitoring.helpers
 import mlrun.platforms.iguazio
-from mlrun.common.schemas import EndpointType
 from mlrun.common.schemas.model_monitoring.constants import (
     ControllerEvent,
     ControllerEventEndpointPolicy,
-    ControllerEventKind,
 )
 from mlrun.errors import err_to_str
 from mlrun.model_monitoring.helpers import batch_dict2timedelta
-from mlrun.utils import
+from mlrun.utils import logger
 
 _SECONDS_IN_DAY = int(datetime.timedelta(days=1).total_seconds())
 _SECONDS_IN_MINUTE = 60
@@ -62,6 +57,7 @@ class _BatchWindow:
         timedelta_seconds: int,
         last_updated: int,
         first_request: int,
+        endpoint_mode: mm_constants.EndpointMode = mm_constants.EndpointMode.REAL_TIME,
     ) -> None:
         """
         Initialize a batch window object that handles the batch interval time range
@@ -74,6 +70,7 @@ class _BatchWindow:
         self._stop = last_updated
         self._step = timedelta_seconds
         self._db = schedules_file
+        self._endpoint_mode = endpoint_mode
         self._start = self._get_last_analyzed()
 
     def _get_saved_last_analyzed(self) -> Optional[int]:
@@ -85,10 +82,20 @@ class _BatchWindow:
         )
 
     def _get_initial_last_analyzed(self) -> int:
+        if self._endpoint_mode == mm_constants.EndpointMode.BATCH:
+            logger.info(
+                "No last analyzed time was found for this endpoint and application, as this is "
+                "probably the first time this application is running. Initializing last analyzed "
+                "to the start of the batch time",
+                application=self._application,
+                start_batch_time=self._first_request,
+            )
+            return self._first_request
         logger.info(
             "No last analyzed time was found for this endpoint and application, as this is "
             "probably the first time this application is running. Initializing last analyzed "
-            "to the latest between first request
+            "to the latest between first request time or last "
+            "update time minus one day",
             application=self._application,
             first_request=self._first_request,
             last_updated=self._stop,
@@ -103,6 +110,9 @@ class _BatchWindow:
     def _get_last_analyzed(self) -> int:
         saved_last_analyzed = self._get_saved_last_analyzed()
         if saved_last_analyzed is not None:
+            if self._endpoint_mode == mm_constants.EndpointMode.BATCH:
+                # Use the maximum between the saved last analyzed and the start of the batch
+                return max(saved_last_analyzed, self._first_request)
             return saved_last_analyzed
         else:
            last_analyzed = self._get_initial_last_analyzed()
@@ -113,6 +123,7 @@ class _BatchWindow:
     def get_intervals(self) -> Iterator[_Interval]:
         """Generate the batch interval time ranges."""
         entered = False
+        last_analyzed = None
         # Iterate timestamp from start until timestamp <= stop - step
         # so that the last interval will end at (timestamp + step) <= stop.
         # Add 1 to stop - step to get <= and not <.
@@ -134,6 +145,40 @@ class _BatchWindow:
                 last_analyzed=last_analyzed,
             )
 
+        if self._endpoint_mode == mm_constants.EndpointMode.BATCH:
+            # If the endpoint is a batch endpoint, we need to update the last analyzed time
+            # to the end of the batch time.
+            if last_analyzed:
+                if last_analyzed < self._stop:
+                    # If the last analyzed time is earlier than the stop time,
+                    # yield the final partial interval from last_analyzed to stop
+                    yield _Interval(
+                        datetime.datetime.fromtimestamp(
+                            last_analyzed, tz=datetime.timezone.utc
+                        ),
+                        datetime.datetime.fromtimestamp(
+                            self._stop, tz=datetime.timezone.utc
+                        ),
+                    )
+            else:
+                # The time span between the start and end of the batch is shorter than the step,
+                # so we need to yield a partial interval covering that range.
+                yield _Interval(
+                    datetime.datetime.fromtimestamp(
+                        self._start, tz=datetime.timezone.utc
+                    ),
+                    datetime.datetime.fromtimestamp(
+                        self._stop, tz=datetime.timezone.utc
+                    ),
+                )
+
+            self._update_last_analyzed(last_analyzed=self._stop)
+            logger.debug(
+                "Updated the last analyzed time for this endpoint and application to the end of the batch time",
+                application=self._application,
+                last_analyzed=self._stop,
+            )
+
         if not entered:
             logger.debug(
                 "All the data is set, but no complete intervals were found. "
@@ -183,28 +228,25 @@ class _BatchWindowGenerator(AbstractContextManager):
 
     @classmethod
     def _get_last_updated_time(
-        cls,
+        cls,
+        last_request: datetime.datetime,
+        endpoint_mode: mm_constants.EndpointMode,
     ) -> int:
         """
         Get the last updated time of a model endpoint.
         """
-
-
-
-
-
-
-
-
-            # If the endpoint does not have a stream, `last_updated` should be
-            # the minimum between the current time and the last updated time.
-            # This compensates for the bumping mechanism - see
-            # `update_model_endpoint_last_request`.
-            last_updated = min(int(datetime_now().timestamp()), last_updated)
-            logger.debug(
-                "The endpoint does not have a stream", last_updated=last_updated
+
+        if endpoint_mode == mm_constants.EndpointMode.REAL_TIME:
+            last_updated = int(
+                last_request.timestamp()
+                - cast(
+                    float,
+                    mlrun.mlconf.model_endpoint_monitoring.parquet_batching_timeout_secs,
+                )
             )
-
+
+            return last_updated
+        return int(last_request.timestamp())
 
     def get_intervals(
         self,
@@ -212,19 +254,21 @@ class _BatchWindowGenerator(AbstractContextManager):
         application: str,
         first_request: datetime.datetime,
         last_request: datetime.datetime,
-
+        endpoint_mode: mm_constants.EndpointMode,
     ) -> Iterator[_Interval]:
         """
         Get the batch window for a specific endpoint and application.
         `first_request` and `last_request` are the timestamps of the first request and last
         request to the endpoint, respectively. They are guaranteed to be nonempty at this point.
         """
+
         self.batch_window = _BatchWindow(
             schedules_file=self._schedules_file,
             application=application,
             timedelta_seconds=self._timedelta,
-            last_updated=self._get_last_updated_time(last_request,
+            last_updated=self._get_last_updated_time(last_request, endpoint_mode),
             first_request=int(first_request.timestamp()),
+            endpoint_mode=endpoint_mode,
         )
         yield from self.batch_window.get_intervals()
 
@@ -247,8 +291,6 @@ class MonitoringApplicationController:
     Note that the MonitoringApplicationController object requires access keys along with valid project configurations.
     """
 
-    _MAX_FEATURE_SET_PER_WORKER = 1000
-
     def __init__(self) -> None:
         """Initialize Monitoring Application Controller"""
         self.project = cast(str, mlrun.mlconf.active_project)
@@ -282,9 +324,6 @@ class MonitoringApplicationController:
                 mlrun.platforms.iguazio.KafkaOutputStream,
             ],
         ] = {}
-        self.feature_sets: OrderedDict[str, mlrun.feature_store.FeatureSet] = (
-            collections.OrderedDict()
-        )
         self.tsdb_connector = mlrun.model_monitoring.get_tsdb_connector(
             project=self.project
         )
@@ -421,7 +460,6 @@
             last_request=endpoint.status.last_request,
             first_request=endpoint.status.first_request,
             endpoint_type=endpoint.metadata.endpoint_type,
-            feature_set_uri=endpoint.spec.monitoring_feature_set_uri,
         )
         return False
 
@@ -477,24 +515,67 @@
         try:
             project_name = event[ControllerEvent.PROJECT]
             endpoint_id = event[ControllerEvent.ENDPOINT_ID]
-            endpoint_name = event[ControllerEvent.ENDPOINT_NAME]
-            applications_names = event[ControllerEvent.ENDPOINT_POLICY][
-                ControllerEventEndpointPolicy.MONITORING_APPLICATIONS
-            ]
 
-
-            event[ControllerEvent.
-
+            if (
+                event[ControllerEvent.KIND]
+                == mm_constants.ControllerEventKind.BATCH_COMPLETE
+            ):
+                monitoring_functions = (
+                    self.project_obj.list_model_monitoring_functions()
+                )
+                if monitoring_functions:
+                    applications_names = list(
+                        {app.metadata.name for app in monitoring_functions}
+                    )
+                    last_stream_timestamp = datetime.datetime.fromisoformat(
+                        event[ControllerEvent.LAST_TIMESTAMP]
+                    )
+                    first_request = datetime.datetime.fromisoformat(
+                        event[ControllerEvent.FIRST_TIMESTAMP]
+                    )
+                    endpoint_mode = mm_constants.EndpointMode.BATCH
+                    model_endpoint = self.project_obj.list_model_endpoints(
+                        uids=[endpoint_id],
+                        latest_only=True,
+                    ).endpoints
+
+                    if not model_endpoint:
+                        logger.error(
+                            "Batch model endpoint not found",
+                            endpoint_id=endpoint_id,
+                            project=project_name,
+                        )
+                        return
+
+                    endpoint_name = model_endpoint[0].metadata.name
+                    endpoint_updated = model_endpoint[0].metadata.updated.isoformat()
+
+                else:
+                    logger.info("No monitoring functions found", project=self.project)
+                    return
+
+            else:
+                endpoint_name = event[ControllerEvent.ENDPOINT_NAME]
+                applications_names = event[ControllerEvent.ENDPOINT_POLICY][
+                    ControllerEventEndpointPolicy.MONITORING_APPLICATIONS
+                ]
+                last_stream_timestamp = datetime.datetime.fromisoformat(
+                    event[ControllerEvent.TIMESTAMP]
+                )
+                first_request = datetime.datetime.fromisoformat(
+                    event[ControllerEvent.FIRST_REQUEST]
+                )
+
+                endpoint_updated = event[ControllerEvent.ENDPOINT_POLICY][
+                    ControllerEventEndpointPolicy.ENDPOINT_UPDATED
+                ]
+
+                endpoint_mode = mm_constants.EndpointMode.REAL_TIME
 
             logger.info(
-                "Starting
-            )
-            last_stream_timestamp = datetime.datetime.fromisoformat(
-                event[ControllerEvent.TIMESTAMP]
-            )
-            first_request = datetime.datetime.fromisoformat(
-                event[ControllerEvent.FIRST_REQUEST]
+                "Starting to analyze", timestamp=last_stream_timestamp.isoformat()
             )
+
             with _BatchWindowGenerator(
                 project=project_name,
                 endpoint_id=endpoint_id,
@@ -506,42 +587,20 @@
                 end_infer_time,
             ) in batch_window_generator.get_intervals(
                 application=application,
-                not_batch_endpoint=not_batch_endpoint,
                 first_request=first_request,
                 last_request=last_stream_timestamp,
+                endpoint_mode=endpoint_mode,
            ):
                data_in_window = False
-
-
-
-
-
-
-
-
-
-                else:
-                    if endpoint_id not in self.feature_sets:
-                        self.feature_sets[endpoint_id] = fstore.get_feature_set(
-                            event[ControllerEvent.FEATURE_SET_URI]
-                        )
-                        self.feature_sets.move_to_end(endpoint_id, last=False)
-                        if (
-                            len(self.feature_sets)
-                            > self._MAX_FEATURE_SET_PER_WORKER
-                        ):
-                            self.feature_sets.popitem(last=True)
-                    m_fs = self.feature_sets.get(endpoint_id)
-
-                    # Batch endpoint - get the relevant window data from the parquet target
-                    df = m_fs.to_dataframe(
-                        start_time=start_infer_time,
-                        end_time=end_infer_time,
-                        time_column=mm_constants.EventFieldType.TIMESTAMP,
-                        storage_options=self.storage_options,
-                    )
-                    if len(df) > 0:
-                        data_in_window = True
+                # Serving endpoint - get the relevant window data from the TSDB
+                prediction_metric = self.tsdb_connector.read_predictions(
+                    start=start_infer_time,
+                    end=end_infer_time,
+                    endpoint_id=endpoint_id,
+                )
+                if prediction_metric.data:
+                    data_in_window = True
+
                 if not data_in_window:
                     logger.info(
                         "No data found for the given interval",
@@ -564,49 +623,47 @@
                         project=project_name,
                         applications_names=[application],
                         model_monitoring_access_key=self.model_monitoring_access_key,
-                        endpoint_updated=
-                            ControllerEventEndpointPolicy.ENDPOINT_UPDATED
-                        ],
+                        endpoint_updated=endpoint_updated,
                    )
-
-                    ControllerEventEndpointPolicy.BASE_PERIOD
-                ]
-                current_time = mlrun.utils.datetime_now()
+
            if (
-
+                event[ControllerEvent.KIND]
+                == mm_constants.ControllerEventKind.REGULAR_EVENT
+            ):
+                base_period = event[ControllerEvent.ENDPOINT_POLICY][
+                    ControllerEventEndpointPolicy.BASE_PERIOD
+                ]
+                current_time = mlrun.utils.datetime_now()
+                if self._should_send_nop_event(
                    base_period,
                    batch_window_generator.get_min_last_analyzed(),
                    current_time,
-                )
-
-
-
-
-
-
-
-
-
-
-
-                    ControllerEvent.
-
-
-                    ControllerEvent.
-
-
-
-
-
-
-
-                }
-                self._push_to_main_stream(
-                    event=event,
-                    endpoint_id=endpoint_id,
-                )
+                ):
+                    event = {
+                        ControllerEvent.KIND: mm_constants.ControllerEventKind.NOP_EVENT,
+                        ControllerEvent.PROJECT: project_name,
+                        ControllerEvent.ENDPOINT_ID: endpoint_id,
+                        ControllerEvent.ENDPOINT_NAME: endpoint_name,
+                        ControllerEvent.TIMESTAMP: current_time.isoformat(
+                            timespec="microseconds"
+                        ),
+                        ControllerEvent.ENDPOINT_POLICY: event[
+                            ControllerEvent.ENDPOINT_POLICY
+                        ],
+                        ControllerEvent.ENDPOINT_TYPE: event[
+                            ControllerEvent.ENDPOINT_TYPE
+                        ],
+                        ControllerEvent.FIRST_REQUEST: event[
+                            ControllerEvent.FIRST_REQUEST
+                        ],
+                    }
+                    self._push_to_main_stream(
+                        event=event,
+                        endpoint_id=endpoint_id,
+                    )
            logger.info(
-                "Finish analyze for",
+                "Finish analyze for",
+                timestamp=last_stream_timestamp,
            )
 
        except Exception:
@@ -674,7 +731,9 @@
         """
         logger.info("Starting monitoring controller chief")
         applications_names = []
-        endpoints = self.project_obj.list_model_endpoints(
+        endpoints = self.project_obj.list_model_endpoints(
+            tsdb_metrics=False, mode=mm_constants.EndpointMode.REAL_TIME
+        ).endpoints
         last_request_dict = self.tsdb_connector.get_last_request(
             endpoint_ids=[mep.metadata.uid for mep in endpoints]
         )
@@ -783,7 +842,6 @@
                 sep=" ", timespec="microseconds"
             ),
             endpoint_type=endpoint.metadata.endpoint_type,
-            feature_set_uri=endpoint.spec.monitoring_feature_set_uri,
             endpoint_policy=json.dumps(policy),
         )
         policy[ControllerEventEndpointPolicy.ENDPOINT_UPDATED] = (
@@ -801,7 +859,6 @@
                 sep=" ", timespec="microseconds"
             ),
             endpoint_type=endpoint.metadata.endpoint_type.value,
-            feature_set_uri=endpoint.spec.monitoring_feature_set_uri,
             endpoint_policy=policy,
         )
 
@@ -814,7 +871,6 @@
         timestamp: str,
         first_request: str,
         endpoint_type: int,
-        feature_set_uri: str,
         endpoint_policy: dict[str, Any],
     ) -> None:
         """
@@ -827,7 +883,6 @@
         :param endpoint_id: endpoint id string
         :param endpoint_name: the endpoint name string
         :param endpoint_type: Enum of the endpoint type
-        :param feature_set_uri: the feature set uri string
         """
         event = {
             ControllerEvent.KIND.value: kind,
@@ -837,7 +892,6 @@
             ControllerEvent.TIMESTAMP.value: timestamp,
             ControllerEvent.FIRST_REQUEST.value: first_request,
             ControllerEvent.ENDPOINT_TYPE.value: endpoint_type,
-            ControllerEvent.FEATURE_SET_URI.value: feature_set_uri,
             ControllerEvent.ENDPOINT_POLICY.value: endpoint_policy,
         }
         logger.info(
```
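The `_BatchWindow` changes above add an `endpoint_mode` and, for batch endpoints, yield a trailing partial interval and bump the saved "last analyzed" time to the end of the batch. The windowing idea is easier to see in isolation; the following is a minimal, self-contained sketch under those assumptions (the `intervals` helper, its parameters, and the simplified loop are illustrative, not mlrun's API):

```python
import datetime
from collections.abc import Iterator

_Interval = tuple[datetime.datetime, datetime.datetime]


def _ts(seconds: int) -> datetime.datetime:
    # Epoch seconds -> timezone-aware UTC datetime, as the controller does.
    return datetime.datetime.fromtimestamp(seconds, tz=datetime.timezone.utc)


def intervals(
    first_request: int, last_updated: int, step: int, batch_mode: bool
) -> Iterator[_Interval]:
    """Full `step`-sized windows; batch mode also yields the leftover tail."""
    last_analyzed = None
    # Complete windows: [t, t + step] with t + step <= last_updated.
    for t in range(first_request, last_updated - step + 1, step):
        yield _ts(t), _ts(t + step)
        last_analyzed = t + step

    if batch_mode:
        # Batch endpoints cover the whole range, so emit the partial tail too.
        start = last_analyzed if last_analyzed is not None else first_request
        if start < last_updated:
            yield _ts(start), _ts(last_updated)


# A 10-minute batch with a 6-minute step: one full window in both modes,
# and batch mode adds the remaining 4-minute tail.
real_time = list(intervals(0, 600, 360, batch_mode=False))  # 1 interval
batch = list(intervals(0, 600, 360, batch_mode=True))       # 2 intervals
```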
mlrun/model_monitoring/stream_processing.py
CHANGED

```diff
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+import asyncio
 import datetime
 import typing
 
@@ -134,6 +134,9 @@ class EventStreamProcessor:
     the default parquet path is under mlrun.mlconf.model_endpoint_monitoring.user_space. Note that if you are
     using CE, the parquet target path is based on the defined MLRun artifact path.
 
+    In a separate branch, "batch complete" events are forwarded to the controller stream with an intentional delay,
+    to allow for data to first be written to parquet.
+
     :param fn: A serving function.
     :param tsdb_connector: Time series database connector.
     :param controller_stream_uri: The controller stream URI. Runs on server api pod so needed to be provided as
@@ -145,6 +148,20 @@ class EventStreamProcessor:
             fn.set_topology(mlrun.serving.states.StepKinds.flow, engine="async"),
         )
 
+        # forward back complete events to controller
+        graph.add_step(
+            "storey.Filter",
+            "FilterBatchComplete",
+            _fn="(event.get('kind') == 'batch_complete')",
+        )
+
+        graph.add_step(
+            "Delay",
+            name="BatchDelay",
+            after="FilterBatchComplete",
+            delay=self.parquet_batching_timeout_secs + 5,  # add margin
+        )
+
         # split the graph between event with error vs valid event
         graph.add_step(
             "storey.Filter",
@@ -261,7 +278,7 @@ class EventStreamProcessor:
             "controller_stream",
             path=stream_uri,
             sharding_func=ControllerEvent.ENDPOINT_ID,
-            after="ForwardNOP",
+            after=["ForwardNOP", "BatchDelay"],
             # Force using the pipeline key instead of the one in the profile in case of v3io profile.
             # In case of Kafka, this parameter will be ignored.
             alternative_v3io_access_key="V3IO_ACCESS_KEY",
@@ -309,6 +326,16 @@ class ProcessBeforeParquet(mlrun.feature_store.steps.MapClass):
         return event
 
 
+class Delay(mlrun.feature_store.steps.MapClass):
+    def __init__(self, delay: int, **kwargs):
+        super().__init__(**kwargs)
+        self._delay = delay
+
+    async def do(self, event):
+        await asyncio.sleep(self._delay)
+        return event
+
+
 class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
     def __init__(
         self,
```
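The `FilterBatchComplete`/`BatchDelay` branch above only lets "batch_complete" events through, and holds them back so the parquet data has landed before the controller reads the window. Here is a small asyncio sketch of that idea; the `asyncio.Queue` stands in for the controller stream and the timeout constant is a made-up placeholder, so this is not the storey graph or mlrun's configuration object:

```python
import asyncio

# Stand-in for mlconf.model_endpoint_monitoring.parquet_batching_timeout_secs
PARQUET_BATCHING_TIMEOUT_SECS = 2


async def forward_batch_complete(event: dict, controller_stream: asyncio.Queue) -> None:
    """Pass only 'batch_complete' events, delayed so parquet is flushed first."""
    if event.get("kind") != "batch_complete":
        return  # other events take the regular path
    await asyncio.sleep(PARQUET_BATCHING_TIMEOUT_SECS + 5)  # same "+ 5" margin as the graph step
    await controller_stream.put(event)


async def main() -> None:
    controller_stream: asyncio.Queue = asyncio.Queue()
    events = [
        {"kind": "batch_complete", "endpoint_id": "ep-1"},
        {"kind": "regular_event", "endpoint_id": "ep-1"},
    ]
    await asyncio.gather(*(forward_batch_complete(e, controller_stream) for e in events))
    while not controller_stream.empty():
        print(controller_stream.get_nowait())  # only the delayed batch_complete event


if __name__ == "__main__":
    asyncio.run(main())
```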
mlrun/projects/project.py
CHANGED
```diff
@@ -3901,6 +3901,7 @@ class MlrunProject(ModelObj):
         start: Optional[datetime.datetime] = None,
         end: Optional[datetime.datetime] = None,
         top_level: bool = False,
+        mode: Optional[mlrun.common.schemas.EndpointMode] = None,
         uids: Optional[list[str]] = None,
         latest_only: bool = False,
         tsdb_metrics: bool = False,
@@ -3916,8 +3917,9 @@
         5) function_tag
         6) labels
         7) top level
-        8)
-        9)
+        8) mode
+        9) uids
+        10) start and end time, corresponding to the `created` field.
         By default, when no filters are applied, all available endpoints for the given project will be listed.
 
         In addition, this functions provides a facade for listing endpoint related metrics. This facade is time-based
@@ -3937,6 +3939,8 @@
         :param start: The start time to filter by.Corresponding to the `created` field.
         :param end: The end time to filter by. Corresponding to the `created` field.
         :param top_level: If true will return only routers and endpoint that are NOT children of any router.
+        :param mode: Specifies the mode of the model endpoint. Can be "real-time", "batch", or both if set
+            to None.
         :param uids: If passed will return a list `ModelEndpoint` object with uid in uids.
         :param tsdb_metrics: When True, the time series metrics will be added to the output
             of the resulting.
@@ -3958,6 +3962,7 @@
             start=start,
             end=end,
             top_level=top_level,
+            mode=mode,
             uids=uids,
             latest_only=latest_only,
             tsdb_metrics=tsdb_metrics,
```
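A short usage sketch of the new `mode` filter; the project name is hypothetical, and the `EndpointMode` import path follows the type hint in the signature above:

```python
import mlrun
from mlrun.common.schemas import EndpointMode

project = mlrun.get_or_create_project("my-project", context="./")

# Only real-time endpoints, mirroring what the monitoring controller chief now requests
realtime = project.list_model_endpoints(
    tsdb_metrics=False, mode=EndpointMode.REAL_TIME
).endpoints

# mode=None (the default) keeps the previous behavior and returns both kinds
all_endpoints = project.list_model_endpoints().endpoints
```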
mlrun/run.py
CHANGED
```diff
@@ -1184,11 +1184,13 @@ def get_model_provider(
     raise_missing_schema_exception=True,
 ) -> ModelProvider:
     """get mlrun dataitem object (from path/url)"""
-
+    # without caching secrets
+    store_manager.set(db=db)
     return store_manager.model_provider_object(
         url=url,
         default_invoke_kwargs=default_invoke_kwargs,
         raise_missing_schema_exception=raise_missing_schema_exception,
+        secrets=secrets,
     )
 
 
```
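For context on the `get_model_provider` change, a hedged sketch of how the forwarded secrets might be passed from user code. The `ds://` profile URL, the profile name, and the `secrets` keyword are assumptions inferred from the hunk above, not confirmed API:

```python
from mlrun.run import get_model_provider

# Assumed: a registered datastore profile named "my-openai-profile", and that
# get_model_provider accepts `secrets` and forwards it (rather than caching it),
# as the added `secrets=secrets` argument suggests.
provider = get_model_provider(
    "ds://my-openai-profile",
    default_invoke_kwargs={"temperature": 0.0},
    secrets={"OPENAI_API_KEY": "..."},
)
# provider.invoke(...) would then use the forwarded credentials (method name assumed).
```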