mlrun-1.8.0rc20-py3-none-any.whl → mlrun-1.8.0rc24-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic. Click here for more details.
- mlrun/__init__.py +37 -3
- mlrun/artifacts/document.py +40 -11
- mlrun/common/schemas/client_spec.py +0 -1
- mlrun/common/schemas/model_monitoring/constants.py +26 -9
- mlrun/config.py +39 -6
- mlrun/datastore/datastore_profile.py +58 -16
- mlrun/datastore/sources.py +7 -1
- mlrun/datastore/vectorstore.py +17 -1
- mlrun/db/base.py +3 -0
- mlrun/db/httpdb.py +0 -8
- mlrun/db/nopdb.py +3 -0
- mlrun/errors.py +4 -0
- mlrun/execution.py +1 -0
- mlrun/model_monitoring/controller.py +266 -103
- mlrun/model_monitoring/db/tsdb/__init__.py +11 -23
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +5 -2
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +2 -2
- mlrun/model_monitoring/helpers.py +15 -9
- mlrun/model_monitoring/stream_processing.py +72 -2
- mlrun/projects/project.py +95 -32
- mlrun/runtimes/nuclio/serving.py +1 -1
- mlrun/serving/server.py +11 -3
- mlrun/serving/states.py +33 -8
- mlrun/utils/notifications/notification_pusher.py +11 -2
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.8.0rc20.dist-info → mlrun-1.8.0rc24.dist-info}/METADATA +14 -2
- {mlrun-1.8.0rc20.dist-info → mlrun-1.8.0rc24.dist-info}/RECORD +31 -31
- {mlrun-1.8.0rc20.dist-info → mlrun-1.8.0rc24.dist-info}/WHEEL +1 -1
- {mlrun-1.8.0rc20.dist-info → mlrun-1.8.0rc24.dist-info}/LICENSE +0 -0
- {mlrun-1.8.0rc20.dist-info → mlrun-1.8.0rc24.dist-info}/entry_points.txt +0 -0
- {mlrun-1.8.0rc20.dist-info → mlrun-1.8.0rc24.dist-info}/top_level.txt +0 -0
|
@@ -12,14 +12,13 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
|
-
import concurrent.futures
|
|
16
15
|
import datetime
|
|
17
16
|
import json
|
|
18
17
|
import os
|
|
19
18
|
from collections.abc import Iterator
|
|
20
19
|
from contextlib import AbstractContextManager
|
|
21
20
|
from types import TracebackType
|
|
22
|
-
from typing import NamedTuple, Optional, cast
|
|
21
|
+
from typing import Any, NamedTuple, Optional, cast
|
|
23
22
|
|
|
24
23
|
import nuclio_sdk
|
|
25
24
|
|
|
@@ -28,6 +27,10 @@ import mlrun.common.schemas.model_monitoring.constants as mm_constants
|
|
|
28
27
|
import mlrun.feature_store as fstore
|
|
29
28
|
import mlrun.model_monitoring
|
|
30
29
|
from mlrun.common.schemas import EndpointType
|
|
30
|
+
from mlrun.common.schemas.model_monitoring.constants import (
|
|
31
|
+
ControllerEvent,
|
|
32
|
+
ControllerEventKind,
|
|
33
|
+
)
|
|
31
34
|
from mlrun.datastore import get_stream_pusher
|
|
32
35
|
from mlrun.errors import err_to_str
|
|
33
36
|
from mlrun.model_monitoring.db._schedules import ModelMonitoringSchedulesFile
|
|
@@ -140,6 +143,7 @@ class _BatchWindowGenerator(AbstractContextManager):
|
|
|
140
143
|
Initialize a batch window generator object that generates batch window objects
|
|
141
144
|
for the monitoring functions.
|
|
142
145
|
"""
|
|
146
|
+
self.batch_window: _BatchWindow = None
|
|
143
147
|
self._project = project
|
|
144
148
|
self._endpoint_id = endpoint_id
|
|
145
149
|
self._timedelta = window_length
|
|
@@ -199,14 +203,14 @@ class _BatchWindowGenerator(AbstractContextManager):
|
|
|
199
203
|
`first_request` and `last_request` are the timestamps of the first request and last
|
|
200
204
|
request to the endpoint, respectively. They are guaranteed to be nonempty at this point.
|
|
201
205
|
"""
|
|
202
|
-
batch_window = _BatchWindow(
|
|
206
|
+
self.batch_window = _BatchWindow(
|
|
203
207
|
schedules_file=self._schedules_file,
|
|
204
208
|
application=application,
|
|
205
209
|
timedelta_seconds=self._timedelta,
|
|
206
210
|
last_updated=self._get_last_updated_time(last_request, not_batch_endpoint),
|
|
207
211
|
first_request=int(first_request.timestamp()),
|
|
208
212
|
)
|
|
209
|
-
yield from batch_window.get_intervals()
|
|
213
|
+
yield from self.batch_window.get_intervals()
|
|
210
214
|
|
|
211
215
|
|
|
212
216
|
def _get_window_length() -> int:
|
|
@@ -237,6 +241,7 @@ class MonitoringApplicationController:
|
|
|
237
241
|
self._window_length = _get_window_length()
|
|
238
242
|
|
|
239
243
|
self.model_monitoring_access_key = self._get_model_monitoring_access_key()
|
|
244
|
+
self.v3io_access_key = mlrun.get_secret_or_env("V3IO_ACCESS_KEY")
|
|
240
245
|
self.storage_options = None
|
|
241
246
|
if mlrun.mlconf.artifact_path.startswith("s3://"):
|
|
242
247
|
self.storage_options = mlrun.mlconf.get_s3_storage_options()
|
|
@@ -262,112 +267,65 @@ class MonitoringApplicationController:
|
|
|
262
267
|
!= mm_constants.EndpointType.ROUTER.value
|
|
263
268
|
)
|
|
264
269
|
|
|
265
|
-
def run(self) -> None:
|
|
270
|
+
def run(self, event: nuclio_sdk.Event) -> None:
|
|
266
271
|
"""
|
|
267
|
-
Main method for
|
|
272
|
+
Main method for controller chief, runs all the relevant monitoring applications for a single endpoint.
|
|
273
|
+
Handles nop events logic.
|
|
268
274
|
This method handles the following:
|
|
269
|
-
1.
|
|
270
|
-
2.
|
|
271
|
-
3.
|
|
272
|
-
4.
|
|
273
|
-
5. Delete old parquets
|
|
275
|
+
1. Read applications from the event (endpoint_policy)
|
|
276
|
+
2. Check model monitoring windows
|
|
277
|
+
3. Send data to applications
|
|
278
|
+
4. Pushes nop event to main stream if needed
|
|
274
279
|
"""
|
|
275
|
-
logger.info("Start running monitoring controller")
|
|
280
|
+
logger.info("Start running monitoring controller worker")
|
|
276
281
|
try:
|
|
277
|
-
|
|
278
|
-
endpoints_list = mlrun.db.get_run_db().list_model_endpoints(
|
|
279
|
-
project=self.project, tsdb_metrics=True
|
|
280
|
-
)
|
|
281
|
-
endpoints = endpoints_list.endpoints
|
|
282
|
-
if not endpoints:
|
|
283
|
-
logger.info("No model endpoints found", project=self.project)
|
|
284
|
-
return
|
|
285
|
-
monitoring_functions = self.project_obj.list_model_monitoring_functions()
|
|
286
|
-
if monitoring_functions:
|
|
287
|
-
applications_names = list(
|
|
288
|
-
{app.metadata.name for app in monitoring_functions}
|
|
289
|
-
)
|
|
290
|
-
# if monitoring_functions: - TODO : ML-7700
|
|
291
|
-
# Gets only application in ready state
|
|
292
|
-
# applications_names = list(
|
|
293
|
-
# {
|
|
294
|
-
# app.metadata.name
|
|
295
|
-
# for app in monitoring_functions
|
|
296
|
-
# if (
|
|
297
|
-
# app.status.state == "ready"
|
|
298
|
-
# # workaround for the default app, as its `status.state` is `None`
|
|
299
|
-
# or app.metadata.name
|
|
300
|
-
# == mm_constants.HistogramDataDriftApplicationConstants.NAME
|
|
301
|
-
# )
|
|
302
|
-
# }
|
|
303
|
-
# )
|
|
304
|
-
if not applications_names:
|
|
305
|
-
logger.info("No monitoring functions found", project=self.project)
|
|
306
|
-
return
|
|
307
|
-
logger.info(
|
|
308
|
-
"Starting to iterate over the applications",
|
|
309
|
-
applications=applications_names,
|
|
310
|
-
)
|
|
311
|
-
|
|
282
|
+
body = json.loads(event.body.decode("utf-8"))
|
|
312
283
|
except Exception as e:
|
|
313
284
|
logger.error(
|
|
314
|
-
"Failed to
|
|
285
|
+
"Failed to decode event",
|
|
315
286
|
exc=err_to_str(e),
|
|
316
287
|
)
|
|
317
288
|
return
|
|
318
|
-
#
|
|
319
|
-
|
|
320
|
-
max_workers=min(len(endpoints), 10)
|
|
321
|
-
) as pool:
|
|
322
|
-
for endpoint in endpoints:
|
|
323
|
-
if self._should_monitor_endpoint(endpoint):
|
|
324
|
-
pool.submit(
|
|
325
|
-
MonitoringApplicationController.model_endpoint_process,
|
|
326
|
-
project=self.project,
|
|
327
|
-
endpoint=endpoint,
|
|
328
|
-
applications_names=applications_names,
|
|
329
|
-
window_length=self._window_length,
|
|
330
|
-
model_monitoring_access_key=self.model_monitoring_access_key,
|
|
331
|
-
storage_options=self.storage_options,
|
|
332
|
-
)
|
|
333
|
-
else:
|
|
334
|
-
logger.debug(
|
|
335
|
-
"Skipping endpoint, not ready or not suitable for monitoring",
|
|
336
|
-
endpoint_id=endpoint.metadata.uid,
|
|
337
|
-
endpoint_name=endpoint.metadata.name,
|
|
338
|
-
)
|
|
339
|
-
logger.info("Finished running monitoring controller")
|
|
289
|
+
# Run single endpoint process
|
|
290
|
+
self.model_endpoint_process(event=body)
|
|
340
291
|
|
|
341
|
-
@classmethod
|
|
342
292
|
def model_endpoint_process(
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
endpoint: mlrun.common.schemas.ModelEndpoint,
|
|
346
|
-
applications_names: list[str],
|
|
347
|
-
window_length: int,
|
|
348
|
-
model_monitoring_access_key: str,
|
|
349
|
-
storage_options: Optional[dict] = None,
|
|
293
|
+
self,
|
|
294
|
+
event: Optional[dict] = None,
|
|
350
295
|
) -> None:
|
|
351
296
|
"""
|
|
352
297
|
Process a model endpoint and trigger the monitoring applications. This function running on different process
|
|
353
|
-
for each endpoint.
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
:param endpoint: (dict) Model endpoint record.
|
|
357
|
-
:param applications_names: (list[str]) List of application names to push results to.
|
|
358
|
-
:param batch_window_generator: (_BatchWindowGenerator) An object that generates _BatchWindow objects.
|
|
359
|
-
:param project: (str) Project name.
|
|
360
|
-
:param model_monitoring_access_key: (str) Access key to apply the model monitoring process.
|
|
361
|
-
:param storage_options: (dict) Storage options for reading the infer parquet files.
|
|
298
|
+
for each endpoint.
|
|
299
|
+
|
|
300
|
+
:param event: (dict) Event that triggered the monitoring process.
|
|
362
301
|
"""
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
endpoint.metadata.endpoint_type == EndpointType.BATCH_EP
|
|
366
|
-
)
|
|
367
|
-
m_fs = fstore.get_feature_set(endpoint.spec.monitoring_feature_set_uri)
|
|
302
|
+
logger.info("Model endpoint process started", event=event)
|
|
303
|
+
|
|
368
304
|
try:
|
|
305
|
+
project_name = event[ControllerEvent.PROJECT]
|
|
306
|
+
endpoint_id = event[ControllerEvent.ENDPOINT_ID]
|
|
307
|
+
endpoint_name = event[ControllerEvent.ENDPOINT_NAME]
|
|
308
|
+
applications_names = event[ControllerEvent.ENDPOINT_POLICY][
|
|
309
|
+
"monitoring_applications"
|
|
310
|
+
]
|
|
311
|
+
|
|
312
|
+
not_batch_endpoint = (
|
|
313
|
+
event[ControllerEvent.ENDPOINT_POLICY] != EndpointType.BATCH_EP
|
|
314
|
+
)
|
|
315
|
+
m_fs = fstore.get_feature_set(event[ControllerEvent.FEATURE_SET_URI])
|
|
316
|
+
logger.info(
|
|
317
|
+
"Starting analyzing for:", timestamp=event[ControllerEvent.TIMESTAMP]
|
|
318
|
+
)
|
|
319
|
+
last_stream_timestamp = datetime.datetime.fromisoformat(
|
|
320
|
+
event[ControllerEvent.TIMESTAMP]
|
|
321
|
+
)
|
|
322
|
+
first_request = datetime.datetime.fromisoformat(
|
|
323
|
+
event[ControllerEvent.FIRST_REQUEST]
|
|
324
|
+
)
|
|
369
325
|
with _BatchWindowGenerator(
|
|
370
|
-
project=
|
|
326
|
+
project=project_name,
|
|
327
|
+
endpoint_id=endpoint_id,
|
|
328
|
+
window_length=self._window_length,
|
|
371
329
|
) as batch_window_generator:
|
|
372
330
|
for application in applications_names:
|
|
373
331
|
for (
|
|
@@ -375,15 +333,15 @@ class MonitoringApplicationController:
|
|
|
375
333
|
end_infer_time,
|
|
376
334
|
) in batch_window_generator.get_intervals(
|
|
377
335
|
application=application,
|
|
378
|
-
first_request=endpoint.status.first_request,
|
|
379
|
-
last_request=endpoint.status.last_request,
|
|
380
336
|
not_batch_endpoint=not_batch_endpoint,
|
|
337
|
+
first_request=first_request,
|
|
338
|
+
last_request=last_stream_timestamp,
|
|
381
339
|
):
|
|
382
340
|
df = m_fs.to_dataframe(
|
|
383
341
|
start_time=start_infer_time,
|
|
384
342
|
end_time=end_infer_time,
|
|
385
343
|
time_column=mm_constants.EventFieldType.TIMESTAMP,
|
|
386
|
-
storage_options=storage_options,
|
|
344
|
+
storage_options=self.storage_options,
|
|
387
345
|
)
|
|
388
346
|
if len(df) == 0:
|
|
389
347
|
logger.info(
|
|
@@ -399,21 +357,53 @@ class MonitoringApplicationController:
|
|
|
399
357
|
end=end_infer_time,
|
|
400
358
|
endpoint_id=endpoint_id,
|
|
401
359
|
)
|
|
402
|
-
|
|
360
|
+
self._push_to_applications(
|
|
403
361
|
start_infer_time=start_infer_time,
|
|
404
362
|
end_infer_time=end_infer_time,
|
|
405
363
|
endpoint_id=endpoint_id,
|
|
406
|
-
endpoint_name=
|
|
407
|
-
project=
|
|
364
|
+
endpoint_name=endpoint_name,
|
|
365
|
+
project=project_name,
|
|
408
366
|
applications_names=[application],
|
|
409
|
-
model_monitoring_access_key=model_monitoring_access_key,
|
|
367
|
+
model_monitoring_access_key=self.model_monitoring_access_key,
|
|
410
368
|
)
|
|
411
|
-
|
|
369
|
+
base_period = event[ControllerEvent.ENDPOINT_POLICY]["base_period"]
|
|
370
|
+
current_time = mlrun.utils.datetime_now()
|
|
371
|
+
if (
|
|
372
|
+
current_time.timestamp()
|
|
373
|
+
- batch_window_generator.batch_window._get_last_analyzed()
|
|
374
|
+
>= datetime.timedelta(minutes=base_period).total_seconds()
|
|
375
|
+
and event[ControllerEvent.KIND] != ControllerEventKind.NOP_EVENT
|
|
376
|
+
):
|
|
377
|
+
event = {
|
|
378
|
+
ControllerEvent.KIND: mm_constants.ControllerEventKind.NOP_EVENT,
|
|
379
|
+
ControllerEvent.PROJECT: project_name,
|
|
380
|
+
ControllerEvent.ENDPOINT_ID: endpoint_id,
|
|
381
|
+
ControllerEvent.ENDPOINT_NAME: endpoint_name,
|
|
382
|
+
ControllerEvent.TIMESTAMP: current_time.isoformat(
|
|
383
|
+
timespec="microseconds"
|
|
384
|
+
),
|
|
385
|
+
ControllerEvent.ENDPOINT_POLICY: event[
|
|
386
|
+
ControllerEvent.ENDPOINT_POLICY
|
|
387
|
+
],
|
|
388
|
+
ControllerEvent.ENDPOINT_TYPE: event[
|
|
389
|
+
ControllerEvent.ENDPOINT_TYPE
|
|
390
|
+
],
|
|
391
|
+
ControllerEvent.FEATURE_SET_URI: event[
|
|
392
|
+
ControllerEvent.FEATURE_SET_URI
|
|
393
|
+
],
|
|
394
|
+
ControllerEvent.FIRST_REQUEST: event[
|
|
395
|
+
ControllerEvent.FIRST_REQUEST
|
|
396
|
+
],
|
|
397
|
+
}
|
|
398
|
+
self._push_to_main_stream(
|
|
399
|
+
event=event,
|
|
400
|
+
endpoint_id=endpoint_id,
|
|
401
|
+
)
|
|
412
402
|
|
|
413
403
|
except Exception:
|
|
414
404
|
logger.exception(
|
|
415
405
|
"Encountered an exception",
|
|
416
|
-
endpoint_id=
|
|
406
|
+
endpoint_id=event[ControllerEvent.ENDPOINT_ID],
|
|
417
407
|
)
|
|
418
408
|
|
|
419
409
|
@staticmethod
|
|
@@ -465,6 +455,168 @@ class MonitoringApplicationController:
|
|
|
465
455
|
[data]
|
|
466
456
|
)
|
|
467
457
|
|
|
458
|
+
def push_regular_event_to_controller_stream(self, event: nuclio_sdk.Event) -> None:
|
|
459
|
+
"""
|
|
460
|
+
pushes a regular event to the controller stream.
|
|
461
|
+
:param event: the nuclio trigger event
|
|
462
|
+
"""
|
|
463
|
+
logger.info("Starting monitoring controller chief")
|
|
464
|
+
applications_names = []
|
|
465
|
+
db = mlrun.get_run_db()
|
|
466
|
+
endpoints = db.list_model_endpoints(
|
|
467
|
+
project=self.project, tsdb_metrics=True
|
|
468
|
+
).endpoints
|
|
469
|
+
if not endpoints:
|
|
470
|
+
logger.info("No model endpoints found", project=self.project)
|
|
471
|
+
return
|
|
472
|
+
monitoring_functions = self.project_obj.list_model_monitoring_functions()
|
|
473
|
+
if monitoring_functions:
|
|
474
|
+
# if monitoring_functions: - TODO : ML-7700
|
|
475
|
+
# Gets only application in ready state
|
|
476
|
+
# applications_names = list(
|
|
477
|
+
# {
|
|
478
|
+
# app.metadata.name
|
|
479
|
+
# for app in monitoring_functions
|
|
480
|
+
# if (
|
|
481
|
+
# app.status.state == "ready"
|
|
482
|
+
# # workaround for the default app, as its `status.state` is `None`
|
|
483
|
+
# or app.metadata.name
|
|
484
|
+
# == mm_constants.HistogramDataDriftApplicationConstants.NAME
|
|
485
|
+
# )
|
|
486
|
+
# }
|
|
487
|
+
# )
|
|
488
|
+
applications_names = list(
|
|
489
|
+
{app.metadata.name for app in monitoring_functions}
|
|
490
|
+
)
|
|
491
|
+
if not applications_names:
|
|
492
|
+
logger.info("No monitoring functions found", project=self.project)
|
|
493
|
+
return
|
|
494
|
+
policy = {
|
|
495
|
+
"monitoring_applications": applications_names,
|
|
496
|
+
"base_period": int(
|
|
497
|
+
batch_dict2timedelta(
|
|
498
|
+
json.loads(
|
|
499
|
+
cast(
|
|
500
|
+
str,
|
|
501
|
+
os.getenv(mm_constants.EventFieldType.BATCH_INTERVALS_DICT),
|
|
502
|
+
)
|
|
503
|
+
)
|
|
504
|
+
).total_seconds()
|
|
505
|
+
// 60
|
|
506
|
+
),
|
|
507
|
+
}
|
|
508
|
+
for endpoint in endpoints:
|
|
509
|
+
if self._should_monitor_endpoint(endpoint):
|
|
510
|
+
logger.info(
|
|
511
|
+
"Regular event is being pushed to controller stream for model endpoint",
|
|
512
|
+
endpoint_id=endpoint.metadata.uid,
|
|
513
|
+
endpoint_name=endpoint.metadata.name,
|
|
514
|
+
timestamp=endpoint.status.last_request.isoformat(
|
|
515
|
+
sep=" ", timespec="microseconds"
|
|
516
|
+
),
|
|
517
|
+
first_request=endpoint.status.first_request.isoformat(
|
|
518
|
+
sep=" ", timespec="microseconds"
|
|
519
|
+
),
|
|
520
|
+
endpoint_type=endpoint.metadata.endpoint_type,
|
|
521
|
+
feature_set_uri=endpoint.spec.monitoring_feature_set_uri,
|
|
522
|
+
endpoint_policy=json.dumps(policy),
|
|
523
|
+
)
|
|
524
|
+
self.push_to_controller_stream(
|
|
525
|
+
kind=mm_constants.ControllerEventKind.REGULAR_EVENT,
|
|
526
|
+
project=self.project,
|
|
527
|
+
endpoint_id=endpoint.metadata.uid,
|
|
528
|
+
endpoint_name=endpoint.metadata.name,
|
|
529
|
+
stream_access_key=self.v3io_access_key,
|
|
530
|
+
timestamp=endpoint.status.last_request.isoformat(
|
|
531
|
+
sep=" ", timespec="microseconds"
|
|
532
|
+
),
|
|
533
|
+
first_request=endpoint.status.first_request.isoformat(
|
|
534
|
+
sep=" ", timespec="microseconds"
|
|
535
|
+
),
|
|
536
|
+
endpoint_type=endpoint.metadata.endpoint_type,
|
|
537
|
+
feature_set_uri=endpoint.spec.monitoring_feature_set_uri,
|
|
538
|
+
endpoint_policy=policy,
|
|
539
|
+
)
|
|
540
|
+
else:
|
|
541
|
+
logger.info(
|
|
542
|
+
"Should not monitor model endpoint, didn't push regular event",
|
|
543
|
+
endpoint_id=endpoint.metadata.uid,
|
|
544
|
+
endpoint_name=endpoint.metadata.name,
|
|
545
|
+
timestamp=endpoint.status.last_request,
|
|
546
|
+
first_request=endpoint.status.first_request,
|
|
547
|
+
endpoint_type=endpoint.metadata.endpoint_type,
|
|
548
|
+
feature_set_uri=endpoint.spec.monitoring_feature_set_uri,
|
|
549
|
+
)
|
|
550
|
+
|
|
551
|
+
@staticmethod
|
|
552
|
+
def push_to_controller_stream(
|
|
553
|
+
kind: str,
|
|
554
|
+
project: str,
|
|
555
|
+
endpoint_id: str,
|
|
556
|
+
endpoint_name: str,
|
|
557
|
+
stream_access_key: str,
|
|
558
|
+
timestamp: str,
|
|
559
|
+
first_request: str,
|
|
560
|
+
endpoint_type: str,
|
|
561
|
+
feature_set_uri: str,
|
|
562
|
+
endpoint_policy: dict[str, Any],
|
|
563
|
+
) -> None:
|
|
564
|
+
"""
|
|
565
|
+
Pushes event data to controller stream.
|
|
566
|
+
:param timestamp: the event timestamp str isoformat utc timezone
|
|
567
|
+
:param first_request: the first request str isoformat utc timezone
|
|
568
|
+
:param endpoint_policy: dictionary hold the monitoring policy
|
|
569
|
+
:param kind: str event kind
|
|
570
|
+
:param project: project name
|
|
571
|
+
:param endpoint_id: endpoint id string
|
|
572
|
+
:param endpoint_name: the endpoint name string
|
|
573
|
+
:param endpoint_type: Enum of the endpoint type
|
|
574
|
+
:param feature_set_uri: the feature set uri string
|
|
575
|
+
:param stream_access_key: access key to apply the model monitoring process.
|
|
576
|
+
"""
|
|
577
|
+
stream_uri = get_stream_path(
|
|
578
|
+
project=project,
|
|
579
|
+
function_name=mm_constants.MonitoringFunctionNames.APPLICATION_CONTROLLER,
|
|
580
|
+
)
|
|
581
|
+
event = {
|
|
582
|
+
ControllerEvent.KIND.value: kind,
|
|
583
|
+
ControllerEvent.PROJECT.value: project,
|
|
584
|
+
ControllerEvent.ENDPOINT_ID.value: endpoint_id,
|
|
585
|
+
ControllerEvent.ENDPOINT_NAME.value: endpoint_name,
|
|
586
|
+
ControllerEvent.TIMESTAMP.value: timestamp,
|
|
587
|
+
ControllerEvent.FIRST_REQUEST.value: first_request,
|
|
588
|
+
ControllerEvent.ENDPOINT_TYPE.value: endpoint_type,
|
|
589
|
+
ControllerEvent.FEATURE_SET_URI.value: feature_set_uri,
|
|
590
|
+
ControllerEvent.ENDPOINT_POLICY.value: endpoint_policy,
|
|
591
|
+
}
|
|
592
|
+
logger.info(
|
|
593
|
+
"Pushing data to controller stream",
|
|
594
|
+
event=event,
|
|
595
|
+
endpoint_id=endpoint_id,
|
|
596
|
+
stream_uri=stream_uri,
|
|
597
|
+
)
|
|
598
|
+
get_stream_pusher(stream_uri, access_key=stream_access_key).push(
|
|
599
|
+
[event], partition_key=endpoint_id
|
|
600
|
+
)
|
|
601
|
+
|
|
602
|
+
def _push_to_main_stream(self, event: dict, endpoint_id: str) -> None:
|
|
603
|
+
"""
|
|
604
|
+
Pushes the given event to model monitoring stream
|
|
605
|
+
:param event: event dictionary to push to stream
|
|
606
|
+
:param endpoint_id: endpoint id string
|
|
607
|
+
"""
|
|
608
|
+
stream_uri = get_stream_path(project=event.get(ControllerEvent.PROJECT))
|
|
609
|
+
|
|
610
|
+
logger.info(
|
|
611
|
+
"Pushing data to main stream, NOP event is been generated",
|
|
612
|
+
event=json.dumps(event),
|
|
613
|
+
endpoint_id=endpoint_id,
|
|
614
|
+
stream_uri=stream_uri,
|
|
615
|
+
)
|
|
616
|
+
get_stream_pusher(stream_uri, access_key=self.model_monitoring_access_key).push(
|
|
617
|
+
[event], partition_key=endpoint_id
|
|
618
|
+
)
|
|
619
|
+
|
|
468
620
|
|
|
469
621
|
def handler(context: nuclio_sdk.Context, event: nuclio_sdk.Event) -> None:
|
|
470
622
|
"""
|
|
@@ -473,4 +625,15 @@ def handler(context: nuclio_sdk.Context, event: nuclio_sdk.Event) -> None:
|
|
|
473
625
|
:param context: the Nuclio context
|
|
474
626
|
:param event: trigger event
|
|
475
627
|
"""
|
|
476
|
-
|
|
628
|
+
logger.info(
|
|
629
|
+
"Controller got event",
|
|
630
|
+
trigger=event.trigger,
|
|
631
|
+
trigger_kind=event.trigger.kind,
|
|
632
|
+
)
|
|
633
|
+
|
|
634
|
+
if event.trigger.kind == "http":
|
|
635
|
+
# Runs controller chief:
|
|
636
|
+
MonitoringApplicationController().push_regular_event_to_controller_stream(event)
|
|
637
|
+
else:
|
|
638
|
+
# Runs controller worker:
|
|
639
|
+
MonitoringApplicationController().run(event=event)
|
|
@@ -67,43 +67,31 @@ class ObjectTSDBFactory(enum.Enum):
|
|
|
67
67
|
def get_tsdb_connector(
|
|
68
68
|
project: str,
|
|
69
69
|
secret_provider: typing.Optional[typing.Callable[[str], str]] = None,
|
|
70
|
-
|
|
71
|
-
**kwargs,
|
|
70
|
+
profile: typing.Optional[mlrun.datastore.datastore_profile.DatastoreProfile] = None,
|
|
72
71
|
) -> TSDBConnector:
|
|
73
72
|
"""
|
|
74
73
|
Get TSDB connector object.
|
|
75
74
|
:param project: The name of the project.
|
|
76
75
|
:param secret_provider: An optional secret provider to get the connection string secret.
|
|
77
|
-
:param
|
|
76
|
+
:param profile: An optional profile to initialize the TSDB connector from.
|
|
78
77
|
|
|
79
78
|
:return: `TSDBConnector` object. The main goal of this object is to handle different operations on the
|
|
80
79
|
TSDB connector such as updating drift metrics or write application record result.
|
|
81
80
|
:raise: `MLRunInvalidMMStoreTypeError` if the user didn't provide TSDB connection
|
|
82
81
|
or the provided TSDB connection is invalid.
|
|
83
82
|
"""
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
profile = mlrun.model_monitoring.helpers._get_tsdb_profile(
|
|
87
|
-
project=project, secret_provider=secret_provider
|
|
88
|
-
)
|
|
89
|
-
except mlrun.errors.MLRunNotFoundError:
|
|
90
|
-
profile = None
|
|
91
|
-
|
|
92
|
-
tsdb_connection_string = (
|
|
93
|
-
tsdb_connection_string
|
|
94
|
-
or mlrun.model_monitoring.helpers.get_tsdb_connection_string(
|
|
95
|
-
secret_provider=secret_provider
|
|
96
|
-
)
|
|
83
|
+
profile = profile or mlrun.model_monitoring.helpers._get_tsdb_profile(
|
|
84
|
+
project=project, secret_provider=secret_provider
|
|
97
85
|
)
|
|
98
|
-
|
|
99
|
-
if
|
|
100
|
-
tsdb_connector_type = mlrun.common.schemas.model_monitoring.TSDBTarget.TDEngine
|
|
101
|
-
kwargs["connection_string"] = tsdb_connection_string
|
|
102
|
-
elif tsdb_connection_string and tsdb_connection_string == "v3io":
|
|
103
|
-
tsdb_connector_type = mlrun.common.schemas.model_monitoring.TSDBTarget.V3IO_TSDB
|
|
104
|
-
elif isinstance(profile, mlrun.datastore.datastore_profile.DatastoreProfileV3io):
|
|
86
|
+
kwargs = {}
|
|
87
|
+
if isinstance(profile, mlrun.datastore.datastore_profile.DatastoreProfileV3io):
|
|
105
88
|
tsdb_connector_type = mlrun.common.schemas.model_monitoring.TSDBTarget.V3IO_TSDB
|
|
106
89
|
kwargs["v3io_access_key"] = profile.v3io_access_key
|
|
90
|
+
elif isinstance(
|
|
91
|
+
profile, mlrun.datastore.datastore_profile.TDEngineDatastoreProfile
|
|
92
|
+
):
|
|
93
|
+
tsdb_connector_type = mlrun.common.schemas.model_monitoring.TSDBTarget.TDEngine
|
|
94
|
+
kwargs["connection_string"] = profile.dsn()
|
|
107
95
|
else:
|
|
108
96
|
raise mlrun.errors.MLRunInvalidMMStoreTypeError(
|
|
109
97
|
"You must provide a valid tsdb store connection by using "
|
|
@@ -145,8 +145,11 @@ class TDEngineConnector(TSDBConnector):
|
|
|
145
145
|
|
|
146
146
|
create_table_sql = table._create_subtable_sql(subtable=table_name, values=event)
|
|
147
147
|
|
|
148
|
+
# we need the string values to be sent to the connection, not the enum
|
|
149
|
+
columns = {str(key): str(val) for key, val in table.columns.items()}
|
|
150
|
+
|
|
148
151
|
insert_statement = Statement(
|
|
149
|
-
columns=
|
|
152
|
+
columns=columns,
|
|
150
153
|
subtable=table_name,
|
|
151
154
|
values=event,
|
|
152
155
|
)
|
|
@@ -188,7 +191,7 @@ class TDEngineConnector(TSDBConnector):
|
|
|
188
191
|
graph.add_step(
|
|
189
192
|
"mlrun.model_monitoring.db.tsdb.tdengine.stream_graph_steps.ProcessBeforeTDEngine",
|
|
190
193
|
name="ProcessBeforeTDEngine",
|
|
191
|
-
after="
|
|
194
|
+
after="FilterNOP",
|
|
192
195
|
)
|
|
193
196
|
|
|
194
197
|
def apply_tdengine_target(name, after):
|
|
@@ -204,7 +204,7 @@ class V3IOTSDBConnector(TSDBConnector):
|
|
|
204
204
|
}
|
|
205
205
|
],
|
|
206
206
|
name=EventFieldType.LATENCY,
|
|
207
|
-
after="
|
|
207
|
+
after="FilterNOP",
|
|
208
208
|
step_name="Aggregates",
|
|
209
209
|
table=".",
|
|
210
210
|
key_field=EventFieldType.ENDPOINT_ID,
|
|
@@ -225,7 +225,7 @@ class V3IOTSDBConnector(TSDBConnector):
|
|
|
225
225
|
graph.add_step(
|
|
226
226
|
"storey.TSDBTarget",
|
|
227
227
|
name="tsdb_predictions",
|
|
228
|
-
after="
|
|
228
|
+
after="FilterNOP",
|
|
229
229
|
path=f"{self.container}/{self.tables[mm_schemas.FileTargetKind.PREDICTIONS]}",
|
|
230
230
|
rate="1/s",
|
|
231
231
|
time_col=mm_schemas.EventFieldType.TIMESTAMP,
|
|
@@ -117,6 +117,7 @@ def get_stream_path(
|
|
|
117
117
|
function_name: str = mm_constants.MonitoringFunctionNames.STREAM,
|
|
118
118
|
stream_uri: Optional[str] = None,
|
|
119
119
|
secret_provider: Optional[Callable[[str], str]] = None,
|
|
120
|
+
profile: Optional[mlrun.datastore.datastore_profile.DatastoreProfile] = None,
|
|
120
121
|
) -> str:
|
|
121
122
|
"""
|
|
122
123
|
Get stream path from the project secret. If wasn't set, take it from the system configurations
|
|
@@ -126,20 +127,25 @@ def get_stream_path(
|
|
|
126
127
|
:param stream_uri: Stream URI. If provided, it will be used instead of the one from the project's secret.
|
|
127
128
|
:param secret_provider: Optional secret provider to get the connection string secret.
|
|
128
129
|
If not set, the env vars are used.
|
|
130
|
+
:param profile: Optional datastore profile of the stream (V3IO/KafkaSource profile).
|
|
129
131
|
:return: Monitoring stream path to the relevant application.
|
|
130
132
|
"""
|
|
131
133
|
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
profile = None
|
|
134
|
+
profile = profile or _get_stream_profile(
|
|
135
|
+
project=project, secret_provider=secret_provider
|
|
136
|
+
)
|
|
136
137
|
|
|
137
138
|
if isinstance(profile, mlrun.datastore.datastore_profile.DatastoreProfileV3io):
|
|
138
139
|
stream_uri = "v3io"
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
140
|
+
elif isinstance(
|
|
141
|
+
profile, mlrun.datastore.datastore_profile.DatastoreProfileKafkaSource
|
|
142
|
+
):
|
|
143
|
+
stream_uri = f"kafka://{profile.brokers[0]}"
|
|
144
|
+
else:
|
|
145
|
+
raise mlrun.errors.MLRunValueError(
|
|
146
|
+
f"Received an unexpected stream profile type: {type(profile)}\n"
|
|
147
|
+
"Expects `DatastoreProfileV3io` or `DatastoreProfileKafkaSource`."
|
|
148
|
+
)
|
|
143
149
|
|
|
144
150
|
if not stream_uri or stream_uri == "v3io":
|
|
145
151
|
stream_uri = mlrun.mlconf.get_model_monitoring_file_target_path(
|
|
@@ -273,7 +279,7 @@ def _get_profile(
|
|
|
273
279
|
)
|
|
274
280
|
if not profile_name:
|
|
275
281
|
raise mlrun.errors.MLRunNotFoundError(
|
|
276
|
-
f"Not found `{profile_name_key}` profile name"
|
|
282
|
+
f"Not found `{profile_name_key}` profile name for project '{project}'"
|
|
277
283
|
)
|
|
278
284
|
return mlrun.datastore.datastore_profile.datastore_profile_read(
|
|
279
285
|
url=f"ds://{profile_name}", project_name=project, secrets=secret_provider
|