mlrun-1.8.0rc19-py3-none-any.whl → mlrun-1.8.0rc26-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of mlrun might be problematic. (The original page links to more details here; the link target was not preserved in this extract.)

Files changed (52):
  1. mlrun/__init__.py +37 -3
  2. mlrun/__main__.py +5 -0
  3. mlrun/alerts/alert.py +1 -0
  4. mlrun/artifacts/document.py +78 -36
  5. mlrun/common/formatters/feature_set.py +1 -0
  6. mlrun/common/runtimes/constants.py +17 -0
  7. mlrun/common/schemas/alert.py +3 -0
  8. mlrun/common/schemas/client_spec.py +0 -1
  9. mlrun/common/schemas/model_monitoring/constants.py +32 -9
  10. mlrun/common/schemas/model_monitoring/model_endpoints.py +2 -0
  11. mlrun/common/schemas/workflow.py +1 -0
  12. mlrun/config.py +39 -6
  13. mlrun/datastore/datastore_profile.py +58 -16
  14. mlrun/datastore/sources.py +7 -1
  15. mlrun/datastore/vectorstore.py +20 -1
  16. mlrun/db/base.py +20 -0
  17. mlrun/db/httpdb.py +97 -10
  18. mlrun/db/nopdb.py +19 -0
  19. mlrun/errors.py +4 -0
  20. mlrun/execution.py +15 -6
  21. mlrun/frameworks/_common/model_handler.py +0 -2
  22. mlrun/launcher/client.py +2 -2
  23. mlrun/launcher/local.py +5 -1
  24. mlrun/model_monitoring/applications/_application_steps.py +3 -1
  25. mlrun/model_monitoring/controller.py +266 -103
  26. mlrun/model_monitoring/db/tsdb/__init__.py +11 -23
  27. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +2 -0
  28. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +20 -21
  29. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +35 -34
  30. mlrun/model_monitoring/helpers.py +16 -10
  31. mlrun/model_monitoring/stream_processing.py +106 -35
  32. mlrun/package/context_handler.py +1 -1
  33. mlrun/package/packagers_manager.py +4 -18
  34. mlrun/projects/pipelines.py +18 -5
  35. mlrun/projects/project.py +156 -39
  36. mlrun/runtimes/nuclio/serving.py +22 -13
  37. mlrun/runtimes/sparkjob/spark3job.py +1 -1
  38. mlrun/secrets.py +1 -1
  39. mlrun/serving/server.py +11 -3
  40. mlrun/serving/states.py +65 -8
  41. mlrun/serving/v2_serving.py +67 -44
  42. mlrun/utils/helpers.py +111 -23
  43. mlrun/utils/notifications/notification/base.py +6 -1
  44. mlrun/utils/notifications/notification/slack.py +5 -1
  45. mlrun/utils/notifications/notification_pusher.py +67 -36
  46. mlrun/utils/version/version.json +2 -2
  47. {mlrun-1.8.0rc19.dist-info → mlrun-1.8.0rc26.dist-info}/METADATA +33 -16
  48. {mlrun-1.8.0rc19.dist-info → mlrun-1.8.0rc26.dist-info}/RECORD +52 -52
  49. {mlrun-1.8.0rc19.dist-info → mlrun-1.8.0rc26.dist-info}/WHEEL +1 -1
  50. {mlrun-1.8.0rc19.dist-info → mlrun-1.8.0rc26.dist-info}/LICENSE +0 -0
  51. {mlrun-1.8.0rc19.dist-info → mlrun-1.8.0rc26.dist-info}/entry_points.txt +0 -0
  52. {mlrun-1.8.0rc19.dist-info → mlrun-1.8.0rc26.dist-info}/top_level.txt +0 -0
@@ -12,14 +12,13 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- import concurrent.futures
16
15
  import datetime
17
16
  import json
18
17
  import os
19
18
  from collections.abc import Iterator
20
19
  from contextlib import AbstractContextManager
21
20
  from types import TracebackType
22
- from typing import NamedTuple, Optional, cast
21
+ from typing import Any, NamedTuple, Optional, cast
23
22
 
24
23
  import nuclio_sdk
25
24
 
@@ -28,6 +27,10 @@ import mlrun.common.schemas.model_monitoring.constants as mm_constants
28
27
  import mlrun.feature_store as fstore
29
28
  import mlrun.model_monitoring
30
29
  from mlrun.common.schemas import EndpointType
30
+ from mlrun.common.schemas.model_monitoring.constants import (
31
+ ControllerEvent,
32
+ ControllerEventKind,
33
+ )
31
34
  from mlrun.datastore import get_stream_pusher
32
35
  from mlrun.errors import err_to_str
33
36
  from mlrun.model_monitoring.db._schedules import ModelMonitoringSchedulesFile
@@ -140,6 +143,7 @@ class _BatchWindowGenerator(AbstractContextManager):
140
143
  Initialize a batch window generator object that generates batch window objects
141
144
  for the monitoring functions.
142
145
  """
146
+ self.batch_window: _BatchWindow = None
143
147
  self._project = project
144
148
  self._endpoint_id = endpoint_id
145
149
  self._timedelta = window_length
@@ -199,14 +203,14 @@ class _BatchWindowGenerator(AbstractContextManager):
199
203
  `first_request` and `last_request` are the timestamps of the first request and last
200
204
  request to the endpoint, respectively. They are guaranteed to be nonempty at this point.
201
205
  """
202
- batch_window = _BatchWindow(
206
+ self.batch_window = _BatchWindow(
203
207
  schedules_file=self._schedules_file,
204
208
  application=application,
205
209
  timedelta_seconds=self._timedelta,
206
210
  last_updated=self._get_last_updated_time(last_request, not_batch_endpoint),
207
211
  first_request=int(first_request.timestamp()),
208
212
  )
209
- yield from batch_window.get_intervals()
213
+ yield from self.batch_window.get_intervals()
210
214
 
211
215
 
212
216
  def _get_window_length() -> int:
@@ -237,6 +241,7 @@ class MonitoringApplicationController:
237
241
  self._window_length = _get_window_length()
238
242
 
239
243
  self.model_monitoring_access_key = self._get_model_monitoring_access_key()
244
+ self.v3io_access_key = mlrun.get_secret_or_env("V3IO_ACCESS_KEY")
240
245
  self.storage_options = None
241
246
  if mlrun.mlconf.artifact_path.startswith("s3://"):
242
247
  self.storage_options = mlrun.mlconf.get_s3_storage_options()
@@ -262,112 +267,65 @@ class MonitoringApplicationController:
262
267
  != mm_constants.EndpointType.ROUTER.value
263
268
  )
264
269
 
265
- def run(self) -> None:
270
+ def run(self, event: nuclio_sdk.Event) -> None:
266
271
  """
267
- Main method for run all the relevant monitoring applications on each endpoint.
272
+ Main method for controller chief, runs all the relevant monitoring applications for a single endpoint.
273
+ Handles nop events logic.
268
274
  This method handles the following:
269
- 1. List model endpoints
270
- 2. List applications
271
- 3. Check model monitoring windows
272
- 4. Send data to applications
273
- 5. Delete old parquets
275
+ 1. Read applications from the event (endpoint_policy)
276
+ 2. Check model monitoring windows
277
+ 3. Send data to applications
278
+ 4. Pushes nop event to main stream if needed
274
279
  """
275
- logger.info("Start running monitoring controller")
280
+ logger.info("Start running monitoring controller worker")
276
281
  try:
277
- applications_names = []
278
- endpoints_list = mlrun.db.get_run_db().list_model_endpoints(
279
- project=self.project, tsdb_metrics=True
280
- )
281
- endpoints = endpoints_list.endpoints
282
- if not endpoints:
283
- logger.info("No model endpoints found", project=self.project)
284
- return
285
- monitoring_functions = self.project_obj.list_model_monitoring_functions()
286
- if monitoring_functions:
287
- applications_names = list(
288
- {app.metadata.name for app in monitoring_functions}
289
- )
290
- # if monitoring_functions: - TODO : ML-7700
291
- # Gets only application in ready state
292
- # applications_names = list(
293
- # {
294
- # app.metadata.name
295
- # for app in monitoring_functions
296
- # if (
297
- # app.status.state == "ready"
298
- # # workaround for the default app, as its `status.state` is `None`
299
- # or app.metadata.name
300
- # == mm_constants.HistogramDataDriftApplicationConstants.NAME
301
- # )
302
- # }
303
- # )
304
- if not applications_names:
305
- logger.info("No monitoring functions found", project=self.project)
306
- return
307
- logger.info(
308
- "Starting to iterate over the applications",
309
- applications=applications_names,
310
- )
311
-
282
+ body = json.loads(event.body.decode("utf-8"))
312
283
  except Exception as e:
313
284
  logger.error(
314
- "Failed to list endpoints and monitoring applications",
285
+ "Failed to decode event",
315
286
  exc=err_to_str(e),
316
287
  )
317
288
  return
318
- # Initialize a thread pool that will be used to monitor each endpoint on a dedicated thread
319
- with concurrent.futures.ThreadPoolExecutor(
320
- max_workers=min(len(endpoints), 10)
321
- ) as pool:
322
- for endpoint in endpoints:
323
- if self._should_monitor_endpoint(endpoint):
324
- pool.submit(
325
- MonitoringApplicationController.model_endpoint_process,
326
- project=self.project,
327
- endpoint=endpoint,
328
- applications_names=applications_names,
329
- window_length=self._window_length,
330
- model_monitoring_access_key=self.model_monitoring_access_key,
331
- storage_options=self.storage_options,
332
- )
333
- else:
334
- logger.debug(
335
- "Skipping endpoint, not ready or not suitable for monitoring",
336
- endpoint_id=endpoint.metadata.uid,
337
- endpoint_name=endpoint.metadata.name,
338
- )
339
- logger.info("Finished running monitoring controller")
289
+ # Run single endpoint process
290
+ self.model_endpoint_process(event=body)
340
291
 
341
- @classmethod
342
292
  def model_endpoint_process(
343
- cls,
344
- project: str,
345
- endpoint: mlrun.common.schemas.ModelEndpoint,
346
- applications_names: list[str],
347
- window_length: int,
348
- model_monitoring_access_key: str,
349
- storage_options: Optional[dict] = None,
293
+ self,
294
+ event: Optional[dict] = None,
350
295
  ) -> None:
351
296
  """
352
297
  Process a model endpoint and trigger the monitoring applications. This function running on different process
353
- for each endpoint. In addition, this function will generate a parquet file that includes the relevant data
354
- for a specific time range.
355
-
356
- :param endpoint: (dict) Model endpoint record.
357
- :param applications_names: (list[str]) List of application names to push results to.
358
- :param batch_window_generator: (_BatchWindowGenerator) An object that generates _BatchWindow objects.
359
- :param project: (str) Project name.
360
- :param model_monitoring_access_key: (str) Access key to apply the model monitoring process.
361
- :param storage_options: (dict) Storage options for reading the infer parquet files.
298
+ for each endpoint.
299
+
300
+ :param event: (dict) Event that triggered the monitoring process.
362
301
  """
363
- endpoint_id = endpoint.metadata.uid
364
- not_batch_endpoint = not (
365
- endpoint.metadata.endpoint_type == EndpointType.BATCH_EP
366
- )
367
- m_fs = fstore.get_feature_set(endpoint.spec.monitoring_feature_set_uri)
302
+ logger.info("Model endpoint process started", event=event)
303
+
368
304
  try:
305
+ project_name = event[ControllerEvent.PROJECT]
306
+ endpoint_id = event[ControllerEvent.ENDPOINT_ID]
307
+ endpoint_name = event[ControllerEvent.ENDPOINT_NAME]
308
+ applications_names = event[ControllerEvent.ENDPOINT_POLICY][
309
+ "monitoring_applications"
310
+ ]
311
+
312
+ not_batch_endpoint = (
313
+ event[ControllerEvent.ENDPOINT_POLICY] != EndpointType.BATCH_EP
314
+ )
315
+ m_fs = fstore.get_feature_set(event[ControllerEvent.FEATURE_SET_URI])
316
+ logger.info(
317
+ "Starting analyzing for:", timestamp=event[ControllerEvent.TIMESTAMP]
318
+ )
319
+ last_stream_timestamp = datetime.datetime.fromisoformat(
320
+ event[ControllerEvent.TIMESTAMP]
321
+ )
322
+ first_request = datetime.datetime.fromisoformat(
323
+ event[ControllerEvent.FIRST_REQUEST]
324
+ )
369
325
  with _BatchWindowGenerator(
370
- project=project, endpoint_id=endpoint_id, window_length=window_length
326
+ project=project_name,
327
+ endpoint_id=endpoint_id,
328
+ window_length=self._window_length,
371
329
  ) as batch_window_generator:
372
330
  for application in applications_names:
373
331
  for (
@@ -375,15 +333,15 @@ class MonitoringApplicationController:
375
333
  end_infer_time,
376
334
  ) in batch_window_generator.get_intervals(
377
335
  application=application,
378
- first_request=endpoint.status.first_request,
379
- last_request=endpoint.status.last_request,
380
336
  not_batch_endpoint=not_batch_endpoint,
337
+ first_request=first_request,
338
+ last_request=last_stream_timestamp,
381
339
  ):
382
340
  df = m_fs.to_dataframe(
383
341
  start_time=start_infer_time,
384
342
  end_time=end_infer_time,
385
343
  time_column=mm_constants.EventFieldType.TIMESTAMP,
386
- storage_options=storage_options,
344
+ storage_options=self.storage_options,
387
345
  )
388
346
  if len(df) == 0:
389
347
  logger.info(
@@ -399,21 +357,53 @@ class MonitoringApplicationController:
399
357
  end=end_infer_time,
400
358
  endpoint_id=endpoint_id,
401
359
  )
402
- cls._push_to_applications(
360
+ self._push_to_applications(
403
361
  start_infer_time=start_infer_time,
404
362
  end_infer_time=end_infer_time,
405
363
  endpoint_id=endpoint_id,
406
- endpoint_name=endpoint.metadata.name,
407
- project=project,
364
+ endpoint_name=endpoint_name,
365
+ project=project_name,
408
366
  applications_names=[application],
409
- model_monitoring_access_key=model_monitoring_access_key,
367
+ model_monitoring_access_key=self.model_monitoring_access_key,
410
368
  )
411
- logger.info("Finished processing endpoint", endpoint_id=endpoint_id)
369
+ base_period = event[ControllerEvent.ENDPOINT_POLICY]["base_period"]
370
+ current_time = mlrun.utils.datetime_now()
371
+ if (
372
+ current_time.timestamp()
373
+ - batch_window_generator.batch_window._get_last_analyzed()
374
+ >= datetime.timedelta(minutes=base_period).total_seconds()
375
+ and event[ControllerEvent.KIND] != ControllerEventKind.NOP_EVENT
376
+ ):
377
+ event = {
378
+ ControllerEvent.KIND: mm_constants.ControllerEventKind.NOP_EVENT,
379
+ ControllerEvent.PROJECT: project_name,
380
+ ControllerEvent.ENDPOINT_ID: endpoint_id,
381
+ ControllerEvent.ENDPOINT_NAME: endpoint_name,
382
+ ControllerEvent.TIMESTAMP: current_time.isoformat(
383
+ timespec="microseconds"
384
+ ),
385
+ ControllerEvent.ENDPOINT_POLICY: event[
386
+ ControllerEvent.ENDPOINT_POLICY
387
+ ],
388
+ ControllerEvent.ENDPOINT_TYPE: event[
389
+ ControllerEvent.ENDPOINT_TYPE
390
+ ],
391
+ ControllerEvent.FEATURE_SET_URI: event[
392
+ ControllerEvent.FEATURE_SET_URI
393
+ ],
394
+ ControllerEvent.FIRST_REQUEST: event[
395
+ ControllerEvent.FIRST_REQUEST
396
+ ],
397
+ }
398
+ self._push_to_main_stream(
399
+ event=event,
400
+ endpoint_id=endpoint_id,
401
+ )
412
402
 
413
403
  except Exception:
414
404
  logger.exception(
415
405
  "Encountered an exception",
416
- endpoint_id=endpoint.metadata.uid,
406
+ endpoint_id=event[ControllerEvent.ENDPOINT_ID],
417
407
  )
418
408
 
419
409
  @staticmethod
@@ -465,6 +455,168 @@ class MonitoringApplicationController:
465
455
  [data]
466
456
  )
467
457
 
458
+ def push_regular_event_to_controller_stream(self, event: nuclio_sdk.Event) -> None:
459
+ """
460
+ pushes a regular event to the controller stream.
461
+ :param event: the nuclio trigger event
462
+ """
463
+ logger.info("Starting monitoring controller chief")
464
+ applications_names = []
465
+ db = mlrun.get_run_db()
466
+ endpoints = db.list_model_endpoints(
467
+ project=self.project, tsdb_metrics=True
468
+ ).endpoints
469
+ if not endpoints:
470
+ logger.info("No model endpoints found", project=self.project)
471
+ return
472
+ monitoring_functions = self.project_obj.list_model_monitoring_functions()
473
+ if monitoring_functions:
474
+ # if monitoring_functions: - TODO : ML-7700
475
+ # Gets only application in ready state
476
+ # applications_names = list(
477
+ # {
478
+ # app.metadata.name
479
+ # for app in monitoring_functions
480
+ # if (
481
+ # app.status.state == "ready"
482
+ # # workaround for the default app, as its `status.state` is `None`
483
+ # or app.metadata.name
484
+ # == mm_constants.HistogramDataDriftApplicationConstants.NAME
485
+ # )
486
+ # }
487
+ # )
488
+ applications_names = list(
489
+ {app.metadata.name for app in monitoring_functions}
490
+ )
491
+ if not applications_names:
492
+ logger.info("No monitoring functions found", project=self.project)
493
+ return
494
+ policy = {
495
+ "monitoring_applications": applications_names,
496
+ "base_period": int(
497
+ batch_dict2timedelta(
498
+ json.loads(
499
+ cast(
500
+ str,
501
+ os.getenv(mm_constants.EventFieldType.BATCH_INTERVALS_DICT),
502
+ )
503
+ )
504
+ ).total_seconds()
505
+ // 60
506
+ ),
507
+ }
508
+ for endpoint in endpoints:
509
+ if self._should_monitor_endpoint(endpoint):
510
+ logger.info(
511
+ "Regular event is being pushed to controller stream for model endpoint",
512
+ endpoint_id=endpoint.metadata.uid,
513
+ endpoint_name=endpoint.metadata.name,
514
+ timestamp=endpoint.status.last_request.isoformat(
515
+ sep=" ", timespec="microseconds"
516
+ ),
517
+ first_request=endpoint.status.first_request.isoformat(
518
+ sep=" ", timespec="microseconds"
519
+ ),
520
+ endpoint_type=endpoint.metadata.endpoint_type,
521
+ feature_set_uri=endpoint.spec.monitoring_feature_set_uri,
522
+ endpoint_policy=json.dumps(policy),
523
+ )
524
+ self.push_to_controller_stream(
525
+ kind=mm_constants.ControllerEventKind.REGULAR_EVENT,
526
+ project=self.project,
527
+ endpoint_id=endpoint.metadata.uid,
528
+ endpoint_name=endpoint.metadata.name,
529
+ stream_access_key=self.v3io_access_key,
530
+ timestamp=endpoint.status.last_request.isoformat(
531
+ sep=" ", timespec="microseconds"
532
+ ),
533
+ first_request=endpoint.status.first_request.isoformat(
534
+ sep=" ", timespec="microseconds"
535
+ ),
536
+ endpoint_type=endpoint.metadata.endpoint_type,
537
+ feature_set_uri=endpoint.spec.monitoring_feature_set_uri,
538
+ endpoint_policy=policy,
539
+ )
540
+ else:
541
+ logger.info(
542
+ "Should not monitor model endpoint, didn't push regular event",
543
+ endpoint_id=endpoint.metadata.uid,
544
+ endpoint_name=endpoint.metadata.name,
545
+ timestamp=endpoint.status.last_request,
546
+ first_request=endpoint.status.first_request,
547
+ endpoint_type=endpoint.metadata.endpoint_type,
548
+ feature_set_uri=endpoint.spec.monitoring_feature_set_uri,
549
+ )
550
+
551
+ @staticmethod
552
+ def push_to_controller_stream(
553
+ kind: str,
554
+ project: str,
555
+ endpoint_id: str,
556
+ endpoint_name: str,
557
+ stream_access_key: str,
558
+ timestamp: str,
559
+ first_request: str,
560
+ endpoint_type: str,
561
+ feature_set_uri: str,
562
+ endpoint_policy: dict[str, Any],
563
+ ) -> None:
564
+ """
565
+ Pushes event data to controller stream.
566
+ :param timestamp: the event timestamp str isoformat utc timezone
567
+ :param first_request: the first request str isoformat utc timezone
568
+ :param endpoint_policy: dictionary hold the monitoring policy
569
+ :param kind: str event kind
570
+ :param project: project name
571
+ :param endpoint_id: endpoint id string
572
+ :param endpoint_name: the endpoint name string
573
+ :param endpoint_type: Enum of the endpoint type
574
+ :param feature_set_uri: the feature set uri string
575
+ :param stream_access_key: access key to apply the model monitoring process.
576
+ """
577
+ stream_uri = get_stream_path(
578
+ project=project,
579
+ function_name=mm_constants.MonitoringFunctionNames.APPLICATION_CONTROLLER,
580
+ )
581
+ event = {
582
+ ControllerEvent.KIND.value: kind,
583
+ ControllerEvent.PROJECT.value: project,
584
+ ControllerEvent.ENDPOINT_ID.value: endpoint_id,
585
+ ControllerEvent.ENDPOINT_NAME.value: endpoint_name,
586
+ ControllerEvent.TIMESTAMP.value: timestamp,
587
+ ControllerEvent.FIRST_REQUEST.value: first_request,
588
+ ControllerEvent.ENDPOINT_TYPE.value: endpoint_type,
589
+ ControllerEvent.FEATURE_SET_URI.value: feature_set_uri,
590
+ ControllerEvent.ENDPOINT_POLICY.value: endpoint_policy,
591
+ }
592
+ logger.info(
593
+ "Pushing data to controller stream",
594
+ event=event,
595
+ endpoint_id=endpoint_id,
596
+ stream_uri=stream_uri,
597
+ )
598
+ get_stream_pusher(stream_uri, access_key=stream_access_key).push(
599
+ [event], partition_key=endpoint_id
600
+ )
601
+
602
+ def _push_to_main_stream(self, event: dict, endpoint_id: str) -> None:
603
+ """
604
+ Pushes the given event to model monitoring stream
605
+ :param event: event dictionary to push to stream
606
+ :param endpoint_id: endpoint id string
607
+ """
608
+ stream_uri = get_stream_path(project=event.get(ControllerEvent.PROJECT))
609
+
610
+ logger.info(
611
+ "Pushing data to main stream, NOP event is been generated",
612
+ event=json.dumps(event),
613
+ endpoint_id=endpoint_id,
614
+ stream_uri=stream_uri,
615
+ )
616
+ get_stream_pusher(stream_uri, access_key=self.model_monitoring_access_key).push(
617
+ [event], partition_key=endpoint_id
618
+ )
619
+
468
620
 
469
621
  def handler(context: nuclio_sdk.Context, event: nuclio_sdk.Event) -> None:
470
622
  """
@@ -473,4 +625,15 @@ def handler(context: nuclio_sdk.Context, event: nuclio_sdk.Event) -> None:
473
625
  :param context: the Nuclio context
474
626
  :param event: trigger event
475
627
  """
476
- MonitoringApplicationController().run()
628
+ logger.info(
629
+ "Controller got event",
630
+ trigger=event.trigger,
631
+ trigger_kind=event.trigger.kind,
632
+ )
633
+
634
+ if event.trigger.kind == "http":
635
+ # Runs controller chief:
636
+ MonitoringApplicationController().push_regular_event_to_controller_stream(event)
637
+ else:
638
+ # Runs controller worker:
639
+ MonitoringApplicationController().run(event=event)
@@ -67,43 +67,31 @@ class ObjectTSDBFactory(enum.Enum):
67
67
  def get_tsdb_connector(
68
68
  project: str,
69
69
  secret_provider: typing.Optional[typing.Callable[[str], str]] = None,
70
- tsdb_connection_string: typing.Optional[str] = None,
71
- **kwargs,
70
+ profile: typing.Optional[mlrun.datastore.datastore_profile.DatastoreProfile] = None,
72
71
  ) -> TSDBConnector:
73
72
  """
74
73
  Get TSDB connector object.
75
74
  :param project: The name of the project.
76
75
  :param secret_provider: An optional secret provider to get the connection string secret.
77
- :param tsdb_connection_string: An optional explicit connection string to the TSDB.
76
+ :param profile: An optional profile to initialize the TSDB connector from.
78
77
 
79
78
  :return: `TSDBConnector` object. The main goal of this object is to handle different operations on the
80
79
  TSDB connector such as updating drift metrics or write application record result.
81
80
  :raise: `MLRunInvalidMMStoreTypeError` if the user didn't provide TSDB connection
82
81
  or the provided TSDB connection is invalid.
83
82
  """
84
-
85
- try:
86
- profile = mlrun.model_monitoring.helpers._get_tsdb_profile(
87
- project=project, secret_provider=secret_provider
88
- )
89
- except mlrun.errors.MLRunNotFoundError:
90
- profile = None
91
-
92
- tsdb_connection_string = (
93
- tsdb_connection_string
94
- or mlrun.model_monitoring.helpers.get_tsdb_connection_string(
95
- secret_provider=secret_provider
96
- )
83
+ profile = profile or mlrun.model_monitoring.helpers._get_tsdb_profile(
84
+ project=project, secret_provider=secret_provider
97
85
  )
98
-
99
- if tsdb_connection_string and tsdb_connection_string.startswith("taosws"):
100
- tsdb_connector_type = mlrun.common.schemas.model_monitoring.TSDBTarget.TDEngine
101
- kwargs["connection_string"] = tsdb_connection_string
102
- elif tsdb_connection_string and tsdb_connection_string == "v3io":
103
- tsdb_connector_type = mlrun.common.schemas.model_monitoring.TSDBTarget.V3IO_TSDB
104
- elif isinstance(profile, mlrun.datastore.datastore_profile.DatastoreProfileV3io):
86
+ kwargs = {}
87
+ if isinstance(profile, mlrun.datastore.datastore_profile.DatastoreProfileV3io):
105
88
  tsdb_connector_type = mlrun.common.schemas.model_monitoring.TSDBTarget.V3IO_TSDB
106
89
  kwargs["v3io_access_key"] = profile.v3io_access_key
90
+ elif isinstance(
91
+ profile, mlrun.datastore.datastore_profile.TDEngineDatastoreProfile
92
+ ):
93
+ tsdb_connector_type = mlrun.common.schemas.model_monitoring.TSDBTarget.TDEngine
94
+ kwargs["connection_string"] = profile.dsn()
107
95
  else:
108
96
  raise mlrun.errors.MLRunInvalidMMStoreTypeError(
109
97
  "You must provide a valid tsdb store connection by using "
@@ -298,6 +298,8 @@ class Predictions(TDEngineSchema):
298
298
  mm_schemas.EventFieldType.TIME: _TDEngineColumn.TIMESTAMP,
299
299
  mm_schemas.EventFieldType.LATENCY: _TDEngineColumn.FLOAT,
300
300
  mm_schemas.EventKeyMetrics.CUSTOM_METRICS: _TDEngineColumn.BINARY_1000,
301
+ mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT: _TDEngineColumn.FLOAT,
302
+ mm_schemas.EventFieldType.EFFECTIVE_SAMPLE_COUNT: _TDEngineColumn.INT,
301
303
  }
302
304
  tags = {
303
305
  mm_schemas.WriterEvent.ENDPOINT_ID: _TDEngineColumn.BINARY_64,
@@ -145,8 +145,11 @@ class TDEngineConnector(TSDBConnector):
145
145
 
146
146
  create_table_sql = table._create_subtable_sql(subtable=table_name, values=event)
147
147
 
148
+ # we need the string values to be sent to the connection, not the enum
149
+ columns = {str(key): str(val) for key, val in table.columns.items()}
150
+
148
151
  insert_statement = Statement(
149
- columns=table.columns,
152
+ columns=columns,
150
153
  subtable=table_name,
151
154
  values=event,
152
155
  )
@@ -165,7 +168,7 @@ class TDEngineConnector(TSDBConnector):
165
168
  return datetime.fromisoformat(val) if isinstance(val, str) else val
166
169
 
167
170
  @staticmethod
168
- def _get_endpoint_filter(endpoint_id: typing.Union[str, list[str]]):
171
+ def _get_endpoint_filter(endpoint_id: typing.Union[str, list[str]]) -> str:
169
172
  if isinstance(endpoint_id, str):
170
173
  return f"endpoint_id='{endpoint_id}'"
171
174
  elif isinstance(endpoint_id, list):
@@ -188,7 +191,7 @@ class TDEngineConnector(TSDBConnector):
188
191
  graph.add_step(
189
192
  "mlrun.model_monitoring.db.tsdb.tdengine.stream_graph_steps.ProcessBeforeTDEngine",
190
193
  name="ProcessBeforeTDEngine",
191
- after="MapFeatureNames",
194
+ after="FilterNOP",
192
195
  )
193
196
 
194
197
  def apply_tdengine_target(name, after):
@@ -206,6 +209,8 @@ class TDEngineConnector(TSDBConnector):
206
209
  columns=[
207
210
  mm_schemas.EventFieldType.LATENCY,
208
211
  mm_schemas.EventKeyMetrics.CUSTOM_METRICS,
212
+ mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT,
213
+ mm_schemas.EventFieldType.EFFECTIVE_SAMPLE_COUNT,
209
214
  ],
210
215
  tag_cols=[
211
216
  mm_schemas.EventFieldType.ENDPOINT_ID,
@@ -483,7 +488,7 @@ class TDEngineConnector(TSDBConnector):
483
488
  table=self.tables[mm_schemas.TDEngineSuperTables.PREDICTIONS].super_table,
484
489
  start=start,
485
490
  end=end,
486
- columns=[mm_schemas.EventFieldType.LATENCY],
491
+ columns=[mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT],
487
492
  filter_query=f"endpoint_id='{endpoint_id}'",
488
493
  agg_funcs=agg_funcs,
489
494
  interval=aggregation_window,
@@ -503,10 +508,10 @@ class TDEngineConnector(TSDBConnector):
503
508
  df["_wend"] = pd.to_datetime(df["_wend"])
504
509
  df.set_index("_wend", inplace=True)
505
510
 
506
- latency_column = (
507
- f"{agg_funcs[0]}({mm_schemas.EventFieldType.LATENCY})"
511
+ estimated_prediction_count = (
512
+ f"{agg_funcs[0]}({mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT})"
508
513
  if agg_funcs
509
- else mm_schemas.EventFieldType.LATENCY
514
+ else mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT
510
515
  )
511
516
 
512
517
  return mm_schemas.ModelEndpointMonitoringMetricValues(
@@ -514,7 +519,7 @@ class TDEngineConnector(TSDBConnector):
514
519
  values=list(
515
520
  zip(
516
521
  df.index,
517
- df[latency_column],
522
+ df[estimated_prediction_count],
518
523
  )
519
524
  ), # pyright: ignore[reportArgumentType]
520
525
  )
@@ -525,9 +530,7 @@ class TDEngineConnector(TSDBConnector):
525
530
  start: typing.Optional[datetime] = None,
526
531
  end: typing.Optional[datetime] = None,
527
532
  ) -> pd.DataFrame:
528
- endpoint_ids = (
529
- endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
530
- )
533
+ filter_query = self._get_endpoint_filter(endpoint_id=endpoint_ids)
531
534
  start, end = self._get_start_end(start, end)
532
535
  df = self._get_records(
533
536
  table=self.tables[mm_schemas.TDEngineSuperTables.PREDICTIONS].super_table,
@@ -538,7 +541,7 @@ class TDEngineConnector(TSDBConnector):
538
541
  mm_schemas.EventFieldType.TIME,
539
542
  mm_schemas.EventFieldType.LATENCY,
540
543
  ],
541
- filter_query=f"endpoint_id IN({str(endpoint_ids)[1:-1]})",
544
+ filter_query=filter_query,
542
545
  timestamp_column=mm_schemas.EventFieldType.TIME,
543
546
  agg_funcs=["last"],
544
547
  group_by=mm_schemas.EventFieldType.ENDPOINT_ID,
@@ -567,9 +570,7 @@ class TDEngineConnector(TSDBConnector):
567
570
  start: typing.Optional[datetime] = None,
568
571
  end: typing.Optional[datetime] = None,
569
572
  ) -> pd.DataFrame:
570
- endpoint_ids = (
571
- endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
572
- )
573
+ filter_query = self._get_endpoint_filter(endpoint_id=endpoint_ids)
573
574
  start = start or (mlrun.utils.datetime_now() - timedelta(hours=24))
574
575
  start, end = self._get_start_end(start, end)
575
576
  df = self._get_records(
@@ -580,7 +581,7 @@ class TDEngineConnector(TSDBConnector):
580
581
  mm_schemas.ResultData.RESULT_STATUS,
581
582
  mm_schemas.EventFieldType.ENDPOINT_ID,
582
583
  ],
583
- filter_query=f"endpoint_id IN({str(endpoint_ids)[1:-1]})",
584
+ filter_query=filter_query,
584
585
  timestamp_column=mm_schemas.WriterEvent.END_INFER_TIME,
585
586
  agg_funcs=["max"],
586
587
  group_by=mm_schemas.EventFieldType.ENDPOINT_ID,
@@ -678,9 +679,8 @@ class TDEngineConnector(TSDBConnector):
678
679
  start: typing.Optional[datetime] = None,
679
680
  end: typing.Optional[datetime] = None,
680
681
  ) -> pd.DataFrame:
681
- endpoint_ids = (
682
- endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
683
- )
682
+ filter_query = self._get_endpoint_filter(endpoint_id=endpoint_ids)
683
+ filter_query += f"AND {mm_schemas.EventFieldType.ERROR_TYPE} = '{mm_schemas.EventFieldType.INFER_ERROR}'"
684
684
  start, end = self._get_start_end(start, end)
685
685
  df = self._get_records(
686
686
  table=self.tables[mm_schemas.TDEngineSuperTables.ERRORS].super_table,
@@ -691,8 +691,7 @@ class TDEngineConnector(TSDBConnector):
691
691
  mm_schemas.EventFieldType.ENDPOINT_ID,
692
692
  ],
693
693
  agg_funcs=["count"],
694
- filter_query=f"endpoint_id IN({str(endpoint_ids)[1:-1]}) "
695
- f"AND {mm_schemas.EventFieldType.ERROR_TYPE} = '{mm_schemas.EventFieldType.INFER_ERROR}'",
694
+ filter_query=filter_query,
696
695
  group_by=mm_schemas.EventFieldType.ENDPOINT_ID,
697
696
  preform_agg_columns=[mm_schemas.EventFieldType.MODEL_ERROR],
698
697
  )