mlrun 1.7.0rc28__py3-none-any.whl → 1.7.0rc55__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mlrun might be problematic.

Files changed (135)
  1. mlrun/__main__.py +4 -2
  2. mlrun/alerts/alert.py +75 -8
  3. mlrun/artifacts/base.py +1 -0
  4. mlrun/artifacts/manager.py +9 -2
  5. mlrun/common/constants.py +4 -1
  6. mlrun/common/db/sql_session.py +3 -2
  7. mlrun/common/formatters/__init__.py +1 -0
  8. mlrun/common/formatters/artifact.py +1 -0
  9. mlrun/{model_monitoring/application.py → common/formatters/feature_set.py} +20 -6
  10. mlrun/common/formatters/run.py +3 -0
  11. mlrun/common/helpers.py +0 -1
  12. mlrun/common/schemas/__init__.py +3 -1
  13. mlrun/common/schemas/alert.py +15 -12
  14. mlrun/common/schemas/api_gateway.py +6 -6
  15. mlrun/common/schemas/auth.py +5 -0
  16. mlrun/common/schemas/client_spec.py +0 -1
  17. mlrun/common/schemas/common.py +7 -4
  18. mlrun/common/schemas/frontend_spec.py +7 -0
  19. mlrun/common/schemas/function.py +7 -0
  20. mlrun/common/schemas/model_monitoring/__init__.py +4 -3
  21. mlrun/common/schemas/model_monitoring/constants.py +41 -26
  22. mlrun/common/schemas/model_monitoring/model_endpoints.py +23 -47
  23. mlrun/common/schemas/notification.py +69 -12
  24. mlrun/common/schemas/project.py +45 -12
  25. mlrun/common/schemas/workflow.py +10 -2
  26. mlrun/common/types.py +1 -0
  27. mlrun/config.py +91 -35
  28. mlrun/data_types/data_types.py +6 -1
  29. mlrun/data_types/spark.py +2 -2
  30. mlrun/data_types/to_pandas.py +57 -25
  31. mlrun/datastore/__init__.py +1 -0
  32. mlrun/datastore/alibaba_oss.py +3 -2
  33. mlrun/datastore/azure_blob.py +125 -37
  34. mlrun/datastore/base.py +42 -21
  35. mlrun/datastore/datastore.py +4 -2
  36. mlrun/datastore/datastore_profile.py +1 -1
  37. mlrun/datastore/dbfs_store.py +3 -7
  38. mlrun/datastore/filestore.py +1 -3
  39. mlrun/datastore/google_cloud_storage.py +85 -29
  40. mlrun/datastore/inmem.py +4 -1
  41. mlrun/datastore/redis.py +1 -0
  42. mlrun/datastore/s3.py +25 -12
  43. mlrun/datastore/sources.py +76 -4
  44. mlrun/datastore/spark_utils.py +30 -0
  45. mlrun/datastore/storeytargets.py +151 -0
  46. mlrun/datastore/targets.py +102 -131
  47. mlrun/datastore/v3io.py +1 -0
  48. mlrun/db/base.py +15 -6
  49. mlrun/db/httpdb.py +57 -28
  50. mlrun/db/nopdb.py +29 -5
  51. mlrun/errors.py +20 -3
  52. mlrun/execution.py +46 -5
  53. mlrun/feature_store/api.py +25 -1
  54. mlrun/feature_store/common.py +6 -11
  55. mlrun/feature_store/feature_vector.py +3 -1
  56. mlrun/feature_store/retrieval/job.py +4 -1
  57. mlrun/feature_store/retrieval/spark_merger.py +10 -39
  58. mlrun/feature_store/steps.py +8 -0
  59. mlrun/frameworks/_common/plan.py +3 -3
  60. mlrun/frameworks/_ml_common/plan.py +1 -1
  61. mlrun/frameworks/parallel_coordinates.py +2 -3
  62. mlrun/frameworks/sklearn/mlrun_interface.py +13 -3
  63. mlrun/k8s_utils.py +48 -2
  64. mlrun/launcher/client.py +6 -6
  65. mlrun/launcher/local.py +2 -2
  66. mlrun/model.py +215 -34
  67. mlrun/model_monitoring/api.py +38 -24
  68. mlrun/model_monitoring/applications/__init__.py +1 -2
  69. mlrun/model_monitoring/applications/_application_steps.py +60 -29
  70. mlrun/model_monitoring/applications/base.py +2 -174
  71. mlrun/model_monitoring/applications/context.py +197 -70
  72. mlrun/model_monitoring/applications/evidently_base.py +11 -85
  73. mlrun/model_monitoring/applications/histogram_data_drift.py +21 -16
  74. mlrun/model_monitoring/applications/results.py +4 -4
  75. mlrun/model_monitoring/controller.py +110 -282
  76. mlrun/model_monitoring/db/stores/__init__.py +8 -3
  77. mlrun/model_monitoring/db/stores/base/store.py +3 -0
  78. mlrun/model_monitoring/db/stores/sqldb/models/base.py +9 -7
  79. mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +18 -3
  80. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +43 -23
  81. mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +48 -35
  82. mlrun/model_monitoring/db/tsdb/__init__.py +7 -2
  83. mlrun/model_monitoring/db/tsdb/base.py +147 -15
  84. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +94 -55
  85. mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +0 -3
  86. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +144 -38
  87. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +44 -3
  88. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +246 -57
  89. mlrun/model_monitoring/helpers.py +70 -50
  90. mlrun/model_monitoring/stream_processing.py +96 -195
  91. mlrun/model_monitoring/writer.py +13 -5
  92. mlrun/package/packagers/default_packager.py +2 -2
  93. mlrun/projects/operations.py +16 -8
  94. mlrun/projects/pipelines.py +126 -115
  95. mlrun/projects/project.py +286 -129
  96. mlrun/render.py +3 -3
  97. mlrun/run.py +38 -19
  98. mlrun/runtimes/__init__.py +19 -8
  99. mlrun/runtimes/base.py +4 -1
  100. mlrun/runtimes/daskjob.py +1 -1
  101. mlrun/runtimes/funcdoc.py +1 -1
  102. mlrun/runtimes/kubejob.py +6 -6
  103. mlrun/runtimes/local.py +12 -5
  104. mlrun/runtimes/nuclio/api_gateway.py +68 -8
  105. mlrun/runtimes/nuclio/application/application.py +307 -70
  106. mlrun/runtimes/nuclio/function.py +63 -14
  107. mlrun/runtimes/nuclio/serving.py +10 -10
  108. mlrun/runtimes/pod.py +25 -19
  109. mlrun/runtimes/remotesparkjob.py +2 -5
  110. mlrun/runtimes/sparkjob/spark3job.py +16 -17
  111. mlrun/runtimes/utils.py +34 -0
  112. mlrun/serving/routers.py +2 -5
  113. mlrun/serving/server.py +37 -19
  114. mlrun/serving/states.py +30 -3
  115. mlrun/serving/v2_serving.py +44 -35
  116. mlrun/track/trackers/mlflow_tracker.py +5 -0
  117. mlrun/utils/async_http.py +1 -1
  118. mlrun/utils/db.py +18 -0
  119. mlrun/utils/helpers.py +150 -36
  120. mlrun/utils/http.py +1 -1
  121. mlrun/utils/notifications/notification/__init__.py +0 -1
  122. mlrun/utils/notifications/notification/webhook.py +8 -1
  123. mlrun/utils/notifications/notification_pusher.py +1 -1
  124. mlrun/utils/v3io_clients.py +2 -2
  125. mlrun/utils/version/version.json +2 -2
  126. {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/METADATA +153 -66
  127. {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/RECORD +131 -134
  128. {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/WHEEL +1 -1
  129. mlrun/feature_store/retrieval/conversion.py +0 -271
  130. mlrun/model_monitoring/controller_handler.py +0 -37
  131. mlrun/model_monitoring/evidently_application.py +0 -20
  132. mlrun/model_monitoring/prometheus.py +0 -216
  133. {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/LICENSE +0 -0
  134. {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/entry_points.txt +0 -0
  135. {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/top_level.txt +0 -0
mlrun/model_monitoring/db/stores/sqldb/models/mysql.py

@@ -18,6 +18,7 @@ from sqlalchemy.ext.declarative import declarative_base, declared_attr
 
 from mlrun.common.schemas.model_monitoring import (
     EventFieldType,
+    ResultData,
     WriterEvent,
 )
 
@@ -32,12 +33,21 @@ Base = declarative_base()
 
 
 class ModelEndpointsTable(Base, ModelEndpointsBaseTable):
+    feature_stats = Column(
+        EventFieldType.FEATURE_STATS, sqlalchemy.dialects.mysql.MEDIUMTEXT
+    )
+    current_stats = Column(
+        EventFieldType.CURRENT_STATS, sqlalchemy.dialects.mysql.MEDIUMTEXT
+    )
+    metrics = Column(EventFieldType.METRICS, sqlalchemy.dialects.mysql.MEDIUMTEXT)
     first_request = Column(
         EventFieldType.FIRST_REQUEST,
+        # TODO: migrate to DATETIME, see ML-6921
        sqlalchemy.dialects.mysql.TIMESTAMP(fsp=3, timezone=True),
    )
    last_request = Column(
        EventFieldType.LAST_REQUEST,
+        # TODO: migrate to DATETIME, see ML-6921
        sqlalchemy.dialects.mysql.TIMESTAMP(fsp=3, timezone=True),
    )
 
@@ -52,11 +62,11 @@ class _ApplicationResultOrMetric:
 
     start_infer_time = Column(
         WriterEvent.START_INFER_TIME,
-        sqlalchemy.dialects.mysql.TIMESTAMP(fsp=3, timezone=True),
+        sqlalchemy.dialects.mysql.DATETIME(fsp=3, timezone=True),
     )
     end_infer_time = Column(
         WriterEvent.END_INFER_TIME,
-        sqlalchemy.dialects.mysql.TIMESTAMP(fsp=3, timezone=True),
+        sqlalchemy.dialects.mysql.DATETIME(fsp=3, timezone=True),
     )
 
     @declared_attr
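
The hunk above moves the inference-time columns from TIMESTAMP to DATETIME, and flags the same migration for first_request/last_request as a TODO under ML-6921. This matches MySQL's type semantics: TIMESTAMP is stored as UTC, converted per session time zone, and bounded by the 2038 epoch limit, while DATETIME keeps the wall-clock value over years 1000-9999. A minimal sketch contrasting the two column types in SQLAlchemy's MySQL dialect (the table and column names here are illustrative, not from mlrun):

```python
from sqlalchemy import Column, MetaData, Table
from sqlalchemy.dialects import mysql

metadata = MetaData()
events = Table(
    "events",
    metadata,
    # TIMESTAMP: stored as UTC, session-time-zone converted, 2038 epoch limit
    Column("created_at", mysql.TIMESTAMP(fsp=3, timezone=True)),
    # DATETIME: wall-clock value, valid from 1000-01-01 to 9999-12-31
    Column("start_infer_time", mysql.DATETIME(fsp=3, timezone=True)),
)
```
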
@@ -70,7 +80,12 @@ class _ApplicationResultOrMetric:
 class ApplicationResultTable(
     Base, _ApplicationResultOrMetric, ApplicationResultBaseTable
 ):
-    pass
+    result_extra_data = Column(
+        ResultData.RESULT_EXTRA_DATA, sqlalchemy.dialects.mysql.MEDIUMTEXT
+    )
+    current_stats = Column(
+        ResultData.CURRENT_STATS, sqlalchemy.dialects.mysql.MEDIUMTEXT
+    )
 
 
 class ApplicationMetricsTable(
mlrun/model_monitoring/db/stores/sqldb/sql_store.py

@@ -20,7 +20,7 @@ import pandas as pd
 import sqlalchemy
 import sqlalchemy.exc
 import sqlalchemy.orm
-from sqlalchemy.engine import make_url
+from sqlalchemy.engine import Engine, make_url
 from sqlalchemy.sql.elements import BinaryExpression
 
 import mlrun.common.model_monitoring.helpers
@@ -61,9 +61,15 @@ class SQLStoreBase(StoreBase):
         )
 
         self._sql_connection_string = kwargs.get("store_connection_string")
-        self._engine = get_engine(dsn=self._sql_connection_string)
+        self._engine = None
         self._init_tables()
 
+    @property
+    def engine(self) -> Engine:
+        if not self._engine:
+            self._engine = get_engine(dsn=self._sql_connection_string)
+        return self._engine
+
     def create_tables(self):
         self._create_tables_if_not_exist()
 
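The constructor no longer builds the SQLAlchemy engine eagerly; it is created on first access through the new engine property, so instantiating the store no longer opens a connection pool. A minimal, self-contained sketch of the same lazy-initialization pattern (the class and DSN below are hypothetical):

```python
from typing import Optional

from sqlalchemy import create_engine
from sqlalchemy.engine import Engine


class LazyStore:
    def __init__(self, dsn: str) -> None:
        self._dsn = dsn
        self._engine: Optional[Engine] = None  # nothing is connected yet

    @property
    def engine(self) -> Engine:
        # The engine (and its connection pool) is created only when first needed.
        if self._engine is None:
            self._engine = create_engine(self._dsn)
        return self._engine
```
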
@@ -116,7 +122,7 @@
         :param table_name: Target table name.
         :param event:      Event dictionary that will be written into the DB.
         """
-        with self._engine.connect() as connection:
+        with self.engine.connect() as connection:
             # Convert the result into a pandas Dataframe and write it into the database
             event_df = pd.DataFrame([event])
             event_df.to_sql(table_name, con=connection, index=False, if_exists="append")
@@ -177,6 +183,11 @@
         :param table:    SQLAlchemy declarative table.
         :param criteria: A list of binary expressions that filter the query.
         """
+        if not self.engine.has_table(table.__tablename__):
+            logger.debug(
+                f"Table {table.__tablename__} does not exist in the database. Skipping deletion."
+            )
+            return
         with create_session(dsn=self._sql_connection_string) as session:
             # Generate and commit the delete query
             session.query(
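One caveat worth noting: Engine.has_table() is deprecated as of SQLAlchemy 1.4 (and removed in 2.0) in favor of the inspector API, so this guard presumably relies on the SQLAlchemy version mlrun pins. The equivalent inspector-based check would look like:

```python
import sqlalchemy
from sqlalchemy.engine import Engine


def table_exists(engine: Engine, table_name: str) -> bool:
    # Preferred over the deprecated Engine.has_table() in SQLAlchemy 1.4+
    return sqlalchemy.inspect(engine).has_table(table_name)
```
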
@@ -266,22 +277,8 @@
         labels: list[str] = None,
         top_level: bool = None,
         uids: list = None,
+        include_stats: bool = None,
     ) -> list[dict[str, typing.Any]]:
-        """
-        Returns a list of model endpoint dictionaries, supports filtering by model, function, labels or top level.
-        By default, when no filters are applied, all available model endpoints for the given project will
-        be listed.
-
-        :param model:     The name of the model to filter by.
-        :param function:  The name of the function to filter by.
-        :param labels:    A list of labels to filter by. Label filters work by either filtering a specific value
-                          of a label (i.e. list("key=value")) or by looking for the existence of a given
-                          key (i.e. "key").
-        :param top_level: If True will return only routers and endpoint that are NOT children of any router.
-        :param uids:      List of model endpoint unique ids to include in the result.
-
-        :return: A list of model endpoint dictionaries.
-        """
         # Generate an empty model endpoints that will be filled afterwards with model endpoint dictionaries
         endpoint_list = []
 
@@ -341,6 +338,12 @@
             ):
                 continue
 
+            if not include_stats:
+                # Exclude these fields when listing model endpoints to avoid returning too much data (ML-6594)
+                # TODO: Remove stats from table schema (ML-7196)
+                endpoint_dict.pop(mm_schemas.EventFieldType.FEATURE_STATS)
+                endpoint_dict.pop(mm_schemas.EventFieldType.CURRENT_STATS)
+
             endpoint_list.append(endpoint_dict)
 
         return endpoint_list
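The new include_stats flag strips the bulky feature_stats and current_stats fields from list responses (ML-6594). A toy illustration of the gate (the dictionary below is hypothetical); note that dict.pop(key) without a default raises KeyError when a field is missing, whereas pop(key, None) tolerates its absence:

```python
endpoint = {"uid": "ep-1", "feature_stats": {}, "current_stats": {}}

include_stats = False
if not include_stats:
    endpoint.pop("feature_stats", None)
    endpoint.pop("current_stats", None)

print(endpoint)  # {'uid': 'ep-1'}
```
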
@@ -527,9 +530,9 @@
         for table in self._tables:
             # Create table if not exist. The `metadata` contains the `ModelEndpointsTable`
             db_name = make_url(self._sql_connection_string).database
-            if not self._engine.has_table(table):
+            if not self.engine.has_table(table):
                 logger.info(f"Creating table {table} on {db_name} db.")
-                self._tables[table].metadata.create_all(bind=self._engine)
+                self._tables[table].metadata.create_all(bind=self.engine)
             else:
                 logger.info(f"Table {table} already exists on {db_name} db.")
 
@@ -577,12 +580,19 @@
         """
         Delete all the model monitoring resources of the project in the SQL tables.
         """
+        logger.debug(
+            "Deleting model monitoring endpoints resources from the SQL tables",
+            project=self.project,
+        )
         endpoints = self.list_model_endpoints()
-        logger.debug("Deleting model monitoring resources", project=self.project)
 
         for endpoint_dict in endpoints:
             endpoint_id = endpoint_dict[mm_schemas.EventFieldType.UID]
-
+            logger.debug(
+                "Deleting model endpoint resources from the SQL tables",
+                endpoint_id=endpoint_id,
+                project=self.project,
+            )
             # Delete last analyzed records
             self._delete_last_analyzed(endpoint_id=endpoint_id)
 
@@ -592,6 +602,16 @@
 
             # Delete model endpoint record
             self.delete_model_endpoint(endpoint_id=endpoint_id)
+            logger.debug(
+                "Successfully deleted model endpoint resources",
+                endpoint_id=endpoint_id,
+                project=self.project,
+            )
+
+        logger.debug(
+            "Successfully deleted model monitoring endpoints resources from the SQL tables",
+            project=self.project,
+        )
 
     def get_model_endpoint_metrics(
         self, endpoint_id: str, type: mm_schemas.ModelEndpointMonitoringMetricType
@@ -615,7 +635,7 @@
 
         # Note: the block below does not use self._get, as we need here all the
         # results, not only `one_or_none`.
-        with sqlalchemy.orm.Session(self._engine) as session:
+        with sqlalchemy.orm.Session(self.engine) as session:
             metric_rows = (
                 session.query(table)  # pyright: ignore[reportOptionalCall]
                 .filter(table.endpoint_id == endpoint_id)
mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py

@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+import http
 import json
 import typing
 from dataclasses import dataclass
@@ -20,6 +20,7 @@ from http import HTTPStatus
 import v3io.dataplane
 import v3io.dataplane.output
 import v3io.dataplane.response
+from v3io.dataplane import Client as V3IOClient
 
 import mlrun.common.model_monitoring.helpers
 import mlrun.common.schemas.model_monitoring as mm_schemas
@@ -34,11 +35,11 @@ fields_to_encode_decode = [
 ]
 
 _METRIC_FIELDS: list[str] = [
-    mm_schemas.WriterEvent.APPLICATION_NAME,
-    mm_schemas.MetricData.METRIC_NAME,
-    mm_schemas.MetricData.METRIC_VALUE,
-    mm_schemas.WriterEvent.START_INFER_TIME,
-    mm_schemas.WriterEvent.END_INFER_TIME,
+    mm_schemas.WriterEvent.APPLICATION_NAME.value,
+    mm_schemas.MetricData.METRIC_NAME.value,
+    mm_schemas.MetricData.METRIC_VALUE.value,
+    mm_schemas.WriterEvent.START_INFER_TIME.value,
+    mm_schemas.WriterEvent.END_INFER_TIME.value,
 ]
 
 
@@ -100,13 +101,18 @@ class KVStoreBase(StoreBase):
         project: str,
     ) -> None:
         super().__init__(project=project)
-        # Initialize a V3IO client instance
-        self.client = mlrun.utils.v3io_clients.get_v3io_client(
-            endpoint=mlrun.mlconf.v3io_api,
-        )
+        self._client = None
         # Get the KV table path and container
         self.path, self.container = self._get_path_and_container()
 
+    @property
+    def client(self) -> V3IOClient:
+        if not self._client:
+            self._client = mlrun.utils.v3io_clients.get_v3io_client(
+                endpoint=mlrun.mlconf.v3io_api,
+            )
+        return self._client
+
     def write_model_endpoint(self, endpoint: dict[str, typing.Any]):
         """
         Create a new endpoint record in the KV table.
@@ -226,24 +232,8 @@ class KVStoreBase(StoreBase):
         labels: list[str] = None,
         top_level: bool = None,
         uids: list = None,
+        include_stats: bool = None,
     ) -> list[dict[str, typing.Any]]:
-        """
-        Returns a list of model endpoint dictionaries, supports filtering by model, function, labels or top level.
-        By default, when no filters are applied, all available model endpoints for the given project will
-        be listed.
-
-        :param model:     The name of the model to filter by.
-        :param function:  The name of the function to filter by.
-        :param labels:    A list of labels to filter by. Label filters work by either filtering a specific value
-                          of a label (i.e. list("key=value")) or by looking for the existence of a given
-                          key (i.e. "key").
-        :param top_level: If True will return only routers and endpoint that are NOT children of any router.
-        :param uids:      List of model endpoint unique ids to include in the result.
-
-
-        :return: A list of model endpoint dictionaries.
-        """
-
         # # Initialize an empty model endpoints list
         endpoint_list = []
 
@@ -283,6 +273,10 @@ class KVStoreBase(StoreBase):
             endpoint_dict = self.get_model_endpoint(
                 endpoint_id=endpoint_id,
             )
+            if not include_stats:
+                # Exclude these fields when listing model endpoints to avoid returning too much data (ML-6594)
+                endpoint_dict.pop(mm_schemas.EventFieldType.FEATURE_STATS)
+                endpoint_dict.pop(mm_schemas.EventFieldType.CURRENT_STATS)
 
             if labels and not self._validate_labels(
                 endpoint_dict=endpoint_dict, labels=labels
@@ -297,6 +291,10 @@ class KVStoreBase(StoreBase):
         """
         Delete all model endpoints resources in V3IO KV.
         """
+        logger.debug(
+            "Deleting model monitoring endpoints resources in V3IO KV",
+            project=self.project,
+        )
 
         endpoints = self.list_model_endpoints()
 
@@ -307,10 +305,22 @@ class KVStoreBase(StoreBase):
                 endpoint_id = endpoint_dict[mm_schemas.EventFieldType.ENDPOINT_ID]
             else:
                 endpoint_id = endpoint_dict[mm_schemas.EventFieldType.UID]
+
+            logger.debug(
+                "Deleting model endpoint resources from the V3IO KV table",
+                endpoint_id=endpoint_id,
+                project=self.project,
+            )
+
             self.delete_model_endpoint(
                 endpoint_id,
             )
 
+        logger.debug(
+            "Successfully deleted model monitoring endpoints from the V3IO KV table",
+            project=self.project,
+        )
+
         # Delete remain records in the KV
         all_records = self.client.kv.new_cursor(
             container=self.container,
@@ -362,7 +372,7 @@ class KVStoreBase(StoreBase):
             table_path = self._get_results_table_path(endpoint_id)
             key = event.pop(mm_schemas.WriterEvent.APPLICATION_NAME)
             metric_name = event.pop(mm_schemas.ResultData.RESULT_NAME)
-            attributes = {metric_name: json.dumps(event)}
+            attributes = {metric_name: self._encode_field(json.dumps(event))}
         else:
             raise ValueError(f"Invalid {kind = }")
 
@@ -420,20 +430,23 @@ class KVStoreBase(StoreBase):
 
         """
         try:
-            data = self.client.kv.get(
+            response = self.client.kv.get(
                 container=self._get_monitoring_schedules_container(
                     project_name=self.project
                 ),
                 table_path=endpoint_id,
                 key=application_name,
             )
-            return data.output.item[mm_schemas.SchedulingKeys.LAST_ANALYZED]
+            return response.output.item[mm_schemas.SchedulingKeys.LAST_ANALYZED]
         except v3io.dataplane.response.HttpResponseError as err:
-            logger.debug("Error while getting last analyzed time", err=err)
-            raise mlrun.errors.MLRunNotFoundError(
-                f"No last analyzed value has been found for {application_name} "
-                f"that processes model endpoint {endpoint_id}",
-            )
+            if err.status_code == http.HTTPStatus.NOT_FOUND:
+                logger.debug("Last analyzed time not found", err=err)
+                raise mlrun.errors.MLRunNotFoundError(
+                    f"No last analyzed value has been found for {application_name} "
+                    f"that processes model endpoint {endpoint_id}",
+                )
+            logger.error("Error while getting last analyzed time", err=err)
+            raise err
 
     def update_last_analyzed(
         self, endpoint_id: str, application_name: str, last_analyzed: int
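The rewritten error handler distinguishes a genuine "not found" (HTTP 404) from any other V3IO failure: only the former is translated into MLRunNotFoundError, while unexpected statuses are logged and re-raised unchanged. A self-contained sketch of the pattern, with stand-in exception classes (hypothetical, not the real v3io or mlrun types):

```python
import http


class HttpResponseError(Exception):
    """Stand-in for v3io.dataplane.response.HttpResponseError (hypothetical)."""

    def __init__(self, status_code: int) -> None:
        super().__init__(f"HTTP error {status_code}")
        self.status_code = status_code


class NotFoundError(Exception):
    """Stand-in for mlrun.errors.MLRunNotFoundError (hypothetical)."""


def translate_error(err: HttpResponseError) -> None:
    if err.status_code == http.HTTPStatus.NOT_FOUND:
        # Expected case: map to the framework's not-found error.
        raise NotFoundError("no last-analyzed value found") from err
    # Unexpected case: propagate unchanged so real failures stay visible.
    raise err
```
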
mlrun/model_monitoring/db/tsdb/__init__.py

@@ -57,7 +57,7 @@ class ObjectTSDBFactory(enum.Enum):
         :param value: Provided enum (invalid) value.
         """
         valid_values = list(cls.__members__.keys())
-        raise mlrun.errors.MLRunInvalidMMStoreType(
+        raise mlrun.errors.MLRunInvalidMMStoreTypeError(
             f"{value} is not a valid tsdb, please choose a valid value: %{valid_values}."
         )
 
@@ -76,6 +76,8 @@ def get_tsdb_connector(
 
     :return: `TSDBConnector` object. The main goal of this object is to handle different operations on the
              TSDB connector such as updating drift metrics or write application record result.
+    :raise:  `MLRunInvalidMMStoreTypeError` if the user didn't provide a TSDB connection,
+             or the provided TSDB connection is invalid.
     """
 
     tsdb_connection_string = (
@@ -91,7 +93,10 @@ def get_tsdb_connector(
     elif tsdb_connection_string and tsdb_connection_string == "v3io":
         tsdb_connector_type = mlrun.common.schemas.model_monitoring.TSDBTarget.V3IO_TSDB
     else:
-        tsdb_connector_type = None
+        raise mlrun.errors.MLRunInvalidMMStoreTypeError(
+            "You must provide a valid tsdb store connection by using "
+            "set_model_monitoring_credentials API."
+        )
 
     # Get connector type value from ObjectTSDBFactory enum class
     tsdb_connector_factory = ObjectTSDBFactory(tsdb_connector_type)
mlrun/model_monitoring/db/tsdb/base.py

@@ -15,8 +15,10 @@
 import typing
 from abc import ABC, abstractmethod
 from datetime import datetime
+from typing import Union
 
 import pandas as pd
+import pydantic
 
 import mlrun.common.schemas.model_monitoring as mm_schemas
 import mlrun.model_monitoring.db.tsdb.helpers
@@ -27,7 +29,7 @@ from mlrun.utils import logger
 class TSDBConnector(ABC):
     type: typing.ClassVar[str]
 
-    def __init__(self, project: str):
+    def __init__(self, project: str) -> None:
         """
         Initialize a new TSDB connector. The connector is used to interact with the TSDB and store monitoring data.
         At the moment we have 3 different types of monitoring data:
@@ -42,11 +44,11 @@ class TSDBConnector(ABC):
                  writer.
 
         :param project: the name of the project.
-
         """
         self.project = project
 
-    def apply_monitoring_stream_steps(self, graph):
+    @abstractmethod
+    def apply_monitoring_stream_steps(self, graph) -> None:
         """
         Apply TSDB steps on the provided monitoring graph. Throughout these steps, the graph stores live data of
         different key metric dictionaries. This data is being used by the monitoring dashboards in
@@ -58,6 +60,15 @@ class TSDBConnector(ABC):
         """
         pass
 
+    @abstractmethod
+    def handle_model_error(self, graph, **kwargs) -> None:
+        """
+        Adds a branch to the stream pod graph to handle events that
+        arrive with errors from the model server and saves them to the error TSDB table.
+        The first step generated by this method should come after the `ForwardError` step.
+        """
+
+    @abstractmethod
     def write_application_event(
         self,
         event: dict,
@@ -69,13 +80,14 @@ class TSDBConnector(ABC):
         :raise mlrun.errors.MLRunRuntimeError: If an error occurred while writing the event.
         """
 
+    @abstractmethod
     def delete_tsdb_resources(self):
         """
         Delete all project resources in the TSDB connector, such as model endpoints data and drift results.
         """
-
         pass
 
+    @abstractmethod
     def get_model_endpoint_real_time_metrics(
         self,
         endpoint_id: str,
@@ -102,6 +114,7 @@ class TSDBConnector(ABC):
         """
         pass
 
+    @abstractmethod
     def create_tables(self) -> None:
         """
         Create the TSDB tables using the TSDB connector. At the moment we support 3 types of tables:
@@ -177,6 +190,117 @@ class TSDBConnector(ABC):
         :return: Metric values object or no data object.
         """
 
+    @abstractmethod
+    def get_last_request(
+        self,
+        endpoint_ids: Union[str, list[str]],
+        start: Union[datetime, str] = "0",
+        end: Union[datetime, str] = "now",
+    ) -> pd.DataFrame:
+        """
+        Fetches data from the predictions TSDB table and returns the most recent request
+        timestamp for each specified endpoint.
+
+        :param endpoint_ids: A list of model endpoint identifiers.
+        :param start:        The start time for the query.
+        :param end:          The end time for the query.
+
+        :return: A pd.DataFrame containing the columns [endpoint_id, last_request, last_latency].
+                 If an endpoint has not been invoked within the specified time range, it will not appear in the result.
+        """
+
+    @abstractmethod
+    def get_drift_status(
+        self,
+        endpoint_ids: Union[str, list[str]],
+        start: Union[datetime, str] = "now-24h",
+        end: Union[datetime, str] = "now",
+    ) -> pd.DataFrame:
+        """
+        Fetches data from the app-results TSDB table and returns the highest status among all
+        the results in the provided time range, which by default is the last 24 hours, for each specified endpoint.
+
+        :param endpoint_ids: A list of model endpoint identifiers.
+        :param start:        The start time for the query.
+        :param end:          The end time for the query.
+
+        :return: A pd.DataFrame containing the columns [result_status, endpoint_id].
+                 If an endpoint has not been monitored within the specified time range (last 24 hours),
+                 it will not appear in the result.
+        """
+
+    @abstractmethod
+    def get_metrics_metadata(
+        self,
+        endpoint_id: str,
+        start: Union[datetime, str] = "0",
+        end: Union[datetime, str] = "now",
+    ) -> pd.DataFrame:
+        """
+        Fetches distinct metrics metadata from the metrics TSDB table for a specified model endpoint.
+
+        :param endpoint_id: The model endpoint identifier.
+        :param start:       The start time of the query.
+        :param end:         The end time of the query.
+
+        :return: A pd.DataFrame containing all distinct metrics for the specified endpoint within the given time range,
+                 containing the columns [application_name, metric_name, endpoint_id].
+        """
+
+    @abstractmethod
+    def get_results_metadata(
+        self,
+        endpoint_id: str,
+        start: Union[datetime, str] = "0",
+        end: Union[datetime, str] = "now",
+    ) -> pd.DataFrame:
+        """
+        Fetches distinct results metadata from the app-results TSDB table for a specified model endpoint.
+
+        :param endpoint_id: The model endpoint identifier.
+        :param start:       The start time of the query.
+        :param end:         The end time of the query.
+
+        :return: A pd.DataFrame containing all distinct results for the specified endpoint within the given time range,
+                 containing the columns [application_name, result_name, result_kind, endpoint_id].
+        """
+
+    @abstractmethod
+    def get_error_count(
+        self,
+        endpoint_ids: Union[str, list[str]],
+        start: Union[datetime, str] = "0",
+        end: Union[datetime, str] = "now",
+    ) -> pd.DataFrame:
+        """
+        Fetches data from the error TSDB table and returns the error count for each specified endpoint.
+
+        :param endpoint_ids: A list of model endpoint identifiers.
+        :param start:        The start time for the query.
+        :param end:          The end time for the query.
+
+        :return: A pd.DataFrame containing the columns [error_count, endpoint_id].
+                 If an endpoint has not raised errors within the specified time range, it will not appear in the result.
+        """
+
+    @abstractmethod
+    def get_avg_latency(
+        self,
+        endpoint_ids: Union[str, list[str]],
+        start: Union[datetime, str] = "0",
+        end: Union[datetime, str] = "now",
+    ) -> pd.DataFrame:
+        """
+        Fetches data from the predictions TSDB table and returns the average latency for each specified endpoint.
+
+        :param endpoint_ids: A list of model endpoint identifiers.
+        :param start:        The start time for the query.
+        :param end:          The end time for the query.
+
+        :return: A pd.DataFrame containing the columns [avg_latency, endpoint_id].
+                 If an endpoint has not been invoked within the specified time range, it will not appear in the result.
+        """
+
     @staticmethod
     def df_to_metrics_values(
         *,
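Marking these connector methods @abstractmethod changes the failure mode from a silent no-op at call time to a hard error at instantiation time: a concrete connector that misses an implementation can no longer be constructed. A minimal demonstration of that enforcement:

```python
from abc import ABC, abstractmethod


class Connector(ABC):
    @abstractmethod
    def get_last_request(self, endpoint_ids: list[str]) -> None: ...


class Incomplete(Connector):
    pass  # get_last_request is not implemented


try:
    Incomplete()
except TypeError as exc:
    # TypeError: Can't instantiate abstract class Incomplete ...
    print(exc)
```
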
@@ -286,19 +410,27 @@
             full_name = mlrun.model_monitoring.helpers._compose_full_name(
                 project=project, app=app_name, name=name
             )
-            metrics_values.append(
-                mm_schemas.ModelEndpointMonitoringResultValues(
+            try:
+                metrics_values.append(
+                    mm_schemas.ModelEndpointMonitoringResultValues(
+                        full_name=full_name,
+                        result_kind=result_kind,
+                        values=list(
+                            zip(
+                                sub_df.index,
+                                sub_df[mm_schemas.ResultData.RESULT_VALUE],
+                                sub_df[mm_schemas.ResultData.RESULT_STATUS],
+                            )
+                        ),  # pyright: ignore[reportArgumentType]
+                    )
+                )
+            except pydantic.ValidationError:
+                logger.exception(
+                    "Failed to convert data-frame into `ModelEndpointMonitoringResultValues`",
                     full_name=full_name,
-                    result_kind=result_kind,
-                    values=list(
-                        zip(
-                            sub_df.index,
-                            sub_df[mm_schemas.ResultData.RESULT_VALUE],
-                            sub_df[mm_schemas.ResultData.RESULT_STATUS],
-                        )
-                    ),  # pyright: ignore[reportArgumentType]
+                    sub_df_json=sub_df.to_json(),
                 )
-            )
+                raise
             del metrics_without_data[full_name]
 
         for metric in metrics_without_data.values():
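
The final hunk wraps the schema construction in a try/except so that a pydantic validation failure is logged together with the offending data frame before propagating. A self-contained sketch of the same log-and-re-raise pattern with a toy model (the model and field names below are hypothetical):

```python
import pydantic


class Result(pydantic.BaseModel):
    full_name: str
    result_kind: int


def build_result(raw: dict) -> Result:
    try:
        return Result(**raw)
    except pydantic.ValidationError:
        # Record the payload that failed validation, then propagate the error.
        print(f"failed to validate payload: {raw!r}")
        raise


build_result({"full_name": "proj.app.metric", "result_kind": 0})
```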