mlrun 1.7.0rc37__py3-none-any.whl → 1.7.0rc39__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mlrun might be problematic.

Files changed (52)
  1. mlrun/alerts/alert.py +34 -30
  2. mlrun/common/schemas/alert.py +3 -0
  3. mlrun/common/schemas/model_monitoring/constants.py +4 -0
  4. mlrun/common/schemas/notification.py +4 -3
  5. mlrun/datastore/alibaba_oss.py +2 -2
  6. mlrun/datastore/azure_blob.py +124 -31
  7. mlrun/datastore/base.py +1 -1
  8. mlrun/datastore/dbfs_store.py +2 -2
  9. mlrun/datastore/google_cloud_storage.py +83 -20
  10. mlrun/datastore/s3.py +2 -2
  11. mlrun/datastore/sources.py +54 -0
  12. mlrun/datastore/targets.py +9 -53
  13. mlrun/db/httpdb.py +6 -1
  14. mlrun/errors.py +8 -0
  15. mlrun/execution.py +7 -0
  16. mlrun/feature_store/api.py +5 -0
  17. mlrun/feature_store/common.py +6 -11
  18. mlrun/feature_store/retrieval/job.py +1 -0
  19. mlrun/model.py +29 -3
  20. mlrun/model_monitoring/api.py +9 -0
  21. mlrun/model_monitoring/applications/_application_steps.py +36 -0
  22. mlrun/model_monitoring/applications/histogram_data_drift.py +15 -13
  23. mlrun/model_monitoring/controller.py +15 -11
  24. mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +14 -11
  25. mlrun/model_monitoring/db/tsdb/base.py +121 -1
  26. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +85 -47
  27. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +100 -12
  28. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +23 -1
  29. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +214 -36
  30. mlrun/model_monitoring/helpers.py +16 -17
  31. mlrun/model_monitoring/stream_processing.py +68 -27
  32. mlrun/projects/operations.py +1 -1
  33. mlrun/projects/pipelines.py +19 -30
  34. mlrun/projects/project.py +76 -52
  35. mlrun/run.py +8 -6
  36. mlrun/runtimes/__init__.py +19 -8
  37. mlrun/runtimes/nuclio/api_gateway.py +9 -0
  38. mlrun/runtimes/nuclio/application/application.py +64 -9
  39. mlrun/runtimes/nuclio/function.py +1 -1
  40. mlrun/runtimes/pod.py +2 -2
  41. mlrun/runtimes/remotesparkjob.py +2 -5
  42. mlrun/runtimes/sparkjob/spark3job.py +7 -9
  43. mlrun/serving/v2_serving.py +1 -0
  44. mlrun/track/trackers/mlflow_tracker.py +5 -0
  45. mlrun/utils/helpers.py +21 -0
  46. mlrun/utils/version/version.json +2 -2
  47. {mlrun-1.7.0rc37.dist-info → mlrun-1.7.0rc39.dist-info}/METADATA +14 -11
  48. {mlrun-1.7.0rc37.dist-info → mlrun-1.7.0rc39.dist-info}/RECORD +52 -52
  49. {mlrun-1.7.0rc37.dist-info → mlrun-1.7.0rc39.dist-info}/WHEEL +1 -1
  50. {mlrun-1.7.0rc37.dist-info → mlrun-1.7.0rc39.dist-info}/LICENSE +0 -0
  51. {mlrun-1.7.0rc37.dist-info → mlrun-1.7.0rc39.dist-info}/entry_points.txt +0 -0
  52. {mlrun-1.7.0rc37.dist-info → mlrun-1.7.0rc39.dist-info}/top_level.txt +0 -0
mlrun/datastore/targets.py CHANGED
@@ -390,6 +390,7 @@ class BaseStoreTarget(DataTargetBase):
     is_offline = False
     support_spark = False
     support_storey = False
+    support_pandas = False
     support_append = False

     def __init__(
@@ -758,6 +759,8 @@ class BaseStoreTarget(DataTargetBase):
         **kwargs,
     ):
         """return the target data as dataframe"""
+        if not self.support_pandas:
+            raise NotImplementedError()
         mlrun.utils.helpers.additional_filters_warning(
             additional_filters, self.__class__
         )
@@ -819,6 +822,7 @@ class ParquetTarget(BaseStoreTarget):
     support_spark = True
     support_storey = True
     support_dask = True
+    support_pandas = True
     support_append = True

     def __init__(
@@ -1084,6 +1088,7 @@ class CSVTarget(BaseStoreTarget):
     is_offline = True
     support_spark = True
     support_storey = True
+    support_pandas = True

     @staticmethod
     def _write_dataframe(df, storage_options, target_path, partition_cols, **kwargs):
@@ -1292,7 +1297,7 @@ class SnowflakeTarget(BaseStoreTarget):
         **kwargs,
     ):
         raise mlrun.errors.MLRunRuntimeError(
-            f"{type(self).__name__} does not support storey engine"
+            f"{type(self).__name__} does not support pandas engine"
         )

     @property
@@ -1366,19 +1371,6 @@ class NoSqlBaseTarget(BaseStoreTarget):
     def get_dask_options(self):
         return {"format": "csv"}

-    def as_df(
-        self,
-        columns=None,
-        df_module=None,
-        entities=None,
-        start_time=None,
-        end_time=None,
-        time_column=None,
-        additional_filters=None,
-        **kwargs,
-    ):
-        raise NotImplementedError()
-
     def write_dataframe(
         self, df, key_column=None, timestamp_key=None, chunk_id=0, **kwargs
     ):
@@ -1612,19 +1604,6 @@ class StreamTarget(BaseStoreTarget):
             **self.attributes,
         )

-    def as_df(
-        self,
-        columns=None,
-        df_module=None,
-        entities=None,
-        start_time=None,
-        end_time=None,
-        time_column=None,
-        additional_filters=None,
-        **kwargs,
-    ):
-        raise NotImplementedError()
-

 class KafkaTarget(BaseStoreTarget):
     """
@@ -1727,19 +1706,6 @@ class KafkaTarget(BaseStoreTarget):
             **attributes,
         )

-    def as_df(
-        self,
-        columns=None,
-        df_module=None,
-        entities=None,
-        start_time=None,
-        end_time=None,
-        time_column=None,
-        additional_filters=None,
-        **kwargs,
-    ):
-        raise NotImplementedError()
-
     def purge(self):
         pass

@@ -1784,19 +1750,6 @@ class TSDBTarget(BaseStoreTarget):
             **self.attributes,
         )

-    def as_df(
-        self,
-        columns=None,
-        df_module=None,
-        entities=None,
-        start_time=None,
-        end_time=None,
-        time_column=None,
-        additional_filters=None,
-        **kwargs,
-    ):
-        raise NotImplementedError()
-
     def write_dataframe(
         self, df, key_column=None, timestamp_key=None, chunk_id=0, **kwargs
     ):
@@ -1834,6 +1787,7 @@ class CustomTarget(BaseStoreTarget):
     is_online = False
     support_spark = False
     support_storey = True
+    support_pandas = True

     def __init__(
         self,
@@ -1869,6 +1823,7 @@ class CustomTarget(BaseStoreTarget):
 class DFTarget(BaseStoreTarget):
     kind = TargetTypes.dataframe
     support_storey = True
+    support_pandas = True

     def __init__(self, *args, name="dataframe", **kwargs):
         self._df = None
@@ -1931,6 +1886,7 @@ class SQLTarget(BaseStoreTarget):
     is_online = True
     support_spark = False
     support_storey = True
+    support_pandas = True

     def __init__(
         self,
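
Note: the targets.py hunks above replace the per-target as_df stubs (which each raised NotImplementedError) with a single support_pandas class flag that BaseStoreTarget.as_df now checks. A minimal sketch of the resulting behavior, using only class names taken from the diff:

import mlrun.datastore.targets as targets

# Offline targets that can be read back as a DataFrame now advertise it explicitly
assert targets.ParquetTarget.support_pandas
assert targets.CSVTarget.support_pandas

# Stream/NoSQL-style targets keep the BaseStoreTarget default of False, so
# BaseStoreTarget.as_df() raises NotImplementedError for them instead of each
# class carrying its own stub override
assert not targets.StreamTarget.support_pandas
assert not targets.KafkaTarget.support_pandas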
mlrun/db/httpdb.py CHANGED
@@ -3475,7 +3475,7 @@ class HTTPRunDB(RunDBInterface):
         if response.status_code == http.HTTPStatus.ACCEPTED:
             if delete_resources:
                 logger.info(
-                    "Model Monitoring is being disable",
+                    "Model Monitoring is being disabled",
                     project_name=project,
                 )
             if delete_user_applications:
@@ -4216,6 +4216,9 @@ class HTTPRunDB(RunDBInterface):
         :param project: The project that the alert belongs to.
         :returns: The created/modified alert.
         """
+        if not alert_data:
+            raise mlrun.errors.MLRunInvalidArgumentError("Alert data must be provided")
+
         project = project or config.default_project
         endpoint_path = f"projects/{project}/alerts/{alert_name}"
         error_message = f"put alert {project}/alerts/{alert_name}"
@@ -4224,6 +4227,8 @@ class HTTPRunDB(RunDBInterface):
             if isinstance(alert_data, AlertConfig)
             else AlertConfig.from_dict(alert_data)
         )
+        # Validation is necessary here because users can directly invoke this function
+        # through `mlrun.get_run_db().store_alert_config()`.
        alert_instance.validate_required_fields()

        alert_data = alert_instance.to_dict()
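
Note: with the added guard, an empty alert payload now fails on the client before any request is made. A rough usage sketch (the method and error names come from the diff; the exact store_alert_config signature is not shown here):

import mlrun

db = mlrun.get_run_db()
try:
    # Passing no alert data now raises immediately instead of reaching the API
    db.store_alert_config("my-alert", alert_data=None)
except mlrun.errors.MLRunInvalidArgumentError as exc:
    print(exc)  # "Alert data must be provided"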
mlrun/errors.py CHANGED
@@ -209,6 +209,14 @@ class MLRunInvalidMMStoreType(MLRunHTTPStatusError, ValueError):
     error_status_code = HTTPStatus.BAD_REQUEST.value


+class MLRunStreamConnectionFailure(MLRunHTTPStatusError, ValueError):
+    error_status_code = HTTPStatus.BAD_REQUEST.value
+
+
+class MLRunTSDBConnectionFailure(MLRunHTTPStatusError, ValueError):
+    error_status_code = HTTPStatus.BAD_REQUEST.value
+
+
 class MLRunRetryExhaustedError(Exception):
     pass

mlrun/execution.py CHANGED
@@ -921,6 +921,13 @@ class MLClientCtx:
             updates, self._uid, self.project, iter=self._iteration
         )

+    def get_notifications(self):
+        """Get the list of notifications"""
+        return [
+            mlrun.model.Notification.from_dict(notification)
+            for notification in self._notifications
+        ]
+
     def to_dict(self):
         """Convert the run context to a dictionary"""

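Note: a hypothetical handler sketch showing how the new accessor might be used inside a job (the handler name and printing are illustrative, not from the diff):

from mlrun.execution import MLClientCtx

def handler(context: MLClientCtx):
    # New in this release: the run's notifications come back as
    # mlrun.model.Notification objects instead of raw dicts
    for notification in context.get_notifications():
        print(notification.name, notification.kind, notification.when)
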
mlrun/feature_store/api.py CHANGED
@@ -230,6 +230,11 @@ def _get_offline_features(
             "entity_timestamp_column param "
             "can not be specified without entity_rows param"
         )
+    if isinstance(target, BaseStoreTarget) and not target.support_pandas:
+        raise mlrun.errors.MLRunInvalidArgumentError(
+            f"get_offline_features does not support targets that do not support pandas engine."
+            f" Target kind: {target.kind}"
+        )

     if isinstance(feature_vector, FeatureVector):
         update_stats = True
mlrun/feature_store/common.py CHANGED
@@ -37,17 +37,12 @@ def parse_feature_string(feature):
         raise mlrun.errors.MLRunInvalidArgumentError(
             f"feature {feature} must be {expected_message}"
         )
-    splitted = feature.split(feature_separator)
-    if len(splitted) > 2:
-        raise mlrun.errors.MLRunInvalidArgumentError(
-            f"feature {feature} must be {expected_message}, cannot have more than one '.'"
-        )
-    feature_set = splitted[0]
-    feature_name = splitted[1]
-    splitted = feature_name.split(" as ")
-    if len(splitted) > 1:
-        return feature_set.strip(), splitted[0].strip(), splitted[1].strip()
-    return feature_set.strip(), feature_name.strip(), None
+    feature_set, feature_name = feature.rsplit(feature_separator, 1)
+    feature_set = feature_set.strip()
+    split_result = feature_name.split(" as ", 1)
+    feature_name = split_result[0].strip()
+    alias = split_result[1].strip() if len(split_result) > 1 else None
+    return feature_set, feature_name, alias


 def parse_project_name_from_feature_string(feature):
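
Note: the rewrite splits on the last separator (rsplit) instead of rejecting strings with more than one separator, and caps the " as " split at a single alias. Illustrative calls (feature-set and feature names made up):

from mlrun.feature_store.common import parse_feature_string

parse_feature_string("stocks.price")           # -> ("stocks", "price", None)
parse_feature_string("stocks.price as close")  # -> ("stocks", "price", "close")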
mlrun/feature_store/retrieval/job.py CHANGED
@@ -181,6 +181,7 @@ class RemoteVectorResponse:
         file_format = kwargs.get("format")
         if not file_format:
             file_format = self.run.status.results["target"]["kind"]
+
         df = mlrun.get_dataitem(self.target_uri).as_df(
             columns=columns, df_module=df_module, format=file_format, **kwargs
         )
mlrun/model.py CHANGED
@@ -679,7 +679,24 @@ class ImageBuilder(ModelObj):


 class Notification(ModelObj):
-    """Notification specification"""
+    """Notification object
+
+    :param kind: notification implementation kind - slack, webhook, etc.
+    :param name: for logging and identification
+    :param message: message content in the notification
+    :param severity: severity to display in the notification
+    :param when: list of statuses to trigger the notification: 'running', 'completed', 'error'
+    :param condition: optional condition to trigger the notification, a jinja2 expression that can use run data
+                      to evaluate if the notification should be sent in addition to the 'when' statuses.
+                      e.g.: '{{ run["status"]["results"]["accuracy"] < 0.9}}'
+    :param params: Implementation specific parameters for the notification implementation (e.g. slack webhook url,
+                   git repository details, etc.)
+    :param secret_params: secret parameters for the notification implementation, same as params but will be stored
+                          in a k8s secret and passed as a secret reference to the implementation.
+    :param status: notification status - pending, sent, error
+    :param sent_time: time the notification was sent
+    :param reason: failure reason if the notification failed to send
+    """

     def __init__(
         self,
@@ -1468,7 +1485,11 @@ class RunObject(RunTemplate):
     @property
     def error(self) -> str:
         """error string if failed"""
-        if self.status:
+        if (
+            self.status
+            and self.status.state
+            in mlrun.common.runtimes.constants.RunStates.error_and_abortion_states()
+        ):
             unknown_error = ""
             if (
                 self.status.state
@@ -1484,8 +1505,8 @@ class RunObject(RunTemplate):

             return (
                 self.status.error
-                or self.status.reason
                 or self.status.status_text
+                or self.status.reason
                 or unknown_error
             )
         return ""
@@ -1789,6 +1810,11 @@ class RunObject(RunTemplate):

         return state

+    def abort(self):
+        """abort the run"""
+        db = mlrun.get_run_db()
+        db.abort_run(self.metadata.uid, self.metadata.project)
+
     @staticmethod
     def create_uri(project: str, uid: str, iteration: Union[int, str], tag: str = ""):
         if tag:
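
Note: a short sketch of the Notification fields now documented above and of the new RunObject.abort() helper; the field values and the run object are illustrative, not taken from the diff:

from mlrun.model import Notification

notification = Notification(
    kind="slack",
    name="training-alerts",
    message="Training run finished",
    severity="info",
    when=["completed", "error"],
    secret_params={"webhook": "https://hooks.slack.com/services/..."},
)

# For a RunObject `run`:
#   run.abort()       # calls abort_run on the run DB for this run's uid/project
#   print(run.error)  # now non-empty only when the run is in an error/aborted state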
mlrun/model_monitoring/api.py CHANGED
@@ -616,7 +616,16 @@ def _create_model_monitoring_function_base(
         app_step = prepare_step.to(class_name=application_class, **application_kwargs)
     else:
         app_step = prepare_step.to(class_name=application_class)
+
     app_step.__class__ = mlrun.serving.MonitoringApplicationStep
+
+    app_step.error_handler(
+        name="ApplicationErrorHandler",
+        class_name="mlrun.model_monitoring.applications._application_steps._ApplicationErrorHandler",
+        full_event=True,
+        project=project,
+    )
+
     app_step.to(
         class_name="mlrun.model_monitoring.applications._application_steps._PushToMonitoringWriter",
         name="PushToMonitoringWriter",
mlrun/model_monitoring/applications/_application_steps.py CHANGED
@@ -17,6 +17,7 @@ from typing import Optional

 import mlrun.common.helpers
 import mlrun.common.model_monitoring.helpers
+import mlrun.common.schemas.alert as alert_objects
 import mlrun.common.schemas.model_monitoring.constants as mm_constant
 import mlrun.datastore
 import mlrun.serving
@@ -164,3 +165,38 @@ class _PrepareMonitoringEvent(StepToDict):
         )
         context.__class__ = MonitoringApplicationContext
         return context
+
+
+class _ApplicationErrorHandler(StepToDict):
+    def __init__(self, project: str, name: Optional[str] = None):
+        self.project = project
+        self.name = name or "ApplicationErrorHandler"
+
+    def do(self, event):
+        """
+        Handle model monitoring application error. This step will generate an event, describing the error.
+
+        :param event: Application event.
+        """
+
+        logger.error(f"Error in application step: {event}")
+
+        event_data = mlrun.common.schemas.Event(
+            kind=alert_objects.EventKind.MM_APP_FAILED,
+            entity={
+                "kind": alert_objects.EventEntityKind.MODEL_MONITORING_APPLICATION,
+                "project": self.project,
+                "ids": [f"{self.project}_{event.body.application_name}"],
+            },
+            value_dict={
+                "Error": event.error,
+                "Timestamp": event.timestamp,
+                "Application Class": event.body.application_name,
+                "Endpoint ID": event.body.endpoint_id,
+            },
+        )
+
+        mlrun.get_run_db().generate_event(
+            name=alert_objects.EventKind.MM_APP_FAILED, event_data=event_data
+        )
+        logger.info("Event generated successfully")
mlrun/model_monitoring/applications/histogram_data_drift.py CHANGED
@@ -91,7 +91,9 @@ class HistogramDataDriftApplication(ModelMonitoringApplicationBaseV2):
     """
    MLRun's default data drift application for model monitoring.

-    The application expects tabular numerical data, and calculates three metrics over the features' histograms.
+    The application expects tabular numerical data, and calculates three metrics over the shared features' histograms.
+    The metrics are calculated on features that have reference data from the training dataset. When there is no
+    reference data (`feature_stats`), this application send a warning log and does nothing.
     The three metrics are:

     * Hellinger distance.
@@ -112,6 +114,7 @@ class HistogramDataDriftApplication(ModelMonitoringApplicationBaseV2):

         project.enable_model_monitoring()

+    To avoid it, pass `deploy_histogram_data_drift_app=False`.
     """

     NAME: Final[str] = HistogramDataDriftApplicationConstants.NAME
@@ -223,19 +226,18 @@ class HistogramDataDriftApplication(ModelMonitoringApplicationBaseV2):
         return metrics

     @staticmethod
-    def _remove_timestamp_feature(
-        sample_set_statistics: mlrun.common.model_monitoring.helpers.FeatureStats,
+    def _get_shared_features_sample_stats(
+        monitoring_context: mm_context.MonitoringApplicationContext,
     ) -> mlrun.common.model_monitoring.helpers.FeatureStats:
         """
-        Drop the 'timestamp' feature if it exists, as it is irrelevant
-        in the plotly artifact
+        Filter out features without reference data in `feature_stats`, e.g. `timestamp`.
         """
-        sample_set_statistics = mlrun.common.model_monitoring.helpers.FeatureStats(
-            sample_set_statistics.copy()
+        return mlrun.common.model_monitoring.helpers.FeatureStats(
+            {
+                key: monitoring_context.sample_df_stats[key]
+                for key in monitoring_context.feature_stats
+            }
         )
-        if EventFieldType.TIMESTAMP in sample_set_statistics:
-            del sample_set_statistics[EventFieldType.TIMESTAMP]
-        return sample_set_statistics

     @staticmethod
     def _log_json_artifact(
@@ -299,8 +301,8 @@ class HistogramDataDriftApplication(ModelMonitoringApplicationBaseV2):
         self._log_json_artifact(drift_per_feature_values, monitoring_context)

         self._log_plotly_table_artifact(
-            sample_set_statistics=self._remove_timestamp_feature(
-                monitoring_context.sample_df_stats
+            sample_set_statistics=self._get_shared_features_sample_stats(
+                monitoring_context
             ),
             inputs_statistics=monitoring_context.feature_stats,
             metrics_per_feature=metrics_per_feature,
@@ -325,7 +327,7 @@ class HistogramDataDriftApplication(ModelMonitoringApplicationBaseV2):
         """
         monitoring_context.logger.debug("Starting to run the application")
         if not monitoring_context.feature_stats:
-            monitoring_context.logger.info(
+            monitoring_context.logger.warning(
                 "No feature statistics found, skipping the application. \n"
                 "In order to run the application, training set must be provided when logging the model."
             )
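
Note: the docstring addition above points at the opt-out for the default drift application. In project code that would look roughly like this (the project name is illustrative):

import mlrun

project = mlrun.get_or_create_project("my-project")
# Enable model monitoring without deploying the default histogram data drift app
project.enable_model_monitoring(deploy_histogram_data_drift_app=False)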
mlrun/model_monitoring/controller.py CHANGED
@@ -335,19 +335,23 @@ class MonitoringApplicationController:
             return
         monitoring_functions = self.project_obj.list_model_monitoring_functions()
         if monitoring_functions:
-            # Gets only application in ready state
             applications_names = list(
-                {
-                    app.metadata.name
-                    for app in monitoring_functions
-                    if (
-                        app.status.state == "ready"
-                        # workaround for the default app, as its `status.state` is `None`
-                        or app.metadata.name
-                        == mm_constants.HistogramDataDriftApplicationConstants.NAME
-                    )
-                }
+                {app.metadata.name for app in monitoring_functions}
             )
+            # if monitoring_functions: - TODO : ML-7700
+            # Gets only application in ready state
+            # applications_names = list(
+            #     {
+            #         app.metadata.name
+            #         for app in monitoring_functions
+            #         if (
+            #             app.status.state == "ready"
+            #             # workaround for the default app, as its `status.state` is `None`
+            #             or app.metadata.name
+            #             == mm_constants.HistogramDataDriftApplicationConstants.NAME
+            #         )
+            #     }
+            # )
         if not applications_names:
             logger.info("No monitoring functions found", project=self.project)
             return
mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py CHANGED
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+import http
 import json
 import typing
 from dataclasses import dataclass
@@ -34,11 +34,11 @@ fields_to_encode_decode = [
 ]

 _METRIC_FIELDS: list[str] = [
-    mm_schemas.WriterEvent.APPLICATION_NAME,
-    mm_schemas.MetricData.METRIC_NAME,
-    mm_schemas.MetricData.METRIC_VALUE,
-    mm_schemas.WriterEvent.START_INFER_TIME,
-    mm_schemas.WriterEvent.END_INFER_TIME,
+    mm_schemas.WriterEvent.APPLICATION_NAME.value,
+    mm_schemas.MetricData.METRIC_NAME.value,
+    mm_schemas.MetricData.METRIC_VALUE.value,
+    mm_schemas.WriterEvent.START_INFER_TIME.value,
+    mm_schemas.WriterEvent.END_INFER_TIME.value,
 ]


@@ -417,11 +417,14 @@ class KVStoreBase(StoreBase):
             )
             return response.output.item[mm_schemas.SchedulingKeys.LAST_ANALYZED]
         except v3io.dataplane.response.HttpResponseError as err:
-            logger.debug("Error while getting last analyzed time", err=err)
-            raise mlrun.errors.MLRunNotFoundError(
-                f"No last analyzed value has been found for {application_name} "
-                f"that processes model endpoint {endpoint_id}",
-            )
+            if err.status_code == http.HTTPStatus.NOT_FOUND:
+                logger.debug("Last analyzed time not found", err=err)
+                raise mlrun.errors.MLRunNotFoundError(
+                    f"No last analyzed value has been found for {application_name} "
+                    f"that processes model endpoint {endpoint_id}",
+                )
+            logger.error("Error while getting last analyzed time", err=err)
+            raise err

     def update_last_analyzed(
         self, endpoint_id: str, application_name: str, last_analyzed: int
mlrun/model_monitoring/db/tsdb/base.py CHANGED
@@ -15,6 +15,7 @@
 import typing
 from abc import ABC, abstractmethod
 from datetime import datetime
+from typing import Union

 import pandas as pd
 import pydantic
@@ -47,7 +48,7 @@ class TSDBConnector(ABC):
         self.project = project

     @abstractmethod
-    def apply_monitoring_stream_steps(self, graph):
+    def apply_monitoring_stream_steps(self, graph) -> None:
         """
         Apply TSDB steps on the provided monitoring graph. Throughout these steps, the graph stores live data of
         different key metric dictionaries. This data is being used by the monitoring dashboards in
@@ -59,6 +60,14 @@ class TSDBConnector(ABC):
         """
         pass

+    @abstractmethod
+    def handle_model_error(self, graph, **kwargs) -> None:
+        """
+        Adds a branch to the stream pod graph to handle events that
+        arrive with errors from the model server and saves them to the error TSDB table.
+        The first step that generates by this method should come after `ForwardError` step.
+        """
+
     @abstractmethod
     def write_application_event(
         self,
@@ -181,6 +190,117 @@ class TSDBConnector(ABC):
         :return: Metric values object or no data object.
         """

+    @abstractmethod
+    def get_last_request(
+        self,
+        endpoint_ids: Union[str, list[str]],
+        start: Union[datetime, str] = "0",
+        end: Union[datetime, str] = "now",
+    ) -> pd.DataFrame:
+        """
+        Fetches data from the predictions TSDB table and returns the most recent request
+        timestamp for each specified endpoint.
+
+        :param endpoint_ids: A list of model endpoint identifiers.
+        :param start: The start time for the query.
+        :param end: The end time for the query.
+
+        :return: A pd.DataFrame containing the columns [endpoint_id, last_request, last_latency].
+            If an endpoint has not been invoked within the specified time range, it will not appear in the result.
+        """
+
+    @abstractmethod
+    def get_drift_status(
+        self,
+        endpoint_ids: Union[str, list[str]],
+        start: Union[datetime, str] = "now-24h",
+        end: Union[datetime, str] = "now",
+    ) -> pd.DataFrame:
+        """
+        Fetches data from the app-results TSDB table and returns the highest status among all
+        the result in the provided time range, which by default is the last 24 hours, for each specified endpoint.
+
+        :param endpoint_ids: A list of model endpoint identifiers.
+        :param start: The start time for the query.
+        :param end: The end time for the query.
+
+        :return: A pd.DataFrame containing the columns [result_status, endpoint_id].
+            If an endpoint has not been monitored within the specified time range (last 24 hours),
+            it will not appear in the result.
+        """
+
+    @abstractmethod
+    def get_metrics_metadata(
+        self,
+        endpoint_id: str,
+        start: Union[datetime, str] = "0",
+        end: Union[datetime, str] = "now",
+    ) -> pd.DataFrame:
+        """
+        Fetches distinct metrics metadata from the metrics TSDB table for a specified model endpoint.
+
+        :param endpoint_id: The model endpoint identifier.
+        :param start: The start time of the query.
+        :param end: The end time of the query.
+
+        :return: A pd.DataFrame containing all distinct metrics for the specified endpoint within the given time range.
+            Containing the columns [application_name, metric_name, endpoint_id]
+        """
+
+    @abstractmethod
+    def get_results_metadata(
+        self,
+        endpoint_id: str,
+        start: Union[datetime, str] = "0",
+        end: Union[datetime, str] = "now",
+    ) -> pd.DataFrame:
+        """
+        Fetches distinct results metadata from the app-results TSDB table for a specified model endpoint.
+
+        :param endpoint_id: The model endpoint identifier.
+        :param start: The start time of the query.
+        :param end: The end time of the query.
+
+        :return: A pd.DataFrame containing all distinct results for the specified endpoint within the given time range.
+            Containing the columns [application_name, result_name, result_kind, endpoint_id]
+        """
+
+    @abstractmethod
+    def get_error_count(
+        self,
+        endpoint_ids: Union[str, list[str]],
+        start: Union[datetime, str] = "0",
+        end: Union[datetime, str] = "now",
+    ) -> pd.DataFrame:
+        """
+        Fetches data from the error TSDB table and returns the error count for each specified endpoint.
+
+        :param endpoint_ids: A list of model endpoint identifiers.
+        :param start: The start time for the query.
+        :param end: The end time for the query.
+
+        :return: A pd.DataFrame containing the columns [error_count, endpoint_id].
+            If an endpoint have not raised error within the specified time range, it will not appear in the result.
+        """
+
+    @abstractmethod
+    def get_avg_latency(
+        self,
+        endpoint_ids: Union[str, list[str]],
+        start: Union[datetime, str] = "0",
+        end: Union[datetime, str] = "now",
+    ) -> pd.DataFrame:
+        """
+        Fetches data from the predictions TSDB table and returns the average latency for each specified endpoint
+
+        :param endpoint_ids: A list of model endpoint identifiers.
+        :param start: The start time for the query.
+        :param end: The end time for the query.
+
+        :return: A pd.DataFrame containing the columns [avg_latency, endpoint_id].
+            If an endpoint has not been invoked within the specified time range, it will not appear in the result.
+        """
+
     @staticmethod
     def df_to_metrics_values(
         *,
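
Note: a minimal sketch of how the new TSDBConnector query surface might be used. It assumes a concrete connector instance is available (e.g. the V3IO or TDEngine connector also changed in this release); the helper name is made up:

import pandas as pd

from mlrun.model_monitoring.db.tsdb.base import TSDBConnector

def summarize_endpoints(connector: TSDBConnector, endpoint_ids: list[str]) -> pd.DataFrame:
    # Pull the per-endpoint summaries added in this release and join them on endpoint_id;
    # endpoints with no data in the queried window are simply absent from each frame
    last_requests = connector.get_last_request(endpoint_ids=endpoint_ids)
    drift = connector.get_drift_status(endpoint_ids=endpoint_ids)  # last 24h by default
    errors = connector.get_error_count(endpoint_ids=endpoint_ids)
    latency = connector.get_avg_latency(endpoint_ids=endpoint_ids)

    summary = last_requests
    for frame in (drift, errors, latency):
        summary = summary.merge(frame, on="endpoint_id", how="outer")
    return summary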