mlrun 1.7.0rc13__py3-none-any.whl → 1.7.0rc21__py3-none-any.whl

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mlrun might be problematic. Click here for more details.

Files changed (156)
  1. mlrun/__init__.py +10 -1
  2. mlrun/__main__.py +23 -111
  3. mlrun/alerts/__init__.py +15 -0
  4. mlrun/alerts/alert.py +144 -0
  5. mlrun/api/schemas/__init__.py +4 -3
  6. mlrun/artifacts/__init__.py +8 -3
  7. mlrun/artifacts/base.py +36 -253
  8. mlrun/artifacts/dataset.py +9 -190
  9. mlrun/artifacts/manager.py +46 -42
  10. mlrun/artifacts/model.py +9 -141
  11. mlrun/artifacts/plots.py +14 -375
  12. mlrun/common/constants.py +65 -3
  13. mlrun/common/formatters/__init__.py +19 -0
  14. mlrun/{runtimes/mpijob/v1alpha1.py → common/formatters/artifact.py} +6 -14
  15. mlrun/common/formatters/base.py +113 -0
  16. mlrun/common/formatters/function.py +46 -0
  17. mlrun/common/formatters/pipeline.py +53 -0
  18. mlrun/common/formatters/project.py +51 -0
  19. mlrun/{runtimes → common/runtimes}/constants.py +32 -4
  20. mlrun/common/schemas/__init__.py +10 -5
  21. mlrun/common/schemas/alert.py +92 -11
  22. mlrun/common/schemas/api_gateway.py +56 -0
  23. mlrun/common/schemas/artifact.py +15 -5
  24. mlrun/common/schemas/auth.py +2 -0
  25. mlrun/common/schemas/client_spec.py +1 -0
  26. mlrun/common/schemas/frontend_spec.py +1 -0
  27. mlrun/common/schemas/function.py +4 -0
  28. mlrun/common/schemas/model_monitoring/__init__.py +15 -3
  29. mlrun/common/schemas/model_monitoring/constants.py +58 -7
  30. mlrun/common/schemas/model_monitoring/grafana.py +9 -5
  31. mlrun/common/schemas/model_monitoring/model_endpoints.py +86 -2
  32. mlrun/common/schemas/pipeline.py +0 -9
  33. mlrun/common/schemas/project.py +6 -11
  34. mlrun/common/types.py +1 -0
  35. mlrun/config.py +36 -8
  36. mlrun/data_types/to_pandas.py +9 -9
  37. mlrun/datastore/base.py +41 -9
  38. mlrun/datastore/datastore.py +6 -2
  39. mlrun/datastore/datastore_profile.py +56 -4
  40. mlrun/datastore/hdfs.py +5 -0
  41. mlrun/datastore/inmem.py +2 -2
  42. mlrun/datastore/redis.py +2 -2
  43. mlrun/datastore/s3.py +5 -0
  44. mlrun/datastore/sources.py +147 -7
  45. mlrun/datastore/store_resources.py +7 -7
  46. mlrun/datastore/targets.py +129 -9
  47. mlrun/datastore/utils.py +42 -0
  48. mlrun/datastore/v3io.py +1 -1
  49. mlrun/db/auth_utils.py +152 -0
  50. mlrun/db/base.py +55 -11
  51. mlrun/db/httpdb.py +346 -107
  52. mlrun/db/nopdb.py +52 -10
  53. mlrun/errors.py +11 -0
  54. mlrun/execution.py +24 -9
  55. mlrun/feature_store/__init__.py +0 -2
  56. mlrun/feature_store/api.py +12 -47
  57. mlrun/feature_store/feature_set.py +9 -0
  58. mlrun/feature_store/feature_vector.py +8 -0
  59. mlrun/feature_store/ingestion.py +7 -6
  60. mlrun/feature_store/retrieval/base.py +9 -4
  61. mlrun/feature_store/retrieval/conversion.py +9 -9
  62. mlrun/feature_store/retrieval/dask_merger.py +2 -0
  63. mlrun/feature_store/retrieval/job.py +9 -3
  64. mlrun/feature_store/retrieval/local_merger.py +2 -0
  65. mlrun/feature_store/retrieval/spark_merger.py +16 -0
  66. mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +7 -12
  67. mlrun/frameworks/parallel_coordinates.py +2 -1
  68. mlrun/frameworks/tf_keras/__init__.py +4 -1
  69. mlrun/k8s_utils.py +10 -11
  70. mlrun/launcher/base.py +4 -3
  71. mlrun/launcher/client.py +5 -3
  72. mlrun/launcher/local.py +8 -2
  73. mlrun/launcher/remote.py +8 -2
  74. mlrun/lists.py +6 -2
  75. mlrun/model.py +62 -20
  76. mlrun/model_monitoring/__init__.py +1 -1
  77. mlrun/model_monitoring/api.py +41 -18
  78. mlrun/model_monitoring/application.py +5 -305
  79. mlrun/model_monitoring/applications/__init__.py +11 -0
  80. mlrun/model_monitoring/applications/_application_steps.py +157 -0
  81. mlrun/model_monitoring/applications/base.py +280 -0
  82. mlrun/model_monitoring/applications/context.py +214 -0
  83. mlrun/model_monitoring/applications/evidently_base.py +211 -0
  84. mlrun/model_monitoring/applications/histogram_data_drift.py +132 -91
  85. mlrun/model_monitoring/applications/results.py +99 -0
  86. mlrun/model_monitoring/controller.py +3 -1
  87. mlrun/model_monitoring/db/__init__.py +2 -0
  88. mlrun/model_monitoring/db/stores/__init__.py +0 -2
  89. mlrun/model_monitoring/db/stores/base/store.py +22 -37
  90. mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +43 -21
  91. mlrun/model_monitoring/db/stores/sqldb/models/base.py +39 -8
  92. mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +27 -7
  93. mlrun/model_monitoring/db/stores/sqldb/models/sqlite.py +5 -0
  94. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +246 -224
  95. mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +232 -216
  96. mlrun/model_monitoring/db/tsdb/__init__.py +100 -0
  97. mlrun/model_monitoring/db/tsdb/base.py +329 -0
  98. mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
  99. mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
  100. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +240 -0
  101. mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +45 -0
  102. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +397 -0
  103. mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
  104. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +117 -0
  105. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +636 -0
  106. mlrun/model_monitoring/evidently_application.py +6 -118
  107. mlrun/model_monitoring/helpers.py +46 -1
  108. mlrun/model_monitoring/model_endpoint.py +3 -2
  109. mlrun/model_monitoring/stream_processing.py +57 -216
  110. mlrun/model_monitoring/writer.py +134 -124
  111. mlrun/package/utils/_formatter.py +2 -2
  112. mlrun/platforms/__init__.py +10 -9
  113. mlrun/platforms/iguazio.py +21 -202
  114. mlrun/projects/operations.py +19 -12
  115. mlrun/projects/pipelines.py +103 -109
  116. mlrun/projects/project.py +377 -137
  117. mlrun/render.py +15 -14
  118. mlrun/run.py +16 -47
  119. mlrun/runtimes/__init__.py +6 -3
  120. mlrun/runtimes/base.py +8 -7
  121. mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
  122. mlrun/runtimes/funcdoc.py +0 -28
  123. mlrun/runtimes/kubejob.py +2 -1
  124. mlrun/runtimes/local.py +5 -2
  125. mlrun/runtimes/mpijob/__init__.py +0 -20
  126. mlrun/runtimes/mpijob/v1.py +1 -1
  127. mlrun/runtimes/nuclio/api_gateway.py +440 -208
  128. mlrun/runtimes/nuclio/application/application.py +170 -8
  129. mlrun/runtimes/nuclio/function.py +39 -49
  130. mlrun/runtimes/pod.py +21 -41
  131. mlrun/runtimes/remotesparkjob.py +9 -3
  132. mlrun/runtimes/sparkjob/spark3job.py +1 -1
  133. mlrun/runtimes/utils.py +6 -45
  134. mlrun/serving/server.py +2 -1
  135. mlrun/serving/states.py +53 -2
  136. mlrun/serving/v2_serving.py +5 -1
  137. mlrun/track/tracker.py +2 -1
  138. mlrun/utils/async_http.py +25 -5
  139. mlrun/utils/helpers.py +107 -75
  140. mlrun/utils/logger.py +39 -7
  141. mlrun/utils/notifications/notification/__init__.py +14 -9
  142. mlrun/utils/notifications/notification/base.py +1 -1
  143. mlrun/utils/notifications/notification/slack.py +61 -13
  144. mlrun/utils/notifications/notification/webhook.py +1 -1
  145. mlrun/utils/notifications/notification_pusher.py +147 -16
  146. mlrun/utils/regex.py +9 -0
  147. mlrun/utils/v3io_clients.py +0 -1
  148. mlrun/utils/version/version.json +2 -2
  149. {mlrun-1.7.0rc13.dist-info → mlrun-1.7.0rc21.dist-info}/METADATA +14 -6
  150. {mlrun-1.7.0rc13.dist-info → mlrun-1.7.0rc21.dist-info}/RECORD +154 -133
  151. mlrun/kfpops.py +0 -865
  152. mlrun/platforms/other.py +0 -305
  153. {mlrun-1.7.0rc13.dist-info → mlrun-1.7.0rc21.dist-info}/LICENSE +0 -0
  154. {mlrun-1.7.0rc13.dist-info → mlrun-1.7.0rc21.dist-info}/WHEEL +0 -0
  155. {mlrun-1.7.0rc13.dist-info → mlrun-1.7.0rc21.dist-info}/entry_points.txt +0 -0
  156. {mlrun-1.7.0rc13.dist-info → mlrun-1.7.0rc21.dist-info}/top_level.txt +0 -0
@@ -11,37 +11,91 @@
11
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
- #
15
14
 
16
15
  import json
17
16
  import os
18
17
  import typing
18
+ from dataclasses import dataclass
19
19
  from http import HTTPStatus
20
20
 
21
21
  import v3io.dataplane
22
+ import v3io.dataplane.output
22
23
  import v3io.dataplane.response
23
- import v3io_frames
24
24
 
25
25
  import mlrun.common.model_monitoring.helpers
26
- import mlrun.common.schemas.model_monitoring
26
+ import mlrun.common.schemas.model_monitoring as mm_schemas
27
27
  import mlrun.model_monitoring.db
28
28
  import mlrun.utils.v3io_clients
29
29
  from mlrun.utils import logger
30
30
 
31
31
  # Fields to encode before storing in the KV table or to decode after retrieving
32
32
  fields_to_encode_decode = [
33
- mlrun.common.schemas.model_monitoring.EventFieldType.FEATURE_STATS,
34
- mlrun.common.schemas.model_monitoring.EventFieldType.CURRENT_STATS,
33
+ mm_schemas.EventFieldType.FEATURE_STATS,
34
+ mm_schemas.EventFieldType.CURRENT_STATS,
35
+ ]
36
+
37
+ _METRIC_FIELDS: list[str] = [
38
+ mm_schemas.WriterEvent.APPLICATION_NAME,
39
+ mm_schemas.MetricData.METRIC_NAME,
40
+ mm_schemas.MetricData.METRIC_VALUE,
41
+ mm_schemas.WriterEvent.START_INFER_TIME,
42
+ mm_schemas.WriterEvent.END_INFER_TIME,
43
+ ]
44
+
45
+
46
+ class SchemaField(typing.TypedDict):
47
+ name: str
48
+ type: str
49
+ nullable: bool
50
+
51
+
52
+ @dataclass
53
+ class SchemaParams:
54
+ key: str
55
+ fields: list[SchemaField]
56
+
57
+
58
+ _RESULT_SCHEMA: list[SchemaField] = [
59
+ SchemaField(
60
+ name=mm_schemas.ResultData.RESULT_NAME,
61
+ type=mm_schemas.GrafanaColumnType.STRING,
62
+ nullable=False,
63
+ )
64
+ ]
65
+
66
+ _METRIC_SCHEMA: list[SchemaField] = [
67
+ SchemaField(
68
+ name=mm_schemas.WriterEvent.APPLICATION_NAME,
69
+ type=mm_schemas.GrafanaColumnType.STRING,
70
+ nullable=False,
71
+ ),
72
+ SchemaField(
73
+ name=mm_schemas.MetricData.METRIC_NAME,
74
+ type=mm_schemas.GrafanaColumnType.STRING,
75
+ nullable=False,
76
+ ),
35
77
  ]
36
78
 
37
79
 
80
+ _KIND_TO_SCHEMA_PARAMS: dict[mm_schemas.WriterEventKind, SchemaParams] = {
81
+ mm_schemas.WriterEventKind.RESULT: SchemaParams(
82
+ key=mm_schemas.WriterEvent.APPLICATION_NAME, fields=_RESULT_SCHEMA
83
+ ),
84
+ mm_schemas.WriterEventKind.METRIC: SchemaParams(
85
+ key="metric_id", fields=_METRIC_SCHEMA
86
+ ),
87
+ }
88
+
89
+ _EXCLUDE_SCHEMA_FILTER_EXPRESSION = '__name!=".#schema"'
90
+
91
+
38
92
  class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
39
93
  """
40
94
  Handles the DB operations when the DB target is from type KV. For the KV operations, we use an instance of V3IO
41
95
  client and usually the KV table can be found under v3io:///users/pipelines/project-name/model-endpoints/endpoints/.
42
96
  """
43
97
 
44
- def __init__(self, project: str, access_key: str):
98
+ def __init__(self, project: str, access_key: typing.Optional[str] = None) -> None:
45
99
  super().__init__(project=project)
46
100
  # Initialize a V3IO client instance
47
101
  self.access_key = access_key or os.environ.get("V3IO_ACCESS_KEY")
@@ -66,7 +120,7 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
66
120
  self.client.kv.put(
67
121
  container=self.container,
68
122
  table_path=self.path,
69
- key=endpoint[mlrun.common.schemas.model_monitoring.EventFieldType.UID],
123
+ key=endpoint[mm_schemas.EventFieldType.UID],
70
124
  attributes=endpoint,
71
125
  )
72
126
 
@@ -153,7 +207,7 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
153
207
  """Getting path and container based on the model monitoring configurations"""
154
208
  path = mlrun.mlconf.model_endpoint_monitoring.store_prefixes.default.format(
155
209
  project=self.project,
156
- kind=mlrun.common.schemas.ModelMonitoringStoreKinds.ENDPOINTS,
210
+ kind=mm_schemas.ModelMonitoringStoreKinds.ENDPOINTS,
157
211
  )
158
212
  (
159
213
  _,
@@ -219,17 +273,11 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
219
273
  if uids is None:
220
274
  uids = []
221
275
  for item in items:
222
- if mlrun.common.schemas.model_monitoring.EventFieldType.UID not in item:
276
+ if mm_schemas.EventFieldType.UID not in item:
223
277
  # This is kept for backwards compatibility - in old versions the key column named endpoint_id
224
- uids.append(
225
- item[
226
- mlrun.common.schemas.model_monitoring.EventFieldType.ENDPOINT_ID
227
- ]
228
- )
278
+ uids.append(item[mm_schemas.EventFieldType.ENDPOINT_ID])
229
279
  else:
230
- uids.append(
231
- item[mlrun.common.schemas.model_monitoring.EventFieldType.UID]
232
- )
280
+ uids.append(item[mm_schemas.EventFieldType.UID])
233
281
 
234
282
  # Add each relevant model endpoint to the model endpoints list
235
283
  for endpoint_id in uids:
@@ -240,27 +288,20 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
240
288
 
241
289
  return endpoint_list
242
290
 
243
- def delete_model_endpoints_resources(self, endpoints: list[dict[str, typing.Any]]):
291
+ def delete_model_endpoints_resources(self):
244
292
  """
245
- Delete all model endpoints resources in both KV and the time series DB.
246
-
247
- :param endpoints: A list of model endpoints flattened dictionaries.
293
+ Delete all model endpoints resources in V3IO KV.
248
294
  """
249
295
 
296
+ endpoints = self.list_model_endpoints()
297
+
250
298
  # Delete model endpoint record from KV table
251
299
  for endpoint_dict in endpoints:
252
- if (
253
- mlrun.common.schemas.model_monitoring.EventFieldType.UID
254
- not in endpoint_dict
255
- ):
300
+ if mm_schemas.EventFieldType.UID not in endpoint_dict:
256
301
  # This is kept for backwards compatibility - in old versions the key column named endpoint_id
257
- endpoint_id = endpoint_dict[
258
- mlrun.common.schemas.model_monitoring.EventFieldType.ENDPOINT_ID
259
- ]
302
+ endpoint_id = endpoint_dict[mm_schemas.EventFieldType.ENDPOINT_ID]
260
303
  else:
261
- endpoint_id = endpoint_dict[
262
- mlrun.common.schemas.model_monitoring.EventFieldType.UID
263
- ]
304
+ endpoint_id = endpoint_dict[mm_schemas.EventFieldType.UID]
264
305
  self.delete_model_endpoint(
265
306
  endpoint_id,
266
307
  )
@@ -283,188 +324,84 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
283
324
  raise_for_status=v3io.dataplane.RaiseForStatus.never,
284
325
  )
285
326
 
286
- # Cleanup TSDB
287
- frames = self._get_frames_client()
288
-
289
- # Generate the required tsdb paths
290
- tsdb_path, filtered_path = self._generate_tsdb_paths()
327
+ @staticmethod
328
+ def _get_results_table_path(endpoint_id: str) -> str:
329
+ return endpoint_id
291
330
 
292
- # Delete time series DB resources
293
- try:
294
- frames.delete(
295
- backend=mlrun.common.schemas.model_monitoring.TimeSeriesTarget.TSDB,
296
- table=filtered_path,
297
- )
298
- except v3io_frames.errors.DeleteError as e:
299
- if "No TSDB schema file found" not in str(e):
300
- logger.warning(
301
- f"Failed to delete TSDB table '{filtered_path}'",
302
- err=mlrun.errors.err_to_str(e),
303
- )
304
- # Final cleanup of tsdb path
305
- tsdb_path.replace("://u", ":///u")
306
- store, _, _ = mlrun.store_manager.get_or_create_store(tsdb_path)
307
- store.rm(tsdb_path, recursive=True)
331
+ @staticmethod
332
+ def _get_metrics_table_path(endpoint_id: str) -> str:
333
+ return f"{endpoint_id}_metrics"
308
334
 
309
- def get_endpoint_real_time_metrics(
335
+ def write_application_event(
310
336
  self,
311
- endpoint_id: str,
312
- metrics: list[str],
313
- start: str = "now-1h",
314
- end: str = "now",
315
- access_key: str = None,
316
- ) -> dict[str, list[tuple[str, float]]]:
317
- """
318
- Getting metrics from the time series DB. There are pre-defined metrics for model endpoints such as
319
- `predictions_per_second` and `latency_avg_5m` but also custom metrics defined by the user.
320
-
321
- :param endpoint_id: The unique id of the model endpoint.
322
- :param metrics: A list of real-time metrics to return for the model endpoint.
323
- :param start: The start time of the metrics. Can be represented by a string containing an RFC 3339
324
- time, a Unix timestamp in milliseconds, a relative time (`'now'` or
325
- `'now-[0-9]+[mhd]'`, where `m` = minutes, `h` = hours, and `'d'` = days), or 0 for the
326
- earliest time.
327
- :param end: The end time of the metrics. Can be represented by a string containing an RFC 3339
328
- time, a Unix timestamp in milliseconds, a relative time (`'now'` or
329
- `'now-[0-9]+[mhd]'`, where `m` = minutes, `h` = hours, and `'d'` = days), or 0 for the
330
- earliest time.
331
- :param access_key: V3IO access key that will be used for generating Frames client object. If not
332
- provided, the access key will be retrieved from the environment variables.
333
-
334
- :return: A dictionary of metrics in which the key is a metric name and the value is a list of tuples that
335
- includes timestamps and the values.
336
- """
337
-
338
- # Initialize access key
339
- access_key = access_key or mlrun.mlconf.get_v3io_access_key()
340
-
341
- if not metrics:
342
- raise mlrun.errors.MLRunInvalidArgumentError(
343
- "Metric names must be provided"
344
- )
345
-
346
- # Initialize metrics mapping dictionary
347
- metrics_mapping = {}
348
-
349
- # Getting the path for the time series DB
350
- events_path = (
351
- mlrun.mlconf.model_endpoint_monitoring.store_prefixes.default.format(
352
- project=self.project,
353
- kind=mlrun.common.schemas.ModelMonitoringStoreKinds.EVENTS,
354
- )
355
- )
356
- (
357
- _,
358
- container,
359
- events_path,
360
- ) = mlrun.common.model_monitoring.helpers.parse_model_endpoint_store_prefix(
361
- events_path
362
- )
363
-
364
- # Retrieve the raw data from the time series DB based on the provided metrics and time ranges
365
- frames_client = mlrun.utils.v3io_clients.get_frames_client(
366
- token=access_key,
367
- address=mlrun.mlconf.v3io_framesd,
368
- container=container,
369
- )
370
-
371
- try:
372
- data = frames_client.read(
373
- backend=mlrun.common.schemas.model_monitoring.TimeSeriesTarget.TSDB,
374
- table=events_path,
375
- columns=["endpoint_id", *metrics],
376
- filter=f"endpoint_id=='{endpoint_id}'",
377
- start=start,
378
- end=end,
379
- )
380
-
381
- # Fill the metrics mapping dictionary with the metric name and values
382
- data_dict = data.to_dict()
383
- for metric in metrics:
384
- metric_data = data_dict.get(metric)
385
- if metric_data is None:
386
- continue
387
-
388
- values = [
389
- (str(timestamp), value) for timestamp, value in metric_data.items()
390
- ]
391
- metrics_mapping[metric] = values
392
-
393
- except v3io_frames.errors.ReadError:
394
- logger.warn("Failed to read tsdb", endpoint=endpoint_id)
395
-
396
- return metrics_mapping
397
-
398
- def write_application_result(self, event: dict[str, typing.Any]):
337
+ event: dict[str, typing.Any],
338
+ kind: mm_schemas.WriterEventKind = mm_schemas.WriterEventKind.RESULT,
339
+ ) -> None:
399
340
  """
400
- Write a new application result event in the target table.
341
+ Write a new application event in the target table.
401
342
 
402
343
  :param event: An event dictionary that represents the application result, should be corresponded to the
403
344
  schema defined in the :py:class:`~mlrun.common.schemas.model_monitoring.constants.WriterEvent`
404
345
  object.
346
+ :param kind: The type of the event, can be either "result" or "metric".
405
347
  """
406
- endpoint_id = event.pop(
407
- mlrun.common.schemas.model_monitoring.WriterEvent.ENDPOINT_ID
408
- )
409
- app_name = event.pop(
410
- mlrun.common.schemas.model_monitoring.WriterEvent.APPLICATION_NAME
411
- )
412
- metric_name = event.pop(
413
- mlrun.common.schemas.model_monitoring.WriterEvent.RESULT_NAME
414
- )
415
- attributes = {metric_name: json.dumps(event)}
416
348
 
417
- v3io_monitoring_apps_container = self.get_v3io_monitoring_apps_container(
418
- project_name=self.project
419
- )
349
+ container = self.get_v3io_monitoring_apps_container(project_name=self.project)
350
+ endpoint_id = event.pop(mm_schemas.WriterEvent.ENDPOINT_ID)
351
+
352
+ if kind == mm_schemas.WriterEventKind.METRIC:
353
+ table_path = self._get_metrics_table_path(endpoint_id)
354
+ key = f"{event[mm_schemas.WriterEvent.APPLICATION_NAME]}.{event[mm_schemas.MetricData.METRIC_NAME]}"
355
+ attributes = {event_key: event[event_key] for event_key in _METRIC_FIELDS}
356
+ elif kind == mm_schemas.WriterEventKind.RESULT:
357
+ table_path = self._get_results_table_path(endpoint_id)
358
+ key = event.pop(mm_schemas.WriterEvent.APPLICATION_NAME)
359
+ metric_name = event.pop(mm_schemas.ResultData.RESULT_NAME)
360
+ attributes = {metric_name: json.dumps(event)}
361
+ else:
362
+ raise ValueError(f"Invalid {kind = }")
420
363
 
421
364
  self.client.kv.update(
422
- container=v3io_monitoring_apps_container,
423
- table_path=endpoint_id,
424
- key=app_name,
365
+ container=container,
366
+ table_path=table_path,
367
+ key=key,
425
368
  attributes=attributes,
426
369
  )
427
370
 
428
371
  schema_file = self.client.kv.new_cursor(
429
- container=v3io_monitoring_apps_container,
430
- table_path=endpoint_id,
372
+ container=container,
373
+ table_path=table_path,
431
374
  filter_expression='__name==".#schema"',
432
375
  )
433
376
 
434
377
  if not schema_file.all():
435
378
  logger.info(
436
- "Generate a new V3IO KV schema file",
437
- container=v3io_monitoring_apps_container,
438
- endpoint_id=endpoint_id,
379
+ "Generating a new V3IO KV schema file",
380
+ container=container,
381
+ table_path=table_path,
439
382
  )
440
- self._generate_kv_schema(endpoint_id, v3io_monitoring_apps_container)
441
- logger.info("Updated V3IO KV successfully", key=app_name)
383
+ self._generate_kv_schema(
384
+ container=container, table_path=table_path, kind=kind
385
+ )
386
+ logger.info("Updated V3IO KV successfully", key=key)
442
387
 
443
388
  def _generate_kv_schema(
444
- self, endpoint_id: str, v3io_monitoring_apps_container: str
445
- ):
389
+ self, *, container: str, table_path: str, kind: mm_schemas.WriterEventKind
390
+ ) -> None:
446
391
  """Generate V3IO KV schema file which will be used by the model monitoring applications dashboard in Grafana."""
447
- fields = [
448
- {
449
- "name": mlrun.common.schemas.model_monitoring.WriterEvent.RESULT_NAME,
450
- "type": "string",
451
- "nullable": False,
452
- }
453
- ]
392
+ schema_params = _KIND_TO_SCHEMA_PARAMS[kind]
454
393
  res = self.client.kv.create_schema(
455
- container=v3io_monitoring_apps_container,
456
- table_path=endpoint_id,
457
- key=mlrun.common.schemas.model_monitoring.WriterEvent.APPLICATION_NAME,
458
- fields=fields,
394
+ container=container,
395
+ table_path=table_path,
396
+ key=schema_params.key,
397
+ fields=schema_params.fields,
459
398
  )
460
399
  if res.status_code != HTTPStatus.OK:
461
400
  raise mlrun.errors.MLRunBadRequestError(
462
- f"Couldn't infer schema for endpoint {endpoint_id} which is required for Grafana dashboards"
401
+ f"Couldn't infer schema for endpoint {table_path} which is required for Grafana dashboards"
463
402
  )
464
403
  else:
465
- logger.info(
466
- "Generated V3IO KV schema successfully", endpoint_id=endpoint_id
467
- )
404
+ logger.info("Generated V3IO KV schema successfully", table_path=table_path)
468
405
 
469
406
  def get_last_analyzed(self, endpoint_id: str, application_name: str) -> int:
470
407
  """
@@ -485,9 +422,7 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
485
422
  table_path=endpoint_id,
486
423
  key=application_name,
487
424
  )
488
- return data.output.item[
489
- mlrun.common.schemas.model_monitoring.SchedulingKeys.LAST_ANALYZED
490
- ]
425
+ return data.output.item[mm_schemas.SchedulingKeys.LAST_ANALYZED]
491
426
  except v3io.dataplane.response.HttpResponseError as err:
492
427
  logger.debug("Error while getting last analyzed time", err=err)
493
428
  raise mlrun.errors.MLRunNotFoundError(
@@ -512,9 +447,7 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
512
447
  ),
513
448
  table_path=endpoint_id,
514
449
  key=application_name,
515
- attributes={
516
- mlrun.common.schemas.model_monitoring.SchedulingKeys.LAST_ANALYZED: last_analyzed
517
- },
450
+ attributes={mm_schemas.SchedulingKeys.LAST_ANALYZED: last_analyzed},
518
451
  )
519
452
 
520
453
  def _generate_tsdb_paths(self) -> tuple[str, str]:
@@ -527,7 +460,7 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
527
460
  full_path = (
528
461
  mlrun.mlconf.model_endpoint_monitoring.store_prefixes.default.format(
529
462
  project=self.project,
530
- kind=mlrun.common.schemas.ModelMonitoringStoreKinds.EVENTS,
463
+ kind=mm_schemas.ModelMonitoringStoreKinds.EVENTS,
531
464
  )
532
465
  )
533
466
 
@@ -623,8 +556,8 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
623
556
  # Apply top_level filter (remove endpoints that considered a child of a router)
624
557
  if top_level:
625
558
  filter_expression.append(
626
- f"(endpoint_type=='{str(mlrun.common.schemas.model_monitoring.EndpointType.NODE_EP.value)}' "
627
- f"OR endpoint_type=='{str(mlrun.common.schemas.model_monitoring.EndpointType.ROUTER.value)}')"
559
+ f"(endpoint_type=='{str(mm_schemas.EndpointType.NODE_EP.value)}' "
560
+ f"OR endpoint_type=='{str(mm_schemas.EndpointType.ROUTER.value)}')"
628
561
  )
629
562
 
630
563
  return " AND ".join(filter_expression)
@@ -644,41 +577,31 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
644
577
  # Validate default value for `error_count`
645
578
  # For backwards compatibility reasons, we validate that the model endpoint includes the `error_count` key
646
579
  if (
647
- mlrun.common.schemas.model_monitoring.EventFieldType.ERROR_COUNT in endpoint
648
- and endpoint[
649
- mlrun.common.schemas.model_monitoring.EventFieldType.ERROR_COUNT
650
- ]
651
- == "null"
580
+ mm_schemas.EventFieldType.ERROR_COUNT in endpoint
581
+ and endpoint[mm_schemas.EventFieldType.ERROR_COUNT] == "null"
652
582
  ):
653
- endpoint[
654
- mlrun.common.schemas.model_monitoring.EventFieldType.ERROR_COUNT
655
- ] = "0"
583
+ endpoint[mm_schemas.EventFieldType.ERROR_COUNT] = "0"
656
584
 
657
585
  # Validate default value for `metrics`
658
586
  # For backwards compatibility reasons, we validate that the model endpoint includes the `metrics` key
659
587
  if (
660
- mlrun.common.schemas.model_monitoring.EventFieldType.METRICS in endpoint
661
- and endpoint[mlrun.common.schemas.model_monitoring.EventFieldType.METRICS]
662
- == "null"
588
+ mm_schemas.EventFieldType.METRICS in endpoint
589
+ and endpoint[mm_schemas.EventFieldType.METRICS] == "null"
663
590
  ):
664
- endpoint[mlrun.common.schemas.model_monitoring.EventFieldType.METRICS] = (
665
- json.dumps(
666
- {
667
- mlrun.common.schemas.model_monitoring.EventKeyMetrics.GENERIC: {
668
- mlrun.common.schemas.model_monitoring.EventLiveStats.LATENCY_AVG_1H: 0,
669
- mlrun.common.schemas.model_monitoring.EventLiveStats.PREDICTIONS_PER_SECOND: 0,
670
- }
591
+ endpoint[mm_schemas.EventFieldType.METRICS] = json.dumps(
592
+ {
593
+ mm_schemas.EventKeyMetrics.GENERIC: {
594
+ mm_schemas.EventLiveStats.LATENCY_AVG_1H: 0,
595
+ mm_schemas.EventLiveStats.PREDICTIONS_PER_SECOND: 0,
671
596
  }
672
- )
597
+ }
673
598
  )
674
599
  # Validate key `uid` instead of `endpoint_id`
675
600
  # For backwards compatibility reasons, we replace the `endpoint_id` with `uid` which is the updated key name
676
- if mlrun.common.schemas.model_monitoring.EventFieldType.ENDPOINT_ID in endpoint:
677
- endpoint[mlrun.common.schemas.model_monitoring.EventFieldType.UID] = (
678
- endpoint[
679
- mlrun.common.schemas.model_monitoring.EventFieldType.ENDPOINT_ID
680
- ]
681
- )
601
+ if mm_schemas.EventFieldType.ENDPOINT_ID in endpoint:
602
+ endpoint[mm_schemas.EventFieldType.UID] = endpoint[
603
+ mm_schemas.EventFieldType.ENDPOINT_ID
604
+ ]
682
605
 
683
606
  @staticmethod
684
607
  def _encode_field(field: typing.Union[str, bytes]) -> bytes:
@@ -703,3 +626,96 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
703
626
  @staticmethod
704
627
  def _get_monitoring_schedules_container(project_name: str) -> str:
705
628
  return f"users/pipelines/{project_name}/monitoring-schedules/functions"
629
+
630
+ def _extract_results_from_items(
631
+ self, app_items: list[dict[str, str]]
632
+ ) -> list[mm_schemas.ModelEndpointMonitoringMetric]:
633
+ """Assuming .#schema items are filtered out"""
634
+ metrics: list[mm_schemas.ModelEndpointMonitoringMetric] = []
635
+ for app_item in app_items:
636
+ app_name = app_item.pop("__name")
637
+ for result_name in app_item:
638
+ metrics.append(
639
+ mm_schemas.ModelEndpointMonitoringMetric(
640
+ project=self.project,
641
+ app=app_name,
642
+ type=mm_schemas.ModelEndpointMonitoringMetricType.RESULT,
643
+ name=result_name,
644
+ full_name=mm_schemas.model_endpoints._compose_full_name(
645
+ project=self.project, app=app_name, name=result_name
646
+ ),
647
+ )
648
+ )
649
+ return metrics
650
+
651
+ def _extract_metrics_from_items(
652
+ self, result_items: list[dict[str, str]]
653
+ ) -> list[mm_schemas.ModelEndpointMonitoringMetric]:
654
+ metrics: list[mm_schemas.ModelEndpointMonitoringMetric] = []
655
+ logger.debug("Result items", result_items=result_items)
656
+ for result_item in result_items:
657
+ app = result_item[mm_schemas.WriterEvent.APPLICATION_NAME]
658
+ name = result_item[mm_schemas.MetricData.METRIC_NAME]
659
+ metrics.append(
660
+ mm_schemas.ModelEndpointMonitoringMetric(
661
+ project=self.project,
662
+ app=app,
663
+ type=mm_schemas.ModelEndpointMonitoringMetricType.METRIC,
664
+ name=name,
665
+ full_name=mm_schemas.model_endpoints._compose_full_name(
666
+ project=self.project,
667
+ app=app,
668
+ name=name,
669
+ type=mm_schemas.ModelEndpointMonitoringMetricType.METRIC,
670
+ ),
671
+ )
672
+ )
673
+ return metrics
674
+
675
+ def get_model_endpoint_metrics(
676
+ self, endpoint_id: str, type: mm_schemas.ModelEndpointMonitoringMetricType
677
+ ) -> list[mm_schemas.ModelEndpointMonitoringMetric]:
678
+ """Get model monitoring results and metrics on the endpoint"""
679
+ metrics: list[mm_schemas.ModelEndpointMonitoringMetric] = []
680
+ container = self.get_v3io_monitoring_apps_container(self.project)
681
+ if type == mm_schemas.ModelEndpointMonitoringMetricType.METRIC:
682
+ table_path = self._get_metrics_table_path(endpoint_id)
683
+ items_extractor = self._extract_metrics_from_items
684
+ elif type == mm_schemas.ModelEndpointMonitoringMetricType.RESULT:
685
+ table_path = self._get_results_table_path(endpoint_id)
686
+ items_extractor = self._extract_results_from_items
687
+ else:
688
+ raise ValueError(f"Invalid metric {type = }")
689
+
690
+ def scan(
691
+ marker: typing.Optional[str] = None,
692
+ ) -> v3io.dataplane.response.Response:
693
+ # TODO: Use AIO client: `v3io.aio.dataplane.client.Client`
694
+ return self.client.kv.scan(
695
+ container=container,
696
+ table_path=table_path,
697
+ marker=marker,
698
+ filter_expression=_EXCLUDE_SCHEMA_FILTER_EXPRESSION,
699
+ )
700
+
701
+ try:
702
+ response = scan()
703
+ except v3io.dataplane.response.HttpResponseError as err:
704
+ if err.status_code == HTTPStatus.NOT_FOUND:
705
+ logger.warning(
706
+ f"Attempt getting {type}s - no data. Check the "
707
+ "project name, endpoint, or wait for the applications to start.",
708
+ container=container,
709
+ table_path=table_path,
710
+ )
711
+ return []
712
+ raise
713
+
714
+ while True:
715
+ output = typing.cast(v3io.dataplane.output.GetItemsOutput, response.output)
716
+ metrics.extend(items_extractor(output.items))
717
+ if output.last:
718
+ break
719
+ response = scan(marker=output.next_marker)
720
+
721
+ return metrics