mlrun 1.7.0rc4__py3-none-any.whl → 1.7.0rc20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mlrun might be problematic. Click here for more details.

Files changed (200) hide show
  1. mlrun/__init__.py +11 -1
  2. mlrun/__main__.py +25 -111
  3. mlrun/{datastore/helpers.py → alerts/__init__.py} +2 -5
  4. mlrun/alerts/alert.py +144 -0
  5. mlrun/api/schemas/__init__.py +4 -3
  6. mlrun/artifacts/__init__.py +8 -3
  7. mlrun/artifacts/base.py +38 -254
  8. mlrun/artifacts/dataset.py +9 -190
  9. mlrun/artifacts/manager.py +41 -47
  10. mlrun/artifacts/model.py +30 -158
  11. mlrun/artifacts/plots.py +23 -380
  12. mlrun/common/constants.py +68 -0
  13. mlrun/common/formatters/__init__.py +19 -0
  14. mlrun/{model_monitoring/stores/models/sqlite.py → common/formatters/artifact.py} +6 -8
  15. mlrun/common/formatters/base.py +78 -0
  16. mlrun/common/formatters/function.py +41 -0
  17. mlrun/common/formatters/pipeline.py +53 -0
  18. mlrun/common/formatters/project.py +51 -0
  19. mlrun/{runtimes → common/runtimes}/constants.py +32 -4
  20. mlrun/common/schemas/__init__.py +25 -4
  21. mlrun/common/schemas/alert.py +203 -0
  22. mlrun/common/schemas/api_gateway.py +148 -0
  23. mlrun/common/schemas/artifact.py +15 -5
  24. mlrun/common/schemas/auth.py +8 -2
  25. mlrun/common/schemas/client_spec.py +2 -0
  26. mlrun/common/schemas/frontend_spec.py +1 -0
  27. mlrun/common/schemas/function.py +4 -0
  28. mlrun/common/schemas/hub.py +7 -9
  29. mlrun/common/schemas/model_monitoring/__init__.py +19 -3
  30. mlrun/common/schemas/model_monitoring/constants.py +96 -26
  31. mlrun/common/schemas/model_monitoring/grafana.py +9 -5
  32. mlrun/common/schemas/model_monitoring/model_endpoints.py +86 -2
  33. mlrun/{runtimes/mpijob/v1alpha1.py → common/schemas/pagination.py} +10 -13
  34. mlrun/common/schemas/pipeline.py +0 -9
  35. mlrun/common/schemas/project.py +22 -21
  36. mlrun/common/types.py +7 -1
  37. mlrun/config.py +87 -19
  38. mlrun/data_types/data_types.py +4 -0
  39. mlrun/data_types/to_pandas.py +9 -9
  40. mlrun/datastore/__init__.py +5 -8
  41. mlrun/datastore/alibaba_oss.py +130 -0
  42. mlrun/datastore/azure_blob.py +4 -5
  43. mlrun/datastore/base.py +69 -30
  44. mlrun/datastore/datastore.py +10 -2
  45. mlrun/datastore/datastore_profile.py +90 -6
  46. mlrun/datastore/google_cloud_storage.py +1 -1
  47. mlrun/datastore/hdfs.py +5 -0
  48. mlrun/datastore/inmem.py +2 -2
  49. mlrun/datastore/redis.py +2 -2
  50. mlrun/datastore/s3.py +5 -0
  51. mlrun/datastore/snowflake_utils.py +43 -0
  52. mlrun/datastore/sources.py +172 -44
  53. mlrun/datastore/store_resources.py +7 -7
  54. mlrun/datastore/targets.py +285 -41
  55. mlrun/datastore/utils.py +68 -5
  56. mlrun/datastore/v3io.py +27 -50
  57. mlrun/db/auth_utils.py +152 -0
  58. mlrun/db/base.py +149 -14
  59. mlrun/db/factory.py +1 -1
  60. mlrun/db/httpdb.py +608 -178
  61. mlrun/db/nopdb.py +191 -7
  62. mlrun/errors.py +11 -0
  63. mlrun/execution.py +37 -20
  64. mlrun/feature_store/__init__.py +0 -2
  65. mlrun/feature_store/api.py +21 -52
  66. mlrun/feature_store/feature_set.py +48 -23
  67. mlrun/feature_store/feature_vector.py +2 -1
  68. mlrun/feature_store/ingestion.py +7 -6
  69. mlrun/feature_store/retrieval/base.py +9 -4
  70. mlrun/feature_store/retrieval/conversion.py +9 -9
  71. mlrun/feature_store/retrieval/dask_merger.py +2 -0
  72. mlrun/feature_store/retrieval/job.py +9 -3
  73. mlrun/feature_store/retrieval/local_merger.py +2 -0
  74. mlrun/feature_store/retrieval/spark_merger.py +34 -24
  75. mlrun/feature_store/steps.py +30 -19
  76. mlrun/features.py +4 -13
  77. mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +7 -12
  78. mlrun/frameworks/auto_mlrun/auto_mlrun.py +2 -2
  79. mlrun/frameworks/lgbm/__init__.py +1 -1
  80. mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
  81. mlrun/frameworks/lgbm/model_handler.py +1 -1
  82. mlrun/frameworks/parallel_coordinates.py +2 -1
  83. mlrun/frameworks/pytorch/__init__.py +2 -2
  84. mlrun/frameworks/sklearn/__init__.py +1 -1
  85. mlrun/frameworks/tf_keras/__init__.py +5 -2
  86. mlrun/frameworks/tf_keras/callbacks/logging_callback.py +1 -1
  87. mlrun/frameworks/tf_keras/mlrun_interface.py +2 -2
  88. mlrun/frameworks/xgboost/__init__.py +1 -1
  89. mlrun/k8s_utils.py +10 -11
  90. mlrun/launcher/__init__.py +1 -1
  91. mlrun/launcher/base.py +6 -5
  92. mlrun/launcher/client.py +8 -6
  93. mlrun/launcher/factory.py +1 -1
  94. mlrun/launcher/local.py +9 -3
  95. mlrun/launcher/remote.py +9 -3
  96. mlrun/lists.py +6 -2
  97. mlrun/model.py +58 -19
  98. mlrun/model_monitoring/__init__.py +1 -1
  99. mlrun/model_monitoring/api.py +127 -301
  100. mlrun/model_monitoring/application.py +5 -296
  101. mlrun/model_monitoring/applications/__init__.py +11 -0
  102. mlrun/model_monitoring/applications/_application_steps.py +157 -0
  103. mlrun/model_monitoring/applications/base.py +282 -0
  104. mlrun/model_monitoring/applications/context.py +214 -0
  105. mlrun/model_monitoring/applications/evidently_base.py +211 -0
  106. mlrun/model_monitoring/applications/histogram_data_drift.py +224 -93
  107. mlrun/model_monitoring/applications/results.py +99 -0
  108. mlrun/model_monitoring/controller.py +30 -36
  109. mlrun/model_monitoring/db/__init__.py +18 -0
  110. mlrun/model_monitoring/{stores → db/stores}/__init__.py +43 -36
  111. mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
  112. mlrun/model_monitoring/{stores/model_endpoint_store.py → db/stores/base/store.py} +58 -32
  113. mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
  114. mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +71 -0
  115. mlrun/model_monitoring/{stores → db/stores/sqldb}/models/base.py +109 -5
  116. mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +88 -0
  117. mlrun/model_monitoring/{stores/models/mysql.py → db/stores/sqldb/models/sqlite.py} +19 -13
  118. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +684 -0
  119. mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
  120. mlrun/model_monitoring/{stores/kv_model_endpoint_store.py → db/stores/v3io_kv/kv_store.py} +302 -155
  121. mlrun/model_monitoring/db/tsdb/__init__.py +100 -0
  122. mlrun/model_monitoring/db/tsdb/base.py +329 -0
  123. mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
  124. mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
  125. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +240 -0
  126. mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +45 -0
  127. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +397 -0
  128. mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
  129. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +117 -0
  130. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +630 -0
  131. mlrun/model_monitoring/evidently_application.py +6 -118
  132. mlrun/model_monitoring/features_drift_table.py +34 -22
  133. mlrun/model_monitoring/helpers.py +100 -7
  134. mlrun/model_monitoring/model_endpoint.py +3 -2
  135. mlrun/model_monitoring/stream_processing.py +93 -228
  136. mlrun/model_monitoring/tracking_policy.py +7 -1
  137. mlrun/model_monitoring/writer.py +152 -124
  138. mlrun/package/packagers_manager.py +1 -0
  139. mlrun/package/utils/_formatter.py +2 -2
  140. mlrun/platforms/__init__.py +11 -10
  141. mlrun/platforms/iguazio.py +21 -202
  142. mlrun/projects/operations.py +30 -16
  143. mlrun/projects/pipelines.py +92 -99
  144. mlrun/projects/project.py +757 -268
  145. mlrun/render.py +15 -14
  146. mlrun/run.py +160 -162
  147. mlrun/runtimes/__init__.py +55 -3
  148. mlrun/runtimes/base.py +33 -19
  149. mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
  150. mlrun/runtimes/funcdoc.py +0 -28
  151. mlrun/runtimes/kubejob.py +28 -122
  152. mlrun/runtimes/local.py +5 -2
  153. mlrun/runtimes/mpijob/__init__.py +0 -20
  154. mlrun/runtimes/mpijob/abstract.py +8 -8
  155. mlrun/runtimes/mpijob/v1.py +1 -1
  156. mlrun/runtimes/nuclio/__init__.py +1 -0
  157. mlrun/runtimes/nuclio/api_gateway.py +709 -0
  158. mlrun/runtimes/nuclio/application/__init__.py +15 -0
  159. mlrun/runtimes/nuclio/application/application.py +523 -0
  160. mlrun/runtimes/nuclio/application/reverse_proxy.go +95 -0
  161. mlrun/runtimes/nuclio/function.py +98 -58
  162. mlrun/runtimes/nuclio/serving.py +36 -42
  163. mlrun/runtimes/pod.py +196 -45
  164. mlrun/runtimes/remotesparkjob.py +1 -1
  165. mlrun/runtimes/sparkjob/spark3job.py +1 -1
  166. mlrun/runtimes/utils.py +6 -73
  167. mlrun/secrets.py +6 -2
  168. mlrun/serving/remote.py +2 -3
  169. mlrun/serving/routers.py +7 -4
  170. mlrun/serving/server.py +7 -8
  171. mlrun/serving/states.py +73 -43
  172. mlrun/serving/v2_serving.py +8 -7
  173. mlrun/track/tracker.py +2 -1
  174. mlrun/utils/async_http.py +25 -5
  175. mlrun/utils/helpers.py +141 -75
  176. mlrun/utils/http.py +1 -1
  177. mlrun/utils/logger.py +39 -7
  178. mlrun/utils/notifications/notification/__init__.py +14 -9
  179. mlrun/utils/notifications/notification/base.py +12 -0
  180. mlrun/utils/notifications/notification/console.py +2 -0
  181. mlrun/utils/notifications/notification/git.py +3 -1
  182. mlrun/utils/notifications/notification/ipython.py +2 -0
  183. mlrun/utils/notifications/notification/slack.py +101 -21
  184. mlrun/utils/notifications/notification/webhook.py +11 -1
  185. mlrun/utils/notifications/notification_pusher.py +147 -16
  186. mlrun/utils/retryer.py +3 -2
  187. mlrun/utils/v3io_clients.py +0 -1
  188. mlrun/utils/version/version.json +2 -2
  189. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.0rc20.dist-info}/METADATA +33 -18
  190. mlrun-1.7.0rc20.dist-info/RECORD +353 -0
  191. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.0rc20.dist-info}/WHEEL +1 -1
  192. mlrun/kfpops.py +0 -868
  193. mlrun/model_monitoring/batch.py +0 -974
  194. mlrun/model_monitoring/stores/models/__init__.py +0 -27
  195. mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -382
  196. mlrun/platforms/other.py +0 -305
  197. mlrun-1.7.0rc4.dist-info/RECORD +0 -321
  198. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.0rc20.dist-info}/LICENSE +0 -0
  199. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.0rc20.dist-info}/entry_points.txt +0 -0
  200. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.0rc20.dist-info}/top_level.txt +0 -0
@@ -11,36 +11,91 @@
11
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
- #
15
14
 
16
15
  import json
17
16
  import os
18
17
  import typing
18
+ from dataclasses import dataclass
19
+ from http import HTTPStatus
19
20
 
20
21
  import v3io.dataplane
21
- import v3io_frames
22
+ import v3io.dataplane.output
23
+ import v3io.dataplane.response
22
24
 
23
25
  import mlrun.common.model_monitoring.helpers
24
- import mlrun.common.schemas.model_monitoring
26
+ import mlrun.common.schemas.model_monitoring as mm_schemas
27
+ import mlrun.model_monitoring.db
25
28
  import mlrun.utils.v3io_clients
26
29
  from mlrun.utils import logger
27
30
 
28
- from .model_endpoint_store import ModelEndpointStore
29
-
30
31
  # Fields to encode before storing in the KV table or to decode after retrieving
31
32
  fields_to_encode_decode = [
32
- mlrun.common.schemas.model_monitoring.EventFieldType.FEATURE_STATS,
33
- mlrun.common.schemas.model_monitoring.EventFieldType.CURRENT_STATS,
33
+ mm_schemas.EventFieldType.FEATURE_STATS,
34
+ mm_schemas.EventFieldType.CURRENT_STATS,
35
+ ]
36
+
37
+ _METRIC_FIELDS: list[str] = [
38
+ mm_schemas.WriterEvent.APPLICATION_NAME,
39
+ mm_schemas.MetricData.METRIC_NAME,
40
+ mm_schemas.MetricData.METRIC_VALUE,
41
+ mm_schemas.WriterEvent.START_INFER_TIME,
42
+ mm_schemas.WriterEvent.END_INFER_TIME,
43
+ ]
44
+
45
+
46
+ class SchemaField(typing.TypedDict):
47
+ name: str
48
+ type: str
49
+ nullable: bool
50
+
51
+
52
+ @dataclass
53
+ class SchemaParams:
54
+ key: str
55
+ fields: list[SchemaField]
56
+
57
+
58
+ _RESULT_SCHEMA: list[SchemaField] = [
59
+ SchemaField(
60
+ name=mm_schemas.ResultData.RESULT_NAME,
61
+ type=mm_schemas.GrafanaColumnType.STRING,
62
+ nullable=False,
63
+ )
34
64
  ]
35
65
 
66
+ _METRIC_SCHEMA: list[SchemaField] = [
67
+ SchemaField(
68
+ name=mm_schemas.WriterEvent.APPLICATION_NAME,
69
+ type=mm_schemas.GrafanaColumnType.STRING,
70
+ nullable=False,
71
+ ),
72
+ SchemaField(
73
+ name=mm_schemas.MetricData.METRIC_NAME,
74
+ type=mm_schemas.GrafanaColumnType.STRING,
75
+ nullable=False,
76
+ ),
77
+ ]
78
+
79
+
80
+ _KIND_TO_SCHEMA_PARAMS: dict[mm_schemas.WriterEventKind, SchemaParams] = {
81
+ mm_schemas.WriterEventKind.RESULT: SchemaParams(
82
+ key=mm_schemas.WriterEvent.APPLICATION_NAME, fields=_RESULT_SCHEMA
83
+ ),
84
+ mm_schemas.WriterEventKind.METRIC: SchemaParams(
85
+ key="metric_id", fields=_METRIC_SCHEMA
86
+ ),
87
+ }
36
88
 
37
- class KVModelEndpointStore(ModelEndpointStore):
89
+ _EXCLUDE_SCHEMA_FILTER_EXPRESSION = '__name!=".#schema"'
90
+
91
+
92
+ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
38
93
  """
39
94
  Handles the DB operations when the DB target is from type KV. For the KV operations, we use an instance of V3IO
40
95
  client and usually the KV table can be found under v3io:///users/pipelines/project-name/model-endpoints/endpoints/.
41
96
  """
42
97
 
43
- def __init__(self, project: str, access_key: str):
98
+ def __init__(self, project: str, access_key: typing.Optional[str] = None) -> None:
44
99
  super().__init__(project=project)
45
100
  # Initialize a V3IO client instance
46
101
  self.access_key = access_key or os.environ.get("V3IO_ACCESS_KEY")
@@ -65,7 +120,7 @@ class KVModelEndpointStore(ModelEndpointStore):
65
120
  self.client.kv.put(
66
121
  container=self.container,
67
122
  table_path=self.path,
68
- key=endpoint[mlrun.common.schemas.model_monitoring.EventFieldType.UID],
123
+ key=endpoint[mm_schemas.EventFieldType.UID],
69
124
  attributes=endpoint,
70
125
  )
71
126
 
@@ -152,7 +207,7 @@ class KVModelEndpointStore(ModelEndpointStore):
152
207
  """Getting path and container based on the model monitoring configurations"""
153
208
  path = mlrun.mlconf.model_endpoint_monitoring.store_prefixes.default.format(
154
209
  project=self.project,
155
- kind=mlrun.common.schemas.ModelMonitoringStoreKinds.ENDPOINTS,
210
+ kind=mm_schemas.ModelMonitoringStoreKinds.ENDPOINTS,
156
211
  )
157
212
  (
158
213
  _,
@@ -218,17 +273,11 @@ class KVModelEndpointStore(ModelEndpointStore):
218
273
  if uids is None:
219
274
  uids = []
220
275
  for item in items:
221
- if mlrun.common.schemas.model_monitoring.EventFieldType.UID not in item:
276
+ if mm_schemas.EventFieldType.UID not in item:
222
277
  # This is kept for backwards compatibility - in old versions the key column named endpoint_id
223
- uids.append(
224
- item[
225
- mlrun.common.schemas.model_monitoring.EventFieldType.ENDPOINT_ID
226
- ]
227
- )
278
+ uids.append(item[mm_schemas.EventFieldType.ENDPOINT_ID])
228
279
  else:
229
- uids.append(
230
- item[mlrun.common.schemas.model_monitoring.EventFieldType.UID]
231
- )
280
+ uids.append(item[mm_schemas.EventFieldType.UID])
232
281
 
233
282
  # Add each relevant model endpoint to the model endpoints list
234
283
  for endpoint_id in uids:
@@ -239,27 +288,20 @@ class KVModelEndpointStore(ModelEndpointStore):
239
288
 
240
289
  return endpoint_list
241
290
 
242
- def delete_model_endpoints_resources(self, endpoints: list[dict[str, typing.Any]]):
291
+ def delete_model_endpoints_resources(self):
243
292
  """
244
- Delete all model endpoints resources in both KV and the time series DB.
245
-
246
- :param endpoints: A list of model endpoints flattened dictionaries.
293
+ Delete all model endpoints resources in V3IO KV.
247
294
  """
248
295
 
296
+ endpoints = self.list_model_endpoints()
297
+
249
298
  # Delete model endpoint record from KV table
250
299
  for endpoint_dict in endpoints:
251
- if (
252
- mlrun.common.schemas.model_monitoring.EventFieldType.UID
253
- not in endpoint_dict
254
- ):
300
+ if mm_schemas.EventFieldType.UID not in endpoint_dict:
255
301
  # This is kept for backwards compatibility - in old versions the key column named endpoint_id
256
- endpoint_id = endpoint_dict[
257
- mlrun.common.schemas.model_monitoring.EventFieldType.ENDPOINT_ID
258
- ]
302
+ endpoint_id = endpoint_dict[mm_schemas.EventFieldType.ENDPOINT_ID]
259
303
  else:
260
- endpoint_id = endpoint_dict[
261
- mlrun.common.schemas.model_monitoring.EventFieldType.UID
262
- ]
304
+ endpoint_id = endpoint_dict[mm_schemas.EventFieldType.UID]
263
305
  self.delete_model_endpoint(
264
306
  endpoint_id,
265
307
  )
@@ -282,117 +324,131 @@ class KVModelEndpointStore(ModelEndpointStore):
282
324
  raise_for_status=v3io.dataplane.RaiseForStatus.never,
283
325
  )
284
326
 
285
- # Cleanup TSDB
286
- frames = self._get_frames_client()
287
-
288
- # Generate the required tsdb paths
289
- tsdb_path, filtered_path = self._generate_tsdb_paths()
327
+ @staticmethod
328
+ def _get_results_table_path(endpoint_id: str) -> str:
329
+ return endpoint_id
290
330
 
291
- # Delete time series DB resources
292
- try:
293
- frames.delete(
294
- backend=mlrun.common.schemas.model_monitoring.TimeSeriesTarget.TSDB,
295
- table=filtered_path,
296
- )
297
- except v3io_frames.errors.DeleteError as e:
298
- if "No TSDB schema file found" not in str(e):
299
- logger.warning(
300
- f"Failed to delete TSDB table '{filtered_path}'",
301
- err=mlrun.errors.err_to_str(e),
302
- )
303
- # Final cleanup of tsdb path
304
- tsdb_path.replace("://u", ":///u")
305
- store, _, _ = mlrun.store_manager.get_or_create_store(tsdb_path)
306
- store.rm(tsdb_path, recursive=True)
331
+ @staticmethod
332
+ def _get_metrics_table_path(endpoint_id: str) -> str:
333
+ return f"{endpoint_id}_metrics"
307
334
 
308
- def get_endpoint_real_time_metrics(
335
+ def write_application_event(
309
336
  self,
310
- endpoint_id: str,
311
- metrics: list[str],
312
- start: str = "now-1h",
313
- end: str = "now",
314
- access_key: str = None,
315
- ) -> dict[str, list[tuple[str, float]]]:
337
+ event: dict[str, typing.Any],
338
+ kind: mm_schemas.WriterEventKind = mm_schemas.WriterEventKind.RESULT,
339
+ ) -> None:
316
340
  """
317
- Getting metrics from the time series DB. There are pre-defined metrics for model endpoints such as
318
- `predictions_per_second` and `latency_avg_5m` but also custom metrics defined by the user.
341
+ Write a new application event in the target table.
319
342
 
320
- :param endpoint_id: The unique id of the model endpoint.
321
- :param metrics: A list of real-time metrics to return for the model endpoint.
322
- :param start: The start time of the metrics. Can be represented by a string containing an RFC 3339
323
- time, a Unix timestamp in milliseconds, a relative time (`'now'` or
324
- `'now-[0-9]+[mhd]'`, where `m` = minutes, `h` = hours, and `'d'` = days), or 0 for the
325
- earliest time.
326
- :param end: The end time of the metrics. Can be represented by a string containing an RFC 3339
327
- time, a Unix timestamp in milliseconds, a relative time (`'now'` or
328
- `'now-[0-9]+[mhd]'`, where `m` = minutes, `h` = hours, and `'d'` = days), or 0 for the
329
- earliest time.
330
- :param access_key: V3IO access key that will be used for generating Frames client object. If not
331
- provided, the access key will be retrieved from the environment variables.
332
-
333
- :return: A dictionary of metrics in which the key is a metric name and the value is a list of tuples that
334
- includes timestamps and the values.
343
+ :param event: An event dictionary that represents the application result, should be corresponded to the
344
+ schema defined in the :py:class:`~mlrun.common.schemas.model_monitoring.constants.WriterEvent`
345
+ object.
346
+ :param kind: The type of the event, can be either "result" or "metric".
335
347
  """
336
348
 
337
- # Initialize access key
338
- access_key = access_key or mlrun.mlconf.get_v3io_access_key()
339
-
340
- if not metrics:
341
- raise mlrun.errors.MLRunInvalidArgumentError(
342
- "Metric names must be provided"
343
- )
349
+ container = self.get_v3io_monitoring_apps_container(project_name=self.project)
350
+ endpoint_id = event.pop(mm_schemas.WriterEvent.ENDPOINT_ID)
351
+
352
+ if kind == mm_schemas.WriterEventKind.METRIC:
353
+ table_path = self._get_metrics_table_path(endpoint_id)
354
+ key = f"{event[mm_schemas.WriterEvent.APPLICATION_NAME]}.{event[mm_schemas.MetricData.METRIC_NAME]}"
355
+ attributes = {event_key: event[event_key] for event_key in _METRIC_FIELDS}
356
+ elif kind == mm_schemas.WriterEventKind.RESULT:
357
+ table_path = self._get_results_table_path(endpoint_id)
358
+ key = event.pop(mm_schemas.WriterEvent.APPLICATION_NAME)
359
+ metric_name = event.pop(mm_schemas.ResultData.RESULT_NAME)
360
+ attributes = {metric_name: json.dumps(event)}
361
+ else:
362
+ raise ValueError(f"Invalid {kind = }")
344
363
 
345
- # Initialize metrics mapping dictionary
346
- metrics_mapping = {}
347
-
348
- # Getting the path for the time series DB
349
- events_path = (
350
- mlrun.mlconf.model_endpoint_monitoring.store_prefixes.default.format(
351
- project=self.project,
352
- kind=mlrun.common.schemas.ModelMonitoringStoreKinds.EVENTS,
353
- )
354
- )
355
- (
356
- _,
357
- container,
358
- events_path,
359
- ) = mlrun.common.model_monitoring.helpers.parse_model_endpoint_store_prefix(
360
- events_path
364
+ self.client.kv.update(
365
+ container=container,
366
+ table_path=table_path,
367
+ key=key,
368
+ attributes=attributes,
361
369
  )
362
370
 
363
- # Retrieve the raw data from the time series DB based on the provided metrics and time ranges
364
- frames_client = mlrun.utils.v3io_clients.get_frames_client(
365
- token=access_key,
366
- address=mlrun.mlconf.v3io_framesd,
371
+ schema_file = self.client.kv.new_cursor(
367
372
  container=container,
373
+ table_path=table_path,
374
+ filter_expression='__name==".#schema"',
368
375
  )
369
376
 
370
- try:
371
- data = frames_client.read(
372
- backend=mlrun.common.schemas.model_monitoring.TimeSeriesTarget.TSDB,
373
- table=events_path,
374
- columns=["endpoint_id", *metrics],
375
- filter=f"endpoint_id=='{endpoint_id}'",
376
- start=start,
377
- end=end,
377
+ if not schema_file.all():
378
+ logger.info(
379
+ "Generating a new V3IO KV schema file",
380
+ container=container,
381
+ table_path=table_path,
382
+ )
383
+ self._generate_kv_schema(
384
+ container=container, table_path=table_path, kind=kind
378
385
  )
386
+ logger.info("Updated V3IO KV successfully", key=key)
387
+
388
+ def _generate_kv_schema(
389
+ self, *, container: str, table_path: str, kind: mm_schemas.WriterEventKind
390
+ ) -> None:
391
+ """Generate V3IO KV schema file which will be used by the model monitoring applications dashboard in Grafana."""
392
+ schema_params = _KIND_TO_SCHEMA_PARAMS[kind]
393
+ res = self.client.kv.create_schema(
394
+ container=container,
395
+ table_path=table_path,
396
+ key=schema_params.key,
397
+ fields=schema_params.fields,
398
+ )
399
+ if res.status_code != HTTPStatus.OK:
400
+ raise mlrun.errors.MLRunBadRequestError(
401
+ f"Couldn't infer schema for endpoint {table_path} which is required for Grafana dashboards"
402
+ )
403
+ else:
404
+ logger.info("Generated V3IO KV schema successfully", table_path=table_path)
405
+
406
+ def get_last_analyzed(self, endpoint_id: str, application_name: str) -> int:
407
+ """
408
+ Get the last analyzed time for the provided model endpoint and application.
379
409
 
380
- # Fill the metrics mapping dictionary with the metric name and values
381
- data_dict = data.to_dict()
382
- for metric in metrics:
383
- metric_data = data_dict.get(metric)
384
- if metric_data is None:
385
- continue
410
+ :param endpoint_id: The unique id of the model endpoint.
411
+ :param application_name: Registered application name.
386
412
 
387
- values = [
388
- (str(timestamp), value) for timestamp, value in metric_data.items()
389
- ]
390
- metrics_mapping[metric] = values
413
+ :return: Timestamp as a Unix time.
414
+ :raise: MLRunNotFoundError if last analyzed value is not found.
391
415
 
392
- except v3io_frames.errors.ReadError:
393
- logger.warn("Failed to read tsdb", endpoint=endpoint_id)
416
+ """
417
+ try:
418
+ data = self.client.kv.get(
419
+ container=self._get_monitoring_schedules_container(
420
+ project_name=self.project
421
+ ),
422
+ table_path=endpoint_id,
423
+ key=application_name,
424
+ )
425
+ return data.output.item[mm_schemas.SchedulingKeys.LAST_ANALYZED]
426
+ except v3io.dataplane.response.HttpResponseError as err:
427
+ logger.debug("Error while getting last analyzed time", err=err)
428
+ raise mlrun.errors.MLRunNotFoundError(
429
+ f"No last analyzed value has been found for {application_name} "
430
+ f"that processes model endpoint {endpoint_id}",
431
+ )
394
432
 
395
- return metrics_mapping
433
+ def update_last_analyzed(
434
+ self, endpoint_id: str, application_name: str, last_analyzed: int
435
+ ):
436
+ """
437
+ Update the last analyzed time for the provided model endpoint and application.
438
+
439
+ :param endpoint_id: The unique id of the model endpoint.
440
+ :param application_name: Registered application name.
441
+ :param last_analyzed: Timestamp as a Unix time that represents the last analyzed time of a certain
442
+ application and model endpoint.
443
+ """
444
+ self.client.kv.put(
445
+ container=self._get_monitoring_schedules_container(
446
+ project_name=self.project
447
+ ),
448
+ table_path=endpoint_id,
449
+ key=application_name,
450
+ attributes={mm_schemas.SchedulingKeys.LAST_ANALYZED: last_analyzed},
451
+ )
396
452
 
397
453
  def _generate_tsdb_paths(self) -> tuple[str, str]:
398
454
  """Generate a short path to the TSDB resources and a filtered path for the frames object
@@ -404,7 +460,7 @@ class KVModelEndpointStore(ModelEndpointStore):
404
460
  full_path = (
405
461
  mlrun.mlconf.model_endpoint_monitoring.store_prefixes.default.format(
406
462
  project=self.project,
407
- kind=mlrun.common.schemas.ModelMonitoringStoreKinds.EVENTS,
463
+ kind=mm_schemas.ModelMonitoringStoreKinds.EVENTS,
408
464
  )
409
465
  )
410
466
 
@@ -500,8 +556,8 @@ class KVModelEndpointStore(ModelEndpointStore):
500
556
  # Apply top_level filter (remove endpoints that considered a child of a router)
501
557
  if top_level:
502
558
  filter_expression.append(
503
- f"(endpoint_type=='{str(mlrun.common.schemas.model_monitoring.EndpointType.NODE_EP.value)}' "
504
- f"OR endpoint_type=='{str(mlrun.common.schemas.model_monitoring.EndpointType.ROUTER.value)}')"
559
+ f"(endpoint_type=='{str(mm_schemas.EndpointType.NODE_EP.value)}' "
560
+ f"OR endpoint_type=='{str(mm_schemas.EndpointType.ROUTER.value)}')"
505
561
  )
506
562
 
507
563
  return " AND ".join(filter_expression)
@@ -521,41 +577,31 @@ class KVModelEndpointStore(ModelEndpointStore):
521
577
  # Validate default value for `error_count`
522
578
  # For backwards compatibility reasons, we validate that the model endpoint includes the `error_count` key
523
579
  if (
524
- mlrun.common.schemas.model_monitoring.EventFieldType.ERROR_COUNT in endpoint
525
- and endpoint[
526
- mlrun.common.schemas.model_monitoring.EventFieldType.ERROR_COUNT
527
- ]
528
- == "null"
580
+ mm_schemas.EventFieldType.ERROR_COUNT in endpoint
581
+ and endpoint[mm_schemas.EventFieldType.ERROR_COUNT] == "null"
529
582
  ):
530
- endpoint[
531
- mlrun.common.schemas.model_monitoring.EventFieldType.ERROR_COUNT
532
- ] = "0"
583
+ endpoint[mm_schemas.EventFieldType.ERROR_COUNT] = "0"
533
584
 
534
585
  # Validate default value for `metrics`
535
586
  # For backwards compatibility reasons, we validate that the model endpoint includes the `metrics` key
536
587
  if (
537
- mlrun.common.schemas.model_monitoring.EventFieldType.METRICS in endpoint
538
- and endpoint[mlrun.common.schemas.model_monitoring.EventFieldType.METRICS]
539
- == "null"
588
+ mm_schemas.EventFieldType.METRICS in endpoint
589
+ and endpoint[mm_schemas.EventFieldType.METRICS] == "null"
540
590
  ):
541
- endpoint[mlrun.common.schemas.model_monitoring.EventFieldType.METRICS] = (
542
- json.dumps(
543
- {
544
- mlrun.common.schemas.model_monitoring.EventKeyMetrics.GENERIC: {
545
- mlrun.common.schemas.model_monitoring.EventLiveStats.LATENCY_AVG_1H: 0,
546
- mlrun.common.schemas.model_monitoring.EventLiveStats.PREDICTIONS_PER_SECOND: 0,
547
- }
591
+ endpoint[mm_schemas.EventFieldType.METRICS] = json.dumps(
592
+ {
593
+ mm_schemas.EventKeyMetrics.GENERIC: {
594
+ mm_schemas.EventLiveStats.LATENCY_AVG_1H: 0,
595
+ mm_schemas.EventLiveStats.PREDICTIONS_PER_SECOND: 0,
548
596
  }
549
- )
597
+ }
550
598
  )
551
599
  # Validate key `uid` instead of `endpoint_id`
552
600
  # For backwards compatibility reasons, we replace the `endpoint_id` with `uid` which is the updated key name
553
- if mlrun.common.schemas.model_monitoring.EventFieldType.ENDPOINT_ID in endpoint:
554
- endpoint[mlrun.common.schemas.model_monitoring.EventFieldType.UID] = (
555
- endpoint[
556
- mlrun.common.schemas.model_monitoring.EventFieldType.ENDPOINT_ID
557
- ]
558
- )
601
+ if mm_schemas.EventFieldType.ENDPOINT_ID in endpoint:
602
+ endpoint[mm_schemas.EventFieldType.UID] = endpoint[
603
+ mm_schemas.EventFieldType.ENDPOINT_ID
604
+ ]
559
605
 
560
606
  @staticmethod
561
607
  def _encode_field(field: typing.Union[str, bytes]) -> bytes:
@@ -572,3 +618,104 @@ class KVModelEndpointStore(ModelEndpointStore):
572
618
  if isinstance(field, bytes):
573
619
  return field.decode()
574
620
  return field
621
+
622
+ @staticmethod
623
+ def get_v3io_monitoring_apps_container(project_name: str) -> str:
624
+ return f"users/pipelines/{project_name}/monitoring-apps"
625
+
626
+ @staticmethod
627
+ def _get_monitoring_schedules_container(project_name: str) -> str:
628
+ return f"users/pipelines/{project_name}/monitoring-schedules/functions"
629
+
630
+ def _extract_results_from_items(
631
+ self, app_items: list[dict[str, str]]
632
+ ) -> list[mm_schemas.ModelEndpointMonitoringMetric]:
633
+ """Assuming .#schema items are filtered out"""
634
+ metrics: list[mm_schemas.ModelEndpointMonitoringMetric] = []
635
+ for app_item in app_items:
636
+ app_name = app_item.pop("__name")
637
+ for result_name in app_item:
638
+ metrics.append(
639
+ mm_schemas.ModelEndpointMonitoringMetric(
640
+ project=self.project,
641
+ app=app_name,
642
+ type=mm_schemas.ModelEndpointMonitoringMetricType.RESULT,
643
+ name=result_name,
644
+ full_name=mm_schemas.model_endpoints._compose_full_name(
645
+ project=self.project, app=app_name, name=result_name
646
+ ),
647
+ )
648
+ )
649
+ return metrics
650
+
651
+ def _extract_metrics_from_items(
652
+ self, result_items: list[dict[str, str]]
653
+ ) -> list[mm_schemas.ModelEndpointMonitoringMetric]:
654
+ metrics: list[mm_schemas.ModelEndpointMonitoringMetric] = []
655
+ logger.debug("Result items", result_items=result_items)
656
+ for result_item in result_items:
657
+ app = result_item[mm_schemas.WriterEvent.APPLICATION_NAME]
658
+ name = result_item[mm_schemas.MetricData.METRIC_NAME]
659
+ metrics.append(
660
+ mm_schemas.ModelEndpointMonitoringMetric(
661
+ project=self.project,
662
+ app=app,
663
+ type=mm_schemas.ModelEndpointMonitoringMetricType.METRIC,
664
+ name=name,
665
+ full_name=mm_schemas.model_endpoints._compose_full_name(
666
+ project=self.project,
667
+ app=app,
668
+ name=name,
669
+ type=mm_schemas.ModelEndpointMonitoringMetricType.METRIC,
670
+ ),
671
+ )
672
+ )
673
+ return metrics
674
+
675
+ def get_model_endpoint_metrics(
676
+ self, endpoint_id: str, type: mm_schemas.ModelEndpointMonitoringMetricType
677
+ ) -> list[mm_schemas.ModelEndpointMonitoringMetric]:
678
+ """Get model monitoring results and metrics on the endpoint"""
679
+ metrics: list[mm_schemas.ModelEndpointMonitoringMetric] = []
680
+ container = self.get_v3io_monitoring_apps_container(self.project)
681
+ if type == mm_schemas.ModelEndpointMonitoringMetricType.METRIC:
682
+ table_path = self._get_metrics_table_path(endpoint_id)
683
+ items_extractor = self._extract_metrics_from_items
684
+ elif type == mm_schemas.ModelEndpointMonitoringMetricType.RESULT:
685
+ table_path = self._get_results_table_path(endpoint_id)
686
+ items_extractor = self._extract_results_from_items
687
+ else:
688
+ raise ValueError(f"Invalid metric {type = }")
689
+
690
+ def scan(
691
+ marker: typing.Optional[str] = None,
692
+ ) -> v3io.dataplane.response.Response:
693
+ # TODO: Use AIO client: `v3io.aio.dataplane.client.Client`
694
+ return self.client.kv.scan(
695
+ container=container,
696
+ table_path=table_path,
697
+ marker=marker,
698
+ filter_expression=_EXCLUDE_SCHEMA_FILTER_EXPRESSION,
699
+ )
700
+
701
+ try:
702
+ response = scan()
703
+ except v3io.dataplane.response.HttpResponseError as err:
704
+ if err.status_code == HTTPStatus.NOT_FOUND:
705
+ logger.warning(
706
+ f"Attempt getting {type}s - no data. Check the "
707
+ "project name, endpoint, or wait for the applications to start.",
708
+ container=container,
709
+ table_path=table_path,
710
+ )
711
+ return []
712
+ raise
713
+
714
+ while True:
715
+ output = typing.cast(v3io.dataplane.output.GetItemsOutput, response.output)
716
+ metrics.extend(items_extractor(output.items))
717
+ if output.last:
718
+ break
719
+ response = scan(marker=output.next_marker)
720
+
721
+ return metrics
@@ -0,0 +1,100 @@
1
+ # Copyright 2024 Iguazio
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import enum
16
+ import typing
17
+
18
+ import mlrun.common.schemas.secret
19
+ import mlrun.errors
20
+
21
+ from .base import TSDBConnector
22
+
23
+
24
class ObjectTSDBFactory(enum.Enum):
    """Enum class to handle the different TSDB connector type values for storing real time metrics"""

    v3io_tsdb = "v3io-tsdb"
    tdengine = "tdengine"

    def to_tsdb_connector(self, project: str, **kwargs) -> TSDBConnector:
        """
        Return a TSDBConnector object based on the provided enum value.

        :param project: The name of the project.
        :param kwargs:  Additional keyword arguments forwarded to the connector
                        constructor (e.g. ``connection_string``).
        :return: `TSDBConnector` object.
        :raise mlrun.errors.MLRunInvalidArgumentError: If the V3IO TSDB connector is
               requested while running in CE mode.
        """

        if self == self.v3io_tsdb:
            # V3IO TSDB is unavailable in the community edition.
            if mlrun.mlconf.is_ce_mode():
                raise mlrun.errors.MLRunInvalidArgumentError(
                    f"{self.v3io_tsdb} is not supported in CE mode."
                )

            from .v3io.v3io_connector import V3IOTSDBConnector

            return V3IOTSDBConnector(project=project, **kwargs)

        # Assuming TDEngine connector if connector type is not V3IO TSDB.
        # Update these lines once there are more than two connector types.

        from .tdengine.tdengine_connector import TDEngineConnector

        return TDEngineConnector(project=project, **kwargs)

    @classmethod
    def _missing_(cls, value: typing.Any):
        """A lookup function to handle an invalid value.

        :param value: Provided enum (invalid) value.
        :raise mlrun.errors.MLRunInvalidArgumentError: Always — the error lists the
               valid enum values.
        """
        valid_values = list(cls.__members__.keys())
        # Bug fix: the message previously contained a stray "%" before the
        # placeholder ("%{valid_values}"), which rendered literally in the error.
        raise mlrun.errors.MLRunInvalidArgumentError(
            f"{value} is not a valid tsdb, please choose a valid value: {valid_values}."
        )
63
+
64
+
65
def get_tsdb_connector(
    project: str,
    tsdb_connector_type: str = "",
    secret_provider: typing.Optional[typing.Callable] = None,
    **kwargs,
) -> TSDBConnector:
    """
    Get TSDB connector object.

    :param project:             The name of the project.
    :param tsdb_connector_type: The type of the TSDB connector. See
                                mlrun.model_monitoring.db.tsdb.ObjectTSDBFactory for available options.
    :param secret_provider:     An optional secret provider to get the connection string secret.

    :return: `TSDBConnector` object. The main goal of this object is to handle different operations on the
             TSDB connector such as updating drift metrics or write application record result.
    """

    # A configured connection string takes precedence over the requested type.
    connection_string = mlrun.model_monitoring.helpers.get_tsdb_connection_string(
        secret_provider=secret_provider
    )

    # A "taosws" scheme means the connection targets TDEngine over websocket.
    if connection_string and connection_string.startswith("taosws"):
        tsdb_connector_type = mlrun.common.schemas.model_monitoring.TSDBTarget.TDEngine
        kwargs["connection_string"] = connection_string

    # Set the default TSDB connector type if no connection has been set
    if not tsdb_connector_type:
        tsdb_connector_type = (
            mlrun.mlconf.model_endpoint_monitoring.tsdb_connector_type
        )

    # Resolve the factory member and build the concrete TSDB connector object.
    return ObjectTSDBFactory(tsdb_connector_type).to_tsdb_connector(
        project=project, **kwargs
    )