mlrun 1.7.0rc14__py3-none-any.whl → 1.7.0rc16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mlrun might be problematic. Click here for more details.

Files changed (107) hide show
  1. mlrun/__init__.py +10 -1
  2. mlrun/__main__.py +18 -109
  3. mlrun/{runtimes/mpijob/v1alpha1.py → alerts/__init__.py} +2 -16
  4. mlrun/alerts/alert.py +141 -0
  5. mlrun/artifacts/__init__.py +8 -3
  6. mlrun/artifacts/base.py +36 -253
  7. mlrun/artifacts/dataset.py +9 -190
  8. mlrun/artifacts/manager.py +20 -41
  9. mlrun/artifacts/model.py +8 -140
  10. mlrun/artifacts/plots.py +14 -375
  11. mlrun/common/schemas/__init__.py +4 -2
  12. mlrun/common/schemas/alert.py +46 -4
  13. mlrun/common/schemas/api_gateway.py +4 -0
  14. mlrun/common/schemas/artifact.py +15 -0
  15. mlrun/common/schemas/auth.py +2 -0
  16. mlrun/common/schemas/model_monitoring/__init__.py +8 -1
  17. mlrun/common/schemas/model_monitoring/constants.py +40 -4
  18. mlrun/common/schemas/model_monitoring/model_endpoints.py +73 -2
  19. mlrun/common/schemas/project.py +2 -0
  20. mlrun/config.py +7 -4
  21. mlrun/data_types/to_pandas.py +4 -4
  22. mlrun/datastore/base.py +41 -9
  23. mlrun/datastore/datastore_profile.py +54 -4
  24. mlrun/datastore/inmem.py +2 -2
  25. mlrun/datastore/sources.py +43 -2
  26. mlrun/datastore/store_resources.py +2 -6
  27. mlrun/datastore/targets.py +106 -39
  28. mlrun/db/base.py +23 -3
  29. mlrun/db/httpdb.py +101 -47
  30. mlrun/db/nopdb.py +20 -2
  31. mlrun/errors.py +5 -0
  32. mlrun/feature_store/__init__.py +0 -2
  33. mlrun/feature_store/api.py +12 -47
  34. mlrun/feature_store/feature_set.py +9 -0
  35. mlrun/feature_store/retrieval/base.py +9 -4
  36. mlrun/feature_store/retrieval/conversion.py +4 -4
  37. mlrun/feature_store/retrieval/dask_merger.py +2 -0
  38. mlrun/feature_store/retrieval/job.py +2 -0
  39. mlrun/feature_store/retrieval/local_merger.py +2 -0
  40. mlrun/feature_store/retrieval/spark_merger.py +5 -0
  41. mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +5 -10
  42. mlrun/launcher/base.py +4 -3
  43. mlrun/launcher/client.py +1 -1
  44. mlrun/lists.py +4 -2
  45. mlrun/model.py +25 -11
  46. mlrun/model_monitoring/__init__.py +1 -1
  47. mlrun/model_monitoring/api.py +41 -18
  48. mlrun/model_monitoring/application.py +5 -305
  49. mlrun/model_monitoring/applications/__init__.py +11 -0
  50. mlrun/model_monitoring/applications/_application_steps.py +157 -0
  51. mlrun/model_monitoring/applications/base.py +282 -0
  52. mlrun/model_monitoring/applications/context.py +214 -0
  53. mlrun/model_monitoring/applications/evidently_base.py +211 -0
  54. mlrun/model_monitoring/applications/histogram_data_drift.py +132 -91
  55. mlrun/model_monitoring/applications/results.py +99 -0
  56. mlrun/model_monitoring/controller.py +3 -1
  57. mlrun/model_monitoring/db/__init__.py +2 -0
  58. mlrun/model_monitoring/db/stores/base/store.py +9 -36
  59. mlrun/model_monitoring/db/stores/sqldb/models/base.py +7 -6
  60. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +63 -110
  61. mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +104 -187
  62. mlrun/model_monitoring/db/tsdb/__init__.py +71 -0
  63. mlrun/model_monitoring/db/tsdb/base.py +135 -0
  64. mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
  65. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +117 -0
  66. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +404 -0
  67. mlrun/model_monitoring/db/v3io_tsdb_reader.py +134 -0
  68. mlrun/model_monitoring/evidently_application.py +6 -118
  69. mlrun/model_monitoring/helpers.py +1 -1
  70. mlrun/model_monitoring/model_endpoint.py +3 -2
  71. mlrun/model_monitoring/stream_processing.py +48 -213
  72. mlrun/model_monitoring/writer.py +101 -121
  73. mlrun/platforms/__init__.py +10 -9
  74. mlrun/platforms/iguazio.py +21 -202
  75. mlrun/projects/operations.py +11 -7
  76. mlrun/projects/pipelines.py +13 -76
  77. mlrun/projects/project.py +73 -45
  78. mlrun/render.py +11 -13
  79. mlrun/run.py +6 -41
  80. mlrun/runtimes/__init__.py +3 -3
  81. mlrun/runtimes/base.py +6 -6
  82. mlrun/runtimes/funcdoc.py +0 -28
  83. mlrun/runtimes/kubejob.py +2 -1
  84. mlrun/runtimes/local.py +1 -1
  85. mlrun/runtimes/mpijob/__init__.py +0 -20
  86. mlrun/runtimes/mpijob/v1.py +1 -1
  87. mlrun/runtimes/nuclio/api_gateway.py +75 -9
  88. mlrun/runtimes/nuclio/function.py +9 -35
  89. mlrun/runtimes/pod.py +16 -36
  90. mlrun/runtimes/remotesparkjob.py +1 -1
  91. mlrun/runtimes/sparkjob/spark3job.py +1 -1
  92. mlrun/runtimes/utils.py +1 -39
  93. mlrun/utils/helpers.py +72 -71
  94. mlrun/utils/notifications/notification/base.py +1 -1
  95. mlrun/utils/notifications/notification/slack.py +12 -5
  96. mlrun/utils/notifications/notification/webhook.py +1 -1
  97. mlrun/utils/notifications/notification_pusher.py +134 -14
  98. mlrun/utils/version/version.json +2 -2
  99. {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc16.dist-info}/METADATA +4 -3
  100. {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc16.dist-info}/RECORD +105 -95
  101. mlrun/kfpops.py +0 -865
  102. mlrun/platforms/other.py +0 -305
  103. /mlrun/{runtimes → common/runtimes}/constants.py +0 -0
  104. {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc16.dist-info}/LICENSE +0 -0
  105. {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc16.dist-info}/WHEEL +0 -0
  106. {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc16.dist-info}/entry_points.txt +0 -0
  107. {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc16.dist-info}/top_level.txt +0 -0
@@ -12,121 +12,9 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- import uuid
16
- import warnings
17
- from typing import Union
18
-
19
- import pandas as pd
20
- import semver
21
-
22
- from mlrun.errors import MLRunIncompatibleVersionError
23
- from mlrun.model_monitoring.application import ModelMonitoringApplicationBase
24
-
25
- SUPPORTED_EVIDENTLY_VERSION = semver.Version.parse("0.4.11")
26
-
27
-
28
- def _check_evidently_version(*, cur: semver.Version, ref: semver.Version) -> None:
29
- if ref.is_compatible(cur) or (
30
- cur.major == ref.major == 0 and cur.minor == ref.minor and cur.patch > ref.patch
31
- ):
32
- return
33
- if cur.major == ref.major == 0 and cur.minor > ref.minor:
34
- warnings.warn(
35
- f"Evidently version {cur} is not compatible with the tested "
36
- f"version {ref}, use at your own risk."
37
- )
38
- else:
39
- raise MLRunIncompatibleVersionError(
40
- f"Evidently version {cur} is not supported, please change to "
41
- f"{ref} (or another compatible version)."
42
- )
43
-
44
-
45
- _HAS_EVIDENTLY = False
46
- try:
47
- import evidently # noqa: F401
48
-
49
- _check_evidently_version(
50
- cur=semver.Version.parse(evidently.__version__),
51
- ref=SUPPORTED_EVIDENTLY_VERSION,
52
- )
53
- _HAS_EVIDENTLY = True
54
- except ModuleNotFoundError:
55
- pass
56
-
57
-
58
- if _HAS_EVIDENTLY:
59
- from evidently.renderers.notebook_utils import determine_template
60
- from evidently.report.report import Report
61
- from evidently.suite.base_suite import Suite
62
- from evidently.ui.type_aliases import STR_UUID
63
- from evidently.ui.workspace import Workspace
64
- from evidently.utils.dashboard import TemplateParams
65
-
66
-
67
- class EvidentlyModelMonitoringApplicationBase(ModelMonitoringApplicationBase):
68
- def __init__(
69
- self, evidently_workspace_path: str, evidently_project_id: "STR_UUID"
70
- ) -> None:
71
- """
72
- A class for integrating Evidently for mlrun model monitoring within a monitoring application.
73
- Note: evidently is not installed by default in the mlrun/mlrun image.
74
- It must be installed separately to use this class.
75
-
76
- :param evidently_workspace_path: (str) The path to the Evidently workspace.
77
- :param evidently_project_id: (str) The ID of the Evidently project.
78
-
79
- """
80
- if not _HAS_EVIDENTLY:
81
- raise ModuleNotFoundError("Evidently is not installed - the app cannot run")
82
- self.evidently_workspace = Workspace.create(evidently_workspace_path)
83
- self.evidently_project_id = evidently_project_id
84
- self.evidently_project = self.evidently_workspace.get_project(
85
- evidently_project_id
86
- )
87
-
88
- def log_evidently_object(
89
- self, evidently_object: Union["Report", "Suite"], artifact_name: str
90
- ):
91
- """
92
- Logs an Evidently report or suite as an artifact.
93
-
94
- :param evidently_object: (Union[Report, Suite]) The Evidently report or suite object.
95
- :param artifact_name: (str) The name for the logged artifact.
96
- """
97
- evidently_object_html = evidently_object.get_html()
98
- self.context.log_artifact(
99
- artifact_name, body=evidently_object_html.encode("utf-8"), format="html"
100
- )
101
-
102
- def log_project_dashboard(
103
- self,
104
- timestamp_start: pd.Timestamp,
105
- timestamp_end: pd.Timestamp,
106
- artifact_name: str = "dashboard",
107
- ):
108
- """
109
- Logs an Evidently project dashboard.
110
-
111
- :param timestamp_start: (pd.Timestamp) The start timestamp for the dashboard data.
112
- :param timestamp_end: (pd.Timestamp) The end timestamp for the dashboard data.
113
- :param artifact_name: (str) The name for the logged artifact.
114
- """
115
-
116
- dashboard_info = self.evidently_project.build_dashboard_info(
117
- timestamp_start, timestamp_end
118
- )
119
- template_params = TemplateParams(
120
- dashboard_id="pd_" + str(uuid.uuid4()).replace("-", ""),
121
- dashboard_info=dashboard_info,
122
- additional_graphs={},
123
- )
124
-
125
- dashboard_html = self._render(determine_template("inline"), template_params)
126
- self.context.log_artifact(
127
- artifact_name, body=dashboard_html.encode("utf-8"), format="html"
128
- )
129
-
130
- @staticmethod
131
- def _render(temple_func, template_params: "TemplateParams"):
132
- return temple_func(params=template_params)
15
+ # TODO : delete this file in 1.9.0
16
+ from mlrun.model_monitoring.applications import ( # noqa: F401
17
+ _HAS_EVIDENTLY,
18
+ SUPPORTED_EVIDENTLY_VERSION,
19
+ EvidentlyModelMonitoringApplicationBase,
20
+ )
@@ -215,7 +215,7 @@ def update_model_endpoint_last_request(
215
215
 
216
216
  def calculate_inputs_statistics(
217
217
  sample_set_statistics: dict, inputs: pd.DataFrame
218
- ) -> dict:
218
+ ) -> mlrun.common.model_monitoring.helpers.FeatureStats:
219
219
  """
220
220
  Calculate the inputs data statistics for drift monitoring purpose.
221
221
 
@@ -17,6 +17,7 @@ from dataclasses import dataclass, field
17
17
  from typing import Any
18
18
 
19
19
  import mlrun.model
20
+ from mlrun.common.model_monitoring.helpers import FeatureStats
20
21
  from mlrun.common.schemas.model_monitoring.constants import (
21
22
  EndpointType,
22
23
  EventKeyMetrics,
@@ -42,8 +43,8 @@ class ModelEndpointSpec(mlrun.model.ModelObj):
42
43
 
43
44
  @dataclass
44
45
  class ModelEndpointStatus(mlrun.model.ModelObj):
45
- feature_stats: dict = field(default_factory=dict)
46
- current_stats: dict = field(default_factory=dict)
46
+ feature_stats: FeatureStats = field(default_factory=dict)
47
+ current_stats: FeatureStats = field(default_factory=dict)
47
48
  first_request: str = ""
48
49
  last_request: str = ""
49
50
  error_count: int = 0
@@ -30,7 +30,6 @@ import mlrun.model_monitoring.db
30
30
  import mlrun.model_monitoring.prometheus
31
31
  import mlrun.serving.states
32
32
  import mlrun.utils
33
- import mlrun.utils.v3io_clients
34
33
  from mlrun.common.schemas.model_monitoring.constants import (
35
34
  EventFieldType,
36
35
  EventKeyMetrics,
@@ -40,7 +39,6 @@ from mlrun.common.schemas.model_monitoring.constants import (
40
39
  ProjectSecretKeys,
41
40
  PrometheusEndpoints,
42
41
  )
43
- from mlrun.model_monitoring.helpers import get_endpoint_record
44
42
  from mlrun.utils import logger
45
43
 
46
44
 
@@ -79,6 +77,7 @@ class EventStreamProcessor:
79
77
  )
80
78
 
81
79
  self.storage_options = None
80
+ self.tsdb_configurations = {}
82
81
  if not mlrun.mlconf.is_ce_mode():
83
82
  self._initialize_v3io_configurations(
84
83
  model_monitoring_access_key=model_monitoring_access_key
@@ -139,29 +138,29 @@ class EventStreamProcessor:
139
138
 
140
139
  def apply_monitoring_serving_graph(self, fn: mlrun.runtimes.ServingRuntime) -> None:
141
140
  """
142
- Apply monitoring serving graph to a given serving function. The following serving graph includes about 20 steps
143
- of different operations that are executed on the events from the model server. Each event has
144
- metadata (function_uri, timestamp, class, etc.) but also inputs and predictions from the model server.
145
- Throughout the serving graph, the results are written to 3 different databases:
146
- 1. KV/SQL (steps 9-11): Stores metadata and stats about the average latency and the amount of predictions over
147
- time per endpoint. for example the amount of predictions of endpoint x in the last 5 min. This data is used
148
- by the monitoring dashboards in grafana. The model endpoints table also contains data on the model endpoint
149
- from other processes, such as current_stats that is being calculated by the monitoring batch job
150
- process. If the target is from type KV, then the model endpoints table can be found under
151
- v3io:///users/pipelines/project-name/model-endpoints/endpoints/. If the target is SQL, then the table
152
- is stored within the database that was defined in the provided connection string and can be found
153
- under mlrun.mlconf.model_endpoint_monitoring.endpoint_store_connection.
154
- 2. V3IO TSDB/Prometheus (steps 13-21): Stores live data of different key metric dictionaries in tsdb target.
155
- This data is being used by the monitoring dashboards in grafana. If using V3IO TSDB (steps 13-19), results
141
+ Apply monitoring serving graph to a given serving function. The following serving graph includes about 4 main
142
+ parts, each of which includes several steps of different operations that are executed on the events from
143
+ the model server.
144
+ Each event has metadata (function_uri, timestamp, class, etc.) but also inputs, predictions and optional
145
+ metrics from the model server.
146
+ In the first part, the serving graph processes the event and splits it into sub-events. This part also includes
147
+ validation of the event data and adding important details to the event such as endpoint_id.
148
+ In the next parts, the serving graph stores data to 3 different targets:
149
+ 1. KV/SQL: Metadata and basic stats about the average latency and the amount of predictions over
150
+ time per endpoint. For example, the amount of predictions of endpoint x in the last 5 min. The model
151
+ endpoints table also contains data on the model endpoint from other processes, such as feature_stats that
152
+ represents sample statistics from the training data. If the target is of type KV, then the model endpoints
153
+ table can be found under v3io:///users/pipelines/project-name/model-endpoints/endpoints/. If the target is
154
+ SQL, then the table is stored within the database that was defined in the provided connection string.
155
+ 2. TSDB: live data of different key metric dictionaries in tsdb target.
156
+ This data is being used by the monitoring dashboards in grafana. If using V3IO TSDB, results
156
157
  can be found under v3io:///users/pipelines/project-name/model-endpoints/events/. In that case, we generate
157
158
  3 different key metric dictionaries: base_metrics (average latency and predictions over time),
158
159
  endpoint_features (Prediction and feature names and values), and custom_metrics (user-defined metrics).
159
- If using Prometheus (steps 20-21), we update metrics in the Prometheus registry that is stored in the
160
- monitoring stream local memory.
161
- 3. Parquet (steps 22-23): This Parquet file includes the required data for the model monitoring batch job
162
- that run every hour by default. If defined, the parquet target path can be found under
163
- mlrun.mlconf.model_endpoint_monitoring.offline. Otherwise, the default parquet path is under
164
- mlrun.mlconf.model_endpoint_monitoring.user_space.
160
+ 3. Parquet: This Parquet file includes the required data for the model monitoring applications. If defined,
161
+ the parquet target path can be found under mlrun.mlconf.model_endpoint_monitoring.offline. Otherwise,
162
+ the default parquet path is under mlrun.mlconf.model_endpoint_monitoring.user_space. Note that if you are
163
+ using CE, the parquet target path is based on the defined MLRun artifact path.
165
164
 
166
165
  :param fn: A serving function.
167
166
  """
@@ -171,7 +170,7 @@ class EventStreamProcessor:
171
170
  fn.set_topology(mlrun.serving.states.StepKinds.flow),
172
171
  )
173
172
 
174
- # Step 1 - Event routing based on the provided path
173
+ # Event routing based on the provided path
175
174
  def apply_event_routing():
176
175
  typing.cast(
177
176
  mlrun.serving.TaskStep,
@@ -184,7 +183,7 @@ class EventStreamProcessor:
184
183
 
185
184
  apply_event_routing()
186
185
 
187
- # Step 2 - Filter out events with '-' in the path basename from going forward
186
+ # Filter out events with '-' in the path basename from going forward
188
187
  # through the next steps of the stream graph
189
188
  def apply_storey_filter_stream_events():
190
189
  # Filter events with Prometheus endpoints path
@@ -197,7 +196,7 @@ class EventStreamProcessor:
197
196
 
198
197
  apply_storey_filter_stream_events()
199
198
 
200
- # Step 3 - Process endpoint event: splitting into sub-events and validate event data
199
+ # Process endpoint event: splitting into sub-events and validate event data
201
200
  def apply_process_endpoint_event():
202
201
  graph.add_step(
203
202
  "ProcessEndpointEvent",
@@ -208,7 +207,7 @@ class EventStreamProcessor:
208
207
 
209
208
  apply_process_endpoint_event()
210
209
 
211
- # Steps 4,5 - Applying Storey operations of filtering and flatten
210
+ # Applying Storey operations of filtering and flatten
212
211
  def apply_storey_filter_and_flatmap():
213
212
  # Remove none values from each event
214
213
  graph.add_step(
@@ -225,7 +224,7 @@ class EventStreamProcessor:
225
224
 
226
225
  apply_storey_filter_and_flatmap()
227
226
 
228
- # Step 6 - Validating feature names and map each feature to its value
227
+ # Validating feature names and map each feature to its value
229
228
  def apply_map_feature_names():
230
229
  graph.add_step(
231
230
  "MapFeatureNames",
@@ -237,9 +236,9 @@ class EventStreamProcessor:
237
236
 
238
237
  apply_map_feature_names()
239
238
 
240
- # Step 7 - Calculate number of predictions and average latency
239
+ # Calculate number of predictions and average latency
241
240
  def apply_storey_aggregations():
242
- # Step 7.1 - Calculate number of predictions for each window (5 min and 1 hour by default)
241
+ # Calculate number of predictions for each window (5 min and 1 hour by default)
243
242
  graph.add_step(
244
243
  class_name="storey.AggregateByKey",
245
244
  aggregates=[
@@ -257,7 +256,7 @@ class EventStreamProcessor:
257
256
  table=".",
258
257
  key_field=EventFieldType.ENDPOINT_ID,
259
258
  )
260
- # Step 7.2 - Calculate average latency time for each window (5 min and 1 hour by default)
259
+ # Calculate average latency time for each window (5 min and 1 hour by default)
261
260
  graph.add_step(
262
261
  class_name="storey.Rename",
263
262
  mapping={
@@ -270,8 +269,8 @@ class EventStreamProcessor:
270
269
 
271
270
  apply_storey_aggregations()
272
271
 
273
- # Steps 8-10 - KV/SQL branch
274
- # Step 8 - Filter relevant keys from the event before writing the data into the database table
272
+ # KV/SQL branch
273
+ # Filter relevant keys from the event before writing the data into the database table
275
274
  def apply_process_before_endpoint_update():
276
275
  graph.add_step(
277
276
  "ProcessBeforeEndpointUpdate",
@@ -281,7 +280,7 @@ class EventStreamProcessor:
281
280
 
282
281
  apply_process_before_endpoint_update()
283
282
 
284
- # Step 9 - Write the filtered event to KV/SQL table. At this point, the serving graph updates the stats
283
+ # Write the filtered event to KV/SQL table. At this point, the serving graph updates the stats
285
284
  # about average latency and the amount of predictions over time
286
285
  def apply_update_endpoint():
287
286
  graph.add_step(
@@ -294,7 +293,7 @@ class EventStreamProcessor:
294
293
 
295
294
  apply_update_endpoint()
296
295
 
297
- # Step 10 (only for KV target) - Apply infer_schema on the model endpoints table for generating schema file
296
+ # (only for V3IO KV target) - Apply infer_schema on the model endpoints table for generating schema file
298
297
  # which will be used by Grafana monitoring dashboards
299
298
  def apply_infer_schema():
300
299
  graph.add_step(
@@ -309,7 +308,7 @@ class EventStreamProcessor:
309
308
  if self.model_endpoint_store_target == ModelEndpointTarget.V3IO_NOSQL:
310
309
  apply_infer_schema()
311
310
 
312
- # Step 11 - Emits the event in window size of events based on sample_window size (10 by default)
311
+ # Emits the event in window size of events based on sample_window size (10 by default)
313
312
  def apply_storey_sample_window():
314
313
  graph.add_step(
315
314
  "storey.steps.SampleWindow",
@@ -321,84 +320,18 @@ class EventStreamProcessor:
321
320
 
322
321
  apply_storey_sample_window()
323
322
 
324
- # Steps 12-19 - TSDB branch (skip to Prometheus if in CE env)
325
- # Steps 20-21 - Prometheus branch
323
+ # TSDB branch (skip to Prometheus if in CE env)
326
324
  if not mlrun.mlconf.is_ce_mode():
327
325
  # TSDB branch
328
-
329
- # Step 12 - Before writing data to TSDB, create dictionary of 2-3 dictionaries that contains
330
- # stats and details about the events
331
- def apply_process_before_tsdb():
332
- graph.add_step(
333
- "ProcessBeforeTSDB", name="ProcessBeforeTSDB", after="sample"
334
- )
335
-
336
- apply_process_before_tsdb()
337
-
338
- # Steps 13-19: - Unpacked keys from each dictionary and write to TSDB target
339
- def apply_filter_and_unpacked_keys(name, keys):
340
- graph.add_step(
341
- "FilterAndUnpackKeys",
342
- name=name,
343
- after="ProcessBeforeTSDB",
344
- keys=[keys],
345
- )
346
-
347
- def apply_tsdb_target(name, after):
348
- graph.add_step(
349
- "storey.TSDBTarget",
350
- name=name,
351
- after=after,
352
- path=self.tsdb_path,
353
- rate="10/m",
354
- time_col=EventFieldType.TIMESTAMP,
355
- container=self.tsdb_container,
356
- v3io_frames=self.v3io_framesd,
357
- infer_columns_from_data=True,
358
- index_cols=[
359
- EventFieldType.ENDPOINT_ID,
360
- EventFieldType.RECORD_TYPE,
361
- EventFieldType.ENDPOINT_TYPE,
362
- ],
363
- max_events=self.tsdb_batching_max_events,
364
- flush_after_seconds=self.tsdb_batching_timeout_secs,
365
- key=EventFieldType.ENDPOINT_ID,
366
- )
367
-
368
- # Steps 13-14 - unpacked base_metrics dictionary
369
- apply_filter_and_unpacked_keys(
370
- name="FilterAndUnpackKeys1",
371
- keys=EventKeyMetrics.BASE_METRICS,
372
- )
373
- apply_tsdb_target(name="tsdb1", after="FilterAndUnpackKeys1")
374
-
375
- # Steps 15-16 - unpacked endpoint_features dictionary
376
- apply_filter_and_unpacked_keys(
377
- name="FilterAndUnpackKeys2",
378
- keys=EventKeyMetrics.ENDPOINT_FEATURES,
379
- )
380
- apply_tsdb_target(name="tsdb2", after="FilterAndUnpackKeys2")
381
-
382
- # Steps 17-19 - unpacked custom_metrics dictionary. In addition, use storey.Filter remove none values
383
- apply_filter_and_unpacked_keys(
384
- name="FilterAndUnpackKeys3",
385
- keys=EventKeyMetrics.CUSTOM_METRICS,
326
+ tsdb_connector = mlrun.model_monitoring.get_tsdb_connector(
327
+ project=self.project,
386
328
  )
329
+ tsdb_connector.apply_monitoring_stream_steps(graph=graph)
387
330
 
388
- def apply_storey_filter():
389
- graph.add_step(
390
- "storey.Filter",
391
- "FilterNotNone",
392
- after="FilterAndUnpackKeys3",
393
- _fn="(event is not None)",
394
- )
395
-
396
- apply_storey_filter()
397
- apply_tsdb_target(name="tsdb3", after="FilterNotNone")
398
331
  else:
399
- # Prometheus branch
332
+ # Prometheus
400
333
 
401
- # Step 20 - Increase the prediction counter by 1 and update the latency value
334
+ # Increase the prediction counter by 1 and update the latency value
402
335
  graph.add_step(
403
336
  "IncCounter",
404
337
  name="IncCounter",
@@ -406,7 +339,7 @@ class EventStreamProcessor:
406
339
  project=self.project,
407
340
  )
408
341
 
409
- # Step 21 - Record a sample of features and labels
342
+ # Record a sample of features and labels
410
343
  def apply_record_features_to_prometheus():
411
344
  graph.add_step(
412
345
  "RecordFeatures",
@@ -417,8 +350,8 @@ class EventStreamProcessor:
417
350
 
418
351
  apply_record_features_to_prometheus()
419
352
 
420
- # Steps 22-23 - Parquet branch
421
- # Step 22 - Filter and validate different keys before writing the data to Parquet target
353
+ # Parquet branch
354
+ # Filter and validate different keys before writing the data to Parquet target
422
355
  def apply_process_before_parquet():
423
356
  graph.add_step(
424
357
  "ProcessBeforeParquet",
@@ -429,7 +362,7 @@ class EventStreamProcessor:
429
362
 
430
363
  apply_process_before_parquet()
431
364
 
432
- # Step 23 - Write the Parquet target file, partitioned by key (endpoint_id) and time.
365
+ # Write the Parquet target file, partitioned by key (endpoint_id) and time.
433
366
  def apply_parquet_target():
434
367
  graph.add_step(
435
368
  "storey.ParquetTarget",
@@ -503,76 +436,6 @@ class ProcessBeforeEndpointUpdate(mlrun.feature_store.steps.MapClass):
503
436
  return e
504
437
 
505
438
 
506
- class ProcessBeforeTSDB(mlrun.feature_store.steps.MapClass):
507
- def __init__(self, **kwargs):
508
- """
509
- Process the data before writing to TSDB. This step creates a dictionary that includes 3 different dictionaries
510
- that each one of them contains important details and stats about the events:
511
- 1. base_metrics: stats about the average latency and the amount of predictions over time. It is based on
512
- storey.AggregateByKey which was executed in step 5.
513
- 2. endpoint_features: feature names and values along with the prediction names and value.
514
- 3. custom_metric (opt): optional metrics provided by the user.
515
-
516
- :returns: Dictionary of 2-3 dictionaries that contains stats and details about the events.
517
-
518
- """
519
- super().__init__(**kwargs)
520
-
521
- def do(self, event):
522
- # Compute prediction per second
523
- event[EventLiveStats.PREDICTIONS_PER_SECOND] = (
524
- float(event[EventLiveStats.PREDICTIONS_COUNT_5M]) / 300
525
- )
526
- base_fields = [
527
- EventFieldType.TIMESTAMP,
528
- EventFieldType.ENDPOINT_ID,
529
- EventFieldType.ENDPOINT_TYPE,
530
- ]
531
-
532
- # Getting event timestamp and endpoint_id
533
- base_event = {k: event[k] for k in base_fields}
534
-
535
- # base_metrics includes the stats about the average latency and the amount of predictions over time
536
- base_metrics = {
537
- EventFieldType.RECORD_TYPE: EventKeyMetrics.BASE_METRICS,
538
- EventLiveStats.PREDICTIONS_PER_SECOND: event[
539
- EventLiveStats.PREDICTIONS_PER_SECOND
540
- ],
541
- EventLiveStats.PREDICTIONS_COUNT_5M: event[
542
- EventLiveStats.PREDICTIONS_COUNT_5M
543
- ],
544
- EventLiveStats.PREDICTIONS_COUNT_1H: event[
545
- EventLiveStats.PREDICTIONS_COUNT_1H
546
- ],
547
- EventLiveStats.LATENCY_AVG_5M: event[EventLiveStats.LATENCY_AVG_5M],
548
- EventLiveStats.LATENCY_AVG_1H: event[EventLiveStats.LATENCY_AVG_1H],
549
- **base_event,
550
- }
551
-
552
- # endpoint_features includes the event values of each feature and prediction
553
- endpoint_features = {
554
- EventFieldType.RECORD_TYPE: EventKeyMetrics.ENDPOINT_FEATURES,
555
- **event[EventFieldType.NAMED_PREDICTIONS],
556
- **event[EventFieldType.NAMED_FEATURES],
557
- **base_event,
558
- }
559
- # Create a dictionary that includes both base_metrics and endpoint_features
560
- processed = {
561
- EventKeyMetrics.BASE_METRICS: base_metrics,
562
- EventKeyMetrics.ENDPOINT_FEATURES: endpoint_features,
563
- }
564
-
565
- # If metrics provided, add another dictionary if custom_metrics values
566
- if event[EventFieldType.METRICS]:
567
- processed[EventKeyMetrics.CUSTOM_METRICS] = {
568
- EventFieldType.RECORD_TYPE: EventKeyMetrics.CUSTOM_METRICS,
569
- **event[EventFieldType.METRICS],
570
- **base_event,
571
- }
572
-
573
- return processed
574
-
575
-
576
439
  class ProcessBeforeParquet(mlrun.feature_store.steps.MapClass):
577
440
  def __init__(self, **kwargs):
578
441
  """
@@ -807,7 +670,7 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
807
670
  # left them
808
671
  if endpoint_id not in self.endpoints:
809
672
  logger.info("Trying to resume state", endpoint_id=endpoint_id)
810
- endpoint_record = get_endpoint_record(
673
+ endpoint_record = mlrun.model_monitoring.helpers.get_endpoint_record(
811
674
  project=self.project,
812
675
  endpoint_id=endpoint_id,
813
676
  )
@@ -853,36 +716,6 @@ def is_not_none(field: typing.Any, dict_path: list[str]):
853
716
  return False
854
717
 
855
718
 
856
- class FilterAndUnpackKeys(mlrun.feature_store.steps.MapClass):
857
- def __init__(self, keys, **kwargs):
858
- """
859
- Create unpacked event dictionary based on provided key metrics (base_metrics, endpoint_features,
860
- or custom_metric). Please note that the next step of the TSDB target requires an unpacked dictionary.
861
-
862
- :param keys: list of key metrics.
863
-
864
- :returns: An unpacked dictionary of event filtered by the provided key metrics.
865
- """
866
- super().__init__(**kwargs)
867
- self.keys = keys
868
-
869
- def do(self, event):
870
- # Keep only the relevant dictionary based on the provided keys
871
- new_event = {}
872
- for key in self.keys:
873
- if key in event:
874
- new_event[key] = event[key]
875
-
876
- # Create unpacked dictionary
877
- unpacked = {}
878
- for key in new_event.keys():
879
- if key in self.keys:
880
- unpacked = {**unpacked, **new_event[key]}
881
- else:
882
- unpacked[key] = new_event[key]
883
- return unpacked if unpacked else None
884
-
885
-
886
719
  class MapFeatureNames(mlrun.feature_store.steps.MapClass):
887
720
  def __init__(
888
721
  self,
@@ -940,7 +773,7 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
940
773
  label_values = event[EventFieldType.PREDICTION]
941
774
  # Get feature names and label columns
942
775
  if endpoint_id not in self.feature_names:
943
- endpoint_record = get_endpoint_record(
776
+ endpoint_record = mlrun.model_monitoring.helpers.get_endpoint_record(
944
777
  project=self.project,
945
778
  endpoint_id=endpoint_id,
946
779
  )
@@ -1118,6 +951,8 @@ class InferSchema(mlrun.feature_store.steps.MapClass):
1118
951
  def do(self, event: dict):
1119
952
  key_set = set(event.keys())
1120
953
  if not key_set.issubset(self.keys):
954
+ import mlrun.utils.v3io_clients
955
+
1121
956
  self.keys.update(key_set)
1122
957
  # Apply infer_schema on the kv table for generating the schema file
1123
958
  mlrun.utils.v3io_clients.get_frames_client(