mlrun 1.7.0rc5__py3-none-any.whl → 1.7.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mlrun might be problematic. Click here for more details.

Files changed (234) hide show
  1. mlrun/__init__.py +11 -1
  2. mlrun/__main__.py +39 -121
  3. mlrun/{datastore/helpers.py → alerts/__init__.py} +2 -5
  4. mlrun/alerts/alert.py +248 -0
  5. mlrun/api/schemas/__init__.py +4 -3
  6. mlrun/artifacts/__init__.py +8 -3
  7. mlrun/artifacts/base.py +39 -254
  8. mlrun/artifacts/dataset.py +9 -190
  9. mlrun/artifacts/manager.py +73 -46
  10. mlrun/artifacts/model.py +30 -158
  11. mlrun/artifacts/plots.py +23 -380
  12. mlrun/common/constants.py +73 -2
  13. mlrun/common/db/sql_session.py +3 -2
  14. mlrun/common/formatters/__init__.py +21 -0
  15. mlrun/common/formatters/artifact.py +46 -0
  16. mlrun/common/formatters/base.py +113 -0
  17. mlrun/common/formatters/feature_set.py +44 -0
  18. mlrun/common/formatters/function.py +46 -0
  19. mlrun/common/formatters/pipeline.py +53 -0
  20. mlrun/common/formatters/project.py +51 -0
  21. mlrun/common/formatters/run.py +29 -0
  22. mlrun/common/helpers.py +11 -1
  23. mlrun/{runtimes → common/runtimes}/constants.py +32 -4
  24. mlrun/common/schemas/__init__.py +21 -4
  25. mlrun/common/schemas/alert.py +202 -0
  26. mlrun/common/schemas/api_gateway.py +113 -2
  27. mlrun/common/schemas/artifact.py +28 -1
  28. mlrun/common/schemas/auth.py +11 -0
  29. mlrun/common/schemas/client_spec.py +2 -1
  30. mlrun/common/schemas/common.py +7 -4
  31. mlrun/common/schemas/constants.py +3 -0
  32. mlrun/common/schemas/feature_store.py +58 -28
  33. mlrun/common/schemas/frontend_spec.py +8 -0
  34. mlrun/common/schemas/function.py +11 -0
  35. mlrun/common/schemas/hub.py +7 -9
  36. mlrun/common/schemas/model_monitoring/__init__.py +21 -4
  37. mlrun/common/schemas/model_monitoring/constants.py +136 -42
  38. mlrun/common/schemas/model_monitoring/grafana.py +9 -5
  39. mlrun/common/schemas/model_monitoring/model_endpoints.py +89 -41
  40. mlrun/common/schemas/notification.py +69 -12
  41. mlrun/{runtimes/mpijob/v1alpha1.py → common/schemas/pagination.py} +10 -13
  42. mlrun/common/schemas/pipeline.py +7 -0
  43. mlrun/common/schemas/project.py +67 -16
  44. mlrun/common/schemas/runs.py +17 -0
  45. mlrun/common/schemas/schedule.py +1 -1
  46. mlrun/common/schemas/workflow.py +10 -2
  47. mlrun/common/types.py +14 -1
  48. mlrun/config.py +224 -58
  49. mlrun/data_types/data_types.py +11 -1
  50. mlrun/data_types/spark.py +5 -4
  51. mlrun/data_types/to_pandas.py +75 -34
  52. mlrun/datastore/__init__.py +8 -10
  53. mlrun/datastore/alibaba_oss.py +131 -0
  54. mlrun/datastore/azure_blob.py +131 -43
  55. mlrun/datastore/base.py +107 -47
  56. mlrun/datastore/datastore.py +17 -7
  57. mlrun/datastore/datastore_profile.py +91 -7
  58. mlrun/datastore/dbfs_store.py +3 -7
  59. mlrun/datastore/filestore.py +1 -3
  60. mlrun/datastore/google_cloud_storage.py +92 -32
  61. mlrun/datastore/hdfs.py +5 -0
  62. mlrun/datastore/inmem.py +6 -3
  63. mlrun/datastore/redis.py +3 -2
  64. mlrun/datastore/s3.py +30 -12
  65. mlrun/datastore/snowflake_utils.py +45 -0
  66. mlrun/datastore/sources.py +274 -59
  67. mlrun/datastore/spark_utils.py +30 -0
  68. mlrun/datastore/store_resources.py +9 -7
  69. mlrun/datastore/storeytargets.py +151 -0
  70. mlrun/datastore/targets.py +374 -102
  71. mlrun/datastore/utils.py +68 -5
  72. mlrun/datastore/v3io.py +28 -50
  73. mlrun/db/auth_utils.py +152 -0
  74. mlrun/db/base.py +231 -22
  75. mlrun/db/factory.py +1 -4
  76. mlrun/db/httpdb.py +864 -228
  77. mlrun/db/nopdb.py +268 -16
  78. mlrun/errors.py +35 -5
  79. mlrun/execution.py +111 -38
  80. mlrun/feature_store/__init__.py +0 -2
  81. mlrun/feature_store/api.py +46 -53
  82. mlrun/feature_store/common.py +6 -11
  83. mlrun/feature_store/feature_set.py +48 -23
  84. mlrun/feature_store/feature_vector.py +13 -2
  85. mlrun/feature_store/ingestion.py +7 -6
  86. mlrun/feature_store/retrieval/base.py +9 -4
  87. mlrun/feature_store/retrieval/dask_merger.py +2 -0
  88. mlrun/feature_store/retrieval/job.py +13 -4
  89. mlrun/feature_store/retrieval/local_merger.py +2 -0
  90. mlrun/feature_store/retrieval/spark_merger.py +24 -32
  91. mlrun/feature_store/steps.py +38 -19
  92. mlrun/features.py +6 -14
  93. mlrun/frameworks/_common/plan.py +3 -3
  94. mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +7 -12
  95. mlrun/frameworks/_ml_common/plan.py +1 -1
  96. mlrun/frameworks/auto_mlrun/auto_mlrun.py +2 -2
  97. mlrun/frameworks/lgbm/__init__.py +1 -1
  98. mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
  99. mlrun/frameworks/lgbm/model_handler.py +1 -1
  100. mlrun/frameworks/parallel_coordinates.py +4 -4
  101. mlrun/frameworks/pytorch/__init__.py +2 -2
  102. mlrun/frameworks/sklearn/__init__.py +1 -1
  103. mlrun/frameworks/sklearn/mlrun_interface.py +13 -3
  104. mlrun/frameworks/tf_keras/__init__.py +5 -2
  105. mlrun/frameworks/tf_keras/callbacks/logging_callback.py +1 -1
  106. mlrun/frameworks/tf_keras/mlrun_interface.py +2 -2
  107. mlrun/frameworks/xgboost/__init__.py +1 -1
  108. mlrun/k8s_utils.py +57 -12
  109. mlrun/launcher/__init__.py +1 -1
  110. mlrun/launcher/base.py +6 -5
  111. mlrun/launcher/client.py +13 -11
  112. mlrun/launcher/factory.py +1 -1
  113. mlrun/launcher/local.py +15 -5
  114. mlrun/launcher/remote.py +10 -3
  115. mlrun/lists.py +6 -2
  116. mlrun/model.py +297 -48
  117. mlrun/model_monitoring/__init__.py +1 -1
  118. mlrun/model_monitoring/api.py +152 -357
  119. mlrun/model_monitoring/applications/__init__.py +10 -0
  120. mlrun/model_monitoring/applications/_application_steps.py +190 -0
  121. mlrun/model_monitoring/applications/base.py +108 -0
  122. mlrun/model_monitoring/applications/context.py +341 -0
  123. mlrun/model_monitoring/{evidently_application.py → applications/evidently_base.py} +27 -22
  124. mlrun/model_monitoring/applications/histogram_data_drift.py +227 -91
  125. mlrun/model_monitoring/applications/results.py +99 -0
  126. mlrun/model_monitoring/controller.py +130 -303
  127. mlrun/model_monitoring/{stores/models/sqlite.py → db/__init__.py} +5 -10
  128. mlrun/model_monitoring/db/stores/__init__.py +136 -0
  129. mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
  130. mlrun/model_monitoring/db/stores/base/store.py +213 -0
  131. mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
  132. mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +71 -0
  133. mlrun/model_monitoring/db/stores/sqldb/models/base.py +190 -0
  134. mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +103 -0
  135. mlrun/model_monitoring/{stores/models/mysql.py → db/stores/sqldb/models/sqlite.py} +19 -13
  136. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +659 -0
  137. mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
  138. mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +726 -0
  139. mlrun/model_monitoring/db/tsdb/__init__.py +105 -0
  140. mlrun/model_monitoring/db/tsdb/base.py +448 -0
  141. mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
  142. mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
  143. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +298 -0
  144. mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +42 -0
  145. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +522 -0
  146. mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
  147. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +158 -0
  148. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +849 -0
  149. mlrun/model_monitoring/features_drift_table.py +34 -22
  150. mlrun/model_monitoring/helpers.py +177 -39
  151. mlrun/model_monitoring/model_endpoint.py +3 -2
  152. mlrun/model_monitoring/stream_processing.py +165 -398
  153. mlrun/model_monitoring/tracking_policy.py +7 -1
  154. mlrun/model_monitoring/writer.py +161 -125
  155. mlrun/package/packagers/default_packager.py +2 -2
  156. mlrun/package/packagers_manager.py +1 -0
  157. mlrun/package/utils/_formatter.py +2 -2
  158. mlrun/platforms/__init__.py +11 -10
  159. mlrun/platforms/iguazio.py +67 -228
  160. mlrun/projects/__init__.py +6 -1
  161. mlrun/projects/operations.py +47 -20
  162. mlrun/projects/pipelines.py +396 -249
  163. mlrun/projects/project.py +1125 -414
  164. mlrun/render.py +28 -22
  165. mlrun/run.py +207 -180
  166. mlrun/runtimes/__init__.py +76 -11
  167. mlrun/runtimes/base.py +40 -14
  168. mlrun/runtimes/daskjob.py +9 -2
  169. mlrun/runtimes/databricks_job/databricks_runtime.py +1 -0
  170. mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
  171. mlrun/runtimes/funcdoc.py +1 -29
  172. mlrun/runtimes/kubejob.py +34 -128
  173. mlrun/runtimes/local.py +39 -10
  174. mlrun/runtimes/mpijob/__init__.py +0 -20
  175. mlrun/runtimes/mpijob/abstract.py +8 -8
  176. mlrun/runtimes/mpijob/v1.py +1 -1
  177. mlrun/runtimes/nuclio/api_gateway.py +646 -177
  178. mlrun/runtimes/nuclio/application/__init__.py +15 -0
  179. mlrun/runtimes/nuclio/application/application.py +758 -0
  180. mlrun/runtimes/nuclio/application/reverse_proxy.go +95 -0
  181. mlrun/runtimes/nuclio/function.py +188 -68
  182. mlrun/runtimes/nuclio/serving.py +57 -60
  183. mlrun/runtimes/pod.py +191 -58
  184. mlrun/runtimes/remotesparkjob.py +11 -8
  185. mlrun/runtimes/sparkjob/spark3job.py +17 -18
  186. mlrun/runtimes/utils.py +40 -73
  187. mlrun/secrets.py +6 -2
  188. mlrun/serving/__init__.py +8 -1
  189. mlrun/serving/remote.py +2 -3
  190. mlrun/serving/routers.py +89 -64
  191. mlrun/serving/server.py +54 -26
  192. mlrun/serving/states.py +187 -56
  193. mlrun/serving/utils.py +19 -11
  194. mlrun/serving/v2_serving.py +136 -63
  195. mlrun/track/tracker.py +2 -1
  196. mlrun/track/trackers/mlflow_tracker.py +5 -0
  197. mlrun/utils/async_http.py +26 -6
  198. mlrun/utils/db.py +18 -0
  199. mlrun/utils/helpers.py +375 -105
  200. mlrun/utils/http.py +2 -2
  201. mlrun/utils/logger.py +75 -9
  202. mlrun/utils/notifications/notification/__init__.py +14 -10
  203. mlrun/utils/notifications/notification/base.py +48 -0
  204. mlrun/utils/notifications/notification/console.py +2 -0
  205. mlrun/utils/notifications/notification/git.py +24 -1
  206. mlrun/utils/notifications/notification/ipython.py +2 -0
  207. mlrun/utils/notifications/notification/slack.py +96 -21
  208. mlrun/utils/notifications/notification/webhook.py +63 -2
  209. mlrun/utils/notifications/notification_pusher.py +146 -16
  210. mlrun/utils/regex.py +9 -0
  211. mlrun/utils/retryer.py +3 -2
  212. mlrun/utils/v3io_clients.py +2 -3
  213. mlrun/utils/version/version.json +2 -2
  214. mlrun-1.7.2.dist-info/METADATA +390 -0
  215. mlrun-1.7.2.dist-info/RECORD +351 -0
  216. {mlrun-1.7.0rc5.dist-info → mlrun-1.7.2.dist-info}/WHEEL +1 -1
  217. mlrun/feature_store/retrieval/conversion.py +0 -271
  218. mlrun/kfpops.py +0 -868
  219. mlrun/model_monitoring/application.py +0 -310
  220. mlrun/model_monitoring/batch.py +0 -974
  221. mlrun/model_monitoring/controller_handler.py +0 -37
  222. mlrun/model_monitoring/prometheus.py +0 -216
  223. mlrun/model_monitoring/stores/__init__.py +0 -111
  224. mlrun/model_monitoring/stores/kv_model_endpoint_store.py +0 -574
  225. mlrun/model_monitoring/stores/model_endpoint_store.py +0 -145
  226. mlrun/model_monitoring/stores/models/__init__.py +0 -27
  227. mlrun/model_monitoring/stores/models/base.py +0 -84
  228. mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -382
  229. mlrun/platforms/other.py +0 -305
  230. mlrun-1.7.0rc5.dist-info/METADATA +0 -269
  231. mlrun-1.7.0rc5.dist-info/RECORD +0 -323
  232. {mlrun-1.7.0rc5.dist-info → mlrun-1.7.2.dist-info}/LICENSE +0 -0
  233. {mlrun-1.7.0rc5.dist-info → mlrun-1.7.2.dist-info}/entry_points.txt +0 -0
  234. {mlrun-1.7.0rc5.dist-info → mlrun-1.7.2.dist-info}/top_level.txt +0 -0
@@ -24,11 +24,11 @@ import mlrun
24
24
  import mlrun.common.model_monitoring.helpers
25
25
  import mlrun.config
26
26
  import mlrun.datastore.targets
27
+ import mlrun.feature_store as fstore
27
28
  import mlrun.feature_store.steps
28
- import mlrun.model_monitoring.prometheus
29
+ import mlrun.model_monitoring.db
29
30
  import mlrun.serving.states
30
31
  import mlrun.utils
31
- import mlrun.utils.v3io_clients
32
32
  from mlrun.common.schemas.model_monitoring.constants import (
33
33
  EventFieldType,
34
34
  EventKeyMetrics,
@@ -37,6 +37,7 @@ from mlrun.common.schemas.model_monitoring.constants import (
37
37
  ModelEndpointTarget,
38
38
  ProjectSecretKeys,
39
39
  )
40
+ from mlrun.model_monitoring.db import StoreBase, TSDBConnector
40
41
  from mlrun.utils import logger
41
42
 
42
43
 
@@ -48,14 +49,12 @@ class EventStreamProcessor:
48
49
  parquet_batching_max_events: int,
49
50
  parquet_batching_timeout_secs: int,
50
51
  parquet_target: str,
51
- sample_window: int = 10,
52
52
  aggregate_windows: typing.Optional[list[str]] = None,
53
- aggregate_period: str = "30s",
53
+ aggregate_period: str = "5m",
54
54
  model_monitoring_access_key: str = None,
55
55
  ):
56
56
  # General configurations, mainly used for the storey steps in the future serving graph
57
57
  self.project = project
58
- self.sample_window = sample_window
59
58
  self.aggregate_windows = aggregate_windows or ["5m", "1h"]
60
59
  self.aggregate_period = aggregate_period
61
60
 
@@ -64,10 +63,6 @@ class EventStreamProcessor:
64
63
  self.parquet_batching_max_events = parquet_batching_max_events
65
64
  self.parquet_batching_timeout_secs = parquet_batching_timeout_secs
66
65
 
67
- self.model_endpoint_store_target = (
68
- mlrun.mlconf.model_endpoint_monitoring.store_type
69
- )
70
-
71
66
  logger.info(
72
67
  "Initializing model monitoring event stream processor",
73
68
  parquet_path=self.parquet_path,
@@ -75,6 +70,7 @@ class EventStreamProcessor:
75
70
  )
76
71
 
77
72
  self.storage_options = None
73
+ self.tsdb_configurations = {}
78
74
  if not mlrun.mlconf.is_ce_mode():
79
75
  self._initialize_v3io_configurations(
80
76
  model_monitoring_access_key=model_monitoring_access_key
@@ -133,78 +129,83 @@ class EventStreamProcessor:
133
129
  self.tsdb_batching_max_events = tsdb_batching_max_events
134
130
  self.tsdb_batching_timeout_secs = tsdb_batching_timeout_secs
135
131
 
136
- def apply_monitoring_serving_graph(self, fn: mlrun.runtimes.ServingRuntime) -> None:
132
+ def apply_monitoring_serving_graph(
133
+ self,
134
+ fn: mlrun.runtimes.ServingRuntime,
135
+ tsdb_connector: TSDBConnector,
136
+ endpoint_store: StoreBase,
137
+ ) -> None:
137
138
  """
138
- Apply monitoring serving graph to a given serving function. The following serving graph includes about 20 steps
139
- of different operations that are executed on the events from the model server. Each event has
140
- metadata (function_uri, timestamp, class, etc.) but also inputs and predictions from the model server.
141
- Throughout the serving graph, the results are written to 3 different databases:
142
- 1. KV/SQL (steps 9-11): Stores metadata and stats about the average latency and the amount of predictions over
143
- time per endpoint. for example the amount of predictions of endpoint x in the last 5 min. This data is used
144
- by the monitoring dashboards in grafana. The model endpoints table also contains data on the model endpoint
145
- from other processes, such as current_stats that is being calculated by the monitoring batch job
146
- process. If the target is from type KV, then the model endpoints table can be found under
147
- v3io:///users/pipelines/project-name/model-endpoints/endpoints/. If the target is SQL, then the table
148
- is stored within the database that was defined in the provided connection string and can be found
149
- under mlrun.mlconf.model_endpoint_monitoring.endpoint_store_connection.
150
- 2. V3IO TSDB/Prometheus (steps 13-21): Stores live data of different key metric dictionaries in tsdb target.
151
- This data is being used by the monitoring dashboards in grafana. If using V3IO TSDB (steps 13-19), results
139
+ Apply monitoring serving graph to a given serving function. The following serving graph includes about 4 main
140
+ parts, each of which includes several steps of different operations that are executed on the events from
141
+ the model server.
142
+ Each event has metadata (function_uri, timestamp, class, etc.) but also inputs, predictions and optional
143
+ metrics from the model server.
144
+ In the first part, the serving graph processes the event and splits it into sub-events. This part also includes
145
+ validation of the event data and adding important details to the event such as endpoint_id.
146
+ In the next parts, the serving graph stores data to 3 different targets:
147
+ 1. KV/SQL: Metadata and basic stats about the average latency and the amount of predictions over
148
+ time per endpoint, for example the amount of predictions of endpoint x in the last 5 min. The model
149
+ endpoints table also contains data on the model endpoint from other processes, such as feature_stats that
150
+ represents sample statistics from the training data. If the target is from type KV, then the model endpoints
151
+ table can be found under v3io:///users/pipelines/project-name/model-endpoints/endpoints/. If the target is
152
+ SQL, then the table is stored within the database that was defined in the provided connection string.
153
+ 2. TSDB: live data of different key metric dictionaries in tsdb target.
154
+ This data is being used by the monitoring dashboards in grafana. If using V3IO TSDB, results
152
155
  can be found under v3io:///users/pipelines/project-name/model-endpoints/events/. In that case, we generate
153
156
  3 different key metric dictionaries: base_metrics (average latency and predictions over time),
154
157
  endpoint_features (Prediction and feature names and values), and custom_metrics (user-defined metrics).
155
- If using Prometheus (steps 20-21), we update metrics in the Prometheus registry that is stored in the
156
- monitoring stream local memory.
157
- 3. Parquet (steps 22-23): This Parquet file includes the required data for the model monitoring batch job
158
- that run every hour by default. If defined, the parquet target path can be found under
159
- mlrun.mlconf.model_endpoint_monitoring.offline. Otherwise, the default parquet path is under
160
- mlrun.mlconf.model_endpoint_monitoring.user_space.
158
+ 3. Parquet: This Parquet file includes the required data for the model monitoring applications. If defined,
159
+ the parquet target path can be found under mlrun.mlconf.model_endpoint_monitoring.offline. Otherwise,
160
+ the default parquet path is under mlrun.mlconf.model_endpoint_monitoring.user_space. Note that if you are
161
+ using CE, the parquet target path is based on the defined MLRun artifact path.
161
162
 
162
163
  :param fn: A serving function.
164
+ :param tsdb_connector: Time series database connector.
165
+ :param endpoint_store: KV/SQL store used for endpoint data.
163
166
  """
164
167
 
165
168
  graph = typing.cast(
166
169
  mlrun.serving.states.RootFlowStep,
167
170
  fn.set_topology(mlrun.serving.states.StepKinds.flow),
168
171
  )
172
+ graph.add_step(
173
+ "ExtractEndpointID",
174
+ "extract_endpoint",
175
+ full_event=True,
176
+ )
169
177
 
170
- # Step 1 - Event routing based on the provided path
171
- def apply_event_routing():
172
- typing.cast(
173
- mlrun.serving.TaskStep,
174
- graph.add_step(
175
- "EventRouting",
176
- full_event=True,
177
- project=self.project,
178
- ),
179
- ).respond()
180
-
181
- apply_event_routing()
178
+ # split the graph between event with error vs valid event
179
+ graph.add_step(
180
+ "storey.Filter",
181
+ "FilterError",
182
+ after="extract_endpoint",
183
+ _fn="(event.get('error') is None)",
184
+ )
182
185
 
183
- # Step 2 - Filter out events with '-' in the path basename from going forward
184
- # through the next steps of the stream graph
185
- def apply_storey_filter_stream_events():
186
- # Remove none values from each event
187
- graph.add_step(
188
- "storey.Filter",
189
- "filter_stream_event",
190
- _fn="('-' not in event.path.split('/')[-1])",
191
- full_event=True,
192
- )
186
+ graph.add_step(
187
+ "storey.Filter",
188
+ "ForwardError",
189
+ after="extract_endpoint",
190
+ _fn="(event.get('error') is not None)",
191
+ )
193
192
 
194
- apply_storey_filter_stream_events()
193
+ tsdb_connector.handle_model_error(
194
+ graph,
195
+ )
195
196
 
196
- # Step 3 - Process endpoint event: splitting into sub-events and validate event data
197
+ # Process endpoint event: splitting into sub-events and validate event data
197
198
  def apply_process_endpoint_event():
198
199
  graph.add_step(
199
200
  "ProcessEndpointEvent",
201
+ after="extract_endpoint", # TODO: change this to FilterError in ML-7456
200
202
  full_event=True,
201
203
  project=self.project,
202
- after="filter_stream_event",
203
204
  )
204
205
 
205
206
  apply_process_endpoint_event()
206
207
 
207
- # Steps 4,5 - Applying Storey operations of filtering and flatten
208
+ # Applying Storey operations of filtering and flatten
208
209
  def apply_storey_filter_and_flatmap():
209
210
  # Remove none values from each event
210
211
  graph.add_step(
@@ -221,7 +222,7 @@ class EventStreamProcessor:
221
222
 
222
223
  apply_storey_filter_and_flatmap()
223
224
 
224
- # Step 6 - Validating feature names and map each feature to its value
225
+ # Validating feature names and map each feature to its value
225
226
  def apply_map_feature_names():
226
227
  graph.add_step(
227
228
  "MapFeatureNames",
@@ -233,9 +234,9 @@ class EventStreamProcessor:
233
234
 
234
235
  apply_map_feature_names()
235
236
 
236
- # Step 7 - Calculate number of predictions and average latency
237
+ # Calculate number of predictions and average latency
237
238
  def apply_storey_aggregations():
238
- # Step 7.1 - Calculate number of predictions for each window (5 min and 1 hour by default)
239
+ # Calculate number of predictions for each window (5 min and 1 hour by default)
239
240
  graph.add_step(
240
241
  class_name="storey.AggregateByKey",
241
242
  aggregates=[
@@ -253,7 +254,7 @@ class EventStreamProcessor:
253
254
  table=".",
254
255
  key_field=EventFieldType.ENDPOINT_ID,
255
256
  )
256
- # Step 7.2 - Calculate average latency time for each window (5 min and 1 hour by default)
257
+ # Calculate average latency time for each window (5 min and 1 hour by default)
257
258
  graph.add_step(
258
259
  class_name="storey.Rename",
259
260
  mapping={
@@ -266,8 +267,8 @@ class EventStreamProcessor:
266
267
 
267
268
  apply_storey_aggregations()
268
269
 
269
- # Steps 8-10 - KV/SQL branch
270
- # Step 8 - Filter relevant keys from the event before writing the data into the database table
270
+ # KV/SQL branch
271
+ # Filter relevant keys from the event before writing the data into the database table
271
272
  def apply_process_before_endpoint_update():
272
273
  graph.add_step(
273
274
  "ProcessBeforeEndpointUpdate",
@@ -277,7 +278,7 @@ class EventStreamProcessor:
277
278
 
278
279
  apply_process_before_endpoint_update()
279
280
 
280
- # Step 9 - Write the filtered event to KV/SQL table. At this point, the serving graph updates the stats
281
+ # Write the filtered event to KV/SQL table. At this point, the serving graph updates the stats
281
282
  # about average latency and the amount of predictions over time
282
283
  def apply_update_endpoint():
283
284
  graph.add_step(
@@ -285,12 +286,11 @@ class EventStreamProcessor:
285
286
  name="UpdateEndpoint",
286
287
  after="ProcessBeforeEndpointUpdate",
287
288
  project=self.project,
288
- model_endpoint_store_target=self.model_endpoint_store_target,
289
289
  )
290
290
 
291
291
  apply_update_endpoint()
292
292
 
293
- # Step 10 (only for KV target) - Apply infer_schema on the model endpoints table for generating schema file
293
+ # (only for V3IO KV target) - Apply infer_schema on the model endpoints table for generating schema file
294
294
  # which will be used by Grafana monitoring dashboards
295
295
  def apply_infer_schema():
296
296
  graph.add_step(
@@ -302,120 +302,13 @@ class EventStreamProcessor:
302
302
  table=self.kv_path,
303
303
  )
304
304
 
305
- if self.model_endpoint_store_target == ModelEndpointTarget.V3IO_NOSQL:
305
+ if endpoint_store.type == ModelEndpointTarget.V3IO_NOSQL:
306
306
  apply_infer_schema()
307
307
 
308
- # Step 11 - Emits the event in window size of events based on sample_window size (10 by default)
309
- def apply_storey_sample_window():
310
- graph.add_step(
311
- "storey.steps.SampleWindow",
312
- name="sample",
313
- after="Rename",
314
- window_size=self.sample_window,
315
- key=EventFieldType.ENDPOINT_ID,
316
- )
317
-
318
- apply_storey_sample_window()
319
-
320
- # Steps 12-19 - TSDB branch (skip to Prometheus if in CE env)
321
- # Steps 20-21 - Prometheus branch
322
- if not mlrun.mlconf.is_ce_mode():
323
- # TSDB branch
324
-
325
- # Step 12 - Before writing data to TSDB, create dictionary of 2-3 dictionaries that contains
326
- # stats and details about the events
327
- def apply_process_before_tsdb():
328
- graph.add_step(
329
- "ProcessBeforeTSDB", name="ProcessBeforeTSDB", after="sample"
330
- )
331
-
332
- apply_process_before_tsdb()
333
-
334
- # Steps 13-19: - Unpacked keys from each dictionary and write to TSDB target
335
- def apply_filter_and_unpacked_keys(name, keys):
336
- graph.add_step(
337
- "FilterAndUnpackKeys",
338
- name=name,
339
- after="ProcessBeforeTSDB",
340
- keys=[keys],
341
- )
342
-
343
- def apply_tsdb_target(name, after):
344
- graph.add_step(
345
- "storey.TSDBTarget",
346
- name=name,
347
- after=after,
348
- path=self.tsdb_path,
349
- rate="10/m",
350
- time_col=EventFieldType.TIMESTAMP,
351
- container=self.tsdb_container,
352
- access_key=self.v3io_access_key,
353
- v3io_frames=self.v3io_framesd,
354
- infer_columns_from_data=True,
355
- index_cols=[
356
- EventFieldType.ENDPOINT_ID,
357
- EventFieldType.RECORD_TYPE,
358
- EventFieldType.ENDPOINT_TYPE,
359
- ],
360
- max_events=self.tsdb_batching_max_events,
361
- flush_after_seconds=self.tsdb_batching_timeout_secs,
362
- key=EventFieldType.ENDPOINT_ID,
363
- )
364
-
365
- # Steps 13-14 - unpacked base_metrics dictionary
366
- apply_filter_and_unpacked_keys(
367
- name="FilterAndUnpackKeys1",
368
- keys=EventKeyMetrics.BASE_METRICS,
369
- )
370
- apply_tsdb_target(name="tsdb1", after="FilterAndUnpackKeys1")
371
-
372
- # Steps 15-16 - unpacked endpoint_features dictionary
373
- apply_filter_and_unpacked_keys(
374
- name="FilterAndUnpackKeys2",
375
- keys=EventKeyMetrics.ENDPOINT_FEATURES,
376
- )
377
- apply_tsdb_target(name="tsdb2", after="FilterAndUnpackKeys2")
378
-
379
- # Steps 17-19 - unpacked custom_metrics dictionary. In addition, use storey.Filter remove none values
380
- apply_filter_and_unpacked_keys(
381
- name="FilterAndUnpackKeys3",
382
- keys=EventKeyMetrics.CUSTOM_METRICS,
383
- )
384
-
385
- def apply_storey_filter():
386
- graph.add_step(
387
- "storey.Filter",
388
- "FilterNotNone",
389
- after="FilterAndUnpackKeys3",
390
- _fn="(event is not None)",
391
- )
392
-
393
- apply_storey_filter()
394
- apply_tsdb_target(name="tsdb3", after="FilterNotNone")
395
- else:
396
- # Prometheus branch
397
-
398
- # Step 20 - Increase the prediction counter by 1 and update the latency value
399
- graph.add_step(
400
- "IncCounter",
401
- name="IncCounter",
402
- after="MapFeatureNames",
403
- project=self.project,
404
- )
405
-
406
- # Step 21 - Record a sample of features and labels
407
- def apply_record_features_to_prometheus():
408
- graph.add_step(
409
- "RecordFeatures",
410
- name="RecordFeaturesToPrometheus",
411
- after="sample",
412
- project=self.project,
413
- )
414
-
415
- apply_record_features_to_prometheus()
308
+ tsdb_connector.apply_monitoring_stream_steps(graph=graph)
416
309
 
417
- # Steps 22-23 - Parquet branch
418
- # Step 22 - Filter and validate different keys before writing the data to Parquet target
310
+ # Parquet branch
311
+ # Filter and validate different keys before writing the data to Parquet target
419
312
  def apply_process_before_parquet():
420
313
  graph.add_step(
421
314
  "ProcessBeforeParquet",
@@ -426,7 +319,7 @@ class EventStreamProcessor:
426
319
 
427
320
  apply_process_before_parquet()
428
321
 
429
- # Step 23 - Write the Parquet target file, partitioned by key (endpoint_id) and time.
322
+ # Write the Parquet target file, partitioned by key (endpoint_id) and time.
430
323
  def apply_parquet_target():
431
324
  graph.add_step(
432
325
  "storey.ParquetTarget",
@@ -441,6 +334,7 @@ class EventStreamProcessor:
441
334
  index_cols=[EventFieldType.ENDPOINT_ID],
442
335
  key_bucketing_number=0,
443
336
  time_partitioning_granularity="hour",
337
+ time_field=EventFieldType.TIMESTAMP,
444
338
  partition_cols=["$key", "$year", "$month", "$day", "$hour"],
445
339
  )
446
340
 
@@ -500,74 +394,36 @@ class ProcessBeforeEndpointUpdate(mlrun.feature_store.steps.MapClass):
500
394
  return e
501
395
 
502
396
 
503
- class ProcessBeforeTSDB(mlrun.feature_store.steps.MapClass):
504
- def __init__(self, **kwargs):
397
+ class ExtractEndpointID(mlrun.feature_store.steps.MapClass):
398
+ def __init__(self, **kwargs) -> None:
505
399
  """
506
- Process the data before writing to TSDB. This step creates a dictionary that includes 3 different dictionaries
507
- that each one of them contains important details and stats about the events:
508
- 1. base_metrics: stats about the average latency and the amount of predictions over time. It is based on
509
- storey.AggregateByKey which was executed in step 5.
510
- 2. endpoint_features: feature names and values along with the prediction names and value.
511
- 3. custom_metric (opt): optional metrics provided by the user.
512
-
513
- :returns: Dictionary of 2-3 dictionaries that contains stats and details about the events.
514
-
400
+ Generate the model endpoint ID based on the event parameters and attach it to the event.
515
401
  """
516
402
  super().__init__(**kwargs)
517
403
 
518
- def do(self, event):
519
- # Compute prediction per second
520
- event[EventLiveStats.PREDICTIONS_PER_SECOND] = (
521
- float(event[EventLiveStats.PREDICTIONS_COUNT_5M]) / 300
522
- )
523
- base_fields = [
524
- EventFieldType.TIMESTAMP,
525
- EventFieldType.ENDPOINT_ID,
526
- EventFieldType.ENDPOINT_TYPE,
527
- ]
404
+ def do(self, full_event) -> typing.Union[storey.Event, None]:
405
+ # Getting model version and function uri from event
406
+ # and use them for retrieving the endpoint_id
407
+ function_uri = full_event.body.get(EventFieldType.FUNCTION_URI)
408
+ if not is_not_none(function_uri, [EventFieldType.FUNCTION_URI]):
409
+ return None
528
410
 
529
- # Getting event timestamp and endpoint_id
530
- base_event = {k: event[k] for k in base_fields}
531
-
532
- # base_metrics includes the stats about the average latency and the amount of predictions over time
533
- base_metrics = {
534
- EventFieldType.RECORD_TYPE: EventKeyMetrics.BASE_METRICS,
535
- EventLiveStats.PREDICTIONS_PER_SECOND: event[
536
- EventLiveStats.PREDICTIONS_PER_SECOND
537
- ],
538
- EventLiveStats.PREDICTIONS_COUNT_5M: event[
539
- EventLiveStats.PREDICTIONS_COUNT_5M
540
- ],
541
- EventLiveStats.PREDICTIONS_COUNT_1H: event[
542
- EventLiveStats.PREDICTIONS_COUNT_1H
543
- ],
544
- EventLiveStats.LATENCY_AVG_5M: event[EventLiveStats.LATENCY_AVG_5M],
545
- EventLiveStats.LATENCY_AVG_1H: event[EventLiveStats.LATENCY_AVG_1H],
546
- **base_event,
547
- }
411
+ model = full_event.body.get(EventFieldType.MODEL)
412
+ if not is_not_none(model, [EventFieldType.MODEL]):
413
+ return None
548
414
 
549
- # endpoint_features includes the event values of each feature and prediction
550
- endpoint_features = {
551
- EventFieldType.RECORD_TYPE: EventKeyMetrics.ENDPOINT_FEATURES,
552
- **event[EventFieldType.NAMED_PREDICTIONS],
553
- **event[EventFieldType.NAMED_FEATURES],
554
- **base_event,
555
- }
556
- # Create a dictionary that includes both base_metrics and endpoint_features
557
- processed = {
558
- EventKeyMetrics.BASE_METRICS: base_metrics,
559
- EventKeyMetrics.ENDPOINT_FEATURES: endpoint_features,
560
- }
415
+ version = full_event.body.get(EventFieldType.VERSION)
416
+ versioned_model = f"{model}:{version}" if version else f"{model}:latest"
561
417
 
562
- # If metrics provided, add another dictionary if custom_metrics values
563
- if event[EventFieldType.METRICS]:
564
- processed[EventKeyMetrics.CUSTOM_METRICS] = {
565
- EventFieldType.RECORD_TYPE: EventKeyMetrics.CUSTOM_METRICS,
566
- **event[EventFieldType.METRICS],
567
- **base_event,
568
- }
418
+ endpoint_id = mlrun.common.model_monitoring.create_model_endpoint_uid(
419
+ function_uri=function_uri,
420
+ versioned_model=versioned_model,
421
+ )
569
422
 
570
- return processed
423
+ endpoint_id = str(endpoint_id)
424
+ full_event.body[EventFieldType.ENDPOINT_ID] = endpoint_id
425
+ full_event.body[EventFieldType.VERSIONED_MODEL] = versioned_model
426
+ return full_event
571
427
 
572
428
 
573
429
  class ProcessBeforeParquet(mlrun.feature_store.steps.MapClass):
@@ -587,6 +443,8 @@ class ProcessBeforeParquet(mlrun.feature_store.steps.MapClass):
587
443
  for key in [
588
444
  EventFieldType.FEATURES,
589
445
  EventFieldType.NAMED_FEATURES,
446
+ EventFieldType.PREDICTION,
447
+ EventFieldType.NAMED_PREDICTIONS,
590
448
  ]:
591
449
  event.pop(key, None)
592
450
 
@@ -641,28 +499,9 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
641
499
  def do(self, full_event):
642
500
  event = full_event.body
643
501
 
644
- # Getting model version and function uri from event
645
- # and use them for retrieving the endpoint_id
646
- function_uri = event.get(EventFieldType.FUNCTION_URI)
647
- if not is_not_none(function_uri, [EventFieldType.FUNCTION_URI]):
648
- return None
649
-
650
- model = event.get(EventFieldType.MODEL)
651
- if not is_not_none(model, [EventFieldType.MODEL]):
652
- return None
653
-
654
- version = event.get(EventFieldType.VERSION)
655
- versioned_model = f"{model}:{version}" if version else f"{model}:latest"
656
-
657
- endpoint_id = mlrun.common.model_monitoring.create_model_endpoint_uid(
658
- function_uri=function_uri,
659
- versioned_model=versioned_model,
660
- )
661
-
662
- endpoint_id = str(endpoint_id)
663
-
664
- event[EventFieldType.VERSIONED_MODEL] = versioned_model
665
- event[EventFieldType.ENDPOINT_ID] = endpoint_id
502
+ versioned_model = event[EventFieldType.VERSIONED_MODEL]
503
+ endpoint_id = event[EventFieldType.ENDPOINT_ID]
504
+ function_uri = event[EventFieldType.FUNCTION_URI]
666
505
 
667
506
  # In case this process fails, resume state from existing record
668
507
  self.resume_state(endpoint_id)
@@ -670,13 +509,8 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
670
509
  # If error key has been found in the current event,
671
510
  # increase the error counter by 1 and raise the error description
672
511
  error = event.get("error")
673
- if error:
512
+ if error: # TODO: delete this in ML-7456
674
513
  self.error_count[endpoint_id] += 1
675
- mlrun.model_monitoring.prometheus.write_errors(
676
- project=self.project,
677
- endpoint_id=event["endpoint_id"],
678
- model_name=event["model"],
679
- )
680
514
  raise mlrun.errors.MLRunInvalidArgumentError(str(error))
681
515
 
682
516
  # Validate event fields
@@ -743,6 +577,26 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
743
577
 
744
578
  # Separate each model invocation into sub events that will be stored as dictionary
745
579
  # in list of events. This list will be used as the body for the storey event.
580
+ if not isinstance(features, list):
581
+ raise mlrun.errors.MLRunInvalidArgumentError(
582
+ "Model's inputs must be a list"
583
+ )
584
+ features = (
585
+ features
586
+ if not any(not isinstance(feat, list) for feat in features)
587
+ else [features]
588
+ )
589
+ if not isinstance(predictions, list):
590
+ predictions = [[predictions]]
591
+ elif isinstance(predictions, list) and len(predictions) == len(features):
592
+ pass # predictions are already in the right format
593
+ else:
594
+ predictions = (
595
+ predictions
596
+ if not any(not isinstance(pred, list) for pred in predictions)
597
+ else [predictions]
598
+ )
599
+
746
600
  events = []
747
601
  for i, (feature, prediction) in enumerate(zip(features, predictions)):
748
602
  if not isinstance(prediction, list):
@@ -764,6 +618,9 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
764
618
  EventFieldType.PREDICTION: prediction,
765
619
  EventFieldType.FIRST_REQUEST: self.first_request[endpoint_id],
766
620
  EventFieldType.LAST_REQUEST: self.last_request[endpoint_id],
621
+ EventFieldType.LAST_REQUEST_TIMESTAMP: mlrun.utils.enrich_datetime_with_tz_info(
622
+ self.last_request[endpoint_id]
623
+ ).timestamp(),
767
624
  EventFieldType.ERROR_COUNT: self.error_count[endpoint_id],
768
625
  EventFieldType.LABELS: event.get(EventFieldType.LABELS, {}),
769
626
  EventFieldType.METRICS: event.get(EventFieldType.METRICS, {}),
@@ -802,7 +659,7 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
802
659
  # left them
803
660
  if endpoint_id not in self.endpoints:
804
661
  logger.info("Trying to resume state", endpoint_id=endpoint_id)
805
- endpoint_record = get_endpoint_record(
662
+ endpoint_record = mlrun.model_monitoring.helpers.get_endpoint_record(
806
663
  project=self.project,
807
664
  endpoint_id=endpoint_id,
808
665
  )
@@ -848,36 +705,6 @@ def is_not_none(field: typing.Any, dict_path: list[str]):
848
705
  return False
849
706
 
850
707
 
851
- class FilterAndUnpackKeys(mlrun.feature_store.steps.MapClass):
852
- def __init__(self, keys, **kwargs):
853
- """
854
- Create unpacked event dictionary based on provided key metrics (base_metrics, endpoint_features,
855
- or custom_metric). Please note that the next step of the TSDB target requires an unpacked dictionary.
856
-
857
- :param keys: list of key metrics.
858
-
859
- :returns: An unpacked dictionary of event filtered by the provided key metrics.
860
- """
861
- super().__init__(**kwargs)
862
- self.keys = keys
863
-
864
- def do(self, event):
865
- # Keep only the relevant dictionary based on the provided keys
866
- new_event = {}
867
- for key in self.keys:
868
- if key in event:
869
- new_event[key] = event[key]
870
-
871
- # Create unpacked dictionary
872
- unpacked = {}
873
- for key in new_event.keys():
874
- if key in self.keys:
875
- unpacked = {**unpacked, **new_event[key]}
876
- else:
877
- unpacked[key] = new_event[key]
878
- return unpacked if unpacked else None
879
-
880
-
881
708
  class MapFeatureNames(mlrun.feature_store.steps.MapClass):
882
709
  def __init__(
883
710
  self,
@@ -931,9 +758,17 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
931
758
  def do(self, event: dict):
932
759
  endpoint_id = event[EventFieldType.ENDPOINT_ID]
933
760
 
761
+ feature_values = event[EventFieldType.FEATURES]
762
+ label_values = event[EventFieldType.PREDICTION]
763
+
764
+ for index in range(len(feature_values)):
765
+ feature_value = feature_values[index]
766
+ if isinstance(feature_value, int):
767
+ feature_values[index] = float(feature_value)
768
+
934
769
  # Get feature names and label columns
935
770
  if endpoint_id not in self.feature_names:
936
- endpoint_record = get_endpoint_record(
771
+ endpoint_record = mlrun.model_monitoring.helpers.get_endpoint_record(
937
772
  project=self.project,
938
773
  endpoint_id=endpoint_id,
939
774
  )
@@ -966,6 +801,12 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
966
801
  },
967
802
  )
968
803
 
804
+ update_monitoring_feature_set(
805
+ endpoint_record=endpoint_record,
806
+ feature_names=feature_names,
807
+ feature_values=feature_values,
808
+ )
809
+
969
810
  # Similar process with label columns
970
811
  if not label_columns and self._infer_columns_from_data:
971
812
  label_columns = self._infer_label_columns_from_data(event)
@@ -984,6 +825,11 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
984
825
  endpoint_id=endpoint_id,
985
826
  attributes={EventFieldType.LABEL_NAMES: json.dumps(label_columns)},
986
827
  )
828
+ update_monitoring_feature_set(
829
+ endpoint_record=endpoint_record,
830
+ feature_names=label_columns,
831
+ feature_values=label_values,
832
+ )
987
833
 
988
834
  self.label_columns[endpoint_id] = label_columns
989
835
  self.feature_names[endpoint_id] = feature_names
@@ -1001,7 +847,6 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
1001
847
 
1002
848
  # Add feature_name:value pairs along with a mapping dictionary of all of these pairs
1003
849
  feature_names = self.feature_names[endpoint_id]
1004
- feature_values = event[EventFieldType.FEATURES]
1005
850
  self._map_dictionary_values(
1006
851
  event=event,
1007
852
  named_iters=feature_names,
@@ -1011,7 +856,6 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
1011
856
 
1012
857
  # Add label_name:value pairs along with a mapping dictionary of all of these pairs
1013
858
  label_names = self.label_columns[endpoint_id]
1014
- label_values = event[EventFieldType.PREDICTION]
1015
859
  self._map_dictionary_values(
1016
860
  event=event,
1017
861
  named_iters=label_names,
@@ -1052,7 +896,7 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
1052
896
 
1053
897
 
1054
898
  class UpdateEndpoint(mlrun.feature_store.steps.MapClass):
1055
- def __init__(self, project: str, model_endpoint_store_target: str, **kwargs):
899
+ def __init__(self, project: str, **kwargs):
1056
900
  """
1057
901
  Update the model endpoint record in the DB. Note that the event at this point includes metadata and stats about
1058
902
  the average latency and the amount of predictions over time. This data will be used in the monitoring dashboards
@@ -1062,9 +906,11 @@ class UpdateEndpoint(mlrun.feature_store.steps.MapClass):
1062
906
  """
1063
907
  super().__init__(**kwargs)
1064
908
  self.project = project
1065
- self.model_endpoint_store_target = model_endpoint_store_target
1066
909
 
1067
910
  def do(self, event: dict):
911
+ # Remove labels from the event
912
+ event.pop(EventFieldType.LABELS)
913
+
1068
914
  update_endpoint_record(
1069
915
  project=self.project,
1070
916
  endpoint_id=event.pop(EventFieldType.ENDPOINT_ID),
@@ -1102,6 +948,8 @@ class InferSchema(mlrun.feature_store.steps.MapClass):
1102
948
  def do(self, event: dict):
1103
949
  key_set = set(event.keys())
1104
950
  if not key_set.issubset(self.keys):
951
+ import mlrun.utils.v3io_clients
952
+
1105
953
  self.keys.update(key_set)
1106
954
  # Apply infer_schema on the kv table for generating the schema file
1107
955
  mlrun.utils.v3io_clients.get_frames_client(
@@ -1112,104 +960,12 @@ class InferSchema(mlrun.feature_store.steps.MapClass):
1112
960
  return event
1113
961
 
1114
962
 
1115
- class EventRouting(mlrun.feature_store.steps.MapClass):
1116
- """
1117
- Router the event according to the configured path under event.path. Please note that this step returns the result
1118
- to the caller. At the moment there are several paths:
1119
-
1120
- - /model-monitoring-metrics (GET): return Prometheus registry results as a text. Will be used by Prometheus client
1121
- to scrape the results from the monitoring stream memory.
1122
-
1123
- - /monitoring-batch-metrics (POST): update the Prometheus registry with the provided statistical metrics such as the
1124
- statistical metrics from the monitoring batch job. Note that the event body is a list of dictionaries of different
1125
- metrics.
1126
-
1127
- - /monitoring-drift-status (POST): update the Prometheus registry with the provided model drift status.
1128
-
1129
- """
1130
-
1131
- def __init__(
1132
- self,
1133
- project: str,
1134
- **kwargs,
1135
- ):
1136
- super().__init__(**kwargs)
1137
- self.project: str = project
1138
-
1139
- def do(self, event):
1140
- if event.path == "/model-monitoring-metrics":
1141
- # Return a parsed Prometheus registry file
1142
- event.body = mlrun.model_monitoring.prometheus.get_registry()
1143
- elif event.path == "/monitoring-batch-metrics":
1144
- # Update statistical metrics
1145
- for event_metric in event.body:
1146
- mlrun.model_monitoring.prometheus.write_drift_metrics(
1147
- project=self.project,
1148
- endpoint_id=event_metric[EventFieldType.ENDPOINT_ID],
1149
- metric=event_metric[EventFieldType.METRIC],
1150
- value=event_metric[EventFieldType.VALUE],
1151
- )
1152
- elif event.path == "/monitoring-drift-status":
1153
- # Update drift status
1154
- mlrun.model_monitoring.prometheus.write_drift_status(
1155
- project=self.project,
1156
- endpoint_id=event.body[EventFieldType.ENDPOINT_ID],
1157
- drift_status=event.body[EventFieldType.DRIFT_STATUS],
1158
- )
1159
-
1160
- return event
1161
-
1162
-
1163
- class IncCounter(mlrun.feature_store.steps.MapClass):
1164
- """Increase prediction counter by 1 and update the total latency value"""
1165
-
1166
- def __init__(self, project: str, **kwargs):
1167
- super().__init__(**kwargs)
1168
- self.project: str = project
1169
-
1170
- def do(self, event):
1171
- # Compute prediction per second
1172
-
1173
- mlrun.model_monitoring.prometheus.write_predictions_and_latency_metrics(
1174
- project=self.project,
1175
- endpoint_id=event[EventFieldType.ENDPOINT_ID],
1176
- latency=event[EventFieldType.LATENCY],
1177
- model_name=event[EventFieldType.MODEL],
1178
- endpoint_type=event[EventFieldType.ENDPOINT_TYPE],
1179
- )
1180
-
1181
- return event
1182
-
1183
-
1184
- class RecordFeatures(mlrun.feature_store.steps.MapClass):
1185
- """Record a sample of features and labels in Prometheus registry"""
1186
-
1187
- def __init__(self, project: str, **kwargs):
1188
- super().__init__(**kwargs)
1189
- self.project: str = project
1190
-
1191
- def do(self, event):
1192
- # Generate a dictionary of features and predictions
1193
- features = {
1194
- **event[EventFieldType.NAMED_PREDICTIONS],
1195
- **event[EventFieldType.NAMED_FEATURES],
1196
- }
1197
-
1198
- mlrun.model_monitoring.prometheus.write_income_features(
1199
- project=self.project,
1200
- endpoint_id=event[EventFieldType.ENDPOINT_ID],
1201
- features=features,
1202
- )
1203
-
1204
- return event
1205
-
1206
-
1207
963
  def update_endpoint_record(
1208
964
  project: str,
1209
965
  endpoint_id: str,
1210
966
  attributes: dict,
1211
967
  ):
1212
- model_endpoint_store = mlrun.model_monitoring.get_model_endpoint_store(
968
+ model_endpoint_store = mlrun.model_monitoring.get_store_object(
1213
969
  project=project,
1214
970
  )
1215
971
 
@@ -1218,8 +974,19 @@ def update_endpoint_record(
1218
974
  )
1219
975
 
1220
976
 
1221
- def get_endpoint_record(project: str, endpoint_id: str):
1222
- model_endpoint_store = mlrun.model_monitoring.get_model_endpoint_store(
1223
- project=project,
977
+ def update_monitoring_feature_set(
978
+ endpoint_record: dict[str, typing.Any],
979
+ feature_names: list[str],
980
+ feature_values: list[typing.Any],
981
+ ):
982
+ monitoring_feature_set = fstore.get_feature_set(
983
+ endpoint_record[
984
+ mlrun.common.schemas.model_monitoring.EventFieldType.FEATURE_SET_URI
985
+ ]
1224
986
  )
1225
- return model_endpoint_store.get_model_endpoint(endpoint_id=endpoint_id)
987
+ for name, val in zip(feature_names, feature_values):
988
+ monitoring_feature_set.add_feature(
989
+ fstore.Feature(name=name, value_type=type(val))
990
+ )
991
+
992
+ monitoring_feature_set.save()