mlrun 1.7.0rc4__py3-none-any.whl → 1.7.0rc20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mlrun might be problematic.

Files changed (200)
  1. mlrun/__init__.py +11 -1
  2. mlrun/__main__.py +25 -111
  3. mlrun/{datastore/helpers.py → alerts/__init__.py} +2 -5
  4. mlrun/alerts/alert.py +144 -0
  5. mlrun/api/schemas/__init__.py +4 -3
  6. mlrun/artifacts/__init__.py +8 -3
  7. mlrun/artifacts/base.py +38 -254
  8. mlrun/artifacts/dataset.py +9 -190
  9. mlrun/artifacts/manager.py +41 -47
  10. mlrun/artifacts/model.py +30 -158
  11. mlrun/artifacts/plots.py +23 -380
  12. mlrun/common/constants.py +68 -0
  13. mlrun/common/formatters/__init__.py +19 -0
  14. mlrun/{model_monitoring/stores/models/sqlite.py → common/formatters/artifact.py} +6 -8
  15. mlrun/common/formatters/base.py +78 -0
  16. mlrun/common/formatters/function.py +41 -0
  17. mlrun/common/formatters/pipeline.py +53 -0
  18. mlrun/common/formatters/project.py +51 -0
  19. mlrun/{runtimes → common/runtimes}/constants.py +32 -4
  20. mlrun/common/schemas/__init__.py +25 -4
  21. mlrun/common/schemas/alert.py +203 -0
  22. mlrun/common/schemas/api_gateway.py +148 -0
  23. mlrun/common/schemas/artifact.py +15 -5
  24. mlrun/common/schemas/auth.py +8 -2
  25. mlrun/common/schemas/client_spec.py +2 -0
  26. mlrun/common/schemas/frontend_spec.py +1 -0
  27. mlrun/common/schemas/function.py +4 -0
  28. mlrun/common/schemas/hub.py +7 -9
  29. mlrun/common/schemas/model_monitoring/__init__.py +19 -3
  30. mlrun/common/schemas/model_monitoring/constants.py +96 -26
  31. mlrun/common/schemas/model_monitoring/grafana.py +9 -5
  32. mlrun/common/schemas/model_monitoring/model_endpoints.py +86 -2
  33. mlrun/{runtimes/mpijob/v1alpha1.py → common/schemas/pagination.py} +10 -13
  34. mlrun/common/schemas/pipeline.py +0 -9
  35. mlrun/common/schemas/project.py +22 -21
  36. mlrun/common/types.py +7 -1
  37. mlrun/config.py +87 -19
  38. mlrun/data_types/data_types.py +4 -0
  39. mlrun/data_types/to_pandas.py +9 -9
  40. mlrun/datastore/__init__.py +5 -8
  41. mlrun/datastore/alibaba_oss.py +130 -0
  42. mlrun/datastore/azure_blob.py +4 -5
  43. mlrun/datastore/base.py +69 -30
  44. mlrun/datastore/datastore.py +10 -2
  45. mlrun/datastore/datastore_profile.py +90 -6
  46. mlrun/datastore/google_cloud_storage.py +1 -1
  47. mlrun/datastore/hdfs.py +5 -0
  48. mlrun/datastore/inmem.py +2 -2
  49. mlrun/datastore/redis.py +2 -2
  50. mlrun/datastore/s3.py +5 -0
  51. mlrun/datastore/snowflake_utils.py +43 -0
  52. mlrun/datastore/sources.py +172 -44
  53. mlrun/datastore/store_resources.py +7 -7
  54. mlrun/datastore/targets.py +285 -41
  55. mlrun/datastore/utils.py +68 -5
  56. mlrun/datastore/v3io.py +27 -50
  57. mlrun/db/auth_utils.py +152 -0
  58. mlrun/db/base.py +149 -14
  59. mlrun/db/factory.py +1 -1
  60. mlrun/db/httpdb.py +608 -178
  61. mlrun/db/nopdb.py +191 -7
  62. mlrun/errors.py +11 -0
  63. mlrun/execution.py +37 -20
  64. mlrun/feature_store/__init__.py +0 -2
  65. mlrun/feature_store/api.py +21 -52
  66. mlrun/feature_store/feature_set.py +48 -23
  67. mlrun/feature_store/feature_vector.py +2 -1
  68. mlrun/feature_store/ingestion.py +7 -6
  69. mlrun/feature_store/retrieval/base.py +9 -4
  70. mlrun/feature_store/retrieval/conversion.py +9 -9
  71. mlrun/feature_store/retrieval/dask_merger.py +2 -0
  72. mlrun/feature_store/retrieval/job.py +9 -3
  73. mlrun/feature_store/retrieval/local_merger.py +2 -0
  74. mlrun/feature_store/retrieval/spark_merger.py +34 -24
  75. mlrun/feature_store/steps.py +30 -19
  76. mlrun/features.py +4 -13
  77. mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +7 -12
  78. mlrun/frameworks/auto_mlrun/auto_mlrun.py +2 -2
  79. mlrun/frameworks/lgbm/__init__.py +1 -1
  80. mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
  81. mlrun/frameworks/lgbm/model_handler.py +1 -1
  82. mlrun/frameworks/parallel_coordinates.py +2 -1
  83. mlrun/frameworks/pytorch/__init__.py +2 -2
  84. mlrun/frameworks/sklearn/__init__.py +1 -1
  85. mlrun/frameworks/tf_keras/__init__.py +5 -2
  86. mlrun/frameworks/tf_keras/callbacks/logging_callback.py +1 -1
  87. mlrun/frameworks/tf_keras/mlrun_interface.py +2 -2
  88. mlrun/frameworks/xgboost/__init__.py +1 -1
  89. mlrun/k8s_utils.py +10 -11
  90. mlrun/launcher/__init__.py +1 -1
  91. mlrun/launcher/base.py +6 -5
  92. mlrun/launcher/client.py +8 -6
  93. mlrun/launcher/factory.py +1 -1
  94. mlrun/launcher/local.py +9 -3
  95. mlrun/launcher/remote.py +9 -3
  96. mlrun/lists.py +6 -2
  97. mlrun/model.py +58 -19
  98. mlrun/model_monitoring/__init__.py +1 -1
  99. mlrun/model_monitoring/api.py +127 -301
  100. mlrun/model_monitoring/application.py +5 -296
  101. mlrun/model_monitoring/applications/__init__.py +11 -0
  102. mlrun/model_monitoring/applications/_application_steps.py +157 -0
  103. mlrun/model_monitoring/applications/base.py +282 -0
  104. mlrun/model_monitoring/applications/context.py +214 -0
  105. mlrun/model_monitoring/applications/evidently_base.py +211 -0
  106. mlrun/model_monitoring/applications/histogram_data_drift.py +224 -93
  107. mlrun/model_monitoring/applications/results.py +99 -0
  108. mlrun/model_monitoring/controller.py +30 -36
  109. mlrun/model_monitoring/db/__init__.py +18 -0
  110. mlrun/model_monitoring/{stores → db/stores}/__init__.py +43 -36
  111. mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
  112. mlrun/model_monitoring/{stores/model_endpoint_store.py → db/stores/base/store.py} +58 -32
  113. mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
  114. mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +71 -0
  115. mlrun/model_monitoring/{stores → db/stores/sqldb}/models/base.py +109 -5
  116. mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +88 -0
  117. mlrun/model_monitoring/{stores/models/mysql.py → db/stores/sqldb/models/sqlite.py} +19 -13
  118. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +684 -0
  119. mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
  120. mlrun/model_monitoring/{stores/kv_model_endpoint_store.py → db/stores/v3io_kv/kv_store.py} +302 -155
  121. mlrun/model_monitoring/db/tsdb/__init__.py +100 -0
  122. mlrun/model_monitoring/db/tsdb/base.py +329 -0
  123. mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
  124. mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
  125. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +240 -0
  126. mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +45 -0
  127. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +397 -0
  128. mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
  129. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +117 -0
  130. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +630 -0
  131. mlrun/model_monitoring/evidently_application.py +6 -118
  132. mlrun/model_monitoring/features_drift_table.py +34 -22
  133. mlrun/model_monitoring/helpers.py +100 -7
  134. mlrun/model_monitoring/model_endpoint.py +3 -2
  135. mlrun/model_monitoring/stream_processing.py +93 -228
  136. mlrun/model_monitoring/tracking_policy.py +7 -1
  137. mlrun/model_monitoring/writer.py +152 -124
  138. mlrun/package/packagers_manager.py +1 -0
  139. mlrun/package/utils/_formatter.py +2 -2
  140. mlrun/platforms/__init__.py +11 -10
  141. mlrun/platforms/iguazio.py +21 -202
  142. mlrun/projects/operations.py +30 -16
  143. mlrun/projects/pipelines.py +92 -99
  144. mlrun/projects/project.py +757 -268
  145. mlrun/render.py +15 -14
  146. mlrun/run.py +160 -162
  147. mlrun/runtimes/__init__.py +55 -3
  148. mlrun/runtimes/base.py +33 -19
  149. mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
  150. mlrun/runtimes/funcdoc.py +0 -28
  151. mlrun/runtimes/kubejob.py +28 -122
  152. mlrun/runtimes/local.py +5 -2
  153. mlrun/runtimes/mpijob/__init__.py +0 -20
  154. mlrun/runtimes/mpijob/abstract.py +8 -8
  155. mlrun/runtimes/mpijob/v1.py +1 -1
  156. mlrun/runtimes/nuclio/__init__.py +1 -0
  157. mlrun/runtimes/nuclio/api_gateway.py +709 -0
  158. mlrun/runtimes/nuclio/application/__init__.py +15 -0
  159. mlrun/runtimes/nuclio/application/application.py +523 -0
  160. mlrun/runtimes/nuclio/application/reverse_proxy.go +95 -0
  161. mlrun/runtimes/nuclio/function.py +98 -58
  162. mlrun/runtimes/nuclio/serving.py +36 -42
  163. mlrun/runtimes/pod.py +196 -45
  164. mlrun/runtimes/remotesparkjob.py +1 -1
  165. mlrun/runtimes/sparkjob/spark3job.py +1 -1
  166. mlrun/runtimes/utils.py +6 -73
  167. mlrun/secrets.py +6 -2
  168. mlrun/serving/remote.py +2 -3
  169. mlrun/serving/routers.py +7 -4
  170. mlrun/serving/server.py +7 -8
  171. mlrun/serving/states.py +73 -43
  172. mlrun/serving/v2_serving.py +8 -7
  173. mlrun/track/tracker.py +2 -1
  174. mlrun/utils/async_http.py +25 -5
  175. mlrun/utils/helpers.py +141 -75
  176. mlrun/utils/http.py +1 -1
  177. mlrun/utils/logger.py +39 -7
  178. mlrun/utils/notifications/notification/__init__.py +14 -9
  179. mlrun/utils/notifications/notification/base.py +12 -0
  180. mlrun/utils/notifications/notification/console.py +2 -0
  181. mlrun/utils/notifications/notification/git.py +3 -1
  182. mlrun/utils/notifications/notification/ipython.py +2 -0
  183. mlrun/utils/notifications/notification/slack.py +101 -21
  184. mlrun/utils/notifications/notification/webhook.py +11 -1
  185. mlrun/utils/notifications/notification_pusher.py +147 -16
  186. mlrun/utils/retryer.py +3 -2
  187. mlrun/utils/v3io_clients.py +0 -1
  188. mlrun/utils/version/version.json +2 -2
  189. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.0rc20.dist-info}/METADATA +33 -18
  190. mlrun-1.7.0rc20.dist-info/RECORD +353 -0
  191. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.0rc20.dist-info}/WHEEL +1 -1
  192. mlrun/kfpops.py +0 -868
  193. mlrun/model_monitoring/batch.py +0 -974
  194. mlrun/model_monitoring/stores/models/__init__.py +0 -27
  195. mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -382
  196. mlrun/platforms/other.py +0 -305
  197. mlrun-1.7.0rc4.dist-info/RECORD +0 -321
  198. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.0rc20.dist-info}/LICENSE +0 -0
  199. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.0rc20.dist-info}/entry_points.txt +0 -0
  200. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.0rc20.dist-info}/top_level.txt +0 -0
mlrun/model_monitoring/stream_processing.py

@@ -24,11 +24,12 @@ import mlrun
 import mlrun.common.model_monitoring.helpers
 import mlrun.config
 import mlrun.datastore.targets
+import mlrun.feature_store as fstore
 import mlrun.feature_store.steps
+import mlrun.model_monitoring.db
 import mlrun.model_monitoring.prometheus
 import mlrun.serving.states
 import mlrun.utils
-import mlrun.utils.v3io_clients
 from mlrun.common.schemas.model_monitoring.constants import (
     EventFieldType,
     EventKeyMetrics,
@@ -36,6 +37,7 @@ from mlrun.common.schemas.model_monitoring.constants import (
     FileTargetKind,
     ModelEndpointTarget,
     ProjectSecretKeys,
+    PrometheusEndpoints,
 )
 from mlrun.utils import logger
 
@@ -75,6 +77,7 @@ class EventStreamProcessor:
         )
 
         self.storage_options = None
+        self.tsdb_configurations = {}
         if not mlrun.mlconf.is_ce_mode():
             self._initialize_v3io_configurations(
                 model_monitoring_access_key=model_monitoring_access_key
@@ -133,33 +136,38 @@ class EventStreamProcessor:
         self.tsdb_batching_max_events = tsdb_batching_max_events
         self.tsdb_batching_timeout_secs = tsdb_batching_timeout_secs
 
-    def apply_monitoring_serving_graph(self, fn: mlrun.runtimes.ServingRuntime) -> None:
+    def apply_monitoring_serving_graph(
+        self,
+        fn: mlrun.runtimes.ServingRuntime,
+        tsdb_service_provider: typing.Optional[typing.Callable] = None,
+    ) -> None:
         """
-        Apply monitoring serving graph to a given serving function. The following serving graph includes about 20 steps
-        of different operations that are executed on the events from the model server. Each event has
-        metadata (function_uri, timestamp, class, etc.) but also inputs and predictions from the model server.
-        Throughout the serving graph, the results are written to 3 different databases:
-        1. KV/SQL (steps 9-11): Stores metadata and stats about the average latency and the amount of predictions over
-        time per endpoint. for example the amount of predictions of endpoint x in the last 5 min. This data is used
-        by the monitoring dashboards in grafana. The model endpoints table also contains data on the model endpoint
-        from other processes, such as current_stats that is being calculated by the monitoring batch job
-        process. If the target is from type KV, then the model endpoints table can be found under
-        v3io:///users/pipelines/project-name/model-endpoints/endpoints/. If the target is SQL, then the table
-        is stored within the database that was defined in the provided connection string and can be found
-        under mlrun.mlconf.model_endpoint_monitoring.endpoint_store_connection.
-        2. V3IO TSDB/Prometheus (steps 13-21): Stores live data of different key metric dictionaries in tsdb target.
-        This data is being used by the monitoring dashboards in grafana. If using V3IO TSDB (steps 13-19), results
+        Apply monitoring serving graph to a given serving function. The following serving graph includes about 4 main
+        parts that each one them includes several steps of different operations that are executed on the events from
+        the model server.
+        Each event has metadata (function_uri, timestamp, class, etc.) but also inputs, predictions and optional
+        metrics from the model server.
+        In ths first part, the serving graph processes the event and splits it into sub-events. This part also includes
+        validation of the event data and adding important details to the event such as endpoint_id.
+        In the next parts, the serving graph stores data to 3 different targets:
+        1. KV/SQL: Metadata and basic stats about the average latency and the amount of predictions over
+        time per endpoint. for example the amount of predictions of endpoint x in the last 5 min. The model
+        endpoints table also contains data on the model endpoint from other processes, such as feature_stats that
+        represents sample statistics from the training data. If the target is from type KV, then the model endpoints
+        table can be found under v3io:///users/pipelines/project-name/model-endpoints/endpoints/. If the target is
+        SQL, then the table is stored within the database that was defined in the provided connection string.
+        2. TSDB: live data of different key metric dictionaries in tsdb target.
+        This data is being used by the monitoring dashboards in grafana. If using V3IO TSDB, results
         can be found under v3io:///users/pipelines/project-name/model-endpoints/events/. In that case, we generate
         3 different key metric dictionaries: base_metrics (average latency and predictions over time),
         endpoint_features (Prediction and feature names and values), and custom_metrics (user-defined metrics).
-        If using Prometheus (steps 20-21), we update metrics in the Prometheus registry that is stored in the
-        monitoring stream local memory.
-        3. Parquet (steps 22-23): This Parquet file includes the required data for the model monitoring batch job
-        that run every hour by default. If defined, the parquet target path can be found under
-        mlrun.mlconf.model_endpoint_monitoring.offline. Otherwise, the default parquet path is under
-        mlrun.mlconf.model_endpoint_monitoring.user_space.
+        3. Parquet: This Parquet file includes the required data for the model monitoring applications. If defined,
+        the parquet target path can be found under mlrun.mlconf.model_endpoint_monitoring.offline. Otherwise,
+        the default parquet path is under mlrun.mlconf.model_endpoint_monitoring.user_space. Note that if you are
+        using CE, the parquet target path is based on the defined MLRun artifact path.
 
         :param fn: A serving function.
+        :param tsdb_service_provider: An optional callable function that provides the TSDB connection string.
         """
 
         graph = typing.cast(
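A minimal wiring sketch for the new signature follows; the EventStreamProcessor constructor arguments and the provider callable's contract are assumptions made for illustration and are not shown in this diff.

    # Sketch only: constructor arguments and the provider contract are assumed.
    import mlrun
    from mlrun.model_monitoring.stream_processing import EventStreamProcessor

    serving_fn = mlrun.new_function("monitoring-stream", kind="serving")
    processor = EventStreamProcessor(project="my-project")  # assumed signature

    def tsdb_service_provider(secret_name: str) -> str:
        # Assumed contract: resolve the TSDB connection string from a secret or env var.
        return mlrun.get_secret_or_env(secret_name)

    processor.apply_monitoring_serving_graph(
        serving_fn, tsdb_service_provider=tsdb_service_provider
    )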
@@ -167,7 +175,7 @@ class EventStreamProcessor:
             fn.set_topology(mlrun.serving.states.StepKinds.flow),
         )
 
-        # Step 1 - Event routing based on the provided path
+        # Event routing based on the provided path
         def apply_event_routing():
             typing.cast(
                 mlrun.serving.TaskStep,
@@ -180,20 +188,20 @@ class EventStreamProcessor:
 
         apply_event_routing()
 
-        # Step 2 - Filter out events with '-' in the path basename from going forward
+        # Filter out events with '-' in the path basename from going forward
         # through the next steps of the stream graph
         def apply_storey_filter_stream_events():
-            # Remove none values from each event
+            # Filter events with Prometheus endpoints path
            graph.add_step(
                 "storey.Filter",
                 "filter_stream_event",
-                _fn="('-' not in event.path.split('/')[-1])",
+                _fn=f"(event.path not in {PrometheusEndpoints.list()})",
                 full_event=True,
             )
 
         apply_storey_filter_stream_events()
 
-        # Step 3 - Process endpoint event: splitting into sub-events and validate event data
+        # Process endpoint event: splitting into sub-events and validate event data
         def apply_process_endpoint_event():
             graph.add_step(
                 "ProcessEndpointEvent",
@@ -204,7 +212,7 @@ class EventStreamProcessor:
             )
 
         apply_process_endpoint_event()
 
-        # Steps 4,5 - Applying Storey operations of filtering and flatten
+        # Applying Storey operations of filtering and flatten
         def apply_storey_filter_and_flatmap():
             # Remove none values from each event
             graph.add_step(
@@ -221,7 +229,7 @@ class EventStreamProcessor:
 
         apply_storey_filter_and_flatmap()
 
-        # Step 6 - Validating feature names and map each feature to its value
+        # Validating feature names and map each feature to its value
         def apply_map_feature_names():
             graph.add_step(
                 "MapFeatureNames",
@@ -233,9 +241,9 @@ class EventStreamProcessor:
             )
 
         apply_map_feature_names()
 
-        # Step 7 - Calculate number of predictions and average latency
+        # Calculate number of predictions and average latency
         def apply_storey_aggregations():
-            # Step 7.1 - Calculate number of predictions for each window (5 min and 1 hour by default)
+            # Calculate number of predictions for each window (5 min and 1 hour by default)
             graph.add_step(
                 class_name="storey.AggregateByKey",
                 aggregates=[
@@ -253,7 +261,7 @@ class EventStreamProcessor:
                 table=".",
                 key_field=EventFieldType.ENDPOINT_ID,
             )
-            # Step 7.2 - Calculate average latency time for each window (5 min and 1 hour by default)
+            # Calculate average latency time for each window (5 min and 1 hour by default)
             graph.add_step(
                 class_name="storey.Rename",
                 mapping={
@@ -266,8 +274,8 @@ class EventStreamProcessor:
 
         apply_storey_aggregations()
 
-        # Steps 8-10 - KV/SQL branch
-        # Step 8 - Filter relevant keys from the event before writing the data into the database table
+        # KV/SQL branch
+        # Filter relevant keys from the event before writing the data into the database table
         def apply_process_before_endpoint_update():
             graph.add_step(
                 "ProcessBeforeEndpointUpdate",
@@ -277,7 +285,7 @@ class EventStreamProcessor:
             )
 
         apply_process_before_endpoint_update()
 
-        # Step 9 - Write the filtered event to KV/SQL table. At this point, the serving graph updates the stats
+        # Write the filtered event to KV/SQL table. At this point, the serving graph updates the stats
         # about average latency and the amount of predictions over time
         def apply_update_endpoint():
             graph.add_step(
@@ -290,7 +298,7 @@ class EventStreamProcessor:
             )
 
         apply_update_endpoint()
 
-        # Step 10 (only for KV target) - Apply infer_schema on the model endpoints table for generating schema file
+        # (only for V3IO KV target) - Apply infer_schema on the model endpoints table for generating schema file
         # which will be used by Grafana monitoring dashboards
         def apply_infer_schema():
             graph.add_step(
@@ -305,7 +313,7 @@ class EventStreamProcessor:
         if self.model_endpoint_store_target == ModelEndpointTarget.V3IO_NOSQL:
             apply_infer_schema()
 
-        # Step 11 - Emits the event in window size of events based on sample_window size (10 by default)
+        # Emits the event in window size of events based on sample_window size (10 by default)
         def apply_storey_sample_window():
             graph.add_step(
                 "storey.steps.SampleWindow",
@@ -317,85 +325,16 @@ class EventStreamProcessor:
 
         apply_storey_sample_window()
 
-        # Steps 12-19 - TSDB branch (skip to Prometheus if in CE env)
-        # Steps 20-21 - Prometheus branch
+        # TSDB branch (skip to Prometheus if in CE env)
         if not mlrun.mlconf.is_ce_mode():
-            # TSDB branch
-
-            # Step 12 - Before writing data to TSDB, create dictionary of 2-3 dictionaries that contains
-            # stats and details about the events
-            def apply_process_before_tsdb():
-                graph.add_step(
-                    "ProcessBeforeTSDB", name="ProcessBeforeTSDB", after="sample"
-                )
-
-            apply_process_before_tsdb()
-
-            # Steps 13-19: - Unpacked keys from each dictionary and write to TSDB target
-            def apply_filter_and_unpacked_keys(name, keys):
-                graph.add_step(
-                    "FilterAndUnpackKeys",
-                    name=name,
-                    after="ProcessBeforeTSDB",
-                    keys=[keys],
-                )
-
-            def apply_tsdb_target(name, after):
-                graph.add_step(
-                    "storey.TSDBTarget",
-                    name=name,
-                    after=after,
-                    path=self.tsdb_path,
-                    rate="10/m",
-                    time_col=EventFieldType.TIMESTAMP,
-                    container=self.tsdb_container,
-                    access_key=self.v3io_access_key,
-                    v3io_frames=self.v3io_framesd,
-                    infer_columns_from_data=True,
-                    index_cols=[
-                        EventFieldType.ENDPOINT_ID,
-                        EventFieldType.RECORD_TYPE,
-                        EventFieldType.ENDPOINT_TYPE,
-                    ],
-                    max_events=self.tsdb_batching_max_events,
-                    flush_after_seconds=self.tsdb_batching_timeout_secs,
-                    key=EventFieldType.ENDPOINT_ID,
-                )
-
-            # Steps 13-14 - unpacked base_metrics dictionary
-            apply_filter_and_unpacked_keys(
-                name="FilterAndUnpackKeys1",
-                keys=EventKeyMetrics.BASE_METRICS,
-            )
-            apply_tsdb_target(name="tsdb1", after="FilterAndUnpackKeys1")
-
-            # Steps 15-16 - unpacked endpoint_features dictionary
-            apply_filter_and_unpacked_keys(
-                name="FilterAndUnpackKeys2",
-                keys=EventKeyMetrics.ENDPOINT_FEATURES,
-            )
-            apply_tsdb_target(name="tsdb2", after="FilterAndUnpackKeys2")
-
-            # Steps 17-19 - unpacked custom_metrics dictionary. In addition, use storey.Filter remove none values
-            apply_filter_and_unpacked_keys(
-                name="FilterAndUnpackKeys3",
-                keys=EventKeyMetrics.CUSTOM_METRICS,
+            tsdb_connector = mlrun.model_monitoring.get_tsdb_connector(
+                project=self.project, secret_provider=tsdb_service_provider
             )
+            tsdb_connector.apply_monitoring_stream_steps(graph=graph)
 
-            def apply_storey_filter():
-                graph.add_step(
-                    "storey.Filter",
-                    "FilterNotNone",
-                    after="FilterAndUnpackKeys3",
-                    _fn="(event is not None)",
-                )
-
-            apply_storey_filter()
-            apply_tsdb_target(name="tsdb3", after="FilterNotNone")
         else:
-            # Prometheus branch
-
-            # Step 20 - Increase the prediction counter by 1 and update the latency value
+            # Prometheus
+            # Increase the prediction counter by 1 and update the latency value
             graph.add_step(
                 "IncCounter",
                 name="IncCounter",
@@ -403,7 +342,7 @@ class EventStreamProcessor:
                 project=self.project,
             )
 
-            # Step 21 - Record a sample of features and labels
+            # Record a sample of features and labels
             def apply_record_features_to_prometheus():
                 graph.add_step(
                     "RecordFeatures",
@@ -414,8 +353,8 @@ class EventStreamProcessor:
 
             apply_record_features_to_prometheus()
 
-        # Steps 22-23 - Parquet branch
-        # Step 22 - Filter and validate different keys before writing the data to Parquet target
+        # Parquet branch
+        # Filter and validate different keys before writing the data to Parquet target
         def apply_process_before_parquet():
             graph.add_step(
                 "ProcessBeforeParquet",
@@ -426,7 +365,7 @@ class EventStreamProcessor:
             )
         apply_process_before_parquet()
 
-        # Step 23 - Write the Parquet target file, partitioned by key (endpoint_id) and time.
+        # Write the Parquet target file, partitioned by key (endpoint_id) and time.
        def apply_parquet_target():
            graph.add_step(
                "storey.ParquetTarget",
@@ -500,76 +439,6 @@ class ProcessBeforeEndpointUpdate(mlrun.feature_store.steps.MapClass):
         return e
 
 
-class ProcessBeforeTSDB(mlrun.feature_store.steps.MapClass):
-    def __init__(self, **kwargs):
-        """
-        Process the data before writing to TSDB. This step creates a dictionary that includes 3 different dictionaries
-        that each one of them contains important details and stats about the events:
-        1. base_metrics: stats about the average latency and the amount of predictions over time. It is based on
-        storey.AggregateByKey which was executed in step 5.
-        2. endpoint_features: feature names and values along with the prediction names and value.
-        3. custom_metric (opt): optional metrics provided by the user.
-
-        :returns: Dictionary of 2-3 dictionaries that contains stats and details about the events.
-
-        """
-        super().__init__(**kwargs)
-
-    def do(self, event):
-        # Compute prediction per second
-        event[EventLiveStats.PREDICTIONS_PER_SECOND] = (
-            float(event[EventLiveStats.PREDICTIONS_COUNT_5M]) / 300
-        )
-        base_fields = [
-            EventFieldType.TIMESTAMP,
-            EventFieldType.ENDPOINT_ID,
-            EventFieldType.ENDPOINT_TYPE,
-        ]
-
-        # Getting event timestamp and endpoint_id
-        base_event = {k: event[k] for k in base_fields}
-
-        # base_metrics includes the stats about the average latency and the amount of predictions over time
-        base_metrics = {
-            EventFieldType.RECORD_TYPE: EventKeyMetrics.BASE_METRICS,
-            EventLiveStats.PREDICTIONS_PER_SECOND: event[
-                EventLiveStats.PREDICTIONS_PER_SECOND
-            ],
-            EventLiveStats.PREDICTIONS_COUNT_5M: event[
-                EventLiveStats.PREDICTIONS_COUNT_5M
-            ],
-            EventLiveStats.PREDICTIONS_COUNT_1H: event[
-                EventLiveStats.PREDICTIONS_COUNT_1H
-            ],
-            EventLiveStats.LATENCY_AVG_5M: event[EventLiveStats.LATENCY_AVG_5M],
-            EventLiveStats.LATENCY_AVG_1H: event[EventLiveStats.LATENCY_AVG_1H],
-            **base_event,
-        }
-
-        # endpoint_features includes the event values of each feature and prediction
-        endpoint_features = {
-            EventFieldType.RECORD_TYPE: EventKeyMetrics.ENDPOINT_FEATURES,
-            **event[EventFieldType.NAMED_PREDICTIONS],
-            **event[EventFieldType.NAMED_FEATURES],
-            **base_event,
-        }
-        # Create a dictionary that includes both base_metrics and endpoint_features
-        processed = {
-            EventKeyMetrics.BASE_METRICS: base_metrics,
-            EventKeyMetrics.ENDPOINT_FEATURES: endpoint_features,
-        }
-
-        # If metrics provided, add another dictionary if custom_metrics values
-        if event[EventFieldType.METRICS]:
-            processed[EventKeyMetrics.CUSTOM_METRICS] = {
-                EventFieldType.RECORD_TYPE: EventKeyMetrics.CUSTOM_METRICS,
-                **event[EventFieldType.METRICS],
-                **base_event,
-            }
-
-        return processed
-
-
 class ProcessBeforeParquet(mlrun.feature_store.steps.MapClass):
     def __init__(self, **kwargs):
         """
@@ -587,6 +456,8 @@ class ProcessBeforeParquet(mlrun.feature_store.steps.MapClass):
         for key in [
             EventFieldType.FEATURES,
             EventFieldType.NAMED_FEATURES,
+            EventFieldType.PREDICTION,
+            EventFieldType.NAMED_PREDICTIONS,
         ]:
             event.pop(key, None)
 
@@ -802,7 +673,7 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
         # left them
         if endpoint_id not in self.endpoints:
             logger.info("Trying to resume state", endpoint_id=endpoint_id)
-            endpoint_record = get_endpoint_record(
+            endpoint_record = mlrun.model_monitoring.helpers.get_endpoint_record(
                 project=self.project,
                 endpoint_id=endpoint_id,
             )
@@ -848,36 +719,6 @@ def is_not_none(field: typing.Any, dict_path: list[str]):
     return False
 
 
-class FilterAndUnpackKeys(mlrun.feature_store.steps.MapClass):
-    def __init__(self, keys, **kwargs):
-        """
-        Create unpacked event dictionary based on provided key metrics (base_metrics, endpoint_features,
-        or custom_metric). Please note that the next step of the TSDB target requires an unpacked dictionary.
-
-        :param keys: list of key metrics.
-
-        :returns: An unpacked dictionary of event filtered by the provided key metrics.
-        """
-        super().__init__(**kwargs)
-        self.keys = keys
-
-    def do(self, event):
-        # Keep only the relevant dictionary based on the provided keys
-        new_event = {}
-        for key in self.keys:
-            if key in event:
-                new_event[key] = event[key]
-
-        # Create unpacked dictionary
-        unpacked = {}
-        for key in new_event.keys():
-            if key in self.keys:
-                unpacked = {**unpacked, **new_event[key]}
-            else:
-                unpacked[key] = new_event[key]
-        return unpacked if unpacked else None
-
-
 class MapFeatureNames(mlrun.feature_store.steps.MapClass):
     def __init__(
         self,
@@ -931,9 +772,11 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
     def do(self, event: dict):
         endpoint_id = event[EventFieldType.ENDPOINT_ID]
 
+        feature_values = event[EventFieldType.FEATURES]
+        label_values = event[EventFieldType.PREDICTION]
         # Get feature names and label columns
         if endpoint_id not in self.feature_names:
-            endpoint_record = get_endpoint_record(
+            endpoint_record = mlrun.model_monitoring.helpers.get_endpoint_record(
                 project=self.project,
                 endpoint_id=endpoint_id,
             )
@@ -966,6 +809,12 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
                 },
             )
 
+            update_monitoring_feature_set(
+                endpoint_record=endpoint_record,
+                feature_names=feature_names,
+                feature_values=feature_values,
+            )
+
             # Similar process with label columns
             if not label_columns and self._infer_columns_from_data:
                 label_columns = self._infer_label_columns_from_data(event)
@@ -984,6 +833,11 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
                     endpoint_id=endpoint_id,
                     attributes={EventFieldType.LABEL_NAMES: json.dumps(label_columns)},
                 )
+                update_monitoring_feature_set(
+                    endpoint_record=endpoint_record,
+                    feature_names=label_columns,
+                    feature_values=label_values,
+                )
 
             self.label_columns[endpoint_id] = label_columns
             self.feature_names[endpoint_id] = feature_names
@@ -1001,7 +855,6 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
 
         # Add feature_name:value pairs along with a mapping dictionary of all of these pairs
         feature_names = self.feature_names[endpoint_id]
-        feature_values = event[EventFieldType.FEATURES]
         self._map_dictionary_values(
             event=event,
             named_iters=feature_names,
@@ -1011,7 +864,6 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
 
         # Add label_name:value pairs along with a mapping dictionary of all of these pairs
         label_names = self.label_columns[endpoint_id]
-        label_values = event[EventFieldType.PREDICTION]
         self._map_dictionary_values(
             event=event,
             named_iters=label_names,
@@ -1102,6 +954,8 @@ class InferSchema(mlrun.feature_store.steps.MapClass):
     def do(self, event: dict):
         key_set = set(event.keys())
         if not key_set.issubset(self.keys):
+            import mlrun.utils.v3io_clients
+
             self.keys.update(key_set)
             # Apply infer_schema on the kv table for generating the schema file
             mlrun.utils.v3io_clients.get_frames_client(
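The hoisted feature_values and label_values reads feed the same name-to-value mapping as before; conceptually the pairing amounts to zipping stored column names with the event's raw value lists (the private _map_dictionary_values helper is not shown in this diff, so this is only an illustration):

    # Conceptual illustration; the real helper may handle length mismatches differently.
    feature_names = ["sepal_length", "sepal_width"]
    feature_values = [5.1, 3.5]
    named_features = dict(zip(feature_names, feature_values))
    # -> {"sepal_length": 5.1, "sepal_width": 3.5}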
@@ -1137,10 +991,10 @@ class EventRouting(mlrun.feature_store.steps.MapClass):
         self.project: str = project
 
     def do(self, event):
-        if event.path == "/model-monitoring-metrics":
+        if event.path == PrometheusEndpoints.MODEL_MONITORING_METRICS:
             # Return a parsed Prometheus registry file
             event.body = mlrun.model_monitoring.prometheus.get_registry()
-        elif event.path == "/monitoring-batch-metrics":
+        elif event.path == PrometheusEndpoints.MONITORING_BATCH_METRICS:
             # Update statistical metrics
             for event_metric in event.body:
                 mlrun.model_monitoring.prometheus.write_drift_metrics(
@@ -1149,7 +1003,7 @@ class EventRouting(mlrun.feature_store.steps.MapClass):
                     metric=event_metric[EventFieldType.METRIC],
                     value=event_metric[EventFieldType.VALUE],
                 )
-        elif event.path == "/monitoring-drift-status":
+        elif event.path == PrometheusEndpoints.MONITORING_DRIFT_STATUS:
             # Update drift status
             mlrun.model_monitoring.prometheus.write_drift_status(
                 project=self.project,
@@ -1209,7 +1063,7 @@ def update_endpoint_record(
     endpoint_id: str,
     attributes: dict,
 ):
-    model_endpoint_store = mlrun.model_monitoring.get_model_endpoint_store(
+    model_endpoint_store = mlrun.model_monitoring.get_store_object(
         project=project,
     )
 
@@ -1218,8 +1072,19 @@ def update_endpoint_record(
     )
 
 
-def get_endpoint_record(project: str, endpoint_id: str):
-    model_endpoint_store = mlrun.model_monitoring.get_model_endpoint_store(
-        project=project,
+def update_monitoring_feature_set(
+    endpoint_record: dict[str, typing.Any],
+    feature_names: list[str],
+    feature_values: list[typing.Any],
+):
+    monitoring_feature_set = fstore.get_feature_set(
+        endpoint_record[
+            mlrun.common.schemas.model_monitoring.EventFieldType.FEATURE_SET_URI
+        ]
     )
-    return model_endpoint_store.get_model_endpoint(endpoint_id=endpoint_id)
+    for name, val in zip(feature_names, feature_values):
+        monitoring_feature_set.add_feature(
+            fstore.Feature(name=name, value_type=type(val))
+        )
+
+    monitoring_feature_set.save()
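The new helper uses the public feature-store API shown above (get_feature_set, Feature, add_feature, save). A hedged call sketch, with a placeholder feature-set URI standing in for the value a real endpoint record would carry, and with the module path assumed from the file list above:

    # Sketch: the URI below is a placeholder, not a value taken from this diff.
    from mlrun.common.schemas.model_monitoring import EventFieldType
    from mlrun.model_monitoring.stream_processing import update_monitoring_feature_set

    endpoint_record = {
        EventFieldType.FEATURE_SET_URI: "store://feature-sets/my-project/monitoring-fs",
    }
    update_monitoring_feature_set(
        endpoint_record=endpoint_record,
        feature_names=["age", "is_member"],
        feature_values=[42, True],
    )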
mlrun/model_monitoring/tracking_policy.py

@@ -11,8 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-#
 
+import warnings
 from typing import Union
 
 import mlrun.common.schemas.schedule
@@ -55,6 +55,12 @@ class TrackingPolicy(mlrun.model.ModelObj):
         writer function, which is a real time nuclio functino, will be deployed
         with the same image. By default, the image is mlrun/mlrun.
         """
+        warnings.warn(
+            "The `TrackingPolicy` class is deprecated from version 1.7.0 and is not "
+            "used anymore. It will be removed in 1.9.0.",
+            FutureWarning,
+        )
+
         if isinstance(default_batch_intervals, str):
             default_batch_intervals = (
                 mlrun.common.schemas.schedule.ScheduleCronTrigger.from_crontab(
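With this change, merely constructing a TrackingPolicy raises a FutureWarning. Code that still instantiates it during a migration period could surface or silence the warning explicitly; a small sketch, assuming the constructor defaults remain usable:

    import warnings

    from mlrun.model_monitoring.tracking_policy import TrackingPolicy

    with warnings.catch_warnings():
        # Silence only the deprecation notice added above; remove once callers migrate.
        warnings.simplefilter("ignore", FutureWarning)
        policy = TrackingPolicy()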