mlrun 1.3.2rc1__py3-none-any.whl → 1.3.2rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of mlrun might be problematic.

Files changed (93)
  1. mlrun/api/api/deps.py +14 -1
  2. mlrun/api/api/endpoints/frontend_spec.py +0 -2
  3. mlrun/api/api/endpoints/functions.py +15 -27
  4. mlrun/api/api/endpoints/grafana_proxy.py +435 -74
  5. mlrun/api/api/endpoints/healthz.py +5 -18
  6. mlrun/api/api/endpoints/model_endpoints.py +33 -37
  7. mlrun/api/api/utils.py +6 -13
  8. mlrun/api/crud/__init__.py +14 -16
  9. mlrun/api/crud/logs.py +5 -7
  10. mlrun/api/crud/model_monitoring/__init__.py +2 -2
  11. mlrun/api/crud/model_monitoring/model_endpoint_store.py +847 -0
  12. mlrun/api/crud/model_monitoring/model_endpoints.py +105 -328
  13. mlrun/api/crud/pipelines.py +2 -3
  14. mlrun/api/db/sqldb/models/models_mysql.py +52 -19
  15. mlrun/api/db/sqldb/models/models_sqlite.py +52 -19
  16. mlrun/api/db/sqldb/session.py +19 -26
  17. mlrun/api/schemas/__init__.py +2 -0
  18. mlrun/api/schemas/constants.py +0 -13
  19. mlrun/api/schemas/frontend_spec.py +0 -1
  20. mlrun/api/schemas/model_endpoints.py +38 -195
  21. mlrun/api/schemas/schedule.py +2 -2
  22. mlrun/api/utils/clients/log_collector.py +5 -0
  23. mlrun/builder.py +9 -41
  24. mlrun/config.py +1 -76
  25. mlrun/data_types/__init__.py +1 -6
  26. mlrun/data_types/data_types.py +1 -3
  27. mlrun/datastore/__init__.py +2 -9
  28. mlrun/datastore/sources.py +20 -25
  29. mlrun/datastore/store_resources.py +1 -1
  30. mlrun/datastore/targets.py +34 -67
  31. mlrun/datastore/utils.py +4 -26
  32. mlrun/db/base.py +2 -4
  33. mlrun/db/filedb.py +5 -13
  34. mlrun/db/httpdb.py +32 -64
  35. mlrun/db/sqldb.py +2 -4
  36. mlrun/errors.py +0 -5
  37. mlrun/execution.py +0 -2
  38. mlrun/feature_store/api.py +8 -24
  39. mlrun/feature_store/feature_set.py +6 -28
  40. mlrun/feature_store/feature_vector.py +0 -2
  41. mlrun/feature_store/ingestion.py +11 -8
  42. mlrun/feature_store/retrieval/base.py +43 -271
  43. mlrun/feature_store/retrieval/dask_merger.py +153 -55
  44. mlrun/feature_store/retrieval/job.py +3 -12
  45. mlrun/feature_store/retrieval/local_merger.py +130 -48
  46. mlrun/feature_store/retrieval/spark_merger.py +125 -126
  47. mlrun/features.py +2 -7
  48. mlrun/model_monitoring/constants.py +6 -48
  49. mlrun/model_monitoring/helpers.py +35 -118
  50. mlrun/model_monitoring/model_monitoring_batch.py +260 -293
  51. mlrun/model_monitoring/stream_processing_fs.py +253 -220
  52. mlrun/platforms/iguazio.py +0 -33
  53. mlrun/projects/project.py +72 -34
  54. mlrun/runtimes/base.py +0 -5
  55. mlrun/runtimes/daskjob.py +0 -2
  56. mlrun/runtimes/function.py +3 -29
  57. mlrun/runtimes/kubejob.py +15 -39
  58. mlrun/runtimes/local.py +45 -7
  59. mlrun/runtimes/mpijob/abstract.py +0 -2
  60. mlrun/runtimes/mpijob/v1.py +0 -2
  61. mlrun/runtimes/pod.py +0 -2
  62. mlrun/runtimes/remotesparkjob.py +0 -2
  63. mlrun/runtimes/serving.py +0 -6
  64. mlrun/runtimes/sparkjob/abstract.py +2 -39
  65. mlrun/runtimes/sparkjob/spark3job.py +0 -2
  66. mlrun/serving/__init__.py +1 -2
  67. mlrun/serving/routers.py +35 -35
  68. mlrun/serving/server.py +12 -22
  69. mlrun/serving/states.py +30 -162
  70. mlrun/serving/v2_serving.py +10 -13
  71. mlrun/utils/clones.py +1 -1
  72. mlrun/utils/model_monitoring.py +96 -122
  73. mlrun/utils/version/version.json +2 -2
  74. {mlrun-1.3.2rc1.dist-info → mlrun-1.3.2rc2.dist-info}/METADATA +27 -23
  75. {mlrun-1.3.2rc1.dist-info → mlrun-1.3.2rc2.dist-info}/RECORD +79 -92
  76. mlrun/api/crud/model_monitoring/grafana.py +0 -427
  77. mlrun/datastore/spark_udf.py +0 -40
  78. mlrun/model_monitoring/__init__.py +0 -44
  79. mlrun/model_monitoring/common.py +0 -112
  80. mlrun/model_monitoring/model_endpoint.py +0 -141
  81. mlrun/model_monitoring/stores/__init__.py +0 -106
  82. mlrun/model_monitoring/stores/kv_model_endpoint_store.py +0 -448
  83. mlrun/model_monitoring/stores/model_endpoint_store.py +0 -147
  84. mlrun/model_monitoring/stores/models/__init__.py +0 -23
  85. mlrun/model_monitoring/stores/models/base.py +0 -18
  86. mlrun/model_monitoring/stores/models/mysql.py +0 -100
  87. mlrun/model_monitoring/stores/models/sqlite.py +0 -98
  88. mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -375
  89. mlrun/utils/db.py +0 -52
  90. {mlrun-1.3.2rc1.dist-info → mlrun-1.3.2rc2.dist-info}/LICENSE +0 -0
  91. {mlrun-1.3.2rc1.dist-info → mlrun-1.3.2rc2.dist-info}/WHEEL +0 -0
  92. {mlrun-1.3.2rc1.dist-info → mlrun-1.3.2rc2.dist-info}/entry_points.txt +0 -0
  93. {mlrun-1.3.2rc1.dist-info → mlrun-1.3.2rc2.dist-info}/top_level.txt +0 -0
mlrun/model_monitoring/stream_processing_fs.py
@@ -19,24 +19,23 @@ import os
 import typing
 
 import pandas as pd
+
+# Constants
 import storey
+import v3io
+import v3io.dataplane
 
-import mlrun
 import mlrun.config
 import mlrun.datastore.targets
 import mlrun.feature_store.steps
 import mlrun.utils
 import mlrun.utils.model_monitoring
 import mlrun.utils.v3io_clients
-from mlrun.model_monitoring import (
+from mlrun.model_monitoring.constants import (
     EventFieldType,
     EventKeyMetrics,
     EventLiveStats,
-    FileTargetKind,
-    ModelEndpointTarget,
-    ProjectSecretKeys,
 )
-from mlrun.model_monitoring.stores import get_model_endpoint_store
 from mlrun.utils import logger
 
 
@@ -46,90 +45,81 @@ class EventStreamProcessor:
         self,
         project: str,
         parquet_batching_max_events: int,
-        parquet_target: str,
         sample_window: int = 10,
+        tsdb_batching_max_events: int = 10,
+        tsdb_batching_timeout_secs: int = 60 * 5,  # Default 5 minutes
         parquet_batching_timeout_secs: int = 30 * 60,  # Default 30 minutes
         aggregate_count_windows: typing.Optional[typing.List[str]] = None,
         aggregate_count_period: str = "30s",
         aggregate_avg_windows: typing.Optional[typing.List[str]] = None,
         aggregate_avg_period: str = "30s",
+        v3io_access_key: typing.Optional[str] = None,
+        v3io_framesd: typing.Optional[str] = None,
+        v3io_api: typing.Optional[str] = None,
         model_monitoring_access_key: str = None,
     ):
-        # General configurations, mainly used for the storey steps in the future serving graph
         self.project = project
         self.sample_window = sample_window
+        self.tsdb_batching_max_events = tsdb_batching_max_events
+        self.tsdb_batching_timeout_secs = tsdb_batching_timeout_secs
+        self.parquet_batching_max_events = parquet_batching_max_events
+        self.parquet_batching_timeout_secs = parquet_batching_timeout_secs
         self.aggregate_count_windows = aggregate_count_windows or ["5m", "1h"]
         self.aggregate_count_period = aggregate_count_period
         self.aggregate_avg_windows = aggregate_avg_windows or ["5m", "1h"]
         self.aggregate_avg_period = aggregate_avg_period
 
-        # Parquet path and configurations
-        self.parquet_path = parquet_target
-        self.parquet_batching_max_events = parquet_batching_max_events
-        self.parquet_batching_timeout_secs = parquet_batching_timeout_secs
-
-        self.model_endpoint_store_target = (
-            mlrun.mlconf.model_endpoint_monitoring.store_type
-        )
-
-        logger.info(
-            "Initializing model monitoring event stream processor",
-            parquet_path=self.parquet_path,
-            parquet_batching_max_events=self.parquet_batching_max_events,
-        )
-
-        self.storage_options = None
-        if not mlrun.mlconf.is_ce_mode():
-            self._initialize_v3io_configurations(
-                model_monitoring_access_key=model_monitoring_access_key
-            )
-
-    def _initialize_v3io_configurations(
-        self,
-        tsdb_batching_max_events: int = 10,
-        tsdb_batching_timeout_secs: int = 60 * 5,  # Default 5 minutes
-        v3io_access_key: typing.Optional[str] = None,
-        v3io_framesd: typing.Optional[str] = None,
-        v3io_api: typing.Optional[str] = None,
-        model_monitoring_access_key: str = None,
-    ):
-        # Get the V3IO configurations
         self.v3io_framesd = v3io_framesd or mlrun.mlconf.v3io_framesd
         self.v3io_api = v3io_api or mlrun.mlconf.v3io_api
 
         self.v3io_access_key = v3io_access_key or os.environ.get("V3IO_ACCESS_KEY")
         self.model_monitoring_access_key = (
             model_monitoring_access_key
-            or os.environ.get(ProjectSecretKeys.ACCESS_KEY)
+            or os.environ.get("MODEL_MONITORING_ACCESS_KEY")
             or self.v3io_access_key
         )
         self.storage_options = dict(
             v3io_access_key=self.model_monitoring_access_key, v3io_api=self.v3io_api
         )
 
-        # KV path
-        kv_path = mlrun.mlconf.get_model_monitoring_file_target_path(
-            project=self.project, kind=FileTargetKind.ENDPOINTS
-        )
+        template = mlrun.mlconf.model_endpoint_monitoring.store_prefixes.default
+
+        kv_path = template.format(project=project, kind="endpoints")
         (
             _,
             self.kv_container,
             self.kv_path,
         ) = mlrun.utils.model_monitoring.parse_model_endpoint_store_prefix(kv_path)
 
-        # TSDB path and configurations
-        tsdb_path = mlrun.mlconf.get_model_monitoring_file_target_path(
-            project=self.project, kind=FileTargetKind.EVENTS
-        )
+        tsdb_path = template.format(project=project, kind="events")
         (
             _,
             self.tsdb_container,
             self.tsdb_path,
         ) = mlrun.utils.model_monitoring.parse_model_endpoint_store_prefix(tsdb_path)
-
         self.tsdb_path = f"{self.tsdb_container}/{self.tsdb_path}"
-        self.tsdb_batching_max_events = tsdb_batching_max_events
-        self.tsdb_batching_timeout_secs = tsdb_batching_timeout_secs
+
+        self.parquet_path = (
+            mlrun.mlconf.model_endpoint_monitoring.store_prefixes.user_space.format(
+                project=project, kind="parquet"
+            )
+        )
+
+        logger.info(
+            "Initializing model monitoring event stream processor",
+            parquet_batching_max_events=self.parquet_batching_max_events,
+            v3io_access_key=self.v3io_access_key,
+            model_monitoring_access_key=self.model_monitoring_access_key,
+            default_store_prefix=mlrun.mlconf.model_endpoint_monitoring.store_prefixes.default,
+            user_space_store_prefix=mlrun.mlconf.model_endpoint_monitoring.store_prefixes.user_space,
+            v3io_api=self.v3io_api,
+            v3io_framesd=self.v3io_framesd,
+            kv_container=self.kv_container,
+            kv_path=self.kv_path,
+            tsdb_container=self.tsdb_container,
+            tsdb_path=self.tsdb_path,
+            parquet_path=self.parquet_path,
+        )
 
     def apply_monitoring_serving_graph(self, fn):
         """
@@ -137,23 +127,20 @@ class EventStreamProcessor:
         of different operations that are executed on the events from the model server. Each event has
         metadata (function_uri, timestamp, class, etc.) but also inputs and predictions from the model server.
         Throughout the serving graph, the results are written to 3 different databases:
-        1. KV/SQL (steps 7-9): Stores metadata and stats about the average latency and the amount of predictions over
-        time per endpoint. for example the amount of predictions of endpoint x in the last 5 min. This data is used
-        by the monitoring dashboards in grafana. The model endpoints table also contains data on the model endpoint
-        from other processes, such as current_stats that is being calculated by the monitoring batch job
-        process. If the target is from type KV, then the model endpoints table can be found under
-        v3io:///users/pipelines/project-name/model-endpoints/endpoints/. If the target is SQL, then the table
-        is stored within the database that was defined in the provided connection string and can be found
-        under mlrun.mlconf.model_endpoint_monitoring.endpoint_store_connection.
+        1. KV (steps 7-9): Stores metadata and stats about the average latency and the amount of predictions over time
+        per endpoint. for example the amount of predictions of endpoint x in the last 5 min. This data is used by
+        the monitoring dashboards in grafana. Please note that the KV table, which can be found under
+        v3io:///users/pipelines/project-name/model-endpoints/endpoints/ also contains data on the model endpoint
+        from other processes, such as current_stats that is being calculated by the monitoring batch job
+        process.
         2. TSDB (steps 12-18): Stores live data of different key metric dictionaries in tsdb target. Results can be
         found under v3io:///users/pipelines/project-name/model-endpoints/events/. At the moment, this part supports
         3 different key metric dictionaries: base_metrics (average latency and predictions over time),
         endpoint_features (Prediction and feature names and values), and custom_metrics (user-defined metrics).
         This data is also being used by the monitoring dashboards in grafana.
         3. Parquet (steps 19-20): This Parquet file includes the required data for the model monitoring batch job
-        that run every hour by default. If defined, the parquet target path can be found under
-        mlrun.mlconf.model_endpoint_monitoring.offline. Otherwise, the default parquet path is under
-        mlrun.mlconf.model_endpoint_monitoring.user_space.
+        that run every hour by default. The parquet target can be found under
+        v3io:///projects/{project}/model-endpoints/.
 
         :param fn: A serving function.
         """
@@ -164,6 +151,9 @@ class EventStreamProcessor:
         def apply_process_endpoint_event():
             graph.add_step(
                 "ProcessEndpointEvent",
+                kv_container=self.kv_container,
+                kv_path=self.kv_path,
+                v3io_access_key=self.v3io_access_key,
                 full_event=True,
                 project=self.project,
             )
@@ -192,8 +182,10 @@ class EventStreamProcessor:
             graph.add_step(
                 "MapFeatureNames",
                 name="MapFeatureNames",
+                kv_container=self.kv_container,
+                kv_path=self.kv_path,
+                access_key=self.v3io_access_key,
                 infer_columns_from_data=True,
-                project=self.project,
                 after="flatten_events",
             )
 
@@ -217,6 +209,7 @@ class EventStreamProcessor:
                 after="MapFeatureNames",
                 step_name="Aggregates",
                 table=".",
+                v3io_access_key=self.v3io_access_key,
             )
             # Step 5.2 - Calculate average latency time for each window (5 min and 1 hour by default)
             graph.add_step(
@@ -233,6 +226,7 @@ class EventStreamProcessor:
                 name=EventFieldType.LATENCY,
                 after=EventFieldType.PREDICTIONS,
                 table=".",
+                v3io_access_key=self.v3io_access_key,
             )
 
         apply_storey_aggregations()
@@ -245,121 +239,117 @@ class EventStreamProcessor:
                 after=EventFieldType.LATENCY,
                 window_size=self.sample_window,
                 key=EventFieldType.ENDPOINT_ID,
+                v3io_access_key=self.v3io_access_key,
             )
 
         apply_storey_sample_window()
 
-        # Steps 7-9 - KV/SQL branch
-        # Step 7 - Filter relevant keys from the event before writing the data into the database table
-        def apply_process_before_endpoint_update():
-            graph.add_step(
-                "ProcessBeforeEndpointUpdate",
-                name="ProcessBeforeEndpointUpdate",
-                after="sample",
-            )
+        # Steps 7-9 - KV branch
+        # Step 7 - Filter relevant keys from the event before writing the data into KV
+        def apply_process_before_kv():
+            graph.add_step("ProcessBeforeKV", name="ProcessBeforeKV", after="sample")
 
-        apply_process_before_endpoint_update()
+        apply_process_before_kv()
 
-        # Step 8 - Write the filtered event to KV/SQL table. At this point, the serving graph updates the stats
+        # Step 8 - Write the filtered event to KV table. At this point, the serving graph updates the stats
         # about average latency and the amount of predictions over time
-        def apply_update_endpoint():
+        def apply_write_to_kv():
             graph.add_step(
-                "UpdateEndpoint",
-                name="UpdateEndpoint",
-                after="ProcessBeforeEndpointUpdate",
-                project=self.project,
-                model_endpoint_store_target=self.model_endpoint_store_target,
+                "WriteToKV",
+                name="WriteToKV",
+                after="ProcessBeforeKV",
+                container=self.kv_container,
+                table=self.kv_path,
+                v3io_access_key=self.v3io_access_key,
             )
 
-        apply_update_endpoint()
+        apply_write_to_kv()
 
-        # Step 9 (only for KV target) - Apply infer_schema on the model endpoints table for generating schema file
+        # Step 9 - Apply infer_schema on the KB table for generating schema file
         # which will be used by Grafana monitoring dashboards
         def apply_infer_schema():
             graph.add_step(
                 "InferSchema",
                 name="InferSchema",
-                after="UpdateEndpoint",
+                after="WriteToKV",
+                v3io_access_key=self.v3io_access_key,
                 v3io_framesd=self.v3io_framesd,
                 container=self.kv_container,
                 table=self.kv_path,
             )
 
-        if self.model_endpoint_store_target == ModelEndpointTarget.V3IO_NOSQL:
-            apply_infer_schema()
+        apply_infer_schema()
 
-        # Steps 11-18 - TSDB branch (not supported in CE environment at the moment)
+        # Steps 11-18 - TSDB branch
+        # Step 11 - Before writing data to TSDB, create dictionary of 2-3 dictionaries that contains
+        # stats and details about the events
+        def apply_process_before_tsdb():
+            graph.add_step(
+                "ProcessBeforeTSDB", name="ProcessBeforeTSDB", after="sample"
+            )
 
-        if not mlrun.mlconf.is_ce_mode():
-            # Step 11 - Before writing data to TSDB, create dictionary of 2-3 dictionaries that contains
-            # stats and details about the events
-            def apply_process_before_tsdb():
-                graph.add_step(
-                    "ProcessBeforeTSDB", name="ProcessBeforeTSDB", after="sample"
-                )
+        apply_process_before_tsdb()
 
-            apply_process_before_tsdb()
+        # Steps 12-18: - Unpacked keys from each dictionary and write to TSDB target
+        def apply_filter_and_unpacked_keys(name, keys):
+            graph.add_step(
+                "FilterAndUnpackKeys",
+                name=name,
+                after="ProcessBeforeTSDB",
+                keys=[keys],
+            )
 
-            # Steps 12-18: - Unpacked keys from each dictionary and write to TSDB target
-            def apply_filter_and_unpacked_keys(name, keys):
-                graph.add_step(
-                    "FilterAndUnpackKeys",
-                    name=name,
-                    after="ProcessBeforeTSDB",
-                    keys=[keys],
-                )
+        def apply_tsdb_target(name, after):
+            graph.add_step(
+                "storey.TSDBTarget",
+                name=name,
+                after=after,
+                path=self.tsdb_path,
+                rate="10/m",
+                time_col=EventFieldType.TIMESTAMP,
+                container=self.tsdb_container,
+                access_key=self.v3io_access_key,
+                v3io_frames=self.v3io_framesd,
+                infer_columns_from_data=True,
+                index_cols=[
+                    EventFieldType.ENDPOINT_ID,
+                    EventFieldType.RECORD_TYPE,
+                ],
+                max_events=self.tsdb_batching_max_events,
+                flush_after_seconds=self.tsdb_batching_timeout_secs,
+                key=EventFieldType.ENDPOINT_ID,
+            )
 
-            def apply_tsdb_target(name, after):
-                graph.add_step(
-                    "storey.TSDBTarget",
-                    name=name,
-                    after=after,
-                    path=self.tsdb_path,
-                    rate="10/m",
-                    time_col=EventFieldType.TIMESTAMP,
-                    container=self.tsdb_container,
-                    access_key=self.v3io_access_key,
-                    v3io_frames=self.v3io_framesd,
-                    infer_columns_from_data=True,
-                    index_cols=[
-                        EventFieldType.ENDPOINT_ID,
-                        EventFieldType.RECORD_TYPE,
-                    ],
-                    max_events=self.tsdb_batching_max_events,
-                    flush_after_seconds=self.tsdb_batching_timeout_secs,
-                    key=EventFieldType.ENDPOINT_ID,
-                )
+        # Steps 12-13 - unpacked base_metrics dictionary
+        apply_filter_and_unpacked_keys(
+            name="FilterAndUnpackKeys1",
+            keys=EventKeyMetrics.BASE_METRICS,
+        )
+        apply_tsdb_target(name="tsdb1", after="FilterAndUnpackKeys1")
 
-            # Steps 12-13 - unpacked base_metrics dictionary
-            apply_filter_and_unpacked_keys(
-                name="FilterAndUnpackKeys1",
-                keys=EventKeyMetrics.BASE_METRICS,
-            )
-            apply_tsdb_target(name="tsdb1", after="FilterAndUnpackKeys1")
+        # Steps 14-15 - unpacked endpoint_features dictionary
+        apply_filter_and_unpacked_keys(
+            name="FilterAndUnpackKeys2",
+            keys=EventKeyMetrics.ENDPOINT_FEATURES,
+        )
+        apply_tsdb_target(name="tsdb2", after="FilterAndUnpackKeys2")
 
-            # Steps 14-15 - unpacked endpoint_features dictionary
-            apply_filter_and_unpacked_keys(
-                name="FilterAndUnpackKeys2",
-                keys=EventKeyMetrics.ENDPOINT_FEATURES,
-            )
-            apply_tsdb_target(name="tsdb2", after="FilterAndUnpackKeys2")
+        # Steps 16-18 - unpacked custom_metrics dictionary. In addition, use storey.Filter remove none values
+        apply_filter_and_unpacked_keys(
+            name="FilterAndUnpackKeys3",
+            keys=EventKeyMetrics.CUSTOM_METRICS,
+        )
 
-            # Steps 16-18 - unpacked custom_metrics dictionary. In addition, use storey.Filter remove none values
-            apply_filter_and_unpacked_keys(
-                name="FilterAndUnpackKeys3",
-                keys=EventKeyMetrics.CUSTOM_METRICS,
+        def apply_storey_filter():
+            graph.add_step(
+                "storey.Filter",
+                "FilterNotNone",
+                after="FilterAndUnpackKeys3",
+                _fn="(event is not None)",
             )
 
-            def apply_storey_filter():
-                graph.add_step(
-                    "storey.Filter",
-                    "FilterNotNone",
-                    after="FilterAndUnpackKeys3",
-                    _fn="(event is not None)",
-                )
-
-            apply_storey_filter()
-            apply_tsdb_target(name="tsdb3", after="FilterNotNone")
+        apply_storey_filter()
+        apply_tsdb_target(name="tsdb3", after="FilterNotNone")
 
         # Steps 19-20 - Parquet branch
         # Step 19 - Filter and validate different keys before writing the data to Parquet target
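
The fan-out used in steps 11-18 above can be read in isolation. A hedged sketch of the same add_step wiring, assuming a fresh serving function (the function name and the standalone topology setup are illustrative, not taken from the diff):

    import mlrun

    fn = mlrun.new_function("monitoring-stream-demo", kind="serving")
    graph = fn.set_topology("flow")

    # One upstream step fans out into three unpack branches, mirroring steps 12-18
    graph.add_step("ProcessBeforeTSDB", name="ProcessBeforeTSDB")
    for i, key in enumerate(
        ["base_metrics", "endpoint_features", "custom_metrics"], start=1
    ):
        graph.add_step(
            "FilterAndUnpackKeys",
            name=f"FilterAndUnpackKeys{i}",
            after="ProcessBeforeTSDB",
            keys=[key],
        )
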
@@ -394,14 +384,14 @@ class EventStreamProcessor:
         apply_parquet_target()
 
 
-class ProcessBeforeEndpointUpdate(mlrun.feature_store.steps.MapClass):
+class ProcessBeforeKV(mlrun.feature_store.steps.MapClass):
     def __init__(self, **kwargs):
         """
-        Filter relevant keys from the event before writing the data to database table (in EndpointUpdate step).
-        Note that in the endpoint table we only keep metadata (function_uri, model_class, etc.) and stats about the
-        average latency and the number of predictions (per 5min and 1hour).
+        Filter relevant keys from the event before writing the data to KV table (in WriteToKV step). Note that in KV
+        we only keep metadata (function_uri, model_class, etc.) and stats about the average latency and the number
+        of predictions (per 5min and 1hour).
 
-        :returns: A filtered event as a dictionary which will be written to the endpoint table in the next step.
+        :returns: A filtered event as a dictionary which will be written to KV table in the next step.
         """
         super().__init__(**kwargs)
 
@@ -418,31 +408,26 @@ class ProcessBeforeEndpointUpdate(mlrun.feature_store.steps.MapClass):
                 EventFieldType.FUNCTION_URI,
                 EventFieldType.MODEL,
                 EventFieldType.MODEL_CLASS,
+                EventFieldType.TIMESTAMP,
                 EventFieldType.ENDPOINT_ID,
                 EventFieldType.LABELS,
-                EventFieldType.FIRST_REQUEST,
-                EventFieldType.LAST_REQUEST,
-                EventFieldType.ERROR_COUNT,
-            ]
-        }
-
-        # Add generic metrics statistics
-        generic_metrics = {
-            k: event[k]
-            for k in [
+                EventFieldType.UNPACKED_LABELS,
                 EventLiveStats.LATENCY_AVG_5M,
                 EventLiveStats.LATENCY_AVG_1H,
                 EventLiveStats.PREDICTIONS_PER_SECOND,
                 EventLiveStats.PREDICTIONS_COUNT_5M,
                 EventLiveStats.PREDICTIONS_COUNT_1H,
+                EventFieldType.FIRST_REQUEST,
+                EventFieldType.LAST_REQUEST,
+                EventFieldType.ERROR_COUNT,
             ]
         }
-
-        e[EventFieldType.METRICS] = json.dumps(
-            {EventKeyMetrics.GENERIC: generic_metrics}
-        )
-
-        # Write labels as json string as required by the DB format
+        # Unpack labels dictionary
+        e = {
+            **e.pop(EventFieldType.UNPACKED_LABELS, {}),
+            **e,
+        }
+        # Write labels to kv as json string to be presentable later
         e[EventFieldType.LABELS] = json.dumps(e[EventFieldType.LABELS])
 
         return e
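
The unpack-then-merge added above relies on Python's left-to-right evaluation of dict literals; a tiny self-contained sketch with illustrative keys:

    # e.pop(...) runs first, so the unpacked label keys are spread before the
    # remaining event keys; on a key collision the original event value wins.
    e = {"endpoint_id": "ep-1", "unpacked_labels": {"_env": "prod"}}
    e = {
        **e.pop("unpacked_labels", {}),
        **e,
    }
    assert e == {"_env": "prod", "endpoint_id": "ep-1"}
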
@@ -538,6 +523,7 @@ class ProcessBeforeParquet(mlrun.feature_store.steps.MapClass):
         logger.info("ProcessBeforeParquet1", event=event)
         # Remove the following keys from the event
         for key in [
+            EventFieldType.UNPACKED_LABELS,
             EventFieldType.FEATURES,
             EventFieldType.NAMED_FEATURES,
         ]:
@@ -563,23 +549,32 @@ class ProcessBeforeParquet(mlrun.feature_store.steps.MapClass):
 class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
     def __init__(
         self,
-        project: str,
+        kv_container: str,
+        kv_path: str,
+        v3io_access_key: str,
         **kwargs,
     ):
         """
         Process event or batch of events as part of the first step of the monitoring serving graph. It includes
-        Adding important details to the event such as endpoint_id, handling errors coming from the stream, validation
+        Adding important details to the event such as endpoint_id, handling errors coming from the stream, Validation
         of event data such as inputs and outputs, and splitting model event into sub-events.
 
-        :param project: Project name.
+        :param kv_container: Name of the container that will be used to retrieve the endpoint id. For model
+                             endpoints it is usually 'users'.
+        :param kv_path: KV table path that will be used to retrieve the endpoint id. For model endpoints
+                        it is usually pipelines/project-name/model-endpoints/endpoints/
+        :param v3io_access_key: Access key with permission to read from a KV table.
+        :param project: Project name.
+
 
         :returns: A Storey event object which is the basic unit of data in Storey. Note that the next steps of
                   the monitoring serving graph are based on Storey operations.
 
         """
         super().__init__(**kwargs)
-
-        self.project: str = project
+        self.kv_container: str = kv_container
+        self.kv_path: str = kv_path
+        self.v3io_access_key: str = v3io_access_key
 
         # First and last requests timestamps (value) of each endpoint (key)
         self.first_request: typing.Dict[str, str] = dict()
@@ -607,7 +602,7 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
         version = event.get(EventFieldType.VERSION)
         versioned_model = f"{model}:{version}" if version else f"{model}:latest"
 
-        endpoint_id = mlrun.model_monitoring.create_model_endpoint_uid(
+        endpoint_id = mlrun.utils.model_monitoring.create_model_endpoint_id(
             function_uri=function_uri,
             versioned_model=versioned_model,
         )
@@ -684,6 +679,11 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
         ):
             return None
 
+        # Get labels from event (if exist)
+        unpacked_labels = {
+            f"_{k}": v for k, v in event.get(EventFieldType.LABELS, {}).items()
+        }
+
         # Adjust timestamp format
         timestamp = datetime.datetime.strptime(timestamp[:-6], "%Y-%m-%d %H:%M:%S.%f")
 
@@ -722,6 +722,7 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
                     EventFieldType.ENTITIES: event.get("request", {}).get(
                         EventFieldType.ENTITIES, {}
                     ),
+                    EventFieldType.UNPACKED_LABELS: unpacked_labels,
                 }
             )
 
@@ -750,8 +751,8 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
                 f"{self.last_request[endpoint_id]} - write to TSDB will be rejected"
             )
 
-    @staticmethod
     def is_list_of_numerics(
+        self,
         field: typing.List[typing.Union[int, float, dict, list]],
         dict_path: typing.List[str],
     ):
@@ -766,11 +767,12 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
         # Make sure process is resumable, if process fails for any reason, be able to pick things up close to where we
         # left them
         if endpoint_id not in self.endpoints:
-
             logger.info("Trying to resume state", endpoint_id=endpoint_id)
             endpoint_record = get_endpoint_record(
-                project=self.project,
+                kv_container=self.kv_container,
+                kv_path=self.kv_path,
                 endpoint_id=endpoint_id,
+                access_key=self.v3io_access_key,
             )
 
             # If model endpoint found, get first_request, last_request and error_count values
@@ -788,7 +790,7 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
                 error_count = endpoint_record.get(EventFieldType.ERROR_COUNT)
 
                 if error_count:
-                    self.error_count[endpoint_id] = int(error_count)
+                    self.error_count[endpoint_id] = error_count
 
             # add endpoint to endpoints set
             self.endpoints.add(endpoint_id)
@@ -855,7 +857,9 @@ class FilterAndUnpackKeys(mlrun.feature_store.steps.MapClass):
 class MapFeatureNames(mlrun.feature_store.steps.MapClass):
     def __init__(
         self,
-        project: str,
+        kv_container: str,
+        kv_path: str,
+        access_key: str,
         infer_columns_from_data: bool = False,
         **kwargs,
     ):
@@ -863,7 +867,11 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
         Validating feature names and label columns and map each feature to its value. In the end of this step,
         the event should have key-value pairs of (feature name: feature value).
 
-        :param project: Project name.
+        :param kv_container: Name of the container that will be used to retrieve the endpoint id. For model
+                             endpoints it is usually 'users'.
+        :param kv_path: KV table path that will be used to retrieve the endpoint id. For model endpoints
+                        it is usually pipelines/project-name/model-endpoints/endpoints/
+        :param v3io_access_key: Access key with permission to read from a KV table.
         :param infer_columns_from_data: If true and features or labels names were not found, then try to
                                         retrieve them from data that was stored in the previous events of
                                         the current process. This data can be found under self.feature_names and
@@ -874,9 +882,10 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
                                         feature names and values (as well as the prediction results).
         """
         super().__init__(**kwargs)
-
+        self.kv_container = kv_container
+        self.kv_path = kv_path
+        self.access_key = access_key
         self._infer_columns_from_data = infer_columns_from_data
-        self.project = project
 
         # Dictionaries that will be used in case features names
         # and labels columns were not found in the current event
@@ -905,8 +914,10 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
         # Get feature names and label columns
         if endpoint_id not in self.feature_names:
             endpoint_record = get_endpoint_record(
-                project=self.project,
+                kv_container=self.kv_container,
+                kv_path=self.kv_path,
                 endpoint_id=endpoint_id,
+                access_key=self.access_key,
             )
             feature_names = endpoint_record.get(EventFieldType.FEATURE_NAMES)
             feature_names = json.loads(feature_names) if feature_names else None
@@ -929,12 +940,15 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
                 ]
 
                 # Update the endpoint record with the generated features
-                update_endpoint_record(
-                    project=self.project,
-                    endpoint_id=endpoint_id,
+                mlrun.utils.v3io_clients.get_v3io_client().kv.update(
+                    container=self.kv_container,
+                    table_path=self.kv_path,
+                    access_key=self.access_key,
+                    key=event[EventFieldType.ENDPOINT_ID],
                     attributes={
                         EventFieldType.FEATURE_NAMES: json.dumps(feature_names)
                     },
+                    raise_for_status=v3io.dataplane.RaiseForStatus.always,
                 )
 
                 # Similar process with label columns
@@ -949,11 +963,15 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
                 label_columns = [
                     f"p{i}" for i, _ in enumerate(event[EventFieldType.PREDICTION])
                 ]
-
-                update_endpoint_record(
-                    project=self.project,
-                    endpoint_id=endpoint_id,
-                    attributes={EventFieldType.LABEL_NAMES: json.dumps(label_columns)},
+                mlrun.utils.v3io_clients.get_v3io_client().kv.update(
+                    container=self.kv_container,
+                    table_path=self.kv_path,
+                    access_key=self.access_key,
+                    key=event[EventFieldType.ENDPOINT_ID],
+                    attributes={
+                        EventFieldType.LABEL_COLUMNS: json.dumps(label_columns)
+                    },
+                    raise_for_status=v3io.dataplane.RaiseForStatus.always,
                 )
 
                 self.label_columns[endpoint_id] = label_columns
@@ -1015,24 +1033,33 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
             event[mapping_dictionary][name] = value
 
 
-class UpdateEndpoint(mlrun.feature_store.steps.MapClass):
-    def __init__(self, project: str, model_endpoint_store_target: str, **kwargs):
+class WriteToKV(mlrun.feature_store.steps.MapClass):
+    def __init__(self, container: str, table: str, v3io_access_key: str, **kwargs):
         """
-        Update the model endpoint record in the DB. Note that the event at this point includes metadata and stats about
-        the average latency and the amount of predictions over time. This data will be used in the monitoring dashboards
+        Writes the event to KV table. Note that the event at this point includes metadata and stats about the
+        average latency and the amount of predictions over time. This data will be used in the monitoring dashboards
         such as "Model Monitoring - Performance" which can be found in Grafana.
 
+        :param kv_container: Name of the container that will be used to retrieve the endpoint id. For model
+                             endpoints it is usually 'users'.
+        :param table: KV table path that will be used to retrieve the endpoint id. For model endpoints
+                      it is usually pipelines/project-name/model-endpoints/endpoints/.
+        :param v3io_access_key: Access key with permission to read from a KV table.
+
         :returns: Event as a dictionary (without any changes) for the next step (InferSchema).
         """
         super().__init__(**kwargs)
-        self.project = project
-        self.model_endpoint_store_target = model_endpoint_store_target
+        self.container = container
+        self.table = table
+        self.v3io_access_key = v3io_access_key
 
     def do(self, event: typing.Dict):
-        update_endpoint_record(
-            project=self.project,
-            endpoint_id=event.pop(EventFieldType.ENDPOINT_ID),
+        mlrun.utils.v3io_clients.get_v3io_client().kv.update(
+            container=self.container,
+            table_path=self.table,
+            key=event[EventFieldType.ENDPOINT_ID],
             attributes=event,
+            access_key=self.v3io_access_key,
         )
         return event
 
@@ -1040,6 +1067,7 @@ class UpdateEndpoint(mlrun.feature_store.steps.MapClass):
 class InferSchema(mlrun.feature_store.steps.MapClass):
     def __init__(
         self,
+        v3io_access_key: str,
         v3io_framesd: str,
         container: str,
         table: str,
@@ -1059,40 +1087,45 @@ class InferSchema(mlrun.feature_store.steps.MapClass):
         """
         super().__init__(**kwargs)
         self.container = container
+        self.v3io_access_key = v3io_access_key
         self.v3io_framesd = v3io_framesd
         self.table = table
         self.keys = set()
 
     def do(self, event: typing.Dict):
-
         key_set = set(event.keys())
         if not key_set.issubset(self.keys):
             self.keys.update(key_set)
             # Apply infer_schema on the kv table for generating the schema file
             mlrun.utils.v3io_clients.get_frames_client(
+                token=self.v3io_access_key,
                 container=self.container,
                 address=self.v3io_framesd,
             ).execute(backend="kv", table=self.table, command="infer_schema")
-
         return event
 
 
-def update_endpoint_record(
-    project: str,
-    endpoint_id: str,
-    attributes: dict,
-):
-    model_endpoint_store = get_model_endpoint_store(
-        project=project,
+def get_endpoint_record(
+    kv_container: str, kv_path: str, endpoint_id: str, access_key: str
+) -> typing.Optional[dict]:
+    logger.info(
+        "Grabbing endpoint data",
+        container=kv_container,
+        table_path=kv_path,
+        key=endpoint_id,
     )
-
-    model_endpoint_store.update_model_endpoint(
-        endpoint_id=endpoint_id, attributes=attributes
-    )
-
-
-def get_endpoint_record(project: str, endpoint_id: str):
-    model_endpoint_store = get_model_endpoint_store(
-        project=project,
-    )
-    return model_endpoint_store.get_model_endpoint(endpoint_id=endpoint_id)
+    try:
+        endpoint_record = (
+            mlrun.utils.v3io_clients.get_v3io_client()
+            .kv.get(
+                container=kv_container,
+                table_path=kv_path,
+                key=endpoint_id,
+                access_key=access_key,
+                raise_for_status=v3io.dataplane.RaiseForStatus.always,
+            )
+            .output.item
+        )
+        return endpoint_record
+    except Exception:
+        return None
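
A minimal sketch of how the reworked processor might be driven end to end; the project name and access key are placeholders, and the batching value is only illustrative (the constructor arguments come from the rc2 signature shown above):

    import mlrun
    from mlrun.model_monitoring.stream_processing_fs import EventStreamProcessor

    processor = EventStreamProcessor(
        project="my-project",                 # placeholder project name
        parquet_batching_max_events=10_000,   # illustrative batch size
        sample_window=10,
        v3io_access_key="<v3io-access-key>",  # placeholder credential
    )

    # Wire the KV, TSDB and Parquet branches into a serving function's graph
    fn = mlrun.new_function("model-monitoring-stream", kind="serving")
    processor.apply_monitoring_serving_graph(fn)
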