mlrun 1.7.0rc15__py3-none-any.whl → 1.7.0rc17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mlrun might be problematic. Click here for more details.

Files changed (77) hide show
  1. mlrun/__init__.py +10 -1
  2. mlrun/__main__.py +18 -4
  3. mlrun/alerts/__init__.py +15 -0
  4. mlrun/alerts/alert.py +144 -0
  5. mlrun/artifacts/__init__.py +7 -1
  6. mlrun/artifacts/base.py +28 -3
  7. mlrun/artifacts/dataset.py +8 -0
  8. mlrun/artifacts/manager.py +18 -0
  9. mlrun/artifacts/model.py +8 -1
  10. mlrun/artifacts/plots.py +13 -0
  11. mlrun/common/schemas/__init__.py +10 -2
  12. mlrun/common/schemas/alert.py +64 -5
  13. mlrun/common/schemas/api_gateway.py +4 -0
  14. mlrun/common/schemas/artifact.py +15 -0
  15. mlrun/common/schemas/auth.py +2 -0
  16. mlrun/common/schemas/model_monitoring/__init__.py +4 -1
  17. mlrun/common/schemas/model_monitoring/constants.py +17 -1
  18. mlrun/common/schemas/model_monitoring/model_endpoints.py +60 -1
  19. mlrun/common/schemas/project.py +5 -1
  20. mlrun/config.py +11 -4
  21. mlrun/datastore/datastore_profile.py +10 -7
  22. mlrun/db/base.py +24 -4
  23. mlrun/db/httpdb.py +97 -43
  24. mlrun/db/nopdb.py +25 -4
  25. mlrun/errors.py +5 -0
  26. mlrun/launcher/base.py +3 -2
  27. mlrun/lists.py +4 -0
  28. mlrun/model.py +15 -8
  29. mlrun/model_monitoring/__init__.py +1 -1
  30. mlrun/model_monitoring/applications/_application_steps.py +1 -2
  31. mlrun/model_monitoring/applications/context.py +1 -1
  32. mlrun/model_monitoring/applications/histogram_data_drift.py +64 -38
  33. mlrun/model_monitoring/db/__init__.py +2 -0
  34. mlrun/model_monitoring/db/stores/base/store.py +9 -36
  35. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +63 -110
  36. mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +56 -202
  37. mlrun/model_monitoring/db/tsdb/__init__.py +71 -0
  38. mlrun/model_monitoring/db/tsdb/base.py +135 -0
  39. mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
  40. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +117 -0
  41. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +442 -0
  42. mlrun/model_monitoring/db/v3io_tsdb_reader.py +134 -0
  43. mlrun/model_monitoring/stream_processing.py +46 -210
  44. mlrun/model_monitoring/writer.py +50 -100
  45. mlrun/platforms/__init__.py +10 -9
  46. mlrun/platforms/iguazio.py +19 -200
  47. mlrun/projects/operations.py +11 -7
  48. mlrun/projects/pipelines.py +13 -76
  49. mlrun/projects/project.py +62 -17
  50. mlrun/render.py +9 -3
  51. mlrun/run.py +5 -38
  52. mlrun/runtimes/__init__.py +1 -0
  53. mlrun/runtimes/base.py +3 -3
  54. mlrun/runtimes/kubejob.py +2 -1
  55. mlrun/runtimes/nuclio/api_gateway.py +163 -77
  56. mlrun/runtimes/nuclio/application/application.py +160 -7
  57. mlrun/runtimes/nuclio/function.py +25 -45
  58. mlrun/runtimes/pod.py +16 -36
  59. mlrun/runtimes/remotesparkjob.py +1 -1
  60. mlrun/runtimes/sparkjob/spark3job.py +1 -1
  61. mlrun/runtimes/utils.py +0 -38
  62. mlrun/track/tracker.py +2 -1
  63. mlrun/utils/helpers.py +51 -31
  64. mlrun/utils/logger.py +11 -6
  65. mlrun/utils/notifications/notification/base.py +1 -1
  66. mlrun/utils/notifications/notification/slack.py +9 -4
  67. mlrun/utils/notifications/notification/webhook.py +1 -1
  68. mlrun/utils/notifications/notification_pusher.py +21 -14
  69. mlrun/utils/version/version.json +2 -2
  70. {mlrun-1.7.0rc15.dist-info → mlrun-1.7.0rc17.dist-info}/METADATA +4 -3
  71. {mlrun-1.7.0rc15.dist-info → mlrun-1.7.0rc17.dist-info}/RECORD +75 -69
  72. mlrun/kfpops.py +0 -860
  73. mlrun/platforms/other.py +0 -305
  74. {mlrun-1.7.0rc15.dist-info → mlrun-1.7.0rc17.dist-info}/LICENSE +0 -0
  75. {mlrun-1.7.0rc15.dist-info → mlrun-1.7.0rc17.dist-info}/WHEEL +0 -0
  76. {mlrun-1.7.0rc15.dist-info → mlrun-1.7.0rc17.dist-info}/entry_points.txt +0 -0
  77. {mlrun-1.7.0rc15.dist-info → mlrun-1.7.0rc17.dist-info}/top_level.txt +0 -0
@@ -30,7 +30,6 @@ import mlrun.model_monitoring.db
30
30
  import mlrun.model_monitoring.prometheus
31
31
  import mlrun.serving.states
32
32
  import mlrun.utils
33
- import mlrun.utils.v3io_clients
34
33
  from mlrun.common.schemas.model_monitoring.constants import (
35
34
  EventFieldType,
36
35
  EventKeyMetrics,
@@ -78,6 +77,7 @@ class EventStreamProcessor:
78
77
  )
79
78
 
80
79
  self.storage_options = None
80
+ self.tsdb_configurations = {}
81
81
  if not mlrun.mlconf.is_ce_mode():
82
82
  self._initialize_v3io_configurations(
83
83
  model_monitoring_access_key=model_monitoring_access_key
@@ -138,29 +138,29 @@ class EventStreamProcessor:
138
138
 
139
139
  def apply_monitoring_serving_graph(self, fn: mlrun.runtimes.ServingRuntime) -> None:
140
140
  """
141
- Apply monitoring serving graph to a given serving function. The following serving graph includes about 20 steps
142
- of different operations that are executed on the events from the model server. Each event has
143
- metadata (function_uri, timestamp, class, etc.) but also inputs and predictions from the model server.
144
- Throughout the serving graph, the results are written to 3 different databases:
145
- 1. KV/SQL (steps 9-11): Stores metadata and stats about the average latency and the amount of predictions over
146
- time per endpoint. for example the amount of predictions of endpoint x in the last 5 min. This data is used
147
- by the monitoring dashboards in grafana. The model endpoints table also contains data on the model endpoint
148
- from other processes, such as current_stats that is being calculated by the monitoring batch job
149
- process. If the target is from type KV, then the model endpoints table can be found under
150
- v3io:///users/pipelines/project-name/model-endpoints/endpoints/. If the target is SQL, then the table
151
- is stored within the database that was defined in the provided connection string and can be found
152
- under mlrun.mlconf.model_endpoint_monitoring.endpoint_store_connection.
153
- 2. V3IO TSDB/Prometheus (steps 13-21): Stores live data of different key metric dictionaries in tsdb target.
154
- This data is being used by the monitoring dashboards in grafana. If using V3IO TSDB (steps 13-19), results
141
+ Apply monitoring serving graph to a given serving function. The following serving graph includes about 4 main
142
+ parts that each one of them includes several steps of different operations that are executed on the events from
143
+ the model server.
144
+ Each event has metadata (function_uri, timestamp, class, etc.) but also inputs, predictions and optional
145
+ metrics from the model server.
146
+ In the first part, the serving graph processes the event and splits it into sub-events. This part also includes
147
+ validation of the event data and adding important details to the event such as endpoint_id.
148
+ In the next parts, the serving graph stores data to 3 different targets:
149
+ 1. KV/SQL: Metadata and basic stats about the average latency and the amount of predictions over
150
+ time per endpoint. For example, the amount of predictions of endpoint x in the last 5 min. The model
151
+ endpoints table also contains data on the model endpoint from other processes, such as feature_stats that
152
+ represents sample statistics from the training data. If the target is from type KV, then the model endpoints
153
+ table can be found under v3io:///users/pipelines/project-name/model-endpoints/endpoints/. If the target is
154
+ SQL, then the table is stored within the database that was defined in the provided connection string.
155
+ 2. TSDB: live data of different key metric dictionaries in tsdb target.
156
+ This data is being used by the monitoring dashboards in grafana. If using V3IO TSDB, results
155
157
  can be found under v3io:///users/pipelines/project-name/model-endpoints/events/. In that case, we generate
156
158
  3 different key metric dictionaries: base_metrics (average latency and predictions over time),
157
159
  endpoint_features (Prediction and feature names and values), and custom_metrics (user-defined metrics).
158
- If using Prometheus (steps 20-21), we update metrics in the Prometheus registry that is stored in the
159
- monitoring stream local memory.
160
- 3. Parquet (steps 22-23): This Parquet file includes the required data for the model monitoring batch job
161
- that run every hour by default. If defined, the parquet target path can be found under
162
- mlrun.mlconf.model_endpoint_monitoring.offline. Otherwise, the default parquet path is under
163
- mlrun.mlconf.model_endpoint_monitoring.user_space.
160
+ 3. Parquet: This Parquet file includes the required data for the model monitoring applications. If defined,
161
+ the parquet target path can be found under mlrun.mlconf.model_endpoint_monitoring.offline. Otherwise,
162
+ the default parquet path is under mlrun.mlconf.model_endpoint_monitoring.user_space. Note that if you are
163
+ using CE, the parquet target path is based on the defined MLRun artifact path.
164
164
 
165
165
  :param fn: A serving function.
166
166
  """
@@ -170,7 +170,7 @@ class EventStreamProcessor:
170
170
  fn.set_topology(mlrun.serving.states.StepKinds.flow),
171
171
  )
172
172
 
173
- # Step 1 - Event routing based on the provided path
173
+ # Event routing based on the provided path
174
174
  def apply_event_routing():
175
175
  typing.cast(
176
176
  mlrun.serving.TaskStep,
@@ -183,7 +183,7 @@ class EventStreamProcessor:
183
183
 
184
184
  apply_event_routing()
185
185
 
186
- # Step 2 - Filter out events with '-' in the path basename from going forward
186
+ # Filter out events with '-' in the path basename from going forward
187
187
  # through the next steps of the stream graph
188
188
  def apply_storey_filter_stream_events():
189
189
  # Filter events with Prometheus endpoints path
@@ -196,7 +196,7 @@ class EventStreamProcessor:
196
196
 
197
197
  apply_storey_filter_stream_events()
198
198
 
199
- # Step 3 - Process endpoint event: splitting into sub-events and validate event data
199
+ # Process endpoint event: splitting into sub-events and validate event data
200
200
  def apply_process_endpoint_event():
201
201
  graph.add_step(
202
202
  "ProcessEndpointEvent",
@@ -207,7 +207,7 @@ class EventStreamProcessor:
207
207
 
208
208
  apply_process_endpoint_event()
209
209
 
210
- # Steps 4,5 - Applying Storey operations of filtering and flatten
210
+ # Applying Storey operations of filtering and flatten
211
211
  def apply_storey_filter_and_flatmap():
212
212
  # Remove none values from each event
213
213
  graph.add_step(
@@ -224,7 +224,7 @@ class EventStreamProcessor:
224
224
 
225
225
  apply_storey_filter_and_flatmap()
226
226
 
227
- # Step 6 - Validating feature names and map each feature to its value
227
+ # Validating feature names and map each feature to its value
228
228
  def apply_map_feature_names():
229
229
  graph.add_step(
230
230
  "MapFeatureNames",
@@ -236,9 +236,9 @@ class EventStreamProcessor:
236
236
 
237
237
  apply_map_feature_names()
238
238
 
239
- # Step 7 - Calculate number of predictions and average latency
239
+ # Calculate number of predictions and average latency
240
240
  def apply_storey_aggregations():
241
- # Step 7.1 - Calculate number of predictions for each window (5 min and 1 hour by default)
241
+ # Calculate number of predictions for each window (5 min and 1 hour by default)
242
242
  graph.add_step(
243
243
  class_name="storey.AggregateByKey",
244
244
  aggregates=[
@@ -256,7 +256,7 @@ class EventStreamProcessor:
256
256
  table=".",
257
257
  key_field=EventFieldType.ENDPOINT_ID,
258
258
  )
259
- # Step 7.2 - Calculate average latency time for each window (5 min and 1 hour by default)
259
+ # Calculate average latency time for each window (5 min and 1 hour by default)
260
260
  graph.add_step(
261
261
  class_name="storey.Rename",
262
262
  mapping={
@@ -269,8 +269,8 @@ class EventStreamProcessor:
269
269
 
270
270
  apply_storey_aggregations()
271
271
 
272
- # Steps 8-10 - KV/SQL branch
273
- # Step 8 - Filter relevant keys from the event before writing the data into the database table
272
+ # KV/SQL branch
273
+ # Filter relevant keys from the event before writing the data into the database table
274
274
  def apply_process_before_endpoint_update():
275
275
  graph.add_step(
276
276
  "ProcessBeforeEndpointUpdate",
@@ -280,7 +280,7 @@ class EventStreamProcessor:
280
280
 
281
281
  apply_process_before_endpoint_update()
282
282
 
283
- # Step 9 - Write the filtered event to KV/SQL table. At this point, the serving graph updates the stats
283
+ # Write the filtered event to KV/SQL table. At this point, the serving graph updates the stats
284
284
  # about average latency and the amount of predictions over time
285
285
  def apply_update_endpoint():
286
286
  graph.add_step(
@@ -293,7 +293,7 @@ class EventStreamProcessor:
293
293
 
294
294
  apply_update_endpoint()
295
295
 
296
- # Step 10 (only for KV target) - Apply infer_schema on the model endpoints table for generating schema file
296
+ # (only for V3IO KV target) - Apply infer_schema on the model endpoints table for generating schema file
297
297
  # which will be used by Grafana monitoring dashboards
298
298
  def apply_infer_schema():
299
299
  graph.add_step(
@@ -308,7 +308,7 @@ class EventStreamProcessor:
308
308
  if self.model_endpoint_store_target == ModelEndpointTarget.V3IO_NOSQL:
309
309
  apply_infer_schema()
310
310
 
311
- # Step 11 - Emits the event in window size of events based on sample_window size (10 by default)
311
+ # Emits the event in window size of events based on sample_window size (10 by default)
312
312
  def apply_storey_sample_window():
313
313
  graph.add_step(
314
314
  "storey.steps.SampleWindow",
@@ -320,84 +320,18 @@ class EventStreamProcessor:
320
320
 
321
321
  apply_storey_sample_window()
322
322
 
323
- # Steps 12-19 - TSDB branch (skip to Prometheus if in CE env)
324
- # Steps 20-21 - Prometheus branch
323
+ # TSDB branch (skip to Prometheus if in CE env)
325
324
  if not mlrun.mlconf.is_ce_mode():
326
325
  # TSDB branch
327
-
328
- # Step 12 - Before writing data to TSDB, create dictionary of 2-3 dictionaries that contains
329
- # stats and details about the events
330
- def apply_process_before_tsdb():
331
- graph.add_step(
332
- "ProcessBeforeTSDB", name="ProcessBeforeTSDB", after="sample"
333
- )
334
-
335
- apply_process_before_tsdb()
336
-
337
- # Steps 13-19: - Unpacked keys from each dictionary and write to TSDB target
338
- def apply_filter_and_unpacked_keys(name, keys):
339
- graph.add_step(
340
- "FilterAndUnpackKeys",
341
- name=name,
342
- after="ProcessBeforeTSDB",
343
- keys=[keys],
344
- )
345
-
346
- def apply_tsdb_target(name, after):
347
- graph.add_step(
348
- "storey.TSDBTarget",
349
- name=name,
350
- after=after,
351
- path=self.tsdb_path,
352
- rate="10/m",
353
- time_col=EventFieldType.TIMESTAMP,
354
- container=self.tsdb_container,
355
- v3io_frames=self.v3io_framesd,
356
- infer_columns_from_data=True,
357
- index_cols=[
358
- EventFieldType.ENDPOINT_ID,
359
- EventFieldType.RECORD_TYPE,
360
- EventFieldType.ENDPOINT_TYPE,
361
- ],
362
- max_events=self.tsdb_batching_max_events,
363
- flush_after_seconds=self.tsdb_batching_timeout_secs,
364
- key=EventFieldType.ENDPOINT_ID,
365
- )
366
-
367
- # Steps 13-14 - unpacked base_metrics dictionary
368
- apply_filter_and_unpacked_keys(
369
- name="FilterAndUnpackKeys1",
370
- keys=EventKeyMetrics.BASE_METRICS,
371
- )
372
- apply_tsdb_target(name="tsdb1", after="FilterAndUnpackKeys1")
373
-
374
- # Steps 15-16 - unpacked endpoint_features dictionary
375
- apply_filter_and_unpacked_keys(
376
- name="FilterAndUnpackKeys2",
377
- keys=EventKeyMetrics.ENDPOINT_FEATURES,
378
- )
379
- apply_tsdb_target(name="tsdb2", after="FilterAndUnpackKeys2")
380
-
381
- # Steps 17-19 - unpacked custom_metrics dictionary. In addition, use storey.Filter remove none values
382
- apply_filter_and_unpacked_keys(
383
- name="FilterAndUnpackKeys3",
384
- keys=EventKeyMetrics.CUSTOM_METRICS,
326
+ tsdb_connector = mlrun.model_monitoring.get_tsdb_connector(
327
+ project=self.project,
385
328
  )
329
+ tsdb_connector.apply_monitoring_stream_steps(graph=graph)
386
330
 
387
- def apply_storey_filter():
388
- graph.add_step(
389
- "storey.Filter",
390
- "FilterNotNone",
391
- after="FilterAndUnpackKeys3",
392
- _fn="(event is not None)",
393
- )
394
-
395
- apply_storey_filter()
396
- apply_tsdb_target(name="tsdb3", after="FilterNotNone")
397
331
  else:
398
- # Prometheus branch
332
+ # Prometheus
399
333
 
400
- # Step 20 - Increase the prediction counter by 1 and update the latency value
334
+ # Increase the prediction counter by 1 and update the latency value
401
335
  graph.add_step(
402
336
  "IncCounter",
403
337
  name="IncCounter",
@@ -405,7 +339,7 @@ class EventStreamProcessor:
405
339
  project=self.project,
406
340
  )
407
341
 
408
- # Step 21 - Record a sample of features and labels
342
+ # Record a sample of features and labels
409
343
  def apply_record_features_to_prometheus():
410
344
  graph.add_step(
411
345
  "RecordFeatures",
@@ -416,8 +350,8 @@ class EventStreamProcessor:
416
350
 
417
351
  apply_record_features_to_prometheus()
418
352
 
419
- # Steps 22-23 - Parquet branch
420
- # Step 22 - Filter and validate different keys before writing the data to Parquet target
353
+ # Parquet branch
354
+ # Filter and validate different keys before writing the data to Parquet target
421
355
  def apply_process_before_parquet():
422
356
  graph.add_step(
423
357
  "ProcessBeforeParquet",
@@ -428,7 +362,7 @@ class EventStreamProcessor:
428
362
 
429
363
  apply_process_before_parquet()
430
364
 
431
- # Step 23 - Write the Parquet target file, partitioned by key (endpoint_id) and time.
365
+ # Write the Parquet target file, partitioned by key (endpoint_id) and time.
432
366
  def apply_parquet_target():
433
367
  graph.add_step(
434
368
  "storey.ParquetTarget",
@@ -502,76 +436,6 @@ class ProcessBeforeEndpointUpdate(mlrun.feature_store.steps.MapClass):
502
436
  return e
503
437
 
504
438
 
505
- class ProcessBeforeTSDB(mlrun.feature_store.steps.MapClass):
506
- def __init__(self, **kwargs):
507
- """
508
- Process the data before writing to TSDB. This step creates a dictionary that includes 3 different dictionaries
509
- that each one of them contains important details and stats about the events:
510
- 1. base_metrics: stats about the average latency and the amount of predictions over time. It is based on
511
- storey.AggregateByKey which was executed in step 5.
512
- 2. endpoint_features: feature names and values along with the prediction names and value.
513
- 3. custom_metric (opt): optional metrics provided by the user.
514
-
515
- :returns: Dictionary of 2-3 dictionaries that contains stats and details about the events.
516
-
517
- """
518
- super().__init__(**kwargs)
519
-
520
- def do(self, event):
521
- # Compute prediction per second
522
- event[EventLiveStats.PREDICTIONS_PER_SECOND] = (
523
- float(event[EventLiveStats.PREDICTIONS_COUNT_5M]) / 300
524
- )
525
- base_fields = [
526
- EventFieldType.TIMESTAMP,
527
- EventFieldType.ENDPOINT_ID,
528
- EventFieldType.ENDPOINT_TYPE,
529
- ]
530
-
531
- # Getting event timestamp and endpoint_id
532
- base_event = {k: event[k] for k in base_fields}
533
-
534
- # base_metrics includes the stats about the average latency and the amount of predictions over time
535
- base_metrics = {
536
- EventFieldType.RECORD_TYPE: EventKeyMetrics.BASE_METRICS,
537
- EventLiveStats.PREDICTIONS_PER_SECOND: event[
538
- EventLiveStats.PREDICTIONS_PER_SECOND
539
- ],
540
- EventLiveStats.PREDICTIONS_COUNT_5M: event[
541
- EventLiveStats.PREDICTIONS_COUNT_5M
542
- ],
543
- EventLiveStats.PREDICTIONS_COUNT_1H: event[
544
- EventLiveStats.PREDICTIONS_COUNT_1H
545
- ],
546
- EventLiveStats.LATENCY_AVG_5M: event[EventLiveStats.LATENCY_AVG_5M],
547
- EventLiveStats.LATENCY_AVG_1H: event[EventLiveStats.LATENCY_AVG_1H],
548
- **base_event,
549
- }
550
-
551
- # endpoint_features includes the event values of each feature and prediction
552
- endpoint_features = {
553
- EventFieldType.RECORD_TYPE: EventKeyMetrics.ENDPOINT_FEATURES,
554
- **event[EventFieldType.NAMED_PREDICTIONS],
555
- **event[EventFieldType.NAMED_FEATURES],
556
- **base_event,
557
- }
558
- # Create a dictionary that includes both base_metrics and endpoint_features
559
- processed = {
560
- EventKeyMetrics.BASE_METRICS: base_metrics,
561
- EventKeyMetrics.ENDPOINT_FEATURES: endpoint_features,
562
- }
563
-
564
- # If metrics provided, add another dictionary if custom_metrics values
565
- if event[EventFieldType.METRICS]:
566
- processed[EventKeyMetrics.CUSTOM_METRICS] = {
567
- EventFieldType.RECORD_TYPE: EventKeyMetrics.CUSTOM_METRICS,
568
- **event[EventFieldType.METRICS],
569
- **base_event,
570
- }
571
-
572
- return processed
573
-
574
-
575
439
  class ProcessBeforeParquet(mlrun.feature_store.steps.MapClass):
576
440
  def __init__(self, **kwargs):
577
441
  """
@@ -852,36 +716,6 @@ def is_not_none(field: typing.Any, dict_path: list[str]):
852
716
  return False
853
717
 
854
718
 
855
- class FilterAndUnpackKeys(mlrun.feature_store.steps.MapClass):
856
- def __init__(self, keys, **kwargs):
857
- """
858
- Create unpacked event dictionary based on provided key metrics (base_metrics, endpoint_features,
859
- or custom_metric). Please note that the next step of the TSDB target requires an unpacked dictionary.
860
-
861
- :param keys: list of key metrics.
862
-
863
- :returns: An unpacked dictionary of event filtered by the provided key metrics.
864
- """
865
- super().__init__(**kwargs)
866
- self.keys = keys
867
-
868
- def do(self, event):
869
- # Keep only the relevant dictionary based on the provided keys
870
- new_event = {}
871
- for key in self.keys:
872
- if key in event:
873
- new_event[key] = event[key]
874
-
875
- # Create unpacked dictionary
876
- unpacked = {}
877
- for key in new_event.keys():
878
- if key in self.keys:
879
- unpacked = {**unpacked, **new_event[key]}
880
- else:
881
- unpacked[key] = new_event[key]
882
- return unpacked if unpacked else None
883
-
884
-
885
719
  class MapFeatureNames(mlrun.feature_store.steps.MapClass):
886
720
  def __init__(
887
721
  self,
@@ -1117,6 +951,8 @@ class InferSchema(mlrun.feature_store.steps.MapClass):
1117
951
  def do(self, event: dict):
1118
952
  key_set = set(event.keys())
1119
953
  if not key_set.issubset(self.keys):
954
+ import mlrun.utils.v3io_clients
955
+
1120
956
  self.keys.update(key_set)
1121
957
  # Apply infer_schema on the kv table for generating the schema file
1122
958
  mlrun.utils.v3io_clients.get_frames_client(
@@ -12,24 +12,16 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- import datetime
16
15
  import json
17
16
  from typing import Any, NewType
18
17
 
19
- import pandas as pd
20
- from v3io.dataplane import Client as V3IOClient
21
- from v3io_frames.client import ClientBase as V3IOFramesClient
22
- from v3io_frames.errors import Error as V3IOFramesError
23
- from v3io_frames.frames_pb2 import IGNORE
24
-
25
18
  import mlrun.common.model_monitoring
26
19
  import mlrun.common.schemas
27
- import mlrun.common.schemas.alert as alert_constants
20
+ import mlrun.common.schemas.alert as alert_objects
28
21
  import mlrun.model_monitoring
29
- import mlrun.model_monitoring.db.stores
30
- import mlrun.utils.v3io_clients
31
22
  from mlrun.common.schemas.model_monitoring.constants import (
32
23
  EventFieldType,
24
+ HistogramDataDriftApplicationConstants,
33
25
  MetricData,
34
26
  ResultData,
35
27
  ResultStatusApp,
@@ -42,9 +34,6 @@ from mlrun.serving.utils import StepToDict
42
34
  from mlrun.utils import logger
43
35
  from mlrun.utils.notifications.notification_pusher import CustomNotificationPusher
44
36
 
45
- _TSDB_BE = "tsdb"
46
- _TSDB_RATE = "1/s"
47
- _TSDB_TABLE = "app-results"
48
37
  _RawEvent = dict[str, Any]
49
38
  _AppResultEvent = NewType("_AppResultEvent", _RawEvent)
50
39
 
@@ -107,7 +96,7 @@ Extra data: `{self._event[ResultData.RESULT_EXTRA_DATA]}`\
107
96
 
108
97
  class ModelMonitoringWriter(StepToDict):
109
98
  """
110
- Write monitoring app events to V3IO KV storage
99
+ Write monitoring application results to the target databases
111
100
  """
112
101
 
113
102
  kind = "monitoring_application_stream_pusher"
@@ -115,102 +104,38 @@ class ModelMonitoringWriter(StepToDict):
115
104
  def __init__(self, project: str) -> None:
116
105
  self.project = project
117
106
  self.name = project # required for the deployment process
118
- self._v3io_container = self.get_v3io_container(self.name)
119
- self._tsdb_client = self._get_v3io_frames_client(self._v3io_container)
107
+
120
108
  self._custom_notifier = CustomNotificationPusher(
121
109
  notification_types=[NotificationKind.slack]
122
110
  )
123
- self._create_tsdb_table()
124
- self._endpoints_records = {}
125
-
126
- @staticmethod
127
- def get_v3io_container(project_name: str) -> str:
128
- return f"users/pipelines/{project_name}/monitoring-apps"
129
-
130
- @staticmethod
131
- def _get_v3io_client() -> V3IOClient:
132
- return mlrun.utils.v3io_clients.get_v3io_client(
133
- endpoint=mlrun.mlconf.v3io_api,
134
- )
135
-
136
- @staticmethod
137
- def _get_v3io_frames_client(v3io_container: str) -> V3IOFramesClient:
138
- return mlrun.utils.v3io_clients.get_frames_client(
139
- address=mlrun.mlconf.v3io_framesd,
140
- container=v3io_container,
141
- )
142
111
 
143
- def _create_tsdb_table(self) -> None:
144
- self._tsdb_client.create(
145
- backend=_TSDB_BE,
146
- table=_TSDB_TABLE,
147
- if_exists=IGNORE,
148
- rate=_TSDB_RATE,
149
- )
150
-
151
- def _update_kv_db(self, event: _AppResultEvent, kind: str = "result") -> None:
152
- if kind == "metric":
153
- # TODO : Implement the logic for writing metrics to KV
154
- return
155
- event = _AppResultEvent(event.copy())
156
- application_result_store = mlrun.model_monitoring.get_store_object(
112
+ self._app_result_store = mlrun.model_monitoring.get_store_object(
157
113
  project=self.project
158
114
  )
159
- application_result_store.write_application_result(event=event)
160
-
161
- def _update_tsdb(self, event: _AppResultEvent, kind: str = "result") -> None:
162
- if kind == "metric":
163
- # TODO : Implement the logic for writing metrics to TSDB
164
- return
165
- event = _AppResultEvent(event.copy())
166
- event[WriterEvent.END_INFER_TIME] = datetime.datetime.fromisoformat(
167
- event[WriterEvent.END_INFER_TIME]
115
+ self._tsdb_connector = mlrun.model_monitoring.get_tsdb_connector(
116
+ project=self.project,
168
117
  )
169
- del event[ResultData.RESULT_EXTRA_DATA]
170
- try:
171
- self._tsdb_client.write(
172
- backend=_TSDB_BE,
173
- table=_TSDB_TABLE,
174
- dfs=pd.DataFrame.from_records([event]),
175
- index_cols=[
176
- WriterEvent.END_INFER_TIME,
177
- WriterEvent.ENDPOINT_ID,
178
- WriterEvent.APPLICATION_NAME,
179
- ResultData.RESULT_NAME,
180
- ],
181
- )
182
- logger.info("Updated V3IO TSDB successfully", table=_TSDB_TABLE)
183
- except V3IOFramesError as err:
184
- logger.warn(
185
- "Could not write drift measures to TSDB",
186
- err=err,
187
- table=_TSDB_TABLE,
188
- event=event,
189
- )
118
+ self._endpoints_records = {}
190
119
 
191
120
  @staticmethod
192
121
  def _generate_event_on_drift(
193
122
  model_endpoint: str, drift_status: str, event_value: dict, project_name: str
194
123
  ) -> None:
195
- if (
196
- drift_status == ResultStatusApp.detected.value
197
- or drift_status == ResultStatusApp.potential_detection.value
198
- ):
199
- logger.info("Sending an alert")
200
- entity = {
201
- "kind": alert_constants.EventEntityKind.MODEL,
202
- "project": project_name,
203
- "model_endpoint": model_endpoint,
204
- }
205
- event_kind = (
206
- alert_constants.EventKind.DRIFT_DETECTED
207
- if drift_status == ResultStatusApp.detected.value
208
- else alert_constants.EventKind.DRIFT_SUSPECTED
209
- )
210
- event_data = mlrun.common.schemas.Event(
211
- kind=event_kind, entity=entity, value_dict=event_value
212
- )
213
- mlrun.get_run_db().generate_event(event_kind, event_data)
124
+ logger.info("Sending an alert")
125
+ entity = mlrun.common.schemas.alert.EventEntities(
126
+ kind=alert_objects.EventEntityKind.MODEL,
127
+ project=project_name,
128
+ ids=[model_endpoint],
129
+ )
130
+ event_kind = (
131
+ alert_objects.EventKind.DRIFT_DETECTED
132
+ if drift_status == ResultStatusApp.detected.value
133
+ else alert_objects.EventKind.DRIFT_SUSPECTED
134
+ )
135
+ event_data = mlrun.common.schemas.Event(
136
+ kind=event_kind, entity=entity, value_dict=event_value
137
+ )
138
+ mlrun.get_run_db().generate_event(event_kind, event_data)
214
139
 
215
140
  @staticmethod
216
141
  def _reconstruct_event(event: _RawEvent) -> tuple[_AppResultEvent, str]:
@@ -255,14 +180,20 @@ class ModelMonitoringWriter(StepToDict):
255
180
  event, kind = self._reconstruct_event(event)
256
181
  logger.info("Starting to write event", event=event)
257
182
 
258
- self._update_tsdb(event, kind)
259
- self._update_kv_db(event, kind)
183
+ self._tsdb_connector.write_application_event(event=event.copy(), kind=kind)
184
+ self._app_result_store.write_application_event(event=event.copy(), kind=kind)
260
185
  logger.info("Completed event DB writes")
186
+
261
187
  _Notifier(event=event, notification_pusher=self._custom_notifier).notify()
262
188
 
263
189
  if (
264
190
  mlrun.mlconf.alerts.mode == mlrun.common.schemas.alert.AlertsModes.enabled
265
191
  and kind == WriterEventKind.RESULT
192
+ and (
193
+ event[ResultData.RESULT_STATUS] == ResultStatusApp.detected.value
194
+ or event[ResultData.RESULT_STATUS]
195
+ == ResultStatusApp.potential_detection.value
196
+ )
266
197
  ):
267
198
  endpoint_id = event[WriterEvent.ENDPOINT_ID]
268
199
  endpoint_record = self._endpoints_records.setdefault(
@@ -282,3 +213,22 @@ class ModelMonitoringWriter(StepToDict):
282
213
  event_value,
283
214
  self.project,
284
215
  )
216
+
217
+ if (
218
+ kind == WriterEventKind.RESULT
219
+ and event[WriterEvent.APPLICATION_NAME]
220
+ == HistogramDataDriftApplicationConstants.NAME
221
+ and event[ResultData.RESULT_NAME]
222
+ == HistogramDataDriftApplicationConstants.GENERAL_RESULT_NAME
223
+ ):
224
+ endpoint_id = event[WriterEvent.ENDPOINT_ID]
225
+ logger.info(
226
+ "Updating the model endpoint with metadata specific to the histogram "
227
+ "data drift app",
228
+ endpoint_id=endpoint_id,
229
+ )
230
+ store = mlrun.model_monitoring.get_store_object(project=self.project)
231
+ store.update_model_endpoint(
232
+ endpoint_id=endpoint_id,
233
+ attributes=json.loads(event[ResultData.RESULT_EXTRA_DATA]),
234
+ )
@@ -17,22 +17,23 @@ import json
17
17
  from pprint import pprint
18
18
  from time import sleep
19
19
 
20
- from .iguazio import (
21
- V3ioStreamClient,
22
- VolumeMount,
23
- add_or_refresh_credentials,
24
- is_iguazio_session_cookie,
25
- mount_v3io,
26
- v3io_cred,
27
- )
28
- from .other import (
20
+ from mlrun_pipelines.common.mounts import VolumeMount
21
+ from mlrun_pipelines.mounts import (
29
22
  auto_mount,
30
23
  mount_configmap,
31
24
  mount_hostpath,
32
25
  mount_pvc,
33
26
  mount_s3,
34
27
  mount_secret,
28
+ mount_v3io,
35
29
  set_env_variables,
30
+ v3io_cred,
31
+ )
32
+
33
+ from .iguazio import (
34
+ V3ioStreamClient,
35
+ add_or_refresh_credentials,
36
+ is_iguazio_session_cookie,
36
37
  )
37
38
 
38
39