mlrun 1.5.0rc1__py3-none-any.whl → 1.5.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mlrun might be problematic. Click here for more details.

Files changed (119) hide show
  1. mlrun/__init__.py +2 -35
  2. mlrun/__main__.py +1 -40
  3. mlrun/api/api/api.py +6 -0
  4. mlrun/api/api/endpoints/feature_store.py +0 -4
  5. mlrun/api/api/endpoints/files.py +14 -2
  6. mlrun/api/api/endpoints/functions.py +6 -1
  7. mlrun/api/api/endpoints/logs.py +17 -3
  8. mlrun/api/api/endpoints/pipelines.py +1 -5
  9. mlrun/api/api/endpoints/projects.py +88 -0
  10. mlrun/api/api/endpoints/runs.py +48 -6
  11. mlrun/api/api/endpoints/workflows.py +355 -0
  12. mlrun/api/api/utils.py +1 -1
  13. mlrun/api/crud/__init__.py +1 -0
  14. mlrun/api/crud/client_spec.py +3 -0
  15. mlrun/api/crud/model_monitoring/deployment.py +36 -7
  16. mlrun/api/crud/model_monitoring/grafana.py +1 -1
  17. mlrun/api/crud/model_monitoring/helpers.py +32 -2
  18. mlrun/api/crud/model_monitoring/model_endpoints.py +27 -5
  19. mlrun/api/crud/notifications.py +9 -4
  20. mlrun/api/crud/pipelines.py +4 -9
  21. mlrun/api/crud/runtime_resources.py +4 -3
  22. mlrun/api/crud/secrets.py +21 -0
  23. mlrun/api/crud/workflows.py +352 -0
  24. mlrun/api/db/base.py +16 -1
  25. mlrun/api/db/sqldb/db.py +97 -16
  26. mlrun/api/launcher.py +26 -7
  27. mlrun/api/main.py +3 -4
  28. mlrun/{mlutils → api/rundb}/__init__.py +2 -6
  29. mlrun/{db → api/rundb}/sqldb.py +35 -83
  30. mlrun/api/runtime_handlers/__init__.py +56 -0
  31. mlrun/api/runtime_handlers/base.py +1247 -0
  32. mlrun/api/runtime_handlers/daskjob.py +209 -0
  33. mlrun/api/runtime_handlers/kubejob.py +37 -0
  34. mlrun/api/runtime_handlers/mpijob.py +147 -0
  35. mlrun/api/runtime_handlers/remotesparkjob.py +29 -0
  36. mlrun/api/runtime_handlers/sparkjob.py +148 -0
  37. mlrun/api/utils/builder.py +1 -4
  38. mlrun/api/utils/clients/chief.py +14 -0
  39. mlrun/api/utils/scheduler.py +98 -15
  40. mlrun/api/utils/singletons/db.py +4 -0
  41. mlrun/artifacts/manager.py +1 -2
  42. mlrun/common/schemas/__init__.py +6 -0
  43. mlrun/common/schemas/auth.py +4 -1
  44. mlrun/common/schemas/client_spec.py +1 -1
  45. mlrun/common/schemas/model_monitoring/__init__.py +1 -0
  46. mlrun/common/schemas/model_monitoring/constants.py +11 -0
  47. mlrun/common/schemas/project.py +1 -0
  48. mlrun/common/schemas/runs.py +1 -8
  49. mlrun/common/schemas/schedule.py +1 -8
  50. mlrun/common/schemas/workflow.py +54 -0
  51. mlrun/config.py +42 -40
  52. mlrun/datastore/sources.py +1 -1
  53. mlrun/db/__init__.py +4 -68
  54. mlrun/db/base.py +12 -0
  55. mlrun/db/factory.py +65 -0
  56. mlrun/db/httpdb.py +175 -19
  57. mlrun/db/nopdb.py +4 -2
  58. mlrun/execution.py +4 -2
  59. mlrun/feature_store/__init__.py +1 -0
  60. mlrun/feature_store/api.py +1 -2
  61. mlrun/feature_store/feature_set.py +0 -10
  62. mlrun/feature_store/feature_vector.py +340 -2
  63. mlrun/feature_store/ingestion.py +5 -10
  64. mlrun/feature_store/retrieval/base.py +118 -104
  65. mlrun/feature_store/retrieval/dask_merger.py +17 -10
  66. mlrun/feature_store/retrieval/job.py +4 -1
  67. mlrun/feature_store/retrieval/local_merger.py +18 -18
  68. mlrun/feature_store/retrieval/spark_merger.py +21 -14
  69. mlrun/feature_store/retrieval/storey_merger.py +21 -15
  70. mlrun/kfpops.py +3 -9
  71. mlrun/launcher/base.py +3 -3
  72. mlrun/launcher/client.py +3 -2
  73. mlrun/launcher/factory.py +16 -13
  74. mlrun/lists.py +0 -11
  75. mlrun/model.py +9 -15
  76. mlrun/model_monitoring/helpers.py +15 -25
  77. mlrun/model_monitoring/model_monitoring_batch.py +72 -4
  78. mlrun/model_monitoring/prometheus.py +219 -0
  79. mlrun/model_monitoring/stores/__init__.py +15 -9
  80. mlrun/model_monitoring/stores/sql_model_endpoint_store.py +3 -1
  81. mlrun/model_monitoring/stream_processing.py +181 -29
  82. mlrun/package/packager.py +6 -8
  83. mlrun/package/packagers/default_packager.py +121 -10
  84. mlrun/platforms/__init__.py +0 -2
  85. mlrun/platforms/iguazio.py +0 -56
  86. mlrun/projects/pipelines.py +57 -158
  87. mlrun/projects/project.py +6 -32
  88. mlrun/render.py +1 -1
  89. mlrun/run.py +2 -124
  90. mlrun/runtimes/__init__.py +6 -42
  91. mlrun/runtimes/base.py +26 -1241
  92. mlrun/runtimes/daskjob.py +2 -198
  93. mlrun/runtimes/function.py +16 -5
  94. mlrun/runtimes/kubejob.py +5 -29
  95. mlrun/runtimes/mpijob/__init__.py +2 -2
  96. mlrun/runtimes/mpijob/abstract.py +10 -1
  97. mlrun/runtimes/mpijob/v1.py +0 -76
  98. mlrun/runtimes/mpijob/v1alpha1.py +1 -74
  99. mlrun/runtimes/nuclio.py +3 -2
  100. mlrun/runtimes/pod.py +0 -10
  101. mlrun/runtimes/remotesparkjob.py +1 -15
  102. mlrun/runtimes/serving.py +1 -1
  103. mlrun/runtimes/sparkjob/__init__.py +0 -1
  104. mlrun/runtimes/sparkjob/abstract.py +4 -131
  105. mlrun/serving/states.py +1 -1
  106. mlrun/utils/db.py +0 -2
  107. mlrun/utils/helpers.py +19 -13
  108. mlrun/utils/notifications/notification_pusher.py +5 -25
  109. mlrun/utils/regex.py +7 -2
  110. mlrun/utils/version/version.json +2 -2
  111. {mlrun-1.5.0rc1.dist-info → mlrun-1.5.0rc2.dist-info}/METADATA +24 -23
  112. {mlrun-1.5.0rc1.dist-info → mlrun-1.5.0rc2.dist-info}/RECORD +116 -107
  113. {mlrun-1.5.0rc1.dist-info → mlrun-1.5.0rc2.dist-info}/WHEEL +1 -1
  114. mlrun/mlutils/data.py +0 -160
  115. mlrun/mlutils/models.py +0 -78
  116. mlrun/mlutils/plots.py +0 -902
  117. {mlrun-1.5.0rc1.dist-info → mlrun-1.5.0rc2.dist-info}/LICENSE +0 -0
  118. {mlrun-1.5.0rc1.dist-info → mlrun-1.5.0rc2.dist-info}/entry_points.txt +0 -0
  119. {mlrun-1.5.0rc1.dist-info → mlrun-1.5.0rc2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,219 @@
1
+ # Copyright 2023 Iguazio
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ import typing
16
+
17
+ import prometheus_client
18
+
19
+ from mlrun.common.schemas.model_monitoring import EventFieldType, PrometheusMetric
20
+
# Path of the text file holding the latest dump of the Prometheus registry.
# It is rewritten by every function decorated with `_write_registry` and read back by `get_registry`.
_registry_path = "/tmp/prom-reg.txt"

# Initializing Prometheus metric collector registry
_registry: prometheus_client.CollectorRegistry = prometheus_client.CollectorRegistry()

# The following real-time metrics are being updated through the monitoring stream graph steps

# Total number of predictions, labeled by project, endpoint, model and endpoint type
_prediction_counter: prometheus_client.Counter = prometheus_client.Counter(
    name=PrometheusMetric.PREDICTIONS_TOTAL,
    documentation="Counter for total predictions",
    registry=_registry,
    labelnames=[
        EventFieldType.PROJECT,
        EventFieldType.ENDPOINT_ID,
        EventFieldType.MODEL,
        EventFieldType.ENDPOINT_TYPE,
    ],
)

# Observed model latency, labeled like the prediction counter
_model_latency: prometheus_client.Summary = prometheus_client.Summary(
    name=PrometheusMetric.MODEL_LATENCY_SECONDS,
    documentation="Summary for model latency",
    registry=_registry,
    labelnames=[
        EventFieldType.PROJECT,
        EventFieldType.ENDPOINT_ID,
        EventFieldType.MODEL,
        EventFieldType.ENDPOINT_TYPE,
    ],
)

# Latest sampled value of each feature/prediction; cleared by `clean_metrics`
_income_features: prometheus_client.Gauge = prometheus_client.Gauge(
    name=PrometheusMetric.INCOME_FEATURES,
    documentation="Samples of features and predictions",
    registry=_registry,
    labelnames=[
        EventFieldType.PROJECT,
        EventFieldType.ENDPOINT_ID,
        EventFieldType.METRIC,
    ],
)

# Total number of errors, labeled by project, endpoint and model
_error_counter: prometheus_client.Counter = prometheus_client.Counter(
    name=PrometheusMetric.ERRORS_TOTAL,
    documentation="Counter for total errors",
    registry=_registry,
    labelnames=[
        EventFieldType.PROJECT,
        EventFieldType.ENDPOINT_ID,
        EventFieldType.MODEL,
    ],
)

# The following metrics are being updated through the model monitoring batch job

# One gauge value per drift metric name
_batch_metrics: prometheus_client.Gauge = prometheus_client.Gauge(
    name=PrometheusMetric.DRIFT_METRICS,
    documentation="Results from the batch drift analysis",
    registry=_registry,
    labelnames=[
        EventFieldType.PROJECT,
        EventFieldType.ENDPOINT_ID,
        EventFieldType.METRIC,
    ],
)

# Current drift state of each model endpoint
_drift_status: prometheus_client.Enum = prometheus_client.Enum(
    name=PrometheusMetric.DRIFT_STATUS,
    documentation="Drift status of the model endpoint",
    registry=_registry,
    states=["NO_DRIFT", "DRIFT_DETECTED", "POSSIBLE_DRIFT"],
    labelnames=[EventFieldType.PROJECT, EventFieldType.ENDPOINT_ID],
)
89
+
90
+
def _write_registry(func):
    """Decorate a metric-update function so the registry file is rewritten after each call.

    :param func: Function that updates one or more metrics of the module-level registry.

    :return: A wrapper that calls `func`, dumps the registry into the registry file and then returns whatever
             `func` returned. The wrapper keeps the name and docstring of `func`.
    """
    # Imported locally so that this block does not depend on a change in the module import section
    import functools

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        result = func(*args, **kwargs)
        # Persist the updated registry so that `get_registry` can read it back from the file
        prometheus_client.write_to_textfile(path=_registry_path, registry=_registry)
        return result

    return wrapper
99
+
100
+
@_write_registry
def write_predictions_and_latency_metrics(
    project: str, endpoint_id: str, latency: int, model_name: str, endpoint_type: int
):
    """
    Register a single prediction event of the provided model endpoint within the Prometheus registry.

    The prediction counter always grows by exactly 1, while the latency summary observes the latency of the
    current event. The average latency can then be derived (e.g. by a Grafana dashboard) by dividing the total
    latency by the total amount of predictions.

    :param project:       Project name.
    :param endpoint_id:   Model endpoint unique id.
    :param latency:       Latency time (microsecond) in which the event has been processed through the model server.
                          NOTE(review): the metric is registered as MODEL_LATENCY_SECONDS - confirm the units.
    :param model_name:    Model name which will be used by Grafana for displaying the results by model.
    :param endpoint_type: Endpoint type that is represented by an int (possible values: 1,2,3) corresponding to the
                          Enum class :py:class:`~mlrun.common.schemas.model_monitoring.EndpointType`.
    """

    # Both metrics share the exact same label set
    labels = {
        "project": project,
        "endpoint_id": endpoint_id,
        "model": model_name,
        "endpoint_type": endpoint_type,
    }

    # One more prediction for this endpoint
    _prediction_counter.labels(**labels).inc(1)

    # Add the latency of the current event to the summary
    _model_latency.labels(**labels).observe(latency)
134
+
135
+
@_write_registry
def write_income_features(
    project: str, endpoint_id: str, features: typing.Dict[str, float]
):
    """Set the gauge of every provided feature to its sampled value.

    :param project:     Project name.
    :param endpoint_id: Model endpoint unique id.
    :param features:    Dictionary in which the key is a feature name and the value is a float number.
    """

    for feature_name, feature_value in features.items():
        # Each feature name is stored as the `metric` label of the same gauge
        feature_gauge = _income_features.labels(
            project=project, endpoint_id=endpoint_id, metric=feature_name
        )
        feature_gauge.set(value=feature_value)
153
+
154
+
@_write_registry
def write_drift_metrics(project: str, endpoint_id: str, metric: str, value: float):
    """Set the value of a single drift metric that was calculated through the monitoring batch job.

    :param project:     Project name.
    :param endpoint_id: Model endpoint unique id.
    :param metric:      Metric name (e.g. TVD, Hellinger).
    :param value:       Metric value as a float.
    """

    drift_gauge = _batch_metrics.labels(
        project=project, endpoint_id=endpoint_id, metric=metric
    )
    drift_gauge.set(value=value)
169
+
170
+
@_write_registry
def write_drift_status(project: str, endpoint_id: str, drift_status: str):
    """
    Set the current drift state of a specific model endpoint.

    :param project:      Project name.
    :param endpoint_id:  Model endpoint unique id.
    :param drift_status: Drift status value, can be one of the following: 'NO_DRIFT', 'DRIFT_DETECTED', or
                         'POSSIBLE_DRIFT'.
    """

    endpoint_status = _drift_status.labels(project=project, endpoint_id=endpoint_id)
    endpoint_status.state(drift_status)
183
+
184
+
@_write_registry
def write_errors(project: str, endpoint_id: str, model_name: str):
    """
    Increase the error counter of a specific model endpoint by 1.

    :param project:     Project name.
    :param endpoint_id: Model endpoint unique id.
    :param model_name:  Model name. Will be used by Grafana to show the amount of errors per model by time.
    """

    endpoint_errors = _error_counter.labels(
        project=project, endpoint_id=endpoint_id, model=model_name
    )
    endpoint_errors.inc(1)
198
+
199
+
def get_registry() -> str:
    """Return the content of the registry file, which follows the exposition format of Prometheus.

    After the file has been read, the income features are removed from the registry (see `clean_metrics`),
    which also rewrites the registry file.

    :return: The text of the registry file as it was before the income features were cleaned.

    :raises FileNotFoundError: If the registry file does not exist.
    """

    # The registry file is stored in UTF-8, so decode it explicitly rather than with the locale default.
    # The context manager guarantees that the file is closed even if the read fails.
    with open(_registry_path, encoding="utf-8") as registry_file:
        lines = registry_file.read()

    # Reset part of the metrics to avoid a repeating scraping of the same value
    clean_metrics()

    return lines
212
+
213
+
@_write_registry
def clean_metrics():
    """Remove all the income features values from the global registry.

    These samples are relevant only for a certain timestamp, therefore they are dropped once they have been
    scraped by Prometheus.
    """

    _income_features.clear()
@@ -17,6 +17,7 @@
17
17
  import enum
18
18
  import typing
19
19
 
20
+ import mlrun.common.schemas.secret
20
21
  import mlrun.errors
21
22
 
22
23
  from .model_endpoint_store import ModelEndpointStore
@@ -33,6 +34,7 @@ class ModelEndpointStoreType(enum.Enum):
33
34
  project: str,
34
35
  access_key: str = None,
35
36
  endpoint_store_connection: str = None,
37
+ secret_provider: typing.Callable = None,
36
38
  ) -> ModelEndpointStore:
37
39
  """
38
40
  Return a ModelEndpointStore object based on the provided enum value.
@@ -46,6 +48,7 @@ class ModelEndpointStoreType(enum.Enum):
46
48
  e.g. A root user with password 1234, tries to connect a schema called
47
49
  mlrun within a local MySQL DB instance:
48
50
  'mysql+pymysql://root:1234@localhost:3306/mlrun'.
51
+ :param secret_provider: An optional secret provider to get the connection string secret.
49
52
 
50
53
  :return: `ModelEndpointStore` object.
51
54
 
@@ -61,15 +64,13 @@ class ModelEndpointStoreType(enum.Enum):
61
64
 
62
65
  # Assuming SQL store target if store type is not KV.
63
66
  # Update these lines once there are more than two store target types.
64
- from mlrun.model_monitoring.helpers import get_connection_string
65
67
 
66
- sql_connection_string = endpoint_store_connection or get_connection_string(
67
- project=project
68
- )
69
68
  from .sql_model_endpoint_store import SQLModelEndpointStore
70
69
 
71
70
  return SQLModelEndpointStore(
72
- project=project, sql_connection_string=sql_connection_string
71
+ project=project,
72
+ sql_connection_string=endpoint_store_connection,
73
+ secret_provider=secret_provider,
73
74
  )
74
75
 
75
76
  @classmethod
@@ -84,13 +85,16 @@ class ModelEndpointStoreType(enum.Enum):
84
85
 
85
86
 
86
87
  def get_model_endpoint_store(
87
- project: str, access_key: str = None
88
+ project: str,
89
+ access_key: str = None,
90
+ secret_provider: typing.Callable = None,
88
91
  ) -> ModelEndpointStore:
89
92
  """
90
93
  Getting the DB target type based on mlrun.config.model_endpoint_monitoring.store_type.
91
94
 
92
- :param project: The name of the project.
93
- :param access_key: Access key with permission to the DB table.
95
+ :param project: The name of the project.
96
+ :param access_key: Access key with permission to the DB table.
97
+ :param secret_provider: An optional secret provider to get the connection string secret.
94
98
 
95
99
  :return: `ModelEndpointStore` object. Using this object, the user can apply different operations on the
96
100
  model endpoint record such as write, update, get and delete.
@@ -102,4 +106,6 @@ def get_model_endpoint_store(
102
106
  )
103
107
 
104
108
  # Convert into model endpoint store target object
105
- return model_endpoint_store_type.to_endpoint_store(project, access_key)
109
+ return model_endpoint_store_type.to_endpoint_store(
110
+ project=project, access_key=access_key, secret_provider=secret_provider
111
+ )
@@ -45,12 +45,14 @@ class SQLModelEndpointStore(ModelEndpointStore):
45
45
  self,
46
46
  project: str,
47
47
  sql_connection_string: str = None,
48
+ secret_provider: typing.Callable = None,
48
49
  ):
49
50
  """
50
51
  Initialize SQL store target object.
51
52
 
52
53
  :param project: The name of the project.
53
54
  :param sql_connection_string: Valid connection string or a path to SQL database with model endpoints table.
55
+ :param secret_provider: An optional secret provider to get the connection string secret.
54
56
  """
55
57
 
56
58
  super().__init__(project=project)
@@ -58,7 +60,7 @@ class SQLModelEndpointStore(ModelEndpointStore):
58
60
  self.sql_connection_string = (
59
61
  sql_connection_string
60
62
  or mlrun.model_monitoring.helpers.get_connection_string(
61
- project=self.project
63
+ secret_provider=secret_provider
62
64
  )
63
65
  )
64
66
 
@@ -21,8 +21,13 @@ import typing
21
21
  import pandas as pd
22
22
  import storey
23
23
 
24
+ import mlrun
24
25
  import mlrun.common.model_monitoring.helpers
26
+ import mlrun.config
27
+ import mlrun.datastore.targets
25
28
  import mlrun.feature_store.steps
29
+ import mlrun.model_monitoring.prometheus
30
+ import mlrun.utils
26
31
  import mlrun.utils.v3io_clients
27
32
  from mlrun.common.schemas.model_monitoring.constants import (
28
33
  EventFieldType,
@@ -41,9 +46,9 @@ class EventStreamProcessor:
41
46
  self,
42
47
  project: str,
43
48
  parquet_batching_max_events: int,
49
+ parquet_batching_timeout_secs: int,
44
50
  parquet_target: str,
45
51
  sample_window: int = 10,
46
- parquet_batching_timeout_secs: int = 30 * 60, # Default 30 minutes
47
52
  aggregate_windows: typing.Optional[typing.List[str]] = None,
48
53
  aggregate_period: str = "30s",
49
54
  model_monitoring_access_key: str = None,
@@ -74,6 +79,8 @@ class EventStreamProcessor:
74
79
  self._initialize_v3io_configurations(
75
80
  model_monitoring_access_key=model_monitoring_access_key
76
81
  )
82
+ elif self.parquet_path.startswith("s3://"):
83
+ self.storage_options = mlrun.mlconf.get_s3_storage_options()
77
84
 
78
85
  def _initialize_v3io_configurations(
79
86
  self,
@@ -132,7 +139,7 @@ class EventStreamProcessor:
132
139
  of different operations that are executed on the events from the model server. Each event has
133
140
  metadata (function_uri, timestamp, class, etc.) but also inputs and predictions from the model server.
134
141
  Throughout the serving graph, the results are written to 3 different databases:
135
- 1. KV/SQL (steps 7-9): Stores metadata and stats about the average latency and the amount of predictions over
142
+ 1. KV/SQL (steps 9-11): Stores metadata and stats about the average latency and the amount of predictions over
136
143
  time per endpoint. for example the amount of predictions of endpoint x in the last 5 min. This data is used
137
144
  by the monitoring dashboards in grafana. The model endpoints table also contains data on the model endpoint
138
145
  from other processes, such as current_stats that is being calculated by the monitoring batch job
@@ -140,12 +147,14 @@ class EventStreamProcessor:
140
147
  v3io:///users/pipelines/project-name/model-endpoints/endpoints/. If the target is SQL, then the table
141
148
  is stored within the database that was defined in the provided connection string and can be found
142
149
  under mlrun.mlconf.model_endpoint_monitoring.endpoint_store_connection.
143
- 2. TSDB (steps 12-18): Stores live data of different key metric dictionaries in tsdb target. Results can be
144
- found under v3io:///users/pipelines/project-name/model-endpoints/events/. At the moment, this part supports
145
- 3 different key metric dictionaries: base_metrics (average latency and predictions over time),
150
+ 2. V3IO TSDB/Prometheus (steps 13-21): Stores live data of different key metric dictionaries in tsdb target.
151
+ This data is being used by the monitoring dashboards in grafana. If using V3IO TSDB (steps 13-19), results
152
+ can be found under v3io:///users/pipelines/project-name/model-endpoints/events/. In that case, we generate
153
+ 3 different key metric dictionaries: base_metrics (average latency and predictions over time),
146
154
  endpoint_features (Prediction and feature names and values), and custom_metrics (user-defined metrics).
147
- This data is also being used by the monitoring dashboards in grafana.
148
- 3. Parquet (steps 19-20): This Parquet file includes the required data for the model monitoring batch job
155
+ If using Prometheus (steps 20-21), we update metrics in the Prometheus registry that is stored in the
156
+ monitoring stream local memory.
157
+ 3. Parquet (steps 22-23): This Parquet file includes the required data for the model monitoring batch job
149
158
  that run every hour by default. If defined, the parquet target path can be found under
150
159
  mlrun.mlconf.model_endpoint_monitoring.offline. Otherwise, the default parquet path is under
151
160
  mlrun.mlconf.model_endpoint_monitoring.user_space.
@@ -155,17 +164,41 @@ class EventStreamProcessor:
155
164
 
156
165
  graph = fn.set_topology("flow")
157
166
 
158
- # Step 1 - Process endpoint event: splitting into sub-events and validate event data
167
+ # Step 1 - Event routing based on the provided path
168
+ def apply_event_routing():
169
+ graph.add_step(
170
+ "EventRouting",
171
+ full_event=True,
172
+ project=self.project,
173
+ ).respond()
174
+
175
+ apply_event_routing()
176
+
177
+ # Step 2 - Keep only events with no '-' in path, which indicates that the event is supposed to be processed
178
+ # through the next steps of the stream graph
179
+ def apply_storey_filter_stream_events():
180
+ # Keep only events whose path does not contain '-'
181
+ graph.add_step(
182
+ "storey.Filter",
183
+ "filter_stream_event",
184
+ _fn="('-' not in event.path)",
185
+ full_event=True,
186
+ )
187
+
188
+ apply_storey_filter_stream_events()
189
+
190
+ # Step 3 - Process endpoint event: splitting into sub-events and validate event data
159
191
  def apply_process_endpoint_event():
160
192
  graph.add_step(
161
193
  "ProcessEndpointEvent",
162
194
  full_event=True,
163
195
  project=self.project,
196
+ after="filter_stream_event",
164
197
  )
165
198
 
166
199
  apply_process_endpoint_event()
167
200
 
168
- # Steps 2,3 - Applying Storey operations of filtering and flatten
201
+ # Steps 4,5 - Applying Storey operations of filtering and flatten
169
202
  def apply_storey_filter_and_flatmap():
170
203
  # Remove none values from each event
171
204
  graph.add_step(
@@ -182,7 +215,7 @@ class EventStreamProcessor:
182
215
 
183
216
  apply_storey_filter_and_flatmap()
184
217
 
185
- # Step 4 - Validating feature names and map each feature to its value
218
+ # Step 6 - Validating feature names and map each feature to its value
186
219
  def apply_map_feature_names():
187
220
  graph.add_step(
188
221
  "MapFeatureNames",
@@ -194,9 +227,9 @@ class EventStreamProcessor:
194
227
 
195
228
  apply_map_feature_names()
196
229
 
197
- # Step 5 - Calculate number of predictions and average latency
230
+ # Step 7 - Calculate number of predictions and average latency
198
231
  def apply_storey_aggregations():
199
- # Step 5.1 - Calculate number of predictions and average latency for each window (5 min and 1 hour)
232
+ # Step 7.1 - Calculate number of predictions for each window (5 min and 1 hour by default)
200
233
  graph.add_step(
201
234
  class_name="storey.AggregateByKey",
202
235
  aggregates=[
@@ -214,8 +247,7 @@ class EventStreamProcessor:
214
247
  table=".",
215
248
  key_field=EventFieldType.ENDPOINT_ID,
216
249
  )
217
-
218
- # Step 5.2 - Rename the latency counter field to prediction counter
250
+ # Step 7.2 - Calculate average latency time for each window (5 min and 1 hour by default)
219
251
  graph.add_step(
220
252
  class_name="storey.Rename",
221
253
  mapping={
@@ -228,7 +260,7 @@ class EventStreamProcessor:
228
260
 
229
261
  apply_storey_aggregations()
230
262
 
231
- # Step 6 - Emits the event in window size of events based on sample_window size (10 by default)
263
+ # Step 8 - Emits the event in window size of events based on sample_window size (10 by default)
232
264
  def apply_storey_sample_window():
233
265
  graph.add_step(
234
266
  "storey.steps.SampleWindow",
@@ -240,8 +272,8 @@ class EventStreamProcessor:
240
272
 
241
273
  apply_storey_sample_window()
242
274
 
243
- # Steps 7-9 - KV/SQL branch
244
- # Step 7 - Filter relevant keys from the event before writing the data into the database table
275
+ # Steps 9-11 - KV/SQL branch
276
+ # Step 9 - Filter relevant keys from the event before writing the data into the database table
245
277
  def apply_process_before_endpoint_update():
246
278
  graph.add_step(
247
279
  "ProcessBeforeEndpointUpdate",
@@ -251,7 +283,7 @@ class EventStreamProcessor:
251
283
 
252
284
  apply_process_before_endpoint_update()
253
285
 
254
- # Step 8 - Write the filtered event to KV/SQL table. At this point, the serving graph updates the stats
286
+ # Step 10 - Write the filtered event to KV/SQL table. At this point, the serving graph updates the stats
255
287
  # about average latency and the amount of predictions over time
256
288
  def apply_update_endpoint():
257
289
  graph.add_step(
@@ -264,7 +296,7 @@ class EventStreamProcessor:
264
296
 
265
297
  apply_update_endpoint()
266
298
 
267
- # Step 9 (only for KV target) - Apply infer_schema on the model endpoints table for generating schema file
299
+ # Step 11 (only for KV target) - Apply infer_schema on the model endpoints table for generating schema file
268
300
  # which will be used by Grafana monitoring dashboards
269
301
  def apply_infer_schema():
270
302
  graph.add_step(
@@ -279,10 +311,12 @@ class EventStreamProcessor:
279
311
  if self.model_endpoint_store_target == ModelEndpointTarget.V3IO_NOSQL:
280
312
  apply_infer_schema()
281
313
 
282
- # Steps 11-18 - TSDB branch (not supported in CE environment at the moment)
283
-
314
+ # Steps 12-19 - TSDB branch (skip to Prometheus if in CE env)
315
+ # Steps 20-21 - Prometheus branch
284
316
  if not mlrun.mlconf.is_ce_mode():
285
- # Step 11 - Before writing data to TSDB, create dictionary of 2-3 dictionaries that contains
317
+ # TSDB branch
318
+
319
+ # Step 12 - Before writing data to TSDB, create dictionary of 2-3 dictionaries that contains
286
320
  # stats and details about the events
287
321
  def apply_process_before_tsdb():
288
322
  graph.add_step(
@@ -291,7 +325,7 @@ class EventStreamProcessor:
291
325
 
292
326
  apply_process_before_tsdb()
293
327
 
294
- # Steps 12-18: - Unpacked keys from each dictionary and write to TSDB target
328
+ # Steps 13-19: - Unpacked keys from each dictionary and write to TSDB target
295
329
  def apply_filter_and_unpacked_keys(name, keys):
296
330
  graph.add_step(
297
331
  "FilterAndUnpackKeys",
@@ -322,21 +356,21 @@ class EventStreamProcessor:
322
356
  key=EventFieldType.ENDPOINT_ID,
323
357
  )
324
358
 
325
- # Steps 12-13 - unpacked base_metrics dictionary
359
+ # Steps 13-14 - unpacked base_metrics dictionary
326
360
  apply_filter_and_unpacked_keys(
327
361
  name="FilterAndUnpackKeys1",
328
362
  keys=EventKeyMetrics.BASE_METRICS,
329
363
  )
330
364
  apply_tsdb_target(name="tsdb1", after="FilterAndUnpackKeys1")
331
365
 
332
- # Steps 14-15 - unpacked endpoint_features dictionary
366
+ # Steps 15-16 - unpacked endpoint_features dictionary
333
367
  apply_filter_and_unpacked_keys(
334
368
  name="FilterAndUnpackKeys2",
335
369
  keys=EventKeyMetrics.ENDPOINT_FEATURES,
336
370
  )
337
371
  apply_tsdb_target(name="tsdb2", after="FilterAndUnpackKeys2")
338
372
 
339
- # Steps 16-18 - unpacked custom_metrics dictionary. In addition, use storey.Filter remove none values
373
+ # Steps 17-19 - unpacked custom_metrics dictionary. In addition, use storey.Filter to remove none values
340
374
  apply_filter_and_unpacked_keys(
341
375
  name="FilterAndUnpackKeys3",
342
376
  keys=EventKeyMetrics.CUSTOM_METRICS,
@@ -352,9 +386,30 @@ class EventStreamProcessor:
352
386
 
353
387
  apply_storey_filter()
354
388
  apply_tsdb_target(name="tsdb3", after="FilterNotNone")
389
+ else:
390
+ # Prometheus branch
391
+
392
+ # Step 20 - Increase the prediction counter by 1 and update the latency value
393
+ graph.add_step(
394
+ "IncCounter",
395
+ name="IncCounter",
396
+ after="MapFeatureNames",
397
+ project=self.project,
398
+ )
355
399
 
356
- # Steps 19-20 - Parquet branch
357
- # Step 19 - Filter and validate different keys before writing the data to Parquet target
400
+ # Step 21 - Record a sample of features and labels
401
+ def apply_record_features_to_prometheus():
402
+ graph.add_step(
403
+ "RecordFeatures",
404
+ name="RecordFeaturesToPrometheus",
405
+ after="sample",
406
+ project=self.project,
407
+ )
408
+
409
+ apply_record_features_to_prometheus()
410
+
411
+ # Steps 22-23 - Parquet branch
412
+ # Step 22 - Filter and validate different keys before writing the data to Parquet target
358
413
  def apply_process_before_parquet():
359
414
  graph.add_step(
360
415
  "ProcessBeforeParquet",
@@ -365,7 +420,7 @@ class EventStreamProcessor:
365
420
 
366
421
  apply_process_before_parquet()
367
422
 
368
- # Step 20 - Write the Parquet target file, partitioned by key (endpoint_id) and time.
423
+ # Step 23 - Write the Parquet target file, partitioned by key (endpoint_id) and time.
369
424
  def apply_parquet_target():
370
425
  graph.add_step(
371
426
  "storey.ParquetTarget",
@@ -615,6 +670,11 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
615
670
  error = event.get("error")
616
671
  if error:
617
672
  self.error_count[endpoint_id] += 1
673
+ mlrun.model_monitoring.prometheus.write_errors(
674
+ project=self.project,
675
+ endpoint_id=event["endpoint_id"],
676
+ model_name=event["model"],
677
+ )
618
678
  raise mlrun.errors.MLRunInvalidArgumentError(str(error))
619
679
 
620
680
  # Validate event fields
@@ -1068,6 +1128,98 @@ class InferSchema(mlrun.feature_store.steps.MapClass):
1068
1128
  return event
1069
1129
 
1070
1130
 
class EventRouting(mlrun.feature_store.steps.MapClass):
    """
    Route the event according to the path that is found under event.path. Please note that this step returns the
    result to the caller. At the moment the following paths are handled:

    - /model-monitoring-metrics (GET): return Prometheus registry results as a text. Will be used by Prometheus client
      to scrape the results from the monitoring stream memory.

    - /monitoring-batch-metrics (POST): update the Prometheus registry with the provided statistical metrics such as
      the statistical metrics from the monitoring batch job. Note that the event body is a list of dictionaries of
      different metrics.

    - /monitoring-drift-status (POST): update the Prometheus registry with the provided model drift status.

    An event with any other path is returned untouched.
    """

    def __init__(
        self,
        project: str,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.project: str = project

    def do(self, event):
        path = event.path

        if path == "/model-monitoring-metrics":
            # Replace the body with the parsed Prometheus registry file
            event.body = mlrun.model_monitoring.prometheus.get_registry()
            return event

        if path == "/monitoring-batch-metrics":
            # Every item of the body describes a single statistical metric
            for event_metric in event.body:
                mlrun.model_monitoring.prometheus.write_drift_metrics(
                    project=self.project,
                    endpoint_id=event_metric[EventFieldType.ENDPOINT_ID],
                    metric=event_metric[EventFieldType.METRIC],
                    value=event_metric[EventFieldType.VALUE],
                )
            return event

        if path == "/monitoring-drift-status":
            # The body holds the endpoint id along with its new drift status
            mlrun.model_monitoring.prometheus.write_drift_status(
                project=self.project,
                endpoint_id=event.body[EventFieldType.ENDPOINT_ID],
                drift_status=event.body[EventFieldType.DRIFT_STATUS],
            )

        return event
1177
+
1178
+
class IncCounter(mlrun.feature_store.steps.MapClass):
    """Count the event as one more prediction and record its latency in the Prometheus registry"""

    def __init__(self, project: str, **kwargs):
        super().__init__(**kwargs)
        self.project: str = project

    def do(self, event):
        # Collect the values that are taken from the event itself
        event_values = {
            "endpoint_id": event[EventFieldType.ENDPOINT_ID],
            "latency": event[EventFieldType.LATENCY],
            "model_name": event[EventFieldType.MODEL],
            "endpoint_type": event[EventFieldType.ENDPOINT_TYPE],
        }

        mlrun.model_monitoring.prometheus.write_predictions_and_latency_metrics(
            project=self.project, **event_values
        )

        # The event itself is passed on unchanged
        return event
1198
+
1199
+
class RecordFeatures(mlrun.feature_store.steps.MapClass):
    """Write a sample of the event features and predictions into the Prometheus registry"""

    def __init__(self, project: str, **kwargs):
        super().__init__(**kwargs)
        self.project: str = project

    def do(self, event):
        named_predictions = event[EventFieldType.NAMED_PREDICTIONS]
        named_features = event[EventFieldType.NAMED_FEATURES]

        # Single dictionary of predictions and features. If a name appears in both, the feature value wins
        sample = {**named_predictions, **named_features}

        mlrun.model_monitoring.prometheus.write_income_features(
            project=self.project,
            endpoint_id=event[EventFieldType.ENDPOINT_ID],
            features=sample,
        )

        # The event itself is passed on unchanged
        return event
1221
+
1222
+
1071
1223
  def update_endpoint_record(
1072
1224
  project: str,
1073
1225
  endpoint_id: str,