mlrun 1.4.0rc25__py3-none-any.whl → 1.5.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mlrun might be problematic.

Files changed (184)
  1. mlrun/__init__.py +2 -35
  2. mlrun/__main__.py +3 -41
  3. mlrun/api/api/api.py +6 -0
  4. mlrun/api/api/endpoints/feature_store.py +0 -4
  5. mlrun/api/api/endpoints/files.py +14 -2
  6. mlrun/api/api/endpoints/frontend_spec.py +2 -1
  7. mlrun/api/api/endpoints/functions.py +95 -59
  8. mlrun/api/api/endpoints/grafana_proxy.py +9 -9
  9. mlrun/api/api/endpoints/logs.py +17 -3
  10. mlrun/api/api/endpoints/model_endpoints.py +3 -2
  11. mlrun/api/api/endpoints/pipelines.py +1 -5
  12. mlrun/api/api/endpoints/projects.py +88 -0
  13. mlrun/api/api/endpoints/runs.py +48 -6
  14. mlrun/api/api/endpoints/submit.py +2 -1
  15. mlrun/api/api/endpoints/workflows.py +355 -0
  16. mlrun/api/api/utils.py +3 -4
  17. mlrun/api/crud/__init__.py +1 -0
  18. mlrun/api/crud/client_spec.py +6 -2
  19. mlrun/api/crud/feature_store.py +5 -0
  20. mlrun/api/crud/model_monitoring/__init__.py +1 -0
  21. mlrun/api/crud/model_monitoring/deployment.py +497 -0
  22. mlrun/api/crud/model_monitoring/grafana.py +96 -42
  23. mlrun/api/crud/model_monitoring/helpers.py +159 -0
  24. mlrun/api/crud/model_monitoring/model_endpoints.py +202 -476
  25. mlrun/api/crud/notifications.py +9 -4
  26. mlrun/api/crud/pipelines.py +6 -11
  27. mlrun/api/crud/projects.py +2 -2
  28. mlrun/api/crud/runtime_resources.py +4 -3
  29. mlrun/api/crud/runtimes/nuclio/helpers.py +5 -1
  30. mlrun/api/crud/secrets.py +21 -0
  31. mlrun/api/crud/workflows.py +352 -0
  32. mlrun/api/db/base.py +16 -1
  33. mlrun/api/db/init_db.py +2 -4
  34. mlrun/api/db/session.py +1 -1
  35. mlrun/api/db/sqldb/db.py +129 -31
  36. mlrun/api/db/sqldb/models/models_mysql.py +15 -1
  37. mlrun/api/db/sqldb/models/models_sqlite.py +16 -2
  38. mlrun/api/launcher.py +38 -6
  39. mlrun/api/main.py +3 -2
  40. mlrun/api/rundb/__init__.py +13 -0
  41. mlrun/{db → api/rundb}/sqldb.py +36 -84
  42. mlrun/api/runtime_handlers/__init__.py +56 -0
  43. mlrun/api/runtime_handlers/base.py +1247 -0
  44. mlrun/api/runtime_handlers/daskjob.py +209 -0
  45. mlrun/api/runtime_handlers/kubejob.py +37 -0
  46. mlrun/api/runtime_handlers/mpijob.py +147 -0
  47. mlrun/api/runtime_handlers/remotesparkjob.py +29 -0
  48. mlrun/api/runtime_handlers/sparkjob.py +148 -0
  49. mlrun/api/schemas/__init__.py +17 -6
  50. mlrun/api/utils/builder.py +1 -4
  51. mlrun/api/utils/clients/chief.py +14 -0
  52. mlrun/api/utils/clients/iguazio.py +33 -33
  53. mlrun/api/utils/clients/nuclio.py +2 -2
  54. mlrun/api/utils/periodic.py +9 -2
  55. mlrun/api/utils/projects/follower.py +14 -7
  56. mlrun/api/utils/projects/leader.py +2 -1
  57. mlrun/api/utils/projects/remotes/nop_follower.py +2 -2
  58. mlrun/api/utils/projects/remotes/nop_leader.py +2 -2
  59. mlrun/api/utils/runtimes/__init__.py +14 -0
  60. mlrun/api/utils/runtimes/nuclio.py +43 -0
  61. mlrun/api/utils/scheduler.py +98 -15
  62. mlrun/api/utils/singletons/db.py +5 -1
  63. mlrun/api/utils/singletons/project_member.py +4 -1
  64. mlrun/api/utils/singletons/scheduler.py +1 -1
  65. mlrun/artifacts/base.py +6 -6
  66. mlrun/artifacts/dataset.py +4 -4
  67. mlrun/artifacts/manager.py +2 -3
  68. mlrun/artifacts/model.py +2 -2
  69. mlrun/artifacts/plots.py +8 -8
  70. mlrun/common/db/__init__.py +14 -0
  71. mlrun/common/helpers.py +37 -0
  72. mlrun/{mlutils → common/model_monitoring}/__init__.py +3 -2
  73. mlrun/common/model_monitoring/helpers.py +69 -0
  74. mlrun/common/schemas/__init__.py +13 -1
  75. mlrun/common/schemas/auth.py +4 -1
  76. mlrun/common/schemas/client_spec.py +1 -1
  77. mlrun/common/schemas/function.py +17 -0
  78. mlrun/common/schemas/model_monitoring/__init__.py +48 -0
  79. mlrun/common/{model_monitoring.py → schemas/model_monitoring/constants.py} +11 -23
  80. mlrun/common/schemas/model_monitoring/grafana.py +55 -0
  81. mlrun/common/schemas/{model_endpoints.py → model_monitoring/model_endpoints.py} +32 -65
  82. mlrun/common/schemas/notification.py +1 -0
  83. mlrun/common/schemas/object.py +4 -0
  84. mlrun/common/schemas/project.py +1 -0
  85. mlrun/common/schemas/regex.py +1 -1
  86. mlrun/common/schemas/runs.py +1 -8
  87. mlrun/common/schemas/schedule.py +1 -8
  88. mlrun/common/schemas/workflow.py +54 -0
  89. mlrun/config.py +45 -42
  90. mlrun/datastore/__init__.py +21 -0
  91. mlrun/datastore/base.py +1 -1
  92. mlrun/datastore/datastore.py +9 -0
  93. mlrun/datastore/dbfs_store.py +168 -0
  94. mlrun/datastore/helpers.py +18 -0
  95. mlrun/datastore/sources.py +1 -0
  96. mlrun/datastore/store_resources.py +2 -5
  97. mlrun/datastore/v3io.py +1 -2
  98. mlrun/db/__init__.py +4 -68
  99. mlrun/db/base.py +12 -0
  100. mlrun/db/factory.py +65 -0
  101. mlrun/db/httpdb.py +175 -20
  102. mlrun/db/nopdb.py +4 -2
  103. mlrun/execution.py +4 -2
  104. mlrun/feature_store/__init__.py +1 -0
  105. mlrun/feature_store/api.py +1 -2
  106. mlrun/feature_store/common.py +2 -1
  107. mlrun/feature_store/feature_set.py +1 -11
  108. mlrun/feature_store/feature_vector.py +340 -2
  109. mlrun/feature_store/ingestion.py +5 -10
  110. mlrun/feature_store/retrieval/base.py +118 -104
  111. mlrun/feature_store/retrieval/dask_merger.py +17 -10
  112. mlrun/feature_store/retrieval/job.py +4 -1
  113. mlrun/feature_store/retrieval/local_merger.py +18 -18
  114. mlrun/feature_store/retrieval/spark_merger.py +21 -14
  115. mlrun/feature_store/retrieval/storey_merger.py +22 -16
  116. mlrun/kfpops.py +3 -9
  117. mlrun/launcher/base.py +57 -53
  118. mlrun/launcher/client.py +5 -4
  119. mlrun/launcher/factory.py +24 -13
  120. mlrun/launcher/local.py +6 -6
  121. mlrun/launcher/remote.py +4 -4
  122. mlrun/lists.py +0 -11
  123. mlrun/model.py +11 -17
  124. mlrun/model_monitoring/__init__.py +2 -22
  125. mlrun/model_monitoring/features_drift_table.py +1 -1
  126. mlrun/model_monitoring/helpers.py +22 -210
  127. mlrun/model_monitoring/model_endpoint.py +1 -1
  128. mlrun/model_monitoring/model_monitoring_batch.py +127 -50
  129. mlrun/model_monitoring/prometheus.py +219 -0
  130. mlrun/model_monitoring/stores/__init__.py +16 -11
  131. mlrun/model_monitoring/stores/kv_model_endpoint_store.py +95 -23
  132. mlrun/model_monitoring/stores/models/mysql.py +47 -29
  133. mlrun/model_monitoring/stores/models/sqlite.py +47 -29
  134. mlrun/model_monitoring/stores/sql_model_endpoint_store.py +31 -19
  135. mlrun/model_monitoring/{stream_processing_fs.py → stream_processing.py} +206 -64
  136. mlrun/model_monitoring/tracking_policy.py +104 -0
  137. mlrun/package/packager.py +6 -8
  138. mlrun/package/packagers/default_packager.py +121 -10
  139. mlrun/package/packagers/numpy_packagers.py +1 -1
  140. mlrun/platforms/__init__.py +0 -2
  141. mlrun/platforms/iguazio.py +0 -56
  142. mlrun/projects/pipelines.py +53 -159
  143. mlrun/projects/project.py +10 -37
  144. mlrun/render.py +1 -1
  145. mlrun/run.py +8 -124
  146. mlrun/runtimes/__init__.py +6 -42
  147. mlrun/runtimes/base.py +29 -1249
  148. mlrun/runtimes/daskjob.py +2 -198
  149. mlrun/runtimes/funcdoc.py +0 -9
  150. mlrun/runtimes/function.py +25 -29
  151. mlrun/runtimes/kubejob.py +5 -29
  152. mlrun/runtimes/local.py +1 -1
  153. mlrun/runtimes/mpijob/__init__.py +2 -2
  154. mlrun/runtimes/mpijob/abstract.py +10 -1
  155. mlrun/runtimes/mpijob/v1.py +0 -76
  156. mlrun/runtimes/mpijob/v1alpha1.py +1 -74
  157. mlrun/runtimes/nuclio.py +3 -2
  158. mlrun/runtimes/pod.py +28 -18
  159. mlrun/runtimes/remotesparkjob.py +1 -15
  160. mlrun/runtimes/serving.py +14 -6
  161. mlrun/runtimes/sparkjob/__init__.py +0 -1
  162. mlrun/runtimes/sparkjob/abstract.py +4 -131
  163. mlrun/runtimes/utils.py +0 -26
  164. mlrun/serving/routers.py +7 -7
  165. mlrun/serving/server.py +11 -8
  166. mlrun/serving/states.py +7 -1
  167. mlrun/serving/v2_serving.py +6 -6
  168. mlrun/utils/helpers.py +23 -42
  169. mlrun/utils/notifications/notification/__init__.py +4 -0
  170. mlrun/utils/notifications/notification/webhook.py +61 -0
  171. mlrun/utils/notifications/notification_pusher.py +5 -25
  172. mlrun/utils/regex.py +7 -2
  173. mlrun/utils/version/version.json +2 -2
  174. {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/METADATA +26 -25
  175. {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/RECORD +180 -158
  176. {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/WHEEL +1 -1
  177. mlrun/mlutils/data.py +0 -160
  178. mlrun/mlutils/models.py +0 -78
  179. mlrun/mlutils/plots.py +0 -902
  180. mlrun/utils/model_monitoring.py +0 -249
  181. /mlrun/{api/db/sqldb/session.py → common/db/sql_session.py} +0 -0
  182. {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/LICENSE +0 -0
  183. {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/entry_points.txt +0 -0
  184. {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/top_level.txt +0 -0
mlrun/model_monitoring/{stream_processing_fs.py → stream_processing.py} RENAMED
@@ -22,14 +22,14 @@ import pandas as pd
  import storey
 
  import mlrun
- import mlrun.common.model_monitoring
+ import mlrun.common.model_monitoring.helpers
  import mlrun.config
  import mlrun.datastore.targets
  import mlrun.feature_store.steps
+ import mlrun.model_monitoring.prometheus
  import mlrun.utils
- import mlrun.utils.model_monitoring
  import mlrun.utils.v3io_clients
- from mlrun.common.model_monitoring import (
+ from mlrun.common.schemas.model_monitoring.constants import (
      EventFieldType,
      EventKeyMetrics,
      EventLiveStats,
@@ -37,7 +37,6 @@ from mlrun.common.model_monitoring import (
      ModelEndpointTarget,
      ProjectSecretKeys,
  )
- from mlrun.model_monitoring.stores import get_model_endpoint_store
  from mlrun.utils import logger
 
 
@@ -47,22 +46,18 @@ class EventStreamProcessor:
  self,
  project: str,
  parquet_batching_max_events: int,
+ parquet_batching_timeout_secs: int,
  parquet_target: str,
  sample_window: int = 10,
- parquet_batching_timeout_secs: int = 30 * 60,  # Default 30 minutes
- aggregate_count_windows: typing.Optional[typing.List[str]] = None,
- aggregate_count_period: str = "30s",
- aggregate_avg_windows: typing.Optional[typing.List[str]] = None,
- aggregate_avg_period: str = "30s",
+ aggregate_windows: typing.Optional[typing.List[str]] = None,
+ aggregate_period: str = "30s",
  model_monitoring_access_key: str = None,
  ):
  # General configurations, mainly used for the storey steps in the future serving graph
  self.project = project
  self.sample_window = sample_window
- self.aggregate_count_windows = aggregate_count_windows or ["5m", "1h"]
- self.aggregate_count_period = aggregate_count_period
- self.aggregate_avg_windows = aggregate_avg_windows or ["5m", "1h"]
- self.aggregate_avg_period = aggregate_avg_period
+ self.aggregate_windows = aggregate_windows or ["5m", "1h"]
+ self.aggregate_period = aggregate_period
 
  # Parquet path and configurations
  self.parquet_path = parquet_target
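
A side note on the constructor change above: the separate count/avg aggregation arguments are consolidated into a single aggregate_windows/aggregate_period pair, and parquet_batching_timeout_secs loses its default and must now be passed explicitly. A minimal construction sketch under the new signature follows; the project name, batching values, and parquet path are illustrative placeholders, not values taken from this diff.

    from mlrun.model_monitoring.stream_processing import EventStreamProcessor

    # All concrete values below are hypothetical examples; only the parameter
    # names and the renamed module path come from the diff above.
    processor = EventStreamProcessor(
        project="my-project",
        parquet_batching_max_events=10_000,
        parquet_batching_timeout_secs=30 * 60,   # no longer defaulted in 1.5.0rc2
        parquet_target="v3io:///projects/my-project/model-endpoints/parquet",
        sample_window=10,
        aggregate_windows=["5m", "1h"],          # replaces aggregate_count_windows / aggregate_avg_windows
        aggregate_period="30s",                  # replaces aggregate_count_period / aggregate_avg_period
    )
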
@@ -84,6 +79,8 @@ class EventStreamProcessor:
  self._initialize_v3io_configurations(
  model_monitoring_access_key=model_monitoring_access_key
  )
+ elif self.parquet_path.startswith("s3://"):
+ self.storage_options = mlrun.mlconf.get_s3_storage_options()
 
  def _initialize_v3io_configurations(
  self,
@@ -116,7 +113,9 @@ class EventStreamProcessor:
  _,
  self.kv_container,
  self.kv_path,
- ) = mlrun.utils.model_monitoring.parse_model_endpoint_store_prefix(kv_path)
+ ) = mlrun.common.model_monitoring.helpers.parse_model_endpoint_store_prefix(
+ kv_path
+ )
 
  # TSDB path and configurations
  tsdb_path = mlrun.mlconf.get_model_monitoring_file_target_path(
@@ -126,7 +125,9 @@ class EventStreamProcessor:
  _,
  self.tsdb_container,
  self.tsdb_path,
- ) = mlrun.utils.model_monitoring.parse_model_endpoint_store_prefix(tsdb_path)
+ ) = mlrun.common.model_monitoring.helpers.parse_model_endpoint_store_prefix(
+ tsdb_path
+ )
 
  self.tsdb_path = f"{self.tsdb_container}/{self.tsdb_path}"
  self.tsdb_batching_max_events = tsdb_batching_max_events
@@ -138,7 +139,7 @@ class EventStreamProcessor:
  of different operations that are executed on the events from the model server. Each event has
  metadata (function_uri, timestamp, class, etc.) but also inputs and predictions from the model server.
  Throughout the serving graph, the results are written to 3 different databases:
- 1. KV/SQL (steps 7-9): Stores metadata and stats about the average latency and the amount of predictions over
+ 1. KV/SQL (steps 9-11): Stores metadata and stats about the average latency and the amount of predictions over
  time per endpoint. for example the amount of predictions of endpoint x in the last 5 min. This data is used
  by the monitoring dashboards in grafana. The model endpoints table also contains data on the model endpoint
  from other processes, such as current_stats that is being calculated by the monitoring batch job
@@ -146,12 +147,14 @@ class EventStreamProcessor:
  v3io:///users/pipelines/project-name/model-endpoints/endpoints/. If the target is SQL, then the table
  is stored within the database that was defined in the provided connection string and can be found
  under mlrun.mlconf.model_endpoint_monitoring.endpoint_store_connection.
- 2. TSDB (steps 12-18): Stores live data of different key metric dictionaries in tsdb target. Results can be
- found under v3io:///users/pipelines/project-name/model-endpoints/events/. At the moment, this part supports
- 3 different key metric dictionaries: base_metrics (average latency and predictions over time),
+ 2. V3IO TSDB/Prometheus (steps 13-21): Stores live data of different key metric dictionaries in tsdb target.
+ This data is being used by the monitoring dashboards in grafana. If using V3IO TSDB (steps 13-19), results
+ can be found under v3io:///users/pipelines/project-name/model-endpoints/events/. In that case, we generate
+ 3 different key metric dictionaries: base_metrics (average latency and predictions over time),
  endpoint_features (Prediction and feature names and values), and custom_metrics (user-defined metrics).
- This data is also being used by the monitoring dashboards in grafana.
- 3. Parquet (steps 19-20): This Parquet file includes the required data for the model monitoring batch job
+ If using Prometheus (steps 20-21), we update metrics in the Prometheus registry that is stored in the
+ monitoring stream local memory.
+ 3. Parquet (steps 22-23): This Parquet file includes the required data for the model monitoring batch job
  that run every hour by default. If defined, the parquet target path can be found under
  mlrun.mlconf.model_endpoint_monitoring.offline. Otherwise, the default parquet path is under
  mlrun.mlconf.model_endpoint_monitoring.user_space.
@@ -161,17 +164,41 @@ class EventStreamProcessor:
 
  graph = fn.set_topology("flow")
 
- # Step 1 - Process endpoint event: splitting into sub-events and validate event data
+ # Step 1 - Event routing based on the provided path
+ def apply_event_routing():
+ graph.add_step(
+ "EventRouting",
+ full_event=True,
+ project=self.project,
+ ).respond()
+
+ apply_event_routing()
+
+ # Step 2 - Filter out events with no '-' in path which indicates that the event is supposed to be processed
+ # through the next steps of the stream graph
+ def apply_storey_filter_stream_events():
+ # Remove none values from each event
+ graph.add_step(
+ "storey.Filter",
+ "filter_stream_event",
+ _fn="('-' not in event.path)",
+ full_event=True,
+ )
+
+ apply_storey_filter_stream_events()
+
+ # Step 3 - Process endpoint event: splitting into sub-events and validate event data
  def apply_process_endpoint_event():
  graph.add_step(
  "ProcessEndpointEvent",
  full_event=True,
  project=self.project,
+ after="filter_stream_event",
  )
 
  apply_process_endpoint_event()
 
- # Steps 2,3 - Applying Storey operations of filtering and flatten
+ # Steps 4,5 - Applying Storey operations of filtering and flatten
  def apply_storey_filter_and_flatmap():
  # Remove none values from each event
  graph.add_step(
@@ -188,7 +215,7 @@ class EventStreamProcessor:
 
  apply_storey_filter_and_flatmap()
 
- # Step 4 - Validating feature names and map each feature to its value
+ # Step 6 - Validating feature names and map each feature to its value
  def apply_map_feature_names():
  graph.add_step(
  "MapFeatureNames",
@@ -200,58 +227,53 @@ class EventStreamProcessor:
  )
 
  apply_map_feature_names()
 
- # Step 5 - Calculate number of predictions and average latency
+ # Step 7 - Calculate number of predictions and average latency
  def apply_storey_aggregations():
- # Step 5.1 - Calculate number of predictions for each window (5 min and 1 hour by default)
+ # Step 7.1 - Calculate number of predictions for each window (5 min and 1 hour by default)
  graph.add_step(
  class_name="storey.AggregateByKey",
  aggregates=[
  {
- "name": EventFieldType.PREDICTIONS,
- "column": EventFieldType.ENDPOINT_ID,
- "operations": ["count"],
- "windows": self.aggregate_count_windows,
- "period": self.aggregate_count_period,
+ "name": EventFieldType.LATENCY,
+ "column": EventFieldType.LATENCY,
+ "operations": ["count", "avg"],
+ "windows": self.aggregate_windows,
+ "period": self.aggregate_period,
  }
  ],
- name=EventFieldType.PREDICTIONS,
+ name=EventFieldType.LATENCY,
  after="MapFeatureNames",
  step_name="Aggregates",
  table=".",
+ key_field=EventFieldType.ENDPOINT_ID,
  )
- # Step 5.2 - Calculate average latency time for each window (5 min and 1 hour by default)
+ # Step 7.2 - Calculate average latency time for each window (5 min and 1 hour by default)
  graph.add_step(
- class_name="storey.AggregateByKey",
- aggregates=[
- {
- "name": EventFieldType.LATENCY,
- "column": EventFieldType.LATENCY,
- "operations": ["avg"],
- "windows": self.aggregate_avg_windows,
- "period": self.aggregate_avg_period,
- }
- ],
- name=EventFieldType.LATENCY,
- after=EventFieldType.PREDICTIONS,
- table=".",
+ class_name="storey.Rename",
+ mapping={
+ "latency_count_5m": EventLiveStats.PREDICTIONS_COUNT_5M,
+ "latency_count_1h": EventLiveStats.PREDICTIONS_COUNT_1H,
+ },
+ name="Rename",
+ after=EventFieldType.LATENCY,
  )
 
  apply_storey_aggregations()
 
- # Step 6 - Emits the event in window size of events based on sample_window size (10 by default)
+ # Step 8 - Emits the event in window size of events based on sample_window size (10 by default)
  def apply_storey_sample_window():
  graph.add_step(
  "storey.steps.SampleWindow",
  name="sample",
- after=EventFieldType.LATENCY,
+ after="Rename",
  window_size=self.sample_window,
  key=EventFieldType.ENDPOINT_ID,
  )
 
  apply_storey_sample_window()
 
- # Steps 7-9 - KV/SQL branch
- # Step 7 - Filter relevant keys from the event before writing the data into the database table
+ # Steps 9-11 - KV/SQL branch
+ # Step 9 - Filter relevant keys from the event before writing the data into the database table
  def apply_process_before_endpoint_update():
  graph.add_step(
  "ProcessBeforeEndpointUpdate",
@@ -261,7 +283,7 @@ class EventStreamProcessor:
 
  apply_process_before_endpoint_update()
 
- # Step 8 - Write the filtered event to KV/SQL table. At this point, the serving graph updates the stats
+ # Step 10 - Write the filtered event to KV/SQL table. At this point, the serving graph updates the stats
  # about average latency and the amount of predictions over time
  def apply_update_endpoint():
  graph.add_step(
@@ -274,7 +296,7 @@ class EventStreamProcessor:
 
  apply_update_endpoint()
 
- # Step 9 (only for KV target) - Apply infer_schema on the model endpoints table for generating schema file
+ # Step 11 (only for KV target) - Apply infer_schema on the model endpoints table for generating schema file
  # which will be used by Grafana monitoring dashboards
  def apply_infer_schema():
  graph.add_step(
@@ -289,10 +311,12 @@ class EventStreamProcessor:
  if self.model_endpoint_store_target == ModelEndpointTarget.V3IO_NOSQL:
  apply_infer_schema()
 
- # Steps 11-18 - TSDB branch (not supported in CE environment at the moment)
-
+ # Steps 12-19 - TSDB branch (skip to Prometheus if in CE env)
+ # Steps 20-21 - Prometheus branch
  if not mlrun.mlconf.is_ce_mode():
- # Step 11 - Before writing data to TSDB, create dictionary of 2-3 dictionaries that contains
+ # TSDB branch
+
+ # Step 12 - Before writing data to TSDB, create dictionary of 2-3 dictionaries that contains
  # stats and details about the events
  def apply_process_before_tsdb():
  graph.add_step(
@@ -301,7 +325,7 @@ class EventStreamProcessor:
 
  apply_process_before_tsdb()
 
- # Steps 12-18: - Unpacked keys from each dictionary and write to TSDB target
+ # Steps 13-19: - Unpacked keys from each dictionary and write to TSDB target
  def apply_filter_and_unpacked_keys(name, keys):
  graph.add_step(
  "FilterAndUnpackKeys",
@@ -332,21 +356,21 @@ class EventStreamProcessor:
  key=EventFieldType.ENDPOINT_ID,
  )
 
- # Steps 12-13 - unpacked base_metrics dictionary
+ # Steps 13-14 - unpacked base_metrics dictionary
  apply_filter_and_unpacked_keys(
  name="FilterAndUnpackKeys1",
  keys=EventKeyMetrics.BASE_METRICS,
  )
  apply_tsdb_target(name="tsdb1", after="FilterAndUnpackKeys1")
 
- # Steps 14-15 - unpacked endpoint_features dictionary
+ # Steps 15-16 - unpacked endpoint_features dictionary
  apply_filter_and_unpacked_keys(
  name="FilterAndUnpackKeys2",
  keys=EventKeyMetrics.ENDPOINT_FEATURES,
  )
  apply_tsdb_target(name="tsdb2", after="FilterAndUnpackKeys2")
 
- # Steps 16-18 - unpacked custom_metrics dictionary. In addition, use storey.Filter remove none values
+ # Steps 17-19 - unpacked custom_metrics dictionary. In addition, use storey.Filter remove none values
  apply_filter_and_unpacked_keys(
  name="FilterAndUnpackKeys3",
  keys=EventKeyMetrics.CUSTOM_METRICS,
@@ -362,9 +386,30 @@ class EventStreamProcessor:
 
  apply_storey_filter()
  apply_tsdb_target(name="tsdb3", after="FilterNotNone")
+ else:
+ # Prometheus branch
+
+ # Step 20 - Increase the prediction counter by 1 and update the latency value
+ graph.add_step(
+ "IncCounter",
+ name="IncCounter",
+ after="MapFeatureNames",
+ project=self.project,
+ )
 
- # Steps 19-20 - Parquet branch
- # Step 19 - Filter and validate different keys before writing the data to Parquet target
+ # Step 21 - Record a sample of features and labels
+ def apply_record_features_to_prometheus():
+ graph.add_step(
+ "RecordFeatures",
+ name="RecordFeaturesToPrometheus",
+ after="sample",
+ project=self.project,
+ )
+
+ apply_record_features_to_prometheus()
+
+ # Steps 22-23 - Parquet branch
+ # Step 22 - Filter and validate different keys before writing the data to Parquet target
  def apply_process_before_parquet():
  graph.add_step(
  "ProcessBeforeParquet",
@@ -375,7 +420,7 @@ class EventStreamProcessor:
  )
 
  apply_process_before_parquet()
- # Step 20 - Write the Parquet target file, partitioned by key (endpoint_id) and time.
+ # Step 23 - Write the Parquet target file, partitioned by key (endpoint_id) and time.
  def apply_parquet_target():
  graph.add_step(
  "storey.ParquetTarget",
@@ -625,6 +670,11 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
  error = event.get("error")
  if error:
  self.error_count[endpoint_id] += 1
+ mlrun.model_monitoring.prometheus.write_errors(
+ project=self.project,
+ endpoint_id=event["endpoint_id"],
+ model_name=event["model"],
+ )
  raise mlrun.errors.MLRunInvalidArgumentError(str(error))
 
  # Validate event fields
@@ -1078,12 +1128,104 @@ class InferSchema(mlrun.feature_store.steps.MapClass):
  return event
 
 
+ class EventRouting(mlrun.feature_store.steps.MapClass):
+ """
+ Router the event according to the configured path under event.path. Please note that this step returns the result
+ to the caller. At the moment there are several paths:
+
+ - /model-monitoring-metrics (GET): return Prometheus registry results as a text. Will be used by Prometheus client
+ to scrape the results from the monitoring stream memory.
+
+ - /monitoring-batch-metrics (POST): update the Prometheus registry with the provided statistical metrics such as the
+ statistical metrics from the monitoring batch job. Note that the event body is a list of dictionaries of different
+ metrics.
+
+ - /monitoring-drift-status (POST): update the Prometheus registry with the provided model drift status.
+
+ """
+
+ def __init__(
+ self,
+ project: str,
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+ self.project: str = project
+
+ def do(self, event):
+ if event.path == "/model-monitoring-metrics":
+ # Return a parsed Prometheus registry file
+ event.body = mlrun.model_monitoring.prometheus.get_registry()
+ elif event.path == "/monitoring-batch-metrics":
+ # Update statistical metrics
+ for event_metric in event.body:
+ mlrun.model_monitoring.prometheus.write_drift_metrics(
+ project=self.project,
+ endpoint_id=event_metric[EventFieldType.ENDPOINT_ID],
+ metric=event_metric[EventFieldType.METRIC],
+ value=event_metric[EventFieldType.VALUE],
+ )
+ elif event.path == "/monitoring-drift-status":
+ # Update drift status
+ mlrun.model_monitoring.prometheus.write_drift_status(
+ project=self.project,
+ endpoint_id=event.body[EventFieldType.ENDPOINT_ID],
+ drift_status=event.body[EventFieldType.DRIFT_STATUS],
+ )
+
+ return event
+
+
+ class IncCounter(mlrun.feature_store.steps.MapClass):
+ """Increase prediction counter by 1 and update the total latency value"""
+
+ def __init__(self, project: str, **kwargs):
+ super().__init__(**kwargs)
+ self.project: str = project
+
+ def do(self, event):
+ # Compute prediction per second
+
+ mlrun.model_monitoring.prometheus.write_predictions_and_latency_metrics(
+ project=self.project,
+ endpoint_id=event[EventFieldType.ENDPOINT_ID],
+ latency=event[EventFieldType.LATENCY],
+ model_name=event[EventFieldType.MODEL],
+ endpoint_type=event[EventFieldType.ENDPOINT_TYPE],
+ )
+
+ return event
+
+
+ class RecordFeatures(mlrun.feature_store.steps.MapClass):
+ """Record a sample of features and labels in Prometheus registry"""
+
+ def __init__(self, project: str, **kwargs):
+ super().__init__(**kwargs)
+ self.project: str = project
+
+ def do(self, event):
+ # Generate a dictionary of features and predictions
+ features = {
+ **event[EventFieldType.NAMED_PREDICTIONS],
+ **event[EventFieldType.NAMED_FEATURES],
+ }
+
+ mlrun.model_monitoring.prometheus.write_income_features(
+ project=self.project,
+ endpoint_id=event[EventFieldType.ENDPOINT_ID],
+ features=features,
+ )
+
+ return event
+
+
  def update_endpoint_record(
  project: str,
  endpoint_id: str,
  attributes: dict,
  ):
- model_endpoint_store = get_model_endpoint_store(
+ model_endpoint_store = mlrun.model_monitoring.get_model_endpoint_store(
  project=project,
  )
 
@@ -1093,7 +1235,7 @@ def update_endpoint_record(
 
 
  def get_endpoint_record(project: str, endpoint_id: str):
- model_endpoint_store = get_model_endpoint_store(
+ model_endpoint_store = mlrun.model_monitoring.get_model_endpoint_store(
  project=project,
  )
  return model_endpoint_store.get_model_endpoint(endpoint_id=endpoint_id)
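
The last two hunks of this file also change how the endpoint store is resolved: instead of importing get_model_endpoint_store from mlrun.model_monitoring.stores, the module now goes through the mlrun.model_monitoring package. A hedged sketch of the equivalent call chain, with placeholder project and endpoint identifiers:

    import mlrun.model_monitoring

    # "my-project" and the endpoint ID are placeholders; only the attribute
    # lookups mirror the code shown in the hunks above.
    store = mlrun.model_monitoring.get_model_endpoint_store(project="my-project")
    endpoint_record = store.get_model_endpoint(endpoint_id="1234abcd5678efgh")
    print(endpoint_record)
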
mlrun/model_monitoring/tracking_policy.py ADDED
@@ -0,0 +1,104 @@
+ # Copyright 2023 Iguazio
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ from typing import Union
+
+ import mlrun.common.schemas.schedule
+ import mlrun.model
+
+
+ class TrackingPolicy(mlrun.model.ModelObj):
+ """
+ Modified model monitoring configurations. By using TrackingPolicy, the user can apply his model monitoring
+ requirements, such as setting the scheduling policy of the model monitoring batch job or changing the image of the
+ model monitoring stream.
+ """
+
+ _dict_fields = [
+ "default_batch_image",
+ "stream_image",
+ ]
+
+ def __init__(
+ self,
+ default_batch_intervals: Union[
+ mlrun.common.schemas.schedule.ScheduleCronTrigger, str
+ ] = mlrun.common.schemas.schedule.ScheduleCronTrigger(minute="0", hour="*/1"),
+ default_batch_image: str = "mlrun/mlrun",
+ stream_image: str = "mlrun/mlrun",
+ ):
+ """
+ Initialize TrackingPolicy object.
+ :param default_batch_intervals: Model monitoring batch scheduling policy. By default, executed on the hour
+ every hour. Can be either a string or a ScheduleCronTrigger object. The
+ string time format is based on ScheduleCronTrigger expression:
+ minute, hour, day of month, month, day of week. It will be converted into
+ a ScheduleCronTrigger object.
+ :param default_batch_image: The default image of the model monitoring batch job. By default, the image
+ is mlrun/mlrun.
+ :param stream_image: The image of the model monitoring stream real-time function. By default,
+ the image is mlrun/mlrun.
+ """
+ if isinstance(default_batch_intervals, str):
+ default_batch_intervals = (
+ mlrun.common.schemas.schedule.ScheduleCronTrigger.from_crontab(
+ default_batch_intervals
+ )
+ )
+ self.default_batch_intervals = default_batch_intervals
+ self.default_batch_image = default_batch_image
+ self.stream_image = stream_image
+
+ @classmethod
+ def from_dict(cls, struct=None, fields=None, deprecated_fields: dict = None):
+ new_obj = super().from_dict(
+ struct, fields=cls._dict_fields, deprecated_fields=deprecated_fields
+ )
+ # Convert default batch interval into ScheduleCronTrigger object
+ if (
+ mlrun.common.schemas.model_monitoring.EventFieldType.DEFAULT_BATCH_INTERVALS
+ in struct
+ ):
+ if isinstance(
+ struct[
+ mlrun.common.schemas.model_monitoring.EventFieldType.DEFAULT_BATCH_INTERVALS
+ ],
+ str,
+ ):
+ new_obj.default_batch_intervals = mlrun.common.schemas.schedule.ScheduleCronTrigger.from_crontab(
+ struct[
+ mlrun.common.schemas.model_monitoring.EventFieldType.DEFAULT_BATCH_INTERVALS
+ ]
+ )
+ else:
+ new_obj.default_batch_intervals = mlrun.common.schemas.schedule.ScheduleCronTrigger.parse_obj(
+ struct[
+ mlrun.common.schemas.model_monitoring.EventFieldType.DEFAULT_BATCH_INTERVALS
+ ]
+ )
+ return new_obj
+
+ def to_dict(self, fields=None, exclude=None):
+ struct = super().to_dict(
+ fields,
+ exclude=[
+ mlrun.common.schemas.model_monitoring.EventFieldType.DEFAULT_BATCH_INTERVALS
+ ],
+ )
+ if self.default_batch_intervals:
+ struct[
+ mlrun.common.schemas.model_monitoring.EventFieldType.DEFAULT_BATCH_INTERVALS
+ ] = self.default_batch_intervals.dict()
+ return struct
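
To put the new TrackingPolicy class in context, here is a hedged usage sketch that exercises only the constructor and the dict round-trip defined above; the crontab string is an illustrative placeholder and the default images are kept.

    from mlrun.model_monitoring.tracking_policy import TrackingPolicy

    # Run the monitoring batch job every 3 hours; the string follows the
    # ScheduleCronTrigger order documented above:
    # minute, hour, day of month, month, day of week.
    policy = TrackingPolicy(default_batch_intervals="0 */3 * * *")

    # Serialize and restore using the to_dict/from_dict overrides from the new file.
    as_dict = policy.to_dict()
    restored = TrackingPolicy.from_dict(as_dict)
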
mlrun/package/packager.py CHANGED
@@ -107,8 +107,7 @@ class Packager(ABC, metaclass=_PackagerMeta):
 
  Preferably, each packager should handle a single type of object.
 
- Linking Artifacts (extra data)
- ------------------------------
+ **Linking Artifacts (extra data)**
 
  In order to link between packages (using the extra data or metrics spec attributes of an artifact), you should use
  the key as if it exists and as value ellipses (...). The manager will link all packages once it is done packing.
@@ -118,8 +117,7 @@ class Packager(ABC, metaclass=_PackagerMeta):
  artifact = Artifact(key="my_artifact")
  artifact.spec.extra_data = {key: ... for key in extra_data}
 
- Clearing Outputs
- ----------------
+ **Clearing Outputs**
 
  Some of the packagers may produce files and temporary directories that should be deleted once done with logging the
  artifact. The packager can mark paths of files and directories to delete after logging using the class method
@@ -131,15 +129,15 @@ class Packager(ABC, metaclass=_PackagerMeta):
  with open("./some_file.txt", "w") as file:
  file.write("Pack me")
  artifact = Artifact(key="my_artifact")
- cls.future_clear(path="./some_file.txt")
+ cls.add_future_clearing_path(path="./some_file.txt")
  return artifact, None
  """
 
- # The type of object this packager can pack and unpack:
+ #: The type of object this packager can pack and unpack.
  PACKABLE_OBJECT_TYPE: Type = ...
 
- # The priority of this packager in the packagers collection of the manager (lower is better)
- PRIORITY = ...
+ #: The priority of this packager in the packagers collection of the manager (lower is better).
+ PRIORITY: int = ...
 
  # List of all paths to be deleted by the manager of this packager post logging the packages:
  _CLEARING_PATH_LIST: List[str] = []
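
The packager.py hunks above rename the cleanup helper referenced in the docstring from future_clear to add_future_clearing_path and switch the class attribute comments to #:-style doc comments. A hedged sketch of a custom packager using the renamed helper follows; the class name, simplified pack signature, and file name are hypothetical, and any other abstract members required by the base class are omitted for brevity.

    from mlrun.artifacts import Artifact
    from mlrun.package.packager import Packager

    class TextFilePackager(Packager):
        # Illustrative attribute values; only the attribute names and the
        # add_future_clearing_path call come from the diff above.
        PACKABLE_OBJECT_TYPE = str
        PRIORITY = 5

        @classmethod
        def pack(cls, obj: str, key: str = "my_artifact"):
            # Write a temporary file, log it as an artifact, and mark the file
            # for deletion once the packaging manager finishes logging.
            with open("./some_file.txt", "w") as file:
                file.write(obj)
            artifact = Artifact(key=key)
            cls.add_future_clearing_path(path="./some_file.txt")
            return artifact, None
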