mlrun 1.6.4rc2__py3-none-any.whl → 1.7.0rc20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mlrun might be problematic. Click here for more details.

Files changed (291) hide show
  1. mlrun/__init__.py +11 -1
  2. mlrun/__main__.py +26 -112
  3. mlrun/alerts/__init__.py +15 -0
  4. mlrun/alerts/alert.py +144 -0
  5. mlrun/api/schemas/__init__.py +5 -4
  6. mlrun/artifacts/__init__.py +8 -3
  7. mlrun/artifacts/base.py +46 -257
  8. mlrun/artifacts/dataset.py +11 -192
  9. mlrun/artifacts/manager.py +47 -48
  10. mlrun/artifacts/model.py +31 -159
  11. mlrun/artifacts/plots.py +23 -380
  12. mlrun/common/constants.py +69 -0
  13. mlrun/common/db/sql_session.py +2 -3
  14. mlrun/common/formatters/__init__.py +19 -0
  15. mlrun/common/formatters/artifact.py +21 -0
  16. mlrun/common/formatters/base.py +78 -0
  17. mlrun/common/formatters/function.py +41 -0
  18. mlrun/common/formatters/pipeline.py +53 -0
  19. mlrun/common/formatters/project.py +51 -0
  20. mlrun/common/helpers.py +1 -2
  21. mlrun/common/model_monitoring/helpers.py +9 -5
  22. mlrun/{runtimes → common/runtimes}/constants.py +37 -9
  23. mlrun/common/schemas/__init__.py +24 -4
  24. mlrun/common/schemas/alert.py +203 -0
  25. mlrun/common/schemas/api_gateway.py +148 -0
  26. mlrun/common/schemas/artifact.py +18 -8
  27. mlrun/common/schemas/auth.py +11 -5
  28. mlrun/common/schemas/background_task.py +1 -1
  29. mlrun/common/schemas/client_spec.py +4 -1
  30. mlrun/common/schemas/feature_store.py +16 -16
  31. mlrun/common/schemas/frontend_spec.py +8 -7
  32. mlrun/common/schemas/function.py +5 -1
  33. mlrun/common/schemas/hub.py +11 -18
  34. mlrun/common/schemas/memory_reports.py +2 -2
  35. mlrun/common/schemas/model_monitoring/__init__.py +18 -3
  36. mlrun/common/schemas/model_monitoring/constants.py +83 -26
  37. mlrun/common/schemas/model_monitoring/grafana.py +13 -9
  38. mlrun/common/schemas/model_monitoring/model_endpoints.py +99 -16
  39. mlrun/common/schemas/notification.py +4 -4
  40. mlrun/common/schemas/object.py +2 -2
  41. mlrun/{runtimes/mpijob/v1alpha1.py → common/schemas/pagination.py} +10 -13
  42. mlrun/common/schemas/pipeline.py +1 -10
  43. mlrun/common/schemas/project.py +24 -23
  44. mlrun/common/schemas/runtime_resource.py +8 -12
  45. mlrun/common/schemas/schedule.py +3 -3
  46. mlrun/common/schemas/tag.py +1 -2
  47. mlrun/common/schemas/workflow.py +2 -2
  48. mlrun/common/types.py +7 -1
  49. mlrun/config.py +54 -17
  50. mlrun/data_types/to_pandas.py +10 -12
  51. mlrun/datastore/__init__.py +5 -8
  52. mlrun/datastore/alibaba_oss.py +130 -0
  53. mlrun/datastore/azure_blob.py +17 -5
  54. mlrun/datastore/base.py +62 -39
  55. mlrun/datastore/datastore.py +28 -9
  56. mlrun/datastore/datastore_profile.py +146 -20
  57. mlrun/datastore/filestore.py +0 -1
  58. mlrun/datastore/google_cloud_storage.py +6 -2
  59. mlrun/datastore/hdfs.py +56 -0
  60. mlrun/datastore/inmem.py +2 -2
  61. mlrun/datastore/redis.py +6 -2
  62. mlrun/datastore/s3.py +9 -0
  63. mlrun/datastore/snowflake_utils.py +43 -0
  64. mlrun/datastore/sources.py +201 -96
  65. mlrun/datastore/spark_utils.py +1 -2
  66. mlrun/datastore/store_resources.py +7 -7
  67. mlrun/datastore/targets.py +358 -104
  68. mlrun/datastore/utils.py +72 -58
  69. mlrun/datastore/v3io.py +5 -1
  70. mlrun/db/base.py +185 -35
  71. mlrun/db/factory.py +1 -1
  72. mlrun/db/httpdb.py +614 -179
  73. mlrun/db/nopdb.py +210 -26
  74. mlrun/errors.py +12 -1
  75. mlrun/execution.py +41 -24
  76. mlrun/feature_store/__init__.py +0 -2
  77. mlrun/feature_store/api.py +40 -72
  78. mlrun/feature_store/common.py +1 -1
  79. mlrun/feature_store/feature_set.py +76 -55
  80. mlrun/feature_store/feature_vector.py +28 -30
  81. mlrun/feature_store/ingestion.py +7 -6
  82. mlrun/feature_store/retrieval/base.py +16 -11
  83. mlrun/feature_store/retrieval/conversion.py +11 -13
  84. mlrun/feature_store/retrieval/dask_merger.py +2 -0
  85. mlrun/feature_store/retrieval/job.py +9 -3
  86. mlrun/feature_store/retrieval/local_merger.py +2 -0
  87. mlrun/feature_store/retrieval/spark_merger.py +34 -24
  88. mlrun/feature_store/steps.py +37 -34
  89. mlrun/features.py +9 -20
  90. mlrun/frameworks/_common/artifacts_library.py +9 -9
  91. mlrun/frameworks/_common/mlrun_interface.py +5 -5
  92. mlrun/frameworks/_common/model_handler.py +48 -48
  93. mlrun/frameworks/_common/plan.py +2 -3
  94. mlrun/frameworks/_common/producer.py +3 -4
  95. mlrun/frameworks/_common/utils.py +5 -5
  96. mlrun/frameworks/_dl_common/loggers/logger.py +6 -7
  97. mlrun/frameworks/_dl_common/loggers/mlrun_logger.py +9 -9
  98. mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +23 -47
  99. mlrun/frameworks/_ml_common/artifacts_library.py +1 -2
  100. mlrun/frameworks/_ml_common/loggers/logger.py +3 -4
  101. mlrun/frameworks/_ml_common/loggers/mlrun_logger.py +4 -5
  102. mlrun/frameworks/_ml_common/model_handler.py +24 -24
  103. mlrun/frameworks/_ml_common/pkl_model_server.py +2 -2
  104. mlrun/frameworks/_ml_common/plan.py +1 -1
  105. mlrun/frameworks/_ml_common/plans/calibration_curve_plan.py +2 -3
  106. mlrun/frameworks/_ml_common/plans/confusion_matrix_plan.py +2 -3
  107. mlrun/frameworks/_ml_common/plans/dataset_plan.py +3 -3
  108. mlrun/frameworks/_ml_common/plans/feature_importance_plan.py +3 -3
  109. mlrun/frameworks/_ml_common/plans/roc_curve_plan.py +4 -4
  110. mlrun/frameworks/_ml_common/utils.py +4 -4
  111. mlrun/frameworks/auto_mlrun/auto_mlrun.py +9 -9
  112. mlrun/frameworks/huggingface/model_server.py +4 -4
  113. mlrun/frameworks/lgbm/__init__.py +33 -33
  114. mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
  115. mlrun/frameworks/lgbm/callbacks/logging_callback.py +4 -5
  116. mlrun/frameworks/lgbm/callbacks/mlrun_logging_callback.py +4 -5
  117. mlrun/frameworks/lgbm/mlrun_interfaces/booster_mlrun_interface.py +1 -3
  118. mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +6 -6
  119. mlrun/frameworks/lgbm/model_handler.py +10 -10
  120. mlrun/frameworks/lgbm/model_server.py +6 -6
  121. mlrun/frameworks/lgbm/utils.py +5 -5
  122. mlrun/frameworks/onnx/dataset.py +8 -8
  123. mlrun/frameworks/onnx/mlrun_interface.py +3 -3
  124. mlrun/frameworks/onnx/model_handler.py +6 -6
  125. mlrun/frameworks/onnx/model_server.py +7 -7
  126. mlrun/frameworks/parallel_coordinates.py +4 -3
  127. mlrun/frameworks/pytorch/__init__.py +18 -18
  128. mlrun/frameworks/pytorch/callbacks/callback.py +4 -5
  129. mlrun/frameworks/pytorch/callbacks/logging_callback.py +17 -17
  130. mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +11 -11
  131. mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +23 -29
  132. mlrun/frameworks/pytorch/callbacks_handler.py +38 -38
  133. mlrun/frameworks/pytorch/mlrun_interface.py +20 -20
  134. mlrun/frameworks/pytorch/model_handler.py +17 -17
  135. mlrun/frameworks/pytorch/model_server.py +7 -7
  136. mlrun/frameworks/sklearn/__init__.py +13 -13
  137. mlrun/frameworks/sklearn/estimator.py +4 -4
  138. mlrun/frameworks/sklearn/metrics_library.py +14 -14
  139. mlrun/frameworks/sklearn/mlrun_interface.py +3 -6
  140. mlrun/frameworks/sklearn/model_handler.py +2 -2
  141. mlrun/frameworks/tf_keras/__init__.py +10 -7
  142. mlrun/frameworks/tf_keras/callbacks/logging_callback.py +15 -15
  143. mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +11 -11
  144. mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +19 -23
  145. mlrun/frameworks/tf_keras/mlrun_interface.py +9 -11
  146. mlrun/frameworks/tf_keras/model_handler.py +14 -14
  147. mlrun/frameworks/tf_keras/model_server.py +6 -6
  148. mlrun/frameworks/xgboost/__init__.py +13 -13
  149. mlrun/frameworks/xgboost/model_handler.py +6 -6
  150. mlrun/k8s_utils.py +14 -16
  151. mlrun/launcher/__init__.py +1 -1
  152. mlrun/launcher/base.py +16 -15
  153. mlrun/launcher/client.py +8 -6
  154. mlrun/launcher/factory.py +1 -1
  155. mlrun/launcher/local.py +17 -11
  156. mlrun/launcher/remote.py +16 -10
  157. mlrun/lists.py +7 -6
  158. mlrun/model.py +238 -73
  159. mlrun/model_monitoring/__init__.py +1 -1
  160. mlrun/model_monitoring/api.py +138 -315
  161. mlrun/model_monitoring/application.py +5 -296
  162. mlrun/model_monitoring/applications/__init__.py +24 -0
  163. mlrun/model_monitoring/applications/_application_steps.py +157 -0
  164. mlrun/model_monitoring/applications/base.py +282 -0
  165. mlrun/model_monitoring/applications/context.py +214 -0
  166. mlrun/model_monitoring/applications/evidently_base.py +211 -0
  167. mlrun/model_monitoring/applications/histogram_data_drift.py +349 -0
  168. mlrun/model_monitoring/applications/results.py +99 -0
  169. mlrun/model_monitoring/controller.py +104 -84
  170. mlrun/model_monitoring/controller_handler.py +13 -5
  171. mlrun/model_monitoring/db/__init__.py +18 -0
  172. mlrun/model_monitoring/{stores → db/stores}/__init__.py +43 -36
  173. mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
  174. mlrun/model_monitoring/{stores/model_endpoint_store.py → db/stores/base/store.py} +64 -40
  175. mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
  176. mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +71 -0
  177. mlrun/model_monitoring/{stores → db/stores/sqldb}/models/base.py +109 -5
  178. mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +88 -0
  179. mlrun/model_monitoring/{stores/models/mysql.py → db/stores/sqldb/models/sqlite.py} +19 -13
  180. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +684 -0
  181. mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
  182. mlrun/model_monitoring/{stores/kv_model_endpoint_store.py → db/stores/v3io_kv/kv_store.py} +310 -165
  183. mlrun/model_monitoring/db/tsdb/__init__.py +100 -0
  184. mlrun/model_monitoring/db/tsdb/base.py +329 -0
  185. mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
  186. mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
  187. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +240 -0
  188. mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +45 -0
  189. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +397 -0
  190. mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
  191. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +117 -0
  192. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +630 -0
  193. mlrun/model_monitoring/evidently_application.py +6 -118
  194. mlrun/model_monitoring/features_drift_table.py +134 -106
  195. mlrun/model_monitoring/helpers.py +127 -28
  196. mlrun/model_monitoring/metrics/__init__.py +13 -0
  197. mlrun/model_monitoring/metrics/histogram_distance.py +127 -0
  198. mlrun/model_monitoring/model_endpoint.py +3 -2
  199. mlrun/model_monitoring/prometheus.py +1 -4
  200. mlrun/model_monitoring/stream_processing.py +62 -231
  201. mlrun/model_monitoring/tracking_policy.py +9 -2
  202. mlrun/model_monitoring/writer.py +152 -124
  203. mlrun/package/__init__.py +6 -6
  204. mlrun/package/context_handler.py +5 -5
  205. mlrun/package/packager.py +7 -7
  206. mlrun/package/packagers/default_packager.py +6 -6
  207. mlrun/package/packagers/numpy_packagers.py +15 -15
  208. mlrun/package/packagers/pandas_packagers.py +5 -5
  209. mlrun/package/packagers/python_standard_library_packagers.py +10 -10
  210. mlrun/package/packagers_manager.py +19 -23
  211. mlrun/package/utils/_formatter.py +6 -6
  212. mlrun/package/utils/_pickler.py +2 -2
  213. mlrun/package/utils/_supported_format.py +4 -4
  214. mlrun/package/utils/log_hint_utils.py +2 -2
  215. mlrun/package/utils/type_hint_utils.py +4 -9
  216. mlrun/platforms/__init__.py +11 -10
  217. mlrun/platforms/iguazio.py +24 -203
  218. mlrun/projects/operations.py +35 -21
  219. mlrun/projects/pipelines.py +68 -99
  220. mlrun/projects/project.py +830 -266
  221. mlrun/render.py +3 -11
  222. mlrun/run.py +162 -166
  223. mlrun/runtimes/__init__.py +62 -7
  224. mlrun/runtimes/base.py +39 -32
  225. mlrun/runtimes/daskjob.py +8 -8
  226. mlrun/runtimes/databricks_job/databricks_cancel_task.py +1 -1
  227. mlrun/runtimes/databricks_job/databricks_runtime.py +7 -7
  228. mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
  229. mlrun/runtimes/funcdoc.py +0 -28
  230. mlrun/runtimes/function_reference.py +1 -1
  231. mlrun/runtimes/kubejob.py +28 -122
  232. mlrun/runtimes/local.py +6 -3
  233. mlrun/runtimes/mpijob/__init__.py +0 -20
  234. mlrun/runtimes/mpijob/abstract.py +9 -10
  235. mlrun/runtimes/mpijob/v1.py +1 -1
  236. mlrun/{model_monitoring/stores/models/sqlite.py → runtimes/nuclio/__init__.py} +7 -9
  237. mlrun/runtimes/nuclio/api_gateway.py +709 -0
  238. mlrun/runtimes/nuclio/application/__init__.py +15 -0
  239. mlrun/runtimes/nuclio/application/application.py +523 -0
  240. mlrun/runtimes/nuclio/application/reverse_proxy.go +95 -0
  241. mlrun/runtimes/{function.py → nuclio/function.py} +112 -73
  242. mlrun/runtimes/{nuclio.py → nuclio/nuclio.py} +6 -6
  243. mlrun/runtimes/{serving.py → nuclio/serving.py} +45 -51
  244. mlrun/runtimes/pod.py +286 -88
  245. mlrun/runtimes/remotesparkjob.py +2 -2
  246. mlrun/runtimes/sparkjob/spark3job.py +51 -34
  247. mlrun/runtimes/utils.py +7 -75
  248. mlrun/secrets.py +9 -5
  249. mlrun/serving/remote.py +2 -7
  250. mlrun/serving/routers.py +13 -10
  251. mlrun/serving/server.py +22 -26
  252. mlrun/serving/states.py +99 -25
  253. mlrun/serving/utils.py +3 -3
  254. mlrun/serving/v1_serving.py +6 -7
  255. mlrun/serving/v2_serving.py +59 -20
  256. mlrun/track/tracker.py +2 -1
  257. mlrun/track/tracker_manager.py +3 -3
  258. mlrun/track/trackers/mlflow_tracker.py +1 -2
  259. mlrun/utils/async_http.py +5 -7
  260. mlrun/utils/azure_vault.py +1 -1
  261. mlrun/utils/clones.py +1 -2
  262. mlrun/utils/condition_evaluator.py +3 -3
  263. mlrun/utils/db.py +3 -3
  264. mlrun/utils/helpers.py +183 -197
  265. mlrun/utils/http.py +2 -5
  266. mlrun/utils/logger.py +76 -14
  267. mlrun/utils/notifications/notification/__init__.py +17 -12
  268. mlrun/utils/notifications/notification/base.py +14 -2
  269. mlrun/utils/notifications/notification/console.py +2 -0
  270. mlrun/utils/notifications/notification/git.py +3 -1
  271. mlrun/utils/notifications/notification/ipython.py +3 -1
  272. mlrun/utils/notifications/notification/slack.py +101 -21
  273. mlrun/utils/notifications/notification/webhook.py +11 -1
  274. mlrun/utils/notifications/notification_pusher.py +155 -30
  275. mlrun/utils/retryer.py +208 -0
  276. mlrun/utils/singleton.py +1 -1
  277. mlrun/utils/v3io_clients.py +2 -4
  278. mlrun/utils/version/version.json +2 -2
  279. mlrun/utils/version/version.py +2 -6
  280. {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/METADATA +31 -19
  281. mlrun-1.7.0rc20.dist-info/RECORD +353 -0
  282. mlrun/kfpops.py +0 -868
  283. mlrun/model_monitoring/batch.py +0 -1095
  284. mlrun/model_monitoring/stores/models/__init__.py +0 -27
  285. mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -384
  286. mlrun/platforms/other.py +0 -306
  287. mlrun-1.6.4rc2.dist-info/RECORD +0 -314
  288. {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/LICENSE +0 -0
  289. {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/WHEEL +0 -0
  290. {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/entry_points.txt +0 -0
  291. {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/top_level.txt +0 -0
@@ -26,10 +26,10 @@ import mlrun.config
26
26
  import mlrun.datastore.targets
27
27
  import mlrun.feature_store as fstore
28
28
  import mlrun.feature_store.steps
29
+ import mlrun.model_monitoring.db
29
30
  import mlrun.model_monitoring.prometheus
30
31
  import mlrun.serving.states
31
32
  import mlrun.utils
32
- import mlrun.utils.v3io_clients
33
33
  from mlrun.common.schemas.model_monitoring.constants import (
34
34
  EventFieldType,
35
35
  EventKeyMetrics,
@@ -37,6 +37,7 @@ from mlrun.common.schemas.model_monitoring.constants import (
37
37
  FileTargetKind,
38
38
  ModelEndpointTarget,
39
39
  ProjectSecretKeys,
40
+ PrometheusEndpoints,
40
41
  )
41
42
  from mlrun.utils import logger
42
43
 
@@ -76,6 +77,7 @@ class EventStreamProcessor:
76
77
  )
77
78
 
78
79
  self.storage_options = None
80
+ self.tsdb_configurations = {}
79
81
  if not mlrun.mlconf.is_ce_mode():
80
82
  self._initialize_v3io_configurations(
81
83
  model_monitoring_access_key=model_monitoring_access_key
@@ -134,33 +136,38 @@ class EventStreamProcessor:
134
136
  self.tsdb_batching_max_events = tsdb_batching_max_events
135
137
  self.tsdb_batching_timeout_secs = tsdb_batching_timeout_secs
136
138
 
137
- def apply_monitoring_serving_graph(self, fn: mlrun.runtimes.ServingRuntime) -> None:
139
+ def apply_monitoring_serving_graph(
140
+ self,
141
+ fn: mlrun.runtimes.ServingRuntime,
142
+ tsdb_service_provider: typing.Optional[typing.Callable] = None,
143
+ ) -> None:
138
144
  """
139
- Apply monitoring serving graph to a given serving function. The following serving graph includes about 20 steps
140
- of different operations that are executed on the events from the model server. Each event has
141
- metadata (function_uri, timestamp, class, etc.) but also inputs and predictions from the model server.
142
- Throughout the serving graph, the results are written to 3 different databases:
143
- 1. KV/SQL (steps 9-11): Stores metadata and stats about the average latency and the amount of predictions over
144
- time per endpoint. for example the amount of predictions of endpoint x in the last 5 min. This data is used
145
- by the monitoring dashboards in grafana. The model endpoints table also contains data on the model endpoint
146
- from other processes, such as current_stats that is being calculated by the monitoring batch job
147
- process. If the target is from type KV, then the model endpoints table can be found under
148
- v3io:///users/pipelines/project-name/model-endpoints/endpoints/. If the target is SQL, then the table
149
- is stored within the database that was defined in the provided connection string and can be found
150
- under mlrun.mlconf.model_endpoint_monitoring.endpoint_store_connection.
151
- 2. V3IO TSDB/Prometheus (steps 13-21): Stores live data of different key metric dictionaries in tsdb target.
152
- This data is being used by the monitoring dashboards in grafana. If using V3IO TSDB (steps 13-19), results
145
+ Apply monitoring serving graph to a given serving function. The following serving graph includes about 4 main
146
+ parts, each of which includes several steps of different operations that are executed on the events from
147
+ the model server.
148
+ Each event has metadata (function_uri, timestamp, class, etc.) but also inputs, predictions and optional
149
+ metrics from the model server.
150
+ In the first part, the serving graph processes the event and splits it into sub-events. This part also includes
151
+ validation of the event data and adding important details to the event such as endpoint_id.
152
+ In the next parts, the serving graph stores data to 3 different targets:
153
+ 1. KV/SQL: Metadata and basic stats about the average latency and the amount of predictions over
154
+ time per endpoint. For example, the amount of predictions of endpoint x in the last 5 min. The model
155
+ endpoints table also contains data on the model endpoint from other processes, such as feature_stats that
156
+ represents sample statistics from the training data. If the target is from type KV, then the model endpoints
157
+ table can be found under v3io:///users/pipelines/project-name/model-endpoints/endpoints/. If the target is
158
+ SQL, then the table is stored within the database that was defined in the provided connection string.
159
+ 2. TSDB: live data of different key metric dictionaries in tsdb target.
160
+ This data is being used by the monitoring dashboards in grafana. If using V3IO TSDB, results
153
161
  can be found under v3io:///users/pipelines/project-name/model-endpoints/events/. In that case, we generate
154
162
  3 different key metric dictionaries: base_metrics (average latency and predictions over time),
155
163
  endpoint_features (Prediction and feature names and values), and custom_metrics (user-defined metrics).
156
- If using Prometheus (steps 20-21), we update metrics in the Prometheus registry that is stored in the
157
- monitoring stream local memory.
158
- 3. Parquet (steps 22-23): This Parquet file includes the required data for the model monitoring batch job
159
- that run every hour by default. If defined, the parquet target path can be found under
160
- mlrun.mlconf.model_endpoint_monitoring.offline. Otherwise, the default parquet path is under
161
- mlrun.mlconf.model_endpoint_monitoring.user_space.
164
+ 3. Parquet: This Parquet file includes the required data for the model monitoring applications. If defined,
165
+ the parquet target path can be found under mlrun.mlconf.model_endpoint_monitoring.offline. Otherwise,
166
+ the default parquet path is under mlrun.mlconf.model_endpoint_monitoring.user_space. Note that if you are
167
+ using CE, the parquet target path is based on the defined MLRun artifact path.
162
168
 
163
169
  :param fn: A serving function.
170
+ :param tsdb_service_provider: An optional callable function that provides the TSDB connection string.
164
171
  """
165
172
 
166
173
  graph = typing.cast(
@@ -168,7 +175,7 @@ class EventStreamProcessor:
168
175
  fn.set_topology(mlrun.serving.states.StepKinds.flow),
169
176
  )
170
177
 
171
- # Step 1 - Event routing based on the provided path
178
+ # Event routing based on the provided path
172
179
  def apply_event_routing():
173
180
  typing.cast(
174
181
  mlrun.serving.TaskStep,
@@ -181,20 +188,20 @@ class EventStreamProcessor:
181
188
 
182
189
  apply_event_routing()
183
190
 
184
- # Step 2 - Filter out events with '-' in the path basename from going forward
191
+ # Filter out events whose path is one of the Prometheus endpoints from going forward
185
192
  # through the next steps of the stream graph
186
193
  def apply_storey_filter_stream_events():
187
- # Remove none values from each event
194
+ # Filter events with Prometheus endpoints path
188
195
  graph.add_step(
189
196
  "storey.Filter",
190
197
  "filter_stream_event",
191
- _fn="('-' not in event.path.split('/')[-1])",
198
+ _fn=f"(event.path not in {PrometheusEndpoints.list()})",
192
199
  full_event=True,
193
200
  )
194
201
 
195
202
  apply_storey_filter_stream_events()
196
203
 
197
- # Step 3 - Process endpoint event: splitting into sub-events and validate event data
204
+ # Process endpoint event: splitting into sub-events and validate event data
198
205
  def apply_process_endpoint_event():
199
206
  graph.add_step(
200
207
  "ProcessEndpointEvent",
@@ -205,7 +212,7 @@ class EventStreamProcessor:
205
212
 
206
213
  apply_process_endpoint_event()
207
214
 
208
- # Steps 4,5 - Applying Storey operations of filtering and flatten
215
+ # Applying Storey operations of filtering and flatten
209
216
  def apply_storey_filter_and_flatmap():
210
217
  # Remove none values from each event
211
218
  graph.add_step(
@@ -222,7 +229,7 @@ class EventStreamProcessor:
222
229
 
223
230
  apply_storey_filter_and_flatmap()
224
231
 
225
- # Step 6 - Validating feature names and map each feature to its value
232
+ # Validating feature names and map each feature to its value
226
233
  def apply_map_feature_names():
227
234
  graph.add_step(
228
235
  "MapFeatureNames",
@@ -234,9 +241,9 @@ class EventStreamProcessor:
234
241
 
235
242
  apply_map_feature_names()
236
243
 
237
- # Step 7 - Calculate number of predictions and average latency
244
+ # Calculate number of predictions and average latency
238
245
  def apply_storey_aggregations():
239
- # Step 7.1 - Calculate number of predictions for each window (5 min and 1 hour by default)
246
+ # Calculate number of predictions for each window (5 min and 1 hour by default)
240
247
  graph.add_step(
241
248
  class_name="storey.AggregateByKey",
242
249
  aggregates=[
@@ -254,7 +261,7 @@ class EventStreamProcessor:
254
261
  table=".",
255
262
  key_field=EventFieldType.ENDPOINT_ID,
256
263
  )
257
- # Step 7.2 - Calculate average latency time for each window (5 min and 1 hour by default)
264
+ # Calculate average latency time for each window (5 min and 1 hour by default)
258
265
  graph.add_step(
259
266
  class_name="storey.Rename",
260
267
  mapping={
@@ -267,8 +274,8 @@ class EventStreamProcessor:
267
274
 
268
275
  apply_storey_aggregations()
269
276
 
270
- # Steps 8-10 - KV/SQL branch
271
- # Step 8 - Filter relevant keys from the event before writing the data into the database table
277
+ # KV/SQL branch
278
+ # Filter relevant keys from the event before writing the data into the database table
272
279
  def apply_process_before_endpoint_update():
273
280
  graph.add_step(
274
281
  "ProcessBeforeEndpointUpdate",
@@ -278,7 +285,7 @@ class EventStreamProcessor:
278
285
 
279
286
  apply_process_before_endpoint_update()
280
287
 
281
- # Step 9 - Write the filtered event to KV/SQL table. At this point, the serving graph updates the stats
288
+ # Write the filtered event to KV/SQL table. At this point, the serving graph updates the stats
282
289
  # about average latency and the amount of predictions over time
283
290
  def apply_update_endpoint():
284
291
  graph.add_step(
@@ -291,7 +298,7 @@ class EventStreamProcessor:
291
298
 
292
299
  apply_update_endpoint()
293
300
 
294
- # Step 10 (only for KV target) - Apply infer_schema on the model endpoints table for generating schema file
301
+ # (only for V3IO KV target) - Apply infer_schema on the model endpoints table for generating schema file
295
302
  # which will be used by Grafana monitoring dashboards
296
303
  def apply_infer_schema():
297
304
  graph.add_step(
@@ -306,7 +313,7 @@ class EventStreamProcessor:
306
313
  if self.model_endpoint_store_target == ModelEndpointTarget.V3IO_NOSQL:
307
314
  apply_infer_schema()
308
315
 
309
- # Step 11 - Emits the event in window size of events based on sample_window size (10 by default)
316
+ # Emits the event in window size of events based on sample_window size (10 by default)
310
317
  def apply_storey_sample_window():
311
318
  graph.add_step(
312
319
  "storey.steps.SampleWindow",
@@ -318,84 +325,16 @@ class EventStreamProcessor:
318
325
 
319
326
  apply_storey_sample_window()
320
327
 
321
- # Steps 12-19 - TSDB branch (skip to Prometheus if in CE env)
322
- # Steps 20-21 - Prometheus branch
328
+ # TSDB branch (skip to Prometheus if in CE env)
323
329
  if not mlrun.mlconf.is_ce_mode():
324
- # TSDB branch
325
-
326
- # Step 12 - Before writing data to TSDB, create dictionary of 2-3 dictionaries that contains
327
- # stats and details about the events
328
- def apply_process_before_tsdb():
329
- graph.add_step(
330
- "ProcessBeforeTSDB", name="ProcessBeforeTSDB", after="sample"
331
- )
332
-
333
- apply_process_before_tsdb()
334
-
335
- # Steps 13-19: - Unpacked keys from each dictionary and write to TSDB target
336
- def apply_filter_and_unpacked_keys(name, keys):
337
- graph.add_step(
338
- "FilterAndUnpackKeys",
339
- name=name,
340
- after="ProcessBeforeTSDB",
341
- keys=[keys],
342
- )
343
-
344
- def apply_tsdb_target(name, after):
345
- graph.add_step(
346
- "storey.TSDBTarget",
347
- name=name,
348
- after=after,
349
- path=self.tsdb_path,
350
- rate="10/m",
351
- time_col=EventFieldType.TIMESTAMP,
352
- container=self.tsdb_container,
353
- v3io_frames=self.v3io_framesd,
354
- infer_columns_from_data=True,
355
- index_cols=[
356
- EventFieldType.ENDPOINT_ID,
357
- EventFieldType.RECORD_TYPE,
358
- EventFieldType.ENDPOINT_TYPE,
359
- ],
360
- max_events=self.tsdb_batching_max_events,
361
- flush_after_seconds=self.tsdb_batching_timeout_secs,
362
- key=EventFieldType.ENDPOINT_ID,
363
- )
364
-
365
- # Steps 13-14 - unpacked base_metrics dictionary
366
- apply_filter_and_unpacked_keys(
367
- name="FilterAndUnpackKeys1",
368
- keys=EventKeyMetrics.BASE_METRICS,
369
- )
370
- apply_tsdb_target(name="tsdb1", after="FilterAndUnpackKeys1")
371
-
372
- # Steps 15-16 - unpacked endpoint_features dictionary
373
- apply_filter_and_unpacked_keys(
374
- name="FilterAndUnpackKeys2",
375
- keys=EventKeyMetrics.ENDPOINT_FEATURES,
330
+ tsdb_connector = mlrun.model_monitoring.get_tsdb_connector(
331
+ project=self.project, secret_provider=tsdb_service_provider
376
332
  )
377
- apply_tsdb_target(name="tsdb2", after="FilterAndUnpackKeys2")
333
+ tsdb_connector.apply_monitoring_stream_steps(graph=graph)
378
334
 
379
- # Steps 17-19 - unpacked custom_metrics dictionary. In addition, use storey.Filter remove none values
380
- apply_filter_and_unpacked_keys(
381
- name="FilterAndUnpackKeys3",
382
- keys=EventKeyMetrics.CUSTOM_METRICS,
383
- )
384
-
385
- def apply_storey_filter():
386
- graph.add_step(
387
- "storey.Filter",
388
- "FilterNotNone",
389
- after="FilterAndUnpackKeys3",
390
- _fn="(event is not None)",
391
- )
392
-
393
- apply_storey_filter()
394
- apply_tsdb_target(name="tsdb3", after="FilterNotNone")
395
335
  else:
396
- # Prometheus branch
397
-
398
- # Step 20 - Increase the prediction counter by 1 and update the latency value
336
+ # Prometheus
337
+ # Increase the prediction counter by 1 and update the latency value
399
338
  graph.add_step(
400
339
  "IncCounter",
401
340
  name="IncCounter",
@@ -403,7 +342,7 @@ class EventStreamProcessor:
403
342
  project=self.project,
404
343
  )
405
344
 
406
- # Step 21 - Record a sample of features and labels
345
+ # Record a sample of features and labels
407
346
  def apply_record_features_to_prometheus():
408
347
  graph.add_step(
409
348
  "RecordFeatures",
@@ -414,8 +353,8 @@ class EventStreamProcessor:
414
353
 
415
354
  apply_record_features_to_prometheus()
416
355
 
417
- # Steps 22-23 - Parquet branch
418
- # Step 22 - Filter and validate different keys before writing the data to Parquet target
356
+ # Parquet branch
357
+ # Filter and validate different keys before writing the data to Parquet target
419
358
  def apply_process_before_parquet():
420
359
  graph.add_step(
421
360
  "ProcessBeforeParquet",
@@ -426,7 +365,7 @@ class EventStreamProcessor:
426
365
 
427
366
  apply_process_before_parquet()
428
367
 
429
- # Step 23 - Write the Parquet target file, partitioned by key (endpoint_id) and time.
368
+ # Write the Parquet target file, partitioned by key (endpoint_id) and time.
430
369
  def apply_parquet_target():
431
370
  graph.add_step(
432
371
  "storey.ParquetTarget",
@@ -500,76 +439,6 @@ class ProcessBeforeEndpointUpdate(mlrun.feature_store.steps.MapClass):
500
439
  return e
501
440
 
502
441
 
503
- class ProcessBeforeTSDB(mlrun.feature_store.steps.MapClass):
504
- def __init__(self, **kwargs):
505
- """
506
- Process the data before writing to TSDB. This step creates a dictionary that includes 3 different dictionaries
507
- that each one of them contains important details and stats about the events:
508
- 1. base_metrics: stats about the average latency and the amount of predictions over time. It is based on
509
- storey.AggregateByKey which was executed in step 5.
510
- 2. endpoint_features: feature names and values along with the prediction names and value.
511
- 3. custom_metric (opt): optional metrics provided by the user.
512
-
513
- :returns: Dictionary of 2-3 dictionaries that contains stats and details about the events.
514
-
515
- """
516
- super().__init__(**kwargs)
517
-
518
- def do(self, event):
519
- # Compute prediction per second
520
- event[EventLiveStats.PREDICTIONS_PER_SECOND] = (
521
- float(event[EventLiveStats.PREDICTIONS_COUNT_5M]) / 300
522
- )
523
- base_fields = [
524
- EventFieldType.TIMESTAMP,
525
- EventFieldType.ENDPOINT_ID,
526
- EventFieldType.ENDPOINT_TYPE,
527
- ]
528
-
529
- # Getting event timestamp and endpoint_id
530
- base_event = {k: event[k] for k in base_fields}
531
-
532
- # base_metrics includes the stats about the average latency and the amount of predictions over time
533
- base_metrics = {
534
- EventFieldType.RECORD_TYPE: EventKeyMetrics.BASE_METRICS,
535
- EventLiveStats.PREDICTIONS_PER_SECOND: event[
536
- EventLiveStats.PREDICTIONS_PER_SECOND
537
- ],
538
- EventLiveStats.PREDICTIONS_COUNT_5M: event[
539
- EventLiveStats.PREDICTIONS_COUNT_5M
540
- ],
541
- EventLiveStats.PREDICTIONS_COUNT_1H: event[
542
- EventLiveStats.PREDICTIONS_COUNT_1H
543
- ],
544
- EventLiveStats.LATENCY_AVG_5M: event[EventLiveStats.LATENCY_AVG_5M],
545
- EventLiveStats.LATENCY_AVG_1H: event[EventLiveStats.LATENCY_AVG_1H],
546
- **base_event,
547
- }
548
-
549
- # endpoint_features includes the event values of each feature and prediction
550
- endpoint_features = {
551
- EventFieldType.RECORD_TYPE: EventKeyMetrics.ENDPOINT_FEATURES,
552
- **event[EventFieldType.NAMED_PREDICTIONS],
553
- **event[EventFieldType.NAMED_FEATURES],
554
- **base_event,
555
- }
556
- # Create a dictionary that includes both base_metrics and endpoint_features
557
- processed = {
558
- EventKeyMetrics.BASE_METRICS: base_metrics,
559
- EventKeyMetrics.ENDPOINT_FEATURES: endpoint_features,
560
- }
561
-
562
- # If metrics provided, add another dictionary if custom_metrics values
563
- if event[EventFieldType.METRICS]:
564
- processed[EventKeyMetrics.CUSTOM_METRICS] = {
565
- EventFieldType.RECORD_TYPE: EventKeyMetrics.CUSTOM_METRICS,
566
- **event[EventFieldType.METRICS],
567
- **base_event,
568
- }
569
-
570
- return processed
571
-
572
-
573
442
  class ProcessBeforeParquet(mlrun.feature_store.steps.MapClass):
574
443
  def __init__(self, **kwargs):
575
444
  """
@@ -804,7 +673,7 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
804
673
  # left them
805
674
  if endpoint_id not in self.endpoints:
806
675
  logger.info("Trying to resume state", endpoint_id=endpoint_id)
807
- endpoint_record = get_endpoint_record(
676
+ endpoint_record = mlrun.model_monitoring.helpers.get_endpoint_record(
808
677
  project=self.project,
809
678
  endpoint_id=endpoint_id,
810
679
  )
@@ -850,36 +719,6 @@ def is_not_none(field: typing.Any, dict_path: list[str]):
850
719
  return False
851
720
 
852
721
 
853
- class FilterAndUnpackKeys(mlrun.feature_store.steps.MapClass):
854
- def __init__(self, keys, **kwargs):
855
- """
856
- Create unpacked event dictionary based on provided key metrics (base_metrics, endpoint_features,
857
- or custom_metric). Please note that the next step of the TSDB target requires an unpacked dictionary.
858
-
859
- :param keys: list of key metrics.
860
-
861
- :returns: An unpacked dictionary of event filtered by the provided key metrics.
862
- """
863
- super().__init__(**kwargs)
864
- self.keys = keys
865
-
866
- def do(self, event):
867
- # Keep only the relevant dictionary based on the provided keys
868
- new_event = {}
869
- for key in self.keys:
870
- if key in event:
871
- new_event[key] = event[key]
872
-
873
- # Create unpacked dictionary
874
- unpacked = {}
875
- for key in new_event.keys():
876
- if key in self.keys:
877
- unpacked = {**unpacked, **new_event[key]}
878
- else:
879
- unpacked[key] = new_event[key]
880
- return unpacked if unpacked else None
881
-
882
-
883
722
  class MapFeatureNames(mlrun.feature_store.steps.MapClass):
884
723
  def __init__(
885
724
  self,
@@ -937,7 +776,7 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
937
776
  label_values = event[EventFieldType.PREDICTION]
938
777
  # Get feature names and label columns
939
778
  if endpoint_id not in self.feature_names:
940
- endpoint_record = get_endpoint_record(
779
+ endpoint_record = mlrun.model_monitoring.helpers.get_endpoint_record(
941
780
  project=self.project,
942
781
  endpoint_id=endpoint_id,
943
782
  )
@@ -1078,9 +917,6 @@ class UpdateEndpoint(mlrun.feature_store.steps.MapClass):
1078
917
  self.model_endpoint_store_target = model_endpoint_store_target
1079
918
 
1080
919
  def do(self, event: dict):
1081
- # Remove labels from the event
1082
- event.pop(EventFieldType.LABELS)
1083
-
1084
920
  update_endpoint_record(
1085
921
  project=self.project,
1086
922
  endpoint_id=event.pop(EventFieldType.ENDPOINT_ID),
@@ -1118,6 +954,8 @@ class InferSchema(mlrun.feature_store.steps.MapClass):
1118
954
  def do(self, event: dict):
1119
955
  key_set = set(event.keys())
1120
956
  if not key_set.issubset(self.keys):
957
+ import mlrun.utils.v3io_clients
958
+
1121
959
  self.keys.update(key_set)
1122
960
  # Apply infer_schema on the kv table for generating the schema file
1123
961
  mlrun.utils.v3io_clients.get_frames_client(
@@ -1153,10 +991,10 @@ class EventRouting(mlrun.feature_store.steps.MapClass):
1153
991
  self.project: str = project
1154
992
 
1155
993
  def do(self, event):
1156
- if event.path == "/model-monitoring-metrics":
994
+ if event.path == PrometheusEndpoints.MODEL_MONITORING_METRICS:
1157
995
  # Return a parsed Prometheus registry file
1158
996
  event.body = mlrun.model_monitoring.prometheus.get_registry()
1159
- elif event.path == "/monitoring-batch-metrics":
997
+ elif event.path == PrometheusEndpoints.MONITORING_BATCH_METRICS:
1160
998
  # Update statistical metrics
1161
999
  for event_metric in event.body:
1162
1000
  mlrun.model_monitoring.prometheus.write_drift_metrics(
@@ -1165,7 +1003,7 @@ class EventRouting(mlrun.feature_store.steps.MapClass):
1165
1003
  metric=event_metric[EventFieldType.METRIC],
1166
1004
  value=event_metric[EventFieldType.VALUE],
1167
1005
  )
1168
- elif event.path == "/monitoring-drift-status":
1006
+ elif event.path == PrometheusEndpoints.MONITORING_DRIFT_STATUS:
1169
1007
  # Update drift status
1170
1008
  mlrun.model_monitoring.prometheus.write_drift_status(
1171
1009
  project=self.project,
@@ -1225,7 +1063,7 @@ def update_endpoint_record(
1225
1063
  endpoint_id: str,
1226
1064
  attributes: dict,
1227
1065
  ):
1228
- model_endpoint_store = mlrun.model_monitoring.get_model_endpoint_store(
1066
+ model_endpoint_store = mlrun.model_monitoring.get_store_object(
1229
1067
  project=project,
1230
1068
  )
1231
1069
 
@@ -1234,13 +1072,6 @@ def update_endpoint_record(
1234
1072
  )
1235
1073
 
1236
1074
 
1237
- def get_endpoint_record(project: str, endpoint_id: str):
1238
- model_endpoint_store = mlrun.model_monitoring.get_model_endpoint_store(
1239
- project=project,
1240
- )
1241
- return model_endpoint_store.get_model_endpoint(endpoint_id=endpoint_id)
1242
-
1243
-
1244
1075
  def update_monitoring_feature_set(
1245
1076
  endpoint_record: dict[str, typing.Any],
1246
1077
  feature_names: list[str],
@@ -11,8 +11,8 @@
11
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
- #
15
14
 
15
+ import warnings
16
16
  from typing import Union
17
17
 
18
18
  import mlrun.common.schemas.schedule
@@ -55,6 +55,12 @@ class TrackingPolicy(mlrun.model.ModelObj):
55
55
  writer function, which is a real time nuclio functino, will be deployed
56
56
  with the same image. By default, the image is mlrun/mlrun.
57
57
  """
58
+ warnings.warn(
59
+ "The `TrackingPolicy` class is deprecated from version 1.7.0 and is not "
60
+ "used anymore. It will be removed in 1.9.0.",
61
+ FutureWarning,
62
+ )
63
+
58
64
  if isinstance(default_batch_intervals, str):
59
65
  default_batch_intervals = (
60
66
  mlrun.common.schemas.schedule.ScheduleCronTrigger.from_crontab(
@@ -96,12 +102,13 @@ class TrackingPolicy(mlrun.model.ModelObj):
96
102
  )
97
103
  return new_obj
98
104
 
99
- def to_dict(self, fields=None, exclude=None):
105
+ def to_dict(self, fields: list = None, exclude: list = None, strip: bool = False):
100
106
  struct = super().to_dict(
101
107
  fields,
102
108
  exclude=[
103
109
  mlrun.common.schemas.model_monitoring.EventFieldType.DEFAULT_BATCH_INTERVALS
104
110
  ],
111
+ strip=strip,
105
112
  )
106
113
  if self.default_batch_intervals:
107
114
  struct[