mlrun 1.6.4rc7__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mlrun might be problematic. Click here for more details.

Files changed (305) hide show
  1. mlrun/__init__.py +11 -1
  2. mlrun/__main__.py +40 -122
  3. mlrun/alerts/__init__.py +15 -0
  4. mlrun/alerts/alert.py +248 -0
  5. mlrun/api/schemas/__init__.py +5 -4
  6. mlrun/artifacts/__init__.py +8 -3
  7. mlrun/artifacts/base.py +47 -257
  8. mlrun/artifacts/dataset.py +11 -192
  9. mlrun/artifacts/manager.py +79 -47
  10. mlrun/artifacts/model.py +31 -159
  11. mlrun/artifacts/plots.py +23 -380
  12. mlrun/common/constants.py +74 -1
  13. mlrun/common/db/sql_session.py +5 -5
  14. mlrun/common/formatters/__init__.py +21 -0
  15. mlrun/common/formatters/artifact.py +45 -0
  16. mlrun/common/formatters/base.py +113 -0
  17. mlrun/common/formatters/feature_set.py +33 -0
  18. mlrun/common/formatters/function.py +46 -0
  19. mlrun/common/formatters/pipeline.py +53 -0
  20. mlrun/common/formatters/project.py +51 -0
  21. mlrun/common/formatters/run.py +29 -0
  22. mlrun/common/helpers.py +12 -3
  23. mlrun/common/model_monitoring/helpers.py +9 -5
  24. mlrun/{runtimes → common/runtimes}/constants.py +37 -9
  25. mlrun/common/schemas/__init__.py +31 -5
  26. mlrun/common/schemas/alert.py +202 -0
  27. mlrun/common/schemas/api_gateway.py +196 -0
  28. mlrun/common/schemas/artifact.py +25 -4
  29. mlrun/common/schemas/auth.py +16 -5
  30. mlrun/common/schemas/background_task.py +1 -1
  31. mlrun/common/schemas/client_spec.py +4 -2
  32. mlrun/common/schemas/common.py +7 -4
  33. mlrun/common/schemas/constants.py +3 -0
  34. mlrun/common/schemas/feature_store.py +74 -44
  35. mlrun/common/schemas/frontend_spec.py +15 -7
  36. mlrun/common/schemas/function.py +12 -1
  37. mlrun/common/schemas/hub.py +11 -18
  38. mlrun/common/schemas/memory_reports.py +2 -2
  39. mlrun/common/schemas/model_monitoring/__init__.py +20 -4
  40. mlrun/common/schemas/model_monitoring/constants.py +123 -42
  41. mlrun/common/schemas/model_monitoring/grafana.py +13 -9
  42. mlrun/common/schemas/model_monitoring/model_endpoints.py +101 -54
  43. mlrun/common/schemas/notification.py +71 -14
  44. mlrun/common/schemas/object.py +2 -2
  45. mlrun/{model_monitoring/controller_handler.py → common/schemas/pagination.py} +9 -12
  46. mlrun/common/schemas/pipeline.py +8 -1
  47. mlrun/common/schemas/project.py +69 -18
  48. mlrun/common/schemas/runs.py +7 -1
  49. mlrun/common/schemas/runtime_resource.py +8 -12
  50. mlrun/common/schemas/schedule.py +4 -4
  51. mlrun/common/schemas/tag.py +1 -2
  52. mlrun/common/schemas/workflow.py +12 -4
  53. mlrun/common/types.py +14 -1
  54. mlrun/config.py +154 -69
  55. mlrun/data_types/data_types.py +6 -1
  56. mlrun/data_types/spark.py +2 -2
  57. mlrun/data_types/to_pandas.py +67 -37
  58. mlrun/datastore/__init__.py +6 -8
  59. mlrun/datastore/alibaba_oss.py +131 -0
  60. mlrun/datastore/azure_blob.py +143 -42
  61. mlrun/datastore/base.py +102 -58
  62. mlrun/datastore/datastore.py +34 -13
  63. mlrun/datastore/datastore_profile.py +146 -20
  64. mlrun/datastore/dbfs_store.py +3 -7
  65. mlrun/datastore/filestore.py +1 -4
  66. mlrun/datastore/google_cloud_storage.py +97 -33
  67. mlrun/datastore/hdfs.py +56 -0
  68. mlrun/datastore/inmem.py +6 -3
  69. mlrun/datastore/redis.py +7 -2
  70. mlrun/datastore/s3.py +34 -12
  71. mlrun/datastore/snowflake_utils.py +45 -0
  72. mlrun/datastore/sources.py +303 -111
  73. mlrun/datastore/spark_utils.py +31 -2
  74. mlrun/datastore/store_resources.py +9 -7
  75. mlrun/datastore/storeytargets.py +151 -0
  76. mlrun/datastore/targets.py +453 -176
  77. mlrun/datastore/utils.py +72 -58
  78. mlrun/datastore/v3io.py +6 -1
  79. mlrun/db/base.py +274 -41
  80. mlrun/db/factory.py +1 -1
  81. mlrun/db/httpdb.py +893 -225
  82. mlrun/db/nopdb.py +291 -33
  83. mlrun/errors.py +36 -6
  84. mlrun/execution.py +115 -42
  85. mlrun/feature_store/__init__.py +0 -2
  86. mlrun/feature_store/api.py +65 -73
  87. mlrun/feature_store/common.py +7 -12
  88. mlrun/feature_store/feature_set.py +76 -55
  89. mlrun/feature_store/feature_vector.py +39 -31
  90. mlrun/feature_store/ingestion.py +7 -6
  91. mlrun/feature_store/retrieval/base.py +16 -11
  92. mlrun/feature_store/retrieval/dask_merger.py +2 -0
  93. mlrun/feature_store/retrieval/job.py +13 -4
  94. mlrun/feature_store/retrieval/local_merger.py +2 -0
  95. mlrun/feature_store/retrieval/spark_merger.py +24 -32
  96. mlrun/feature_store/steps.py +45 -34
  97. mlrun/features.py +11 -21
  98. mlrun/frameworks/_common/artifacts_library.py +9 -9
  99. mlrun/frameworks/_common/mlrun_interface.py +5 -5
  100. mlrun/frameworks/_common/model_handler.py +48 -48
  101. mlrun/frameworks/_common/plan.py +5 -6
  102. mlrun/frameworks/_common/producer.py +3 -4
  103. mlrun/frameworks/_common/utils.py +5 -5
  104. mlrun/frameworks/_dl_common/loggers/logger.py +6 -7
  105. mlrun/frameworks/_dl_common/loggers/mlrun_logger.py +9 -9
  106. mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +23 -47
  107. mlrun/frameworks/_ml_common/artifacts_library.py +1 -2
  108. mlrun/frameworks/_ml_common/loggers/logger.py +3 -4
  109. mlrun/frameworks/_ml_common/loggers/mlrun_logger.py +4 -5
  110. mlrun/frameworks/_ml_common/model_handler.py +24 -24
  111. mlrun/frameworks/_ml_common/pkl_model_server.py +2 -2
  112. mlrun/frameworks/_ml_common/plan.py +2 -2
  113. mlrun/frameworks/_ml_common/plans/calibration_curve_plan.py +2 -3
  114. mlrun/frameworks/_ml_common/plans/confusion_matrix_plan.py +2 -3
  115. mlrun/frameworks/_ml_common/plans/dataset_plan.py +3 -3
  116. mlrun/frameworks/_ml_common/plans/feature_importance_plan.py +3 -3
  117. mlrun/frameworks/_ml_common/plans/roc_curve_plan.py +4 -4
  118. mlrun/frameworks/_ml_common/utils.py +4 -4
  119. mlrun/frameworks/auto_mlrun/auto_mlrun.py +9 -9
  120. mlrun/frameworks/huggingface/model_server.py +4 -4
  121. mlrun/frameworks/lgbm/__init__.py +33 -33
  122. mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
  123. mlrun/frameworks/lgbm/callbacks/logging_callback.py +4 -5
  124. mlrun/frameworks/lgbm/callbacks/mlrun_logging_callback.py +4 -5
  125. mlrun/frameworks/lgbm/mlrun_interfaces/booster_mlrun_interface.py +1 -3
  126. mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +6 -6
  127. mlrun/frameworks/lgbm/model_handler.py +10 -10
  128. mlrun/frameworks/lgbm/model_server.py +6 -6
  129. mlrun/frameworks/lgbm/utils.py +5 -5
  130. mlrun/frameworks/onnx/dataset.py +8 -8
  131. mlrun/frameworks/onnx/mlrun_interface.py +3 -3
  132. mlrun/frameworks/onnx/model_handler.py +6 -6
  133. mlrun/frameworks/onnx/model_server.py +7 -7
  134. mlrun/frameworks/parallel_coordinates.py +6 -6
  135. mlrun/frameworks/pytorch/__init__.py +18 -18
  136. mlrun/frameworks/pytorch/callbacks/callback.py +4 -5
  137. mlrun/frameworks/pytorch/callbacks/logging_callback.py +17 -17
  138. mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +11 -11
  139. mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +23 -29
  140. mlrun/frameworks/pytorch/callbacks_handler.py +38 -38
  141. mlrun/frameworks/pytorch/mlrun_interface.py +20 -20
  142. mlrun/frameworks/pytorch/model_handler.py +17 -17
  143. mlrun/frameworks/pytorch/model_server.py +7 -7
  144. mlrun/frameworks/sklearn/__init__.py +13 -13
  145. mlrun/frameworks/sklearn/estimator.py +4 -4
  146. mlrun/frameworks/sklearn/metrics_library.py +14 -14
  147. mlrun/frameworks/sklearn/mlrun_interface.py +16 -9
  148. mlrun/frameworks/sklearn/model_handler.py +2 -2
  149. mlrun/frameworks/tf_keras/__init__.py +10 -7
  150. mlrun/frameworks/tf_keras/callbacks/logging_callback.py +15 -15
  151. mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +11 -11
  152. mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +19 -23
  153. mlrun/frameworks/tf_keras/mlrun_interface.py +9 -11
  154. mlrun/frameworks/tf_keras/model_handler.py +14 -14
  155. mlrun/frameworks/tf_keras/model_server.py +6 -6
  156. mlrun/frameworks/xgboost/__init__.py +13 -13
  157. mlrun/frameworks/xgboost/model_handler.py +6 -6
  158. mlrun/k8s_utils.py +61 -17
  159. mlrun/launcher/__init__.py +1 -1
  160. mlrun/launcher/base.py +16 -15
  161. mlrun/launcher/client.py +13 -11
  162. mlrun/launcher/factory.py +1 -1
  163. mlrun/launcher/local.py +23 -13
  164. mlrun/launcher/remote.py +17 -10
  165. mlrun/lists.py +7 -6
  166. mlrun/model.py +478 -103
  167. mlrun/model_monitoring/__init__.py +1 -1
  168. mlrun/model_monitoring/api.py +163 -371
  169. mlrun/{runtimes/mpijob/v1alpha1.py → model_monitoring/applications/__init__.py} +9 -15
  170. mlrun/model_monitoring/applications/_application_steps.py +188 -0
  171. mlrun/model_monitoring/applications/base.py +108 -0
  172. mlrun/model_monitoring/applications/context.py +341 -0
  173. mlrun/model_monitoring/{evidently_application.py → applications/evidently_base.py} +27 -22
  174. mlrun/model_monitoring/applications/histogram_data_drift.py +354 -0
  175. mlrun/model_monitoring/applications/results.py +99 -0
  176. mlrun/model_monitoring/controller.py +131 -278
  177. mlrun/model_monitoring/db/__init__.py +18 -0
  178. mlrun/model_monitoring/db/stores/__init__.py +136 -0
  179. mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
  180. mlrun/model_monitoring/db/stores/base/store.py +213 -0
  181. mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
  182. mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +71 -0
  183. mlrun/model_monitoring/db/stores/sqldb/models/base.py +190 -0
  184. mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +103 -0
  185. mlrun/model_monitoring/{stores/models/mysql.py → db/stores/sqldb/models/sqlite.py} +19 -13
  186. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +659 -0
  187. mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
  188. mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +726 -0
  189. mlrun/model_monitoring/db/tsdb/__init__.py +105 -0
  190. mlrun/model_monitoring/db/tsdb/base.py +448 -0
  191. mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
  192. mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
  193. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +279 -0
  194. mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +42 -0
  195. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +507 -0
  196. mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
  197. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +158 -0
  198. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +849 -0
  199. mlrun/model_monitoring/features_drift_table.py +134 -106
  200. mlrun/model_monitoring/helpers.py +199 -55
  201. mlrun/model_monitoring/metrics/__init__.py +13 -0
  202. mlrun/model_monitoring/metrics/histogram_distance.py +127 -0
  203. mlrun/model_monitoring/model_endpoint.py +3 -2
  204. mlrun/model_monitoring/stream_processing.py +131 -398
  205. mlrun/model_monitoring/tracking_policy.py +9 -2
  206. mlrun/model_monitoring/writer.py +161 -125
  207. mlrun/package/__init__.py +6 -6
  208. mlrun/package/context_handler.py +5 -5
  209. mlrun/package/packager.py +7 -7
  210. mlrun/package/packagers/default_packager.py +8 -8
  211. mlrun/package/packagers/numpy_packagers.py +15 -15
  212. mlrun/package/packagers/pandas_packagers.py +5 -5
  213. mlrun/package/packagers/python_standard_library_packagers.py +10 -10
  214. mlrun/package/packagers_manager.py +19 -23
  215. mlrun/package/utils/_formatter.py +6 -6
  216. mlrun/package/utils/_pickler.py +2 -2
  217. mlrun/package/utils/_supported_format.py +4 -4
  218. mlrun/package/utils/log_hint_utils.py +2 -2
  219. mlrun/package/utils/type_hint_utils.py +4 -9
  220. mlrun/platforms/__init__.py +11 -10
  221. mlrun/platforms/iguazio.py +24 -203
  222. mlrun/projects/operations.py +52 -25
  223. mlrun/projects/pipelines.py +191 -197
  224. mlrun/projects/project.py +1227 -400
  225. mlrun/render.py +16 -19
  226. mlrun/run.py +209 -184
  227. mlrun/runtimes/__init__.py +83 -15
  228. mlrun/runtimes/base.py +51 -35
  229. mlrun/runtimes/daskjob.py +17 -10
  230. mlrun/runtimes/databricks_job/databricks_cancel_task.py +1 -1
  231. mlrun/runtimes/databricks_job/databricks_runtime.py +8 -7
  232. mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
  233. mlrun/runtimes/funcdoc.py +1 -29
  234. mlrun/runtimes/function_reference.py +1 -1
  235. mlrun/runtimes/kubejob.py +34 -128
  236. mlrun/runtimes/local.py +40 -11
  237. mlrun/runtimes/mpijob/__init__.py +0 -20
  238. mlrun/runtimes/mpijob/abstract.py +9 -10
  239. mlrun/runtimes/mpijob/v1.py +1 -1
  240. mlrun/{model_monitoring/stores/models/sqlite.py → runtimes/nuclio/__init__.py} +7 -9
  241. mlrun/runtimes/nuclio/api_gateway.py +769 -0
  242. mlrun/runtimes/nuclio/application/__init__.py +15 -0
  243. mlrun/runtimes/nuclio/application/application.py +758 -0
  244. mlrun/runtimes/nuclio/application/reverse_proxy.go +95 -0
  245. mlrun/runtimes/{function.py → nuclio/function.py} +200 -83
  246. mlrun/runtimes/{nuclio.py → nuclio/nuclio.py} +6 -6
  247. mlrun/runtimes/{serving.py → nuclio/serving.py} +65 -68
  248. mlrun/runtimes/pod.py +281 -101
  249. mlrun/runtimes/remotesparkjob.py +12 -9
  250. mlrun/runtimes/sparkjob/spark3job.py +67 -51
  251. mlrun/runtimes/utils.py +41 -75
  252. mlrun/secrets.py +9 -5
  253. mlrun/serving/__init__.py +8 -1
  254. mlrun/serving/remote.py +2 -7
  255. mlrun/serving/routers.py +85 -69
  256. mlrun/serving/server.py +69 -44
  257. mlrun/serving/states.py +209 -36
  258. mlrun/serving/utils.py +22 -14
  259. mlrun/serving/v1_serving.py +6 -7
  260. mlrun/serving/v2_serving.py +129 -54
  261. mlrun/track/tracker.py +2 -1
  262. mlrun/track/tracker_manager.py +3 -3
  263. mlrun/track/trackers/mlflow_tracker.py +6 -2
  264. mlrun/utils/async_http.py +6 -8
  265. mlrun/utils/azure_vault.py +1 -1
  266. mlrun/utils/clones.py +1 -2
  267. mlrun/utils/condition_evaluator.py +3 -3
  268. mlrun/utils/db.py +21 -3
  269. mlrun/utils/helpers.py +405 -225
  270. mlrun/utils/http.py +3 -6
  271. mlrun/utils/logger.py +112 -16
  272. mlrun/utils/notifications/notification/__init__.py +17 -13
  273. mlrun/utils/notifications/notification/base.py +50 -2
  274. mlrun/utils/notifications/notification/console.py +2 -0
  275. mlrun/utils/notifications/notification/git.py +24 -1
  276. mlrun/utils/notifications/notification/ipython.py +3 -1
  277. mlrun/utils/notifications/notification/slack.py +96 -21
  278. mlrun/utils/notifications/notification/webhook.py +59 -2
  279. mlrun/utils/notifications/notification_pusher.py +149 -30
  280. mlrun/utils/regex.py +9 -0
  281. mlrun/utils/retryer.py +208 -0
  282. mlrun/utils/singleton.py +1 -1
  283. mlrun/utils/v3io_clients.py +4 -6
  284. mlrun/utils/version/version.json +2 -2
  285. mlrun/utils/version/version.py +2 -6
  286. mlrun-1.7.0.dist-info/METADATA +378 -0
  287. mlrun-1.7.0.dist-info/RECORD +351 -0
  288. {mlrun-1.6.4rc7.dist-info → mlrun-1.7.0.dist-info}/WHEEL +1 -1
  289. mlrun/feature_store/retrieval/conversion.py +0 -273
  290. mlrun/kfpops.py +0 -868
  291. mlrun/model_monitoring/application.py +0 -310
  292. mlrun/model_monitoring/batch.py +0 -1095
  293. mlrun/model_monitoring/prometheus.py +0 -219
  294. mlrun/model_monitoring/stores/__init__.py +0 -111
  295. mlrun/model_monitoring/stores/kv_model_endpoint_store.py +0 -576
  296. mlrun/model_monitoring/stores/model_endpoint_store.py +0 -147
  297. mlrun/model_monitoring/stores/models/__init__.py +0 -27
  298. mlrun/model_monitoring/stores/models/base.py +0 -84
  299. mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -384
  300. mlrun/platforms/other.py +0 -306
  301. mlrun-1.6.4rc7.dist-info/METADATA +0 -272
  302. mlrun-1.6.4rc7.dist-info/RECORD +0 -314
  303. {mlrun-1.6.4rc7.dist-info → mlrun-1.7.0.dist-info}/LICENSE +0 -0
  304. {mlrun-1.6.4rc7.dist-info → mlrun-1.7.0.dist-info}/entry_points.txt +0 -0
  305. {mlrun-1.6.4rc7.dist-info → mlrun-1.7.0.dist-info}/top_level.txt +0 -0
@@ -26,10 +26,9 @@ import mlrun.config
26
26
  import mlrun.datastore.targets
27
27
  import mlrun.feature_store as fstore
28
28
  import mlrun.feature_store.steps
29
- import mlrun.model_monitoring.prometheus
29
+ import mlrun.model_monitoring.db
30
30
  import mlrun.serving.states
31
31
  import mlrun.utils
32
- import mlrun.utils.v3io_clients
33
32
  from mlrun.common.schemas.model_monitoring.constants import (
34
33
  EventFieldType,
35
34
  EventKeyMetrics,
@@ -38,6 +37,7 @@ from mlrun.common.schemas.model_monitoring.constants import (
38
37
  ModelEndpointTarget,
39
38
  ProjectSecretKeys,
40
39
  )
40
+ from mlrun.model_monitoring.db import StoreBase, TSDBConnector
41
41
  from mlrun.utils import logger
42
42
 
43
43
 
@@ -49,14 +49,12 @@ class EventStreamProcessor:
49
49
  parquet_batching_max_events: int,
50
50
  parquet_batching_timeout_secs: int,
51
51
  parquet_target: str,
52
- sample_window: int = 10,
53
52
  aggregate_windows: typing.Optional[list[str]] = None,
54
- aggregate_period: str = "30s",
53
+ aggregate_period: str = "5m",
55
54
  model_monitoring_access_key: str = None,
56
55
  ):
57
56
  # General configurations, mainly used for the storey steps in the future serving graph
58
57
  self.project = project
59
- self.sample_window = sample_window
60
58
  self.aggregate_windows = aggregate_windows or ["5m", "1h"]
61
59
  self.aggregate_period = aggregate_period
62
60
 
@@ -65,10 +63,6 @@ class EventStreamProcessor:
65
63
  self.parquet_batching_max_events = parquet_batching_max_events
66
64
  self.parquet_batching_timeout_secs = parquet_batching_timeout_secs
67
65
 
68
- self.model_endpoint_store_target = (
69
- mlrun.mlconf.model_endpoint_monitoring.store_type
70
- )
71
-
72
66
  logger.info(
73
67
  "Initializing model monitoring event stream processor",
74
68
  parquet_path=self.parquet_path,
@@ -76,6 +70,7 @@ class EventStreamProcessor:
76
70
  )
77
71
 
78
72
  self.storage_options = None
73
+ self.tsdb_configurations = {}
79
74
  if not mlrun.mlconf.is_ce_mode():
80
75
  self._initialize_v3io_configurations(
81
76
  model_monitoring_access_key=model_monitoring_access_key
@@ -134,78 +129,83 @@ class EventStreamProcessor:
134
129
  self.tsdb_batching_max_events = tsdb_batching_max_events
135
130
  self.tsdb_batching_timeout_secs = tsdb_batching_timeout_secs
136
131
 
137
- def apply_monitoring_serving_graph(self, fn: mlrun.runtimes.ServingRuntime) -> None:
132
+ def apply_monitoring_serving_graph(
133
+ self,
134
+ fn: mlrun.runtimes.ServingRuntime,
135
+ tsdb_connector: TSDBConnector,
136
+ endpoint_store: StoreBase,
137
+ ) -> None:
138
138
  """
139
- Apply monitoring serving graph to a given serving function. The following serving graph includes about 20 steps
140
- of different operations that are executed on the events from the model server. Each event has
141
- metadata (function_uri, timestamp, class, etc.) but also inputs and predictions from the model server.
142
- Throughout the serving graph, the results are written to 3 different databases:
143
- 1. KV/SQL (steps 9-11): Stores metadata and stats about the average latency and the amount of predictions over
144
- time per endpoint. for example the amount of predictions of endpoint x in the last 5 min. This data is used
145
- by the monitoring dashboards in grafana. The model endpoints table also contains data on the model endpoint
146
- from other processes, such as current_stats that is being calculated by the monitoring batch job
147
- process. If the target is from type KV, then the model endpoints table can be found under
148
- v3io:///users/pipelines/project-name/model-endpoints/endpoints/. If the target is SQL, then the table
149
- is stored within the database that was defined in the provided connection string and can be found
150
- under mlrun.mlconf.model_endpoint_monitoring.endpoint_store_connection.
151
- 2. V3IO TSDB/Prometheus (steps 13-21): Stores live data of different key metric dictionaries in tsdb target.
152
- This data is being used by the monitoring dashboards in grafana. If using V3IO TSDB (steps 13-19), results
139
+ Apply monitoring serving graph to a given serving function. The following serving graph includes about 4 main
140
+ parts, each of which includes several steps of different operations that are executed on the events from
141
+ the model server.
142
+ Each event has metadata (function_uri, timestamp, class, etc.) but also inputs, predictions and optional
143
+ metrics from the model server.
144
+ In the first part, the serving graph processes the event and splits it into sub-events. This part also includes
145
+ validation of the event data and adding important details to the event such as endpoint_id.
146
+ In the next parts, the serving graph stores data to 3 different targets:
147
+ 1. KV/SQL: Metadata and basic stats about the average latency and the amount of predictions over
148
+ time per endpoint. for example the amount of predictions of endpoint x in the last 5 min. The model
149
+ endpoints table also contains data on the model endpoint from other processes, such as feature_stats that
150
+ represents sample statistics from the training data. If the target is of type KV, then the model endpoints
151
+ table can be found under v3io:///users/pipelines/project-name/model-endpoints/endpoints/. If the target is
152
+ SQL, then the table is stored within the database that was defined in the provided connection string.
153
+ 2. TSDB: live data of different key metric dictionaries in tsdb target.
154
+ This data is being used by the monitoring dashboards in grafana. If using V3IO TSDB, results
153
155
  can be found under v3io:///users/pipelines/project-name/model-endpoints/events/. In that case, we generate
154
156
  3 different key metric dictionaries: base_metrics (average latency and predictions over time),
155
157
  endpoint_features (Prediction and feature names and values), and custom_metrics (user-defined metrics).
156
- If using Prometheus (steps 20-21), we update metrics in the Prometheus registry that is stored in the
157
- monitoring stream local memory.
158
- 3. Parquet (steps 22-23): This Parquet file includes the required data for the model monitoring batch job
159
- that run every hour by default. If defined, the parquet target path can be found under
160
- mlrun.mlconf.model_endpoint_monitoring.offline. Otherwise, the default parquet path is under
161
- mlrun.mlconf.model_endpoint_monitoring.user_space.
158
+ 3. Parquet: This Parquet file includes the required data for the model monitoring applications. If defined,
159
+ the parquet target path can be found under mlrun.mlconf.model_endpoint_monitoring.offline. Otherwise,
160
+ the default parquet path is under mlrun.mlconf.model_endpoint_monitoring.user_space. Note that if you are
161
+ using CE, the parquet target path is based on the defined MLRun artifact path.
162
162
 
163
163
  :param fn: A serving function.
164
+ :param tsdb_connector: Time series database connector.
165
+ :param endpoint_store: KV/SQL store used for endpoint data.
164
166
  """
165
167
 
166
168
  graph = typing.cast(
167
169
  mlrun.serving.states.RootFlowStep,
168
170
  fn.set_topology(mlrun.serving.states.StepKinds.flow),
169
171
  )
172
+ graph.add_step(
173
+ "ExtractEndpointID",
174
+ "extract_endpoint",
175
+ full_event=True,
176
+ )
170
177
 
171
- # Step 1 - Event routing based on the provided path
172
- def apply_event_routing():
173
- typing.cast(
174
- mlrun.serving.TaskStep,
175
- graph.add_step(
176
- "EventRouting",
177
- full_event=True,
178
- project=self.project,
179
- ),
180
- ).respond()
181
-
182
- apply_event_routing()
178
+ # split the graph between event with error vs valid event
179
+ graph.add_step(
180
+ "storey.Filter",
181
+ "FilterError",
182
+ after="extract_endpoint",
183
+ _fn="(event.get('error') is None)",
184
+ )
183
185
 
184
- # Step 2 - Filter out events with '-' in the path basename from going forward
185
- # through the next steps of the stream graph
186
- def apply_storey_filter_stream_events():
187
- # Remove none values from each event
188
- graph.add_step(
189
- "storey.Filter",
190
- "filter_stream_event",
191
- _fn="('-' not in event.path.split('/')[-1])",
192
- full_event=True,
193
- )
186
+ graph.add_step(
187
+ "storey.Filter",
188
+ "ForwardError",
189
+ after="extract_endpoint",
190
+ _fn="(event.get('error') is not None)",
191
+ )
194
192
 
195
- apply_storey_filter_stream_events()
193
+ tsdb_connector.handle_model_error(
194
+ graph,
195
+ )
196
196
 
197
- # Step 3 - Process endpoint event: splitting into sub-events and validate event data
197
+ # Process endpoint event: splitting into sub-events and validate event data
198
198
  def apply_process_endpoint_event():
199
199
  graph.add_step(
200
200
  "ProcessEndpointEvent",
201
+ after="extract_endpoint", # TODO: change this to FilterError in ML-7456
201
202
  full_event=True,
202
203
  project=self.project,
203
- after="filter_stream_event",
204
204
  )
205
205
 
206
206
  apply_process_endpoint_event()
207
207
 
208
- # Steps 4,5 - Applying Storey operations of filtering and flatten
208
+ # Applying Storey operations of filtering and flatten
209
209
  def apply_storey_filter_and_flatmap():
210
210
  # Remove none values from each event
211
211
  graph.add_step(
@@ -222,7 +222,7 @@ class EventStreamProcessor:
222
222
 
223
223
  apply_storey_filter_and_flatmap()
224
224
 
225
- # Step 6 - Validating feature names and map each feature to its value
225
+ # Validating feature names and map each feature to its value
226
226
  def apply_map_feature_names():
227
227
  graph.add_step(
228
228
  "MapFeatureNames",
@@ -234,9 +234,9 @@ class EventStreamProcessor:
234
234
 
235
235
  apply_map_feature_names()
236
236
 
237
- # Step 7 - Calculate number of predictions and average latency
237
+ # Calculate number of predictions and average latency
238
238
  def apply_storey_aggregations():
239
- # Step 7.1 - Calculate number of predictions for each window (5 min and 1 hour by default)
239
+ # Calculate number of predictions for each window (5 min and 1 hour by default)
240
240
  graph.add_step(
241
241
  class_name="storey.AggregateByKey",
242
242
  aggregates=[
@@ -254,7 +254,7 @@ class EventStreamProcessor:
254
254
  table=".",
255
255
  key_field=EventFieldType.ENDPOINT_ID,
256
256
  )
257
- # Step 7.2 - Calculate average latency time for each window (5 min and 1 hour by default)
257
+ # Calculate average latency time for each window (5 min and 1 hour by default)
258
258
  graph.add_step(
259
259
  class_name="storey.Rename",
260
260
  mapping={
@@ -267,8 +267,8 @@ class EventStreamProcessor:
267
267
 
268
268
  apply_storey_aggregations()
269
269
 
270
- # Steps 8-10 - KV/SQL branch
271
- # Step 8 - Filter relevant keys from the event before writing the data into the database table
270
+ # KV/SQL branch
271
+ # Filter relevant keys from the event before writing the data into the database table
272
272
  def apply_process_before_endpoint_update():
273
273
  graph.add_step(
274
274
  "ProcessBeforeEndpointUpdate",
@@ -278,7 +278,7 @@ class EventStreamProcessor:
278
278
 
279
279
  apply_process_before_endpoint_update()
280
280
 
281
- # Step 9 - Write the filtered event to KV/SQL table. At this point, the serving graph updates the stats
281
+ # Write the filtered event to KV/SQL table. At this point, the serving graph updates the stats
282
282
  # about average latency and the amount of predictions over time
283
283
  def apply_update_endpoint():
284
284
  graph.add_step(
@@ -286,12 +286,11 @@ class EventStreamProcessor:
286
286
  name="UpdateEndpoint",
287
287
  after="ProcessBeforeEndpointUpdate",
288
288
  project=self.project,
289
- model_endpoint_store_target=self.model_endpoint_store_target,
290
289
  )
291
290
 
292
291
  apply_update_endpoint()
293
292
 
294
- # Step 10 (only for KV target) - Apply infer_schema on the model endpoints table for generating schema file
293
+ # (only for V3IO KV target) - Apply infer_schema on the model endpoints table for generating schema file
295
294
  # which will be used by Grafana monitoring dashboards
296
295
  def apply_infer_schema():
297
296
  graph.add_step(
@@ -303,119 +302,13 @@ class EventStreamProcessor:
303
302
  table=self.kv_path,
304
303
  )
305
304
 
306
- if self.model_endpoint_store_target == ModelEndpointTarget.V3IO_NOSQL:
305
+ if endpoint_store.type == ModelEndpointTarget.V3IO_NOSQL:
307
306
  apply_infer_schema()
308
307
 
309
- # Step 11 - Emits the event in window size of events based on sample_window size (10 by default)
310
- def apply_storey_sample_window():
311
- graph.add_step(
312
- "storey.steps.SampleWindow",
313
- name="sample",
314
- after="Rename",
315
- window_size=self.sample_window,
316
- key=EventFieldType.ENDPOINT_ID,
317
- )
318
-
319
- apply_storey_sample_window()
308
+ tsdb_connector.apply_monitoring_stream_steps(graph=graph)
320
309
 
321
- # Steps 12-19 - TSDB branch (skip to Prometheus if in CE env)
322
- # Steps 20-21 - Prometheus branch
323
- if not mlrun.mlconf.is_ce_mode():
324
- # TSDB branch
325
-
326
- # Step 12 - Before writing data to TSDB, create dictionary of 2-3 dictionaries that contains
327
- # stats and details about the events
328
- def apply_process_before_tsdb():
329
- graph.add_step(
330
- "ProcessBeforeTSDB", name="ProcessBeforeTSDB", after="sample"
331
- )
332
-
333
- apply_process_before_tsdb()
334
-
335
- # Steps 13-19: - Unpacked keys from each dictionary and write to TSDB target
336
- def apply_filter_and_unpacked_keys(name, keys):
337
- graph.add_step(
338
- "FilterAndUnpackKeys",
339
- name=name,
340
- after="ProcessBeforeTSDB",
341
- keys=[keys],
342
- )
343
-
344
- def apply_tsdb_target(name, after):
345
- graph.add_step(
346
- "storey.TSDBTarget",
347
- name=name,
348
- after=after,
349
- path=self.tsdb_path,
350
- rate="10/m",
351
- time_col=EventFieldType.TIMESTAMP,
352
- container=self.tsdb_container,
353
- v3io_frames=self.v3io_framesd,
354
- infer_columns_from_data=True,
355
- index_cols=[
356
- EventFieldType.ENDPOINT_ID,
357
- EventFieldType.RECORD_TYPE,
358
- EventFieldType.ENDPOINT_TYPE,
359
- ],
360
- max_events=self.tsdb_batching_max_events,
361
- flush_after_seconds=self.tsdb_batching_timeout_secs,
362
- key=EventFieldType.ENDPOINT_ID,
363
- )
364
-
365
- # Steps 13-14 - unpacked base_metrics dictionary
366
- apply_filter_and_unpacked_keys(
367
- name="FilterAndUnpackKeys1",
368
- keys=EventKeyMetrics.BASE_METRICS,
369
- )
370
- apply_tsdb_target(name="tsdb1", after="FilterAndUnpackKeys1")
371
-
372
- # Steps 15-16 - unpacked endpoint_features dictionary
373
- apply_filter_and_unpacked_keys(
374
- name="FilterAndUnpackKeys2",
375
- keys=EventKeyMetrics.ENDPOINT_FEATURES,
376
- )
377
- apply_tsdb_target(name="tsdb2", after="FilterAndUnpackKeys2")
378
-
379
- # Steps 17-19 - unpacked custom_metrics dictionary. In addition, use storey.Filter remove none values
380
- apply_filter_and_unpacked_keys(
381
- name="FilterAndUnpackKeys3",
382
- keys=EventKeyMetrics.CUSTOM_METRICS,
383
- )
384
-
385
- def apply_storey_filter():
386
- graph.add_step(
387
- "storey.Filter",
388
- "FilterNotNone",
389
- after="FilterAndUnpackKeys3",
390
- _fn="(event is not None)",
391
- )
392
-
393
- apply_storey_filter()
394
- apply_tsdb_target(name="tsdb3", after="FilterNotNone")
395
- else:
396
- # Prometheus branch
397
-
398
- # Step 20 - Increase the prediction counter by 1 and update the latency value
399
- graph.add_step(
400
- "IncCounter",
401
- name="IncCounter",
402
- after="MapFeatureNames",
403
- project=self.project,
404
- )
405
-
406
- # Step 21 - Record a sample of features and labels
407
- def apply_record_features_to_prometheus():
408
- graph.add_step(
409
- "RecordFeatures",
410
- name="RecordFeaturesToPrometheus",
411
- after="sample",
412
- project=self.project,
413
- )
414
-
415
- apply_record_features_to_prometheus()
416
-
417
- # Steps 22-23 - Parquet branch
418
- # Step 22 - Filter and validate different keys before writing the data to Parquet target
310
+ # Parquet branch
311
+ # Filter and validate different keys before writing the data to Parquet target
419
312
  def apply_process_before_parquet():
420
313
  graph.add_step(
421
314
  "ProcessBeforeParquet",
@@ -426,7 +319,7 @@ class EventStreamProcessor:
426
319
 
427
320
  apply_process_before_parquet()
428
321
 
429
- # Step 23 - Write the Parquet target file, partitioned by key (endpoint_id) and time.
322
+ # Write the Parquet target file, partitioned by key (endpoint_id) and time.
430
323
  def apply_parquet_target():
431
324
  graph.add_step(
432
325
  "storey.ParquetTarget",
@@ -441,6 +334,7 @@ class EventStreamProcessor:
441
334
  index_cols=[EventFieldType.ENDPOINT_ID],
442
335
  key_bucketing_number=0,
443
336
  time_partitioning_granularity="hour",
337
+ time_field=EventFieldType.TIMESTAMP,
444
338
  partition_cols=["$key", "$year", "$month", "$day", "$hour"],
445
339
  )
446
340
 
@@ -500,74 +394,36 @@ class ProcessBeforeEndpointUpdate(mlrun.feature_store.steps.MapClass):
500
394
  return e
501
395
 
502
396
 
503
- class ProcessBeforeTSDB(mlrun.feature_store.steps.MapClass):
504
- def __init__(self, **kwargs):
397
+ class ExtractEndpointID(mlrun.feature_store.steps.MapClass):
398
+ def __init__(self, **kwargs) -> None:
505
399
  """
506
- Process the data before writing to TSDB. This step creates a dictionary that includes 3 different dictionaries
507
- that each one of them contains important details and stats about the events:
508
- 1. base_metrics: stats about the average latency and the amount of predictions over time. It is based on
509
- storey.AggregateByKey which was executed in step 5.
510
- 2. endpoint_features: feature names and values along with the prediction names and value.
511
- 3. custom_metric (opt): optional metrics provided by the user.
512
-
513
- :returns: Dictionary of 2-3 dictionaries that contains stats and details about the events.
514
-
400
+ Generate the model endpoint ID based on the event parameters and attach it to the event.
515
401
  """
516
402
  super().__init__(**kwargs)
517
403
 
518
- def do(self, event):
519
- # Compute prediction per second
520
- event[EventLiveStats.PREDICTIONS_PER_SECOND] = (
521
- float(event[EventLiveStats.PREDICTIONS_COUNT_5M]) / 300
522
- )
523
- base_fields = [
524
- EventFieldType.TIMESTAMP,
525
- EventFieldType.ENDPOINT_ID,
526
- EventFieldType.ENDPOINT_TYPE,
527
- ]
404
+ def do(self, full_event) -> typing.Union[storey.Event, None]:
405
+ # Getting model version and function uri from event
406
+ # and use them for retrieving the endpoint_id
407
+ function_uri = full_event.body.get(EventFieldType.FUNCTION_URI)
408
+ if not is_not_none(function_uri, [EventFieldType.FUNCTION_URI]):
409
+ return None
528
410
 
529
- # Getting event timestamp and endpoint_id
530
- base_event = {k: event[k] for k in base_fields}
531
-
532
- # base_metrics includes the stats about the average latency and the amount of predictions over time
533
- base_metrics = {
534
- EventFieldType.RECORD_TYPE: EventKeyMetrics.BASE_METRICS,
535
- EventLiveStats.PREDICTIONS_PER_SECOND: event[
536
- EventLiveStats.PREDICTIONS_PER_SECOND
537
- ],
538
- EventLiveStats.PREDICTIONS_COUNT_5M: event[
539
- EventLiveStats.PREDICTIONS_COUNT_5M
540
- ],
541
- EventLiveStats.PREDICTIONS_COUNT_1H: event[
542
- EventLiveStats.PREDICTIONS_COUNT_1H
543
- ],
544
- EventLiveStats.LATENCY_AVG_5M: event[EventLiveStats.LATENCY_AVG_5M],
545
- EventLiveStats.LATENCY_AVG_1H: event[EventLiveStats.LATENCY_AVG_1H],
546
- **base_event,
547
- }
411
+ model = full_event.body.get(EventFieldType.MODEL)
412
+ if not is_not_none(model, [EventFieldType.MODEL]):
413
+ return None
548
414
 
549
- # endpoint_features includes the event values of each feature and prediction
550
- endpoint_features = {
551
- EventFieldType.RECORD_TYPE: EventKeyMetrics.ENDPOINT_FEATURES,
552
- **event[EventFieldType.NAMED_PREDICTIONS],
553
- **event[EventFieldType.NAMED_FEATURES],
554
- **base_event,
555
- }
556
- # Create a dictionary that includes both base_metrics and endpoint_features
557
- processed = {
558
- EventKeyMetrics.BASE_METRICS: base_metrics,
559
- EventKeyMetrics.ENDPOINT_FEATURES: endpoint_features,
560
- }
415
+ version = full_event.body.get(EventFieldType.VERSION)
416
+ versioned_model = f"{model}:{version}" if version else f"{model}:latest"
561
417
 
562
- # If metrics provided, add another dictionary if custom_metrics values
563
- if event[EventFieldType.METRICS]:
564
- processed[EventKeyMetrics.CUSTOM_METRICS] = {
565
- EventFieldType.RECORD_TYPE: EventKeyMetrics.CUSTOM_METRICS,
566
- **event[EventFieldType.METRICS],
567
- **base_event,
568
- }
418
+ endpoint_id = mlrun.common.model_monitoring.create_model_endpoint_uid(
419
+ function_uri=function_uri,
420
+ versioned_model=versioned_model,
421
+ )
569
422
 
570
- return processed
423
+ endpoint_id = str(endpoint_id)
424
+ full_event.body[EventFieldType.ENDPOINT_ID] = endpoint_id
425
+ full_event.body[EventFieldType.VERSIONED_MODEL] = versioned_model
426
+ return full_event
571
427
 
572
428
 
573
429
  class ProcessBeforeParquet(mlrun.feature_store.steps.MapClass):
@@ -643,28 +499,9 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
643
499
  def do(self, full_event):
644
500
  event = full_event.body
645
501
 
646
- # Getting model version and function uri from event
647
- # and use them for retrieving the endpoint_id
648
- function_uri = event.get(EventFieldType.FUNCTION_URI)
649
- if not is_not_none(function_uri, [EventFieldType.FUNCTION_URI]):
650
- return None
651
-
652
- model = event.get(EventFieldType.MODEL)
653
- if not is_not_none(model, [EventFieldType.MODEL]):
654
- return None
655
-
656
- version = event.get(EventFieldType.VERSION)
657
- versioned_model = f"{model}:{version}" if version else f"{model}:latest"
658
-
659
- endpoint_id = mlrun.common.model_monitoring.create_model_endpoint_uid(
660
- function_uri=function_uri,
661
- versioned_model=versioned_model,
662
- )
663
-
664
- endpoint_id = str(endpoint_id)
665
-
666
- event[EventFieldType.VERSIONED_MODEL] = versioned_model
667
- event[EventFieldType.ENDPOINT_ID] = endpoint_id
502
+ versioned_model = event[EventFieldType.VERSIONED_MODEL]
503
+ endpoint_id = event[EventFieldType.ENDPOINT_ID]
504
+ function_uri = event[EventFieldType.FUNCTION_URI]
668
505
 
669
506
  # In case this process fails, resume state from existing record
670
507
  self.resume_state(endpoint_id)
@@ -672,13 +509,8 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
672
509
  # If error key has been found in the current event,
673
510
  # increase the error counter by 1 and raise the error description
674
511
  error = event.get("error")
675
- if error:
512
+ if error: # TODO: delete this in ML-7456
676
513
  self.error_count[endpoint_id] += 1
677
- mlrun.model_monitoring.prometheus.write_errors(
678
- project=self.project,
679
- endpoint_id=event["endpoint_id"],
680
- model_name=event["model"],
681
- )
682
514
  raise mlrun.errors.MLRunInvalidArgumentError(str(error))
683
515
 
684
516
  # Validate event fields
@@ -745,6 +577,26 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
745
577
 
746
578
  # Separate each model invocation into sub events that will be stored as dictionary
747
579
  # in list of events. This list will be used as the body for the storey event.
580
+ if not isinstance(features, list):
581
+ raise mlrun.errors.MLRunInvalidArgumentError(
582
+ "Model's inputs must be a list"
583
+ )
584
+ features = (
585
+ features
586
+ if not any(not isinstance(feat, list) for feat in features)
587
+ else [features]
588
+ )
589
+ if not isinstance(predictions, list):
590
+ predictions = [[predictions]]
591
+ elif isinstance(predictions, list) and len(predictions) == len(features):
592
+ pass # predictions are already in the right format
593
+ else:
594
+ predictions = (
595
+ predictions
596
+ if not any(not isinstance(pred, list) for pred in predictions)
597
+ else [predictions]
598
+ )
599
+
748
600
  events = []
749
601
  for i, (feature, prediction) in enumerate(zip(features, predictions)):
750
602
  if not isinstance(prediction, list):
@@ -766,6 +618,9 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
766
618
  EventFieldType.PREDICTION: prediction,
767
619
  EventFieldType.FIRST_REQUEST: self.first_request[endpoint_id],
768
620
  EventFieldType.LAST_REQUEST: self.last_request[endpoint_id],
621
+ EventFieldType.LAST_REQUEST_TIMESTAMP: mlrun.utils.enrich_datetime_with_tz_info(
622
+ self.last_request[endpoint_id]
623
+ ).timestamp(),
769
624
  EventFieldType.ERROR_COUNT: self.error_count[endpoint_id],
770
625
  EventFieldType.LABELS: event.get(EventFieldType.LABELS, {}),
771
626
  EventFieldType.METRICS: event.get(EventFieldType.METRICS, {}),
@@ -804,7 +659,7 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
804
659
  # left them
805
660
  if endpoint_id not in self.endpoints:
806
661
  logger.info("Trying to resume state", endpoint_id=endpoint_id)
807
- endpoint_record = get_endpoint_record(
662
+ endpoint_record = mlrun.model_monitoring.helpers.get_endpoint_record(
808
663
  project=self.project,
809
664
  endpoint_id=endpoint_id,
810
665
  )
@@ -850,36 +705,6 @@ def is_not_none(field: typing.Any, dict_path: list[str]):
850
705
  return False
851
706
 
852
707
 
853
- class FilterAndUnpackKeys(mlrun.feature_store.steps.MapClass):
854
- def __init__(self, keys, **kwargs):
855
- """
856
- Create unpacked event dictionary based on provided key metrics (base_metrics, endpoint_features,
857
- or custom_metric). Please note that the next step of the TSDB target requires an unpacked dictionary.
858
-
859
- :param keys: list of key metrics.
860
-
861
- :returns: An unpacked dictionary of event filtered by the provided key metrics.
862
- """
863
- super().__init__(**kwargs)
864
- self.keys = keys
865
-
866
- def do(self, event):
867
- # Keep only the relevant dictionary based on the provided keys
868
- new_event = {}
869
- for key in self.keys:
870
- if key in event:
871
- new_event[key] = event[key]
872
-
873
- # Create unpacked dictionary
874
- unpacked = {}
875
- for key in new_event.keys():
876
- if key in self.keys:
877
- unpacked = {**unpacked, **new_event[key]}
878
- else:
879
- unpacked[key] = new_event[key]
880
- return unpacked if unpacked else None
881
-
882
-
883
708
  class MapFeatureNames(mlrun.feature_store.steps.MapClass):
884
709
  def __init__(
885
710
  self,
@@ -935,9 +760,15 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
935
760
 
936
761
  feature_values = event[EventFieldType.FEATURES]
937
762
  label_values = event[EventFieldType.PREDICTION]
763
+
764
+ for index in range(len(feature_values)):
765
+ feature_value = feature_values[index]
766
+ if isinstance(feature_value, int):
767
+ feature_values[index] = float(feature_value)
768
+
938
769
  # Get feature names and label columns
939
770
  if endpoint_id not in self.feature_names:
940
- endpoint_record = get_endpoint_record(
771
+ endpoint_record = mlrun.model_monitoring.helpers.get_endpoint_record(
941
772
  project=self.project,
942
773
  endpoint_id=endpoint_id,
943
774
  )
@@ -1065,7 +896,7 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
1065
896
 
1066
897
 
1067
898
  class UpdateEndpoint(mlrun.feature_store.steps.MapClass):
1068
- def __init__(self, project: str, model_endpoint_store_target: str, **kwargs):
899
+ def __init__(self, project: str, **kwargs):
1069
900
  """
1070
901
  Update the model endpoint record in the DB. Note that the event at this point includes metadata and stats about
1071
902
  the average latency and the amount of predictions over time. This data will be used in the monitoring dashboards
@@ -1075,7 +906,6 @@ class UpdateEndpoint(mlrun.feature_store.steps.MapClass):
1075
906
  """
1076
907
  super().__init__(**kwargs)
1077
908
  self.project = project
1078
- self.model_endpoint_store_target = model_endpoint_store_target
1079
909
 
1080
910
  def do(self, event: dict):
1081
911
  # Remove labels from the event
@@ -1118,6 +948,8 @@ class InferSchema(mlrun.feature_store.steps.MapClass):
1118
948
  def do(self, event: dict):
1119
949
  key_set = set(event.keys())
1120
950
  if not key_set.issubset(self.keys):
951
+ import mlrun.utils.v3io_clients
952
+
1121
953
  self.keys.update(key_set)
1122
954
  # Apply infer_schema on the kv table for generating the schema file
1123
955
  mlrun.utils.v3io_clients.get_frames_client(
@@ -1128,104 +960,12 @@ class InferSchema(mlrun.feature_store.steps.MapClass):
1128
960
  return event
1129
961
 
1130
962
 
1131
- class EventRouting(mlrun.feature_store.steps.MapClass):
1132
- """
1133
- Router the event according to the configured path under event.path. Please note that this step returns the result
1134
- to the caller. At the moment there are several paths:
1135
-
1136
- - /model-monitoring-metrics (GET): return Prometheus registry results as a text. Will be used by Prometheus client
1137
- to scrape the results from the monitoring stream memory.
1138
-
1139
- - /monitoring-batch-metrics (POST): update the Prometheus registry with the provided statistical metrics such as the
1140
- statistical metrics from the monitoring batch job. Note that the event body is a list of dictionaries of different
1141
- metrics.
1142
-
1143
- - /monitoring-drift-status (POST): update the Prometheus registry with the provided model drift status.
1144
-
1145
- """
1146
-
1147
- def __init__(
1148
- self,
1149
- project: str,
1150
- **kwargs,
1151
- ):
1152
- super().__init__(**kwargs)
1153
- self.project: str = project
1154
-
1155
- def do(self, event):
1156
- if event.path == "/model-monitoring-metrics":
1157
- # Return a parsed Prometheus registry file
1158
- event.body = mlrun.model_monitoring.prometheus.get_registry()
1159
- elif event.path == "/monitoring-batch-metrics":
1160
- # Update statistical metrics
1161
- for event_metric in event.body:
1162
- mlrun.model_monitoring.prometheus.write_drift_metrics(
1163
- project=self.project,
1164
- endpoint_id=event_metric[EventFieldType.ENDPOINT_ID],
1165
- metric=event_metric[EventFieldType.METRIC],
1166
- value=event_metric[EventFieldType.VALUE],
1167
- )
1168
- elif event.path == "/monitoring-drift-status":
1169
- # Update drift status
1170
- mlrun.model_monitoring.prometheus.write_drift_status(
1171
- project=self.project,
1172
- endpoint_id=event.body[EventFieldType.ENDPOINT_ID],
1173
- drift_status=event.body[EventFieldType.DRIFT_STATUS],
1174
- )
1175
-
1176
- return event
1177
-
1178
-
1179
- class IncCounter(mlrun.feature_store.steps.MapClass):
1180
- """Increase prediction counter by 1 and update the total latency value"""
1181
-
1182
- def __init__(self, project: str, **kwargs):
1183
- super().__init__(**kwargs)
1184
- self.project: str = project
1185
-
1186
- def do(self, event):
1187
- # Compute prediction per second
1188
-
1189
- mlrun.model_monitoring.prometheus.write_predictions_and_latency_metrics(
1190
- project=self.project,
1191
- endpoint_id=event[EventFieldType.ENDPOINT_ID],
1192
- latency=event[EventFieldType.LATENCY],
1193
- model_name=event[EventFieldType.MODEL],
1194
- endpoint_type=event[EventFieldType.ENDPOINT_TYPE],
1195
- )
1196
-
1197
- return event
1198
-
1199
-
1200
- class RecordFeatures(mlrun.feature_store.steps.MapClass):
1201
- """Record a sample of features and labels in Prometheus registry"""
1202
-
1203
- def __init__(self, project: str, **kwargs):
1204
- super().__init__(**kwargs)
1205
- self.project: str = project
1206
-
1207
- def do(self, event):
1208
- # Generate a dictionary of features and predictions
1209
- features = {
1210
- **event[EventFieldType.NAMED_PREDICTIONS],
1211
- **event[EventFieldType.NAMED_FEATURES],
1212
- }
1213
-
1214
- mlrun.model_monitoring.prometheus.write_income_features(
1215
- project=self.project,
1216
- endpoint_id=event[EventFieldType.ENDPOINT_ID],
1217
- features=features,
1218
- )
1219
-
1220
- return event
1221
-
1222
-
1223
963
  def update_endpoint_record(
1224
964
  project: str,
1225
965
  endpoint_id: str,
1226
966
  attributes: dict,
1227
967
  ):
1228
- model_endpoint_store = mlrun.model_monitoring.get_model_endpoint_store(
968
+ model_endpoint_store = mlrun.model_monitoring.get_store_object(
1229
969
  project=project,
1230
970
  )
1231
971
 
@@ -1234,13 +974,6 @@ def update_endpoint_record(
1234
974
  )
1235
975
 
1236
976
 
1237
- def get_endpoint_record(project: str, endpoint_id: str):
1238
- model_endpoint_store = mlrun.model_monitoring.get_model_endpoint_store(
1239
- project=project,
1240
- )
1241
- return model_endpoint_store.get_model_endpoint(endpoint_id=endpoint_id)
1242
-
1243
-
1244
977
  def update_monitoring_feature_set(
1245
978
  endpoint_record: dict[str, typing.Any],
1246
979
  feature_names: list[str],