mlrun 1.6.4rc2__py3-none-any.whl → 1.7.0rc20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mlrun might be problematic.

Files changed (291)
  1. mlrun/__init__.py +11 -1
  2. mlrun/__main__.py +26 -112
  3. mlrun/alerts/__init__.py +15 -0
  4. mlrun/alerts/alert.py +144 -0
  5. mlrun/api/schemas/__init__.py +5 -4
  6. mlrun/artifacts/__init__.py +8 -3
  7. mlrun/artifacts/base.py +46 -257
  8. mlrun/artifacts/dataset.py +11 -192
  9. mlrun/artifacts/manager.py +47 -48
  10. mlrun/artifacts/model.py +31 -159
  11. mlrun/artifacts/plots.py +23 -380
  12. mlrun/common/constants.py +69 -0
  13. mlrun/common/db/sql_session.py +2 -3
  14. mlrun/common/formatters/__init__.py +19 -0
  15. mlrun/common/formatters/artifact.py +21 -0
  16. mlrun/common/formatters/base.py +78 -0
  17. mlrun/common/formatters/function.py +41 -0
  18. mlrun/common/formatters/pipeline.py +53 -0
  19. mlrun/common/formatters/project.py +51 -0
  20. mlrun/common/helpers.py +1 -2
  21. mlrun/common/model_monitoring/helpers.py +9 -5
  22. mlrun/{runtimes → common/runtimes}/constants.py +37 -9
  23. mlrun/common/schemas/__init__.py +24 -4
  24. mlrun/common/schemas/alert.py +203 -0
  25. mlrun/common/schemas/api_gateway.py +148 -0
  26. mlrun/common/schemas/artifact.py +18 -8
  27. mlrun/common/schemas/auth.py +11 -5
  28. mlrun/common/schemas/background_task.py +1 -1
  29. mlrun/common/schemas/client_spec.py +4 -1
  30. mlrun/common/schemas/feature_store.py +16 -16
  31. mlrun/common/schemas/frontend_spec.py +8 -7
  32. mlrun/common/schemas/function.py +5 -1
  33. mlrun/common/schemas/hub.py +11 -18
  34. mlrun/common/schemas/memory_reports.py +2 -2
  35. mlrun/common/schemas/model_monitoring/__init__.py +18 -3
  36. mlrun/common/schemas/model_monitoring/constants.py +83 -26
  37. mlrun/common/schemas/model_monitoring/grafana.py +13 -9
  38. mlrun/common/schemas/model_monitoring/model_endpoints.py +99 -16
  39. mlrun/common/schemas/notification.py +4 -4
  40. mlrun/common/schemas/object.py +2 -2
  41. mlrun/{runtimes/mpijob/v1alpha1.py → common/schemas/pagination.py} +10 -13
  42. mlrun/common/schemas/pipeline.py +1 -10
  43. mlrun/common/schemas/project.py +24 -23
  44. mlrun/common/schemas/runtime_resource.py +8 -12
  45. mlrun/common/schemas/schedule.py +3 -3
  46. mlrun/common/schemas/tag.py +1 -2
  47. mlrun/common/schemas/workflow.py +2 -2
  48. mlrun/common/types.py +7 -1
  49. mlrun/config.py +54 -17
  50. mlrun/data_types/to_pandas.py +10 -12
  51. mlrun/datastore/__init__.py +5 -8
  52. mlrun/datastore/alibaba_oss.py +130 -0
  53. mlrun/datastore/azure_blob.py +17 -5
  54. mlrun/datastore/base.py +62 -39
  55. mlrun/datastore/datastore.py +28 -9
  56. mlrun/datastore/datastore_profile.py +146 -20
  57. mlrun/datastore/filestore.py +0 -1
  58. mlrun/datastore/google_cloud_storage.py +6 -2
  59. mlrun/datastore/hdfs.py +56 -0
  60. mlrun/datastore/inmem.py +2 -2
  61. mlrun/datastore/redis.py +6 -2
  62. mlrun/datastore/s3.py +9 -0
  63. mlrun/datastore/snowflake_utils.py +43 -0
  64. mlrun/datastore/sources.py +201 -96
  65. mlrun/datastore/spark_utils.py +1 -2
  66. mlrun/datastore/store_resources.py +7 -7
  67. mlrun/datastore/targets.py +358 -104
  68. mlrun/datastore/utils.py +72 -58
  69. mlrun/datastore/v3io.py +5 -1
  70. mlrun/db/base.py +185 -35
  71. mlrun/db/factory.py +1 -1
  72. mlrun/db/httpdb.py +614 -179
  73. mlrun/db/nopdb.py +210 -26
  74. mlrun/errors.py +12 -1
  75. mlrun/execution.py +41 -24
  76. mlrun/feature_store/__init__.py +0 -2
  77. mlrun/feature_store/api.py +40 -72
  78. mlrun/feature_store/common.py +1 -1
  79. mlrun/feature_store/feature_set.py +76 -55
  80. mlrun/feature_store/feature_vector.py +28 -30
  81. mlrun/feature_store/ingestion.py +7 -6
  82. mlrun/feature_store/retrieval/base.py +16 -11
  83. mlrun/feature_store/retrieval/conversion.py +11 -13
  84. mlrun/feature_store/retrieval/dask_merger.py +2 -0
  85. mlrun/feature_store/retrieval/job.py +9 -3
  86. mlrun/feature_store/retrieval/local_merger.py +2 -0
  87. mlrun/feature_store/retrieval/spark_merger.py +34 -24
  88. mlrun/feature_store/steps.py +37 -34
  89. mlrun/features.py +9 -20
  90. mlrun/frameworks/_common/artifacts_library.py +9 -9
  91. mlrun/frameworks/_common/mlrun_interface.py +5 -5
  92. mlrun/frameworks/_common/model_handler.py +48 -48
  93. mlrun/frameworks/_common/plan.py +2 -3
  94. mlrun/frameworks/_common/producer.py +3 -4
  95. mlrun/frameworks/_common/utils.py +5 -5
  96. mlrun/frameworks/_dl_common/loggers/logger.py +6 -7
  97. mlrun/frameworks/_dl_common/loggers/mlrun_logger.py +9 -9
  98. mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +23 -47
  99. mlrun/frameworks/_ml_common/artifacts_library.py +1 -2
  100. mlrun/frameworks/_ml_common/loggers/logger.py +3 -4
  101. mlrun/frameworks/_ml_common/loggers/mlrun_logger.py +4 -5
  102. mlrun/frameworks/_ml_common/model_handler.py +24 -24
  103. mlrun/frameworks/_ml_common/pkl_model_server.py +2 -2
  104. mlrun/frameworks/_ml_common/plan.py +1 -1
  105. mlrun/frameworks/_ml_common/plans/calibration_curve_plan.py +2 -3
  106. mlrun/frameworks/_ml_common/plans/confusion_matrix_plan.py +2 -3
  107. mlrun/frameworks/_ml_common/plans/dataset_plan.py +3 -3
  108. mlrun/frameworks/_ml_common/plans/feature_importance_plan.py +3 -3
  109. mlrun/frameworks/_ml_common/plans/roc_curve_plan.py +4 -4
  110. mlrun/frameworks/_ml_common/utils.py +4 -4
  111. mlrun/frameworks/auto_mlrun/auto_mlrun.py +9 -9
  112. mlrun/frameworks/huggingface/model_server.py +4 -4
  113. mlrun/frameworks/lgbm/__init__.py +33 -33
  114. mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
  115. mlrun/frameworks/lgbm/callbacks/logging_callback.py +4 -5
  116. mlrun/frameworks/lgbm/callbacks/mlrun_logging_callback.py +4 -5
  117. mlrun/frameworks/lgbm/mlrun_interfaces/booster_mlrun_interface.py +1 -3
  118. mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +6 -6
  119. mlrun/frameworks/lgbm/model_handler.py +10 -10
  120. mlrun/frameworks/lgbm/model_server.py +6 -6
  121. mlrun/frameworks/lgbm/utils.py +5 -5
  122. mlrun/frameworks/onnx/dataset.py +8 -8
  123. mlrun/frameworks/onnx/mlrun_interface.py +3 -3
  124. mlrun/frameworks/onnx/model_handler.py +6 -6
  125. mlrun/frameworks/onnx/model_server.py +7 -7
  126. mlrun/frameworks/parallel_coordinates.py +4 -3
  127. mlrun/frameworks/pytorch/__init__.py +18 -18
  128. mlrun/frameworks/pytorch/callbacks/callback.py +4 -5
  129. mlrun/frameworks/pytorch/callbacks/logging_callback.py +17 -17
  130. mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +11 -11
  131. mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +23 -29
  132. mlrun/frameworks/pytorch/callbacks_handler.py +38 -38
  133. mlrun/frameworks/pytorch/mlrun_interface.py +20 -20
  134. mlrun/frameworks/pytorch/model_handler.py +17 -17
  135. mlrun/frameworks/pytorch/model_server.py +7 -7
  136. mlrun/frameworks/sklearn/__init__.py +13 -13
  137. mlrun/frameworks/sklearn/estimator.py +4 -4
  138. mlrun/frameworks/sklearn/metrics_library.py +14 -14
  139. mlrun/frameworks/sklearn/mlrun_interface.py +3 -6
  140. mlrun/frameworks/sklearn/model_handler.py +2 -2
  141. mlrun/frameworks/tf_keras/__init__.py +10 -7
  142. mlrun/frameworks/tf_keras/callbacks/logging_callback.py +15 -15
  143. mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +11 -11
  144. mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +19 -23
  145. mlrun/frameworks/tf_keras/mlrun_interface.py +9 -11
  146. mlrun/frameworks/tf_keras/model_handler.py +14 -14
  147. mlrun/frameworks/tf_keras/model_server.py +6 -6
  148. mlrun/frameworks/xgboost/__init__.py +13 -13
  149. mlrun/frameworks/xgboost/model_handler.py +6 -6
  150. mlrun/k8s_utils.py +14 -16
  151. mlrun/launcher/__init__.py +1 -1
  152. mlrun/launcher/base.py +16 -15
  153. mlrun/launcher/client.py +8 -6
  154. mlrun/launcher/factory.py +1 -1
  155. mlrun/launcher/local.py +17 -11
  156. mlrun/launcher/remote.py +16 -10
  157. mlrun/lists.py +7 -6
  158. mlrun/model.py +238 -73
  159. mlrun/model_monitoring/__init__.py +1 -1
  160. mlrun/model_monitoring/api.py +138 -315
  161. mlrun/model_monitoring/application.py +5 -296
  162. mlrun/model_monitoring/applications/__init__.py +24 -0
  163. mlrun/model_monitoring/applications/_application_steps.py +157 -0
  164. mlrun/model_monitoring/applications/base.py +282 -0
  165. mlrun/model_monitoring/applications/context.py +214 -0
  166. mlrun/model_monitoring/applications/evidently_base.py +211 -0
  167. mlrun/model_monitoring/applications/histogram_data_drift.py +349 -0
  168. mlrun/model_monitoring/applications/results.py +99 -0
  169. mlrun/model_monitoring/controller.py +104 -84
  170. mlrun/model_monitoring/controller_handler.py +13 -5
  171. mlrun/model_monitoring/db/__init__.py +18 -0
  172. mlrun/model_monitoring/{stores → db/stores}/__init__.py +43 -36
  173. mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
  174. mlrun/model_monitoring/{stores/model_endpoint_store.py → db/stores/base/store.py} +64 -40
  175. mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
  176. mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +71 -0
  177. mlrun/model_monitoring/{stores → db/stores/sqldb}/models/base.py +109 -5
  178. mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +88 -0
  179. mlrun/model_monitoring/{stores/models/mysql.py → db/stores/sqldb/models/sqlite.py} +19 -13
  180. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +684 -0
  181. mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
  182. mlrun/model_monitoring/{stores/kv_model_endpoint_store.py → db/stores/v3io_kv/kv_store.py} +310 -165
  183. mlrun/model_monitoring/db/tsdb/__init__.py +100 -0
  184. mlrun/model_monitoring/db/tsdb/base.py +329 -0
  185. mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
  186. mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
  187. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +240 -0
  188. mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +45 -0
  189. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +397 -0
  190. mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
  191. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +117 -0
  192. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +630 -0
  193. mlrun/model_monitoring/evidently_application.py +6 -118
  194. mlrun/model_monitoring/features_drift_table.py +134 -106
  195. mlrun/model_monitoring/helpers.py +127 -28
  196. mlrun/model_monitoring/metrics/__init__.py +13 -0
  197. mlrun/model_monitoring/metrics/histogram_distance.py +127 -0
  198. mlrun/model_monitoring/model_endpoint.py +3 -2
  199. mlrun/model_monitoring/prometheus.py +1 -4
  200. mlrun/model_monitoring/stream_processing.py +62 -231
  201. mlrun/model_monitoring/tracking_policy.py +9 -2
  202. mlrun/model_monitoring/writer.py +152 -124
  203. mlrun/package/__init__.py +6 -6
  204. mlrun/package/context_handler.py +5 -5
  205. mlrun/package/packager.py +7 -7
  206. mlrun/package/packagers/default_packager.py +6 -6
  207. mlrun/package/packagers/numpy_packagers.py +15 -15
  208. mlrun/package/packagers/pandas_packagers.py +5 -5
  209. mlrun/package/packagers/python_standard_library_packagers.py +10 -10
  210. mlrun/package/packagers_manager.py +19 -23
  211. mlrun/package/utils/_formatter.py +6 -6
  212. mlrun/package/utils/_pickler.py +2 -2
  213. mlrun/package/utils/_supported_format.py +4 -4
  214. mlrun/package/utils/log_hint_utils.py +2 -2
  215. mlrun/package/utils/type_hint_utils.py +4 -9
  216. mlrun/platforms/__init__.py +11 -10
  217. mlrun/platforms/iguazio.py +24 -203
  218. mlrun/projects/operations.py +35 -21
  219. mlrun/projects/pipelines.py +68 -99
  220. mlrun/projects/project.py +830 -266
  221. mlrun/render.py +3 -11
  222. mlrun/run.py +162 -166
  223. mlrun/runtimes/__init__.py +62 -7
  224. mlrun/runtimes/base.py +39 -32
  225. mlrun/runtimes/daskjob.py +8 -8
  226. mlrun/runtimes/databricks_job/databricks_cancel_task.py +1 -1
  227. mlrun/runtimes/databricks_job/databricks_runtime.py +7 -7
  228. mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
  229. mlrun/runtimes/funcdoc.py +0 -28
  230. mlrun/runtimes/function_reference.py +1 -1
  231. mlrun/runtimes/kubejob.py +28 -122
  232. mlrun/runtimes/local.py +6 -3
  233. mlrun/runtimes/mpijob/__init__.py +0 -20
  234. mlrun/runtimes/mpijob/abstract.py +9 -10
  235. mlrun/runtimes/mpijob/v1.py +1 -1
  236. mlrun/{model_monitoring/stores/models/sqlite.py → runtimes/nuclio/__init__.py} +7 -9
  237. mlrun/runtimes/nuclio/api_gateway.py +709 -0
  238. mlrun/runtimes/nuclio/application/__init__.py +15 -0
  239. mlrun/runtimes/nuclio/application/application.py +523 -0
  240. mlrun/runtimes/nuclio/application/reverse_proxy.go +95 -0
  241. mlrun/runtimes/{function.py → nuclio/function.py} +112 -73
  242. mlrun/runtimes/{nuclio.py → nuclio/nuclio.py} +6 -6
  243. mlrun/runtimes/{serving.py → nuclio/serving.py} +45 -51
  244. mlrun/runtimes/pod.py +286 -88
  245. mlrun/runtimes/remotesparkjob.py +2 -2
  246. mlrun/runtimes/sparkjob/spark3job.py +51 -34
  247. mlrun/runtimes/utils.py +7 -75
  248. mlrun/secrets.py +9 -5
  249. mlrun/serving/remote.py +2 -7
  250. mlrun/serving/routers.py +13 -10
  251. mlrun/serving/server.py +22 -26
  252. mlrun/serving/states.py +99 -25
  253. mlrun/serving/utils.py +3 -3
  254. mlrun/serving/v1_serving.py +6 -7
  255. mlrun/serving/v2_serving.py +59 -20
  256. mlrun/track/tracker.py +2 -1
  257. mlrun/track/tracker_manager.py +3 -3
  258. mlrun/track/trackers/mlflow_tracker.py +1 -2
  259. mlrun/utils/async_http.py +5 -7
  260. mlrun/utils/azure_vault.py +1 -1
  261. mlrun/utils/clones.py +1 -2
  262. mlrun/utils/condition_evaluator.py +3 -3
  263. mlrun/utils/db.py +3 -3
  264. mlrun/utils/helpers.py +183 -197
  265. mlrun/utils/http.py +2 -5
  266. mlrun/utils/logger.py +76 -14
  267. mlrun/utils/notifications/notification/__init__.py +17 -12
  268. mlrun/utils/notifications/notification/base.py +14 -2
  269. mlrun/utils/notifications/notification/console.py +2 -0
  270. mlrun/utils/notifications/notification/git.py +3 -1
  271. mlrun/utils/notifications/notification/ipython.py +3 -1
  272. mlrun/utils/notifications/notification/slack.py +101 -21
  273. mlrun/utils/notifications/notification/webhook.py +11 -1
  274. mlrun/utils/notifications/notification_pusher.py +155 -30
  275. mlrun/utils/retryer.py +208 -0
  276. mlrun/utils/singleton.py +1 -1
  277. mlrun/utils/v3io_clients.py +2 -4
  278. mlrun/utils/version/version.json +2 -2
  279. mlrun/utils/version/version.py +2 -6
  280. {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/METADATA +31 -19
  281. mlrun-1.7.0rc20.dist-info/RECORD +353 -0
  282. mlrun/kfpops.py +0 -868
  283. mlrun/model_monitoring/batch.py +0 -1095
  284. mlrun/model_monitoring/stores/models/__init__.py +0 -27
  285. mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -384
  286. mlrun/platforms/other.py +0 -306
  287. mlrun-1.6.4rc2.dist-info/RECORD +0 -314
  288. {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/LICENSE +0 -0
  289. {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/WHEEL +0 -0
  290. {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/entry_points.txt +0 -0
  291. {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/top_level.txt +0 -0
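
The expanded hunk below is the removal of mlrun/model_monitoring/batch.py (entry 283 above, +0 -1095). Judging by the file list, the module's responsibilities were split in 1.7.0: the histogram distance metrics move under mlrun/model_monitoring/metrics/histogram_distance.py, and the batch drift job is superseded by the model monitoring applications (e.g. mlrun/model_monitoring/applications/histogram_data_drift.py). As a minimal orientation sketch, assuming the metric classes keep the names used in the removed module (an inference from the file list, not confirmed by this diff):

import numpy as np

# Hypothetical 1.7.0 import path for the metrics formerly defined in batch.py:
from mlrun.model_monitoring.metrics.histogram_distance import (
    HellingerDistance,
    TotalVarianceDistance,
)

# Two normalized histograms (each must contain nonnegative floats summing to 1.0):
t = np.array([0.5, 0.3, 0.2])
u = np.array([0.2, 0.3, 0.5])
print(TotalVarianceDistance(distrib_t=t, distrib_u=u).compute())  # 0.3
print(HellingerDistance(distrib_t=t, distrib_u=u).compute())      # ~0.26
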
@@ -1,1095 +0,0 @@
-# Copyright 2023 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import abc
-import collections
-import dataclasses
-import datetime
-import json
-import os
-import re
-from typing import Any, ClassVar, Dict, List, Optional, Tuple, Type, Union
-from urllib.parse import urlparse
-
-import fsspec
-import numpy as np
-import pandas as pd
-import pyarrow
-import requests
-import v3io
-import v3io.dataplane
-import v3io_frames
-import v3iofs  # noqa: F401, required for V3IO file system with fsspec
-from v3io_frames.frames_pb2 import IGNORE
-
-import mlrun.common.helpers
-import mlrun.common.model_monitoring.helpers
-import mlrun.common.schemas.model_monitoring
-import mlrun.data_types.infer
-import mlrun.feature_store as fstore
-import mlrun.utils.v3io_clients
-from mlrun.utils import logger
-
-# A type for representing a drift result, a tuple of the status and the drift mean:
-DriftResultType = Tuple[mlrun.common.schemas.model_monitoring.DriftStatus, float]
-
-
-@dataclasses.dataclass
-class HistogramDistanceMetric(abc.ABC):
-    """
-    An abstract base class for distance metrics between histograms.
-
-    :args distrib_t: array of distribution t (usually the latest dataset distribution)
-    :args distrib_u: array of distribution u (usually the sample dataset distribution)
-
-    Each distribution must contain nonnegative floats that sum up to 1.0.
-    """
-
-    distrib_t: np.ndarray
-    distrib_u: np.ndarray
-
-    NAME: ClassVar[str]
-
-    # noinspection PyMethodOverriding
-    def __init_subclass__(cls, *, metric_name: str, **kwargs) -> None:
-        super().__init_subclass__(**kwargs)
-        cls.NAME = metric_name
-
-    @abc.abstractmethod
-    def compute(self) -> float:
-        raise NotImplementedError
-
-
-class TotalVarianceDistance(HistogramDistanceMetric, metric_name="tvd"):
-    """
-    Provides a symmetric drift distance between two periods t and u
-    Z - vector of random variables
-    Pt - Probability distribution over time span t
-    """
-
-    def compute(self) -> float:
-        """
-        Calculate Total Variance distance.
-
-        :returns: Total Variance Distance.
-        """
-        return np.sum(np.abs(self.distrib_t - self.distrib_u)) / 2
-
-
-class HellingerDistance(HistogramDistanceMetric, metric_name="hellinger"):
-    """
-    Hellinger distance is an f-divergence measure, similar to the Kullback-Leibler (KL) divergence.
-    It is used to quantify the difference between two probability distributions.
-    However, unlike the KL divergence, the Hellinger distance is symmetric and bounded over a probability space.
-    The output range of the Hellinger distance is [0, 1]. The closer to 0, the more similar the two distributions.
-    """
-
-    def compute(self) -> float:
-        """
-        Calculate the Hellinger Distance.
-
-        :returns: Hellinger Distance.
-        """
-        return np.sqrt(
-            max(
-                1 - np.sum(np.sqrt(self.distrib_u * self.distrib_t)),
-                0,  # numerical errors may produce small negative numbers, e.g. -1e-16.
-                # However, the Cauchy-Schwarz inequality assures this number is in the range [0, 1]
-            )
-        )
-
-
-class KullbackLeiblerDivergence(HistogramDistanceMetric, metric_name="kld"):
-    """
-    KL Divergence (or relative entropy) is a measure of how one probability distribution differs from another.
-    It is an asymmetric measure (thus it's not a metric) and it doesn't satisfy the triangle inequality.
-    A KL Divergence of 0 indicates two identical distributions.
-    """
-
-    @staticmethod
-    def _calc_kl_div(
-        actual_dist: np.array, expected_dist: np.array, kld_scaling: float
-    ) -> float:
-        """Return the asymmetric KL divergence"""
-        # We take 0*log(0) == 0 for this calculation
-        mask = actual_dist != 0
-        actual_dist = actual_dist[mask]
-        expected_dist = expected_dist[mask]
-        return np.sum(
-            actual_dist
-            * np.log(
-                actual_dist / np.where(expected_dist != 0, expected_dist, kld_scaling)
-            ),
-        )
-
-    def compute(
-        self, capping: Optional[float] = None, kld_scaling: float = 1e-4
-    ) -> float:
-        """
-        :param capping:     A bounded value for the KL Divergence. For infinite distance, the result is replaced with
-                            the capping value, which indicates a huge difference between the distributions.
-        :param kld_scaling: Used to replace zero values for executing the logarithmic operation.
-
-        :returns: symmetric KL Divergence
-        """
-        t_u = self._calc_kl_div(self.distrib_t, self.distrib_u, kld_scaling)
-        u_t = self._calc_kl_div(self.distrib_u, self.distrib_t, kld_scaling)
-        result = t_u + u_t
-        if capping and result == float("inf"):
-            return capping
-        return result
-
-
-class VirtualDrift:
-    """
-    Virtual Drift object is used for handling the drift calculations.
-    It contains the metrics objects and the related methods for the detection of potential drift.
-    """
-
-    def __init__(
-        self,
-        prediction_col: Optional[str] = None,
-        label_col: Optional[str] = None,
-        feature_weights: Optional[List[float]] = None,
-        inf_capping: Optional[float] = 10,
-    ):
-        """
-        Initialize a Virtual Drift object.
-
-        :param prediction_col:  The name of the dataframe column which represents the predictions of the model.
-                                If provided, it will be used for calculating drift over the predictions.
-        :param label_col:       The name of the dataframe column which represents the labels of the model.
-                                If provided, it will be used for calculating drift over the labels.
-        :param feature_weights: Weights that can be applied to the features and to be considered during the drift
-                                analysis.
-        :param inf_capping:     A bounded value for the results of the statistical metric. For example, when calculating
-                                KL divergence and getting infinite distance between the two distributions, the result
-                                will be replaced with the capping value.
-        """
-        self.prediction_col = prediction_col
-        self.label_col = label_col
-        self.feature_weights = feature_weights
-        self.capping = inf_capping
-
-        # Initialize objects of the current metrics
-        self.metrics: Dict[str, Type[HistogramDistanceMetric]] = {
-            metric_class.NAME: metric_class
-            for metric_class in (
-                TotalVarianceDistance,
-                HellingerDistance,
-                KullbackLeiblerDivergence,
-            )
-        }
-
-    @staticmethod
-    def dict_to_histogram(histogram_dict: Dict[str, Dict[str, Any]]) -> pd.DataFrame:
-        """
-        Convert a histogram dictionary to a pandas DataFrame with feature histograms as columns.
-
-        :param histogram_dict: Histogram dictionary
-
-        :returns: Histogram dataframe
-        """
-
-        # Create a dictionary with feature histograms as values
-        histograms = {}
-        for feature, stats in histogram_dict.items():
-            if "hist" in stats:
-                # Normalize to probability distribution of each feature
-                histograms[feature] = np.array(stats["hist"][0]) / stats["count"]
-
-        # Convert the dictionary to pandas DataFrame
-        histograms = pd.DataFrame(histograms)
-
-        return histograms
-
-    def compute_metrics_over_df(
-        self,
-        base_histogram: Dict[str, Dict[str, Any]],
-        latest_histogram: Dict[str, Dict[str, Any]],
-    ) -> Dict[str, Dict[str, Any]]:
-        """
-        Calculate metrics values for each feature.
-
-        For example:
-        {tvd: {feature_1: 0.001, feature_2: 0.2, ...}}
-
-        :param base_histogram:   Histogram dataframe that represents the distribution of the features from the original
-                                 training set.
-        :param latest_histogram: Histogram dataframe that represents the distribution of the features from the latest
-                                 input batch.
-
-        :returns: A dictionary in which for each metric (key) we assign the values for each feature.
-        """
-
-        # compute the different metrics for each feature distribution and store the results in a dictionary
-        drift_measures = {}
-        for metric_name, metric in self.metrics.items():
-            drift_measures[metric_name] = {
-                feature: metric(
-                    base_histogram.loc[:, feature], latest_histogram.loc[:, feature]
-                ).compute()
-                for feature in base_histogram
-            }
-
-        return drift_measures
-
-    def compute_drift_from_histograms(
-        self,
-        feature_stats: Dict[str, Dict[str, Any]],
-        current_stats: Dict[str, Dict[str, Any]],
-    ) -> Dict[str, Dict[str, Any]]:
-        """
-        Compare the distributions of both the original features data and the latest input data.
-        :param feature_stats: Histogram dictionary of the original feature dataset that was used in the model training.
-        :param current_stats: Histogram dictionary of the recent input data.
-
-        :returns: A dictionary that includes the drift results for each feature.
-
-        """
-
-        # convert histogram dictionaries to DataFrame of the histograms
-        # with feature histogram as cols
-        base_histogram = self.dict_to_histogram(feature_stats)
-        latest_histogram = self.dict_to_histogram(current_stats)
-
-        # verify all the features exist between datasets
-        base_features = set(base_histogram.columns)
-        latest_features = set(latest_histogram.columns)
-        features_common = list(base_features.intersection(latest_features))
-        feature_difference = list(base_features ^ latest_features)
-        if not features_common:
-            raise ValueError(
-                f"No common features found: {base_features} <> {latest_features}"
-            )
-
-        # drop columns of non-existent features
-        base_histogram = base_histogram.drop(
-            feature_difference, axis=1, errors="ignore"
-        )
-        latest_histogram = latest_histogram.drop(
-            feature_difference, axis=1, errors="ignore"
-        )
-
-        # compute the statistical metrics per feature
-        features_drift_measures = self.compute_metrics_over_df(
-            base_histogram.loc[:, features_common],
-            latest_histogram.loc[:, features_common],
-        )
-
-        # compute total value for each metric
-        for metric_name in self.metrics.keys():
-            feature_values = list(features_drift_measures[metric_name].values())
-            features_drift_measures[metric_name]["total_sum"] = np.sum(feature_values)
-            features_drift_measures[metric_name]["total_mean"] = np.mean(feature_values)
-
-            # add weighted mean by given feature weights if provided
-            if self.feature_weights:
-                features_drift_measures[metric_name]["total_weighted_mean"] = np.dot(
-                    feature_values, self.feature_weights
-                )
-
-        # define drift result dictionary with values as a dictionary
-        drift_result = collections.defaultdict(dict)
-
-        # fill drift result dictionary with the statistical metrics results per feature
-        # and the total sum and mean of each metric
-        for feature in features_common:
-            for metric, values in features_drift_measures.items():
-                drift_result[feature][metric] = values[feature]
-                metric_sum = features_drift_measures[metric]["total_sum"]
-                metric_mean = features_drift_measures[metric]["total_mean"]
-                drift_result[f"{metric}_sum"] = metric_sum
-                drift_result[f"{metric}_mean"] = metric_mean
-                if self.feature_weights:
-                    metric_measure = features_drift_measures[metric]
-                    weighted_mean = metric_measure["total_weighted_mean"]
-                    drift_result[f"{metric}_weighted_mean"] = weighted_mean
-
-        # compute the drift metric over the labels
-        if self.label_col:
-            label_drift_measures = self.compute_metrics_over_df(
-                base_histogram.loc[:, self.label_col],
-                latest_histogram.loc[:, self.label_col],
-            )
-            for metric, values in label_drift_measures.items():
-                drift_result[self.label_col][metric] = values[metric]
-
-        # compute the drift metric over the predictions
-        if self.prediction_col:
-            prediction_drift_measures = self.compute_metrics_over_df(
-                base_histogram.loc[:, self.prediction_col],
-                latest_histogram.loc[:, self.prediction_col],
-            )
-            for metric, values in prediction_drift_measures.items():
-                drift_result[self.prediction_col][metric] = values[metric]
-
-        return drift_result
-
-    @staticmethod
-    def check_for_drift_per_feature(
-        metrics_results_dictionary: Dict[str, Union[float, dict]],
-        possible_drift_threshold: float = 0.5,
-        drift_detected_threshold: float = 0.7,
-    ) -> Dict[str, DriftResultType]:
-        """
-        Check for drift based on the defined decision rule and the calculated results of the statistical metrics per
-        feature.
-
-        :param metrics_results_dictionary: Dictionary of statistical metrics results per feature and the total means of
-                                           all features.
-        :param possible_drift_threshold:   Threshold for the calculated result to be in a possible drift status.
-                                           Default: 0.5.
-        :param drift_detected_threshold:   Threshold for the calculated result to be in a drift detected status.
-                                           Default: 0.7.
-
-        :returns: A dictionary of all the features and their drift status and result tuples, a tuple of:
-                  [0] = Drift status enum based on the thresholds given.
-                  [1] = The drift result (float) based on the mean of the Total Variance Distance and the Hellinger
-                        distance.
-        """
-        # Initialize the drift results dictionary:
-        drift_results = {}
-
-        # Calculate the result per feature:
-        for feature, results in metrics_results_dictionary.items():
-            # A feature result must be a dictionary, otherwise it's the total mean (float):
-            if not isinstance(results, dict):
-                continue
-            # Calculate the feature's drift mean:
-            tvd = results[TotalVarianceDistance.NAME]
-            hellinger = results[HellingerDistance.NAME]
-            if tvd is None or hellinger is None:
-                logger.warning(
-                    "Can't calculate drift for this feature because at least one of the required "
-                    "statistical metrics is missing",
-                    feature=feature,
-                    tvd=tvd,
-                    hellinger=hellinger,
-                )
-                continue
-            feature_drift_mean = (tvd + hellinger) / 2
-            # Decision rule for drift detection:
-            drift_status = VirtualDrift._get_drift_status(
-                drift_result=feature_drift_mean,
-                possible_drift_threshold=possible_drift_threshold,
-                drift_detected_threshold=drift_detected_threshold,
-            )
-            # Collect the drift result:
-            drift_results[feature] = (drift_status, feature_drift_mean)
-
-        return drift_results
-
-    @staticmethod
-    def check_for_drift(
-        metrics_results_dictionary: Dict[str, Union[float, dict]],
-        possible_drift_threshold: float = 0.5,
-        drift_detected_threshold: float = 0.7,
-    ) -> DriftResultType:
-        """
-        Check for drift based on the defined decision rule and the calculated results of the statistical metrics by the
-        mean of all features.
-
-        :param metrics_results_dictionary: Dictionary of statistical metrics results per feature and the total means of
-                                           all features.
-        :param possible_drift_threshold:   Threshold for the calculated result to be in a possible drift status.
-                                           Default: 0.5.
-        :param drift_detected_threshold:   Threshold for the calculated result to be in a drift detected status.
-                                           Default: 0.7.
-
-        :returns: A tuple of:
-                  [0] = Drift status enum based on the thresholds given.
-                  [1] = The drift result (float) based on the mean of the Total Variance Distance and the Hellinger
-                        distance.
-        """
-        # Calculate the mean drift result:
-        tvd_mean = metrics_results_dictionary[f"{TotalVarianceDistance.NAME}_mean"]
-        hellinger_mean = metrics_results_dictionary.get(
-            f"{HellingerDistance.NAME}_mean"
-        )
-        drift_result = 0.0
-        if tvd_mean and hellinger_mean:
-            drift_result = (tvd_mean + hellinger_mean) / 2
-
-        # Decision rule for drift detection:
-        drift_status = VirtualDrift._get_drift_status(
-            drift_result=drift_result,
-            possible_drift_threshold=possible_drift_threshold,
-            drift_detected_threshold=drift_detected_threshold,
-        )
-
-        return drift_status, drift_result
-
-    @staticmethod
-    def _get_drift_status(
-        drift_result: float,
-        possible_drift_threshold: float,
-        drift_detected_threshold: float,
-    ) -> mlrun.common.schemas.model_monitoring.DriftStatus:
-        """
-        Get the drift status according to the result and thresholds given.
-
-        :param drift_result:             The drift result.
-        :param possible_drift_threshold: Threshold for the calculated result to be in a possible drift status.
-        :param drift_detected_threshold: Threshold for the calculated result to be in a drift detected status.
-
-        :returns: The resulting drift status.
-        """
-        drift_status = mlrun.common.schemas.model_monitoring.DriftStatus.NO_DRIFT
-        if drift_result >= drift_detected_threshold:
-            drift_status = (
-                mlrun.common.schemas.model_monitoring.DriftStatus.DRIFT_DETECTED
-            )
-        elif drift_result >= possible_drift_threshold:
-            drift_status = (
-                mlrun.common.schemas.model_monitoring.DriftStatus.POSSIBLE_DRIFT
-            )
-
-        return drift_status
-
-
-def calculate_inputs_statistics(
-    sample_set_statistics: dict, inputs: pd.DataFrame
-) -> dict:
-    """
-    Calculate the inputs data statistics for drift monitoring purposes.
-
-    :param sample_set_statistics: The sample set (stored end point's dataset to reference) statistics. The bins of the
-                                  histograms of each feature will be used to recalculate the histograms of the inputs.
-    :param inputs:                The inputs to calculate their statistics and later on - the drift with respect to the
-                                  sample set.
-
-    :returns: The calculated statistics of the inputs data.
-    """
-
-    # Use `DFDataInfer` to calculate the statistics over the inputs:
-    inputs_statistics = mlrun.data_types.infer.DFDataInfer.get_stats(
-        df=inputs,
-        options=mlrun.data_types.infer.InferOptions.Histogram,
-    )
-
-    # Recalculate the histograms over the bins that are set in the sample-set of the end point:
-    for feature in inputs_statistics.keys():
-        if feature in sample_set_statistics:
-            counts, bins = np.histogram(
-                inputs[feature].to_numpy(),
-                bins=sample_set_statistics[feature]["hist"][1],
-            )
-            inputs_statistics[feature]["hist"] = [
-                counts.tolist(),
-                bins.tolist(),
-            ]
-        elif "hist" in inputs_statistics[feature]:
-            # Comply with the other common features' histogram length
-            mlrun.common.model_monitoring.helpers.pad_hist(
-                mlrun.common.model_monitoring.helpers.Histogram(
-                    inputs_statistics[feature]["hist"]
-                )
-            )
-
-    return inputs_statistics
-
-
-class BatchProcessor:
-    """
-    The main object to handle the batch processing job. This object is used to get the required configurations and
-    to manage the main monitoring drift detection process based on the current batch.
-    Note that the BatchProcessor object requires access keys along with valid project configurations.
-    """
-
-    def __init__(
-        self,
-        context: mlrun.run.MLClientCtx,
-        project: str,
-    ):
-        """
-        Initialize a Batch Processor object.
-
-        :param context: An MLRun context.
-        :param project: Project name.
-        """
-        self.context = context
-        self.project = project
-
-        # Initialize virtual drift object
-        self.virtual_drift = VirtualDrift(inf_capping=10)
-
-        logger.info(
-            "Initializing BatchProcessor",
-            project=project,
-        )
-
-        # Get drift thresholds from the model monitoring configuration
-        # fmt: off
-        self.default_possible_drift_threshold = (
-            mlrun.mlconf.model_endpoint_monitoring.drift_thresholds.default.possible_drift
-        )
-        self.default_drift_detected_threshold = (
-            mlrun.mlconf.model_endpoint_monitoring.drift_thresholds.default.drift_detected
-        )
-        # fmt: on
-
-        # Get a runtime database
-
-        self.db = mlrun.model_monitoring.get_model_endpoint_store(project=project)
-
-        if not mlrun.mlconf.is_ce_mode():
-            # TODO: Once there is a time series DB alternative in a non-CE deployment, we need to update this if
-            #  statement to be applied only for V3IO TSDB
-            self._initialize_v3io_configurations()
-
-        # If an error occurs, it will be raised using the following argument
-        self.exception = None
-
-        # Get the batch interval range
-        self.batch_dict = context.parameters[
-            mlrun.common.schemas.model_monitoring.EventFieldType.BATCH_INTERVALS_DICT
-        ]
-
-        # TODO: This will be removed in 1.5.0 once the job params can be parsed with different types
-        # Convert batch dict string into a dictionary
-        if isinstance(self.batch_dict, str):
-            self._parse_batch_dict_str()
-
-        # If provided, only model endpoints in that list will be analyzed
-        self.model_endpoints = context.parameters.get(
-            mlrun.common.schemas.model_monitoring.EventFieldType.MODEL_ENDPOINTS, None
-        )
-
-    def _initialize_v3io_configurations(self):
-        self.v3io_access_key = os.environ.get("V3IO_ACCESS_KEY")
-        self.model_monitoring_access_key = (
-            os.environ.get("MODEL_MONITORING_ACCESS_KEY") or self.v3io_access_key
-        )
-
-        # Define the required paths for the project objects
-        tsdb_path = mlrun.mlconf.get_model_monitoring_file_target_path(
-            project=self.project,
-            kind=mlrun.common.schemas.model_monitoring.FileTargetKind.EVENTS,
-        )
-        (
-            _,
-            self.tsdb_container,
-            self.tsdb_path,
-        ) = mlrun.common.model_monitoring.helpers.parse_model_endpoint_store_prefix(
-            tsdb_path
-        )
-        # stream_path = template.format(project=self.project, kind="log_stream")
-        stream_path = mlrun.mlconf.get_model_monitoring_file_target_path(
-            project=self.project,
-            kind=mlrun.common.schemas.model_monitoring.FileTargetKind.LOG_STREAM,
-        )
-        (
-            _,
-            self.stream_container,
-            self.stream_path,
-        ) = mlrun.common.model_monitoring.helpers.parse_model_endpoint_store_prefix(
-            stream_path
-        )
-
-        # Get the frames clients based on the v3io configuration
-        # it will be used later for writing the results into the tsdb
-        self.v3io = mlrun.utils.v3io_clients.get_v3io_client(
-            access_key=self.v3io_access_key
-        )
-        self.frames = mlrun.utils.v3io_clients.get_frames_client(
-            address=mlrun.mlconf.v3io_framesd,
-            container=self.tsdb_container,
-            token=self.v3io_access_key,
-        )
-        logger.info(
-            "Creating table in TSDB if it does not already exist", table=self.tsdb_path
-        )
-        self.frames.create(
-            backend="tsdb",
-            table=self.tsdb_path,
-            if_exists=IGNORE,
-            rate="1/s",
-        )
-
-    def post_init(self):
-        """
-        Pre-processing for the batch job.
-        """
-
-        if not mlrun.mlconf.is_ce_mode():
-            # Create v3io stream based on the input stream
-            response = self.v3io.stream.create(
-                container=self.stream_container,
-                stream_path=self.stream_path,
-                shard_count=1,
-                raise_for_status=v3io.dataplane.RaiseForStatus.never,
-                access_key=self.v3io_access_key,
-            )
-
-            if not (
-                response.status_code == 400 and "ResourceInUse" in str(response.body)
-            ):
-                response.raise_for_status([409, 204, 403])
-        pass
-
-    def run(self):
-        """
-        Main method for managing the drift analysis and writing the results into the TSDB and KV table.
-        """
-        # Get model endpoints (each deployed project has at least 1 serving model):
-
-        try:
-            endpoints = self.db.list_model_endpoints(uids=self.model_endpoints)
-
-        except Exception as e:
-            logger.error("Failed to list endpoints", exc=e)
-            return
-
-        for endpoint in endpoints:
-            if (
-                endpoint[mlrun.common.schemas.model_monitoring.EventFieldType.ACTIVE]
-                and endpoint[
-                    mlrun.common.schemas.model_monitoring.EventFieldType.MONITORING_MODE
-                ]
-                == mlrun.common.schemas.model_monitoring.ModelMonitoringMode.enabled.value
-            ):
-                # Skip router endpoint:
-                if (
-                    int(
-                        endpoint[
-                            mlrun.common.schemas.model_monitoring.EventFieldType.ENDPOINT_TYPE
-                        ]
-                    )
-                    == mlrun.common.schemas.model_monitoring.EndpointType.ROUTER
-                ):
-                    # Router endpoint has no feature stats
-                    logger.info(
-                        f"{endpoint[mlrun.common.schemas.model_monitoring.EventFieldType.UID]} is a router endpoint, skipping"
-                    )
-                    continue
-                self.update_drift_metrics(endpoint=endpoint)
-
-    def update_drift_metrics(self, endpoint: dict):
-        try:
-            m_fs = fstore.get_feature_set(
-                endpoint[
-                    mlrun.common.schemas.model_monitoring.EventFieldType.FEATURE_SET_URI
-                ]
-            )
-
-            # Getting batch interval start time and end time
-            start_time, end_time = self._get_interval_range()
-
-            try:
-                df = m_fs.to_dataframe(
-                    start_time=start_time,
-                    end_time=end_time,
-                    time_column=mlrun.common.schemas.model_monitoring.EventFieldType.TIMESTAMP,
-                )
-
-                if len(df) == 0:
-                    logger.warn(
-                        "Not enough model events since the beginning of the batch interval",
-                        parquet_target=m_fs.status.targets[0].path,
-                        endpoint=endpoint[
-                            mlrun.common.schemas.model_monitoring.EventFieldType.UID
-                        ],
-                        min_required_events=mlrun.mlconf.model_endpoint_monitoring.parquet_batching_max_events,
-                        start_time=str(
-                            datetime.datetime.now() - datetime.timedelta(hours=1)
-                        ),
-                        end_time=str(datetime.datetime.now()),
-                    )
-                    return
-
-            # TODO: The below warn will be removed once the state of the Feature Store target is updated
-            #  as expected. In that case, the existence of the file will be checked before trying to get
-            #  the offline data from the feature set.
-            # Continue if not enough events provided since the deployment of the model endpoint
-            except FileNotFoundError:
-                logger.warn(
-                    "Parquet not found, probably due to not enough model events",
-                    parquet_target=m_fs.status.targets[0].path,
-                    endpoint=endpoint[
-                        mlrun.common.schemas.model_monitoring.EventFieldType.UID
-                    ],
-                    min_required_events=mlrun.mlconf.model_endpoint_monitoring.parquet_batching_max_events,
-                )
-                return
-
-            except pyarrow.ArrowInvalid:
-                target_path = m_fs.status.targets[0].path
-                fs = fsspec.filesystem(urlparse(target_path).scheme)
-                paths = fs.glob(target_path + "/**")
-                logger.warn(
-                    "Parquet found, but could not be read. Listing the files "
-                    "and folders in the model endpoint's parquet folder",
-                    target_path=target_path,
-                    paths=paths,
-                )
-                for path in paths:
-                    details = fs.listdir(path)
-                    logger.info("Path details", path=path, details=details)
-                raise
-
-            # Get feature names from monitoring feature set
-            feature_names = [
-                feature_name["name"] for feature_name in m_fs.spec.features.to_dict()
-            ]
-
-            # Create DataFrame based on the input features
-            stats_columns = [
-                mlrun.common.schemas.model_monitoring.EventFieldType.TIMESTAMP,
-                *feature_names,
-            ]
-
-            # Add label names if provided
-            if endpoint[
-                mlrun.common.schemas.model_monitoring.EventFieldType.LABEL_NAMES
-            ]:
-                labels = endpoint[
-                    mlrun.common.schemas.model_monitoring.EventFieldType.LABEL_NAMES
-                ]
-                if isinstance(labels, str):
-                    labels = json.loads(labels)
-                for label in labels:
-                    if label not in stats_columns:
-                        stats_columns.append(label)
-            named_features_df = df[stats_columns].copy()
-
-            # Infer feature set stats and schema
-            fstore.api._infer_from_static_df(
-                named_features_df,
-                m_fs,
-                options=mlrun.data_types.infer.InferOptions.all_stats(),
-            )
-
-            # Save feature set to apply changes
-            m_fs.save()
-
-            # Get the timestamp of the latest request:
-            timestamp = df[
-                mlrun.common.schemas.model_monitoring.EventFieldType.TIMESTAMP
-            ].iloc[-1]
-
-            # Get the feature stats from the model endpoint for reference data
-            feature_stats = json.loads(
-                endpoint[
-                    mlrun.common.schemas.model_monitoring.EventFieldType.FEATURE_STATS
-                ]
-            )
-            # Pad the original feature stats to accommodate current data out
-            # of the original range (unless already padded)
-            mlrun.common.model_monitoring.helpers.pad_features_hist(
-                mlrun.common.model_monitoring.helpers.FeatureStats(feature_stats)
-            )
-
-            # Get the current stats:
-            current_stats = calculate_inputs_statistics(
-                sample_set_statistics=feature_stats,
-                inputs=named_features_df,
-            )
-
-            # Compute the drift based on the histogram of the current stats and the histogram of the original
-            # feature stats that can be found in the model endpoint object:
-            drift_result = self.virtual_drift.compute_drift_from_histograms(
-                feature_stats=feature_stats,
-                current_stats=current_stats,
-            )
-            logger.info("Drift result", drift_result=drift_result)
-
-            # Get drift thresholds from the model configuration:
-            monitor_configuration = (
-                json.loads(
-                    endpoint[
-                        mlrun.common.schemas.model_monitoring.EventFieldType.MONITOR_CONFIGURATION
-                    ]
-                )
-                or {}
-            )
-
-            # For backwards compatibility, first check if the old drift threshold keys
-            # (both `possible_drift` and `drift_detected`) exist in the monitor configuration dict
-            # TODO: Remove the first get in 1.7.0
-            possible_drift = monitor_configuration.get(
-                "possible_drift",
-                monitor_configuration.get(
-                    mlrun.common.schemas.model_monitoring.EventFieldType.POSSIBLE_DRIFT_THRESHOLD,
-                    self.default_possible_drift_threshold,
-                ),
-            )
-
-            drift_detected = monitor_configuration.get(
-                "drift_detected",
-                monitor_configuration.get(
-                    mlrun.common.schemas.model_monitoring.EventFieldType.DRIFT_DETECTED_THRESHOLD,
-                    self.default_drift_detected_threshold,
-                ),
-            )
-
-            # Check for possible drift based on the results of the statistical metrics defined above:
-            drift_status, drift_measure = self.virtual_drift.check_for_drift(
-                metrics_results_dictionary=drift_result,
-                possible_drift_threshold=possible_drift,
-                drift_detected_threshold=drift_detected,
-            )
-            logger.info(
-                "Drift status",
-                endpoint_id=endpoint[
-                    mlrun.common.schemas.model_monitoring.EventFieldType.UID
-                ],
-                drift_status=drift_status.value,
-                drift_measure=drift_measure,
-            )
-
-            attributes = {
-                "current_stats": json.dumps(current_stats),
-                "drift_measures": json.dumps(drift_result),
-                "drift_status": drift_status.value,
-            }
-
-            self.db.update_model_endpoint(
-                endpoint_id=endpoint[
-                    mlrun.common.schemas.model_monitoring.EventFieldType.UID
-                ],
-                attributes=attributes,
-            )
-
-            if not mlrun.mlconf.is_ce_mode():
-                # Generate the V3IO KV schema if it does not exist
-                self._infer_kv_schema()
-
-                # Update drift results in TSDB
-                self._update_drift_in_v3io_tsdb(
-                    endpoint_id=endpoint[
-                        mlrun.common.schemas.model_monitoring.EventFieldType.UID
-                    ],
-                    drift_status=drift_status,
-                    drift_measure=drift_measure,
-                    drift_result=drift_result,
-                    timestamp=timestamp,
-                )
-
-            else:
-                # Update drift results in Prometheus
-                self._update_drift_in_prometheus(
-                    endpoint_id=endpoint[
-                        mlrun.common.schemas.model_monitoring.EventFieldType.UID
-                    ],
-                    drift_status=drift_status,
-                    drift_result=drift_result,
-                )
-
-        except Exception as e:
-            logger.error(
-                f"Exception for endpoint {endpoint[mlrun.common.schemas.model_monitoring.EventFieldType.UID]}"
-            )
-            self.exception = e
-        logger.info(
-            "Done updating drift measures",
-            endpoint_id=endpoint[
-                mlrun.common.schemas.model_monitoring.EventFieldType.UID
-            ],
-        )
-
-    def _get_interval_range(self) -> Tuple[datetime.datetime, datetime.datetime]:
-        """Getting batch interval time range"""
-        minutes, hours, days = (
-            self.batch_dict[
-                mlrun.common.schemas.model_monitoring.EventFieldType.MINUTES
-            ],
-            self.batch_dict[mlrun.common.schemas.model_monitoring.EventFieldType.HOURS],
-            self.batch_dict[mlrun.common.schemas.model_monitoring.EventFieldType.DAYS],
-        )
-        start_time = datetime.datetime.now() - datetime.timedelta(
-            minutes=minutes, hours=hours, days=days
-        )
-        end_time = datetime.datetime.now()
-        return start_time, end_time
-
-    def _parse_batch_dict_str(self):
-        """Convert batch dictionary string into a valid dictionary"""
-        characters_to_remove = "{} "
-        pattern = "[" + characters_to_remove + "]"
-        # Remove unnecessary characters from the provided string
-        batch_list = re.sub(pattern, "", self.batch_dict).split(",")
-        # Initialize the dictionary of batch interval ranges
-        self.batch_dict = {}
-        for pair in batch_list:
-            pair_list = pair.split(":")
-            self.batch_dict[pair_list[0]] = float(pair_list[1])
-
-    def _update_drift_in_v3io_tsdb(
-        self,
-        endpoint_id: str,
-        drift_status: mlrun.common.schemas.model_monitoring.DriftStatus,
-        drift_measure: float,
-        drift_result: Dict[str, Dict[str, Any]],
-        timestamp: pd.Timestamp,
-    ):
-        """Update drift results in input stream.
-
-        :param endpoint_id:   The unique id of the model endpoint.
-        :param drift_status:  Drift status result. Possible values can be found under DriftStatus enum class.
-        :param drift_measure: The drift result (float) based on the mean of the Total Variance Distance and the
-                              Hellinger distance.
-        :param drift_result:  A dictionary that includes the drift results for each feature.
-        :param timestamp:     Pandas Timestamp value.
-
-        """
-
-        if (
-            drift_status
-            == mlrun.common.schemas.model_monitoring.DriftStatus.POSSIBLE_DRIFT
-            or drift_status
-            == mlrun.common.schemas.model_monitoring.DriftStatus.DRIFT_DETECTED
-        ):
-            self.v3io.stream.put_records(
-                container=self.stream_container,
-                stream_path=self.stream_path,
-                records=[
-                    {
-                        "data": json.dumps(
-                            {
-                                "endpoint_id": endpoint_id,
-                                "drift_status": drift_status.value,
-                                "drift_measure": drift_measure,
-                                "drift_per_feature": {**drift_result},
-                            }
-                        )
-                    }
-                ],
-            )
-
-        # Update the results in tsdb:
-        tsdb_drift_measures = {
-            "endpoint_id": endpoint_id,
-            "timestamp": timestamp,
-            "record_type": "drift_measures",
-            "tvd_mean": drift_result["tvd_mean"],
-            "kld_mean": drift_result["kld_mean"],
-            "hellinger_mean": drift_result["hellinger_mean"],
-        }
-
-        try:
-            self.frames.write(
-                backend="tsdb",
-                table=self.tsdb_path,
-                dfs=pd.DataFrame.from_records([tsdb_drift_measures]),
-                index_cols=["timestamp", "endpoint_id", "record_type"],
-            )
-        except v3io_frames.errors.Error as err:
-            logger.warn(
-                "Could not write drift measures to TSDB",
-                err=err,
-                tsdb_path=self.tsdb_path,
-                endpoint=endpoint_id,
-            )
-
-    def _update_drift_in_prometheus(
-        self,
-        endpoint_id: str,
-        drift_status: mlrun.common.schemas.model_monitoring.DriftStatus,
-        drift_result: Dict[str, Dict[str, Any]],
-    ):
-        """Push drift metrics to the Prometheus registry. Please note that the metrics are pushed through HTTP
-        to the monitoring stream pod, which writes them into a local registry. Afterwards, Prometheus will scrape these
-        metrics, making them available in the Grafana charts.
-
-        :param endpoint_id:  The unique id of the model endpoint.
-        :param drift_status: Drift status result. Possible values can be found under DriftStatus enum class.
-        :param drift_result: A dictionary that includes the drift results for each feature.
-
-
-        """
-        stream_http_path = (
-            mlrun.mlconf.model_endpoint_monitoring.default_http_sink.format(
-                project=self.project, namespace=mlrun.mlconf.namespace
-            )
-        )
-
-        http_session = mlrun.utils.HTTPSessionWithRetry(
-            retry_on_post=True,
-            verbose=True,
-            max_retries=1,
-        )
-        try:
-            # Model monitoring stream http health check
-            http_session.request("GET", url=stream_http_path)
-
-            # Update statistical metrics
-            statistical_metrics = ["hellinger_mean", "tvd_mean", "kld_mean"]
-            metrics = []
-            for metric in statistical_metrics:
-                metrics.append(
-                    {
-                        mlrun.common.schemas.model_monitoring.EventFieldType.ENDPOINT_ID: endpoint_id,
-                        mlrun.common.schemas.model_monitoring.EventFieldType.METRIC: metric,
-                        mlrun.common.schemas.model_monitoring.EventFieldType.VALUE: drift_result[
-                            metric
-                        ],
-                    }
-                )
-
-            http_session.request(
-                method="POST",
-                url=stream_http_path + "/monitoring-batch-metrics",
-                data=json.dumps(metrics),
-            )
-
-            # Update drift status
-            drift_status_dict = {
-                mlrun.common.schemas.model_monitoring.EventFieldType.ENDPOINT_ID: endpoint_id,
-                mlrun.common.schemas.model_monitoring.EventFieldType.DRIFT_STATUS: drift_status.value,
-            }
-
-            http_session.request(
-                method="POST",
-                url=stream_http_path + "/monitoring-drift-status",
-                data=json.dumps(drift_status_dict),
-            )
-
-        except requests.exceptions.ConnectionError as exc:
-            logger.warning(
-                "Can't push metrics to Prometheus registry. "
-                "Monitoring stream pod is not found, probably not deployed. "
-                "To deploy, call set_tracking() on a serving function.",
-                exc=exc,
-            )
-
-    def _infer_kv_schema(self):
-        """
-        Create the KV schema file if it does not exist. This schema is used by the Grafana dashboards.
-        """
-
-        schema_file = self.db.client.kv.new_cursor(
-            container=self.db.container,
-            table_path=self.db.path,
-            filter_expression='__name==".#schema"',
-        )
-
-        if not schema_file.all():
-            logger.info(
-                "Generate a new V3IO KV schema file", kv_table_path=self.db.path
-            )
-            self.frames.execute(
-                backend="kv", table=self.db.path, command="infer_schema"
-            )
-
-
-def handler(context: mlrun.run.MLClientCtx):
-    batch_processor = BatchProcessor(
-        context=context,
-        project=context.project,
-    )
-    batch_processor.post_init()
-    batch_processor.run()
-    if batch_processor.exception:
-        raise batch_processor.exception
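
For reference, a minimal usage sketch of the drift API that the removed module exposed, as it worked before this release (mlrun <= 1.6.x only; the toy histogram dictionaries follow the {"hist": [counts, bin_edges], "count": total} shape that dict_to_histogram expects):

from mlrun.model_monitoring.batch import VirtualDrift

# Toy reference (training-time) and current (latest batch) statistics for one feature:
feature_stats = {"f1": {"hist": [[5, 3, 2], [0.0, 1.0, 2.0, 3.0]], "count": 10}}
current_stats = {"f1": {"hist": [[2, 3, 5], [0.0, 1.0, 2.0, 3.0]], "count": 10}}

virtual_drift = VirtualDrift(inf_capping=10)

# Per-feature TVD / Hellinger / KLD values, plus <metric>_sum and <metric>_mean totals:
drift_result = virtual_drift.compute_drift_from_histograms(
    feature_stats=feature_stats,
    current_stats=current_stats,
)

# Decision rule over the mean of the TVD and Hellinger means
# (default thresholds: possible drift >= 0.5, drift detected >= 0.7):
drift_status, drift_measure = virtual_drift.check_for_drift(
    metrics_results_dictionary=drift_result,
)
print(drift_status, drift_measure)  # DriftStatus.NO_DRIFT, ~0.28 for this toy data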