mlrun 1.7.0rc4__py3-none-any.whl → 1.7.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mlrun might be problematic. Click here for more details.

Files changed (235) hide show
  1. mlrun/__init__.py +11 -1
  2. mlrun/__main__.py +39 -121
  3. mlrun/{datastore/helpers.py → alerts/__init__.py} +2 -5
  4. mlrun/alerts/alert.py +248 -0
  5. mlrun/api/schemas/__init__.py +4 -3
  6. mlrun/artifacts/__init__.py +8 -3
  7. mlrun/artifacts/base.py +39 -254
  8. mlrun/artifacts/dataset.py +9 -190
  9. mlrun/artifacts/manager.py +73 -46
  10. mlrun/artifacts/model.py +30 -158
  11. mlrun/artifacts/plots.py +23 -380
  12. mlrun/common/constants.py +73 -1
  13. mlrun/common/db/sql_session.py +3 -2
  14. mlrun/common/formatters/__init__.py +21 -0
  15. mlrun/common/formatters/artifact.py +46 -0
  16. mlrun/common/formatters/base.py +113 -0
  17. mlrun/common/formatters/feature_set.py +44 -0
  18. mlrun/common/formatters/function.py +46 -0
  19. mlrun/common/formatters/pipeline.py +53 -0
  20. mlrun/common/formatters/project.py +51 -0
  21. mlrun/common/formatters/run.py +29 -0
  22. mlrun/common/helpers.py +11 -1
  23. mlrun/{runtimes → common/runtimes}/constants.py +32 -4
  24. mlrun/common/schemas/__init__.py +31 -4
  25. mlrun/common/schemas/alert.py +202 -0
  26. mlrun/common/schemas/api_gateway.py +196 -0
  27. mlrun/common/schemas/artifact.py +28 -1
  28. mlrun/common/schemas/auth.py +13 -2
  29. mlrun/common/schemas/client_spec.py +2 -1
  30. mlrun/common/schemas/common.py +7 -4
  31. mlrun/common/schemas/constants.py +3 -0
  32. mlrun/common/schemas/feature_store.py +58 -28
  33. mlrun/common/schemas/frontend_spec.py +8 -0
  34. mlrun/common/schemas/function.py +11 -0
  35. mlrun/common/schemas/hub.py +7 -9
  36. mlrun/common/schemas/model_monitoring/__init__.py +21 -4
  37. mlrun/common/schemas/model_monitoring/constants.py +136 -42
  38. mlrun/common/schemas/model_monitoring/grafana.py +9 -5
  39. mlrun/common/schemas/model_monitoring/model_endpoints.py +89 -41
  40. mlrun/common/schemas/notification.py +69 -12
  41. mlrun/{runtimes/mpijob/v1alpha1.py → common/schemas/pagination.py} +10 -13
  42. mlrun/common/schemas/pipeline.py +7 -0
  43. mlrun/common/schemas/project.py +67 -16
  44. mlrun/common/schemas/runs.py +17 -0
  45. mlrun/common/schemas/schedule.py +1 -1
  46. mlrun/common/schemas/workflow.py +10 -2
  47. mlrun/common/types.py +14 -1
  48. mlrun/config.py +233 -58
  49. mlrun/data_types/data_types.py +11 -1
  50. mlrun/data_types/spark.py +5 -4
  51. mlrun/data_types/to_pandas.py +75 -34
  52. mlrun/datastore/__init__.py +8 -10
  53. mlrun/datastore/alibaba_oss.py +131 -0
  54. mlrun/datastore/azure_blob.py +131 -43
  55. mlrun/datastore/base.py +107 -47
  56. mlrun/datastore/datastore.py +17 -7
  57. mlrun/datastore/datastore_profile.py +91 -7
  58. mlrun/datastore/dbfs_store.py +3 -7
  59. mlrun/datastore/filestore.py +1 -3
  60. mlrun/datastore/google_cloud_storage.py +92 -32
  61. mlrun/datastore/hdfs.py +5 -0
  62. mlrun/datastore/inmem.py +6 -3
  63. mlrun/datastore/redis.py +3 -2
  64. mlrun/datastore/s3.py +30 -12
  65. mlrun/datastore/snowflake_utils.py +45 -0
  66. mlrun/datastore/sources.py +274 -59
  67. mlrun/datastore/spark_utils.py +30 -0
  68. mlrun/datastore/store_resources.py +9 -7
  69. mlrun/datastore/storeytargets.py +151 -0
  70. mlrun/datastore/targets.py +387 -119
  71. mlrun/datastore/utils.py +68 -5
  72. mlrun/datastore/v3io.py +28 -50
  73. mlrun/db/auth_utils.py +152 -0
  74. mlrun/db/base.py +245 -20
  75. mlrun/db/factory.py +1 -4
  76. mlrun/db/httpdb.py +909 -231
  77. mlrun/db/nopdb.py +279 -14
  78. mlrun/errors.py +35 -5
  79. mlrun/execution.py +111 -38
  80. mlrun/feature_store/__init__.py +0 -2
  81. mlrun/feature_store/api.py +46 -53
  82. mlrun/feature_store/common.py +6 -11
  83. mlrun/feature_store/feature_set.py +48 -23
  84. mlrun/feature_store/feature_vector.py +13 -2
  85. mlrun/feature_store/ingestion.py +7 -6
  86. mlrun/feature_store/retrieval/base.py +9 -4
  87. mlrun/feature_store/retrieval/dask_merger.py +2 -0
  88. mlrun/feature_store/retrieval/job.py +13 -4
  89. mlrun/feature_store/retrieval/local_merger.py +2 -0
  90. mlrun/feature_store/retrieval/spark_merger.py +24 -32
  91. mlrun/feature_store/steps.py +38 -19
  92. mlrun/features.py +6 -14
  93. mlrun/frameworks/_common/plan.py +3 -3
  94. mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +7 -12
  95. mlrun/frameworks/_ml_common/plan.py +1 -1
  96. mlrun/frameworks/auto_mlrun/auto_mlrun.py +2 -2
  97. mlrun/frameworks/lgbm/__init__.py +1 -1
  98. mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
  99. mlrun/frameworks/lgbm/model_handler.py +1 -1
  100. mlrun/frameworks/parallel_coordinates.py +4 -4
  101. mlrun/frameworks/pytorch/__init__.py +2 -2
  102. mlrun/frameworks/sklearn/__init__.py +1 -1
  103. mlrun/frameworks/sklearn/mlrun_interface.py +13 -3
  104. mlrun/frameworks/tf_keras/__init__.py +5 -2
  105. mlrun/frameworks/tf_keras/callbacks/logging_callback.py +1 -1
  106. mlrun/frameworks/tf_keras/mlrun_interface.py +2 -2
  107. mlrun/frameworks/xgboost/__init__.py +1 -1
  108. mlrun/k8s_utils.py +57 -12
  109. mlrun/launcher/__init__.py +1 -1
  110. mlrun/launcher/base.py +6 -5
  111. mlrun/launcher/client.py +13 -11
  112. mlrun/launcher/factory.py +1 -1
  113. mlrun/launcher/local.py +15 -5
  114. mlrun/launcher/remote.py +10 -3
  115. mlrun/lists.py +6 -2
  116. mlrun/model.py +297 -48
  117. mlrun/model_monitoring/__init__.py +1 -1
  118. mlrun/model_monitoring/api.py +152 -357
  119. mlrun/model_monitoring/applications/__init__.py +10 -0
  120. mlrun/model_monitoring/applications/_application_steps.py +190 -0
  121. mlrun/model_monitoring/applications/base.py +108 -0
  122. mlrun/model_monitoring/applications/context.py +341 -0
  123. mlrun/model_monitoring/{evidently_application.py → applications/evidently_base.py} +27 -22
  124. mlrun/model_monitoring/applications/histogram_data_drift.py +227 -91
  125. mlrun/model_monitoring/applications/results.py +99 -0
  126. mlrun/model_monitoring/controller.py +130 -303
  127. mlrun/model_monitoring/{stores/models/sqlite.py → db/__init__.py} +5 -10
  128. mlrun/model_monitoring/db/stores/__init__.py +136 -0
  129. mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
  130. mlrun/model_monitoring/db/stores/base/store.py +213 -0
  131. mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
  132. mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +71 -0
  133. mlrun/model_monitoring/db/stores/sqldb/models/base.py +190 -0
  134. mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +103 -0
  135. mlrun/model_monitoring/{stores/models/mysql.py → db/stores/sqldb/models/sqlite.py} +19 -13
  136. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +659 -0
  137. mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
  138. mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +726 -0
  139. mlrun/model_monitoring/db/tsdb/__init__.py +105 -0
  140. mlrun/model_monitoring/db/tsdb/base.py +448 -0
  141. mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
  142. mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
  143. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +298 -0
  144. mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +42 -0
  145. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +522 -0
  146. mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
  147. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +158 -0
  148. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +849 -0
  149. mlrun/model_monitoring/features_drift_table.py +34 -22
  150. mlrun/model_monitoring/helpers.py +177 -39
  151. mlrun/model_monitoring/model_endpoint.py +3 -2
  152. mlrun/model_monitoring/stream_processing.py +165 -398
  153. mlrun/model_monitoring/tracking_policy.py +7 -1
  154. mlrun/model_monitoring/writer.py +161 -125
  155. mlrun/package/packagers/default_packager.py +2 -2
  156. mlrun/package/packagers_manager.py +1 -0
  157. mlrun/package/utils/_formatter.py +2 -2
  158. mlrun/platforms/__init__.py +11 -10
  159. mlrun/platforms/iguazio.py +67 -228
  160. mlrun/projects/__init__.py +6 -1
  161. mlrun/projects/operations.py +47 -20
  162. mlrun/projects/pipelines.py +396 -249
  163. mlrun/projects/project.py +1176 -406
  164. mlrun/render.py +28 -22
  165. mlrun/run.py +208 -181
  166. mlrun/runtimes/__init__.py +76 -11
  167. mlrun/runtimes/base.py +54 -24
  168. mlrun/runtimes/daskjob.py +9 -2
  169. mlrun/runtimes/databricks_job/databricks_runtime.py +1 -0
  170. mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
  171. mlrun/runtimes/funcdoc.py +1 -29
  172. mlrun/runtimes/kubejob.py +34 -128
  173. mlrun/runtimes/local.py +39 -10
  174. mlrun/runtimes/mpijob/__init__.py +0 -20
  175. mlrun/runtimes/mpijob/abstract.py +8 -8
  176. mlrun/runtimes/mpijob/v1.py +1 -1
  177. mlrun/runtimes/nuclio/__init__.py +1 -0
  178. mlrun/runtimes/nuclio/api_gateway.py +769 -0
  179. mlrun/runtimes/nuclio/application/__init__.py +15 -0
  180. mlrun/runtimes/nuclio/application/application.py +758 -0
  181. mlrun/runtimes/nuclio/application/reverse_proxy.go +95 -0
  182. mlrun/runtimes/nuclio/function.py +188 -68
  183. mlrun/runtimes/nuclio/serving.py +57 -60
  184. mlrun/runtimes/pod.py +191 -58
  185. mlrun/runtimes/remotesparkjob.py +11 -8
  186. mlrun/runtimes/sparkjob/spark3job.py +17 -18
  187. mlrun/runtimes/utils.py +40 -73
  188. mlrun/secrets.py +6 -2
  189. mlrun/serving/__init__.py +8 -1
  190. mlrun/serving/remote.py +2 -3
  191. mlrun/serving/routers.py +89 -64
  192. mlrun/serving/server.py +54 -26
  193. mlrun/serving/states.py +187 -56
  194. mlrun/serving/utils.py +19 -11
  195. mlrun/serving/v2_serving.py +136 -63
  196. mlrun/track/tracker.py +2 -1
  197. mlrun/track/trackers/mlflow_tracker.py +5 -0
  198. mlrun/utils/async_http.py +26 -6
  199. mlrun/utils/db.py +18 -0
  200. mlrun/utils/helpers.py +375 -105
  201. mlrun/utils/http.py +2 -2
  202. mlrun/utils/logger.py +75 -9
  203. mlrun/utils/notifications/notification/__init__.py +14 -10
  204. mlrun/utils/notifications/notification/base.py +48 -0
  205. mlrun/utils/notifications/notification/console.py +2 -0
  206. mlrun/utils/notifications/notification/git.py +24 -1
  207. mlrun/utils/notifications/notification/ipython.py +2 -0
  208. mlrun/utils/notifications/notification/slack.py +96 -21
  209. mlrun/utils/notifications/notification/webhook.py +63 -2
  210. mlrun/utils/notifications/notification_pusher.py +146 -16
  211. mlrun/utils/regex.py +9 -0
  212. mlrun/utils/retryer.py +3 -2
  213. mlrun/utils/v3io_clients.py +2 -3
  214. mlrun/utils/version/version.json +2 -2
  215. mlrun-1.7.2.dist-info/METADATA +390 -0
  216. mlrun-1.7.2.dist-info/RECORD +351 -0
  217. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.2.dist-info}/WHEEL +1 -1
  218. mlrun/feature_store/retrieval/conversion.py +0 -271
  219. mlrun/kfpops.py +0 -868
  220. mlrun/model_monitoring/application.py +0 -310
  221. mlrun/model_monitoring/batch.py +0 -974
  222. mlrun/model_monitoring/controller_handler.py +0 -37
  223. mlrun/model_monitoring/prometheus.py +0 -216
  224. mlrun/model_monitoring/stores/__init__.py +0 -111
  225. mlrun/model_monitoring/stores/kv_model_endpoint_store.py +0 -574
  226. mlrun/model_monitoring/stores/model_endpoint_store.py +0 -145
  227. mlrun/model_monitoring/stores/models/__init__.py +0 -27
  228. mlrun/model_monitoring/stores/models/base.py +0 -84
  229. mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -382
  230. mlrun/platforms/other.py +0 -305
  231. mlrun-1.7.0rc4.dist-info/METADATA +0 -269
  232. mlrun-1.7.0rc4.dist-info/RECORD +0 -321
  233. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.2.dist-info}/LICENSE +0 -0
  234. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.2.dist-info}/entry_points.txt +0 -0
  235. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.2.dist-info}/top_level.txt +0 -0
@@ -1,974 +0,0 @@
1
- # Copyright 2023 Iguazio
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
-
15
- import collections
16
- import datetime
17
- import json
18
- import os
19
- import re
20
- from typing import Any, Optional, Union
21
-
22
- import numpy as np
23
- import pandas as pd
24
- import requests
25
- import v3io
26
- import v3io.dataplane
27
- import v3io_frames
28
- from v3io_frames.frames_pb2 import IGNORE
29
-
30
- import mlrun.common.helpers
31
- import mlrun.common.model_monitoring.helpers
32
- import mlrun.common.schemas.model_monitoring
33
- import mlrun.data_types.infer
34
- import mlrun.feature_store as fstore
35
- import mlrun.utils.v3io_clients
36
- from mlrun.model_monitoring.metrics.histogram_distance import (
37
- HellingerDistance,
38
- HistogramDistanceMetric,
39
- KullbackLeiblerDivergence,
40
- TotalVarianceDistance,
41
- )
42
- from mlrun.utils import logger
43
-
44
- # A type for representing a drift result, a tuple of the status and the drift mean:
45
- DriftResultType = tuple[mlrun.common.schemas.model_monitoring.DriftStatus, float]
46
-
47
-
48
class VirtualDrift:
    """
    Virtual Drift object is used for handling the drift calculations.
    It holds the histogram distance metric classes (total variance distance, Hellinger distance and
    Kullback-Leibler divergence) and the methods that turn their per-feature results into drift statuses.
    """

    def __init__(
        self,
        prediction_col: Optional[str] = None,
        label_col: Optional[str] = None,
        feature_weights: Optional[list[float]] = None,
        inf_capping: Optional[float] = 10,
    ):
        """
        Initialize a Virtual Drift object.

        :param prediction_col:  The name of the dataframe column which represents the predictions of the model. If
                                provided, it will be used for calculating drift over the predictions.
        :param label_col:       The name of the dataframe column which represents the labels of the model. If
                                provided, it will be used for calculating drift over the labels.
        :param feature_weights: Weights that can be applied to the features and to be considered during the drift
                                analysis. They are matched positionally to the common features, in the column order
                                of the reference (training) histograms.
        :param inf_capping:     A bounded value for the results of the statistical metric. For example, when
                                calculating KL divergence and getting infinite distance between the two
                                distributions, the result will be replaced with the capping value.
        """
        self.prediction_col = prediction_col
        self.label_col = label_col
        self.feature_weights = feature_weights
        # NOTE(review): the capping value is stored but not read by any method of this class - confirm whether
        # the metric classes are expected to receive it.
        self.capping = inf_capping

        # Map each metric name to its class; the classes are instantiated per feature when computing the drift
        self.metrics: dict[str, type[HistogramDistanceMetric]] = {
            metric_class.NAME: metric_class
            for metric_class in (
                TotalVarianceDistance,
                HellingerDistance,
                KullbackLeiblerDivergence,
            )
        }

    @staticmethod
    def dict_to_histogram(histogram_dict: dict[str, dict[str, Any]]) -> pd.DataFrame:
        """
        Convert histogram dictionary to pandas DataFrame with feature histograms as columns

        :param histogram_dict: Histogram dictionary. Only entries that contain a "hist" key are converted; each of
                               them must also contain a "count" key.

        :returns: Histogram dataframe in which every column is the feature's histogram counts divided by its count.
        """
        # Normalize each feature's histogram to a probability distribution and skip features without a histogram
        return pd.DataFrame(
            {
                feature: np.array(stats["hist"][0]) / stats["count"]
                for feature, stats in histogram_dict.items()
                if "hist" in stats
            }
        )

    def compute_metrics_over_df(
        self,
        base_histogram: pd.DataFrame,
        latest_histogram: pd.DataFrame,
    ) -> dict[str, dict[str, Any]]:
        """
        Calculate metrics values for each feature.

        For example:
        {tvd: {feature_1: 0.001, feature_2: 0.2, ...}}

        :param base_histogram:   Histogram dataframe that represents the distribution of the features from the
                                 original training set.
        :param latest_histogram: Histogram dataframe that represents the distribution of the features from the
                                 latest input batch. Must contain every column of `base_histogram`.

        :returns: A dictionary in which for each metric (key) we assign the values for each feature.
        """
        return {
            metric_name: {
                feature: metric_class(
                    base_histogram.loc[:, feature], latest_histogram.loc[:, feature]
                ).compute()
                for feature in base_histogram
            }
            for metric_name, metric_class in self.metrics.items()
        }

    def compute_drift_from_histograms(
        self,
        feature_stats: dict[str, dict[str, Any]],
        current_stats: dict[str, dict[str, Any]],
    ) -> dict[str, dict[str, Any]]:
        """
        Compare the distributions of both the original features data and the latest input data

        :param feature_stats: Histogram dictionary of the original feature dataset that was used in the model training.
        :param current_stats: Histogram dictionary of the recent input data

        :returns: A dictionary that includes, for each common feature, a dictionary of the metrics results, and for
                  each metric the keys "<metric>_sum", "<metric>_mean" and - when feature weights were provided -
                  "<metric>_weighted_mean".

        :raises ValueError: If the two histogram dictionaries share no feature.
        :raises KeyError:   If `label_col` or `prediction_col` was set but is not one of the common features.
        """
        # Convert histogram dictionaries to DataFrames with the feature histograms as columns
        base_histogram = self.dict_to_histogram(feature_stats)
        latest_histogram = self.dict_to_histogram(current_stats)

        # Verify that the datasets share features. The common features keep the column order of the reference
        # histograms so that `feature_weights` are always matched to the same features.
        base_features = set(base_histogram.columns)
        latest_features = set(latest_histogram.columns)
        features_common = [
            feature for feature in base_histogram.columns if feature in latest_features
        ]
        if not features_common:
            raise ValueError(
                f"No common features found: {base_features} <> {latest_features}"
            )

        # Keep only the columns of the features that exist in both datasets
        base_histogram = base_histogram.loc[:, features_common]
        latest_histogram = latest_histogram.loc[:, features_common]

        # Compute the statistical metrics per feature
        features_drift_measures = self.compute_metrics_over_df(
            base_histogram, latest_histogram
        )

        # Define drift result dictionary with values as a dictionary
        drift_result = collections.defaultdict(dict)

        for metric_name, measures in features_drift_measures.items():
            feature_values = [measures[feature] for feature in features_common]

            # The per-feature results of the current metric
            for feature, value in zip(features_common, feature_values):
                drift_result[feature][metric_name] = value

            # The totals of the current metric over all the common features
            drift_result[f"{metric_name}_sum"] = np.sum(feature_values)
            drift_result[f"{metric_name}_mean"] = np.mean(feature_values)
            if self.feature_weights:
                # Dot product of the per-feature values and the given weights
                drift_result[f"{metric_name}_weighted_mean"] = np.dot(
                    feature_values, self.feature_weights
                )

        # Compute the drift metrics over the labels and then over the predictions. The column is selected with a
        # list so a single-column DataFrame (and not a Series) is passed on, and the result is read by the column
        # name, which is the key `compute_metrics_over_df` uses for each feature.
        for column in (self.label_col, self.prediction_col):
            if not column:
                continue
            column_drift_measures = self.compute_metrics_over_df(
                base_histogram.loc[:, [column]],
                latest_histogram.loc[:, [column]],
            )
            for metric_name, values in column_drift_measures.items():
                drift_result[column][metric_name] = values[column]

        return drift_result

    @staticmethod
    def check_for_drift_per_feature(
        metrics_results_dictionary: dict[str, Union[float, dict]],
        possible_drift_threshold: float = 0.5,
        drift_detected_threshold: float = 0.7,
    ) -> dict[str, DriftResultType]:
        """
        Check for drift based on the defined decision rule and the calculated results of the statistical metrics per
        feature.

        :param metrics_results_dictionary: Dictionary of statistical metrics results per feature and the total means of
                                           all features.
        :param possible_drift_threshold:   Threshold for the calculated result to be in a possible drift status.
                                           Default: 0.5.
        :param drift_detected_threshold:   Threshold for the calculated result to be in a drift detected status.
                                           Default: 0.7.

        :returns: A dictionary of all the features and their drift status and results tuples, tuple of:
                  [0] = Drift status enum based on the thresholds given.
                  [1] = The drift result (float) based on the mean of the Total Variance Distance and the Hellinger
                        distance.
                  Features that miss one of these two metrics are logged and left out of the dictionary.
        """
        drift_results = {}

        for feature, results in metrics_results_dictionary.items():
            # A feature result must be a dictionary, otherwise it's a total of a metric (float):
            if not isinstance(results, dict):
                continue

            # A metric that is absent is treated the same as a metric whose value is None:
            tvd = results.get(TotalVarianceDistance.NAME)
            hellinger = results.get(HellingerDistance.NAME)
            if tvd is None or hellinger is None:
                logger.warning(
                    "Can't calculate drift for this feature because at least one of the required "
                    "statistical metrics is missing",
                    feature=feature,
                    tvd=tvd,
                    hellinger=hellinger,
                )
                continue

            # The feature's drift result is the mean of the two metrics:
            feature_drift_result = (tvd + hellinger) / 2

            # Decision rule for drift detection:
            drift_status = VirtualDrift._get_drift_status(
                drift_result=feature_drift_result,
                possible_drift_threshold=possible_drift_threshold,
                drift_detected_threshold=drift_detected_threshold,
            )
            drift_results[feature] = (drift_status, feature_drift_result)

        return drift_results

    @staticmethod
    def check_for_drift(
        metrics_results_dictionary: dict[str, Union[float, dict]],
        possible_drift_threshold: float = 0.5,
        drift_detected_threshold: float = 0.7,
    ) -> DriftResultType:
        """
        Check for drift based on the defined decision rule and the calculated results of the statistical metrics by the
        mean of all features.

        :param metrics_results_dictionary: Dictionary of statistical metrics results per feature and the total means of
                                           all features.
        :param possible_drift_threshold:   Threshold for the calculated result to be in a possible drift status.
                                           Default: 0.5.
        :param drift_detected_threshold:   Threshold for the calculated result to be in a drift detected status.
                                           Default: 0.7.

        :returns: A tuple of:
                  [0] = Drift status enum based on the thresholds given.
                  [1] = The drift result (float) based on the mean of the Total Variance Distance and the Hellinger
                        distance. It is 0.0 when one of the two means is not available.

        :raises KeyError: If the mean of the Total Variance Distance is not part of the dictionary.
        """
        # Calculate the mean drift result:
        tvd_mean = metrics_results_dictionary[f"{TotalVarianceDistance.NAME}_mean"]
        hellinger_mean = metrics_results_dictionary.get(
            f"{HellingerDistance.NAME}_mean"
        )
        drift_result = 0.0
        # Compare against None so a mean of exactly 0.0 is not treated as a missing value:
        if tvd_mean is not None and hellinger_mean is not None:
            drift_result = (tvd_mean + hellinger_mean) / 2

        # Decision rule for drift detection:
        drift_status = VirtualDrift._get_drift_status(
            drift_result=drift_result,
            possible_drift_threshold=possible_drift_threshold,
            drift_detected_threshold=drift_detected_threshold,
        )

        return drift_status, drift_result

    @staticmethod
    def _get_drift_status(
        drift_result: float,
        possible_drift_threshold: float,
        drift_detected_threshold: float,
    ) -> mlrun.common.schemas.model_monitoring.DriftStatus:
        """
        Get the drift status according to the result and thresholds given.

        :param drift_result:             The drift result.
        :param possible_drift_threshold: Threshold for the calculated result to be in a possible drift status.
        :param drift_detected_threshold: Threshold for the calculated result to be in a drift detected status.

        :returns: The figured drift status.
        """
        # The detected threshold is checked first, so it wins when the result passes both thresholds
        if drift_result >= drift_detected_threshold:
            return mlrun.common.schemas.model_monitoring.DriftStatus.DRIFT_DETECTED
        if drift_result >= possible_drift_threshold:
            return mlrun.common.schemas.model_monitoring.DriftStatus.POSSIBLE_DRIFT
        return mlrun.common.schemas.model_monitoring.DriftStatus.NO_DRIFT
354
-
355
-
356
def calculate_inputs_statistics(
    sample_set_statistics: dict, inputs: pd.DataFrame
) -> dict:
    """
    Build the statistics of the given inputs for the purpose of drift monitoring.

    :param sample_set_statistics: Statistics of the sample set (the reference dataset stored for the end point).
                                  The bin edges of each feature's histogram are reused when the histograms of the
                                  inputs are calculated.
    :param inputs:                The inputs whose statistics are calculated, to be compared later on against the
                                  sample set.

    :returns: The statistics of the inputs data.
    """
    infer = mlrun.data_types.infer
    helpers = mlrun.common.model_monitoring.helpers

    # Let `DFDataInfer` produce the statistics (histograms included) of the inputs
    statistics = infer.DFDataInfer.get_stats(
        df=inputs,
        options=infer.InferOptions.Histogram,
    )

    for name, feature_statistics in statistics.items():
        if name in sample_set_statistics:
            # The feature is part of the sample set - rebuild its histogram over the sample set's bin edges
            values = inputs[name].to_numpy()
            reference_edges = sample_set_statistics[name]["hist"][1]
            counts, edges = np.histogram(values, bins=reference_edges)
            feature_statistics["hist"] = [counts.tolist(), edges.tolist()]
            continue

        if "hist" in feature_statistics:
            # Comply with the other common features' histogram length
            # NOTE(review): the return value is discarded, so `pad_hist` presumably pads the wrapped list in
            # place - confirm against the helper.
            helpers.pad_hist(helpers.Histogram(feature_statistics["hist"]))

    return statistics
396
-
397
-
398
- class BatchProcessor:
399
- """
400
- The main object to handle the batch processing job. This object is used to get the required configurations and
401
- to manage the main monitoring drift detection process based on the current batch.
402
- Note that the BatchProcessor object requires access keys along with valid project configurations.
403
- """
404
-
405
- def __init__(
406
- self,
407
- context: mlrun.run.MLClientCtx,
408
- project: str,
409
- ):
410
- """
411
- Initialize Batch Processor object.
412
-
413
- :param context: An MLRun context.
414
- :param project: Project name.
415
- """
416
- self.context = context
417
- self.project = project
418
-
419
- # Initialize virtual drift object
420
- self.virtual_drift = VirtualDrift(inf_capping=10)
421
-
422
- logger.info(
423
- "Initializing BatchProcessor",
424
- project=project,
425
- )
426
-
427
- # Get drift thresholds from the model monitoring configuration
428
- # fmt: off
429
- self.default_possible_drift_threshold = (
430
- mlrun.mlconf.model_endpoint_monitoring.drift_thresholds.default.possible_drift
431
- )
432
- self.default_drift_detected_threshold = (
433
- mlrun.mlconf.model_endpoint_monitoring.drift_thresholds.default.drift_detected
434
- )
435
- # fmt: on
436
-
437
- # Get a runtime database
438
-
439
- self.db = mlrun.model_monitoring.get_model_endpoint_store(project=project)
440
-
441
- if not mlrun.mlconf.is_ce_mode():
442
- # TODO: Once there is a time series DB alternative in a non-CE deployment, we need to update this if
443
- # statement to be applied only for V3IO TSDB
444
- self._initialize_v3io_configurations()
445
-
446
- # If an error occurs, it will be raised using the following argument
447
- self.exception = None
448
-
449
- # Get the batch interval range
450
- self.batch_dict = context.parameters[
451
- mlrun.common.schemas.model_monitoring.EventFieldType.BATCH_INTERVALS_DICT
452
- ]
453
-
454
- # TODO: This will be removed in 1.5.0 once the job params can be parsed with different types
455
- # Convert batch dict string into a dictionary
456
- if isinstance(self.batch_dict, str):
457
- self._parse_batch_dict_str()
458
-
459
- # If provided, only model endpoints in that that list will be analyzed
460
- self.model_endpoints = context.parameters.get(
461
- mlrun.common.schemas.model_monitoring.EventFieldType.MODEL_ENDPOINTS, None
462
- )
463
-
464
- def _initialize_v3io_configurations(self):
465
- self.v3io_access_key = os.environ.get("V3IO_ACCESS_KEY")
466
- self.model_monitoring_access_key = (
467
- os.environ.get("MODEL_MONITORING_ACCESS_KEY") or self.v3io_access_key
468
- )
469
-
470
- # Define the required paths for the project objects
471
- tsdb_path = mlrun.mlconf.get_model_monitoring_file_target_path(
472
- project=self.project,
473
- kind=mlrun.common.schemas.model_monitoring.FileTargetKind.EVENTS,
474
- )
475
- (
476
- _,
477
- self.tsdb_container,
478
- self.tsdb_path,
479
- ) = mlrun.common.model_monitoring.helpers.parse_model_endpoint_store_prefix(
480
- tsdb_path
481
- )
482
- # stream_path = template.format(project=self.project, kind="log_stream")
483
- stream_path = mlrun.mlconf.get_model_monitoring_file_target_path(
484
- project=self.project,
485
- kind=mlrun.common.schemas.model_monitoring.FileTargetKind.LOG_STREAM,
486
- )
487
- (
488
- _,
489
- self.stream_container,
490
- self.stream_path,
491
- ) = mlrun.common.model_monitoring.helpers.parse_model_endpoint_store_prefix(
492
- stream_path
493
- )
494
-
495
- # Get the frames clients based on the v3io configuration
496
- # it will be used later for writing the results into the tsdb
497
- self.v3io = mlrun.utils.v3io_clients.get_v3io_client(
498
- access_key=self.v3io_access_key
499
- )
500
- self.frames = mlrun.utils.v3io_clients.get_frames_client(
501
- address=mlrun.mlconf.v3io_framesd,
502
- container=self.tsdb_container,
503
- token=self.v3io_access_key,
504
- )
505
- logger.info(
506
- "Creating table in TSDB if it does not already exist", table=self.tsdb_path
507
- )
508
- self.frames.create(
509
- backend="tsdb",
510
- table=self.tsdb_path,
511
- if_exists=IGNORE,
512
- rate="1/s",
513
- )
514
-
515
- def post_init(self):
516
- """
517
- Preprocess of the batch processing.
518
- """
519
-
520
- if not mlrun.mlconf.is_ce_mode():
521
- # Create v3io stream based on the input stream
522
- response = self.v3io.stream.create(
523
- container=self.stream_container,
524
- stream_path=self.stream_path,
525
- shard_count=1,
526
- raise_for_status=v3io.dataplane.RaiseForStatus.never,
527
- access_key=self.v3io_access_key,
528
- )
529
-
530
- if not (
531
- response.status_code == 400 and "ResourceInUse" in str(response.body)
532
- ):
533
- response.raise_for_status([409, 204, 403])
534
- pass
535
-
536
- def run(self):
537
- """
538
- Main method for manage the drift analysis and write the results into tsdb and KV table.
539
- """
540
- # Get model endpoints (each deployed project has at least 1 serving model):
541
-
542
- try:
543
- endpoints = self.db.list_model_endpoints(uids=self.model_endpoints)
544
-
545
- except Exception as e:
546
- logger.error("Failed to list endpoints", exc=e)
547
- return
548
-
549
- for endpoint in endpoints:
550
- if (
551
- endpoint[mlrun.common.schemas.model_monitoring.EventFieldType.ACTIVE]
552
- and endpoint[
553
- mlrun.common.schemas.model_monitoring.EventFieldType.MONITORING_MODE
554
- ]
555
- == mlrun.common.schemas.model_monitoring.ModelMonitoringMode.enabled.value
556
- ):
557
- # Skip router endpoint:
558
- if (
559
- int(
560
- endpoint[
561
- mlrun.common.schemas.model_monitoring.EventFieldType.ENDPOINT_TYPE
562
- ]
563
- )
564
- == mlrun.common.schemas.model_monitoring.EndpointType.ROUTER
565
- ):
566
- # Router endpoint has no feature stats
567
- logger.info(
568
- f"{endpoint[mlrun.common.schemas.model_monitoring.EventFieldType.UID]} is router skipping"
569
- )
570
- continue
571
- self.update_drift_metrics(endpoint=endpoint)
572
-
573
- def update_drift_metrics(self, endpoint: dict):
574
- try:
575
- m_fs = fstore.get_feature_set(
576
- endpoint[
577
- mlrun.common.schemas.model_monitoring.EventFieldType.FEATURE_SET_URI
578
- ]
579
- )
580
-
581
- # Getting batch interval start time and end time
582
- start_time, end_time = self._get_interval_range()
583
-
584
- try:
585
- df = m_fs.to_dataframe(
586
- start_time=start_time,
587
- end_time=end_time,
588
- time_column=mlrun.common.schemas.model_monitoring.EventFieldType.TIMESTAMP,
589
- )
590
-
591
- if len(df) == 0:
592
- logger.warn(
593
- "Not enough model events since the beginning of the batch interval",
594
- parquet_target=m_fs.status.targets[0].path,
595
- endpoint=endpoint[
596
- mlrun.common.schemas.model_monitoring.EventFieldType.UID
597
- ],
598
- min_rqeuired_events=mlrun.mlconf.model_endpoint_monitoring.parquet_batching_max_events,
599
- start_time=str(
600
- datetime.datetime.now() - datetime.timedelta(hours=1)
601
- ),
602
- end_time=str(datetime.datetime.now()),
603
- )
604
- return
605
-
606
- # TODO: The below warn will be removed once the state of the Feature Store target is updated
607
- # as expected. In that case, the existence of the file will be checked before trying to get
608
- # the offline data from the feature set.
609
- # Continue if not enough events provided since the deployment of the model endpoint
610
- except FileNotFoundError:
611
- logger.warn(
612
- "Parquet not found, probably due to not enough model events",
613
- parquet_target=m_fs.status.targets[0].path,
614
- endpoint=endpoint[
615
- mlrun.common.schemas.model_monitoring.EventFieldType.UID
616
- ],
617
- min_rqeuired_events=mlrun.mlconf.model_endpoint_monitoring.parquet_batching_max_events,
618
- )
619
- return
620
-
621
- # Get feature names from monitoring feature set
622
- feature_names = [
623
- feature_name["name"] for feature_name in m_fs.spec.features.to_dict()
624
- ]
625
-
626
- # Create DataFrame based on the input features
627
- stats_columns = [
628
- mlrun.common.schemas.model_monitoring.EventFieldType.TIMESTAMP,
629
- *feature_names,
630
- ]
631
-
632
- # Add label names if provided
633
- if endpoint[
634
- mlrun.common.schemas.model_monitoring.EventFieldType.LABEL_NAMES
635
- ]:
636
- labels = endpoint[
637
- mlrun.common.schemas.model_monitoring.EventFieldType.LABEL_NAMES
638
- ]
639
- if isinstance(labels, str):
640
- labels = json.loads(labels)
641
- for label in labels:
642
- if label not in stats_columns:
643
- stats_columns.append(label)
644
- named_features_df = df[stats_columns].copy()
645
-
646
- # Infer feature set stats and schema
647
- fstore.api._infer_from_static_df(
648
- named_features_df,
649
- m_fs,
650
- options=mlrun.data_types.infer.InferOptions.all_stats(),
651
- )
652
-
653
- # Save feature set to apply changes
654
- m_fs.save()
655
-
656
- # Get the timestamp of the latest request:
657
- timestamp = df[
658
- mlrun.common.schemas.model_monitoring.EventFieldType.TIMESTAMP
659
- ].iloc[-1]
660
-
661
- # Get the feature stats from the model endpoint for reference data
662
- feature_stats = json.loads(
663
- endpoint[
664
- mlrun.common.schemas.model_monitoring.EventFieldType.FEATURE_STATS
665
- ]
666
- )
667
- # Pad the original feature stats to accommodate current data out
668
- # of the original range (unless already padded)
669
- mlrun.common.model_monitoring.helpers.pad_features_hist(
670
- mlrun.common.model_monitoring.helpers.FeatureStats(feature_stats)
671
- )
672
-
673
- # Get the current stats:
674
- current_stats = calculate_inputs_statistics(
675
- sample_set_statistics=feature_stats,
676
- inputs=named_features_df,
677
- )
678
-
679
- # Compute the drift based on the histogram of the current stats and the histogram of the original
680
- # feature stats that can be found in the model endpoint object:
681
- drift_result = self.virtual_drift.compute_drift_from_histograms(
682
- feature_stats=feature_stats,
683
- current_stats=current_stats,
684
- )
685
- logger.info("Drift result", drift_result=drift_result)
686
-
687
- # Get drift thresholds from the model configuration:
688
- monitor_configuration = (
689
- json.loads(
690
- endpoint[
691
- mlrun.common.schemas.model_monitoring.EventFieldType.MONITOR_CONFIGURATION
692
- ]
693
- )
694
- or {}
695
- )
696
-
697
- # For backwards compatibility first check if the old drift thresholds
698
- # (both `possible drift and `drift_detected`) keys exist in the monitor configuration dict
699
- # TODO: Remove the first get in 1.7.0
700
- possible_drift = monitor_configuration.get(
701
- "possible_drift",
702
- monitor_configuration.get(
703
- mlrun.common.schemas.model_monitoring.EventFieldType.POSSIBLE_DRIFT_THRESHOLD,
704
- self.default_possible_drift_threshold,
705
- ),
706
- )
707
-
708
- drift_detected = monitor_configuration.get(
709
- "drift_detected",
710
- monitor_configuration.get(
711
- mlrun.common.schemas.model_monitoring.EventFieldType.DRIFT_DETECTED_THRESHOLD,
712
- self.default_drift_detected_threshold,
713
- ),
714
- )
715
-
716
- # Check for possible drift based on the results of the statistical metrics defined above:
717
- drift_status, drift_measure = self.virtual_drift.check_for_drift(
718
- metrics_results_dictionary=drift_result,
719
- possible_drift_threshold=possible_drift,
720
- drift_detected_threshold=drift_detected,
721
- )
722
- logger.info(
723
- "Drift status",
724
- endpoint_id=endpoint[
725
- mlrun.common.schemas.model_monitoring.EventFieldType.UID
726
- ],
727
- drift_status=drift_status.value,
728
- drift_measure=drift_measure,
729
- )
730
-
731
- attributes = {
732
- "current_stats": json.dumps(current_stats),
733
- "drift_measures": json.dumps(drift_result),
734
- "drift_status": drift_status.value,
735
- }
736
-
737
- self.db.update_model_endpoint(
738
- endpoint_id=endpoint[
739
- mlrun.common.schemas.model_monitoring.EventFieldType.UID
740
- ],
741
- attributes=attributes,
742
- )
743
-
744
- if not mlrun.mlconf.is_ce_mode():
745
- # Generate V3IO KV schema if not exist
746
- self._infer_kv_schema()
747
-
748
- # Update drift results in TSDB
749
- self._update_drift_in_v3io_tsdb(
750
- endpoint_id=endpoint[
751
- mlrun.common.schemas.model_monitoring.EventFieldType.UID
752
- ],
753
- drift_status=drift_status,
754
- drift_measure=drift_measure,
755
- drift_result=drift_result,
756
- timestamp=timestamp,
757
- )
758
-
759
- else:
760
- # Update drift results in Prometheus
761
- self._update_drift_in_prometheus(
762
- endpoint_id=endpoint[
763
- mlrun.common.schemas.model_monitoring.EventFieldType.UID
764
- ],
765
- drift_status=drift_status,
766
- drift_result=drift_result,
767
- )
768
-
769
- except Exception as e:
770
- logger.error(
771
- f"Exception for endpoint {endpoint[mlrun.common.schemas.model_monitoring.EventFieldType.UID]}"
772
- )
773
- self.exception = e
774
- logger.info(
775
- "Done updating drift measures",
776
- endpoint_id=endpoint[
777
- mlrun.common.schemas.model_monitoring.EventFieldType.UID
778
- ],
779
- )
780
-
781
- def _get_interval_range(self) -> tuple[datetime.datetime, datetime.datetime]:
782
- """Getting batch interval time range"""
783
- minutes, hours, days = (
784
- self.batch_dict[
785
- mlrun.common.schemas.model_monitoring.EventFieldType.MINUTES
786
- ],
787
- self.batch_dict[mlrun.common.schemas.model_monitoring.EventFieldType.HOURS],
788
- self.batch_dict[mlrun.common.schemas.model_monitoring.EventFieldType.DAYS],
789
- )
790
- start_time = datetime.datetime.now() - datetime.timedelta(
791
- minutes=minutes, hours=hours, days=days
792
- )
793
- end_time = datetime.datetime.now()
794
- return start_time, end_time
795
-
796
- def _parse_batch_dict_str(self):
797
- """Convert batch dictionary string into a valid dictionary"""
798
- characters_to_remove = "{} "
799
- pattern = "[" + characters_to_remove + "]"
800
- # Remove unnecessary characters from the provided string
801
- batch_list = re.sub(pattern, "", self.batch_dict).split(",")
802
- # Initialize the dictionary of batch interval ranges
803
- self.batch_dict = {}
804
- for pair in batch_list:
805
- pair_list = pair.split(":")
806
- self.batch_dict[pair_list[0]] = float(pair_list[1])
807
-
808
- def _update_drift_in_v3io_tsdb(
809
- self,
810
- endpoint_id: str,
811
- drift_status: mlrun.common.schemas.model_monitoring.DriftStatus,
812
- drift_measure: float,
813
- drift_result: dict[str, dict[str, Any]],
814
- timestamp: pd.Timestamp,
815
- ):
816
- """Update drift results in input stream.
817
-
818
- :param endpoint_id: The unique id of the model endpoint.
819
- :param drift_status: Drift status result. Possible values can be found under DriftStatus enum class.
820
- :param drift_measure: The drift result (float) based on the mean of the Total Variance Distance and the
821
- Hellinger distance.
822
- :param drift_result: A dictionary that includes the drift results for each feature.
823
- :param timestamp: Pandas Timestamp value.
824
-
825
- """
826
-
827
- if (
828
- drift_status
829
- == mlrun.common.schemas.model_monitoring.DriftStatus.POSSIBLE_DRIFT
830
- or drift_status
831
- == mlrun.common.schemas.model_monitoring.DriftStatus.DRIFT_DETECTED
832
- ):
833
- self.v3io.stream.put_records(
834
- container=self.stream_container,
835
- stream_path=self.stream_path,
836
- records=[
837
- {
838
- "data": json.dumps(
839
- {
840
- "endpoint_id": endpoint_id,
841
- "drift_status": drift_status.value,
842
- "drift_measure": drift_measure,
843
- "drift_per_feature": {**drift_result},
844
- }
845
- )
846
- }
847
- ],
848
- )
849
-
850
- # Update the results in tsdb:
851
- tsdb_drift_measures = {
852
- "endpoint_id": endpoint_id,
853
- "timestamp": timestamp,
854
- "record_type": "drift_measures",
855
- "tvd_mean": drift_result["tvd_mean"],
856
- "kld_mean": drift_result["kld_mean"],
857
- "hellinger_mean": drift_result["hellinger_mean"],
858
- }
859
-
860
- try:
861
- self.frames.write(
862
- backend="tsdb",
863
- table=self.tsdb_path,
864
- dfs=pd.DataFrame.from_records([tsdb_drift_measures]),
865
- index_cols=["timestamp", "endpoint_id", "record_type"],
866
- )
867
- except v3io_frames.errors.Error as err:
868
- logger.warn(
869
- "Could not write drift measures to TSDB",
870
- err=err,
871
- tsdb_path=self.tsdb_path,
872
- endpoint=endpoint_id,
873
- )
874
-
875
- def _update_drift_in_prometheus(
876
- self,
877
- endpoint_id: str,
878
- drift_status: mlrun.common.schemas.model_monitoring.DriftStatus,
879
- drift_result: dict[str, dict[str, Any]],
880
- ):
881
- """Push drift metrics to Prometheus registry. Please note that the metrics are being pushed through HTTP
882
- to the monitoring stream pod that writes them into a local registry. Afterwards, Prometheus wil scrape these
883
- metrics that will be available in the Grafana charts.
884
-
885
- :param endpoint_id: The unique id of the model endpoint.
886
- :param drift_status: Drift status result. Possible values can be found under DriftStatus enum class.
887
- :param drift_result: A dictionary that includes the drift results for each feature.
888
-
889
-
890
- """
891
- stream_http_path = (
892
- mlrun.mlconf.model_endpoint_monitoring.default_http_sink.format(
893
- project=self.project, namespace=mlrun.mlconf.namespace
894
- )
895
- )
896
-
897
- http_session = mlrun.utils.HTTPSessionWithRetry(
898
- retry_on_post=True,
899
- verbose=True,
900
- max_retries=1,
901
- )
902
- try:
903
- # Model monitoring stream http health check
904
- http_session.request("GET", url=stream_http_path)
905
-
906
- # Update statistical metrics
907
- statistical_metrics = ["hellinger_mean", "tvd_mean", "kld_mean"]
908
- metrics = []
909
- for metric in statistical_metrics:
910
- metrics.append(
911
- {
912
- mlrun.common.schemas.model_monitoring.EventFieldType.ENDPOINT_ID: endpoint_id,
913
- mlrun.common.schemas.model_monitoring.EventFieldType.METRIC: metric,
914
- mlrun.common.schemas.model_monitoring.EventFieldType.VALUE: drift_result[
915
- metric
916
- ],
917
- }
918
- )
919
-
920
- http_session.request(
921
- method="POST",
922
- url=stream_http_path + "/monitoring-batch-metrics",
923
- data=json.dumps(metrics),
924
- )
925
-
926
- # Update drift status
927
- drift_status_dict = {
928
- mlrun.common.schemas.model_monitoring.EventFieldType.ENDPOINT_ID: endpoint_id,
929
- mlrun.common.schemas.model_monitoring.EventFieldType.DRIFT_STATUS: drift_status.value,
930
- }
931
-
932
- http_session.request(
933
- method="POST",
934
- url=stream_http_path + "/monitoring-drift-status",
935
- data=json.dumps(drift_status_dict),
936
- )
937
-
938
- except requests.exceptions.ConnectionError as exc:
939
- logger.warning(
940
- "Can't push metrics to Prometheus registry. "
941
- "Monitoring stream pod is not found, probably not deployed. "
942
- "To deploy, call set_tracking() on a serving function. exc: ",
943
- exc=exc,
944
- )
945
-
946
- def _infer_kv_schema(self):
947
- """
948
- Create KV schema file if not exist. This schema is being used by the Grafana dashboards.
949
- """
950
-
951
- schema_file = self.db.client.kv.new_cursor(
952
- container=self.db.container,
953
- table_path=self.db.path,
954
- filter_expression='__name==".#schema"',
955
- )
956
-
957
- if not schema_file.all():
958
- logger.info(
959
- "Generate a new V3IO KV schema file", kv_table_path=self.db.path
960
- )
961
- self.frames.execute(
962
- backend="kv", table=self.db.path, command="infer_schema"
963
- )
964
-
965
-
966
def handler(context: mlrun.run.MLClientCtx):
    """Job handler that runs the drift batch analysis for the project of the context.

    Builds a ``BatchProcessor`` for ``context.project``, runs its ``post_init()``
    and ``run()`` steps, and raises the exception kept on the processor, if any.

    :param context: The MLRun client context of the running job.
    """
    processor = BatchProcessor(context=context, project=context.project)
    processor.post_init()
    processor.run()

    # Failures are collected on the processor during the run; surface them here
    failure = processor.exception
    if failure:
        raise failure