mlrun 1.7.0rc6__py3-none-any.whl → 1.7.0rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mlrun might be problematic. Click here for more details.

Files changed (84)
  1. mlrun/__main__.py +2 -0
  2. mlrun/common/constants.py +6 -0
  3. mlrun/common/schemas/__init__.py +5 -0
  4. mlrun/common/schemas/api_gateway.py +8 -1
  5. mlrun/common/schemas/hub.py +7 -9
  6. mlrun/common/schemas/model_monitoring/__init__.py +4 -0
  7. mlrun/common/schemas/model_monitoring/constants.py +36 -19
  8. mlrun/{model_monitoring/stores/models/__init__.py → common/schemas/pagination.py} +9 -10
  9. mlrun/common/schemas/project.py +16 -10
  10. mlrun/common/types.py +7 -1
  11. mlrun/config.py +35 -10
  12. mlrun/data_types/data_types.py +4 -0
  13. mlrun/datastore/__init__.py +3 -7
  14. mlrun/datastore/alibaba_oss.py +130 -0
  15. mlrun/datastore/azure_blob.py +4 -5
  16. mlrun/datastore/base.py +22 -16
  17. mlrun/datastore/datastore.py +4 -0
  18. mlrun/datastore/datastore_profile.py +19 -1
  19. mlrun/datastore/google_cloud_storage.py +1 -1
  20. mlrun/datastore/snowflake_utils.py +43 -0
  21. mlrun/datastore/sources.py +11 -29
  22. mlrun/datastore/targets.py +131 -11
  23. mlrun/datastore/utils.py +10 -5
  24. mlrun/db/base.py +58 -6
  25. mlrun/db/httpdb.py +183 -77
  26. mlrun/db/nopdb.py +110 -0
  27. mlrun/feature_store/api.py +3 -2
  28. mlrun/feature_store/retrieval/spark_merger.py +27 -23
  29. mlrun/frameworks/tf_keras/callbacks/logging_callback.py +1 -1
  30. mlrun/frameworks/tf_keras/mlrun_interface.py +2 -2
  31. mlrun/kfpops.py +2 -5
  32. mlrun/launcher/base.py +1 -1
  33. mlrun/launcher/client.py +2 -2
  34. mlrun/model.py +1 -0
  35. mlrun/model_monitoring/__init__.py +1 -1
  36. mlrun/model_monitoring/api.py +104 -295
  37. mlrun/model_monitoring/controller.py +25 -25
  38. mlrun/model_monitoring/db/__init__.py +16 -0
  39. mlrun/model_monitoring/{stores → db/stores}/__init__.py +43 -34
  40. mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
  41. mlrun/model_monitoring/{stores/model_endpoint_store.py → db/stores/base/store.py} +47 -6
  42. mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
  43. mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +49 -0
  44. mlrun/model_monitoring/{stores → db/stores/sqldb}/models/base.py +76 -3
  45. mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +68 -0
  46. mlrun/model_monitoring/{stores → db/stores/sqldb}/models/sqlite.py +13 -1
  47. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +662 -0
  48. mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
  49. mlrun/model_monitoring/{stores/kv_model_endpoint_store.py → db/stores/v3io_kv/kv_store.py} +134 -3
  50. mlrun/model_monitoring/helpers.py +3 -3
  51. mlrun/model_monitoring/stream_processing.py +41 -9
  52. mlrun/model_monitoring/tracking_policy.py +7 -1
  53. mlrun/model_monitoring/writer.py +4 -36
  54. mlrun/projects/pipelines.py +14 -2
  55. mlrun/projects/project.py +141 -122
  56. mlrun/run.py +8 -2
  57. mlrun/runtimes/__init__.py +16 -0
  58. mlrun/runtimes/base.py +10 -1
  59. mlrun/runtimes/kubejob.py +26 -121
  60. mlrun/runtimes/nuclio/api_gateway.py +243 -66
  61. mlrun/runtimes/nuclio/application/application.py +79 -1
  62. mlrun/runtimes/nuclio/application/reverse_proxy.go +9 -1
  63. mlrun/runtimes/nuclio/function.py +14 -8
  64. mlrun/runtimes/nuclio/serving.py +30 -34
  65. mlrun/runtimes/pod.py +171 -0
  66. mlrun/runtimes/utils.py +0 -28
  67. mlrun/serving/remote.py +2 -3
  68. mlrun/serving/routers.py +4 -3
  69. mlrun/serving/server.py +5 -7
  70. mlrun/serving/states.py +40 -23
  71. mlrun/serving/v2_serving.py +4 -3
  72. mlrun/utils/helpers.py +34 -0
  73. mlrun/utils/http.py +1 -1
  74. mlrun/utils/retryer.py +1 -0
  75. mlrun/utils/version/version.json +2 -2
  76. {mlrun-1.7.0rc6.dist-info → mlrun-1.7.0rc9.dist-info}/METADATA +25 -16
  77. {mlrun-1.7.0rc6.dist-info → mlrun-1.7.0rc9.dist-info}/RECORD +81 -75
  78. mlrun/model_monitoring/batch.py +0 -933
  79. mlrun/model_monitoring/stores/models/mysql.py +0 -34
  80. mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -382
  81. {mlrun-1.7.0rc6.dist-info → mlrun-1.7.0rc9.dist-info}/LICENSE +0 -0
  82. {mlrun-1.7.0rc6.dist-info → mlrun-1.7.0rc9.dist-info}/WHEEL +0 -0
  83. {mlrun-1.7.0rc6.dist-info → mlrun-1.7.0rc9.dist-info}/entry_points.txt +0 -0
  84. {mlrun-1.7.0rc6.dist-info → mlrun-1.7.0rc9.dist-info}/top_level.txt +0 -0
@@ -1,933 +0,0 @@
1
- # Copyright 2023 Iguazio
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
-
15
- import collections
16
- import datetime
17
- import json
18
- import os
19
- import re
20
- from typing import Any, Optional, Union
21
-
22
- import numpy as np
23
- import pandas as pd
24
- import requests
25
- import v3io
26
- import v3io.dataplane
27
- import v3io_frames
28
- from v3io_frames.frames_pb2 import IGNORE
29
-
30
- import mlrun.common.helpers
31
- import mlrun.common.model_monitoring.helpers
32
- import mlrun.common.schemas.model_monitoring
33
- import mlrun.data_types.infer
34
- import mlrun.feature_store as fstore
35
- import mlrun.utils.v3io_clients
36
- from mlrun.model_monitoring.helpers import calculate_inputs_statistics
37
- from mlrun.model_monitoring.metrics.histogram_distance import (
38
- HellingerDistance,
39
- HistogramDistanceMetric,
40
- KullbackLeiblerDivergence,
41
- TotalVarianceDistance,
42
- )
43
- from mlrun.utils import logger
44
-
45
# Type alias for a drift result: a tuple of the drift status (a `DriftStatus` enum member)
# and the drift mean (float).
DriftResultType = tuple[mlrun.common.schemas.model_monitoring.DriftStatus, float]
47
-
48
-
49
class VirtualDrift:
    """
    Virtual Drift object is used for handling the drift calculations.
    It contains the metrics objects and the related methods for the detection of potential drift.
    """

    def __init__(
        self,
        prediction_col: Optional[str] = None,
        label_col: Optional[str] = None,
        feature_weights: Optional[list[float]] = None,
        inf_capping: Optional[float] = 10,
    ):
        """
        Initialize a Virtual Drift object.

        :param prediction_col:  The name of the dataframe column which represents the predictions of the model. If
                                provided, it will be used for calculating drift over the predictions.
        :param label_col:       The name of the dataframe column which represents the labels of the model. If
                                provided, it will be used for calculating drift over the labels.
        :param feature_weights: Weights that can be applied to the features and to be considered during the drift
                                analysis.
        :param inf_capping:     A bounded value for the results of the statistical metric. For example, when calculating
                                KL divergence and getting infinite distance between the two distributions, the result
                                will be replaced with the capping value.
        """
        self.prediction_col = prediction_col
        self.label_col = label_col
        self.feature_weights = feature_weights
        # Stored for later use; the methods of this class do not read it directly.
        self.capping = inf_capping

        # Map each metric name to its metric class (instantiated per feature when computing)
        self.metrics: dict[str, type[HistogramDistanceMetric]] = {
            metric_class.NAME: metric_class
            for metric_class in (
                TotalVarianceDistance,
                HellingerDistance,
                KullbackLeiblerDivergence,
            )
        }

    @staticmethod
    def dict_to_histogram(histogram_dict: dict[str, dict[str, Any]]) -> pd.DataFrame:
        """
        Convert histogram dictionary to pandas DataFrame with feature histograms as columns

        :param histogram_dict: Histogram dictionary. Only entries that contain a "hist" key are converted.

        :returns: Histogram dataframe
        """
        # Normalize the bin counts (first element of "hist") to a probability distribution of each feature
        histograms = {
            feature: np.array(stats["hist"][0]) / stats["count"]
            for feature, stats in histogram_dict.items()
            if "hist" in stats
        }
        return pd.DataFrame(histograms)

    def compute_metrics_over_df(
        self,
        base_histogram: pd.DataFrame,
        latest_histogram: pd.DataFrame,
    ) -> dict[str, dict[str, Any]]:
        """
        Calculate metrics values for each feature.

        For example:
        {tvd: {feature_1: 0.001, feature_2: 0.2, ...}}

        :param base_histogram:   Histogram dataframe that represents the distribution of the features from the original
                                 training set.
        :param latest_histogram: Histogram dataframe that represents the distribution of the features from the latest
                                 input batch.

        :returns: A dictionary in which for each metric (key) we assign the values for each feature.
        """
        # Compute the different metrics for each feature distribution and store the results in a dictionary.
        # Iterating a DataFrame yields its column names, i.e. the feature names.
        return {
            metric_name: {
                feature: metric(
                    base_histogram.loc[:, feature], latest_histogram.loc[:, feature]
                ).compute()
                for feature in base_histogram
            }
            for metric_name, metric in self.metrics.items()
        }

    def compute_drift_from_histograms(
        self,
        feature_stats: dict[str, dict[str, Any]],
        current_stats: dict[str, dict[str, Any]],
    ) -> dict[str, dict[str, Any]]:
        """
        Compare the distributions of both the original features data and the latest input data

        :param feature_stats: Histogram dictionary of the original feature dataset that was used in the model training.
        :param current_stats: Histogram dictionary of the recent input data

        :returns: A dictionary that includes the drift results for each feature, plus `<metric>_sum` and
                  `<metric>_mean` entries (and `<metric>_weighted_mean` when feature weights were provided).

        :raises ValueError: If the two histogram dictionaries have no feature in common.
        """
        # Convert histogram dictionaries to DataFrames with a feature histogram per column
        base_histogram = self.dict_to_histogram(feature_stats)
        latest_histogram = self.dict_to_histogram(current_stats)

        # Verify that the datasets share features
        base_features = set(base_histogram.columns)
        latest_features = set(latest_histogram.columns)
        features_common = list(base_features.intersection(latest_features))
        feature_difference = list(base_features ^ latest_features)
        if not features_common:
            raise ValueError(
                f"No common features found: {base_features} <> {latest_features}"
            )

        # Drop the columns of features that do not exist in both datasets
        base_histogram = base_histogram.drop(
            feature_difference, axis=1, errors="ignore"
        )
        latest_histogram = latest_histogram.drop(
            feature_difference, axis=1, errors="ignore"
        )

        # Compute the statistical metrics per feature
        features_drift_measures = self.compute_metrics_over_df(
            base_histogram.loc[:, features_common],
            latest_histogram.loc[:, features_common],
        )

        # Compute the total values for each metric
        for metric_name in self.metrics:
            # Take the per-feature values before the totals are added to the same dictionary
            feature_values = list(features_drift_measures[metric_name].values())
            features_drift_measures[metric_name]["total_sum"] = np.sum(feature_values)
            features_drift_measures[metric_name]["total_mean"] = np.mean(feature_values)

            # Add the weighted value by the given feature weights if provided
            # NOTE(review): the values follow the order of `features_common`, which is derived from a set -
            # confirm that the weights are expected to align with that order.
            if self.feature_weights:
                features_drift_measures[metric_name]["total_weighted_mean"] = np.dot(
                    feature_values, self.feature_weights
                )

        # Drift result dictionary: a per-feature dictionary of metrics, plus the totals of each metric
        drift_result = collections.defaultdict(dict)
        for feature in features_common:
            for metric, values in features_drift_measures.items():
                drift_result[feature][metric] = values[feature]
                drift_result[f"{metric}_sum"] = values["total_sum"]
                drift_result[f"{metric}_mean"] = values["total_mean"]
                if self.feature_weights:
                    drift_result[f"{metric}_weighted_mean"] = values[
                        "total_weighted_mean"
                    ]

        # Compute the drift metrics over the labels and then over the predictions (when configured).
        # A list selector keeps a single-column DataFrame, which is what `compute_metrics_over_df` expects,
        # and the per-metric result is keyed by the column name.
        for column in (self.label_col, self.prediction_col):
            if not column:
                continue
            column_drift_measures = self.compute_metrics_over_df(
                base_histogram.loc[:, [column]],
                latest_histogram.loc[:, [column]],
            )
            for metric, values in column_drift_measures.items():
                drift_result[column][metric] = values[column]

        return drift_result

    @staticmethod
    def check_for_drift_per_feature(
        metrics_results_dictionary: dict[str, Union[float, dict]],
        possible_drift_threshold: float = 0.5,
        drift_detected_threshold: float = 0.7,
    ) -> dict[str, DriftResultType]:
        """
        Check for drift based on the defined decision rule and the calculated results of the statistical metrics per
        feature.

        :param metrics_results_dictionary: Dictionary of statistical metrics results per feature and the total means of
                                           all features.
        :param possible_drift_threshold:   Threshold for the calculated result to be in a possible drift status.
                                           Default: 0.5.
        :param drift_detected_threshold:   Threshold for the calculated result to be in a drift detected status.
                                           Default: 0.7.

        :returns: A dictionary of all the features and their drift status and results tuples, tuple of:
                  [0] = Drift status enum based on the thresholds given.
                  [1] = The drift result (float) based on the mean of the Total Variance Distance and the Hellinger
                        distance.
        """
        drift_results = {}

        for feature, results in metrics_results_dictionary.items():
            # A feature result must be a dictionary, otherwise it's a total value (float):
            if not isinstance(results, dict):
                continue

            tvd = results[TotalVarianceDistance.NAME]
            hellinger = results[HellingerDistance.NAME]
            if tvd is None or hellinger is None:
                logger.warning(
                    "Can't calculate drift for this feature because at least one of the required "
                    "statistical metrics is missing",
                    feature=feature,
                    tvd=tvd,
                    hellinger=hellinger,
                )
                continue

            # The feature's drift is the mean of the two distances:
            feature_drift = (tvd + hellinger) / 2
            drift_status = VirtualDrift._get_drift_status(
                drift_result=feature_drift,
                possible_drift_threshold=possible_drift_threshold,
                drift_detected_threshold=drift_detected_threshold,
            )
            drift_results[feature] = (drift_status, feature_drift)

        return drift_results

    @staticmethod
    def check_for_drift(
        metrics_results_dictionary: dict[str, Union[float, dict]],
        possible_drift_threshold: float = 0.5,
        drift_detected_threshold: float = 0.7,
    ) -> DriftResultType:
        """
        Check for drift based on the defined decision rule and the calculated results of the statistical metrics by the
        mean of all features.

        :param metrics_results_dictionary: Dictionary of statistical metrics results per feature and the total means of
                                           all features.
        :param possible_drift_threshold:   Threshold for the calculated result to be in a possible drift status.
                                           Default: 0.5.
        :param drift_detected_threshold:   Threshold for the calculated result to be in a drift detected status.
                                           Default: 0.7.

        :returns: A tuple of:
                  [0] = Drift status enum based on the thresholds given.
                  [1] = The drift result (float) based on the mean of the Total Variance Distance and the Hellinger
                        distance. It is 0.0 when one of the two means is missing.
        """
        tvd_mean = metrics_results_dictionary[f"{TotalVarianceDistance.NAME}_mean"]
        hellinger_mean = metrics_results_dictionary.get(
            f"{HellingerDistance.NAME}_mean"
        )

        # Explicit `None` checks (as in the per-feature check) so that a legitimate mean of 0 is not
        # mistaken for a missing value:
        drift_result = 0.0
        if tvd_mean is not None and hellinger_mean is not None:
            drift_result = (tvd_mean + hellinger_mean) / 2

        drift_status = VirtualDrift._get_drift_status(
            drift_result=drift_result,
            possible_drift_threshold=possible_drift_threshold,
            drift_detected_threshold=drift_detected_threshold,
        )

        return drift_status, drift_result

    @staticmethod
    def _get_drift_status(
        drift_result: float,
        possible_drift_threshold: float,
        drift_detected_threshold: float,
    ) -> mlrun.common.schemas.model_monitoring.DriftStatus:
        """
        Get the drift status according to the result and thresholds given.

        :param drift_result:             The drift result.
        :param possible_drift_threshold: Threshold for the calculated result to be in a possible drift status.
        :param drift_detected_threshold: Threshold for the calculated result to be in a drift detected status.

        :returns: The figured drift status.
        """
        # The "drift detected" threshold is checked first, so it takes precedence when both are crossed
        if drift_result >= drift_detected_threshold:
            return mlrun.common.schemas.model_monitoring.DriftStatus.DRIFT_DETECTED
        if drift_result >= possible_drift_threshold:
            return mlrun.common.schemas.model_monitoring.DriftStatus.POSSIBLE_DRIFT
        return mlrun.common.schemas.model_monitoring.DriftStatus.NO_DRIFT
355
-
356
-
357
- class BatchProcessor:
358
- """
359
- The main object to handle the batch processing job. This object is used to get the required configurations and
360
- to manage the main monitoring drift detection process based on the current batch.
361
- Note that the BatchProcessor object requires access keys along with valid project configurations.
362
- """
363
-
364
def __init__(
    self,
    context: mlrun.run.MLClientCtx,
    project: str,
):
    """
    Initialize Batch Processor object.

    :param context: An MLRun context. Its parameters must include the batch intervals dictionary and may
                    include a list of model endpoints to analyze.
    :param project: Project name.
    """
    event_fields = mlrun.common.schemas.model_monitoring.EventFieldType

    self.context = context
    self.project = project

    # The drift calculator used for every analyzed endpoint
    self.virtual_drift = VirtualDrift(inf_capping=10)

    logger.info(
        "Initializing BatchProcessor",
        project=project,
    )

    # Default drift thresholds, taken from the model monitoring configuration
    default_thresholds = (
        mlrun.mlconf.model_endpoint_monitoring.drift_thresholds.default
    )
    self.default_possible_drift_threshold = default_thresholds.possible_drift
    self.default_drift_detected_threshold = default_thresholds.drift_detected

    # The model endpoint store of the project
    self.db = mlrun.model_monitoring.get_model_endpoint_store(project=project)

    # TODO: Once there is a time series DB alternative in a non-CE deployment, this condition needs to be
    #       applied only for V3IO TSDB
    if not mlrun.mlconf.is_ce_mode():
        self._initialize_v3io_configurations()

    # Holds the last exception that occurred during the processing (if any)
    self.exception = None

    # The batch interval range
    self.batch_dict = context.parameters[event_fields.BATCH_INTERVALS_DICT]

    # TODO: This will be removed in 1.5.0 once the job params can be parsed with different types
    # A batch dict that was provided as a string is converted into a dictionary
    if isinstance(self.batch_dict, str):
        self._parse_batch_dict_str()

    # If provided, only the model endpoints in that list will be analyzed
    self.model_endpoints = context.parameters.get(event_fields.MODEL_ENDPOINTS, None)
422
-
423
def _initialize_v3io_configurations(self):
    """
    Set up the V3IO related attributes: the access keys, the TSDB and stream containers and paths, and the
    V3IO and frames clients. The TSDB table is created if it does not already exist.
    """
    target_kinds = mlrun.common.schemas.model_monitoring.FileTargetKind
    parse_store_prefix = (
        mlrun.common.model_monitoring.helpers.parse_model_endpoint_store_prefix
    )

    # The model monitoring access key falls back to the V3IO access key
    self.v3io_access_key = os.environ.get("V3IO_ACCESS_KEY")
    self.model_monitoring_access_key = (
        os.environ.get("MODEL_MONITORING_ACCESS_KEY") or self.v3io_access_key
    )

    # Container and path of the project's TSDB target
    events_target = mlrun.mlconf.get_model_monitoring_file_target_path(
        project=self.project,
        kind=target_kinds.EVENTS,
    )
    _, self.tsdb_container, self.tsdb_path = parse_store_prefix(events_target)

    # Container and path of the project's log stream target
    log_stream_target = mlrun.mlconf.get_model_monitoring_file_target_path(
        project=self.project,
        kind=target_kinds.LOG_STREAM,
    )
    _, self.stream_container, self.stream_path = parse_store_prefix(
        log_stream_target
    )

    # The clients are used later for creating the stream and for writing the results into the TSDB
    self.v3io = mlrun.utils.v3io_clients.get_v3io_client(
        access_key=self.v3io_access_key
    )
    self.frames = mlrun.utils.v3io_clients.get_frames_client(
        address=mlrun.mlconf.v3io_framesd,
        container=self.tsdb_container,
        token=self.v3io_access_key,
    )

    logger.info(
        "Creating table in TSDB if it does not already exist", table=self.tsdb_path
    )
    self.frames.create(
        backend="tsdb",
        table=self.tsdb_path,
        if_exists=IGNORE,
        rate="1/s",
    )
473
-
474
def post_init(self):
    """
    Preprocess of the batch processing: outside of CE mode, create the V3IO stream of the project.
    """
    # Nothing to prepare in CE mode
    if mlrun.mlconf.is_ce_mode():
        return

    # Create the V3IO stream; the status is inspected manually below instead of raising here
    response = self.v3io.stream.create(
        container=self.stream_container,
        stream_path=self.stream_path,
        shard_count=1,
        raise_for_status=v3io.dataplane.RaiseForStatus.never,
        access_key=self.v3io_access_key,
    )

    # A 400 response that mentions "ResourceInUse" is tolerated (presumably the stream already
    # exists - verify against the V3IO API); any other response goes through the status check.
    if response.status_code != 400 or "ResourceInUse" not in str(response.body):
        response.raise_for_status([409, 204, 403])
494
-
495
def run(self):
    """
    Main method for managing the drift analysis: list the model endpoints and update the drift metrics of
    every active, monitoring-enabled endpoint that is not a router.
    """
    try:
        endpoints = self.db.list_model_endpoints(uids=self.model_endpoints)
    except Exception as e:
        logger.error("Failed to list endpoints", exc=e)
        return

    mm_schemas = mlrun.common.schemas.model_monitoring
    event_fields = mm_schemas.EventFieldType

    for endpoint in endpoints:
        # Only active endpoints are analyzed
        if not endpoint[event_fields.ACTIVE]:
            continue

        # Only endpoints with an enabled monitoring mode are analyzed
        if (
            endpoint[event_fields.MONITORING_MODE]
            != mm_schemas.ModelMonitoringMode.enabled.value
        ):
            continue

        # Router endpoint has no feature stats
        if (
            int(endpoint[event_fields.ENDPOINT_TYPE])
            == mm_schemas.EndpointType.ROUTER
        ):
            logger.info(f"{endpoint[event_fields.UID]} is router skipping")
            continue

        self.update_drift_metrics(endpoint=endpoint)
531
-
532
def update_drift_metrics(self, endpoint: dict):
    """
    Calculate the drift of a single model endpoint over the current batch interval and store the results.

    The monitoring feature set of the endpoint is read for the batch interval, its statistics are compared
    against the feature stats that are stored in the endpoint, and the results are written to the model
    endpoint store and to V3IO TSDB (non-CE mode) or Prometheus (CE mode).

    Any exception that occurs during the process is logged and kept in `self.exception` instead of being raised.

    :param endpoint: The model endpoint record, as returned from the model endpoint store.
    """
    event_fields = mlrun.common.schemas.model_monitoring.EventFieldType
    endpoint_id = endpoint[event_fields.UID]

    try:
        m_fs = fstore.get_feature_set(endpoint[event_fields.FEATURE_SET_URI])

        # Getting batch interval start time and end time
        start_time, end_time = self._get_interval_range()

        try:
            df = m_fs.to_dataframe(
                start_time=start_time,
                end_time=end_time,
                time_column=event_fields.TIMESTAMP,
            )

            if len(df) == 0:
                # Report the interval that was actually queried
                logger.warn(
                    "Not enough model events since the beginning of the batch interval",
                    parquet_target=m_fs.status.targets[0].path,
                    endpoint=endpoint_id,
                    min_rqeuired_events=mlrun.mlconf.model_endpoint_monitoring.parquet_batching_max_events,
                    start_time=str(start_time),
                    end_time=str(end_time),
                )
                return

        # TODO: The below warn will be removed once the state of the Feature Store target is updated
        #       as expected. In that case, the existence of the file will be checked before trying to get
        #       the offline data from the feature set.
        # Continue if not enough events provided since the deployment of the model endpoint
        except FileNotFoundError:
            logger.warn(
                "Parquet not found, probably due to not enough model events",
                parquet_target=m_fs.status.targets[0].path,
                endpoint=endpoint_id,
                min_rqeuired_events=mlrun.mlconf.model_endpoint_monitoring.parquet_batching_max_events,
            )
            return

        # Get feature names from monitoring feature set
        feature_names = [
            feature_name["name"] for feature_name in m_fs.spec.features.to_dict()
        ]

        # The statistics are calculated over the timestamp, the features and the labels (if provided)
        stats_columns = [event_fields.TIMESTAMP, *feature_names]
        labels = endpoint[event_fields.LABEL_NAMES]
        if labels:
            # The label names may be stored as a JSON string
            if isinstance(labels, str):
                labels = json.loads(labels)
            for label in labels:
                if label not in stats_columns:
                    stats_columns.append(label)
        named_features_df = df[stats_columns].copy()

        # Infer feature set stats and schema
        fstore.api._infer_from_static_df(
            named_features_df,
            m_fs,
            options=mlrun.data_types.infer.InferOptions.all_stats(),
        )

        # Save feature set to apply changes
        m_fs.save()

        # Get the timestamp of the latest request:
        timestamp = df[event_fields.TIMESTAMP].iloc[-1]

        # Get the feature stats from the model endpoint for reference data
        feature_stats = json.loads(endpoint[event_fields.FEATURE_STATS])
        # Pad the original feature stats to accommodate current data out
        # of the original range (unless already padded)
        mlrun.common.model_monitoring.helpers.pad_features_hist(
            mlrun.common.model_monitoring.helpers.FeatureStats(feature_stats)
        )

        # Get the current stats:
        current_stats = calculate_inputs_statistics(
            sample_set_statistics=feature_stats,
            inputs=named_features_df,
        )

        # Compute the drift based on the histogram of the current stats and the histogram of the original
        # feature stats that can be found in the model endpoint object:
        drift_result = self.virtual_drift.compute_drift_from_histograms(
            feature_stats=feature_stats,
            current_stats=current_stats,
        )
        logger.info("Drift result", drift_result=drift_result)

        # Get drift thresholds from the model configuration:
        monitor_configuration = (
            json.loads(endpoint[event_fields.MONITOR_CONFIGURATION]) or {}
        )

        # For backwards compatibility first check if the old drift thresholds
        # (both `possible_drift` and `drift_detected`) keys exist in the monitor configuration dict
        # TODO: Remove the first get in 1.7.0
        possible_drift = monitor_configuration.get(
            "possible_drift",
            monitor_configuration.get(
                event_fields.POSSIBLE_DRIFT_THRESHOLD,
                self.default_possible_drift_threshold,
            ),
        )
        drift_detected = monitor_configuration.get(
            "drift_detected",
            monitor_configuration.get(
                event_fields.DRIFT_DETECTED_THRESHOLD,
                self.default_drift_detected_threshold,
            ),
        )

        # Check for possible drift based on the results of the statistical metrics defined above:
        drift_status, drift_measure = self.virtual_drift.check_for_drift(
            metrics_results_dictionary=drift_result,
            possible_drift_threshold=possible_drift,
            drift_detected_threshold=drift_detected,
        )
        logger.info(
            "Drift status",
            endpoint_id=endpoint_id,
            drift_status=drift_status.value,
            drift_measure=drift_measure,
        )

        attributes = {
            "current_stats": json.dumps(current_stats),
            "drift_measures": json.dumps(drift_result),
            "drift_status": drift_status.value,
        }

        self.db.update_model_endpoint(
            endpoint_id=endpoint_id,
            attributes=attributes,
        )

        if not mlrun.mlconf.is_ce_mode():
            # Generate V3IO KV schema if not exist
            self._infer_kv_schema()

            # Update drift results in TSDB
            self._update_drift_in_v3io_tsdb(
                endpoint_id=endpoint_id,
                drift_status=drift_status,
                drift_measure=drift_measure,
                drift_result=drift_result,
                timestamp=timestamp,
            )

        else:
            # Update drift results in Prometheus
            self._update_drift_in_prometheus(
                endpoint_id=endpoint_id,
                drift_status=drift_status,
                drift_result=drift_result,
            )

    except Exception as e:
        # Best effort: a failure of one endpoint must not stop the processing of the others. The exception
        # is attached to the log record and kept on the instance.
        logger.error(
            f"Exception for endpoint {endpoint_id}",
            exc=e,
        )
        self.exception = e
    logger.info(
        "Done updating drift measures",
        endpoint_id=endpoint_id,
    )
739
-
740
def _get_interval_range(self) -> tuple[datetime.datetime, datetime.datetime]:
    """
    Getting batch interval time range

    :returns: A tuple of (start time, end time), where the end time is the current local time and the start
              time precedes it by the minutes, hours and days that are defined in the batch dictionary.
    """
    event_fields = mlrun.common.schemas.model_monitoring.EventFieldType
    interval = datetime.timedelta(
        minutes=self.batch_dict[event_fields.MINUTES],
        hours=self.batch_dict[event_fields.HOURS],
        days=self.batch_dict[event_fields.DAYS],
    )
    # Read the clock once, so that the returned range spans exactly the configured interval
    end_time = datetime.datetime.now()
    start_time = end_time - interval
    return start_time, end_time
754
-
755
def _parse_batch_dict_str(self):
    """Convert the batch dictionary string in ``self.batch_dict`` into a dictionary.

    Curly braces and spaces are removed, the remainder is split on commas into
    ``key:value`` pairs, and each value is converted to ``float``. Empty segments
    (for example from ``"{}"`` or a trailing comma) are skipped. The parsed
    dictionary replaces ``self.batch_dict`` only after the whole string has been
    parsed successfully.

    :raises IndexError: If a non-empty segment does not contain a ``:`` separator.
    :raises ValueError: If a value cannot be converted to ``float``.
    """
    # Remove unnecessary characters ("{", "}" and spaces) from the provided string
    cleaned = self.batch_dict.translate(str.maketrans("", "", "{} "))

    # Build the dictionary of batch interval ranges locally, so that a failure in
    # the middle of parsing does not leave a half-filled `self.batch_dict` behind
    parsed = {}
    for pair in cleaned.split(","):
        if not pair:
            # Nothing between two commas (or the string held no pairs at all)
            continue
        pair_list = pair.split(":")
        parsed[pair_list[0]] = float(pair_list[1])
    self.batch_dict = parsed
766
-
767
def _update_drift_in_v3io_tsdb(
    self,
    endpoint_id: str,
    drift_status: mlrun.common.schemas.model_monitoring.DriftStatus,
    drift_measure: float,
    drift_result: dict[str, dict[str, Any]],
    timestamp: pd.Timestamp,
):
    """Publish the drift results to the V3IO stream and write the drift measures to the TSDB.

    A stream record is emitted only when the status is ``POSSIBLE_DRIFT`` or
    ``DRIFT_DETECTED``. A failure to write to the TSDB is logged and not re-raised.

    :param endpoint_id:   The unique id of the model endpoint.
    :param drift_status:  Drift status result. Possible values can be found under DriftStatus enum class.
    :param drift_measure: The drift result (float) based on the mean of the Total Variance Distance and the
                          Hellinger distance.
    :param drift_result:  A dictionary that includes the drift results for each feature.
    :param timestamp:     Pandas Timestamp value.
    """
    drift_statuses = mlrun.common.schemas.model_monitoring.DriftStatus
    should_notify = (
        drift_status == drift_statuses.POSSIBLE_DRIFT
        or drift_status == drift_statuses.DRIFT_DETECTED
    )
    if should_notify:
        stream_event = {
            "endpoint_id": endpoint_id,
            "drift_status": drift_status.value,
            "drift_measure": drift_measure,
            "drift_per_feature": {**drift_result},
        }
        self.v3io.stream.put_records(
            container=self.stream_container,
            stream_path=self.stream_path,
            records=[{"data": json.dumps(stream_event)}],
        )

    # NOTE(review): the indentation of this listing was lost; the TSDB write below is
    # assumed to run regardless of the drift status - confirm against the original file.
    # Update the results in tsdb:
    measures_row = {
        "endpoint_id": endpoint_id,
        "timestamp": timestamp,
        "record_type": "drift_measures",
    }
    for mean_key in ("tvd_mean", "kld_mean", "hellinger_mean"):
        measures_row[mean_key] = drift_result[mean_key]

    try:
        self.frames.write(
            backend="tsdb",
            table=self.tsdb_path,
            dfs=pd.DataFrame.from_records([measures_row]),
            index_cols=["timestamp", "endpoint_id", "record_type"],
        )
    except v3io_frames.errors.Error as err:
        # Best effort: a TSDB write failure is reported but does not stop the batch
        logger.warn(
            "Could not write drift measures to TSDB",
            err=err,
            tsdb_path=self.tsdb_path,
            endpoint=endpoint_id,
        )
833
-
834
def _update_drift_in_prometheus(
    self,
    endpoint_id: str,
    drift_status: mlrun.common.schemas.model_monitoring.DriftStatus,
    drift_result: dict[str, dict[str, Any]],
):
    """Push drift metrics to Prometheus registry. Please note that the metrics are being pushed through HTTP
    to the monitoring stream pod that writes them into a local registry. Afterwards, Prometheus will scrape these
    metrics that will be available in the Grafana charts.

    A connection error on any of the HTTP requests is logged as a warning and not re-raised.

    :param endpoint_id:  The unique id of the model endpoint.
    :param drift_status: Drift status result. Possible values can be found under DriftStatus enum class.
    :param drift_result: A dictionary that includes the drift results for each feature.
    """
    fields = mlrun.common.schemas.model_monitoring.EventFieldType
    sink_url = mlrun.mlconf.model_endpoint_monitoring.default_http_sink.format(
        project=self.project, namespace=mlrun.mlconf.namespace
    )
    session = mlrun.utils.HTTPSessionWithRetry(
        retry_on_post=True,
        verbose=True,
        max_retries=1,
    )

    try:
        # Model monitoring stream http health check
        session.request("GET", url=sink_url)

        # Update statistical metrics
        metrics_payload = [
            {
                fields.ENDPOINT_ID: endpoint_id,
                fields.METRIC: metric_name,
                fields.VALUE: drift_result[metric_name],
            }
            for metric_name in ("hellinger_mean", "tvd_mean", "kld_mean")
        ]
        session.request(
            method="POST",
            url=sink_url + "/monitoring-batch-metrics",
            data=json.dumps(metrics_payload),
        )

        # Update drift status
        status_payload = {
            fields.ENDPOINT_ID: endpoint_id,
            fields.DRIFT_STATUS: drift_status.value,
        }
        session.request(
            method="POST",
            url=sink_url + "/monitoring-drift-status",
            data=json.dumps(status_payload),
        )

    except requests.exceptions.ConnectionError as exc:
        logger.warning(
            "Can't push metrics to Prometheus registry. "
            "Monitoring stream pod is not found, probably not deployed. "
            "To deploy, call set_tracking() on a serving function. exc: ",
            exc=exc,
        )
904
-
905
def _infer_kv_schema(self):
    """
    Make sure the KV table has a schema file, creating one when none is found.
    This schema is being used by the Grafana dashboards.
    """
    table_path = self.db.path

    # Look for the schema item of the KV table
    schema_cursor = self.db.client.kv.new_cursor(
        container=self.db.container,
        table_path=table_path,
        filter_expression='__name==".#schema"',
    )
    if schema_cursor.all():
        # A schema file already exists - nothing to do
        return

    logger.info("Generate a new V3IO KV schema file", kv_table_path=table_path)
    self.frames.execute(backend="kv", table=table_path, command="infer_schema")
923
-
924
-
925
def handler(context: mlrun.run.MLClientCtx):
    """Job entry point: build a ``BatchProcessor`` for the context's project and run it.

    After the run completes, an exception stored on the processor (its ``exception``
    attribute) is re-raised so the failure reaches the caller.

    :param context: The MLRun client context; also supplies the project name.
    """
    processor = BatchProcessor(context=context, project=context.project)
    processor.post_init()
    processor.run()

    stored_error = processor.exception
    if stored_error:
        raise stored_error