mlrun 1.7.2__py3-none-any.whl → 1.8.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mlrun might be problematic. Click here for more details.

Files changed (222)
  1. mlrun/__init__.py +14 -12
  2. mlrun/__main__.py +3 -3
  3. mlrun/alerts/alert.py +19 -12
  4. mlrun/artifacts/__init__.py +0 -2
  5. mlrun/artifacts/base.py +34 -11
  6. mlrun/artifacts/dataset.py +16 -16
  7. mlrun/artifacts/manager.py +13 -13
  8. mlrun/artifacts/model.py +66 -53
  9. mlrun/common/constants.py +6 -0
  10. mlrun/common/formatters/__init__.py +1 -0
  11. mlrun/common/formatters/feature_set.py +1 -0
  12. mlrun/common/formatters/function.py +1 -0
  13. mlrun/common/formatters/model_endpoint.py +30 -0
  14. mlrun/common/formatters/pipeline.py +1 -2
  15. mlrun/common/model_monitoring/__init__.py +0 -3
  16. mlrun/common/model_monitoring/helpers.py +1 -1
  17. mlrun/common/runtimes/constants.py +1 -2
  18. mlrun/common/schemas/__init__.py +4 -2
  19. mlrun/common/schemas/artifact.py +0 -6
  20. mlrun/common/schemas/common.py +50 -0
  21. mlrun/common/schemas/model_monitoring/__init__.py +8 -1
  22. mlrun/common/schemas/model_monitoring/constants.py +62 -12
  23. mlrun/common/schemas/model_monitoring/model_endpoint_v2.py +149 -0
  24. mlrun/common/schemas/model_monitoring/model_endpoints.py +21 -5
  25. mlrun/common/schemas/partition.py +122 -0
  26. mlrun/config.py +43 -15
  27. mlrun/data_types/__init__.py +0 -2
  28. mlrun/data_types/data_types.py +0 -1
  29. mlrun/data_types/infer.py +3 -1
  30. mlrun/data_types/spark.py +4 -4
  31. mlrun/data_types/to_pandas.py +2 -11
  32. mlrun/datastore/__init__.py +0 -2
  33. mlrun/datastore/alibaba_oss.py +4 -1
  34. mlrun/datastore/azure_blob.py +4 -1
  35. mlrun/datastore/base.py +12 -4
  36. mlrun/datastore/datastore.py +9 -3
  37. mlrun/datastore/datastore_profile.py +1 -1
  38. mlrun/datastore/dbfs_store.py +4 -1
  39. mlrun/datastore/filestore.py +4 -1
  40. mlrun/datastore/google_cloud_storage.py +4 -1
  41. mlrun/datastore/hdfs.py +4 -1
  42. mlrun/datastore/inmem.py +4 -1
  43. mlrun/datastore/redis.py +4 -1
  44. mlrun/datastore/s3.py +4 -1
  45. mlrun/datastore/sources.py +51 -49
  46. mlrun/datastore/store_resources.py +0 -2
  47. mlrun/datastore/targets.py +22 -23
  48. mlrun/datastore/utils.py +2 -2
  49. mlrun/datastore/v3io.py +4 -1
  50. mlrun/datastore/wasbfs/fs.py +13 -12
  51. mlrun/db/base.py +126 -62
  52. mlrun/db/factory.py +3 -0
  53. mlrun/db/httpdb.py +767 -231
  54. mlrun/db/nopdb.py +126 -57
  55. mlrun/errors.py +2 -2
  56. mlrun/execution.py +55 -29
  57. mlrun/feature_store/__init__.py +0 -2
  58. mlrun/feature_store/api.py +40 -40
  59. mlrun/feature_store/common.py +9 -9
  60. mlrun/feature_store/feature_set.py +20 -18
  61. mlrun/feature_store/feature_vector.py +27 -24
  62. mlrun/feature_store/retrieval/base.py +14 -9
  63. mlrun/feature_store/retrieval/job.py +2 -1
  64. mlrun/feature_store/steps.py +2 -2
  65. mlrun/features.py +30 -13
  66. mlrun/frameworks/__init__.py +1 -2
  67. mlrun/frameworks/_common/__init__.py +1 -2
  68. mlrun/frameworks/_common/artifacts_library.py +2 -2
  69. mlrun/frameworks/_common/mlrun_interface.py +10 -6
  70. mlrun/frameworks/_common/model_handler.py +29 -27
  71. mlrun/frameworks/_common/producer.py +3 -1
  72. mlrun/frameworks/_dl_common/__init__.py +1 -2
  73. mlrun/frameworks/_dl_common/loggers/__init__.py +1 -2
  74. mlrun/frameworks/_dl_common/loggers/mlrun_logger.py +4 -4
  75. mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +3 -3
  76. mlrun/frameworks/_ml_common/__init__.py +1 -2
  77. mlrun/frameworks/_ml_common/loggers/__init__.py +1 -2
  78. mlrun/frameworks/_ml_common/model_handler.py +21 -21
  79. mlrun/frameworks/_ml_common/plans/__init__.py +1 -2
  80. mlrun/frameworks/_ml_common/plans/confusion_matrix_plan.py +3 -1
  81. mlrun/frameworks/_ml_common/plans/dataset_plan.py +3 -3
  82. mlrun/frameworks/_ml_common/plans/roc_curve_plan.py +4 -4
  83. mlrun/frameworks/auto_mlrun/__init__.py +1 -2
  84. mlrun/frameworks/auto_mlrun/auto_mlrun.py +22 -15
  85. mlrun/frameworks/huggingface/__init__.py +1 -2
  86. mlrun/frameworks/huggingface/model_server.py +9 -9
  87. mlrun/frameworks/lgbm/__init__.py +47 -44
  88. mlrun/frameworks/lgbm/callbacks/__init__.py +1 -2
  89. mlrun/frameworks/lgbm/callbacks/logging_callback.py +4 -2
  90. mlrun/frameworks/lgbm/callbacks/mlrun_logging_callback.py +4 -2
  91. mlrun/frameworks/lgbm/mlrun_interfaces/__init__.py +1 -2
  92. mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +5 -5
  93. mlrun/frameworks/lgbm/model_handler.py +15 -11
  94. mlrun/frameworks/lgbm/model_server.py +11 -7
  95. mlrun/frameworks/lgbm/utils.py +2 -2
  96. mlrun/frameworks/onnx/__init__.py +1 -2
  97. mlrun/frameworks/onnx/dataset.py +3 -3
  98. mlrun/frameworks/onnx/mlrun_interface.py +2 -2
  99. mlrun/frameworks/onnx/model_handler.py +7 -5
  100. mlrun/frameworks/onnx/model_server.py +8 -6
  101. mlrun/frameworks/parallel_coordinates.py +11 -11
  102. mlrun/frameworks/pytorch/__init__.py +22 -23
  103. mlrun/frameworks/pytorch/callbacks/__init__.py +1 -2
  104. mlrun/frameworks/pytorch/callbacks/callback.py +2 -1
  105. mlrun/frameworks/pytorch/callbacks/logging_callback.py +15 -8
  106. mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +19 -12
  107. mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +22 -15
  108. mlrun/frameworks/pytorch/callbacks_handler.py +36 -30
  109. mlrun/frameworks/pytorch/mlrun_interface.py +17 -17
  110. mlrun/frameworks/pytorch/model_handler.py +21 -17
  111. mlrun/frameworks/pytorch/model_server.py +13 -9
  112. mlrun/frameworks/sklearn/__init__.py +19 -18
  113. mlrun/frameworks/sklearn/estimator.py +2 -2
  114. mlrun/frameworks/sklearn/metric.py +3 -3
  115. mlrun/frameworks/sklearn/metrics_library.py +8 -6
  116. mlrun/frameworks/sklearn/mlrun_interface.py +3 -2
  117. mlrun/frameworks/sklearn/model_handler.py +4 -3
  118. mlrun/frameworks/tf_keras/__init__.py +11 -12
  119. mlrun/frameworks/tf_keras/callbacks/__init__.py +1 -2
  120. mlrun/frameworks/tf_keras/callbacks/logging_callback.py +17 -14
  121. mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +15 -12
  122. mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +21 -18
  123. mlrun/frameworks/tf_keras/model_handler.py +17 -13
  124. mlrun/frameworks/tf_keras/model_server.py +12 -8
  125. mlrun/frameworks/xgboost/__init__.py +19 -18
  126. mlrun/frameworks/xgboost/model_handler.py +13 -9
  127. mlrun/launcher/base.py +3 -4
  128. mlrun/launcher/local.py +1 -1
  129. mlrun/launcher/remote.py +1 -1
  130. mlrun/lists.py +4 -3
  131. mlrun/model.py +108 -44
  132. mlrun/model_monitoring/__init__.py +1 -2
  133. mlrun/model_monitoring/api.py +6 -6
  134. mlrun/model_monitoring/applications/_application_steps.py +13 -15
  135. mlrun/model_monitoring/applications/histogram_data_drift.py +41 -15
  136. mlrun/model_monitoring/applications/results.py +55 -3
  137. mlrun/model_monitoring/controller.py +185 -223
  138. mlrun/model_monitoring/db/_schedules.py +156 -0
  139. mlrun/model_monitoring/db/_stats.py +189 -0
  140. mlrun/model_monitoring/db/stores/__init__.py +1 -1
  141. mlrun/model_monitoring/db/stores/base/store.py +6 -65
  142. mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +0 -25
  143. mlrun/model_monitoring/db/stores/sqldb/models/base.py +0 -97
  144. mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +2 -58
  145. mlrun/model_monitoring/db/stores/sqldb/models/sqlite.py +0 -15
  146. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +6 -257
  147. mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +9 -271
  148. mlrun/model_monitoring/db/tsdb/base.py +74 -22
  149. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +66 -35
  150. mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +33 -0
  151. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +284 -51
  152. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +1 -0
  153. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +35 -17
  154. mlrun/model_monitoring/helpers.py +97 -1
  155. mlrun/model_monitoring/model_endpoint.py +4 -2
  156. mlrun/model_monitoring/stream_processing.py +2 -2
  157. mlrun/model_monitoring/tracking_policy.py +10 -3
  158. mlrun/model_monitoring/writer.py +47 -26
  159. mlrun/package/__init__.py +3 -6
  160. mlrun/package/context_handler.py +1 -1
  161. mlrun/package/packager.py +12 -9
  162. mlrun/package/packagers/__init__.py +0 -2
  163. mlrun/package/packagers/default_packager.py +14 -11
  164. mlrun/package/packagers/numpy_packagers.py +16 -7
  165. mlrun/package/packagers/pandas_packagers.py +18 -18
  166. mlrun/package/packagers/python_standard_library_packagers.py +25 -11
  167. mlrun/package/packagers_manager.py +31 -14
  168. mlrun/package/utils/__init__.py +0 -3
  169. mlrun/package/utils/_pickler.py +6 -6
  170. mlrun/platforms/__init__.py +3 -3
  171. mlrun/platforms/iguazio.py +4 -1
  172. mlrun/projects/__init__.py +1 -6
  173. mlrun/projects/operations.py +27 -27
  174. mlrun/projects/pipelines.py +85 -215
  175. mlrun/projects/project.py +444 -158
  176. mlrun/run.py +9 -9
  177. mlrun/runtimes/__init__.py +1 -3
  178. mlrun/runtimes/base.py +13 -10
  179. mlrun/runtimes/daskjob.py +9 -9
  180. mlrun/runtimes/generators.py +2 -1
  181. mlrun/runtimes/kubejob.py +4 -5
  182. mlrun/runtimes/mpijob/__init__.py +0 -2
  183. mlrun/runtimes/mpijob/abstract.py +7 -6
  184. mlrun/runtimes/nuclio/api_gateway.py +7 -7
  185. mlrun/runtimes/nuclio/application/application.py +11 -11
  186. mlrun/runtimes/nuclio/function.py +14 -14
  187. mlrun/runtimes/nuclio/serving.py +9 -9
  188. mlrun/runtimes/pod.py +74 -29
  189. mlrun/runtimes/remotesparkjob.py +3 -2
  190. mlrun/runtimes/sparkjob/__init__.py +0 -2
  191. mlrun/runtimes/sparkjob/spark3job.py +21 -11
  192. mlrun/runtimes/utils.py +6 -5
  193. mlrun/serving/merger.py +6 -4
  194. mlrun/serving/remote.py +18 -17
  195. mlrun/serving/routers.py +27 -27
  196. mlrun/serving/server.py +1 -1
  197. mlrun/serving/states.py +76 -71
  198. mlrun/serving/utils.py +13 -2
  199. mlrun/serving/v1_serving.py +3 -2
  200. mlrun/serving/v2_serving.py +4 -4
  201. mlrun/track/__init__.py +1 -1
  202. mlrun/track/tracker.py +2 -2
  203. mlrun/track/trackers/mlflow_tracker.py +6 -5
  204. mlrun/utils/async_http.py +1 -1
  205. mlrun/utils/helpers.py +72 -28
  206. mlrun/utils/logger.py +104 -2
  207. mlrun/utils/notifications/notification/base.py +23 -4
  208. mlrun/utils/notifications/notification/console.py +1 -1
  209. mlrun/utils/notifications/notification/git.py +6 -6
  210. mlrun/utils/notifications/notification/ipython.py +5 -4
  211. mlrun/utils/notifications/notification/slack.py +1 -1
  212. mlrun/utils/notifications/notification/webhook.py +13 -17
  213. mlrun/utils/notifications/notification_pusher.py +23 -19
  214. mlrun/utils/regex.py +1 -1
  215. mlrun/utils/version/version.json +2 -2
  216. {mlrun-1.7.2.dist-info → mlrun-1.8.0rc1.dist-info}/METADATA +187 -199
  217. mlrun-1.8.0rc1.dist-info/RECORD +356 -0
  218. {mlrun-1.7.2.dist-info → mlrun-1.8.0rc1.dist-info}/WHEEL +1 -1
  219. mlrun-1.7.2.dist-info/RECORD +0 -351
  220. {mlrun-1.7.2.dist-info → mlrun-1.8.0rc1.dist-info}/LICENSE +0 -0
  221. {mlrun-1.7.2.dist-info → mlrun-1.8.0rc1.dist-info}/entry_points.txt +0 -0
  222. {mlrun-1.7.2.dist-info → mlrun-1.8.0rc1.dist-info}/top_level.txt +0 -0
@@ -11,31 +11,30 @@
11
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
+
14
15
  import concurrent.futures
15
16
  import datetime
16
17
  import json
17
18
  import os
18
- import re
19
19
  from collections.abc import Iterator
20
- from typing import NamedTuple, Optional, Union, cast
20
+ from contextlib import AbstractContextManager
21
+ from types import TracebackType
22
+ from typing import Any, NamedTuple, Optional, cast
21
23
 
22
- import nuclio
24
+ import nuclio_sdk
23
25
 
24
26
  import mlrun
25
27
  import mlrun.common.schemas.model_monitoring.constants as mm_constants
26
- import mlrun.data_types.infer
27
28
  import mlrun.feature_store as fstore
28
- import mlrun.model_monitoring.db.stores
29
- from mlrun.config import config as mlconf
29
+ import mlrun.model_monitoring
30
30
  from mlrun.datastore import get_stream_pusher
31
31
  from mlrun.errors import err_to_str
32
- from mlrun.model_monitoring.helpers import (
33
- _BatchDict,
34
- batch_dict2timedelta,
35
- get_stream_path,
36
- )
32
+ from mlrun.model_monitoring.db._schedules import ModelMonitoringSchedulesFile
33
+ from mlrun.model_monitoring.helpers import batch_dict2timedelta, get_stream_path
37
34
  from mlrun.utils import datetime_now, logger
38
35
 
36
+ _SECONDS_IN_DAY = int(datetime.timedelta(days=1).total_seconds())
37
+
39
38
 
40
39
  class _Interval(NamedTuple):
41
40
  start: datetime.datetime
@@ -45,12 +44,12 @@ class _Interval(NamedTuple):
45
44
  class _BatchWindow:
46
45
  def __init__(
47
46
  self,
48
- project: str,
49
- endpoint: str,
47
+ *,
48
+ schedules_file: ModelMonitoringSchedulesFile,
50
49
  application: str,
51
50
  timedelta_seconds: int,
52
- last_updated: Optional[int],
53
- first_request: Optional[int],
51
+ last_updated: int,
52
+ first_request: int,
54
53
  ) -> None:
55
54
  """
56
55
  Initialize a batch window object that handles the batch interval time range
@@ -58,151 +57,114 @@ class _BatchWindow:
58
57
  All the time values are in seconds.
59
58
  The start and stop time are in seconds since the epoch.
60
59
  """
61
- self.project = project
62
- self._endpoint = endpoint
63
60
  self._application = application
64
61
  self._first_request = first_request
65
62
  self._stop = last_updated
66
63
  self._step = timedelta_seconds
67
- self._db = mlrun.model_monitoring.get_store_object(project=self.project)
64
+ self._db = schedules_file
68
65
  self._start = self._get_last_analyzed()
69
66
 
70
- def _get_last_analyzed(self) -> Optional[int]:
71
- try:
72
- last_analyzed = self._db.get_last_analyzed(
73
- endpoint_id=self._endpoint,
74
- application_name=self._application,
75
- )
76
- except mlrun.errors.MLRunNotFoundError:
77
- logger.info(
78
- "No last analyzed time was found for this endpoint and "
79
- "application, as this is probably the first time this "
80
- "application is running. Using the latest between first "
81
- "request time or last update time minus one day instead",
82
- endpoint=self._endpoint,
83
- application=self._application,
84
- first_request=self._first_request,
85
- last_updated=self._stop,
86
- )
67
+ def _get_saved_last_analyzed(self) -> Optional[int]:
68
+ return self._db.get_application_time(self._application)
87
69
 
88
- if self._first_request and self._stop:
89
- # TODO : Change the timedelta according to the policy.
90
- first_period_in_seconds = max(
91
- int(datetime.timedelta(days=1).total_seconds()), self._step
92
- ) # max between one day and the base period
93
- return max(
94
- self._first_request,
95
- self._stop - first_period_in_seconds,
96
- )
97
- return self._first_request
98
-
99
- logger.info(
100
- "Got the last analyzed time for this endpoint and application",
101
- endpoint=self._endpoint,
102
- application=self._application,
103
- last_analyzed=last_analyzed,
70
+ def _update_last_analyzed(self, last_analyzed: int) -> None:
71
+ self._db.update_application_time(
72
+ application=self._application, timestamp=last_analyzed
104
73
  )
105
- return last_analyzed
106
74
 
107
- def _update_last_analyzed(self, last_analyzed: int) -> None:
75
+ def _get_initial_last_analyzed(self) -> int:
108
76
  logger.info(
109
- "Updating the last analyzed time for this endpoint and application",
110
- endpoint=self._endpoint,
77
+ "No last analyzed time was found for this endpoint and application, as this is "
78
+ "probably the first time this application is running. Initializing last analyzed "
79
+ "to the latest between first request time or last update time minus one day",
111
80
  application=self._application,
112
- last_analyzed=last_analyzed,
81
+ first_request=self._first_request,
82
+ last_updated=self._stop,
113
83
  )
114
-
115
- self._db.update_last_analyzed(
116
- endpoint_id=self._endpoint,
117
- application_name=self._application,
118
- last_analyzed=last_analyzed,
84
+ # max between one day and the base period
85
+ first_period_in_seconds = max(_SECONDS_IN_DAY, self._step)
86
+ return max(
87
+ self._first_request,
88
+ self._stop - first_period_in_seconds,
119
89
  )
120
90
 
121
- def get_intervals(
122
- self,
123
- ) -> Iterator[_Interval]:
124
- """Generate the batch interval time ranges."""
125
- if self._start is not None and self._stop is not None:
126
- entered = False
127
- # Iterate timestamp from start until timestamp <= stop - step
128
- # so that the last interval will end at (timestamp + step) <= stop.
129
- # Add 1 to stop - step to get <= and not <.
130
- for timestamp in range(
131
- self._start, self._stop - self._step + 1, self._step
132
- ):
133
- entered = True
134
- start_time = datetime.datetime.fromtimestamp(
135
- timestamp, tz=datetime.timezone.utc
136
- )
137
- end_time = datetime.datetime.fromtimestamp(
138
- timestamp + self._step, tz=datetime.timezone.utc
139
- )
140
- yield _Interval(start_time, end_time)
141
- self._update_last_analyzed(timestamp + self._step)
142
- if not entered:
143
- logger.info(
144
- "All the data is set, but no complete intervals were found. "
145
- "Wait for last_updated to be updated",
146
- endpoint=self._endpoint,
147
- application=self._application,
148
- start=self._start,
149
- stop=self._stop,
150
- step=self._step,
151
- )
91
+ def _get_last_analyzed(self) -> int:
92
+ saved_last_analyzed = self._get_saved_last_analyzed()
93
+ if saved_last_analyzed is not None:
94
+ return saved_last_analyzed
152
95
  else:
153
- logger.warn(
154
- "The first request time is not found for this endpoint. "
155
- "No intervals will be generated",
156
- endpoint=self._endpoint,
96
+ last_analyzed = self._get_initial_last_analyzed()
97
+ # Update the in-memory DB to avoid duplicate initializations
98
+ self._update_last_analyzed(last_analyzed)
99
+ return last_analyzed
100
+
101
+ def get_intervals(self) -> Iterator[_Interval]:
102
+ """Generate the batch interval time ranges."""
103
+ entered = False
104
+ # Iterate timestamp from start until timestamp <= stop - step
105
+ # so that the last interval will end at (timestamp + step) <= stop.
106
+ # Add 1 to stop - step to get <= and not <.
107
+ for timestamp in range(self._start, self._stop - self._step + 1, self._step):
108
+ entered = True
109
+ start_time = datetime.datetime.fromtimestamp(
110
+ timestamp, tz=datetime.timezone.utc
111
+ )
112
+ end_time = datetime.datetime.fromtimestamp(
113
+ timestamp + self._step, tz=datetime.timezone.utc
114
+ )
115
+ yield _Interval(start_time, end_time)
116
+
117
+ last_analyzed = timestamp + self._step
118
+ self._update_last_analyzed(last_analyzed)
119
+ logger.debug(
120
+ "Updated the last analyzed time for this endpoint and application",
121
+ application=self._application,
122
+ last_analyzed=last_analyzed,
123
+ )
124
+
125
+ if not entered:
126
+ logger.debug(
127
+ "All the data is set, but no complete intervals were found. "
128
+ "Wait for last_updated to be updated",
157
129
  application=self._application,
158
130
  start=self._start,
159
131
  stop=self._stop,
132
+ step=self._step,
160
133
  )
161
134
 
162
135
 
163
- class _BatchWindowGenerator:
164
- def __init__(self, batch_dict: Union[dict, str]) -> None:
136
+ class _BatchWindowGenerator(AbstractContextManager):
137
+ def __init__(self, project: str, endpoint_id: str, window_length: int) -> None:
165
138
  """
166
139
  Initialize a batch window generator object that generates batch window objects
167
140
  for the monitoring functions.
168
141
  """
169
- self._batch_dict = batch_dict
170
- self._norm_batch_dict()
171
- self._timedelta = self._get_timedelta()
172
-
173
- def _norm_batch_dict(self) -> None:
174
- # TODO: This will be removed once the job params can be parsed with different types
175
- # Convert batch dict string into a dictionary
176
- if isinstance(self._batch_dict, str):
177
- self._parse_batch_dict_str()
178
-
179
- def _parse_batch_dict_str(self) -> None:
180
- """Convert batch dictionary string into a valid dictionary"""
181
- characters_to_remove = "{} "
182
- pattern = "[" + characters_to_remove + "]"
183
- # Remove unnecessary characters from the provided string
184
- batch_list = re.sub(pattern, "", self._batch_dict).split(",")
185
- # Initialize the dictionary of batch interval ranges
186
- self._batch_dict = {}
187
- for pair in batch_list:
188
- pair_list = pair.split(":")
189
- self._batch_dict[pair_list[0]] = float(pair_list[1])
190
-
191
- def _get_timedelta(self) -> int:
192
- """Get the timedelta in seconds from the batch dictionary"""
193
- return int(
194
- batch_dict2timedelta(cast(_BatchDict, self._batch_dict)).total_seconds()
142
+ self._project = project
143
+ self._endpoint_id = endpoint_id
144
+ self._timedelta = window_length
145
+ self._schedules_file = ModelMonitoringSchedulesFile(
146
+ project=project, endpoint_id=endpoint_id
147
+ )
148
+
149
+ def __enter__(self) -> "_BatchWindowGenerator":
150
+ self._schedules_file.__enter__()
151
+ return super().__enter__()
152
+
153
+ def __exit__(
154
+ self,
155
+ exc_type: Optional[type[BaseException]],
156
+ exc_value: Optional[BaseException],
157
+ traceback: Optional[TracebackType],
158
+ ) -> Optional[bool]:
159
+ self._schedules_file.__exit__(
160
+ exc_type=exc_type, exc_value=exc_value, traceback=traceback
195
161
  )
196
162
 
197
163
  @classmethod
198
- def _get_last_updated_time(
199
- cls, last_request: Optional[str], has_stream: bool
200
- ) -> Optional[int]:
164
+ def _get_last_updated_time(cls, last_request: str, has_stream: bool) -> int:
201
165
  """
202
166
  Get the last updated time of a model endpoint.
203
167
  """
204
- if not last_request:
205
- return None
206
168
  last_updated = int(
207
169
  cls._date_string2timestamp(last_request)
208
170
  - cast(
@@ -221,45 +183,42 @@ class _BatchWindowGenerator:
221
183
  )
222
184
  return last_updated
223
185
 
224
- @classmethod
225
- def _normalize_first_request(
226
- cls, first_request: Optional[str], endpoint: str
227
- ) -> Optional[int]:
228
- if not first_request:
229
- logger.debug(
230
- "There is no first request time for this endpoint.",
231
- endpoint=endpoint,
232
- first_request=first_request,
233
- )
234
- return None
235
- return cls._date_string2timestamp(first_request)
236
-
237
186
  @staticmethod
238
187
  def _date_string2timestamp(date_string: str) -> int:
239
188
  return int(datetime.datetime.fromisoformat(date_string).timestamp())
240
189
 
241
- def get_batch_window(
190
+ def get_intervals(
242
191
  self,
243
- project: str,
244
- endpoint: str,
192
+ *,
245
193
  application: str,
246
- first_request: Optional[str],
247
- last_request: Optional[str],
194
+ first_request: str,
195
+ last_request: str,
248
196
  has_stream: bool,
249
- ) -> _BatchWindow:
197
+ ) -> Iterator[_Interval]:
250
198
  """
251
199
  Get the batch window for a specific endpoint and application.
252
- first_request is the first request time to the endpoint.
200
+ `first_request` and `last_request` are the timestamps of the first request and last
201
+ request to the endpoint, respectively. They are guaranteed to be nonempty at this point.
253
202
  """
254
-
255
- return _BatchWindow(
256
- project=project,
257
- endpoint=endpoint,
203
+ batch_window = _BatchWindow(
204
+ schedules_file=self._schedules_file,
258
205
  application=application,
259
206
  timedelta_seconds=self._timedelta,
260
207
  last_updated=self._get_last_updated_time(last_request, has_stream),
261
- first_request=self._normalize_first_request(first_request, endpoint),
208
+ first_request=self._date_string2timestamp(first_request),
262
209
  )
210
+ yield from batch_window.get_intervals()
211
+
212
+
213
+ def _get_window_length() -> int:
214
+ """Get the timedelta in seconds from the batch dictionary"""
215
+ return int(
216
+ batch_dict2timedelta(
217
+ json.loads(
218
+ cast(str, os.getenv(mm_constants.EventFieldType.BATCH_INTERVALS_DICT))
219
+ )
220
+ ).total_seconds()
221
+ )
263
222
 
264
223
 
265
224
  class MonitoringApplicationController:
@@ -278,17 +237,11 @@ class MonitoringApplicationController:
278
237
 
279
238
  self.db = mlrun.model_monitoring.get_store_object(project=self.project)
280
239
 
281
- self._batch_window_generator = _BatchWindowGenerator(
282
- batch_dict=json.loads(
283
- mlrun.get_secret_or_env(
284
- mm_constants.EventFieldType.BATCH_INTERVALS_DICT
285
- )
286
- )
287
- )
240
+ self._window_length = _get_window_length()
288
241
 
289
242
  self.model_monitoring_access_key = self._get_model_monitoring_access_key()
290
243
  self.storage_options = None
291
- if mlconf.artifact_path.startswith("s3://"):
244
+ if mlrun.mlconf.artifact_path.startswith("s3://"):
292
245
  self.storage_options = mlrun.mlconf.get_s3_storage_options()
293
246
 
294
247
  @staticmethod
@@ -299,6 +252,22 @@ class MonitoringApplicationController:
299
252
  access_key = mlrun.mlconf.get_v3io_access_key()
300
253
  return access_key
301
254
 
255
+ @staticmethod
256
+ def _should_monitor_endpoint(endpoint: dict[str, Any]) -> bool:
257
+ return (
258
+ # Is the model endpoint active?
259
+ endpoint[mm_constants.EventFieldType.ACTIVE]
260
+ # Is the model endpoint monitored?
261
+ and endpoint[mm_constants.EventFieldType.MONITORING_MODE]
262
+ == mm_constants.ModelMonitoringMode.enabled
263
+ # Was the model endpoint called? I.e., are the first and last requests nonempty?
264
+ and endpoint[mm_constants.EventFieldType.FIRST_REQUEST]
265
+ and endpoint[mm_constants.EventFieldType.LAST_REQUEST]
266
+ # Is the model endpoint not a router endpoint? Router endpoint has no feature stats
267
+ and int(endpoint[mm_constants.EventFieldType.ENDPOINT_TYPE])
268
+ != mm_constants.EndpointType.ROUTER
269
+ )
270
+
302
271
  def run(self) -> None:
303
272
  """
304
273
  Main method for run all the relevant monitoring applications on each endpoint.
@@ -349,32 +318,18 @@ class MonitoringApplicationController:
349
318
  exc=err_to_str(e),
350
319
  )
351
320
  return
352
- # Initialize a process pool that will be used to run each endpoint applications on a dedicated process
321
+ # Initialize a thread pool that will be used to monitor each endpoint on a dedicated thread
353
322
  with concurrent.futures.ThreadPoolExecutor(
354
- max_workers=min(len(endpoints), 10),
323
+ max_workers=min(len(endpoints), 10)
355
324
  ) as pool:
356
325
  for endpoint in endpoints:
357
- if (
358
- endpoint[mm_constants.EventFieldType.ACTIVE]
359
- and endpoint[mm_constants.EventFieldType.MONITORING_MODE]
360
- == mm_constants.ModelMonitoringMode.enabled.value
361
- ):
362
- # Skip router endpoint:
363
- if (
364
- int(endpoint[mm_constants.EventFieldType.ENDPOINT_TYPE])
365
- == mm_constants.EndpointType.ROUTER
366
- ):
367
- # Router endpoint has no feature stats
368
- logger.info(
369
- f"{endpoint[mm_constants.EventFieldType.UID]} is router, skipping"
370
- )
371
- continue
326
+ if self._should_monitor_endpoint(endpoint):
372
327
  pool.submit(
373
328
  MonitoringApplicationController.model_endpoint_process,
329
+ project=self.project,
374
330
  endpoint=endpoint,
375
331
  applications_names=applications_names,
376
- batch_window_generator=self._batch_window_generator,
377
- project=self.project,
332
+ window_length=self._window_length,
378
333
  model_monitoring_access_key=self.model_monitoring_access_key,
379
334
  storage_options=self.storage_options,
380
335
  )
@@ -382,10 +337,10 @@ class MonitoringApplicationController:
382
337
  @classmethod
383
338
  def model_endpoint_process(
384
339
  cls,
340
+ project: str,
385
341
  endpoint: dict,
386
342
  applications_names: list[str],
387
- batch_window_generator: _BatchWindowGenerator,
388
- project: str,
343
+ window_length: int,
389
344
  model_monitoring_access_key: str,
390
345
  storage_options: Optional[dict] = None,
391
346
  ) -> None:
@@ -407,45 +362,49 @@ class MonitoringApplicationController:
407
362
  endpoint[mm_constants.EventFieldType.FEATURE_SET_URI]
408
363
  )
409
364
  try:
410
- for application in applications_names:
411
- batch_window = batch_window_generator.get_batch_window(
412
- project=project,
413
- endpoint=endpoint_id,
414
- application=application,
415
- first_request=endpoint[mm_constants.EventFieldType.FIRST_REQUEST],
416
- last_request=endpoint[mm_constants.EventFieldType.LAST_REQUEST],
417
- has_stream=has_stream,
418
- )
419
-
420
- for start_infer_time, end_infer_time in batch_window.get_intervals():
421
- df = m_fs.to_dataframe(
422
- start_time=start_infer_time,
423
- end_time=end_infer_time,
424
- time_column=mm_constants.EventFieldType.TIMESTAMP,
425
- storage_options=storage_options,
426
- )
427
- if len(df) == 0:
428
- logger.info(
429
- "No data found for the given interval",
430
- start=start_infer_time,
431
- end=end_infer_time,
432
- endpoint_id=endpoint_id,
433
- )
434
- else:
435
- logger.info(
436
- "Data found for the given interval",
437
- start=start_infer_time,
438
- end=end_infer_time,
439
- endpoint_id=endpoint_id,
440
- )
441
- cls._push_to_applications(
442
- start_infer_time=start_infer_time,
443
- end_infer_time=end_infer_time,
444
- endpoint_id=endpoint_id,
445
- project=project,
446
- applications_names=[application],
447
- model_monitoring_access_key=model_monitoring_access_key,
365
+ with _BatchWindowGenerator(
366
+ project=project, endpoint_id=endpoint_id, window_length=window_length
367
+ ) as batch_window_generator:
368
+ for application in applications_names:
369
+ for (
370
+ start_infer_time,
371
+ end_infer_time,
372
+ ) in batch_window_generator.get_intervals(
373
+ application=application,
374
+ first_request=endpoint[
375
+ mm_constants.EventFieldType.FIRST_REQUEST
376
+ ],
377
+ last_request=endpoint[mm_constants.EventFieldType.LAST_REQUEST],
378
+ has_stream=has_stream,
379
+ ):
380
+ df = m_fs.to_dataframe(
381
+ start_time=start_infer_time,
382
+ end_time=end_infer_time,
383
+ time_column=mm_constants.EventFieldType.TIMESTAMP,
384
+ storage_options=storage_options,
448
385
  )
386
+ if len(df) == 0:
387
+ logger.info(
388
+ "No data found for the given interval",
389
+ start=start_infer_time,
390
+ end=end_infer_time,
391
+ endpoint_id=endpoint_id,
392
+ )
393
+ else:
394
+ logger.info(
395
+ "Data found for the given interval",
396
+ start=start_infer_time,
397
+ end=end_infer_time,
398
+ endpoint_id=endpoint_id,
399
+ )
400
+ cls._push_to_applications(
401
+ start_infer_time=start_infer_time,
402
+ end_infer_time=end_infer_time,
403
+ endpoint_id=endpoint_id,
404
+ project=project,
405
+ applications_names=[application],
406
+ model_monitoring_access_key=model_monitoring_access_key,
407
+ )
449
408
 
450
409
  except Exception:
451
410
  logger.exception(
@@ -491,14 +450,17 @@ class MonitoringApplicationController:
491
450
  stream_uri = get_stream_path(project=project, function_name=app_name)
492
451
 
493
452
  logger.info(
494
- f"push endpoint_id {endpoint_id} to {app_name} by stream :{stream_uri}"
453
+ "Pushing data to application stream",
454
+ endpoint_id=endpoint_id,
455
+ app_name=app_name,
456
+ stream_uri=stream_uri,
495
457
  )
496
458
  get_stream_pusher(stream_uri, access_key=model_monitoring_access_key).push(
497
459
  [data]
498
460
  )
499
461
 
500
462
 
501
- def handler(context: nuclio.Context, event: nuclio.Event) -> None:
463
+ def handler(context: nuclio_sdk.Context, event: nuclio_sdk.Event) -> None:
502
464
  """
503
465
  Run model monitoring application processor
504
466