mlrun 1.7.1rc4__py3-none-any.whl → 1.8.0rc8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mlrun might be problematic. Click here for more details.

Files changed (257) hide show
  1. mlrun/__init__.py +23 -21
  2. mlrun/__main__.py +3 -3
  3. mlrun/alerts/alert.py +148 -14
  4. mlrun/artifacts/__init__.py +1 -2
  5. mlrun/artifacts/base.py +46 -12
  6. mlrun/artifacts/dataset.py +16 -16
  7. mlrun/artifacts/document.py +334 -0
  8. mlrun/artifacts/manager.py +15 -13
  9. mlrun/artifacts/model.py +66 -53
  10. mlrun/common/constants.py +7 -0
  11. mlrun/common/formatters/__init__.py +1 -0
  12. mlrun/common/formatters/feature_set.py +1 -0
  13. mlrun/common/formatters/function.py +1 -0
  14. mlrun/{model_monitoring/db/stores/base/__init__.py → common/formatters/model_endpoint.py} +16 -1
  15. mlrun/common/formatters/pipeline.py +1 -2
  16. mlrun/common/formatters/project.py +9 -0
  17. mlrun/common/model_monitoring/__init__.py +0 -5
  18. mlrun/common/model_monitoring/helpers.py +1 -29
  19. mlrun/common/runtimes/constants.py +1 -2
  20. mlrun/common/schemas/__init__.py +6 -2
  21. mlrun/common/schemas/alert.py +111 -19
  22. mlrun/common/schemas/api_gateway.py +3 -3
  23. mlrun/common/schemas/artifact.py +11 -7
  24. mlrun/common/schemas/auth.py +6 -4
  25. mlrun/common/schemas/background_task.py +7 -7
  26. mlrun/common/schemas/client_spec.py +2 -3
  27. mlrun/common/schemas/clusterization_spec.py +2 -2
  28. mlrun/common/schemas/common.py +53 -3
  29. mlrun/common/schemas/constants.py +15 -0
  30. mlrun/common/schemas/datastore_profile.py +1 -1
  31. mlrun/common/schemas/feature_store.py +9 -9
  32. mlrun/common/schemas/frontend_spec.py +4 -4
  33. mlrun/common/schemas/function.py +10 -10
  34. mlrun/common/schemas/hub.py +1 -1
  35. mlrun/common/schemas/k8s.py +3 -3
  36. mlrun/common/schemas/memory_reports.py +3 -3
  37. mlrun/common/schemas/model_monitoring/__init__.py +2 -1
  38. mlrun/common/schemas/model_monitoring/constants.py +66 -14
  39. mlrun/common/schemas/model_monitoring/grafana.py +1 -1
  40. mlrun/common/schemas/model_monitoring/model_endpoints.py +91 -147
  41. mlrun/common/schemas/notification.py +24 -3
  42. mlrun/common/schemas/object.py +1 -1
  43. mlrun/common/schemas/pagination.py +4 -4
  44. mlrun/common/schemas/partition.py +137 -0
  45. mlrun/common/schemas/pipeline.py +2 -2
  46. mlrun/common/schemas/project.py +25 -17
  47. mlrun/common/schemas/runs.py +2 -2
  48. mlrun/common/schemas/runtime_resource.py +5 -5
  49. mlrun/common/schemas/schedule.py +1 -1
  50. mlrun/common/schemas/secret.py +1 -1
  51. mlrun/common/schemas/tag.py +3 -3
  52. mlrun/common/schemas/workflow.py +5 -5
  53. mlrun/config.py +67 -10
  54. mlrun/data_types/__init__.py +0 -2
  55. mlrun/data_types/infer.py +3 -1
  56. mlrun/data_types/spark.py +2 -1
  57. mlrun/datastore/__init__.py +0 -2
  58. mlrun/datastore/alibaba_oss.py +4 -1
  59. mlrun/datastore/azure_blob.py +4 -1
  60. mlrun/datastore/base.py +12 -4
  61. mlrun/datastore/datastore.py +9 -3
  62. mlrun/datastore/datastore_profile.py +79 -20
  63. mlrun/datastore/dbfs_store.py +4 -1
  64. mlrun/datastore/filestore.py +4 -1
  65. mlrun/datastore/google_cloud_storage.py +4 -1
  66. mlrun/datastore/hdfs.py +4 -1
  67. mlrun/datastore/inmem.py +4 -1
  68. mlrun/datastore/redis.py +4 -1
  69. mlrun/datastore/s3.py +4 -1
  70. mlrun/datastore/sources.py +52 -51
  71. mlrun/datastore/store_resources.py +0 -2
  72. mlrun/datastore/targets.py +21 -21
  73. mlrun/datastore/utils.py +2 -2
  74. mlrun/datastore/v3io.py +4 -1
  75. mlrun/datastore/vectorstore.py +194 -0
  76. mlrun/datastore/wasbfs/fs.py +13 -12
  77. mlrun/db/base.py +208 -82
  78. mlrun/db/factory.py +0 -3
  79. mlrun/db/httpdb.py +1237 -386
  80. mlrun/db/nopdb.py +201 -74
  81. mlrun/errors.py +2 -2
  82. mlrun/execution.py +136 -50
  83. mlrun/feature_store/__init__.py +0 -2
  84. mlrun/feature_store/api.py +41 -40
  85. mlrun/feature_store/common.py +9 -9
  86. mlrun/feature_store/feature_set.py +20 -18
  87. mlrun/feature_store/feature_vector.py +27 -24
  88. mlrun/feature_store/retrieval/base.py +14 -9
  89. mlrun/feature_store/retrieval/job.py +2 -1
  90. mlrun/feature_store/steps.py +2 -2
  91. mlrun/features.py +30 -13
  92. mlrun/frameworks/__init__.py +1 -2
  93. mlrun/frameworks/_common/__init__.py +1 -2
  94. mlrun/frameworks/_common/artifacts_library.py +2 -2
  95. mlrun/frameworks/_common/mlrun_interface.py +10 -6
  96. mlrun/frameworks/_common/model_handler.py +29 -27
  97. mlrun/frameworks/_common/producer.py +3 -1
  98. mlrun/frameworks/_dl_common/__init__.py +1 -2
  99. mlrun/frameworks/_dl_common/loggers/__init__.py +1 -2
  100. mlrun/frameworks/_dl_common/loggers/mlrun_logger.py +4 -4
  101. mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +3 -3
  102. mlrun/frameworks/_ml_common/__init__.py +1 -2
  103. mlrun/frameworks/_ml_common/loggers/__init__.py +1 -2
  104. mlrun/frameworks/_ml_common/model_handler.py +21 -21
  105. mlrun/frameworks/_ml_common/plans/__init__.py +1 -2
  106. mlrun/frameworks/_ml_common/plans/confusion_matrix_plan.py +3 -1
  107. mlrun/frameworks/_ml_common/plans/dataset_plan.py +3 -3
  108. mlrun/frameworks/_ml_common/plans/roc_curve_plan.py +4 -4
  109. mlrun/frameworks/auto_mlrun/__init__.py +1 -2
  110. mlrun/frameworks/auto_mlrun/auto_mlrun.py +22 -15
  111. mlrun/frameworks/huggingface/__init__.py +1 -2
  112. mlrun/frameworks/huggingface/model_server.py +9 -9
  113. mlrun/frameworks/lgbm/__init__.py +47 -44
  114. mlrun/frameworks/lgbm/callbacks/__init__.py +1 -2
  115. mlrun/frameworks/lgbm/callbacks/logging_callback.py +4 -2
  116. mlrun/frameworks/lgbm/callbacks/mlrun_logging_callback.py +4 -2
  117. mlrun/frameworks/lgbm/mlrun_interfaces/__init__.py +1 -2
  118. mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +5 -5
  119. mlrun/frameworks/lgbm/model_handler.py +15 -11
  120. mlrun/frameworks/lgbm/model_server.py +11 -7
  121. mlrun/frameworks/lgbm/utils.py +2 -2
  122. mlrun/frameworks/onnx/__init__.py +1 -2
  123. mlrun/frameworks/onnx/dataset.py +3 -3
  124. mlrun/frameworks/onnx/mlrun_interface.py +2 -2
  125. mlrun/frameworks/onnx/model_handler.py +7 -5
  126. mlrun/frameworks/onnx/model_server.py +8 -6
  127. mlrun/frameworks/parallel_coordinates.py +11 -11
  128. mlrun/frameworks/pytorch/__init__.py +22 -23
  129. mlrun/frameworks/pytorch/callbacks/__init__.py +1 -2
  130. mlrun/frameworks/pytorch/callbacks/callback.py +2 -1
  131. mlrun/frameworks/pytorch/callbacks/logging_callback.py +15 -8
  132. mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +19 -12
  133. mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +22 -15
  134. mlrun/frameworks/pytorch/callbacks_handler.py +36 -30
  135. mlrun/frameworks/pytorch/mlrun_interface.py +17 -17
  136. mlrun/frameworks/pytorch/model_handler.py +21 -17
  137. mlrun/frameworks/pytorch/model_server.py +13 -9
  138. mlrun/frameworks/sklearn/__init__.py +19 -18
  139. mlrun/frameworks/sklearn/estimator.py +2 -2
  140. mlrun/frameworks/sklearn/metric.py +3 -3
  141. mlrun/frameworks/sklearn/metrics_library.py +8 -6
  142. mlrun/frameworks/sklearn/mlrun_interface.py +3 -2
  143. mlrun/frameworks/sklearn/model_handler.py +4 -3
  144. mlrun/frameworks/tf_keras/__init__.py +11 -12
  145. mlrun/frameworks/tf_keras/callbacks/__init__.py +1 -2
  146. mlrun/frameworks/tf_keras/callbacks/logging_callback.py +17 -14
  147. mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +15 -12
  148. mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +21 -18
  149. mlrun/frameworks/tf_keras/model_handler.py +17 -13
  150. mlrun/frameworks/tf_keras/model_server.py +12 -8
  151. mlrun/frameworks/xgboost/__init__.py +19 -18
  152. mlrun/frameworks/xgboost/model_handler.py +13 -9
  153. mlrun/launcher/base.py +3 -4
  154. mlrun/launcher/local.py +1 -1
  155. mlrun/launcher/remote.py +1 -1
  156. mlrun/lists.py +4 -3
  157. mlrun/model.py +117 -46
  158. mlrun/model_monitoring/__init__.py +4 -4
  159. mlrun/model_monitoring/api.py +61 -59
  160. mlrun/model_monitoring/applications/_application_steps.py +17 -17
  161. mlrun/model_monitoring/applications/base.py +165 -6
  162. mlrun/model_monitoring/applications/context.py +88 -37
  163. mlrun/model_monitoring/applications/evidently_base.py +1 -2
  164. mlrun/model_monitoring/applications/histogram_data_drift.py +43 -21
  165. mlrun/model_monitoring/applications/results.py +55 -3
  166. mlrun/model_monitoring/controller.py +207 -239
  167. mlrun/model_monitoring/db/__init__.py +0 -2
  168. mlrun/model_monitoring/db/_schedules.py +156 -0
  169. mlrun/model_monitoring/db/_stats.py +189 -0
  170. mlrun/model_monitoring/db/tsdb/base.py +78 -25
  171. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +90 -16
  172. mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +33 -0
  173. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +279 -59
  174. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +1 -0
  175. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +78 -17
  176. mlrun/model_monitoring/helpers.py +152 -49
  177. mlrun/model_monitoring/stream_processing.py +99 -283
  178. mlrun/model_monitoring/tracking_policy.py +10 -3
  179. mlrun/model_monitoring/writer.py +48 -36
  180. mlrun/package/__init__.py +3 -6
  181. mlrun/package/context_handler.py +1 -1
  182. mlrun/package/packager.py +12 -9
  183. mlrun/package/packagers/__init__.py +0 -2
  184. mlrun/package/packagers/default_packager.py +14 -11
  185. mlrun/package/packagers/numpy_packagers.py +16 -7
  186. mlrun/package/packagers/pandas_packagers.py +18 -18
  187. mlrun/package/packagers/python_standard_library_packagers.py +25 -11
  188. mlrun/package/packagers_manager.py +31 -14
  189. mlrun/package/utils/__init__.py +0 -3
  190. mlrun/package/utils/_pickler.py +6 -6
  191. mlrun/platforms/__init__.py +47 -16
  192. mlrun/platforms/iguazio.py +4 -1
  193. mlrun/projects/operations.py +27 -27
  194. mlrun/projects/pipelines.py +75 -38
  195. mlrun/projects/project.py +865 -206
  196. mlrun/run.py +53 -10
  197. mlrun/runtimes/__init__.py +1 -3
  198. mlrun/runtimes/base.py +15 -11
  199. mlrun/runtimes/daskjob.py +9 -9
  200. mlrun/runtimes/generators.py +2 -1
  201. mlrun/runtimes/kubejob.py +4 -5
  202. mlrun/runtimes/mounts.py +572 -0
  203. mlrun/runtimes/mpijob/__init__.py +0 -2
  204. mlrun/runtimes/mpijob/abstract.py +7 -6
  205. mlrun/runtimes/nuclio/api_gateway.py +7 -7
  206. mlrun/runtimes/nuclio/application/application.py +11 -11
  207. mlrun/runtimes/nuclio/function.py +19 -17
  208. mlrun/runtimes/nuclio/serving.py +18 -11
  209. mlrun/runtimes/pod.py +154 -45
  210. mlrun/runtimes/remotesparkjob.py +3 -2
  211. mlrun/runtimes/sparkjob/__init__.py +0 -2
  212. mlrun/runtimes/sparkjob/spark3job.py +21 -11
  213. mlrun/runtimes/utils.py +6 -5
  214. mlrun/serving/merger.py +6 -4
  215. mlrun/serving/remote.py +18 -17
  216. mlrun/serving/routers.py +185 -172
  217. mlrun/serving/server.py +7 -1
  218. mlrun/serving/states.py +97 -78
  219. mlrun/serving/utils.py +13 -2
  220. mlrun/serving/v1_serving.py +3 -2
  221. mlrun/serving/v2_serving.py +74 -65
  222. mlrun/track/__init__.py +1 -1
  223. mlrun/track/tracker.py +2 -2
  224. mlrun/track/trackers/mlflow_tracker.py +6 -5
  225. mlrun/utils/async_http.py +1 -1
  226. mlrun/utils/clones.py +1 -1
  227. mlrun/utils/helpers.py +66 -18
  228. mlrun/utils/logger.py +106 -4
  229. mlrun/utils/notifications/notification/__init__.py +22 -19
  230. mlrun/utils/notifications/notification/base.py +33 -14
  231. mlrun/utils/notifications/notification/console.py +6 -6
  232. mlrun/utils/notifications/notification/git.py +11 -11
  233. mlrun/utils/notifications/notification/ipython.py +10 -9
  234. mlrun/utils/notifications/notification/mail.py +176 -0
  235. mlrun/utils/notifications/notification/slack.py +6 -6
  236. mlrun/utils/notifications/notification/webhook.py +6 -6
  237. mlrun/utils/notifications/notification_pusher.py +86 -44
  238. mlrun/utils/regex.py +3 -1
  239. mlrun/utils/version/version.json +2 -2
  240. {mlrun-1.7.1rc4.dist-info → mlrun-1.8.0rc8.dist-info}/METADATA +191 -186
  241. mlrun-1.8.0rc8.dist-info/RECORD +347 -0
  242. {mlrun-1.7.1rc4.dist-info → mlrun-1.8.0rc8.dist-info}/WHEEL +1 -1
  243. mlrun/model_monitoring/db/stores/__init__.py +0 -136
  244. mlrun/model_monitoring/db/stores/base/store.py +0 -213
  245. mlrun/model_monitoring/db/stores/sqldb/__init__.py +0 -13
  246. mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +0 -71
  247. mlrun/model_monitoring/db/stores/sqldb/models/base.py +0 -190
  248. mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +0 -103
  249. mlrun/model_monitoring/db/stores/sqldb/models/sqlite.py +0 -40
  250. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +0 -659
  251. mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +0 -13
  252. mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +0 -726
  253. mlrun/model_monitoring/model_endpoint.py +0 -118
  254. mlrun-1.7.1rc4.dist-info/RECORD +0 -351
  255. {mlrun-1.7.1rc4.dist-info → mlrun-1.8.0rc8.dist-info}/LICENSE +0 -0
  256. {mlrun-1.7.1rc4.dist-info → mlrun-1.8.0rc8.dist-info}/entry_points.txt +0 -0
  257. {mlrun-1.7.1rc4.dist-info → mlrun-1.8.0rc8.dist-info}/top_level.txt +0 -0
@@ -11,31 +11,31 @@
11
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
+
14
15
  import concurrent.futures
15
16
  import datetime
16
17
  import json
17
18
  import os
18
- import re
19
19
  from collections.abc import Iterator
20
- from typing import NamedTuple, Optional, Union, cast
20
+ from contextlib import AbstractContextManager
21
+ from types import TracebackType
22
+ from typing import NamedTuple, Optional, cast
21
23
 
22
- import nuclio
24
+ import nuclio_sdk
23
25
 
24
26
  import mlrun
25
27
  import mlrun.common.schemas.model_monitoring.constants as mm_constants
26
- import mlrun.data_types.infer
27
28
  import mlrun.feature_store as fstore
28
- import mlrun.model_monitoring.db.stores
29
- from mlrun.config import config as mlconf
29
+ import mlrun.model_monitoring
30
+ from mlrun.common.schemas import EndpointType
30
31
  from mlrun.datastore import get_stream_pusher
31
32
  from mlrun.errors import err_to_str
32
- from mlrun.model_monitoring.helpers import (
33
- _BatchDict,
34
- batch_dict2timedelta,
35
- get_stream_path,
36
- )
33
+ from mlrun.model_monitoring.db._schedules import ModelMonitoringSchedulesFile
34
+ from mlrun.model_monitoring.helpers import batch_dict2timedelta, get_stream_path
37
35
  from mlrun.utils import datetime_now, logger
38
36
 
37
+ _SECONDS_IN_DAY = int(datetime.timedelta(days=1).total_seconds())
38
+
39
39
 
40
40
  class _Interval(NamedTuple):
41
41
  start: datetime.datetime
@@ -45,12 +45,12 @@ class _Interval(NamedTuple):
45
45
  class _BatchWindow:
46
46
  def __init__(
47
47
  self,
48
- project: str,
49
- endpoint: str,
48
+ *,
49
+ schedules_file: ModelMonitoringSchedulesFile,
50
50
  application: str,
51
51
  timedelta_seconds: int,
52
- last_updated: Optional[int],
53
- first_request: Optional[int],
52
+ last_updated: int,
53
+ first_request: int,
54
54
  ) -> None:
55
55
  """
56
56
  Initialize a batch window object that handles the batch interval time range
@@ -58,159 +58,124 @@ class _BatchWindow:
58
58
  All the time values are in seconds.
59
59
  The start and stop time are in seconds since the epoch.
60
60
  """
61
- self.project = project
62
- self._endpoint = endpoint
63
61
  self._application = application
64
62
  self._first_request = first_request
65
63
  self._stop = last_updated
66
64
  self._step = timedelta_seconds
67
- self._db = mlrun.model_monitoring.get_store_object(project=self.project)
65
+ self._db = schedules_file
68
66
  self._start = self._get_last_analyzed()
69
67
 
70
- def _get_last_analyzed(self) -> Optional[int]:
71
- try:
72
- last_analyzed = self._db.get_last_analyzed(
73
- endpoint_id=self._endpoint,
74
- application_name=self._application,
75
- )
76
- except mlrun.errors.MLRunNotFoundError:
77
- logger.info(
78
- "No last analyzed time was found for this endpoint and "
79
- "application, as this is probably the first time this "
80
- "application is running. Using the latest between first "
81
- "request time or last update time minus one day instead",
82
- endpoint=self._endpoint,
83
- application=self._application,
84
- first_request=self._first_request,
85
- last_updated=self._stop,
86
- )
87
-
88
- if self._first_request and self._stop:
89
- # TODO : Change the timedelta according to the policy.
90
- first_period_in_seconds = max(
91
- int(datetime.timedelta(days=1).total_seconds()), self._step
92
- ) # max between one day and the base period
93
- return max(
94
- self._first_request,
95
- self._stop - first_period_in_seconds,
96
- )
97
- return self._first_request
68
+ def _get_saved_last_analyzed(self) -> Optional[int]:
69
+ return cast(int, self._db.get_application_time(self._application))
98
70
 
99
- logger.info(
100
- "Got the last analyzed time for this endpoint and application",
101
- endpoint=self._endpoint,
102
- application=self._application,
103
- last_analyzed=last_analyzed,
71
+ def _update_last_analyzed(self, last_analyzed: int) -> None:
72
+ self._db.update_application_time(
73
+ application=self._application, timestamp=last_analyzed
104
74
  )
105
- return last_analyzed
106
75
 
107
- def _update_last_analyzed(self, last_analyzed: int) -> None:
76
+ def _get_initial_last_analyzed(self) -> int:
108
77
  logger.info(
109
- "Updating the last analyzed time for this endpoint and application",
110
- endpoint=self._endpoint,
78
+ "No last analyzed time was found for this endpoint and application, as this is "
79
+ "probably the first time this application is running. Initializing last analyzed "
80
+ "to the latest between first request time or last update time minus one day",
111
81
  application=self._application,
112
- last_analyzed=last_analyzed,
82
+ first_request=self._first_request,
83
+ last_updated=self._stop,
113
84
  )
114
-
115
- self._db.update_last_analyzed(
116
- endpoint_id=self._endpoint,
117
- application_name=self._application,
118
- last_analyzed=last_analyzed,
85
+ # max between one day and the base period
86
+ first_period_in_seconds = max(_SECONDS_IN_DAY, self._step)
87
+ return max(
88
+ self._first_request,
89
+ self._stop - first_period_in_seconds,
119
90
  )
120
91
 
121
- def get_intervals(
122
- self,
123
- ) -> Iterator[_Interval]:
124
- """Generate the batch interval time ranges."""
125
- if self._start is not None and self._stop is not None:
126
- entered = False
127
- # Iterate timestamp from start until timestamp <= stop - step
128
- # so that the last interval will end at (timestamp + step) <= stop.
129
- # Add 1 to stop - step to get <= and not <.
130
- for timestamp in range(
131
- self._start, self._stop - self._step + 1, self._step
132
- ):
133
- entered = True
134
- start_time = datetime.datetime.fromtimestamp(
135
- timestamp, tz=datetime.timezone.utc
136
- )
137
- end_time = datetime.datetime.fromtimestamp(
138
- timestamp + self._step, tz=datetime.timezone.utc
139
- )
140
- yield _Interval(start_time, end_time)
141
- self._update_last_analyzed(timestamp + self._step)
142
- if not entered:
143
- logger.info(
144
- "All the data is set, but no complete intervals were found. "
145
- "Wait for last_updated to be updated",
146
- endpoint=self._endpoint,
147
- application=self._application,
148
- start=self._start,
149
- stop=self._stop,
150
- step=self._step,
151
- )
92
+ def _get_last_analyzed(self) -> int:
93
+ saved_last_analyzed = self._get_saved_last_analyzed()
94
+ if saved_last_analyzed is not None:
95
+ return saved_last_analyzed
152
96
  else:
153
- logger.warn(
154
- "The first request time is not found for this endpoint. "
155
- "No intervals will be generated",
156
- endpoint=self._endpoint,
97
+ last_analyzed = self._get_initial_last_analyzed()
98
+ # Update the in-memory DB to avoid duplicate initializations
99
+ self._update_last_analyzed(last_analyzed)
100
+ return last_analyzed
101
+
102
+ def get_intervals(self) -> Iterator[_Interval]:
103
+ """Generate the batch interval time ranges."""
104
+ entered = False
105
+ # Iterate timestamp from start until timestamp <= stop - step
106
+ # so that the last interval will end at (timestamp + step) <= stop.
107
+ # Add 1 to stop - step to get <= and not <.
108
+ for timestamp in range(self._start, self._stop - self._step + 1, self._step):
109
+ entered = True
110
+ start_time = datetime.datetime.fromtimestamp(
111
+ timestamp, tz=datetime.timezone.utc
112
+ )
113
+ end_time = datetime.datetime.fromtimestamp(
114
+ timestamp + self._step, tz=datetime.timezone.utc
115
+ )
116
+ yield _Interval(start_time, end_time)
117
+
118
+ last_analyzed = timestamp + self._step
119
+ self._update_last_analyzed(last_analyzed)
120
+ logger.debug(
121
+ "Updated the last analyzed time for this endpoint and application",
122
+ application=self._application,
123
+ last_analyzed=last_analyzed,
124
+ )
125
+
126
+ if not entered:
127
+ logger.debug(
128
+ "All the data is set, but no complete intervals were found. "
129
+ "Wait for last_updated to be updated",
157
130
  application=self._application,
158
131
  start=self._start,
159
132
  stop=self._stop,
133
+ step=self._step,
160
134
  )
161
135
 
162
136
 
163
- class _BatchWindowGenerator:
164
- def __init__(self, batch_dict: Union[dict, str]) -> None:
137
+ class _BatchWindowGenerator(AbstractContextManager):
138
+ def __init__(self, project: str, endpoint_id: str, window_length: int) -> None:
165
139
  """
166
140
  Initialize a batch window generator object that generates batch window objects
167
141
  for the monitoring functions.
168
142
  """
169
- self._batch_dict = batch_dict
170
- self._norm_batch_dict()
171
- self._timedelta = self._get_timedelta()
172
-
173
- def _norm_batch_dict(self) -> None:
174
- # TODO: This will be removed once the job params can be parsed with different types
175
- # Convert batch dict string into a dictionary
176
- if isinstance(self._batch_dict, str):
177
- self._parse_batch_dict_str()
178
-
179
- def _parse_batch_dict_str(self) -> None:
180
- """Convert batch dictionary string into a valid dictionary"""
181
- characters_to_remove = "{} "
182
- pattern = "[" + characters_to_remove + "]"
183
- # Remove unnecessary characters from the provided string
184
- batch_list = re.sub(pattern, "", self._batch_dict).split(",")
185
- # Initialize the dictionary of batch interval ranges
186
- self._batch_dict = {}
187
- for pair in batch_list:
188
- pair_list = pair.split(":")
189
- self._batch_dict[pair_list[0]] = float(pair_list[1])
190
-
191
- def _get_timedelta(self) -> int:
192
- """Get the timedelta in seconds from the batch dictionary"""
193
- return int(
194
- batch_dict2timedelta(cast(_BatchDict, self._batch_dict)).total_seconds()
143
+ self._project = project
144
+ self._endpoint_id = endpoint_id
145
+ self._timedelta = window_length
146
+ self._schedules_file = ModelMonitoringSchedulesFile(
147
+ project=project, endpoint_id=endpoint_id
148
+ )
149
+
150
+ def __enter__(self) -> "_BatchWindowGenerator":
151
+ self._schedules_file.__enter__()
152
+ return super().__enter__()
153
+
154
+ def __exit__(
155
+ self,
156
+ exc_type: Optional[type[BaseException]],
157
+ exc_value: Optional[BaseException],
158
+ traceback: Optional[TracebackType],
159
+ ) -> Optional[bool]:
160
+ self._schedules_file.__exit__(
161
+ exc_type=exc_type, exc_value=exc_value, traceback=traceback
195
162
  )
196
163
 
197
164
  @classmethod
198
165
  def _get_last_updated_time(
199
- cls, last_request: Optional[str], has_stream: bool
200
- ) -> Optional[int]:
166
+ cls, last_request: datetime.datetime, not_batch_endpoint: bool
167
+ ) -> int:
201
168
  """
202
169
  Get the last updated time of a model endpoint.
203
170
  """
204
- if not last_request:
205
- return None
206
171
  last_updated = int(
207
- cls._date_string2timestamp(last_request)
172
+ last_request.timestamp()
208
173
  - cast(
209
174
  float,
210
175
  mlrun.mlconf.model_endpoint_monitoring.parquet_batching_timeout_secs,
211
176
  )
212
177
  )
213
- if not has_stream:
178
+ if not not_batch_endpoint:
214
179
  # If the endpoint does not have a stream, `last_updated` should be
215
180
  # the minimum between the current time and the last updated time.
216
181
  # This compensates for the bumping mechanism - see
@@ -221,45 +186,38 @@ class _BatchWindowGenerator:
221
186
  )
222
187
  return last_updated
223
188
 
224
- @classmethod
225
- def _normalize_first_request(
226
- cls, first_request: Optional[str], endpoint: str
227
- ) -> Optional[int]:
228
- if not first_request:
229
- logger.debug(
230
- "There is no first request time for this endpoint.",
231
- endpoint=endpoint,
232
- first_request=first_request,
233
- )
234
- return None
235
- return cls._date_string2timestamp(first_request)
236
-
237
- @staticmethod
238
- def _date_string2timestamp(date_string: str) -> int:
239
- return int(datetime.datetime.fromisoformat(date_string).timestamp())
240
-
241
- def get_batch_window(
189
+ def get_intervals(
242
190
  self,
243
- project: str,
244
- endpoint: str,
191
+ *,
245
192
  application: str,
246
- first_request: Optional[str],
247
- last_request: Optional[str],
248
- has_stream: bool,
249
- ) -> _BatchWindow:
193
+ first_request: datetime.datetime,
194
+ last_request: datetime.datetime,
195
+ not_batch_endpoint: bool,
196
+ ) -> Iterator[_Interval]:
250
197
  """
251
198
  Get the batch window for a specific endpoint and application.
252
- first_request is the first request time to the endpoint.
199
+ `first_request` and `last_request` are the timestamps of the first request and last
200
+ request to the endpoint, respectively. They are guaranteed to be nonempty at this point.
253
201
  """
254
-
255
- return _BatchWindow(
256
- project=project,
257
- endpoint=endpoint,
202
+ batch_window = _BatchWindow(
203
+ schedules_file=self._schedules_file,
258
204
  application=application,
259
205
  timedelta_seconds=self._timedelta,
260
- last_updated=self._get_last_updated_time(last_request, has_stream),
261
- first_request=self._normalize_first_request(first_request, endpoint),
206
+ last_updated=self._get_last_updated_time(last_request, not_batch_endpoint),
207
+ first_request=int(first_request.timestamp()),
262
208
  )
209
+ yield from batch_window.get_intervals()
210
+
211
+
212
+ def _get_window_length() -> int:
213
+ """Get the timedelta in seconds from the batch dictionary"""
214
+ return int(
215
+ batch_dict2timedelta(
216
+ json.loads(
217
+ cast(str, os.getenv(mm_constants.EventFieldType.BATCH_INTERVALS_DICT))
218
+ )
219
+ ).total_seconds()
220
+ )
263
221
 
264
222
 
265
223
  class MonitoringApplicationController:
@@ -276,19 +234,11 @@ class MonitoringApplicationController:
276
234
 
277
235
  logger.debug(f"Initializing {self.__class__.__name__}", project=self.project)
278
236
 
279
- self.db = mlrun.model_monitoring.get_store_object(project=self.project)
280
-
281
- self._batch_window_generator = _BatchWindowGenerator(
282
- batch_dict=json.loads(
283
- mlrun.get_secret_or_env(
284
- mm_constants.EventFieldType.BATCH_INTERVALS_DICT
285
- )
286
- )
287
- )
237
+ self._window_length = _get_window_length()
288
238
 
289
239
  self.model_monitoring_access_key = self._get_model_monitoring_access_key()
290
240
  self.storage_options = None
291
- if mlconf.artifact_path.startswith("s3://"):
241
+ if mlrun.mlconf.artifact_path.startswith("s3://"):
292
242
  self.storage_options = mlrun.mlconf.get_s3_storage_options()
293
243
 
294
244
  @staticmethod
@@ -299,6 +249,19 @@ class MonitoringApplicationController:
299
249
  access_key = mlrun.mlconf.get_v3io_access_key()
300
250
  return access_key
301
251
 
252
+ @staticmethod
253
+ def _should_monitor_endpoint(endpoint: mlrun.common.schemas.ModelEndpoint) -> bool:
254
+ return (
255
+ # Is the model endpoint monitored?
256
+ endpoint.status.monitoring_mode == mm_constants.ModelMonitoringMode.enabled
257
+ # Was the model endpoint called? I.e., are the first and last requests nonempty?
258
+ and endpoint.status.first_request
259
+ and endpoint.status.last_request
260
+ # Is the model endpoint not a router endpoint? Router endpoint has no feature stats
261
+ and endpoint.metadata.endpoint_type.value
262
+ != mm_constants.EndpointType.ROUTER.value
263
+ )
264
+
302
265
  def run(self) -> None:
303
266
  """
304
267
  Main method for run all the relevant monitoring applications on each endpoint.
@@ -312,7 +275,10 @@ class MonitoringApplicationController:
312
275
  logger.info("Start running monitoring controller")
313
276
  try:
314
277
  applications_names = []
315
- endpoints = self.db.list_model_endpoints(include_stats=True)
278
+ endpoints_list = mlrun.db.get_run_db().list_model_endpoints(
279
+ project=self.project, tsdb_metrics=True
280
+ )
281
+ endpoints = endpoints_list.endpoints
316
282
  if not endpoints:
317
283
  logger.info("No model endpoints found", project=self.project)
318
284
  return
@@ -349,43 +315,36 @@ class MonitoringApplicationController:
349
315
  exc=err_to_str(e),
350
316
  )
351
317
  return
352
- # Initialize a process pool that will be used to run each endpoint applications on a dedicated process
318
+ # Initialize a thread pool that will be used to monitor each endpoint on a dedicated thread
353
319
  with concurrent.futures.ThreadPoolExecutor(
354
- max_workers=min(len(endpoints), 10),
320
+ max_workers=min(len(endpoints), 10)
355
321
  ) as pool:
356
322
  for endpoint in endpoints:
357
- if (
358
- endpoint[mm_constants.EventFieldType.ACTIVE]
359
- and endpoint[mm_constants.EventFieldType.MONITORING_MODE]
360
- == mm_constants.ModelMonitoringMode.enabled.value
361
- ):
362
- # Skip router endpoint:
363
- if (
364
- int(endpoint[mm_constants.EventFieldType.ENDPOINT_TYPE])
365
- == mm_constants.EndpointType.ROUTER
366
- ):
367
- # Router endpoint has no feature stats
368
- logger.info(
369
- f"{endpoint[mm_constants.EventFieldType.UID]} is router, skipping"
370
- )
371
- continue
323
+ if self._should_monitor_endpoint(endpoint):
372
324
  pool.submit(
373
325
  MonitoringApplicationController.model_endpoint_process,
326
+ project=self.project,
374
327
  endpoint=endpoint,
375
328
  applications_names=applications_names,
376
- batch_window_generator=self._batch_window_generator,
377
- project=self.project,
329
+ window_length=self._window_length,
378
330
  model_monitoring_access_key=self.model_monitoring_access_key,
379
331
  storage_options=self.storage_options,
380
332
  )
333
+ else:
334
+ logger.debug(
335
+ "Skipping endpoint, not ready or not suitable for monitoring",
336
+ endpoint_id=endpoint.metadata.uid,
337
+ endpoint_name=endpoint.metadata.name,
338
+ )
339
+ logger.info("Finished running monitoring controller")
381
340
 
382
341
  @classmethod
383
342
  def model_endpoint_process(
384
343
  cls,
385
- endpoint: dict,
386
- applications_names: list[str],
387
- batch_window_generator: _BatchWindowGenerator,
388
344
  project: str,
345
+ endpoint: mlrun.common.schemas.ModelEndpoint,
346
+ applications_names: list[str],
347
+ window_length: int,
389
348
  model_monitoring_access_key: str,
390
349
  storage_options: Optional[dict] = None,
391
350
  ) -> None:
@@ -401,56 +360,60 @@ class MonitoringApplicationController:
401
360
  :param model_monitoring_access_key: (str) Access key to apply the model monitoring process.
402
361
  :param storage_options: (dict) Storage options for reading the infer parquet files.
403
362
  """
404
- endpoint_id = endpoint[mm_constants.EventFieldType.UID]
405
- has_stream = endpoint[mm_constants.EventFieldType.STREAM_PATH] != ""
406
- m_fs = fstore.get_feature_set(
407
- endpoint[mm_constants.EventFieldType.FEATURE_SET_URI]
363
+ endpoint_id = endpoint.metadata.uid
364
+ not_batch_endpoint = not (
365
+ endpoint.metadata.endpoint_type == EndpointType.BATCH_EP
408
366
  )
367
+ m_fs = fstore.get_feature_set(endpoint.spec.monitoring_feature_set_uri)
409
368
  try:
410
- for application in applications_names:
411
- batch_window = batch_window_generator.get_batch_window(
412
- project=project,
413
- endpoint=endpoint_id,
414
- application=application,
415
- first_request=endpoint[mm_constants.EventFieldType.FIRST_REQUEST],
416
- last_request=endpoint[mm_constants.EventFieldType.LAST_REQUEST],
417
- has_stream=has_stream,
418
- )
419
-
420
- for start_infer_time, end_infer_time in batch_window.get_intervals():
421
- df = m_fs.to_dataframe(
422
- start_time=start_infer_time,
423
- end_time=end_infer_time,
424
- time_column=mm_constants.EventFieldType.TIMESTAMP,
425
- storage_options=storage_options,
426
- )
427
- if len(df) == 0:
428
- logger.info(
429
- "No data found for the given interval",
430
- start=start_infer_time,
431
- end=end_infer_time,
432
- endpoint_id=endpoint_id,
433
- )
434
- else:
435
- logger.info(
436
- "Data found for the given interval",
437
- start=start_infer_time,
438
- end=end_infer_time,
439
- endpoint_id=endpoint_id,
440
- )
441
- cls._push_to_applications(
442
- start_infer_time=start_infer_time,
443
- end_infer_time=end_infer_time,
444
- endpoint_id=endpoint_id,
445
- project=project,
446
- applications_names=[application],
447
- model_monitoring_access_key=model_monitoring_access_key,
369
+ with _BatchWindowGenerator(
370
+ project=project, endpoint_id=endpoint_id, window_length=window_length
371
+ ) as batch_window_generator:
372
+ for application in applications_names:
373
+ for (
374
+ start_infer_time,
375
+ end_infer_time,
376
+ ) in batch_window_generator.get_intervals(
377
+ application=application,
378
+ first_request=endpoint.status.first_request,
379
+ last_request=endpoint.status.last_request,
380
+ not_batch_endpoint=not_batch_endpoint,
381
+ ):
382
+ df = m_fs.to_dataframe(
383
+ start_time=start_infer_time,
384
+ end_time=end_infer_time,
385
+ time_column=mm_constants.EventFieldType.TIMESTAMP,
386
+ storage_options=storage_options,
448
387
  )
388
+ if len(df) == 0:
389
+ logger.info(
390
+ "No data found for the given interval",
391
+ start=start_infer_time,
392
+ end=end_infer_time,
393
+ endpoint_id=endpoint_id,
394
+ )
395
+ else:
396
+ logger.info(
397
+ "Data found for the given interval",
398
+ start=start_infer_time,
399
+ end=end_infer_time,
400
+ endpoint_id=endpoint_id,
401
+ )
402
+ cls._push_to_applications(
403
+ start_infer_time=start_infer_time,
404
+ end_infer_time=end_infer_time,
405
+ endpoint_id=endpoint_id,
406
+ endpoint_name=endpoint.metadata.name,
407
+ project=project,
408
+ applications_names=[application],
409
+ model_monitoring_access_key=model_monitoring_access_key,
410
+ )
411
+ logger.info("Finished processing endpoint", endpoint_id=endpoint_id)
449
412
 
450
413
  except Exception:
451
414
  logger.exception(
452
415
  "Encountered an exception",
453
- endpoint_id=endpoint[mm_constants.EventFieldType.UID],
416
+ endpoint_id=endpoint.metadata.uid,
454
417
  )
455
418
 
456
419
  @staticmethod
@@ -458,6 +421,7 @@ class MonitoringApplicationController:
458
421
  start_infer_time: datetime.datetime,
459
422
  end_infer_time: datetime.datetime,
460
423
  endpoint_id: str,
424
+ endpoint_name: str,
461
425
  project: str,
462
426
  applications_names: list[str],
463
427
  model_monitoring_access_key: str,
@@ -481,6 +445,7 @@ class MonitoringApplicationController:
481
445
  sep=" ", timespec="microseconds"
482
446
  ),
483
447
  mm_constants.ApplicationEvent.ENDPOINT_ID: endpoint_id,
448
+ mm_constants.ApplicationEvent.ENDPOINT_NAME: endpoint_name,
484
449
  mm_constants.ApplicationEvent.OUTPUT_STREAM_URI: get_stream_path(
485
450
  project=project,
486
451
  function_name=mm_constants.MonitoringFunctionNames.WRITER,
@@ -491,14 +456,17 @@ class MonitoringApplicationController:
491
456
  stream_uri = get_stream_path(project=project, function_name=app_name)
492
457
 
493
458
  logger.info(
494
- f"push endpoint_id {endpoint_id} to {app_name} by stream :{stream_uri}"
459
+ "Pushing data to application stream",
460
+ endpoint_id=endpoint_id,
461
+ app_name=app_name,
462
+ stream_uri=stream_uri,
495
463
  )
496
464
  get_stream_pusher(stream_uri, access_key=model_monitoring_access_key).push(
497
465
  [data]
498
466
  )
499
467
 
500
468
 
501
- def handler(context: nuclio.Context, event: nuclio.Event) -> None:
469
+ def handler(context: nuclio_sdk.Context, event: nuclio_sdk.Event) -> None:
502
470
  """
503
471
  Run model monitoring application processor
504
472
 
@@ -12,7 +12,5 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- from .stores import ObjectStoreFactory, get_store_object
16
- from .stores.base import StoreBase
17
15
  from .tsdb import get_tsdb_connector
18
16
  from .tsdb.base import TSDBConnector