mlrun 1.7.2rc3__py3-none-any.whl → 1.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mlrun might be problematic (the source page links to further details).

Files changed (275)
  1. mlrun/__init__.py +26 -22
  2. mlrun/__main__.py +15 -16
  3. mlrun/alerts/alert.py +150 -15
  4. mlrun/api/schemas/__init__.py +1 -9
  5. mlrun/artifacts/__init__.py +2 -3
  6. mlrun/artifacts/base.py +62 -19
  7. mlrun/artifacts/dataset.py +17 -17
  8. mlrun/artifacts/document.py +454 -0
  9. mlrun/artifacts/manager.py +28 -18
  10. mlrun/artifacts/model.py +91 -59
  11. mlrun/artifacts/plots.py +2 -2
  12. mlrun/common/constants.py +8 -0
  13. mlrun/common/formatters/__init__.py +1 -0
  14. mlrun/common/formatters/artifact.py +1 -1
  15. mlrun/common/formatters/feature_set.py +2 -0
  16. mlrun/common/formatters/function.py +1 -0
  17. mlrun/{model_monitoring/db/stores/v3io_kv/__init__.py → common/formatters/model_endpoint.py} +17 -0
  18. mlrun/common/formatters/pipeline.py +1 -2
  19. mlrun/common/formatters/project.py +9 -0
  20. mlrun/common/model_monitoring/__init__.py +0 -5
  21. mlrun/common/model_monitoring/helpers.py +12 -62
  22. mlrun/common/runtimes/constants.py +25 -4
  23. mlrun/common/schemas/__init__.py +9 -5
  24. mlrun/common/schemas/alert.py +114 -19
  25. mlrun/common/schemas/api_gateway.py +3 -3
  26. mlrun/common/schemas/artifact.py +22 -9
  27. mlrun/common/schemas/auth.py +8 -4
  28. mlrun/common/schemas/background_task.py +7 -7
  29. mlrun/common/schemas/client_spec.py +4 -4
  30. mlrun/common/schemas/clusterization_spec.py +2 -2
  31. mlrun/common/schemas/common.py +53 -3
  32. mlrun/common/schemas/constants.py +15 -0
  33. mlrun/common/schemas/datastore_profile.py +1 -1
  34. mlrun/common/schemas/feature_store.py +9 -9
  35. mlrun/common/schemas/frontend_spec.py +4 -4
  36. mlrun/common/schemas/function.py +10 -10
  37. mlrun/common/schemas/hub.py +1 -1
  38. mlrun/common/schemas/k8s.py +3 -3
  39. mlrun/common/schemas/memory_reports.py +3 -3
  40. mlrun/common/schemas/model_monitoring/__init__.py +4 -8
  41. mlrun/common/schemas/model_monitoring/constants.py +127 -46
  42. mlrun/common/schemas/model_monitoring/grafana.py +18 -12
  43. mlrun/common/schemas/model_monitoring/model_endpoints.py +154 -160
  44. mlrun/common/schemas/notification.py +24 -3
  45. mlrun/common/schemas/object.py +1 -1
  46. mlrun/common/schemas/pagination.py +4 -4
  47. mlrun/common/schemas/partition.py +142 -0
  48. mlrun/common/schemas/pipeline.py +3 -3
  49. mlrun/common/schemas/project.py +26 -18
  50. mlrun/common/schemas/runs.py +3 -3
  51. mlrun/common/schemas/runtime_resource.py +5 -5
  52. mlrun/common/schemas/schedule.py +1 -1
  53. mlrun/common/schemas/secret.py +1 -1
  54. mlrun/{model_monitoring/db/stores/sqldb/__init__.py → common/schemas/serving.py} +10 -1
  55. mlrun/common/schemas/tag.py +3 -3
  56. mlrun/common/schemas/workflow.py +6 -5
  57. mlrun/common/types.py +1 -0
  58. mlrun/config.py +157 -89
  59. mlrun/data_types/__init__.py +5 -3
  60. mlrun/data_types/infer.py +13 -3
  61. mlrun/data_types/spark.py +2 -1
  62. mlrun/datastore/__init__.py +59 -18
  63. mlrun/datastore/alibaba_oss.py +4 -1
  64. mlrun/datastore/azure_blob.py +4 -1
  65. mlrun/datastore/base.py +19 -24
  66. mlrun/datastore/datastore.py +10 -4
  67. mlrun/datastore/datastore_profile.py +178 -45
  68. mlrun/datastore/dbfs_store.py +4 -1
  69. mlrun/datastore/filestore.py +4 -1
  70. mlrun/datastore/google_cloud_storage.py +4 -1
  71. mlrun/datastore/hdfs.py +4 -1
  72. mlrun/datastore/inmem.py +4 -1
  73. mlrun/datastore/redis.py +4 -1
  74. mlrun/datastore/s3.py +14 -3
  75. mlrun/datastore/sources.py +89 -92
  76. mlrun/datastore/store_resources.py +7 -4
  77. mlrun/datastore/storeytargets.py +51 -16
  78. mlrun/datastore/targets.py +38 -31
  79. mlrun/datastore/utils.py +87 -4
  80. mlrun/datastore/v3io.py +4 -1
  81. mlrun/datastore/vectorstore.py +291 -0
  82. mlrun/datastore/wasbfs/fs.py +13 -12
  83. mlrun/db/base.py +286 -100
  84. mlrun/db/httpdb.py +1562 -490
  85. mlrun/db/nopdb.py +250 -83
  86. mlrun/errors.py +6 -2
  87. mlrun/execution.py +194 -50
  88. mlrun/feature_store/__init__.py +2 -10
  89. mlrun/feature_store/api.py +20 -458
  90. mlrun/feature_store/common.py +9 -9
  91. mlrun/feature_store/feature_set.py +20 -18
  92. mlrun/feature_store/feature_vector.py +105 -479
  93. mlrun/feature_store/feature_vector_utils.py +466 -0
  94. mlrun/feature_store/retrieval/base.py +15 -11
  95. mlrun/feature_store/retrieval/job.py +2 -1
  96. mlrun/feature_store/retrieval/storey_merger.py +1 -1
  97. mlrun/feature_store/steps.py +3 -3
  98. mlrun/features.py +30 -13
  99. mlrun/frameworks/__init__.py +1 -2
  100. mlrun/frameworks/_common/__init__.py +1 -2
  101. mlrun/frameworks/_common/artifacts_library.py +2 -2
  102. mlrun/frameworks/_common/mlrun_interface.py +10 -6
  103. mlrun/frameworks/_common/model_handler.py +31 -31
  104. mlrun/frameworks/_common/producer.py +3 -1
  105. mlrun/frameworks/_dl_common/__init__.py +1 -2
  106. mlrun/frameworks/_dl_common/loggers/__init__.py +1 -2
  107. mlrun/frameworks/_dl_common/loggers/mlrun_logger.py +4 -4
  108. mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +3 -3
  109. mlrun/frameworks/_ml_common/__init__.py +1 -2
  110. mlrun/frameworks/_ml_common/loggers/__init__.py +1 -2
  111. mlrun/frameworks/_ml_common/model_handler.py +21 -21
  112. mlrun/frameworks/_ml_common/plans/__init__.py +1 -2
  113. mlrun/frameworks/_ml_common/plans/confusion_matrix_plan.py +3 -1
  114. mlrun/frameworks/_ml_common/plans/dataset_plan.py +3 -3
  115. mlrun/frameworks/_ml_common/plans/roc_curve_plan.py +4 -4
  116. mlrun/frameworks/auto_mlrun/__init__.py +1 -2
  117. mlrun/frameworks/auto_mlrun/auto_mlrun.py +22 -15
  118. mlrun/frameworks/huggingface/__init__.py +1 -2
  119. mlrun/frameworks/huggingface/model_server.py +9 -9
  120. mlrun/frameworks/lgbm/__init__.py +47 -44
  121. mlrun/frameworks/lgbm/callbacks/__init__.py +1 -2
  122. mlrun/frameworks/lgbm/callbacks/logging_callback.py +4 -2
  123. mlrun/frameworks/lgbm/callbacks/mlrun_logging_callback.py +4 -2
  124. mlrun/frameworks/lgbm/mlrun_interfaces/__init__.py +1 -2
  125. mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +5 -5
  126. mlrun/frameworks/lgbm/model_handler.py +15 -11
  127. mlrun/frameworks/lgbm/model_server.py +11 -7
  128. mlrun/frameworks/lgbm/utils.py +2 -2
  129. mlrun/frameworks/onnx/__init__.py +1 -2
  130. mlrun/frameworks/onnx/dataset.py +3 -3
  131. mlrun/frameworks/onnx/mlrun_interface.py +2 -2
  132. mlrun/frameworks/onnx/model_handler.py +7 -5
  133. mlrun/frameworks/onnx/model_server.py +8 -6
  134. mlrun/frameworks/parallel_coordinates.py +11 -11
  135. mlrun/frameworks/pytorch/__init__.py +22 -23
  136. mlrun/frameworks/pytorch/callbacks/__init__.py +1 -2
  137. mlrun/frameworks/pytorch/callbacks/callback.py +2 -1
  138. mlrun/frameworks/pytorch/callbacks/logging_callback.py +15 -8
  139. mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +19 -12
  140. mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +22 -15
  141. mlrun/frameworks/pytorch/callbacks_handler.py +36 -30
  142. mlrun/frameworks/pytorch/mlrun_interface.py +17 -17
  143. mlrun/frameworks/pytorch/model_handler.py +21 -17
  144. mlrun/frameworks/pytorch/model_server.py +13 -9
  145. mlrun/frameworks/sklearn/__init__.py +19 -18
  146. mlrun/frameworks/sklearn/estimator.py +2 -2
  147. mlrun/frameworks/sklearn/metric.py +3 -3
  148. mlrun/frameworks/sklearn/metrics_library.py +8 -6
  149. mlrun/frameworks/sklearn/mlrun_interface.py +3 -2
  150. mlrun/frameworks/sklearn/model_handler.py +4 -3
  151. mlrun/frameworks/tf_keras/__init__.py +11 -12
  152. mlrun/frameworks/tf_keras/callbacks/__init__.py +1 -2
  153. mlrun/frameworks/tf_keras/callbacks/logging_callback.py +17 -14
  154. mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +15 -12
  155. mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +21 -18
  156. mlrun/frameworks/tf_keras/model_handler.py +17 -13
  157. mlrun/frameworks/tf_keras/model_server.py +12 -8
  158. mlrun/frameworks/xgboost/__init__.py +19 -18
  159. mlrun/frameworks/xgboost/model_handler.py +13 -9
  160. mlrun/k8s_utils.py +2 -5
  161. mlrun/launcher/base.py +3 -4
  162. mlrun/launcher/client.py +2 -2
  163. mlrun/launcher/local.py +6 -2
  164. mlrun/launcher/remote.py +1 -1
  165. mlrun/lists.py +8 -4
  166. mlrun/model.py +132 -46
  167. mlrun/model_monitoring/__init__.py +3 -5
  168. mlrun/model_monitoring/api.py +113 -98
  169. mlrun/model_monitoring/applications/__init__.py +0 -5
  170. mlrun/model_monitoring/applications/_application_steps.py +81 -50
  171. mlrun/model_monitoring/applications/base.py +467 -14
  172. mlrun/model_monitoring/applications/context.py +212 -134
  173. mlrun/model_monitoring/{db/stores/base → applications/evidently}/__init__.py +6 -2
  174. mlrun/model_monitoring/applications/evidently/base.py +146 -0
  175. mlrun/model_monitoring/applications/histogram_data_drift.py +89 -56
  176. mlrun/model_monitoring/applications/results.py +67 -15
  177. mlrun/model_monitoring/controller.py +701 -315
  178. mlrun/model_monitoring/db/__init__.py +0 -2
  179. mlrun/model_monitoring/db/_schedules.py +242 -0
  180. mlrun/model_monitoring/db/_stats.py +189 -0
  181. mlrun/model_monitoring/db/tsdb/__init__.py +33 -22
  182. mlrun/model_monitoring/db/tsdb/base.py +243 -49
  183. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +76 -36
  184. mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +33 -0
  185. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connection.py +213 -0
  186. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +534 -88
  187. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +1 -0
  188. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +436 -106
  189. mlrun/model_monitoring/helpers.py +356 -114
  190. mlrun/model_monitoring/stream_processing.py +190 -345
  191. mlrun/model_monitoring/tracking_policy.py +11 -4
  192. mlrun/model_monitoring/writer.py +49 -90
  193. mlrun/package/__init__.py +3 -6
  194. mlrun/package/context_handler.py +2 -2
  195. mlrun/package/packager.py +12 -9
  196. mlrun/package/packagers/__init__.py +0 -2
  197. mlrun/package/packagers/default_packager.py +14 -11
  198. mlrun/package/packagers/numpy_packagers.py +16 -7
  199. mlrun/package/packagers/pandas_packagers.py +18 -18
  200. mlrun/package/packagers/python_standard_library_packagers.py +25 -11
  201. mlrun/package/packagers_manager.py +35 -32
  202. mlrun/package/utils/__init__.py +0 -3
  203. mlrun/package/utils/_pickler.py +6 -6
  204. mlrun/platforms/__init__.py +47 -16
  205. mlrun/platforms/iguazio.py +4 -1
  206. mlrun/projects/operations.py +30 -30
  207. mlrun/projects/pipelines.py +116 -47
  208. mlrun/projects/project.py +1292 -329
  209. mlrun/render.py +5 -9
  210. mlrun/run.py +57 -14
  211. mlrun/runtimes/__init__.py +1 -3
  212. mlrun/runtimes/base.py +30 -22
  213. mlrun/runtimes/daskjob.py +9 -9
  214. mlrun/runtimes/databricks_job/databricks_runtime.py +6 -5
  215. mlrun/runtimes/function_reference.py +5 -2
  216. mlrun/runtimes/generators.py +3 -2
  217. mlrun/runtimes/kubejob.py +6 -7
  218. mlrun/runtimes/mounts.py +574 -0
  219. mlrun/runtimes/mpijob/__init__.py +0 -2
  220. mlrun/runtimes/mpijob/abstract.py +7 -6
  221. mlrun/runtimes/nuclio/api_gateway.py +7 -7
  222. mlrun/runtimes/nuclio/application/application.py +11 -13
  223. mlrun/runtimes/nuclio/application/reverse_proxy.go +66 -64
  224. mlrun/runtimes/nuclio/function.py +127 -70
  225. mlrun/runtimes/nuclio/serving.py +105 -37
  226. mlrun/runtimes/pod.py +159 -54
  227. mlrun/runtimes/remotesparkjob.py +3 -2
  228. mlrun/runtimes/sparkjob/__init__.py +0 -2
  229. mlrun/runtimes/sparkjob/spark3job.py +22 -12
  230. mlrun/runtimes/utils.py +7 -6
  231. mlrun/secrets.py +2 -2
  232. mlrun/serving/__init__.py +8 -0
  233. mlrun/serving/merger.py +7 -5
  234. mlrun/serving/remote.py +35 -22
  235. mlrun/serving/routers.py +186 -240
  236. mlrun/serving/server.py +41 -10
  237. mlrun/serving/states.py +432 -118
  238. mlrun/serving/utils.py +13 -2
  239. mlrun/serving/v1_serving.py +3 -2
  240. mlrun/serving/v2_serving.py +161 -203
  241. mlrun/track/__init__.py +1 -1
  242. mlrun/track/tracker.py +2 -2
  243. mlrun/track/trackers/mlflow_tracker.py +6 -5
  244. mlrun/utils/async_http.py +35 -22
  245. mlrun/utils/clones.py +7 -4
  246. mlrun/utils/helpers.py +511 -58
  247. mlrun/utils/logger.py +119 -13
  248. mlrun/utils/notifications/notification/__init__.py +22 -19
  249. mlrun/utils/notifications/notification/base.py +39 -15
  250. mlrun/utils/notifications/notification/console.py +6 -6
  251. mlrun/utils/notifications/notification/git.py +11 -11
  252. mlrun/utils/notifications/notification/ipython.py +10 -9
  253. mlrun/utils/notifications/notification/mail.py +176 -0
  254. mlrun/utils/notifications/notification/slack.py +16 -8
  255. mlrun/utils/notifications/notification/webhook.py +24 -8
  256. mlrun/utils/notifications/notification_pusher.py +191 -200
  257. mlrun/utils/regex.py +12 -2
  258. mlrun/utils/version/version.json +2 -2
  259. {mlrun-1.7.2rc3.dist-info → mlrun-1.8.0.dist-info}/METADATA +81 -54
  260. mlrun-1.8.0.dist-info/RECORD +351 -0
  261. {mlrun-1.7.2rc3.dist-info → mlrun-1.8.0.dist-info}/WHEEL +1 -1
  262. mlrun/model_monitoring/applications/evidently_base.py +0 -137
  263. mlrun/model_monitoring/db/stores/__init__.py +0 -136
  264. mlrun/model_monitoring/db/stores/base/store.py +0 -213
  265. mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +0 -71
  266. mlrun/model_monitoring/db/stores/sqldb/models/base.py +0 -190
  267. mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +0 -103
  268. mlrun/model_monitoring/db/stores/sqldb/models/sqlite.py +0 -40
  269. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +0 -659
  270. mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +0 -726
  271. mlrun/model_monitoring/model_endpoint.py +0 -118
  272. mlrun-1.7.2rc3.dist-info/RECORD +0 -351
  273. {mlrun-1.7.2rc3.dist-info → mlrun-1.8.0.dist-info}/entry_points.txt +0 -0
  274. {mlrun-1.7.2rc3.dist-info → mlrun-1.8.0.dist-info/licenses}/LICENSE +0 -0
  275. {mlrun-1.7.2rc3.dist-info → mlrun-1.8.0.dist-info}/top_level.txt +0 -0
@@ -11,31 +11,42 @@
11
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
+
15
+ import collections
14
16
  import concurrent.futures
15
17
  import datetime
16
18
  import json
17
19
  import os
18
- import re
20
+ import traceback
21
+ from collections import OrderedDict
19
22
  from collections.abc import Iterator
20
- from typing import NamedTuple, Optional, Union, cast
23
+ from contextlib import AbstractContextManager
24
+ from types import TracebackType
25
+ from typing import Any, NamedTuple, Optional, Union, cast
21
26
 
22
- import nuclio
27
+ import nuclio_sdk
28
+ import pandas as pd
23
29
 
24
30
  import mlrun
25
31
  import mlrun.common.schemas.model_monitoring.constants as mm_constants
26
- import mlrun.data_types.infer
27
32
  import mlrun.feature_store as fstore
28
- import mlrun.model_monitoring.db.stores
29
- from mlrun.config import config as mlconf
30
- from mlrun.datastore import get_stream_pusher
31
- from mlrun.errors import err_to_str
32
- from mlrun.model_monitoring.helpers import (
33
- _BatchDict,
34
- batch_dict2timedelta,
35
- get_stream_path,
33
+ import mlrun.model_monitoring
34
+ import mlrun.model_monitoring.db._schedules as schedules
35
+ import mlrun.model_monitoring.helpers
36
+ import mlrun.platforms.iguazio
37
+ from mlrun.common.schemas import EndpointType
38
+ from mlrun.common.schemas.model_monitoring.constants import (
39
+ ControllerEvent,
40
+ ControllerEventEndpointPolicy,
41
+ ControllerEventKind,
36
42
  )
43
+ from mlrun.errors import err_to_str
44
+ from mlrun.model_monitoring.helpers import batch_dict2timedelta
37
45
  from mlrun.utils import datetime_now, logger
38
46
 
47
+ _SECONDS_IN_DAY = int(datetime.timedelta(days=1).total_seconds())
48
+ _SECONDS_IN_MINUTE = 60
49
+
39
50
 
40
51
  class _Interval(NamedTuple):
41
52
  start: datetime.datetime
@@ -45,12 +56,12 @@ class _Interval(NamedTuple):
45
56
  class _BatchWindow:
46
57
  def __init__(
47
58
  self,
48
- project: str,
49
- endpoint: str,
59
+ *,
60
+ schedules_file: schedules.ModelMonitoringSchedulesFileEndpoint,
50
61
  application: str,
51
62
  timedelta_seconds: int,
52
- last_updated: Optional[int],
53
- first_request: Optional[int],
63
+ last_updated: int,
64
+ first_request: int,
54
65
  ) -> None:
55
66
  """
56
67
  Initialize a batch window object that handles the batch interval time range
@@ -58,159 +69,133 @@ class _BatchWindow:
58
69
  All the time values are in seconds.
59
70
  The start and stop time are in seconds since the epoch.
60
71
  """
61
- self.project = project
62
- self._endpoint = endpoint
63
72
  self._application = application
64
73
  self._first_request = first_request
65
74
  self._stop = last_updated
66
75
  self._step = timedelta_seconds
67
- self._db = mlrun.model_monitoring.get_store_object(project=self.project)
76
+ self._db = schedules_file
68
77
  self._start = self._get_last_analyzed()
69
78
 
70
- def _get_last_analyzed(self) -> Optional[int]:
71
- try:
72
- last_analyzed = self._db.get_last_analyzed(
73
- endpoint_id=self._endpoint,
74
- application_name=self._application,
75
- )
76
- except mlrun.errors.MLRunNotFoundError:
77
- logger.info(
78
- "No last analyzed time was found for this endpoint and "
79
- "application, as this is probably the first time this "
80
- "application is running. Using the latest between first "
81
- "request time or last update time minus one day instead",
82
- endpoint=self._endpoint,
83
- application=self._application,
84
- first_request=self._first_request,
85
- last_updated=self._stop,
86
- )
79
+ def _get_saved_last_analyzed(self) -> Optional[int]:
80
+ return cast(int, self._db.get_application_time(self._application))
87
81
 
88
- if self._first_request and self._stop:
89
- # TODO : Change the timedelta according to the policy.
90
- first_period_in_seconds = max(
91
- int(datetime.timedelta(days=1).total_seconds()), self._step
92
- ) # max between one day and the base period
93
- return max(
94
- self._first_request,
95
- self._stop - first_period_in_seconds,
96
- )
97
- return self._first_request
98
-
99
- logger.info(
100
- "Got the last analyzed time for this endpoint and application",
101
- endpoint=self._endpoint,
102
- application=self._application,
103
- last_analyzed=last_analyzed,
82
+ def _update_last_analyzed(self, last_analyzed: int) -> None:
83
+ self._db.update_application_time(
84
+ application=self._application, timestamp=last_analyzed
104
85
  )
105
- return last_analyzed
106
86
 
107
- def _update_last_analyzed(self, last_analyzed: int) -> None:
87
+ def _get_initial_last_analyzed(self) -> int:
108
88
  logger.info(
109
- "Updating the last analyzed time for this endpoint and application",
110
- endpoint=self._endpoint,
89
+ "No last analyzed time was found for this endpoint and application, as this is "
90
+ "probably the first time this application is running. Initializing last analyzed "
91
+ "to the latest between first request time or last update time minus one day",
111
92
  application=self._application,
112
- last_analyzed=last_analyzed,
93
+ first_request=self._first_request,
94
+ last_updated=self._stop,
113
95
  )
114
-
115
- self._db.update_last_analyzed(
116
- endpoint_id=self._endpoint,
117
- application_name=self._application,
118
- last_analyzed=last_analyzed,
96
+ # max between one day and the base period
97
+ first_period_in_seconds = max(_SECONDS_IN_DAY, self._step)
98
+ return max(
99
+ self._first_request,
100
+ self._stop - first_period_in_seconds,
119
101
  )
120
102
 
121
- def get_intervals(
122
- self,
123
- ) -> Iterator[_Interval]:
124
- """Generate the batch interval time ranges."""
125
- if self._start is not None and self._stop is not None:
126
- entered = False
127
- # Iterate timestamp from start until timestamp <= stop - step
128
- # so that the last interval will end at (timestamp + step) <= stop.
129
- # Add 1 to stop - step to get <= and not <.
130
- for timestamp in range(
131
- self._start, self._stop - self._step + 1, self._step
132
- ):
133
- entered = True
134
- start_time = datetime.datetime.fromtimestamp(
135
- timestamp, tz=datetime.timezone.utc
136
- )
137
- end_time = datetime.datetime.fromtimestamp(
138
- timestamp + self._step, tz=datetime.timezone.utc
139
- )
140
- yield _Interval(start_time, end_time)
141
- self._update_last_analyzed(timestamp + self._step)
142
- if not entered:
143
- logger.info(
144
- "All the data is set, but no complete intervals were found. "
145
- "Wait for last_updated to be updated",
146
- endpoint=self._endpoint,
147
- application=self._application,
148
- start=self._start,
149
- stop=self._stop,
150
- step=self._step,
151
- )
103
+ def _get_last_analyzed(self) -> int:
104
+ saved_last_analyzed = self._get_saved_last_analyzed()
105
+ if saved_last_analyzed is not None:
106
+ return saved_last_analyzed
152
107
  else:
153
- logger.warn(
154
- "The first request time is not found for this endpoint. "
155
- "No intervals will be generated",
156
- endpoint=self._endpoint,
108
+ last_analyzed = self._get_initial_last_analyzed()
109
+ # Update the in-memory DB to avoid duplicate initializations
110
+ self._update_last_analyzed(last_analyzed)
111
+ return last_analyzed
112
+
113
+ def get_intervals(self) -> Iterator[_Interval]:
114
+ """Generate the batch interval time ranges."""
115
+ entered = False
116
+ # Iterate timestamp from start until timestamp <= stop - step
117
+ # so that the last interval will end at (timestamp + step) <= stop.
118
+ # Add 1 to stop - step to get <= and not <.
119
+ for timestamp in range(self._start, self._stop - self._step + 1, self._step):
120
+ entered = True
121
+ start_time = datetime.datetime.fromtimestamp(
122
+ timestamp, tz=datetime.timezone.utc
123
+ )
124
+ end_time = datetime.datetime.fromtimestamp(
125
+ timestamp + self._step, tz=datetime.timezone.utc
126
+ )
127
+ yield _Interval(start_time, end_time)
128
+
129
+ last_analyzed = timestamp + self._step
130
+ self._update_last_analyzed(last_analyzed)
131
+ logger.debug(
132
+ "Updated the last analyzed time for this endpoint and application",
133
+ application=self._application,
134
+ last_analyzed=last_analyzed,
135
+ )
136
+
137
+ if not entered:
138
+ logger.debug(
139
+ "All the data is set, but no complete intervals were found. "
140
+ "Wait for last_updated to be updated",
157
141
  application=self._application,
158
142
  start=self._start,
159
143
  stop=self._stop,
144
+ step=self._step,
160
145
  )
161
146
 
162
147
 
163
- class _BatchWindowGenerator:
164
- def __init__(self, batch_dict: Union[dict, str]) -> None:
148
+ class _BatchWindowGenerator(AbstractContextManager):
149
+ def __init__(
150
+ self, project: str, endpoint_id: str, window_length: Optional[int] = None
151
+ ) -> None:
165
152
  """
166
153
  Initialize a batch window generator object that generates batch window objects
167
154
  for the monitoring functions.
168
155
  """
169
- self._batch_dict = batch_dict
170
- self._norm_batch_dict()
171
- self._timedelta = self._get_timedelta()
172
-
173
- def _norm_batch_dict(self) -> None:
174
- # TODO: This will be removed once the job params can be parsed with different types
175
- # Convert batch dict string into a dictionary
176
- if isinstance(self._batch_dict, str):
177
- self._parse_batch_dict_str()
178
-
179
- def _parse_batch_dict_str(self) -> None:
180
- """Convert batch dictionary string into a valid dictionary"""
181
- characters_to_remove = "{} "
182
- pattern = "[" + characters_to_remove + "]"
183
- # Remove unnecessary characters from the provided string
184
- batch_list = re.sub(pattern, "", self._batch_dict).split(",")
185
- # Initialize the dictionary of batch interval ranges
186
- self._batch_dict = {}
187
- for pair in batch_list:
188
- pair_list = pair.split(":")
189
- self._batch_dict[pair_list[0]] = float(pair_list[1])
190
-
191
- def _get_timedelta(self) -> int:
192
- """Get the timedelta in seconds from the batch dictionary"""
193
- return int(
194
- batch_dict2timedelta(cast(_BatchDict, self._batch_dict)).total_seconds()
156
+ self.batch_window: _BatchWindow = None
157
+ self._project = project
158
+ self._endpoint_id = endpoint_id
159
+ self._timedelta = window_length
160
+ self._schedules_file = schedules.ModelMonitoringSchedulesFileEndpoint(
161
+ project=project, endpoint_id=endpoint_id
195
162
  )
196
163
 
164
+ def __enter__(self) -> "_BatchWindowGenerator":
165
+ self._schedules_file.__enter__()
166
+ return super().__enter__()
167
+
168
+ def __exit__(
169
+ self,
170
+ exc_type: Optional[type[BaseException]],
171
+ exc_value: Optional[BaseException],
172
+ traceback: Optional[TracebackType],
173
+ ) -> Optional[bool]:
174
+ self._schedules_file.__exit__(
175
+ exc_type=exc_type, exc_value=exc_value, traceback=traceback
176
+ )
177
+
178
+ def get_application_list(self) -> set[str]:
179
+ return self._schedules_file.get_application_list()
180
+
181
+ def get_min_last_analyzed(self) -> Optional[int]:
182
+ return self._schedules_file.get_min_timestamp()
183
+
197
184
  @classmethod
198
185
  def _get_last_updated_time(
199
- cls, last_request: Optional[str], has_stream: bool
200
- ) -> Optional[int]:
186
+ cls, last_request: datetime.datetime, not_batch_endpoint: bool
187
+ ) -> int:
201
188
  """
202
189
  Get the last updated time of a model endpoint.
203
190
  """
204
- if not last_request:
205
- return None
206
191
  last_updated = int(
207
- cls._date_string2timestamp(last_request)
192
+ last_request.timestamp()
208
193
  - cast(
209
194
  float,
210
195
  mlrun.mlconf.model_endpoint_monitoring.parquet_batching_timeout_secs,
211
196
  )
212
197
  )
213
- if not has_stream:
198
+ if not not_batch_endpoint:
214
199
  # If the endpoint does not have a stream, `last_updated` should be
215
200
  # the minimum between the current time and the last updated time.
216
201
  # This compensates for the bumping mechanism - see
@@ -221,45 +206,38 @@ class _BatchWindowGenerator:
221
206
  )
222
207
  return last_updated
223
208
 
224
- @classmethod
225
- def _normalize_first_request(
226
- cls, first_request: Optional[str], endpoint: str
227
- ) -> Optional[int]:
228
- if not first_request:
229
- logger.debug(
230
- "There is no first request time for this endpoint.",
231
- endpoint=endpoint,
232
- first_request=first_request,
233
- )
234
- return None
235
- return cls._date_string2timestamp(first_request)
236
-
237
- @staticmethod
238
- def _date_string2timestamp(date_string: str) -> int:
239
- return int(datetime.datetime.fromisoformat(date_string).timestamp())
240
-
241
- def get_batch_window(
209
+ def get_intervals(
242
210
  self,
243
- project: str,
244
- endpoint: str,
211
+ *,
245
212
  application: str,
246
- first_request: Optional[str],
247
- last_request: Optional[str],
248
- has_stream: bool,
249
- ) -> _BatchWindow:
213
+ first_request: datetime.datetime,
214
+ last_request: datetime.datetime,
215
+ not_batch_endpoint: bool,
216
+ ) -> Iterator[_Interval]:
250
217
  """
251
218
  Get the batch window for a specific endpoint and application.
252
- first_request is the first request time to the endpoint.
219
+ `first_request` and `last_request` are the timestamps of the first request and last
220
+ request to the endpoint, respectively. They are guaranteed to be nonempty at this point.
253
221
  """
254
-
255
- return _BatchWindow(
256
- project=project,
257
- endpoint=endpoint,
222
+ self.batch_window = _BatchWindow(
223
+ schedules_file=self._schedules_file,
258
224
  application=application,
259
225
  timedelta_seconds=self._timedelta,
260
- last_updated=self._get_last_updated_time(last_request, has_stream),
261
- first_request=self._normalize_first_request(first_request, endpoint),
226
+ last_updated=self._get_last_updated_time(last_request, not_batch_endpoint),
227
+ first_request=int(first_request.timestamp()),
262
228
  )
229
+ yield from self.batch_window.get_intervals()
230
+
231
+
232
+ def _get_window_length() -> int:
233
+ """Get the timedelta in seconds from the batch dictionary"""
234
+ return int(
235
+ batch_dict2timedelta(
236
+ json.loads(
237
+ cast(str, os.getenv(mm_constants.EventFieldType.BATCH_INTERVALS_DICT))
238
+ )
239
+ ).total_seconds()
240
+ )
263
241
 
264
242
 
265
243
  class MonitoringApplicationController:
@@ -269,27 +247,79 @@ class MonitoringApplicationController:
269
247
  Note that the MonitoringApplicationController object requires access keys along with valid project configurations.
270
248
  """
271
249
 
250
+ _MAX_FEATURE_SET_PER_WORKER = 1000
251
+
272
252
  def __init__(self) -> None:
273
253
  """Initialize Monitoring Application Controller"""
274
254
  self.project = cast(str, mlrun.mlconf.default_project)
275
- self.project_obj = mlrun.load_project(name=self.project, url=self.project)
276
-
255
+ self.project_obj = mlrun.get_run_db().get_project(name=self.project)
277
256
  logger.debug(f"Initializing {self.__class__.__name__}", project=self.project)
278
257
 
279
- self.db = mlrun.model_monitoring.get_store_object(project=self.project)
258
+ self._window_length = _get_window_length()
280
259
 
281
- self._batch_window_generator = _BatchWindowGenerator(
282
- batch_dict=json.loads(
283
- mlrun.get_secret_or_env(
284
- mm_constants.EventFieldType.BATCH_INTERVALS_DICT
285
- )
286
- )
260
+ self.model_monitoring_access_key = self._get_model_monitoring_access_key()
261
+ self.v3io_access_key = mlrun.mlconf.get_v3io_access_key()
262
+ store, _, _ = mlrun.store_manager.get_or_create_store(
263
+ mlrun.mlconf.artifact_path
264
+ )
265
+ self.storage_options = store.get_storage_options()
266
+ self._controller_stream: Optional[
267
+ Union[
268
+ mlrun.platforms.iguazio.OutputStream,
269
+ mlrun.platforms.iguazio.KafkaOutputStream,
270
+ ]
271
+ ] = None
272
+ self._model_monitoring_stream: Optional[
273
+ Union[
274
+ mlrun.platforms.iguazio.OutputStream,
275
+ mlrun.platforms.iguazio.KafkaOutputStream,
276
+ ]
277
+ ] = None
278
+ self.applications_streams: dict[
279
+ str,
280
+ Union[
281
+ mlrun.platforms.iguazio.OutputStream,
282
+ mlrun.platforms.iguazio.KafkaOutputStream,
283
+ ],
284
+ ] = {}
285
+ self.feature_sets: OrderedDict[str, mlrun.feature_store.FeatureSet] = (
286
+ collections.OrderedDict()
287
+ )
288
+ self.tsdb_connector = mlrun.model_monitoring.get_tsdb_connector(
289
+ project=self.project
287
290
  )
288
291
 
289
- self.model_monitoring_access_key = self._get_model_monitoring_access_key()
290
- self.storage_options = None
291
- if mlconf.artifact_path.startswith("s3://"):
292
- self.storage_options = mlrun.mlconf.get_s3_storage_options()
292
+ @property
293
+ def controller_stream(
294
+ self,
295
+ ) -> Union[
296
+ mlrun.platforms.iguazio.OutputStream,
297
+ mlrun.platforms.iguazio.KafkaOutputStream,
298
+ ]:
299
+ if self._controller_stream is None:
300
+ self._controller_stream = mlrun.model_monitoring.helpers.get_output_stream(
301
+ project=self.project,
302
+ function_name=mm_constants.MonitoringFunctionNames.APPLICATION_CONTROLLER,
303
+ v3io_access_key=self.v3io_access_key,
304
+ )
305
+ return self._controller_stream
306
+
307
+ @property
308
+ def model_monitoring_stream(
309
+ self,
310
+ ) -> Union[
311
+ mlrun.platforms.iguazio.OutputStream,
312
+ mlrun.platforms.iguazio.KafkaOutputStream,
313
+ ]:
314
+ if self._model_monitoring_stream is None:
315
+ self._model_monitoring_stream = (
316
+ mlrun.model_monitoring.helpers.get_output_stream(
317
+ project=self.project,
318
+ function_name=mm_constants.MonitoringFunctionNames.STREAM,
319
+ v3io_access_key=self.model_monitoring_access_key,
320
+ )
321
+ )
322
+ return self._model_monitoring_stream
293
323
 
294
324
  @staticmethod
295
325
  def _get_model_monitoring_access_key() -> Optional[str]:
@@ -299,168 +329,302 @@ class MonitoringApplicationController:
299
329
  access_key = mlrun.mlconf.get_v3io_access_key()
300
330
  return access_key
301
331
 
302
- def run(self) -> None:
332
def _should_monitor_endpoint(
    self,
    endpoint: mlrun.common.schemas.ModelEndpoint,
    application_names: set,
    base_period_minutes: int,
    schedules_file: schedules.ModelMonitoringSchedulesFileChief,
) -> bool:
    """
    Decide whether a regular event should be pushed for the given endpoint.

    An endpoint is a candidate only when all of the following hold:

    1. its monitoring mode is enabled,
    2. it has a first request,
    3. it has a last request,
    4. it is not a router endpoint.

    A candidate endpoint is monitored when either:

    1. there are applications to run and the application set differs from the
       one recorded for the endpoint, or the minimal last-analyzed time was
       never set; or
    2. the NOP-event condition holds for the minimal last-analyzed time and the
       (last request, minimal last analyzed) pair differs from the pair stored
       in the chief schedules file. In that case the pair is written to the
       schedules file before returning.

    :param endpoint:            The model endpoint to check.
    :param application_names:   Names of the monitoring applications to run.
    :param base_period_minutes: The monitoring base period, in minutes.
    :param schedules_file:      The chief schedules file of the project.

    :return: ``True`` if a regular event should be pushed, ``False`` otherwise.
    """
    endpoint_uid = endpoint.metadata.uid
    last_timestamp_sent = schedules_file.get_endpoint_last_request(endpoint_uid)
    last_analyzed_sent = schedules_file.get_endpoint_last_analyzed(endpoint_uid)
    logger.debug(
        "Chief should monitor endpoint check",
        last_timestamp_sent=last_timestamp_sent,
        last_analyzed_sent=last_analyzed_sent,
        uid=endpoint.metadata.uid,
    )

    is_candidate = (
        # Is the model endpoint monitored?
        endpoint.status.monitoring_mode == mm_constants.ModelMonitoringMode.enabled
        # Was the model endpoint called? I.e., are the first and last requests nonempty?
        and endpoint.status.first_request
        and endpoint.status.last_request
        # Router endpoints have no feature stats, so they are never monitored
        and endpoint.metadata.endpoint_type.value
        != mm_constants.EndpointType.ROUTER.value
    )
    if not is_candidate:
        logger.info(
            "Should not monitor model endpoint, didn't push regular event",
            endpoint_id=endpoint.metadata.uid,
            endpoint_name=endpoint.metadata.name,
            last_request=endpoint.status.last_request,
            first_request=endpoint.status.first_request,
            endpoint_type=endpoint.metadata.endpoint_type,
            feature_set_uri=endpoint.spec.monitoring_feature_set_uri,
        )
        return False

    with _BatchWindowGenerator(
        project=endpoint.metadata.project,
        endpoint_id=endpoint.metadata.uid,
    ) as window_generator:
        now = mlrun.utils.datetime_now()
        min_last_analyzed = window_generator.get_min_last_analyzed()

        # Different application names, or last analyzed never updated while
        # there are applications to monitor
        if application_names and (
            application_names != window_generator.get_application_list()
            or not min_last_analyzed
        ):
            return True

        # Will a NOP event be sent to close the relevant window?
        if self._should_send_nop_event(base_period_minutes, min_last_analyzed, now):
            last_request_ts = int(endpoint.status.last_request.timestamp())
            if (
                last_request_ts != last_timestamp_sent
                or min_last_analyzed != last_analyzed_sent
            ):
                # Record the (last_request, min_last_analyzed) pair this event is pushed for
                schedules_file.update_endpoint_timestamps(
                    endpoint_uid=endpoint.metadata.uid,
                    last_request=last_request_ts,
                    last_analyzed=min_last_analyzed,
                )
                return True

        logger.info(
            "All the possible intervals were already analyzed, didn't push regular event",
            endpoint_id=endpoint.metadata.uid,
            last_analyzed=min_last_analyzed,
            last_request=endpoint.status.last_request,
        )
    return False
345
427
 
428
@staticmethod
def _should_send_nop_event(
    base_period_minutes: int,
    min_last_analyzed: int,
    current_time: datetime.datetime,
):
    """
    Tell whether the NOP-event condition holds for an endpoint.

    :param base_period_minutes: The monitoring base period, in minutes.
    :param min_last_analyzed:   The minimal last-analyzed time as a timestamp in
                                seconds. A falsy value is treated as "never analyzed".
    :param current_time:        The time to compare against.

    :return: ``True`` when ``min_last_analyzed`` is falsy, or when at least one
             base period plus the configured parquet batching timeout has
             elapsed since ``min_last_analyzed``.
    """
    if not min_last_analyzed:
        return True

    elapsed_seconds = current_time.timestamp() - min_last_analyzed
    required_seconds = (
        datetime.timedelta(minutes=base_period_minutes).total_seconds()
        + mlrun.mlconf.model_endpoint_monitoring.parquet_batching_timeout_secs
    )
    return elapsed_seconds >= required_seconds
442
+
443
def run(self, event: nuclio_sdk.Event) -> None:
    """
    Controller worker entry point: handle a single controller-stream event.

    The event body is decoded from UTF-8 JSON and handed to
    :py:meth:`model_endpoint_process`, which:

    1. Reads the applications from the event (endpoint policy)
    2. Checks the model monitoring windows
    3. Sends data to the applications
    4. Pushes a NOP event to the main stream if needed

    A body that cannot be decoded is logged and dropped.

    :param event: The Nuclio event that triggered the worker.
    """
    logger.info("Start running monitoring controller worker")
    try:
        payload = json.loads(event.body.decode("utf-8"))
    except Exception as decode_error:
        logger.error(
            "Failed to decode event",
            exc=err_to_str(decode_error),
        )
    else:
        # Process the single endpoint described by the event
        self.model_endpoint_process(event=payload)
381
464
 
382
- @classmethod
383
465
def model_endpoint_process(
    self,
    event: Optional[dict] = None,
) -> None:
    """
    Process a single model endpoint event and trigger its monitoring applications.

    For every application listed in the event's endpoint policy, the analysis
    intervals are taken from a ``_BatchWindowGenerator``. An interval is pushed
    to the application stream only when it contains data: predictions in the
    TSDB for non-batch endpoints, or rows in the monitoring feature set for
    batch endpoints. After all applications were handled, a NOP event is pushed
    to the main model monitoring stream when the NOP condition holds and the
    incoming event is not itself a NOP event.

    Any exception is logged and swallowed, so a failure on one event does not
    propagate to the caller.

    :param event: (dict) Event that triggered the monitoring process.
    """
    logger.info("Model endpoint process started", event=event)

    try:
        project_name = event[ControllerEvent.PROJECT]
        endpoint_id = event[ControllerEvent.ENDPOINT_ID]
        endpoint_name = event[ControllerEvent.ENDPOINT_NAME]
        endpoint_policy = event[ControllerEvent.ENDPOINT_POLICY]
        applications_names = endpoint_policy[
            ControllerEventEndpointPolicy.MONITORING_APPLICATIONS
        ]

        not_batch_endpoint = (
            event[ControllerEvent.ENDPOINT_TYPE] != EndpointType.BATCH_EP
        )

        # Keep the analyzed timestamp in its own name so the start and finish
        # log records always refer to the same event
        analyzed_timestamp = event[ControllerEvent.TIMESTAMP]
        logger.info("Starting analyzing for", timestamp=analyzed_timestamp)
        last_stream_timestamp = datetime.datetime.fromisoformat(analyzed_timestamp)
        first_request = datetime.datetime.fromisoformat(
            event[ControllerEvent.FIRST_REQUEST]
        )
        with _BatchWindowGenerator(
            project=project_name,
            endpoint_id=endpoint_id,
            window_length=self._window_length,
        ) as batch_window_generator:
            for application in applications_names:
                for (
                    start_infer_time,
                    end_infer_time,
                ) in batch_window_generator.get_intervals(
                    application=application,
                    not_batch_endpoint=not_batch_endpoint,
                    first_request=first_request,
                    last_request=last_stream_timestamp,
                ):
                    if not_batch_endpoint:
                        # Serving endpoint - get the relevant window data from the TSDB
                        prediction_metric = self.tsdb_connector.read_predictions(
                            start=start_infer_time,
                            end=end_infer_time,
                            endpoint_id=endpoint_id,
                        )
                        data_in_window = bool(prediction_metric.data)
                    else:
                        # Feature sets are cached per worker; the cache size is
                        # bounded by _MAX_FEATURE_SET_PER_WORKER
                        if endpoint_id not in self.feature_sets:
                            self.feature_sets[endpoint_id] = fstore.get_feature_set(
                                event[ControllerEvent.FEATURE_SET_URI]
                            )
                            self.feature_sets.move_to_end(endpoint_id, last=False)
                            if (
                                len(self.feature_sets)
                                > self._MAX_FEATURE_SET_PER_WORKER
                            ):
                                self.feature_sets.popitem(last=True)
                        m_fs = self.feature_sets.get(endpoint_id)

                        # Batch endpoint - get the relevant window data from the parquet target
                        df = m_fs.to_dataframe(
                            start_time=start_infer_time,
                            end_time=end_infer_time,
                            time_column=mm_constants.EventFieldType.TIMESTAMP,
                            storage_options=self.storage_options,
                        )
                        data_in_window = len(df) > 0

                    if not data_in_window:
                        logger.info(
                            "No data found for the given interval",
                            start=start_infer_time,
                            end=end_infer_time,
                            endpoint_id=endpoint_id,
                        )
                    else:
                        logger.info(
                            "Data found for the given interval",
                            start=start_infer_time,
                            end=end_infer_time,
                            endpoint_id=endpoint_id,
                        )
                        self._push_to_applications(
                            start_infer_time=start_infer_time,
                            end_infer_time=end_infer_time,
                            endpoint_id=endpoint_id,
                            endpoint_name=endpoint_name,
                            project=project_name,
                            applications_names=[application],
                            model_monitoring_access_key=self.model_monitoring_access_key,
                            endpoint_updated=endpoint_policy[
                                ControllerEventEndpointPolicy.ENDPOINT_UPDATED
                            ],
                        )

            base_period = endpoint_policy[ControllerEventEndpointPolicy.BASE_PERIOD]
            current_time = mlrun.utils.datetime_now()
            if (
                self._should_send_nop_event(
                    base_period,
                    batch_window_generator.get_min_last_analyzed(),
                    current_time,
                )
                and event[ControllerEvent.KIND] != ControllerEventKind.NOP_EVENT
            ):
                # Built under a separate name: rebinding `event` here would make
                # the finish log and the exception handler report the NOP event
                # instead of the event that is being processed
                nop_event = {
                    ControllerEvent.KIND: mm_constants.ControllerEventKind.NOP_EVENT,
                    ControllerEvent.PROJECT: project_name,
                    ControllerEvent.ENDPOINT_ID: endpoint_id,
                    ControllerEvent.ENDPOINT_NAME: endpoint_name,
                    ControllerEvent.TIMESTAMP: current_time.isoformat(
                        timespec="microseconds"
                    ),
                    ControllerEvent.ENDPOINT_POLICY: endpoint_policy,
                    ControllerEvent.ENDPOINT_TYPE: event[ControllerEvent.ENDPOINT_TYPE],
                    ControllerEvent.FEATURE_SET_URI: event[
                        ControllerEvent.FEATURE_SET_URI
                    ],
                    ControllerEvent.FIRST_REQUEST: event[
                        ControllerEvent.FIRST_REQUEST
                    ],
                }
                self._push_to_main_stream(
                    event=nop_event,
                    endpoint_id=endpoint_id,
                )
        logger.info("Finish analyze for", timestamp=analyzed_timestamp)

    except Exception:
        # The handler must not raise by itself: `event` may be None (its default)
        # or may lack the endpoint id, which can be the very reason we got here
        failed_endpoint_id = (
            event.get(ControllerEvent.ENDPOINT_ID) if isinstance(event, dict) else None
        )
        logger.exception(
            "Encountered an exception",
            endpoint_id=failed_endpoint_id,
        )
455
617
 
456
- @staticmethod
457
618
  def _push_to_applications(
619
+ self,
458
620
  start_infer_time: datetime.datetime,
459
621
  end_infer_time: datetime.datetime,
460
622
  endpoint_id: str,
623
+ endpoint_name: str,
461
624
  project: str,
462
625
  applications_names: list[str],
463
626
  model_monitoring_access_key: str,
627
+ endpoint_updated: str,
464
628
  ):
465
629
  """
466
630
  Pushes data to multiple stream applications.
@@ -471,7 +635,7 @@ class MonitoringApplicationController:
471
635
  :param project: mlrun Project name.
472
636
  :param applications_names: List of application names to which data will be pushed.
473
637
  :param model_monitoring_access_key: Access key to apply the model monitoring process.
474
-
638
+ :param endpoint_updated: ISO-format string of the timestamp at which the model endpoint was last updated.
475
639
  """
476
640
  data = {
477
641
  mm_constants.ApplicationEvent.START_INFER_TIME: start_infer_time.isoformat(
@@ -481,28 +645,250 @@ class MonitoringApplicationController:
481
645
  sep=" ", timespec="microseconds"
482
646
  ),
483
647
  mm_constants.ApplicationEvent.ENDPOINT_ID: endpoint_id,
484
- mm_constants.ApplicationEvent.OUTPUT_STREAM_URI: get_stream_path(
485
- project=project,
486
- function_name=mm_constants.MonitoringFunctionNames.WRITER,
487
- ),
648
+ mm_constants.ApplicationEvent.ENDPOINT_NAME: endpoint_name,
649
+ mm_constants.ApplicationEvent.ENDPOINT_UPDATED: endpoint_updated,
488
650
  }
489
651
  for app_name in applications_names:
490
652
  data.update({mm_constants.ApplicationEvent.APPLICATION_NAME: app_name})
491
- stream_uri = get_stream_path(project=project, function_name=app_name)
653
+ if app_name not in self.applications_streams:
654
+ self.applications_streams[app_name] = (
655
+ mlrun.model_monitoring.helpers.get_output_stream(
656
+ project=project,
657
+ function_name=app_name,
658
+ v3io_access_key=model_monitoring_access_key,
659
+ )
660
+ )
661
+ app_stream = self.applications_streams.get(app_name)
492
662
 
493
663
  logger.info(
494
- f"push endpoint_id {endpoint_id} to {app_name} by stream :{stream_uri}"
664
+ "Pushing data to application stream",
665
+ endpoint_id=endpoint_id,
666
+ app_name=app_name,
667
+ app_stream_type=str(type(app_stream)),
668
+ )
669
+ app_stream.push([data], partition_key=endpoint_id)
670
+
671
def push_regular_event_to_controller_stream(self) -> None:
    """
    Controller chief entry point: push regular events to the controller stream.

    Lists the project's model endpoints and monitoring functions, builds the
    endpoint policy (application names and base period in minutes), refreshes
    each endpoint's last request from the TSDB and submits
    :py:meth:`endpoint_to_regular_event` for every endpoint to a thread pool.
    A failure of a single endpoint is logged with its traceback and does not
    stop the others.

    Nothing is pushed when the project has no model endpoints or no monitoring
    functions.
    """
    logger.info("Starting monitoring controller chief")
    endpoints = self.project_obj.list_model_endpoints(tsdb_metrics=False).endpoints
    if not endpoints:
        logger.info("No model endpoints found", project=self.project)
        return

    monitoring_functions = self.project_obj.list_model_monitoring_functions()
    # TODO (ML-7700): take only applications whose function is in "ready" state.
    # The default histogram data drift application needs a workaround, as its
    # `status.state` is `None`.
    applications_names = list(
        {app.metadata.name for app in monitoring_functions or []}
    )
    if not applications_names:
        logger.info("No monitoring functions found", project=self.project)
        return

    # Query the TSDB only after the early exits above, so no query is issued
    # when there is nothing to push
    last_request_dict = self.tsdb_connector.get_last_request(
        endpoint_ids=[mep.metadata.uid for mep in endpoints]
    )
    if isinstance(last_request_dict, pd.DataFrame):
        last_request_dict = last_request_dict.set_index(
            mm_constants.EventFieldType.ENDPOINT_ID
        )[mm_constants.ModelEndpointSchema.LAST_REQUEST].to_dict()

    policy = {
        ControllerEventEndpointPolicy.MONITORING_APPLICATIONS: applications_names,
        ControllerEventEndpointPolicy.BASE_PERIOD: int(
            batch_dict2timedelta(
                json.loads(
                    cast(
                        str,
                        os.getenv(mm_constants.EventFieldType.BATCH_INTERVALS_DICT),
                    )
                )
            ).total_seconds()
            // _SECONDS_IN_MINUTE
        ),
    }
    applications_set = set(applications_names)

    with concurrent.futures.ThreadPoolExecutor(
        max_workers=min(len(endpoints), 10)
    ) as pool:
        with schedules.ModelMonitoringSchedulesFileChief(
            self.project
        ) as schedule_file:
            # Collect every submitted future; rebinding the mapping per endpoint
            # would drop all but one of them from the error reporting below
            futures = {}
            for endpoint in endpoints:
                last_request = last_request_dict.get(endpoint.metadata.uid)
                if isinstance(last_request, float):
                    last_request = pd.to_datetime(last_request, unit="s", utc=True)
                endpoint.status.last_request = (
                    last_request or endpoint.status.last_request
                )
                future = pool.submit(
                    self.endpoint_to_regular_event,
                    endpoint,
                    # Each task gets its own policy dict, so per-endpoint fields
                    # added by the task are never shared between threads
                    dict(policy),
                    applications_set,
                    schedule_file,
                )
                futures[future] = endpoint

            # Wait for all the tasks while the schedules file is still open
            for future in concurrent.futures.as_completed(futures):
                exception = future.exception()
                if exception is None:
                    continue
                error = (
                    f"Failed to push event. Endpoint name: {futures[future].metadata.name}, "
                    f"endpoint uid: {futures[future].metadata.uid}, traceback:\n"
                )
                error += "".join(
                    traceback.format_exception(
                        None, exception, exception.__traceback__
                    )
                )
                logger.error(error)
    logger.info("Finishing monitoring controller chief")
761
+
762
def endpoint_to_regular_event(
    self,
    endpoint: mlrun.common.schemas.ModelEndpoint,
    policy: dict,
    applications_names: set,
    schedule_file: schedules.ModelMonitoringSchedulesFileChief,
) -> None:
    """
    Push a regular event to the controller stream for the endpoint, if needed.

    The event is pushed only when :py:meth:`_should_monitor_endpoint` approves
    the endpoint. The pushed endpoint policy is ``policy`` extended with the
    time at which the endpoint was last updated.

    :param endpoint:           The model endpoint to build the event for.
    :param policy:             The monitoring policy shared by the endpoints.
                               It is not modified by this method.
    :param applications_names: Names of the monitoring applications to run.
    :param schedule_file:      The chief schedules file of the project.
    """
    if not self._should_monitor_endpoint(
        endpoint,
        set(applications_names),
        policy.get(ControllerEventEndpointPolicy.BASE_PERIOD, 10),
        schedule_file,
    ):
        return

    timestamp = endpoint.status.last_request.isoformat(
        sep=" ", timespec="microseconds"
    )
    first_request = endpoint.status.first_request.isoformat(
        sep=" ", timespec="microseconds"
    )
    logger.debug(
        "Endpoint data is being prepared for regular event",
        endpoint_id=endpoint.metadata.uid,
        endpoint_name=endpoint.metadata.name,
        timestamp=timestamp,
        first_request=first_request,
        endpoint_type=endpoint.metadata.endpoint_type,
        feature_set_uri=endpoint.spec.monitoring_feature_set_uri,
        endpoint_policy=json.dumps(policy),
    )
    # Build a per-endpoint policy instead of writing into `policy`: the caller
    # may share that dict between endpoints handled on other threads, and an
    # in-place update could leak one endpoint's update time into another's event
    endpoint_policy = {
        **policy,
        ControllerEventEndpointPolicy.ENDPOINT_UPDATED: endpoint.metadata.updated.isoformat(),
    }
    self.push_to_controller_stream(
        kind=mm_constants.ControllerEventKind.REGULAR_EVENT,
        project=endpoint.metadata.project,
        endpoint_id=endpoint.metadata.uid,
        endpoint_name=endpoint.metadata.name,
        timestamp=timestamp,
        first_request=first_request,
        endpoint_type=endpoint.metadata.endpoint_type.value,
        feature_set_uri=endpoint.spec.monitoring_feature_set_uri,
        endpoint_policy=endpoint_policy,
    )
499
807
 
808
def push_to_controller_stream(
    self,
    kind: str,
    project: str,
    endpoint_id: str,
    endpoint_name: str,
    timestamp: str,
    first_request: str,
    endpoint_type: int,
    feature_set_uri: str,
    endpoint_policy: dict[str, Any],
) -> None:
    """
    Build a controller event and push it to the controller stream.

    The event is partitioned by the endpoint id.

    :param kind:            str event kind
    :param project:         project name
    :param endpoint_id:     endpoint id string
    :param endpoint_name:   the endpoint name string
    :param timestamp:       the event timestamp str isoformat utc timezone
    :param first_request:   the first request str isoformat utc timezone
    :param endpoint_type:   Enum of the endpoint type
    :param feature_set_uri: the feature set uri string
    :param endpoint_policy: dictionary hold the monitoring policy
    """
    fields = (
        (ControllerEvent.KIND, kind),
        (ControllerEvent.PROJECT, project),
        (ControllerEvent.ENDPOINT_ID, endpoint_id),
        (ControllerEvent.ENDPOINT_NAME, endpoint_name),
        (ControllerEvent.TIMESTAMP, timestamp),
        (ControllerEvent.FIRST_REQUEST, first_request),
        (ControllerEvent.ENDPOINT_TYPE, endpoint_type),
        (ControllerEvent.FEATURE_SET_URI, feature_set_uri),
        (ControllerEvent.ENDPOINT_POLICY, endpoint_policy),
    )
    event = {field.value: content for field, content in fields}

    stream = self.controller_stream
    logger.info(
        "Pushing data to controller stream",
        event=event,
        endpoint_id=endpoint_id,
        controller_stream_type=str(type(stream)),
    )
    stream.push([event], partition_key=endpoint_id)
850
+
851
def _push_to_main_stream(self, event: dict, endpoint_id: str) -> None:
    """
    Push the given event to the model monitoring stream, partitioned by endpoint id.

    :param event:       event dictionary to push to stream
    :param endpoint_id: endpoint id string
    """
    serialized_event = json.dumps(event)
    main_stream = self.model_monitoring_stream
    logger.info(
        "Pushing data to main stream, NOP event is been generated",
        event=serialized_event,
        endpoint_id=endpoint_id,
        mm_stream_type=str(type(main_stream)),
    )
    main_stream.push([event], partition_key=endpoint_id)
864
+
500
865
 
501
- def handler(context: nuclio.Context, event: nuclio.Event) -> None:
866
def handler(context: nuclio_sdk.Context, event: nuclio_sdk.Event) -> None:
    """
    Nuclio entry point of the model monitoring controller.

    Dispatches by the kind of the trigger that delivered the event:
    cron trigger kinds run the controller chief, stream trigger kinds run the
    controller worker on the event.

    :param context: the Nuclio context
    :param event:   trigger event

    :raise mlrun.errors.MLRunInvalidArgumentError: for any other trigger kind.
    """
    trigger = event.trigger
    trigger_kind = trigger.kind
    logger.info(
        "Controller got event",
        trigger=trigger,
        trigger_kind=trigger_kind,
    )

    if trigger_kind in mm_constants.CRON_TRIGGER_KINDS:
        # Runs controller chief:
        context.user_data.monitor_app_controller.push_regular_event_to_controller_stream()
        return

    if trigger_kind in mm_constants.STREAM_TRIGGER_KINDS:
        # Runs controller worker:
        context.user_data.monitor_app_controller.run(event)
        return

    raise mlrun.errors.MLRunInvalidArgumentError(
        "Wrong trigger kind for model monitoring controller"
    )
889
+
890
+
891
def init_context(context):
    """
    Create the monitoring application controller and store it on the context.

    The controller is kept as ``context.user_data.monitor_app_controller``.

    :param context: the Nuclio context
    """
    context.user_data.monitor_app_controller = MonitoringApplicationController()
    context.logger.info("Monitoring application controller initialized")