mlrun 1.6.4rc8__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mlrun might be problematic. Click here for more details.

Files changed (305) hide show
  1. mlrun/__init__.py +11 -1
  2. mlrun/__main__.py +40 -122
  3. mlrun/alerts/__init__.py +15 -0
  4. mlrun/alerts/alert.py +248 -0
  5. mlrun/api/schemas/__init__.py +5 -4
  6. mlrun/artifacts/__init__.py +8 -3
  7. mlrun/artifacts/base.py +47 -257
  8. mlrun/artifacts/dataset.py +11 -192
  9. mlrun/artifacts/manager.py +79 -47
  10. mlrun/artifacts/model.py +31 -159
  11. mlrun/artifacts/plots.py +23 -380
  12. mlrun/common/constants.py +74 -1
  13. mlrun/common/db/sql_session.py +5 -5
  14. mlrun/common/formatters/__init__.py +21 -0
  15. mlrun/common/formatters/artifact.py +45 -0
  16. mlrun/common/formatters/base.py +113 -0
  17. mlrun/common/formatters/feature_set.py +33 -0
  18. mlrun/common/formatters/function.py +46 -0
  19. mlrun/common/formatters/pipeline.py +53 -0
  20. mlrun/common/formatters/project.py +51 -0
  21. mlrun/common/formatters/run.py +29 -0
  22. mlrun/common/helpers.py +12 -3
  23. mlrun/common/model_monitoring/helpers.py +9 -5
  24. mlrun/{runtimes → common/runtimes}/constants.py +37 -9
  25. mlrun/common/schemas/__init__.py +31 -5
  26. mlrun/common/schemas/alert.py +202 -0
  27. mlrun/common/schemas/api_gateway.py +196 -0
  28. mlrun/common/schemas/artifact.py +25 -4
  29. mlrun/common/schemas/auth.py +16 -5
  30. mlrun/common/schemas/background_task.py +1 -1
  31. mlrun/common/schemas/client_spec.py +4 -2
  32. mlrun/common/schemas/common.py +7 -4
  33. mlrun/common/schemas/constants.py +3 -0
  34. mlrun/common/schemas/feature_store.py +74 -44
  35. mlrun/common/schemas/frontend_spec.py +15 -7
  36. mlrun/common/schemas/function.py +12 -1
  37. mlrun/common/schemas/hub.py +11 -18
  38. mlrun/common/schemas/memory_reports.py +2 -2
  39. mlrun/common/schemas/model_monitoring/__init__.py +20 -4
  40. mlrun/common/schemas/model_monitoring/constants.py +123 -42
  41. mlrun/common/schemas/model_monitoring/grafana.py +13 -9
  42. mlrun/common/schemas/model_monitoring/model_endpoints.py +101 -54
  43. mlrun/common/schemas/notification.py +71 -14
  44. mlrun/common/schemas/object.py +2 -2
  45. mlrun/{model_monitoring/controller_handler.py → common/schemas/pagination.py} +9 -12
  46. mlrun/common/schemas/pipeline.py +8 -1
  47. mlrun/common/schemas/project.py +69 -18
  48. mlrun/common/schemas/runs.py +7 -1
  49. mlrun/common/schemas/runtime_resource.py +8 -12
  50. mlrun/common/schemas/schedule.py +4 -4
  51. mlrun/common/schemas/tag.py +1 -2
  52. mlrun/common/schemas/workflow.py +12 -4
  53. mlrun/common/types.py +14 -1
  54. mlrun/config.py +154 -69
  55. mlrun/data_types/data_types.py +6 -1
  56. mlrun/data_types/spark.py +2 -2
  57. mlrun/data_types/to_pandas.py +67 -37
  58. mlrun/datastore/__init__.py +6 -8
  59. mlrun/datastore/alibaba_oss.py +131 -0
  60. mlrun/datastore/azure_blob.py +143 -42
  61. mlrun/datastore/base.py +102 -58
  62. mlrun/datastore/datastore.py +34 -13
  63. mlrun/datastore/datastore_profile.py +146 -20
  64. mlrun/datastore/dbfs_store.py +3 -7
  65. mlrun/datastore/filestore.py +1 -4
  66. mlrun/datastore/google_cloud_storage.py +97 -33
  67. mlrun/datastore/hdfs.py +56 -0
  68. mlrun/datastore/inmem.py +6 -3
  69. mlrun/datastore/redis.py +7 -2
  70. mlrun/datastore/s3.py +34 -12
  71. mlrun/datastore/snowflake_utils.py +45 -0
  72. mlrun/datastore/sources.py +303 -111
  73. mlrun/datastore/spark_utils.py +31 -2
  74. mlrun/datastore/store_resources.py +9 -7
  75. mlrun/datastore/storeytargets.py +151 -0
  76. mlrun/datastore/targets.py +453 -176
  77. mlrun/datastore/utils.py +72 -58
  78. mlrun/datastore/v3io.py +6 -1
  79. mlrun/db/base.py +274 -41
  80. mlrun/db/factory.py +1 -1
  81. mlrun/db/httpdb.py +893 -225
  82. mlrun/db/nopdb.py +291 -33
  83. mlrun/errors.py +36 -6
  84. mlrun/execution.py +115 -42
  85. mlrun/feature_store/__init__.py +0 -2
  86. mlrun/feature_store/api.py +65 -73
  87. mlrun/feature_store/common.py +7 -12
  88. mlrun/feature_store/feature_set.py +76 -55
  89. mlrun/feature_store/feature_vector.py +39 -31
  90. mlrun/feature_store/ingestion.py +7 -6
  91. mlrun/feature_store/retrieval/base.py +16 -11
  92. mlrun/feature_store/retrieval/dask_merger.py +2 -0
  93. mlrun/feature_store/retrieval/job.py +13 -4
  94. mlrun/feature_store/retrieval/local_merger.py +2 -0
  95. mlrun/feature_store/retrieval/spark_merger.py +24 -32
  96. mlrun/feature_store/steps.py +45 -34
  97. mlrun/features.py +11 -21
  98. mlrun/frameworks/_common/artifacts_library.py +9 -9
  99. mlrun/frameworks/_common/mlrun_interface.py +5 -5
  100. mlrun/frameworks/_common/model_handler.py +48 -48
  101. mlrun/frameworks/_common/plan.py +5 -6
  102. mlrun/frameworks/_common/producer.py +3 -4
  103. mlrun/frameworks/_common/utils.py +5 -5
  104. mlrun/frameworks/_dl_common/loggers/logger.py +6 -7
  105. mlrun/frameworks/_dl_common/loggers/mlrun_logger.py +9 -9
  106. mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +23 -47
  107. mlrun/frameworks/_ml_common/artifacts_library.py +1 -2
  108. mlrun/frameworks/_ml_common/loggers/logger.py +3 -4
  109. mlrun/frameworks/_ml_common/loggers/mlrun_logger.py +4 -5
  110. mlrun/frameworks/_ml_common/model_handler.py +24 -24
  111. mlrun/frameworks/_ml_common/pkl_model_server.py +2 -2
  112. mlrun/frameworks/_ml_common/plan.py +2 -2
  113. mlrun/frameworks/_ml_common/plans/calibration_curve_plan.py +2 -3
  114. mlrun/frameworks/_ml_common/plans/confusion_matrix_plan.py +2 -3
  115. mlrun/frameworks/_ml_common/plans/dataset_plan.py +3 -3
  116. mlrun/frameworks/_ml_common/plans/feature_importance_plan.py +3 -3
  117. mlrun/frameworks/_ml_common/plans/roc_curve_plan.py +4 -4
  118. mlrun/frameworks/_ml_common/utils.py +4 -4
  119. mlrun/frameworks/auto_mlrun/auto_mlrun.py +9 -9
  120. mlrun/frameworks/huggingface/model_server.py +4 -4
  121. mlrun/frameworks/lgbm/__init__.py +33 -33
  122. mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
  123. mlrun/frameworks/lgbm/callbacks/logging_callback.py +4 -5
  124. mlrun/frameworks/lgbm/callbacks/mlrun_logging_callback.py +4 -5
  125. mlrun/frameworks/lgbm/mlrun_interfaces/booster_mlrun_interface.py +1 -3
  126. mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +6 -6
  127. mlrun/frameworks/lgbm/model_handler.py +10 -10
  128. mlrun/frameworks/lgbm/model_server.py +6 -6
  129. mlrun/frameworks/lgbm/utils.py +5 -5
  130. mlrun/frameworks/onnx/dataset.py +8 -8
  131. mlrun/frameworks/onnx/mlrun_interface.py +3 -3
  132. mlrun/frameworks/onnx/model_handler.py +6 -6
  133. mlrun/frameworks/onnx/model_server.py +7 -7
  134. mlrun/frameworks/parallel_coordinates.py +6 -6
  135. mlrun/frameworks/pytorch/__init__.py +18 -18
  136. mlrun/frameworks/pytorch/callbacks/callback.py +4 -5
  137. mlrun/frameworks/pytorch/callbacks/logging_callback.py +17 -17
  138. mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +11 -11
  139. mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +23 -29
  140. mlrun/frameworks/pytorch/callbacks_handler.py +38 -38
  141. mlrun/frameworks/pytorch/mlrun_interface.py +20 -20
  142. mlrun/frameworks/pytorch/model_handler.py +17 -17
  143. mlrun/frameworks/pytorch/model_server.py +7 -7
  144. mlrun/frameworks/sklearn/__init__.py +13 -13
  145. mlrun/frameworks/sklearn/estimator.py +4 -4
  146. mlrun/frameworks/sklearn/metrics_library.py +14 -14
  147. mlrun/frameworks/sklearn/mlrun_interface.py +16 -9
  148. mlrun/frameworks/sklearn/model_handler.py +2 -2
  149. mlrun/frameworks/tf_keras/__init__.py +10 -7
  150. mlrun/frameworks/tf_keras/callbacks/logging_callback.py +15 -15
  151. mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +11 -11
  152. mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +19 -23
  153. mlrun/frameworks/tf_keras/mlrun_interface.py +9 -11
  154. mlrun/frameworks/tf_keras/model_handler.py +14 -14
  155. mlrun/frameworks/tf_keras/model_server.py +6 -6
  156. mlrun/frameworks/xgboost/__init__.py +13 -13
  157. mlrun/frameworks/xgboost/model_handler.py +6 -6
  158. mlrun/k8s_utils.py +61 -17
  159. mlrun/launcher/__init__.py +1 -1
  160. mlrun/launcher/base.py +16 -15
  161. mlrun/launcher/client.py +13 -11
  162. mlrun/launcher/factory.py +1 -1
  163. mlrun/launcher/local.py +23 -13
  164. mlrun/launcher/remote.py +17 -10
  165. mlrun/lists.py +7 -6
  166. mlrun/model.py +478 -103
  167. mlrun/model_monitoring/__init__.py +1 -1
  168. mlrun/model_monitoring/api.py +163 -371
  169. mlrun/{runtimes/mpijob/v1alpha1.py → model_monitoring/applications/__init__.py} +9 -15
  170. mlrun/model_monitoring/applications/_application_steps.py +188 -0
  171. mlrun/model_monitoring/applications/base.py +108 -0
  172. mlrun/model_monitoring/applications/context.py +341 -0
  173. mlrun/model_monitoring/{evidently_application.py → applications/evidently_base.py} +27 -22
  174. mlrun/model_monitoring/applications/histogram_data_drift.py +354 -0
  175. mlrun/model_monitoring/applications/results.py +99 -0
  176. mlrun/model_monitoring/controller.py +131 -278
  177. mlrun/model_monitoring/db/__init__.py +18 -0
  178. mlrun/model_monitoring/db/stores/__init__.py +136 -0
  179. mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
  180. mlrun/model_monitoring/db/stores/base/store.py +213 -0
  181. mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
  182. mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +71 -0
  183. mlrun/model_monitoring/db/stores/sqldb/models/base.py +190 -0
  184. mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +103 -0
  185. mlrun/model_monitoring/{stores/models/mysql.py → db/stores/sqldb/models/sqlite.py} +19 -13
  186. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +659 -0
  187. mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
  188. mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +726 -0
  189. mlrun/model_monitoring/db/tsdb/__init__.py +105 -0
  190. mlrun/model_monitoring/db/tsdb/base.py +448 -0
  191. mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
  192. mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
  193. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +279 -0
  194. mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +42 -0
  195. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +507 -0
  196. mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
  197. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +158 -0
  198. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +849 -0
  199. mlrun/model_monitoring/features_drift_table.py +134 -106
  200. mlrun/model_monitoring/helpers.py +199 -55
  201. mlrun/model_monitoring/metrics/__init__.py +13 -0
  202. mlrun/model_monitoring/metrics/histogram_distance.py +127 -0
  203. mlrun/model_monitoring/model_endpoint.py +3 -2
  204. mlrun/model_monitoring/stream_processing.py +134 -398
  205. mlrun/model_monitoring/tracking_policy.py +9 -2
  206. mlrun/model_monitoring/writer.py +161 -125
  207. mlrun/package/__init__.py +6 -6
  208. mlrun/package/context_handler.py +5 -5
  209. mlrun/package/packager.py +7 -7
  210. mlrun/package/packagers/default_packager.py +8 -8
  211. mlrun/package/packagers/numpy_packagers.py +15 -15
  212. mlrun/package/packagers/pandas_packagers.py +5 -5
  213. mlrun/package/packagers/python_standard_library_packagers.py +10 -10
  214. mlrun/package/packagers_manager.py +19 -23
  215. mlrun/package/utils/_formatter.py +6 -6
  216. mlrun/package/utils/_pickler.py +2 -2
  217. mlrun/package/utils/_supported_format.py +4 -4
  218. mlrun/package/utils/log_hint_utils.py +2 -2
  219. mlrun/package/utils/type_hint_utils.py +4 -9
  220. mlrun/platforms/__init__.py +11 -10
  221. mlrun/platforms/iguazio.py +24 -203
  222. mlrun/projects/operations.py +52 -25
  223. mlrun/projects/pipelines.py +191 -197
  224. mlrun/projects/project.py +1227 -400
  225. mlrun/render.py +16 -19
  226. mlrun/run.py +209 -184
  227. mlrun/runtimes/__init__.py +83 -15
  228. mlrun/runtimes/base.py +51 -35
  229. mlrun/runtimes/daskjob.py +17 -10
  230. mlrun/runtimes/databricks_job/databricks_cancel_task.py +1 -1
  231. mlrun/runtimes/databricks_job/databricks_runtime.py +8 -7
  232. mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
  233. mlrun/runtimes/funcdoc.py +1 -29
  234. mlrun/runtimes/function_reference.py +1 -1
  235. mlrun/runtimes/kubejob.py +34 -128
  236. mlrun/runtimes/local.py +40 -11
  237. mlrun/runtimes/mpijob/__init__.py +0 -20
  238. mlrun/runtimes/mpijob/abstract.py +9 -10
  239. mlrun/runtimes/mpijob/v1.py +1 -1
  240. mlrun/{model_monitoring/stores/models/sqlite.py → runtimes/nuclio/__init__.py} +7 -9
  241. mlrun/runtimes/nuclio/api_gateway.py +769 -0
  242. mlrun/runtimes/nuclio/application/__init__.py +15 -0
  243. mlrun/runtimes/nuclio/application/application.py +758 -0
  244. mlrun/runtimes/nuclio/application/reverse_proxy.go +95 -0
  245. mlrun/runtimes/{function.py → nuclio/function.py} +200 -83
  246. mlrun/runtimes/{nuclio.py → nuclio/nuclio.py} +6 -6
  247. mlrun/runtimes/{serving.py → nuclio/serving.py} +65 -68
  248. mlrun/runtimes/pod.py +281 -101
  249. mlrun/runtimes/remotesparkjob.py +12 -9
  250. mlrun/runtimes/sparkjob/spark3job.py +67 -51
  251. mlrun/runtimes/utils.py +41 -75
  252. mlrun/secrets.py +9 -5
  253. mlrun/serving/__init__.py +8 -1
  254. mlrun/serving/remote.py +2 -7
  255. mlrun/serving/routers.py +85 -69
  256. mlrun/serving/server.py +69 -44
  257. mlrun/serving/states.py +209 -36
  258. mlrun/serving/utils.py +22 -14
  259. mlrun/serving/v1_serving.py +6 -7
  260. mlrun/serving/v2_serving.py +133 -54
  261. mlrun/track/tracker.py +2 -1
  262. mlrun/track/tracker_manager.py +3 -3
  263. mlrun/track/trackers/mlflow_tracker.py +6 -2
  264. mlrun/utils/async_http.py +6 -8
  265. mlrun/utils/azure_vault.py +1 -1
  266. mlrun/utils/clones.py +1 -2
  267. mlrun/utils/condition_evaluator.py +3 -3
  268. mlrun/utils/db.py +21 -3
  269. mlrun/utils/helpers.py +405 -225
  270. mlrun/utils/http.py +3 -6
  271. mlrun/utils/logger.py +112 -16
  272. mlrun/utils/notifications/notification/__init__.py +17 -13
  273. mlrun/utils/notifications/notification/base.py +50 -2
  274. mlrun/utils/notifications/notification/console.py +2 -0
  275. mlrun/utils/notifications/notification/git.py +24 -1
  276. mlrun/utils/notifications/notification/ipython.py +3 -1
  277. mlrun/utils/notifications/notification/slack.py +96 -21
  278. mlrun/utils/notifications/notification/webhook.py +59 -2
  279. mlrun/utils/notifications/notification_pusher.py +149 -30
  280. mlrun/utils/regex.py +9 -0
  281. mlrun/utils/retryer.py +208 -0
  282. mlrun/utils/singleton.py +1 -1
  283. mlrun/utils/v3io_clients.py +4 -6
  284. mlrun/utils/version/version.json +2 -2
  285. mlrun/utils/version/version.py +2 -6
  286. mlrun-1.7.0.dist-info/METADATA +378 -0
  287. mlrun-1.7.0.dist-info/RECORD +351 -0
  288. {mlrun-1.6.4rc8.dist-info → mlrun-1.7.0.dist-info}/WHEEL +1 -1
  289. mlrun/feature_store/retrieval/conversion.py +0 -273
  290. mlrun/kfpops.py +0 -868
  291. mlrun/model_monitoring/application.py +0 -310
  292. mlrun/model_monitoring/batch.py +0 -1095
  293. mlrun/model_monitoring/prometheus.py +0 -219
  294. mlrun/model_monitoring/stores/__init__.py +0 -111
  295. mlrun/model_monitoring/stores/kv_model_endpoint_store.py +0 -576
  296. mlrun/model_monitoring/stores/model_endpoint_store.py +0 -147
  297. mlrun/model_monitoring/stores/models/__init__.py +0 -27
  298. mlrun/model_monitoring/stores/models/base.py +0 -84
  299. mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -384
  300. mlrun/platforms/other.py +0 -306
  301. mlrun-1.6.4rc8.dist-info/METADATA +0 -272
  302. mlrun-1.6.4rc8.dist-info/RECORD +0 -314
  303. {mlrun-1.6.4rc8.dist-info → mlrun-1.7.0.dist-info}/LICENSE +0 -0
  304. {mlrun-1.6.4rc8.dist-info → mlrun-1.7.0.dist-info}/entry_points.txt +0 -0
  305. {mlrun-1.6.4rc8.dist-info → mlrun-1.7.0.dist-info}/top_level.txt +0 -0
@@ -11,32 +11,30 @@
11
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
-
15
14
  import concurrent.futures
16
15
  import datetime
17
16
  import json
18
17
  import os
19
18
  import re
20
- from typing import Any, Iterator, NamedTuple, Optional, Union, cast
19
+ from collections.abc import Iterator
20
+ from typing import NamedTuple, Optional, Union, cast
21
21
 
22
- from v3io.dataplane.response import HttpResponseError
22
+ import nuclio
23
23
 
24
24
  import mlrun
25
25
  import mlrun.common.schemas.model_monitoring.constants as mm_constants
26
26
  import mlrun.data_types.infer
27
27
  import mlrun.feature_store as fstore
28
- from mlrun.common.model_monitoring.helpers import FeatureStats, pad_features_hist
28
+ import mlrun.model_monitoring.db.stores
29
+ from mlrun.config import config as mlconf
29
30
  from mlrun.datastore import get_stream_pusher
30
- from mlrun.datastore.targets import ParquetTarget
31
- from mlrun.model_monitoring.batch import calculate_inputs_statistics
31
+ from mlrun.errors import err_to_str
32
32
  from mlrun.model_monitoring.helpers import (
33
33
  _BatchDict,
34
34
  batch_dict2timedelta,
35
- get_monitoring_parquet_path,
36
35
  get_stream_path,
37
36
  )
38
- from mlrun.utils import create_logger, datetime_now, logger
39
- from mlrun.utils.v3io_clients import get_v3io_client
37
+ from mlrun.utils import datetime_now, logger
40
38
 
41
39
 
42
40
  class _Interval(NamedTuple):
@@ -45,8 +43,6 @@ class _Interval(NamedTuple):
45
43
 
46
44
 
47
45
  class _BatchWindow:
48
- V3IO_CONTAINER_FORMAT = "users/pipelines/{project}/monitoring-schedules/functions"
49
-
50
46
  def __init__(
51
47
  self,
52
48
  project: str,
@@ -62,27 +58,22 @@ class _BatchWindow:
62
58
  All the time values are in seconds.
63
59
  The start and stop time are in seconds since the epoch.
64
60
  """
61
+ self.project = project
65
62
  self._endpoint = endpoint
66
63
  self._application = application
67
64
  self._first_request = first_request
68
- self._kv_storage = get_v3io_client(
69
- endpoint=mlrun.mlconf.v3io_api,
70
- # Avoid noisy warning logs before the KV table is created
71
- logger=create_logger(name="v3io_client", level="error"),
72
- ).kv
73
- self._v3io_container = self.V3IO_CONTAINER_FORMAT.format(project=project)
74
65
  self._stop = last_updated
75
66
  self._step = timedelta_seconds
67
+ self._db = mlrun.model_monitoring.get_store_object(project=self.project)
76
68
  self._start = self._get_last_analyzed()
77
69
 
78
70
  def _get_last_analyzed(self) -> Optional[int]:
79
71
  try:
80
- data = self._kv_storage.get(
81
- container=self._v3io_container,
82
- table_path=self._endpoint,
83
- key=self._application,
72
+ last_analyzed = self._db.get_last_analyzed(
73
+ endpoint_id=self._endpoint,
74
+ application_name=self._application,
84
75
  )
85
- except HttpResponseError as err:
76
+ except mlrun.errors.MLRunNotFoundError:
86
77
  logger.info(
87
78
  "No last analyzed time was found for this endpoint and "
88
79
  "application, as this is probably the first time this "
@@ -93,7 +84,7 @@ class _BatchWindow:
93
84
  first_request=self._first_request,
94
85
  last_updated=self._stop,
95
86
  )
96
- logger.debug("Error while getting last analyzed time", err=err)
87
+
97
88
  if self._first_request and self._stop:
98
89
  # TODO : Change the timedelta according to the policy.
99
90
  first_period_in_seconds = max(
@@ -105,7 +96,6 @@ class _BatchWindow:
105
96
  )
106
97
  return self._first_request
107
98
 
108
- last_analyzed = data.output.item[mm_constants.SchedulingKeys.LAST_ANALYZED]
109
99
  logger.info(
110
100
  "Got the last analyzed time for this endpoint and application",
111
101
  endpoint=self._endpoint,
@@ -121,11 +111,11 @@ class _BatchWindow:
121
111
  application=self._application,
122
112
  last_analyzed=last_analyzed,
123
113
  )
124
- self._kv_storage.put(
125
- container=self._v3io_container,
126
- table_path=self._endpoint,
127
- key=self._application,
128
- attributes={mm_constants.SchedulingKeys.LAST_ANALYZED: last_analyzed},
114
+
115
+ self._db.update_last_analyzed(
116
+ endpoint_id=self._endpoint,
117
+ application_name=self._application,
118
+ last_analyzed=last_analyzed,
129
119
  )
130
120
 
131
121
  def get_intervals(
@@ -224,7 +214,7 @@ class _BatchWindowGenerator:
224
214
  # If the endpoint does not have a stream, `last_updated` should be
225
215
  # the minimum between the current time and the last updated time.
226
216
  # This compensates for the bumping mechanism - see
227
- # `bump_model_endpoint_last_request`.
217
+ # `update_model_endpoint_last_request`.
228
218
  last_updated = min(int(datetime_now().timestamp()), last_updated)
229
219
  logger.debug(
230
220
  "The endpoint does not have a stream", last_updated=last_updated
@@ -279,44 +269,26 @@ class MonitoringApplicationController:
279
269
  Note that the MonitoringApplicationController object requires access keys along with valid project configurations.
280
270
  """
281
271
 
282
- def __init__(
283
- self,
284
- context: mlrun.run.MLClientCtx,
285
- project: str,
286
- ):
287
- """
288
- Initialize Monitoring Application Processor object.
272
+ def __init__(self) -> None:
273
+ """Initialize Monitoring Application Controller"""
274
+ self.project = cast(str, mlrun.mlconf.default_project)
275
+ self.project_obj = mlrun.load_project(name=self.project, url=self.project)
289
276
 
290
- :param context: An MLRun context.
291
- :param project: Project name.
292
- """
293
- self.context = context
294
- self.project = project
295
- self.project_obj = mlrun.get_or_create_project(project)
296
-
297
- context.logger.debug(f"Initializing {self.__class__.__name__}", project=project)
277
+ logger.debug(f"Initializing {self.__class__.__name__}", project=self.project)
298
278
 
299
- self.db = mlrun.model_monitoring.get_model_endpoint_store(project=project)
279
+ self.db = mlrun.model_monitoring.get_store_object(project=self.project)
300
280
 
301
281
  self._batch_window_generator = _BatchWindowGenerator(
302
- batch_dict=context.parameters[
303
- mm_constants.EventFieldType.BATCH_INTERVALS_DICT
304
- ]
282
+ batch_dict=json.loads(
283
+ mlrun.get_secret_or_env(
284
+ mm_constants.EventFieldType.BATCH_INTERVALS_DICT
285
+ )
286
+ )
305
287
  )
306
288
 
307
- # If provided, only model endpoints in that that list will be analyzed
308
- self.model_endpoints = context.parameters.get(
309
- mm_constants.EventFieldType.MODEL_ENDPOINTS, None
310
- )
311
289
  self.model_monitoring_access_key = self._get_model_monitoring_access_key()
312
- self.parquet_directory = get_monitoring_parquet_path(
313
- self.project_obj,
314
- kind=mm_constants.FileTargetKind.APPS_PARQUET,
315
- )
316
290
  self.storage_options = None
317
- if not mlrun.mlconf.is_ce_mode():
318
- self._initialize_v3io_configurations()
319
- elif self.parquet_directory.startswith("s3://"):
291
+ if mlconf.artifact_path.startswith("s3://"):
320
292
  self.storage_options = mlrun.mlconf.get_s3_storage_options()
321
293
 
322
294
  @staticmethod
@@ -327,39 +299,60 @@ class MonitoringApplicationController:
327
299
  access_key = mlrun.mlconf.get_v3io_access_key()
328
300
  return access_key
329
301
 
330
- def _initialize_v3io_configurations(self) -> None:
331
- self.v3io_framesd = mlrun.mlconf.v3io_framesd
332
- self.v3io_api = mlrun.mlconf.v3io_api
333
- self.storage_options = dict(
334
- v3io_access_key=self.model_monitoring_access_key, v3io_api=self.v3io_api
335
- )
336
-
337
- def run(self):
302
+ def run(self) -> None:
338
303
  """
339
- Main method for run all the relevant monitoring applications on each endpoint
304
+ Main method for run all the relevant monitoring applications on each endpoint.
305
+ This method handles the following:
306
+ 1. List model endpoints
307
+ 2. List applications
308
+ 3. Check model monitoring windows
309
+ 4. Send data to applications
310
+ 5. Delete old parquets
340
311
  """
312
+ logger.info("Start running monitoring controller")
341
313
  try:
342
- endpoints = self.db.list_model_endpoints(uids=self.model_endpoints)
314
+ applications_names = []
315
+ endpoints = self.db.list_model_endpoints(include_stats=True)
316
+ if not endpoints:
317
+ logger.info("No model endpoints found", project=self.project)
318
+ return
343
319
  monitoring_functions = self.project_obj.list_model_monitoring_functions()
344
320
  if monitoring_functions:
345
321
  applications_names = list(
346
322
  {app.metadata.name for app in monitoring_functions}
347
323
  )
348
- else:
349
- self.context.logger.info(
350
- "No monitoring functions found", project=self.project
351
- )
352
- applications_names = []
324
+ # if monitoring_functions: - TODO : ML-7700
325
+ # Gets only application in ready state
326
+ # applications_names = list(
327
+ # {
328
+ # app.metadata.name
329
+ # for app in monitoring_functions
330
+ # if (
331
+ # app.status.state == "ready"
332
+ # # workaround for the default app, as its `status.state` is `None`
333
+ # or app.metadata.name
334
+ # == mm_constants.HistogramDataDriftApplicationConstants.NAME
335
+ # )
336
+ # }
337
+ # )
338
+ if not applications_names:
339
+ logger.info("No monitoring functions found", project=self.project)
340
+ return
341
+ logger.info(
342
+ "Starting to iterate over the applications",
343
+ applications=applications_names,
344
+ )
353
345
 
354
346
  except Exception as e:
355
- self.context.logger.error("Failed to list endpoints", exc=e)
356
- return
357
- if endpoints and applications_names:
358
- # Initialize a process pool that will be used to run each endpoint applications on a dedicated process
359
- pool = concurrent.futures.ProcessPoolExecutor(
360
- max_workers=min(len(endpoints), 10),
347
+ logger.error(
348
+ "Failed to list endpoints and monitoring applications",
349
+ exc=err_to_str(e),
361
350
  )
362
- futures = []
351
+ return
352
+ # Initialize a process pool that will be used to run each endpoint applications on a dedicated process
353
+ with concurrent.futures.ThreadPoolExecutor(
354
+ max_workers=min(len(endpoints), 10),
355
+ ) as pool:
363
356
  for endpoint in endpoints:
364
357
  if (
365
358
  endpoint[mm_constants.EventFieldType.ACTIVE]
@@ -373,27 +366,18 @@ class MonitoringApplicationController:
373
366
  ):
374
367
  # Router endpoint has no feature stats
375
368
  logger.info(
376
- f"{endpoint[mm_constants.EventFieldType.UID]} is router skipping"
369
+ f"{endpoint[mm_constants.EventFieldType.UID]} is router, skipping"
377
370
  )
378
371
  continue
379
- future = pool.submit(
372
+ pool.submit(
380
373
  MonitoringApplicationController.model_endpoint_process,
381
374
  endpoint=endpoint,
382
375
  applications_names=applications_names,
383
376
  batch_window_generator=self._batch_window_generator,
384
377
  project=self.project,
385
- parquet_directory=self.parquet_directory,
386
- storage_options=self.storage_options,
387
378
  model_monitoring_access_key=self.model_monitoring_access_key,
379
+ storage_options=self.storage_options,
388
380
  )
389
- futures.append(future)
390
-
391
- for future in concurrent.futures.as_completed(futures):
392
- result = future.result()
393
- if result:
394
- self.context.log_results(result)
395
-
396
- self._delete_old_parquet(endpoints=endpoints)
397
381
 
398
382
  @classmethod
399
383
  def model_endpoint_process(
@@ -402,10 +386,9 @@ class MonitoringApplicationController:
402
386
  applications_names: list[str],
403
387
  batch_window_generator: _BatchWindowGenerator,
404
388
  project: str,
405
- parquet_directory: str,
406
- storage_options: dict,
407
389
  model_monitoring_access_key: str,
408
- ) -> Optional[dict[str, list[str]]]:
390
+ storage_options: Optional[dict] = None,
391
+ ) -> None:
409
392
  """
410
393
  Process a model endpoint and trigger the monitoring applications. This function running on different process
411
394
  for each endpoint. In addition, this function will generate a parquet file that includes the relevant data
@@ -415,18 +398,15 @@ class MonitoringApplicationController:
415
398
  :param applications_names: (list[str]) List of application names to push results to.
416
399
  :param batch_window_generator: (_BatchWindowGenerator) An object that generates _BatchWindow objects.
417
400
  :param project: (str) Project name.
418
- :param parquet_directory: (str) Directory to store application parquet files
419
- :param storage_options: (dict) Storage options for writing ParquetTarget.
420
401
  :param model_monitoring_access_key: (str) Access key to apply the model monitoring process.
421
-
402
+ :param storage_options: (dict) Storage options for reading the infer parquet files.
422
403
  """
423
404
  endpoint_id = endpoint[mm_constants.EventFieldType.UID]
424
- start_times: set[datetime.datetime] = set()
405
+ has_stream = endpoint[mm_constants.EventFieldType.STREAM_PATH] != ""
406
+ m_fs = fstore.get_feature_set(
407
+ endpoint[mm_constants.EventFieldType.FEATURE_SET_URI]
408
+ )
425
409
  try:
426
- m_fs = fstore.get_feature_set(
427
- endpoint[mm_constants.EventFieldType.FEATURE_SET_URI]
428
- )
429
-
430
410
  for application in applications_names:
431
411
  batch_window = batch_window_generator.get_batch_window(
432
412
  project=project,
@@ -434,171 +414,81 @@ class MonitoringApplicationController:
434
414
  application=application,
435
415
  first_request=endpoint[mm_constants.EventFieldType.FIRST_REQUEST],
436
416
  last_request=endpoint[mm_constants.EventFieldType.LAST_REQUEST],
437
- has_stream=endpoint[mm_constants.EventFieldType.STREAM_PATH] != "",
417
+ has_stream=has_stream,
438
418
  )
439
419
 
440
420
  for start_infer_time, end_infer_time in batch_window.get_intervals():
441
- try:
442
- # Get application sample data
443
- offline_response = cls._get_sample_df(
444
- feature_set=m_fs,
421
+ df = m_fs.to_dataframe(
422
+ start_time=start_infer_time,
423
+ end_time=end_infer_time,
424
+ time_column=mm_constants.EventFieldType.TIMESTAMP,
425
+ storage_options=storage_options,
426
+ )
427
+ if len(df) == 0:
428
+ logger.info(
429
+ "No data found for the given interval",
430
+ start=start_infer_time,
431
+ end=end_infer_time,
432
+ endpoint_id=endpoint_id,
433
+ )
434
+ else:
435
+ logger.info(
436
+ "Data found for the given interval",
437
+ start=start_infer_time,
438
+ end=end_infer_time,
445
439
  endpoint_id=endpoint_id,
440
+ )
441
+ cls._push_to_applications(
446
442
  start_infer_time=start_infer_time,
447
443
  end_infer_time=end_infer_time,
448
- parquet_directory=parquet_directory,
449
- storage_options=storage_options,
450
- application_name=application,
451
- )
452
-
453
- df = offline_response.to_dataframe()
454
- parquet_target_path = offline_response.vector.get_target_path()
455
-
456
- if len(df) == 0:
457
- logger.info(
458
- "During this time window, the endpoint has not received any data",
459
- endpoint=endpoint[mm_constants.EventFieldType.UID],
460
- start_time=start_infer_time,
461
- end_time=end_infer_time,
462
- )
463
- continue
464
-
465
- except FileNotFoundError:
466
- logger.warn(
467
- "No parquets were written yet",
468
- endpoint=endpoint[mm_constants.EventFieldType.UID],
444
+ endpoint_id=endpoint_id,
445
+ project=project,
446
+ applications_names=[application],
447
+ model_monitoring_access_key=model_monitoring_access_key,
469
448
  )
470
- continue
471
-
472
- # Get the timestamp of the latest request:
473
- latest_request = df[mm_constants.EventFieldType.TIMESTAMP].iloc[-1]
474
-
475
- # Get the feature stats from the model endpoint for reference data
476
- feature_stats = json.loads(
477
- endpoint[mm_constants.EventFieldType.FEATURE_STATS]
478
- )
479
-
480
- # Pad the original feature stats to accommodate current
481
- # data out of the original range (unless already padded)
482
- pad_features_hist(FeatureStats(feature_stats))
483
449
 
484
- # Get the current stats:
485
- current_stats = calculate_inputs_statistics(
486
- sample_set_statistics=feature_stats,
487
- inputs=df,
488
- )
489
-
490
- cls._push_to_applications(
491
- current_stats=current_stats,
492
- feature_stats=feature_stats,
493
- start_infer_time=start_infer_time,
494
- end_infer_time=end_infer_time,
495
- endpoint_id=endpoint_id,
496
- latest_request=latest_request,
497
- project=project,
498
- applications_names=[application],
499
- model_monitoring_access_key=model_monitoring_access_key,
500
- parquet_target_path=parquet_target_path,
501
- )
502
- start_times.add(start_infer_time)
503
450
  except Exception:
504
451
  logger.exception(
505
452
  "Encountered an exception",
506
453
  endpoint_id=endpoint[mm_constants.EventFieldType.UID],
507
454
  )
508
455
 
509
- if start_times:
510
- return {endpoint_id: [str(t) for t in sorted(list(start_times))]}
511
-
512
- def _delete_old_parquet(self, endpoints: list[dict[str, Any]], days: int = 1):
513
- """
514
- Delete application parquets older than the argument days.
515
-
516
- :param endpoints: A list of dictionaries of model endpoints records.
517
- """
518
- if self.parquet_directory.startswith("v3io:///"):
519
- # create fs with access to the user side (under projects)
520
- store, _ = mlrun.store_manager.get_or_create_store(
521
- self.parquet_directory,
522
- {"V3IO_ACCESS_KEY": self.model_monitoring_access_key},
523
- )
524
- fs = store.filesystem
525
-
526
- # calculate time threshold (keep only files from the last 24 hours)
527
- time_to_keep = (
528
- datetime.datetime.now(tz=datetime.timezone.utc)
529
- - datetime.timedelta(days=days)
530
- ).timestamp()
531
-
532
- for endpoint in endpoints:
533
- try:
534
- apps_parquet_directories = fs.listdir(
535
- path=f"{self.parquet_directory}"
536
- f"/key={endpoint[mm_constants.EventFieldType.UID]}"
537
- )
538
- for directory in apps_parquet_directories:
539
- if directory["mtime"] < time_to_keep:
540
- # Delete files
541
- fs.rm(path=directory["name"], recursive=True)
542
- # Delete directory
543
- fs.rmdir(path=directory["name"])
544
- except FileNotFoundError:
545
- logger.info(
546
- "Application parquet directory is empty, "
547
- "probably parquets have not yet been created for this app",
548
- endpoint=endpoint[mm_constants.EventFieldType.UID],
549
- path=f"{self.parquet_directory}"
550
- f"/key={endpoint[mm_constants.EventFieldType.UID]}",
551
- )
552
-
553
456
  @staticmethod
554
457
  def _push_to_applications(
555
- current_stats,
556
- feature_stats,
557
- start_infer_time,
558
- end_infer_time,
559
- endpoint_id,
560
- latest_request,
561
- project,
562
- applications_names,
563
- model_monitoring_access_key,
564
- parquet_target_path,
458
+ start_infer_time: datetime.datetime,
459
+ end_infer_time: datetime.datetime,
460
+ endpoint_id: str,
461
+ project: str,
462
+ applications_names: list[str],
463
+ model_monitoring_access_key: str,
565
464
  ):
566
465
  """
567
466
  Pushes data to multiple stream applications.
568
467
 
569
- :param current_stats: Current statistics of input data.
570
- :param feature_stats: Statistics of train features.
571
- :param start_infer_time: The beginning of the infer interval window.
572
- :param end_infer_time: The end of the infer interval window.
573
- :param endpoint_id: Identifier for the model endpoint.
574
- :param latest_request: Timestamp of the latest model request.
575
- :param project: mlrun Project name.
576
- :param applications_names: List of application names to which data will be pushed.
468
+ :param start_infer_time: The beginning of the infer interval window.
469
+ :param end_infer_time: The end of the infer interval window.
470
+ :param endpoint_id: Identifier for the model endpoint.
471
+ :param project: mlrun Project name.
472
+ :param applications_names: List of application names to which data will be pushed.
473
+ :param model_monitoring_access_key: Access key to apply the model monitoring process.
577
474
 
578
475
  """
579
-
580
476
  data = {
581
- mm_constants.ApplicationEvent.CURRENT_STATS: json.dumps(current_stats),
582
- mm_constants.ApplicationEvent.FEATURE_STATS: json.dumps(feature_stats),
583
- mm_constants.ApplicationEvent.SAMPLE_PARQUET_PATH: parquet_target_path,
584
477
  mm_constants.ApplicationEvent.START_INFER_TIME: start_infer_time.isoformat(
585
478
  sep=" ", timespec="microseconds"
586
479
  ),
587
480
  mm_constants.ApplicationEvent.END_INFER_TIME: end_infer_time.isoformat(
588
481
  sep=" ", timespec="microseconds"
589
482
  ),
590
- mm_constants.ApplicationEvent.LAST_REQUEST: latest_request.isoformat(
591
- sep=" ", timespec="microseconds"
592
- ),
593
483
  mm_constants.ApplicationEvent.ENDPOINT_ID: endpoint_id,
594
484
  mm_constants.ApplicationEvent.OUTPUT_STREAM_URI: get_stream_path(
595
485
  project=project,
596
- application_name=mm_constants.MonitoringFunctionNames.WRITER,
486
+ function_name=mm_constants.MonitoringFunctionNames.WRITER,
597
487
  ),
598
488
  }
599
489
  for app_name in applications_names:
600
490
  data.update({mm_constants.ApplicationEvent.APPLICATION_NAME: app_name})
601
- stream_uri = get_stream_path(project=project, application_name=app_name)
491
+ stream_uri = get_stream_path(project=project, function_name=app_name)
602
492
 
603
493
  logger.info(
604
494
  f"push endpoint_id {endpoint_id} to {app_name} by stream :{stream_uri}"
@@ -607,49 +497,12 @@ class MonitoringApplicationController:
607
497
  [data]
608
498
  )
609
499
 
610
- @staticmethod
611
- def _get_sample_df(
612
- feature_set: mlrun.common.schemas.FeatureSet,
613
- endpoint_id: str,
614
- start_infer_time: datetime.datetime,
615
- end_infer_time: datetime.datetime,
616
- parquet_directory: str,
617
- storage_options: dict,
618
- application_name: str,
619
- ) -> mlrun.feature_store.OfflineVectorResponse:
620
- """
621
- Retrieves a sample DataFrame of the current input according to the provided infer interval window.
622
-
623
- :param feature_set: The main feature set.
624
- :param endpoint_id: Identifier for the model endpoint.
625
- :param start_infer_time: The beginning of the infer interval window.
626
- :param end_infer_time: The end of the infer interval window.
627
- :param parquet_directory: Directory where Parquet files are stored.
628
- :param storage_options: Storage options for accessing the data.
629
- :param application_name: Current application name.
630
500
 
631
- :return: OfflineVectorResponse that can be used for generating a sample DataFrame for the specified endpoint.
501
+ def handler(context: nuclio.Context, event: nuclio.Event) -> None:
502
+ """
503
+ Run model monitoring application processor
632
504
 
633
- """
634
- features = [f"{feature_set.metadata.name}.*"]
635
- vector = fstore.FeatureVector(
636
- name=f"{endpoint_id}_vector",
637
- features=features,
638
- with_indexes=True,
639
- )
640
- vector.metadata.tag = application_name
641
- vector.feature_set_objects = {feature_set.metadata.name: feature_set}
642
-
643
- # get offline features based on application start and end time.
644
- # store the result parquet by partitioning by controller end processing time
645
- offline_response = vector.get_offline_features(
646
- start_time=start_infer_time,
647
- end_time=end_infer_time,
648
- timestamp_for_filtering=mm_constants.EventFieldType.TIMESTAMP,
649
- target=ParquetTarget(
650
- path=parquet_directory
651
- + f"/key={endpoint_id}/{int(start_infer_time.timestamp())}/{application_name}.parquet",
652
- storage_options=storage_options,
653
- ),
654
- )
655
- return offline_response
505
+ :param context: the Nuclio context
506
+ :param event: trigger event
507
+ """
508
+ MonitoringApplicationController().run()
@@ -0,0 +1,18 @@
1
+ # Copyright 2024 Iguazio
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from .stores import ObjectStoreFactory, get_store_object
16
+ from .stores.base import StoreBase
17
+ from .tsdb import get_tsdb_connector
18
+ from .tsdb.base import TSDBConnector