mlrun 1.7.0rc4__py3-none-any.whl → 1.7.2__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to their public registry, and is provided for informational purposes only.

Potentially problematic release.


This version of mlrun might be problematic.

Files changed (235)
  1. mlrun/__init__.py +11 -1
  2. mlrun/__main__.py +39 -121
  3. mlrun/{datastore/helpers.py → alerts/__init__.py} +2 -5
  4. mlrun/alerts/alert.py +248 -0
  5. mlrun/api/schemas/__init__.py +4 -3
  6. mlrun/artifacts/__init__.py +8 -3
  7. mlrun/artifacts/base.py +39 -254
  8. mlrun/artifacts/dataset.py +9 -190
  9. mlrun/artifacts/manager.py +73 -46
  10. mlrun/artifacts/model.py +30 -158
  11. mlrun/artifacts/plots.py +23 -380
  12. mlrun/common/constants.py +73 -1
  13. mlrun/common/db/sql_session.py +3 -2
  14. mlrun/common/formatters/__init__.py +21 -0
  15. mlrun/common/formatters/artifact.py +46 -0
  16. mlrun/common/formatters/base.py +113 -0
  17. mlrun/common/formatters/feature_set.py +44 -0
  18. mlrun/common/formatters/function.py +46 -0
  19. mlrun/common/formatters/pipeline.py +53 -0
  20. mlrun/common/formatters/project.py +51 -0
  21. mlrun/common/formatters/run.py +29 -0
  22. mlrun/common/helpers.py +11 -1
  23. mlrun/{runtimes → common/runtimes}/constants.py +32 -4
  24. mlrun/common/schemas/__init__.py +31 -4
  25. mlrun/common/schemas/alert.py +202 -0
  26. mlrun/common/schemas/api_gateway.py +196 -0
  27. mlrun/common/schemas/artifact.py +28 -1
  28. mlrun/common/schemas/auth.py +13 -2
  29. mlrun/common/schemas/client_spec.py +2 -1
  30. mlrun/common/schemas/common.py +7 -4
  31. mlrun/common/schemas/constants.py +3 -0
  32. mlrun/common/schemas/feature_store.py +58 -28
  33. mlrun/common/schemas/frontend_spec.py +8 -0
  34. mlrun/common/schemas/function.py +11 -0
  35. mlrun/common/schemas/hub.py +7 -9
  36. mlrun/common/schemas/model_monitoring/__init__.py +21 -4
  37. mlrun/common/schemas/model_monitoring/constants.py +136 -42
  38. mlrun/common/schemas/model_monitoring/grafana.py +9 -5
  39. mlrun/common/schemas/model_monitoring/model_endpoints.py +89 -41
  40. mlrun/common/schemas/notification.py +69 -12
  41. mlrun/{runtimes/mpijob/v1alpha1.py → common/schemas/pagination.py} +10 -13
  42. mlrun/common/schemas/pipeline.py +7 -0
  43. mlrun/common/schemas/project.py +67 -16
  44. mlrun/common/schemas/runs.py +17 -0
  45. mlrun/common/schemas/schedule.py +1 -1
  46. mlrun/common/schemas/workflow.py +10 -2
  47. mlrun/common/types.py +14 -1
  48. mlrun/config.py +233 -58
  49. mlrun/data_types/data_types.py +11 -1
  50. mlrun/data_types/spark.py +5 -4
  51. mlrun/data_types/to_pandas.py +75 -34
  52. mlrun/datastore/__init__.py +8 -10
  53. mlrun/datastore/alibaba_oss.py +131 -0
  54. mlrun/datastore/azure_blob.py +131 -43
  55. mlrun/datastore/base.py +107 -47
  56. mlrun/datastore/datastore.py +17 -7
  57. mlrun/datastore/datastore_profile.py +91 -7
  58. mlrun/datastore/dbfs_store.py +3 -7
  59. mlrun/datastore/filestore.py +1 -3
  60. mlrun/datastore/google_cloud_storage.py +92 -32
  61. mlrun/datastore/hdfs.py +5 -0
  62. mlrun/datastore/inmem.py +6 -3
  63. mlrun/datastore/redis.py +3 -2
  64. mlrun/datastore/s3.py +30 -12
  65. mlrun/datastore/snowflake_utils.py +45 -0
  66. mlrun/datastore/sources.py +274 -59
  67. mlrun/datastore/spark_utils.py +30 -0
  68. mlrun/datastore/store_resources.py +9 -7
  69. mlrun/datastore/storeytargets.py +151 -0
  70. mlrun/datastore/targets.py +387 -119
  71. mlrun/datastore/utils.py +68 -5
  72. mlrun/datastore/v3io.py +28 -50
  73. mlrun/db/auth_utils.py +152 -0
  74. mlrun/db/base.py +245 -20
  75. mlrun/db/factory.py +1 -4
  76. mlrun/db/httpdb.py +909 -231
  77. mlrun/db/nopdb.py +279 -14
  78. mlrun/errors.py +35 -5
  79. mlrun/execution.py +111 -38
  80. mlrun/feature_store/__init__.py +0 -2
  81. mlrun/feature_store/api.py +46 -53
  82. mlrun/feature_store/common.py +6 -11
  83. mlrun/feature_store/feature_set.py +48 -23
  84. mlrun/feature_store/feature_vector.py +13 -2
  85. mlrun/feature_store/ingestion.py +7 -6
  86. mlrun/feature_store/retrieval/base.py +9 -4
  87. mlrun/feature_store/retrieval/dask_merger.py +2 -0
  88. mlrun/feature_store/retrieval/job.py +13 -4
  89. mlrun/feature_store/retrieval/local_merger.py +2 -0
  90. mlrun/feature_store/retrieval/spark_merger.py +24 -32
  91. mlrun/feature_store/steps.py +38 -19
  92. mlrun/features.py +6 -14
  93. mlrun/frameworks/_common/plan.py +3 -3
  94. mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +7 -12
  95. mlrun/frameworks/_ml_common/plan.py +1 -1
  96. mlrun/frameworks/auto_mlrun/auto_mlrun.py +2 -2
  97. mlrun/frameworks/lgbm/__init__.py +1 -1
  98. mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
  99. mlrun/frameworks/lgbm/model_handler.py +1 -1
  100. mlrun/frameworks/parallel_coordinates.py +4 -4
  101. mlrun/frameworks/pytorch/__init__.py +2 -2
  102. mlrun/frameworks/sklearn/__init__.py +1 -1
  103. mlrun/frameworks/sklearn/mlrun_interface.py +13 -3
  104. mlrun/frameworks/tf_keras/__init__.py +5 -2
  105. mlrun/frameworks/tf_keras/callbacks/logging_callback.py +1 -1
  106. mlrun/frameworks/tf_keras/mlrun_interface.py +2 -2
  107. mlrun/frameworks/xgboost/__init__.py +1 -1
  108. mlrun/k8s_utils.py +57 -12
  109. mlrun/launcher/__init__.py +1 -1
  110. mlrun/launcher/base.py +6 -5
  111. mlrun/launcher/client.py +13 -11
  112. mlrun/launcher/factory.py +1 -1
  113. mlrun/launcher/local.py +15 -5
  114. mlrun/launcher/remote.py +10 -3
  115. mlrun/lists.py +6 -2
  116. mlrun/model.py +297 -48
  117. mlrun/model_monitoring/__init__.py +1 -1
  118. mlrun/model_monitoring/api.py +152 -357
  119. mlrun/model_monitoring/applications/__init__.py +10 -0
  120. mlrun/model_monitoring/applications/_application_steps.py +190 -0
  121. mlrun/model_monitoring/applications/base.py +108 -0
  122. mlrun/model_monitoring/applications/context.py +341 -0
  123. mlrun/model_monitoring/{evidently_application.py → applications/evidently_base.py} +27 -22
  124. mlrun/model_monitoring/applications/histogram_data_drift.py +227 -91
  125. mlrun/model_monitoring/applications/results.py +99 -0
  126. mlrun/model_monitoring/controller.py +130 -303
  127. mlrun/model_monitoring/{stores/models/sqlite.py → db/__init__.py} +5 -10
  128. mlrun/model_monitoring/db/stores/__init__.py +136 -0
  129. mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
  130. mlrun/model_monitoring/db/stores/base/store.py +213 -0
  131. mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
  132. mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +71 -0
  133. mlrun/model_monitoring/db/stores/sqldb/models/base.py +190 -0
  134. mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +103 -0
  135. mlrun/model_monitoring/{stores/models/mysql.py → db/stores/sqldb/models/sqlite.py} +19 -13
  136. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +659 -0
  137. mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
  138. mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +726 -0
  139. mlrun/model_monitoring/db/tsdb/__init__.py +105 -0
  140. mlrun/model_monitoring/db/tsdb/base.py +448 -0
  141. mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
  142. mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
  143. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +298 -0
  144. mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +42 -0
  145. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +522 -0
  146. mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
  147. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +158 -0
  148. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +849 -0
  149. mlrun/model_monitoring/features_drift_table.py +34 -22
  150. mlrun/model_monitoring/helpers.py +177 -39
  151. mlrun/model_monitoring/model_endpoint.py +3 -2
  152. mlrun/model_monitoring/stream_processing.py +165 -398
  153. mlrun/model_monitoring/tracking_policy.py +7 -1
  154. mlrun/model_monitoring/writer.py +161 -125
  155. mlrun/package/packagers/default_packager.py +2 -2
  156. mlrun/package/packagers_manager.py +1 -0
  157. mlrun/package/utils/_formatter.py +2 -2
  158. mlrun/platforms/__init__.py +11 -10
  159. mlrun/platforms/iguazio.py +67 -228
  160. mlrun/projects/__init__.py +6 -1
  161. mlrun/projects/operations.py +47 -20
  162. mlrun/projects/pipelines.py +396 -249
  163. mlrun/projects/project.py +1176 -406
  164. mlrun/render.py +28 -22
  165. mlrun/run.py +208 -181
  166. mlrun/runtimes/__init__.py +76 -11
  167. mlrun/runtimes/base.py +54 -24
  168. mlrun/runtimes/daskjob.py +9 -2
  169. mlrun/runtimes/databricks_job/databricks_runtime.py +1 -0
  170. mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
  171. mlrun/runtimes/funcdoc.py +1 -29
  172. mlrun/runtimes/kubejob.py +34 -128
  173. mlrun/runtimes/local.py +39 -10
  174. mlrun/runtimes/mpijob/__init__.py +0 -20
  175. mlrun/runtimes/mpijob/abstract.py +8 -8
  176. mlrun/runtimes/mpijob/v1.py +1 -1
  177. mlrun/runtimes/nuclio/__init__.py +1 -0
  178. mlrun/runtimes/nuclio/api_gateway.py +769 -0
  179. mlrun/runtimes/nuclio/application/__init__.py +15 -0
  180. mlrun/runtimes/nuclio/application/application.py +758 -0
  181. mlrun/runtimes/nuclio/application/reverse_proxy.go +95 -0
  182. mlrun/runtimes/nuclio/function.py +188 -68
  183. mlrun/runtimes/nuclio/serving.py +57 -60
  184. mlrun/runtimes/pod.py +191 -58
  185. mlrun/runtimes/remotesparkjob.py +11 -8
  186. mlrun/runtimes/sparkjob/spark3job.py +17 -18
  187. mlrun/runtimes/utils.py +40 -73
  188. mlrun/secrets.py +6 -2
  189. mlrun/serving/__init__.py +8 -1
  190. mlrun/serving/remote.py +2 -3
  191. mlrun/serving/routers.py +89 -64
  192. mlrun/serving/server.py +54 -26
  193. mlrun/serving/states.py +187 -56
  194. mlrun/serving/utils.py +19 -11
  195. mlrun/serving/v2_serving.py +136 -63
  196. mlrun/track/tracker.py +2 -1
  197. mlrun/track/trackers/mlflow_tracker.py +5 -0
  198. mlrun/utils/async_http.py +26 -6
  199. mlrun/utils/db.py +18 -0
  200. mlrun/utils/helpers.py +375 -105
  201. mlrun/utils/http.py +2 -2
  202. mlrun/utils/logger.py +75 -9
  203. mlrun/utils/notifications/notification/__init__.py +14 -10
  204. mlrun/utils/notifications/notification/base.py +48 -0
  205. mlrun/utils/notifications/notification/console.py +2 -0
  206. mlrun/utils/notifications/notification/git.py +24 -1
  207. mlrun/utils/notifications/notification/ipython.py +2 -0
  208. mlrun/utils/notifications/notification/slack.py +96 -21
  209. mlrun/utils/notifications/notification/webhook.py +63 -2
  210. mlrun/utils/notifications/notification_pusher.py +146 -16
  211. mlrun/utils/regex.py +9 -0
  212. mlrun/utils/retryer.py +3 -2
  213. mlrun/utils/v3io_clients.py +2 -3
  214. mlrun/utils/version/version.json +2 -2
  215. mlrun-1.7.2.dist-info/METADATA +390 -0
  216. mlrun-1.7.2.dist-info/RECORD +351 -0
  217. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.2.dist-info}/WHEEL +1 -1
  218. mlrun/feature_store/retrieval/conversion.py +0 -271
  219. mlrun/kfpops.py +0 -868
  220. mlrun/model_monitoring/application.py +0 -310
  221. mlrun/model_monitoring/batch.py +0 -974
  222. mlrun/model_monitoring/controller_handler.py +0 -37
  223. mlrun/model_monitoring/prometheus.py +0 -216
  224. mlrun/model_monitoring/stores/__init__.py +0 -111
  225. mlrun/model_monitoring/stores/kv_model_endpoint_store.py +0 -574
  226. mlrun/model_monitoring/stores/model_endpoint_store.py +0 -145
  227. mlrun/model_monitoring/stores/models/__init__.py +0 -27
  228. mlrun/model_monitoring/stores/models/base.py +0 -84
  229. mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -382
  230. mlrun/platforms/other.py +0 -305
  231. mlrun-1.7.0rc4.dist-info/METADATA +0 -269
  232. mlrun-1.7.0rc4.dist-info/RECORD +0 -321
  233. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.2.dist-info}/LICENSE +0 -0
  234. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.2.dist-info}/entry_points.txt +0 -0
  235. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.2.dist-info}/top_level.txt +0 -0
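The most significant refactor in this release is the model monitoring persistence layer: the old mlrun/model_monitoring/stores package (items 224-229, removed) is replaced by the new mlrun/model_monitoring/db package, which separates record stores (db/stores, with SQL and V3IO KV backends) from time-series connectors (db/tsdb, with TDEngine and V3IO backends). The controller diff below resolves its backend through a factory call; a minimal sketch, assuming only the call that appears in that diff (the project name is a placeholder):

```python
# Minimal sketch of the new store-factory entry point used by the controller
# below; "project-example" is a placeholder, not a name from this diff.
import mlrun.model_monitoring

store = mlrun.model_monitoring.get_store_object(project="project-example")
```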
mlrun/model_monitoring/controller.py

@@ -11,35 +11,30 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 import concurrent.futures
 import datetime
 import json
 import os
 import re
 from collections.abc import Iterator
-from typing import Any, NamedTuple, Optional, Union, cast
+from typing import NamedTuple, Optional, Union, cast
 
 import nuclio
-from v3io.dataplane.response import HttpResponseError
 
 import mlrun
 import mlrun.common.schemas.model_monitoring.constants as mm_constants
 import mlrun.data_types.infer
 import mlrun.feature_store as fstore
-from mlrun.common.model_monitoring.helpers import FeatureStats, pad_features_hist
+import mlrun.model_monitoring.db.stores
+from mlrun.config import config as mlconf
 from mlrun.datastore import get_stream_pusher
-from mlrun.datastore.targets import ParquetTarget
 from mlrun.errors import err_to_str
-from mlrun.model_monitoring.batch import calculate_inputs_statistics
 from mlrun.model_monitoring.helpers import (
     _BatchDict,
     batch_dict2timedelta,
-    get_monitoring_parquet_path,
     get_stream_path,
 )
-from mlrun.utils import create_logger, datetime_now, logger
-from mlrun.utils.v3io_clients import get_v3io_client
+from mlrun.utils import datetime_now, logger
 
 
 class _Interval(NamedTuple):
@@ -48,8 +43,6 @@ class _Interval(NamedTuple):
 
 
 class _BatchWindow:
-    V3IO_CONTAINER_FORMAT = "users/pipelines/{project}/monitoring-schedules/functions"
-
     def __init__(
         self,
         project: str,
@@ -65,27 +58,22 @@ class _BatchWindow:
         All the time values are in seconds.
         The start and stop time are in seconds since the epoch.
         """
+        self.project = project
         self._endpoint = endpoint
         self._application = application
         self._first_request = first_request
-        self._kv_storage = get_v3io_client(
-            endpoint=mlrun.mlconf.v3io_api,
-            # Avoid noisy warning logs before the KV table is created
-            logger=create_logger(name="v3io_client", level="error"),
-        ).kv
-        self._v3io_container = self.V3IO_CONTAINER_FORMAT.format(project=project)
         self._stop = last_updated
         self._step = timedelta_seconds
+        self._db = mlrun.model_monitoring.get_store_object(project=self.project)
         self._start = self._get_last_analyzed()
 
     def _get_last_analyzed(self) -> Optional[int]:
         try:
-            data = self._kv_storage.get(
-                container=self._v3io_container,
-                table_path=self._endpoint,
-                key=self._application,
+            last_analyzed = self._db.get_last_analyzed(
+                endpoint_id=self._endpoint,
+                application_name=self._application,
             )
-        except HttpResponseError as err:
+        except mlrun.errors.MLRunNotFoundError:
             logger.info(
                 "No last analyzed time was found for this endpoint and "
                 "application, as this is probably the first time this "
@@ -96,7 +84,7 @@ class _BatchWindow:
                 first_request=self._first_request,
                 last_updated=self._stop,
             )
-            logger.debug("Error while getting last analyzed time", err=err)
+
            if self._first_request and self._stop:
                # TODO : Change the timedelta according to the policy.
                first_period_in_seconds = max(
@@ -108,7 +96,6 @@
                )
                return self._first_request
 
-        last_analyzed = data.output.item[mm_constants.SchedulingKeys.LAST_ANALYZED]
         logger.info(
             "Got the last analyzed time for this endpoint and application",
             endpoint=self._endpoint,
@@ -124,11 +111,11 @@
             application=self._application,
             last_analyzed=last_analyzed,
         )
-        self._kv_storage.put(
-            container=self._v3io_container,
-            table_path=self._endpoint,
-            key=self._application,
-            attributes={mm_constants.SchedulingKeys.LAST_ANALYZED: last_analyzed},
+
+        self._db.update_last_analyzed(
+            endpoint_id=self._endpoint,
+            application_name=self._application,
+            last_analyzed=last_analyzed,
         )
 
     def get_intervals(
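The hunks above replace direct V3IO KV reads and writes of the per-application last-analyzed marker with the backend-agnostic store interface. A minimal sketch of the round-trip, assuming only the method names and keyword arguments shown above (all string values are placeholders):

```python
# Sketch of the new last-analyzed bookkeeping, per the hunks above.
import mlrun
import mlrun.model_monitoring

store = mlrun.model_monitoring.get_store_object(project="project-example")
try:
    last_analyzed = store.get_last_analyzed(
        endpoint_id="endpoint-uid-example",
        application_name="app-example",
    )
except mlrun.errors.MLRunNotFoundError:
    # First run for this endpoint/application pair: no marker exists yet.
    last_analyzed = None

store.update_last_analyzed(
    endpoint_id="endpoint-uid-example",
    application_name="app-example",
    last_analyzed=1700000000,  # seconds since the epoch, per the class docstring
)
```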
@@ -227,7 +214,7 @@ class _BatchWindowGenerator:
             # If the endpoint does not have a stream, `last_updated` should be
             # the minimum between the current time and the last updated time.
             # This compensates for the bumping mechanism - see
-            # `bump_model_endpoint_last_request`.
+            # `update_model_endpoint_last_request`.
             last_updated = min(int(datetime_now().timestamp()), last_updated)
             logger.debug(
                 "The endpoint does not have a stream", last_updated=last_updated
@@ -282,26 +269,14 @@ class MonitoringApplicationController:
     Note that the MonitoringApplicationController object requires access keys along with valid project configurations.
     """
 
-    def __init__(
-        self,
-        mlrun_context: mlrun.run.MLClientCtx,
-        project: str,
-    ):
-        """
-        Initialize Monitoring Application Processor object.
+    def __init__(self) -> None:
+        """Initialize Monitoring Application Controller"""
+        self.project = cast(str, mlrun.mlconf.default_project)
+        self.project_obj = mlrun.load_project(name=self.project, url=self.project)
 
-        :param mlrun_context: An MLRun context.
-        :param project: Project name.
-        """
-        self.context = mlrun_context
-        self.project = project
-        self.project_obj = mlrun.get_or_create_project(project)
+        logger.debug(f"Initializing {self.__class__.__name__}", project=self.project)
 
-        mlrun_context.logger.debug(
-            f"Initializing {self.__class__.__name__}", project=project
-        )
-
-        self.db = mlrun.model_monitoring.get_model_endpoint_store(project=project)
+        self.db = mlrun.model_monitoring.get_store_object(project=self.project)
 
         self._batch_window_generator = _BatchWindowGenerator(
             batch_dict=json.loads(
@@ -312,14 +287,8 @@
         )
 
         self.model_monitoring_access_key = self._get_model_monitoring_access_key()
-        self.parquet_directory = get_monitoring_parquet_path(
-            self.project_obj,
-            kind=mm_constants.FileTargetKind.APPS_PARQUET,
-        )
         self.storage_options = None
-        if not mlrun.mlconf.is_ce_mode():
-            self._initialize_v3io_configurations()
-        elif self.parquet_directory.startswith("s3://"):
+        if mlconf.artifact_path.startswith("s3://"):
             self.storage_options = mlrun.mlconf.get_s3_storage_options()
 
     @staticmethod
@@ -330,89 +299,85 @@
         access_key = mlrun.mlconf.get_v3io_access_key()
         return access_key
 
-    def _initialize_v3io_configurations(self) -> None:
-        self.v3io_framesd = mlrun.mlconf.v3io_framesd
-        self.v3io_api = mlrun.mlconf.v3io_api
-        self.storage_options = dict(
-            v3io_access_key=self.model_monitoring_access_key, v3io_api=self.v3io_api
-        )
-
-    def run(self, event: nuclio.Event):
+    def run(self) -> None:
         """
-        Main method for run all the relevant monitoring applications on each endpoint
-
-        :param event: trigger event
+        Main method for run all the relevant monitoring applications on each endpoint.
+        This method handles the following:
+        1. List model endpoints
+        2. List applications
+        3. Check model monitoring windows
+        4. Send data to applications
+        5. Delete old parquets
         """
         logger.info("Start running monitoring controller")
         try:
             applications_names = []
-            endpoints = self.db.list_model_endpoints()
+            endpoints = self.db.list_model_endpoints(include_stats=True)
             if not endpoints:
-                self.context.logger.info(
-                    "No model endpoints found", project=self.project
-                )
+                logger.info("No model endpoints found", project=self.project)
                 return
             monitoring_functions = self.project_obj.list_model_monitoring_functions()
             if monitoring_functions:
-                # Gets only application in ready state
                 applications_names = list(
-                    {
-                        app.metadata.name
-                        for app in monitoring_functions
-                        if app.status.state == "ready"
-                    }
+                    {app.metadata.name for app in monitoring_functions}
                 )
+            # if monitoring_functions: - TODO : ML-7700
+            #     Gets only application in ready state
+            #     applications_names = list(
+            #         {
+            #             app.metadata.name
+            #             for app in monitoring_functions
+            #             if (
+            #                 app.status.state == "ready"
+            #                 # workaround for the default app, as its `status.state` is `None`
+            #                 or app.metadata.name
+            #                 == mm_constants.HistogramDataDriftApplicationConstants.NAME
+            #             )
+            #         }
+            #     )
             if not applications_names:
-                self.context.logger.info(
-                    "No monitoring functions found", project=self.project
-                )
+                logger.info("No monitoring functions found", project=self.project)
                 return
+            logger.info(
+                "Starting to iterate over the applications",
+                applications=applications_names,
+            )
 
         except Exception as e:
-            self.context.logger.error(
+            logger.error(
                 "Failed to list endpoints and monitoring applications",
                 exc=err_to_str(e),
            )
            return
         # Initialize a process pool that will be used to run each endpoint applications on a dedicated process
-        pool = concurrent.futures.ProcessPoolExecutor(
+        with concurrent.futures.ThreadPoolExecutor(
             max_workers=min(len(endpoints), 10),
-        )
-        futures = []
-        for endpoint in endpoints:
-            if (
-                endpoint[mm_constants.EventFieldType.ACTIVE]
-                and endpoint[mm_constants.EventFieldType.MONITORING_MODE]
-                == mm_constants.ModelMonitoringMode.enabled.value
-            ):
-                # Skip router endpoint:
+        ) as pool:
+            for endpoint in endpoints:
                 if (
-                    int(endpoint[mm_constants.EventFieldType.ENDPOINT_TYPE])
-                    == mm_constants.EndpointType.ROUTER
+                    endpoint[mm_constants.EventFieldType.ACTIVE]
+                    and endpoint[mm_constants.EventFieldType.MONITORING_MODE]
+                    == mm_constants.ModelMonitoringMode.enabled.value
                 ):
-                    # Router endpoint has no feature stats
-                    logger.info(
-                        f"{endpoint[mm_constants.EventFieldType.UID]} is router skipping"
+                    # Skip router endpoint:
+                    if (
+                        int(endpoint[mm_constants.EventFieldType.ENDPOINT_TYPE])
+                        == mm_constants.EndpointType.ROUTER
+                    ):
+                        # Router endpoint has no feature stats
+                        logger.info(
+                            f"{endpoint[mm_constants.EventFieldType.UID]} is router, skipping"
+                        )
+                        continue
+                    pool.submit(
+                        MonitoringApplicationController.model_endpoint_process,
+                        endpoint=endpoint,
+                        applications_names=applications_names,
+                        batch_window_generator=self._batch_window_generator,
+                        project=self.project,
+                        model_monitoring_access_key=self.model_monitoring_access_key,
+                        storage_options=self.storage_options,
                     )
-                    continue
-                future = pool.submit(
-                    MonitoringApplicationController.model_endpoint_process,
-                    endpoint=endpoint,
-                    applications_names=applications_names,
-                    batch_window_generator=self._batch_window_generator,
-                    project=self.project,
-                    parquet_directory=self.parquet_directory,
-                    storage_options=self.storage_options,
-                    model_monitoring_access_key=self.model_monitoring_access_key,
-                )
-                futures.append(future)
-
-        for future in concurrent.futures.as_completed(futures):
-            result = future.result()
-            if result:
-                self.context.log_results(result)
-
-        self._delete_old_parquet(endpoints=endpoints)
 
     @classmethod
     def model_endpoint_process(
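The run() rewrite above also swaps concurrent.futures.ProcessPoolExecutor plus manual futures bookkeeping for a ThreadPoolExecutor used as a context manager: on exit the context manager calls shutdown(wait=True), so the executor itself blocks until every submitted endpoint task finishes, and the explicit as_completed() loop and per-endpoint result logging disappear. A self-contained sketch of the pattern:

```python
# Generic sketch of the fan-out pattern adopted by the new run() method.
import concurrent.futures

def process_endpoint(uid: str) -> None:
    print(f"processing endpoint {uid}")

endpoint_uids = ["ep-1", "ep-2", "ep-3"]  # placeholder data
with concurrent.futures.ThreadPoolExecutor(
    max_workers=min(len(endpoint_uids), 10),
) as pool:
    for uid in endpoint_uids:
        pool.submit(process_endpoint, uid)
# Reaching this line means shutdown(wait=True) ran: all tasks have completed.
```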
@@ -421,10 +386,9 @@
         applications_names: list[str],
         batch_window_generator: _BatchWindowGenerator,
         project: str,
-        parquet_directory: str,
-        storage_options: dict,
         model_monitoring_access_key: str,
-    ) -> Optional[dict[str, list[str]]]:
+        storage_options: Optional[dict] = None,
+    ) -> None:
         """
         Process a model endpoint and trigger the monitoring applications. This function running on different process
         for each endpoint. In addition, this function will generate a parquet file that includes the relevant data
@@ -434,25 +398,15 @@
         :param applications_names: (list[str]) List of application names to push results to.
         :param batch_window_generator: (_BatchWindowGenerator) An object that generates _BatchWindow objects.
         :param project: (str) Project name.
-        :param parquet_directory: (str) Directory to store application parquet files
-        :param storage_options: (dict) Storage options for writing ParquetTarget.
         :param model_monitoring_access_key: (str) Access key to apply the model monitoring process.
-
+        :param storage_options: (dict) Storage options for reading the infer parquet files.
         """
         endpoint_id = endpoint[mm_constants.EventFieldType.UID]
-        start_times: set[datetime.datetime] = set()
+        has_stream = endpoint[mm_constants.EventFieldType.STREAM_PATH] != ""
+        m_fs = fstore.get_feature_set(
+            endpoint[mm_constants.EventFieldType.FEATURE_SET_URI]
+        )
         try:
-            m_fs = fstore.get_feature_set(
-                endpoint[mm_constants.EventFieldType.FEATURE_SET_URI]
-            )
-            labels = endpoint[mm_constants.EventFieldType.LABEL_NAMES]
-            if labels:
-                if isinstance(labels, str):
-                    labels = json.loads(labels)
-                for label in labels:
-                    if label not in list(m_fs.spec.features.keys()):
-                        m_fs.add_feature(fstore.Feature(name=label, value_type="float"))
-
             for application in applications_names:
                 batch_window = batch_window_generator.get_batch_window(
                     project=project,
@@ -460,162 +414,72 @@
                     application=application,
                     first_request=endpoint[mm_constants.EventFieldType.FIRST_REQUEST],
                     last_request=endpoint[mm_constants.EventFieldType.LAST_REQUEST],
-                    has_stream=endpoint[mm_constants.EventFieldType.STREAM_PATH] != "",
+                    has_stream=has_stream,
                 )
 
                 for start_infer_time, end_infer_time in batch_window.get_intervals():
-                    try:
-                        # Get application sample data
-                        offline_response = cls._get_sample_df(
-                            feature_set=m_fs,
+                    df = m_fs.to_dataframe(
+                        start_time=start_infer_time,
+                        end_time=end_infer_time,
+                        time_column=mm_constants.EventFieldType.TIMESTAMP,
+                        storage_options=storage_options,
+                    )
+                    if len(df) == 0:
+                        logger.info(
+                            "No data found for the given interval",
+                            start=start_infer_time,
+                            end=end_infer_time,
                             endpoint_id=endpoint_id,
+                        )
+                    else:
+                        logger.info(
+                            "Data found for the given interval",
+                            start=start_infer_time,
+                            end=end_infer_time,
+                            endpoint_id=endpoint_id,
+                        )
+                        cls._push_to_applications(
                             start_infer_time=start_infer_time,
                             end_infer_time=end_infer_time,
-                            parquet_directory=parquet_directory,
-                            storage_options=storage_options,
-                            application_name=application,
-                        )
-
-                        df = offline_response.to_dataframe()
-                        parquet_target_path = offline_response.vector.get_target_path()
-
-                        if len(df) == 0:
-                            logger.info(
-                                "During this time window, the endpoint has not received any data",
-                                endpoint=endpoint[mm_constants.EventFieldType.UID],
-                                start_time=start_infer_time,
-                                end_time=end_infer_time,
-                            )
-                            continue
-
-                    except FileNotFoundError:
-                        logger.warn(
-                            "No parquets were written yet",
-                            endpoint=endpoint[mm_constants.EventFieldType.UID],
+                            endpoint_id=endpoint_id,
+                            project=project,
+                            applications_names=[application],
+                            model_monitoring_access_key=model_monitoring_access_key,
                         )
-                        continue
-
-                    # Get the timestamp of the latest request:
-                    latest_request = df[mm_constants.EventFieldType.TIMESTAMP].iloc[-1]
-
-                    # Get the feature stats from the model endpoint for reference data
-                    feature_stats = json.loads(
-                        endpoint[mm_constants.EventFieldType.FEATURE_STATS]
-                    )
-
-                    # Pad the original feature stats to accommodate current
-                    # data out of the original range (unless already padded)
-                    pad_features_hist(FeatureStats(feature_stats))
 
-                    # Get the current stats:
-                    current_stats = calculate_inputs_statistics(
-                        sample_set_statistics=feature_stats,
-                        inputs=df,
-                    )
-
-                    cls._push_to_applications(
-                        current_stats=current_stats,
-                        feature_stats=feature_stats,
-                        start_infer_time=start_infer_time,
-                        end_infer_time=end_infer_time,
-                        endpoint_id=endpoint_id,
-                        latest_request=latest_request,
-                        project=project,
-                        applications_names=[application],
-                        model_monitoring_access_key=model_monitoring_access_key,
-                        parquet_target_path=parquet_target_path,
-                    )
-                    start_times.add(start_infer_time)
         except Exception:
             logger.exception(
                 "Encountered an exception",
                 endpoint_id=endpoint[mm_constants.EventFieldType.UID],
             )
 
-        if start_times:
-            return {endpoint_id: [str(t) for t in sorted(list(start_times))]}
-
-    def _delete_old_parquet(self, endpoints: list[dict[str, Any]], days: int = 1):
-        """
-        Delete application parquets older than the argument days.
-
-        :param endpoints: A list of dictionaries of model endpoints records.
-        """
-        if self.parquet_directory.startswith("v3io:///"):
-            # create fs with access to the user side (under projects)
-            store, _, _ = mlrun.store_manager.get_or_create_store(
-                self.parquet_directory,
-                {"V3IO_ACCESS_KEY": self.model_monitoring_access_key},
-            )
-            fs = store.filesystem
-
-            # calculate time threshold (keep only files from the last 24 hours)
-            time_to_keep = (
-                datetime.datetime.now(tz=datetime.timezone.utc)
-                - datetime.timedelta(days=days)
-            ).timestamp()
-
-            for endpoint in endpoints:
-                try:
-                    apps_parquet_directories = fs.listdir(
-                        path=f"{self.parquet_directory}"
-                        f"/key={endpoint[mm_constants.EventFieldType.UID]}"
-                    )
-                    for directory in apps_parquet_directories:
-                        if directory["mtime"] < time_to_keep:
-                            # Delete files
-                            fs.rm(path=directory["name"], recursive=True)
-                            # Delete directory
-                            fs.rmdir(path=directory["name"])
-                except FileNotFoundError:
-                    logger.info(
-                        "Application parquet directory is empty, "
-                        "probably parquets have not yet been created for this app",
-                        endpoint=endpoint[mm_constants.EventFieldType.UID],
-                        path=f"{self.parquet_directory}"
-                        f"/key={endpoint[mm_constants.EventFieldType.UID]}",
-                    )
-
     @staticmethod
     def _push_to_applications(
-        current_stats,
-        feature_stats,
-        start_infer_time,
-        end_infer_time,
-        endpoint_id,
-        latest_request,
-        project,
-        applications_names,
-        model_monitoring_access_key,
-        parquet_target_path,
+        start_infer_time: datetime.datetime,
+        end_infer_time: datetime.datetime,
+        endpoint_id: str,
+        project: str,
+        applications_names: list[str],
+        model_monitoring_access_key: str,
     ):
         """
         Pushes data to multiple stream applications.
 
-        :param current_stats: Current statistics of input data.
-        :param feature_stats: Statistics of train features.
-        :param start_infer_time: The beginning of the infer interval window.
-        :param end_infer_time: The end of the infer interval window.
-        :param endpoint_id: Identifier for the model endpoint.
-        :param latest_request: Timestamp of the latest model request.
-        :param project: mlrun Project name.
-        :param applications_names: List of application names to which data will be pushed.
+        :param start_infer_time: The beginning of the infer interval window.
+        :param end_infer_time: The end of the infer interval window.
+        :param endpoint_id: Identifier for the model endpoint.
+        :param project: mlrun Project name.
+        :param applications_names: List of application names to which data will be pushed.
+        :param model_monitoring_access_key: Access key to apply the model monitoring process.
 
         """
-
         data = {
-            mm_constants.ApplicationEvent.CURRENT_STATS: json.dumps(current_stats),
-            mm_constants.ApplicationEvent.FEATURE_STATS: json.dumps(feature_stats),
-            mm_constants.ApplicationEvent.SAMPLE_PARQUET_PATH: parquet_target_path,
             mm_constants.ApplicationEvent.START_INFER_TIME: start_infer_time.isoformat(
                 sep=" ", timespec="microseconds"
             ),
             mm_constants.ApplicationEvent.END_INFER_TIME: end_infer_time.isoformat(
                 sep=" ", timespec="microseconds"
             ),
-            mm_constants.ApplicationEvent.LAST_REQUEST: latest_request.isoformat(
-                sep=" ", timespec="microseconds"
-            ),
             mm_constants.ApplicationEvent.ENDPOINT_ID: endpoint_id,
             mm_constants.ApplicationEvent.OUTPUT_STREAM_URI: get_stream_path(
                 project=project,
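The hunk above removes the controller-side statistics pipeline (pad_features_hist, calculate_inputs_statistics) and the per-window Parquet materialization: the controller now reads each window directly from the endpoint's feature set and forwards only window boundaries, leaving the statistics to the applications themselves. The event pushed to each application stream shrinks accordingly; a self-contained sketch of the slimmed payload, where the string keys are illustrative stand-ins for the mm_constants.ApplicationEvent members used by the real code:

```python
# Sketch of the slimmed application event: CURRENT_STATS, FEATURE_STATS,
# SAMPLE_PARQUET_PATH, and LAST_REQUEST are gone; only the window boundaries
# and endpoint metadata remain. Keys are illustrative stand-ins.
import datetime

start_infer_time = datetime.datetime(2024, 1, 1, 0, 0)  # placeholder window
end_infer_time = datetime.datetime(2024, 1, 1, 1, 0)

data = {
    "start_infer_time": start_infer_time.isoformat(sep=" ", timespec="microseconds"),
    "end_infer_time": end_infer_time.isoformat(sep=" ", timespec="microseconds"),
    "endpoint_id": "endpoint-uid-example",  # placeholder
}
print(data)
```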
@@ -633,49 +497,12 @@
             [data]
         )
 
-    @staticmethod
-    def _get_sample_df(
-        feature_set: mlrun.common.schemas.FeatureSet,
-        endpoint_id: str,
-        start_infer_time: datetime.datetime,
-        end_infer_time: datetime.datetime,
-        parquet_directory: str,
-        storage_options: dict,
-        application_name: str,
-    ) -> mlrun.feature_store.OfflineVectorResponse:
-        """
-        Retrieves a sample DataFrame of the current input according to the provided infer interval window.
-
-        :param feature_set: The main feature set.
-        :param endpoint_id: Identifier for the model endpoint.
-        :param start_infer_time: The beginning of the infer interval window.
-        :param end_infer_time: The end of the infer interval window.
-        :param parquet_directory: Directory where Parquet files are stored.
-        :param storage_options: Storage options for accessing the data.
-        :param application_name: Current application name.
 
-        :return: OfflineVectorResponse that can be used for generating a sample DataFrame for the specified endpoint.
+def handler(context: nuclio.Context, event: nuclio.Event) -> None:
+    """
+    Run model monitoring application processor
 
-        """
-        features = [f"{feature_set.metadata.name}.*"]
-        vector = fstore.FeatureVector(
-            name=f"{endpoint_id}_vector",
-            features=features,
-            with_indexes=True,
-        )
-        vector.metadata.tag = application_name
-        vector.feature_set_objects = {feature_set.metadata.name: feature_set}
-
-        # get offline features based on application start and end time.
-        # store the result parquet by partitioning by controller end processing time
-        offline_response = vector.get_offline_features(
-            start_time=start_infer_time,
-            end_time=end_infer_time,
-            timestamp_for_filtering=mm_constants.EventFieldType.TIMESTAMP,
-            target=ParquetTarget(
-                path=parquet_directory
-                + f"/key={endpoint_id}/{int(start_infer_time.timestamp())}/{application_name}.parquet",
-                storage_options=storage_options,
-            ),
-        )
-        return offline_response
+    :param context: the Nuclio context
+    :param event: trigger event
+    """
+    MonitoringApplicationController().run()
mlrun/model_monitoring/{stores/models/sqlite.py → db/__init__.py}

@@ -1,4 +1,4 @@
-# Copyright 2023 Iguazio
+# Copyright 2024 Iguazio
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,12 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from sqlalchemy.ext.declarative import declarative_base
-
-from .base import ModelEndpointsBaseTable
-
-Base = declarative_base()
-
-
-class ModelEndpointsTable(Base, ModelEndpointsBaseTable):
-    pass
+from .stores import ObjectStoreFactory, get_store_object
+from .stores.base import StoreBase
+from .tsdb import get_tsdb_connector
+from .tsdb.base import TSDBConnector
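The renamed module above makes mlrun/model_monitoring/db/__init__.py the public surface of the new persistence package. Based only on the re-exports shown, a consumer can import the factory functions and base classes directly; get_store_object(project=...) matches the call in the controller diff, while the get_tsdb_connector signature is not shown here and is left uncalled:

```python
# Imports grounded in the re-exports above; only get_store_object is exercised,
# since its project keyword appears in the controller diff.
from mlrun.model_monitoring.db import (
    ObjectStoreFactory,
    StoreBase,
    TSDBConnector,
    get_store_object,
    get_tsdb_connector,
)

store = get_store_object(project="project-example")  # placeholder project name
print(type(store))  # expected: a StoreBase subclass, per the package layout
```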