mlrun 1.4.0rc25__py3-none-any.whl → 1.5.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mlrun might be problematic. Click here for more details.

Files changed (184)
  1. mlrun/__init__.py +2 -35
  2. mlrun/__main__.py +3 -41
  3. mlrun/api/api/api.py +6 -0
  4. mlrun/api/api/endpoints/feature_store.py +0 -4
  5. mlrun/api/api/endpoints/files.py +14 -2
  6. mlrun/api/api/endpoints/frontend_spec.py +2 -1
  7. mlrun/api/api/endpoints/functions.py +95 -59
  8. mlrun/api/api/endpoints/grafana_proxy.py +9 -9
  9. mlrun/api/api/endpoints/logs.py +17 -3
  10. mlrun/api/api/endpoints/model_endpoints.py +3 -2
  11. mlrun/api/api/endpoints/pipelines.py +1 -5
  12. mlrun/api/api/endpoints/projects.py +88 -0
  13. mlrun/api/api/endpoints/runs.py +48 -6
  14. mlrun/api/api/endpoints/submit.py +2 -1
  15. mlrun/api/api/endpoints/workflows.py +355 -0
  16. mlrun/api/api/utils.py +3 -4
  17. mlrun/api/crud/__init__.py +1 -0
  18. mlrun/api/crud/client_spec.py +6 -2
  19. mlrun/api/crud/feature_store.py +5 -0
  20. mlrun/api/crud/model_monitoring/__init__.py +1 -0
  21. mlrun/api/crud/model_monitoring/deployment.py +497 -0
  22. mlrun/api/crud/model_monitoring/grafana.py +96 -42
  23. mlrun/api/crud/model_monitoring/helpers.py +159 -0
  24. mlrun/api/crud/model_monitoring/model_endpoints.py +202 -476
  25. mlrun/api/crud/notifications.py +9 -4
  26. mlrun/api/crud/pipelines.py +6 -11
  27. mlrun/api/crud/projects.py +2 -2
  28. mlrun/api/crud/runtime_resources.py +4 -3
  29. mlrun/api/crud/runtimes/nuclio/helpers.py +5 -1
  30. mlrun/api/crud/secrets.py +21 -0
  31. mlrun/api/crud/workflows.py +352 -0
  32. mlrun/api/db/base.py +16 -1
  33. mlrun/api/db/init_db.py +2 -4
  34. mlrun/api/db/session.py +1 -1
  35. mlrun/api/db/sqldb/db.py +129 -31
  36. mlrun/api/db/sqldb/models/models_mysql.py +15 -1
  37. mlrun/api/db/sqldb/models/models_sqlite.py +16 -2
  38. mlrun/api/launcher.py +38 -6
  39. mlrun/api/main.py +3 -2
  40. mlrun/api/rundb/__init__.py +13 -0
  41. mlrun/{db → api/rundb}/sqldb.py +36 -84
  42. mlrun/api/runtime_handlers/__init__.py +56 -0
  43. mlrun/api/runtime_handlers/base.py +1247 -0
  44. mlrun/api/runtime_handlers/daskjob.py +209 -0
  45. mlrun/api/runtime_handlers/kubejob.py +37 -0
  46. mlrun/api/runtime_handlers/mpijob.py +147 -0
  47. mlrun/api/runtime_handlers/remotesparkjob.py +29 -0
  48. mlrun/api/runtime_handlers/sparkjob.py +148 -0
  49. mlrun/api/schemas/__init__.py +17 -6
  50. mlrun/api/utils/builder.py +1 -4
  51. mlrun/api/utils/clients/chief.py +14 -0
  52. mlrun/api/utils/clients/iguazio.py +33 -33
  53. mlrun/api/utils/clients/nuclio.py +2 -2
  54. mlrun/api/utils/periodic.py +9 -2
  55. mlrun/api/utils/projects/follower.py +14 -7
  56. mlrun/api/utils/projects/leader.py +2 -1
  57. mlrun/api/utils/projects/remotes/nop_follower.py +2 -2
  58. mlrun/api/utils/projects/remotes/nop_leader.py +2 -2
  59. mlrun/api/utils/runtimes/__init__.py +14 -0
  60. mlrun/api/utils/runtimes/nuclio.py +43 -0
  61. mlrun/api/utils/scheduler.py +98 -15
  62. mlrun/api/utils/singletons/db.py +5 -1
  63. mlrun/api/utils/singletons/project_member.py +4 -1
  64. mlrun/api/utils/singletons/scheduler.py +1 -1
  65. mlrun/artifacts/base.py +6 -6
  66. mlrun/artifacts/dataset.py +4 -4
  67. mlrun/artifacts/manager.py +2 -3
  68. mlrun/artifacts/model.py +2 -2
  69. mlrun/artifacts/plots.py +8 -8
  70. mlrun/common/db/__init__.py +14 -0
  71. mlrun/common/helpers.py +37 -0
  72. mlrun/{mlutils → common/model_monitoring}/__init__.py +3 -2
  73. mlrun/common/model_monitoring/helpers.py +69 -0
  74. mlrun/common/schemas/__init__.py +13 -1
  75. mlrun/common/schemas/auth.py +4 -1
  76. mlrun/common/schemas/client_spec.py +1 -1
  77. mlrun/common/schemas/function.py +17 -0
  78. mlrun/common/schemas/model_monitoring/__init__.py +48 -0
  79. mlrun/common/{model_monitoring.py → schemas/model_monitoring/constants.py} +11 -23
  80. mlrun/common/schemas/model_monitoring/grafana.py +55 -0
  81. mlrun/common/schemas/{model_endpoints.py → model_monitoring/model_endpoints.py} +32 -65
  82. mlrun/common/schemas/notification.py +1 -0
  83. mlrun/common/schemas/object.py +4 -0
  84. mlrun/common/schemas/project.py +1 -0
  85. mlrun/common/schemas/regex.py +1 -1
  86. mlrun/common/schemas/runs.py +1 -8
  87. mlrun/common/schemas/schedule.py +1 -8
  88. mlrun/common/schemas/workflow.py +54 -0
  89. mlrun/config.py +45 -42
  90. mlrun/datastore/__init__.py +21 -0
  91. mlrun/datastore/base.py +1 -1
  92. mlrun/datastore/datastore.py +9 -0
  93. mlrun/datastore/dbfs_store.py +168 -0
  94. mlrun/datastore/helpers.py +18 -0
  95. mlrun/datastore/sources.py +1 -0
  96. mlrun/datastore/store_resources.py +2 -5
  97. mlrun/datastore/v3io.py +1 -2
  98. mlrun/db/__init__.py +4 -68
  99. mlrun/db/base.py +12 -0
  100. mlrun/db/factory.py +65 -0
  101. mlrun/db/httpdb.py +175 -20
  102. mlrun/db/nopdb.py +4 -2
  103. mlrun/execution.py +4 -2
  104. mlrun/feature_store/__init__.py +1 -0
  105. mlrun/feature_store/api.py +1 -2
  106. mlrun/feature_store/common.py +2 -1
  107. mlrun/feature_store/feature_set.py +1 -11
  108. mlrun/feature_store/feature_vector.py +340 -2
  109. mlrun/feature_store/ingestion.py +5 -10
  110. mlrun/feature_store/retrieval/base.py +118 -104
  111. mlrun/feature_store/retrieval/dask_merger.py +17 -10
  112. mlrun/feature_store/retrieval/job.py +4 -1
  113. mlrun/feature_store/retrieval/local_merger.py +18 -18
  114. mlrun/feature_store/retrieval/spark_merger.py +21 -14
  115. mlrun/feature_store/retrieval/storey_merger.py +22 -16
  116. mlrun/kfpops.py +3 -9
  117. mlrun/launcher/base.py +57 -53
  118. mlrun/launcher/client.py +5 -4
  119. mlrun/launcher/factory.py +24 -13
  120. mlrun/launcher/local.py +6 -6
  121. mlrun/launcher/remote.py +4 -4
  122. mlrun/lists.py +0 -11
  123. mlrun/model.py +11 -17
  124. mlrun/model_monitoring/__init__.py +2 -22
  125. mlrun/model_monitoring/features_drift_table.py +1 -1
  126. mlrun/model_monitoring/helpers.py +22 -210
  127. mlrun/model_monitoring/model_endpoint.py +1 -1
  128. mlrun/model_monitoring/model_monitoring_batch.py +127 -50
  129. mlrun/model_monitoring/prometheus.py +219 -0
  130. mlrun/model_monitoring/stores/__init__.py +16 -11
  131. mlrun/model_monitoring/stores/kv_model_endpoint_store.py +95 -23
  132. mlrun/model_monitoring/stores/models/mysql.py +47 -29
  133. mlrun/model_monitoring/stores/models/sqlite.py +47 -29
  134. mlrun/model_monitoring/stores/sql_model_endpoint_store.py +31 -19
  135. mlrun/model_monitoring/{stream_processing_fs.py → stream_processing.py} +206 -64
  136. mlrun/model_monitoring/tracking_policy.py +104 -0
  137. mlrun/package/packager.py +6 -8
  138. mlrun/package/packagers/default_packager.py +121 -10
  139. mlrun/package/packagers/numpy_packagers.py +1 -1
  140. mlrun/platforms/__init__.py +0 -2
  141. mlrun/platforms/iguazio.py +0 -56
  142. mlrun/projects/pipelines.py +53 -159
  143. mlrun/projects/project.py +10 -37
  144. mlrun/render.py +1 -1
  145. mlrun/run.py +8 -124
  146. mlrun/runtimes/__init__.py +6 -42
  147. mlrun/runtimes/base.py +29 -1249
  148. mlrun/runtimes/daskjob.py +2 -198
  149. mlrun/runtimes/funcdoc.py +0 -9
  150. mlrun/runtimes/function.py +25 -29
  151. mlrun/runtimes/kubejob.py +5 -29
  152. mlrun/runtimes/local.py +1 -1
  153. mlrun/runtimes/mpijob/__init__.py +2 -2
  154. mlrun/runtimes/mpijob/abstract.py +10 -1
  155. mlrun/runtimes/mpijob/v1.py +0 -76
  156. mlrun/runtimes/mpijob/v1alpha1.py +1 -74
  157. mlrun/runtimes/nuclio.py +3 -2
  158. mlrun/runtimes/pod.py +28 -18
  159. mlrun/runtimes/remotesparkjob.py +1 -15
  160. mlrun/runtimes/serving.py +14 -6
  161. mlrun/runtimes/sparkjob/__init__.py +0 -1
  162. mlrun/runtimes/sparkjob/abstract.py +4 -131
  163. mlrun/runtimes/utils.py +0 -26
  164. mlrun/serving/routers.py +7 -7
  165. mlrun/serving/server.py +11 -8
  166. mlrun/serving/states.py +7 -1
  167. mlrun/serving/v2_serving.py +6 -6
  168. mlrun/utils/helpers.py +23 -42
  169. mlrun/utils/notifications/notification/__init__.py +4 -0
  170. mlrun/utils/notifications/notification/webhook.py +61 -0
  171. mlrun/utils/notifications/notification_pusher.py +5 -25
  172. mlrun/utils/regex.py +7 -2
  173. mlrun/utils/version/version.json +2 -2
  174. {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/METADATA +26 -25
  175. {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/RECORD +180 -158
  176. {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/WHEEL +1 -1
  177. mlrun/mlutils/data.py +0 -160
  178. mlrun/mlutils/models.py +0 -78
  179. mlrun/mlutils/plots.py +0 -902
  180. mlrun/utils/model_monitoring.py +0 -249
  181. /mlrun/{api/db/sqldb/session.py → common/db/sql_session.py} +0 -0
  182. {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/LICENSE +0 -0
  183. {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/entry_points.txt +0 -0
  184. {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/top_level.txt +0 -0
@@ -12,232 +12,44 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
  #
15
- import pathlib
16
- import typing
17
-
18
- import sqlalchemy.orm
19
- from fastapi import Depends
20
-
21
- import mlrun
22
- import mlrun.api.api.utils
23
- import mlrun.api.crud.secrets
24
- import mlrun.api.utils.singletons.db
25
- import mlrun.api.utils.singletons.k8s
26
- import mlrun.common.model_monitoring as model_monitoring_constants
27
- import mlrun.common.schemas
28
- import mlrun.config
29
- import mlrun.feature_store as fstore
30
- import mlrun.model_monitoring.stream_processing_fs
31
- import mlrun.runtimes
32
- import mlrun.utils.helpers
33
- import mlrun.utils.model_monitoring
34
- from mlrun.api.api import deps
35
-
36
- _CURRENT_FILE_PATH = pathlib.Path(__file__)
37
- _STREAM_PROCESSING_FUNCTION_PATH = _CURRENT_FILE_PATH.parent / "stream_processing_fs.py"
38
- _MONIOTINRG_BATCH_FUNCTION_PATH = (
39
- _CURRENT_FILE_PATH.parent / "model_monitoring_batch.py"
40
- )
41
-
42
15
 
43
- def initial_model_monitoring_stream_processing_function(
44
- project: str,
45
- model_monitoring_access_key: str,
46
- tracking_policy: mlrun.utils.model_monitoring.TrackingPolicy,
47
- auth_info: mlrun.common.schemas.AuthInfo,
48
- parquet_target: str,
49
- ):
50
- """
51
- Initialize model monitoring stream processing function.
52
-
53
- :param project: Project name.
54
- :param model_monitoring_access_key: Access key to apply the model monitoring process. Please note that in CE
55
- deployments this parameter will be None.
56
- :param tracking_policy: Model monitoring configurations.
57
- :param auth_info: The auth info of the request.
58
- :parquet_target: Path to model monitoring parquet file that will be generated by the monitoring
59
- stream nuclio function.
60
-
61
- :return: A function object from a mlrun runtime class
62
-
63
- """
64
16
 
65
- # Initialize Stream Processor object
66
- stream_processor = mlrun.model_monitoring.stream_processing_fs.EventStreamProcessor(
67
- project=project,
68
- parquet_batching_max_events=mlrun.mlconf.model_endpoint_monitoring.parquet_batching_max_events,
69
- parquet_target=parquet_target,
70
- model_monitoring_access_key=model_monitoring_access_key,
71
- )
17
+ import typing
72
18
 
73
- # Create a new serving function for the streaming process
74
- function = mlrun.code_to_function(
75
- name="model-monitoring-stream",
76
- project=project,
77
- filename=str(_STREAM_PROCESSING_FUNCTION_PATH),
78
- kind="serving",
79
- image=tracking_policy.stream_image,
80
- )
19
+ import mlrun.common.model_monitoring.helpers
20
+ import mlrun.common.schemas
81
21
 
82
- # Create monitoring serving graph
83
- stream_processor.apply_monitoring_serving_graph(function)
84
22
 
85
- # Set the project to the serving function
86
- function.metadata.project = project
23
+ def get_stream_path(project: str = None):
24
+ """Get stream path from the project secret. If wasn't set, take it from the system configurations"""
87
25
 
88
- # Add stream triggers
89
- function = _apply_stream_trigger(
26
+ stream_uri = mlrun.get_secret_or_env(
27
+ mlrun.common.schemas.model_monitoring.ProjectSecretKeys.STREAM_PATH
28
+ ) or mlrun.mlconf.get_model_monitoring_file_target_path(
90
29
  project=project,
91
- function=function,
92
- model_monitoring_access_key=model_monitoring_access_key,
93
- auth_info=auth_info,
30
+ kind=mlrun.common.schemas.model_monitoring.FileTargetKind.STREAM,
31
+ target="online",
94
32
  )
95
33
 
96
- # Apply feature store run configurations on the serving function
97
- run_config = fstore.RunConfig(function=function, local=False)
98
- function.spec.parameters = run_config.parameters
99
-
100
- return function
101
-
102
-
103
- def get_model_monitoring_batch_function(
104
- project: str,
105
- model_monitoring_access_key: str,
106
- db_session: sqlalchemy.orm.Session,
107
- auth_info: mlrun.common.schemas.AuthInfo,
108
- tracking_policy: mlrun.utils.model_monitoring.TrackingPolicy,
109
- ):
110
- """
111
- Initialize model monitoring batch function.
112
-
113
- :param project: project name.
114
- :param model_monitoring_access_key: access key to apply the model monitoring process. Please note that in CE
115
- deployments this parameter will be None.
116
- :param db_session: A session that manages the current dialog with the database.
117
- :param auth_info: The auth info of the request.
118
- :param tracking_policy: Model monitoring configurations.
119
-
120
- :return: A function object from a mlrun runtime class
121
-
122
- """
123
-
124
- # Create job function runtime for the model monitoring batch
125
- function: mlrun.runtimes.KubejobRuntime = mlrun.code_to_function(
126
- name="model-monitoring-batch",
127
- project=project,
128
- filename=str(_MONIOTINRG_BATCH_FUNCTION_PATH),
129
- kind="job",
130
- image=tracking_policy.default_batch_image,
131
- handler="handler",
34
+ return mlrun.common.model_monitoring.helpers.parse_monitoring_stream_path(
35
+ stream_uri=stream_uri, project=project
132
36
  )
133
- function.set_db_connection(mlrun.api.api.utils.get_run_db_instance(db_session))
134
37
 
135
- # Set the project to the job function
136
- function.metadata.project = project
137
38
 
138
- if not mlrun.mlconf.is_ce_mode():
139
- function = _apply_access_key_and_mount_function(
140
- project=project,
141
- function=function,
142
- model_monitoring_access_key=model_monitoring_access_key,
143
- auth_info=auth_info,
144
- )
39
+ def get_connection_string(secret_provider: typing.Callable = None) -> str:
40
+ """Get endpoint store connection string from the project secret. If wasn't set, take it from the system
41
+ configurations.
145
42
 
146
- # Enrich runtime with the required configurations
147
- mlrun.api.api.utils.apply_enrichment_and_validation_on_function(function, auth_info)
43
+ :param secret_provider: An optional secret provider to get the connection string secret.
148
44
 
149
- return function
45
+ :return: Valid SQL connection string.
150
46
 
151
-
152
- def _apply_stream_trigger(
153
- project: str,
154
- function: mlrun.runtimes.ServingRuntime,
155
- model_monitoring_access_key: str = None,
156
- auth_info: mlrun.common.schemas.AuthInfo = Depends(deps.authenticate_request),
157
- ) -> mlrun.runtimes.ServingRuntime:
158
- """Adding stream source for the nuclio serving function. By default, the function has HTTP stream trigger along
159
- with another supported stream source that can be either Kafka or V3IO, depends on the stream path schema that is
160
- defined under mlrun.mlconf.model_endpoint_monitoring.store_prefixes. Note that if no valid stream path has been
161
- provided then the function will have a single HTTP stream source.
162
-
163
- :param project: Project name.
164
- :param function: The serving function object that will be applied with the stream trigger.
165
- :param model_monitoring_access_key: Access key to apply the model monitoring stream function when the stream is
166
- schema is V3IO.
167
- :param auth_info: The auth info of the request.
168
-
169
- :return: ServingRuntime object with stream trigger.
170
47
  """
171
48
 
172
- # Get the stream path from the configuration
173
- # stream_path = mlrun.mlconf.get_file_target_path(project=project, kind="stream", target="stream")
174
- stream_path = mlrun.utils.model_monitoring.get_stream_path(project=project)
175
-
176
- if stream_path.startswith("kafka://"):
177
-
178
- topic, brokers = mlrun.datastore.utils.parse_kafka_url(url=stream_path)
179
- # Generate Kafka stream source
180
- stream_source = mlrun.datastore.sources.KafkaSource(
181
- brokers=brokers,
182
- topics=[topic],
49
+ return (
50
+ mlrun.get_secret_or_env(
51
+ key=mlrun.common.schemas.model_monitoring.ProjectSecretKeys.ENDPOINT_STORE_CONNECTION,
52
+ secret_provider=secret_provider,
183
53
  )
184
- function = stream_source.add_nuclio_trigger(function)
185
-
186
- if not mlrun.mlconf.is_ce_mode():
187
- function = _apply_access_key_and_mount_function(
188
- project=project,
189
- function=function,
190
- model_monitoring_access_key=model_monitoring_access_key,
191
- auth_info=auth_info,
192
- )
193
- if stream_path.startswith("v3io://"):
194
- # Generate V3IO stream trigger
195
- function.add_v3io_stream_trigger(
196
- stream_path=stream_path, name="monitoring_stream_trigger"
197
- )
198
- # Add the default HTTP source
199
- http_source = mlrun.datastore.sources.HttpSource()
200
- function = http_source.add_nuclio_trigger(function)
201
-
202
- return function
203
-
204
-
205
- def _apply_access_key_and_mount_function(
206
- project: str,
207
- function: typing.Union[
208
- mlrun.runtimes.KubejobRuntime, mlrun.runtimes.ServingRuntime
209
- ],
210
- model_monitoring_access_key: str,
211
- auth_info: mlrun.common.schemas.AuthInfo,
212
- ) -> typing.Union[mlrun.runtimes.KubejobRuntime, mlrun.runtimes.ServingRuntime]:
213
- """Applying model monitoring access key on the provided function when using V3IO path. In addition, this method
214
- mount the V3IO path for the provided function to configure the access to the system files.
215
-
216
- :param project: Project name.
217
- :param function: Model monitoring function object that will be filled with the access key and
218
- the access to the system files.
219
- :param model_monitoring_access_key: Access key to apply the model monitoring stream function when the stream is
220
- schema is V3IO.
221
- :param auth_info: The auth info of the request.
222
-
223
- :return: function runtime object with access key and access to system files.
224
- """
225
-
226
- # Set model monitoring access key for managing permissions
227
- function.set_env_from_secret(
228
- model_monitoring_constants.ProjectSecretKeys.ACCESS_KEY,
229
- mlrun.api.utils.singletons.k8s.get_k8s_helper().get_project_secret_name(
230
- project
231
- ),
232
- mlrun.api.crud.secrets.Secrets().generate_client_project_secret_key(
233
- mlrun.api.crud.secrets.SecretsClientType.model_monitoring,
234
- model_monitoring_constants.ProjectSecretKeys.ACCESS_KEY,
235
- ),
54
+ or mlrun.mlconf.model_endpoint_monitoring.endpoint_store_connection
236
55
  )
237
- function.metadata.credentials.access_key = model_monitoring_access_key
238
- function.apply(mlrun.mount_v3io())
239
-
240
- # Ensure that the auth env vars are set
241
- mlrun.api.api.utils.ensure_function_has_auth_set(function, auth_info)
242
-
243
- return function
@@ -16,7 +16,7 @@
16
16
  from typing import Any, Dict, List, Optional
17
17
 
18
18
  import mlrun.model
19
- from mlrun.common.model_monitoring import (
19
+ from mlrun.common.schemas.model_monitoring.constants import (
20
20
  EndpointType,
21
21
  EventKeyMetrics,
22
22
  EventLiveStats,
@@ -27,16 +27,11 @@ import v3io
27
27
  import v3io.dataplane
28
28
  import v3io_frames
29
29
 
30
- import mlrun
31
- import mlrun.common.model_monitoring
32
- import mlrun.common.schemas
30
+ import mlrun.common.helpers
31
+ import mlrun.common.model_monitoring.helpers
32
+ import mlrun.common.schemas.model_monitoring
33
33
  import mlrun.data_types.infer
34
34
  import mlrun.feature_store as fstore
35
- import mlrun.model_monitoring
36
- import mlrun.model_monitoring.stores
37
- import mlrun.run
38
- import mlrun.utils.helpers
39
- import mlrun.utils.model_monitoring
40
35
  import mlrun.utils.v3io_clients
41
36
  from mlrun.utils import logger
42
37
 
@@ -497,7 +492,6 @@ class BatchProcessor:
497
492
  context: mlrun.run.MLClientCtx,
498
493
  project: str,
499
494
  ):
500
-
501
495
  """
502
496
  Initialize Batch Processor object.
503
497
 
@@ -525,9 +519,7 @@ class BatchProcessor:
525
519
 
526
520
  # Get a runtime database
527
521
 
528
- self.db = mlrun.model_monitoring.stores.get_model_endpoint_store(
529
- project=project
530
- )
522
+ self.db = mlrun.model_monitoring.get_model_endpoint_store(project=project)
531
523
 
532
524
  if not mlrun.mlconf.is_ce_mode():
533
525
  # TODO: Once there is a time series DB alternative in a non-CE deployment, we need to update this if
@@ -539,7 +531,7 @@ class BatchProcessor:
539
531
 
540
532
  # Get the batch interval range
541
533
  self.batch_dict = context.parameters[
542
- mlrun.common.model_monitoring.EventFieldType.BATCH_INTERVALS_DICT
534
+ mlrun.common.schemas.model_monitoring.EventFieldType.BATCH_INTERVALS_DICT
543
535
  ]
544
536
 
545
537
  # TODO: This will be removed in 1.5.0 once the job params can be parsed with different types
@@ -556,23 +548,27 @@ class BatchProcessor:
556
548
  # Define the required paths for the project objects
557
549
  tsdb_path = mlrun.mlconf.get_model_monitoring_file_target_path(
558
550
  project=self.project,
559
- kind=mlrun.common.model_monitoring.FileTargetKind.EVENTS,
551
+ kind=mlrun.common.schemas.model_monitoring.FileTargetKind.EVENTS,
560
552
  )
561
553
  (
562
554
  _,
563
555
  self.tsdb_container,
564
556
  self.tsdb_path,
565
- ) = mlrun.utils.model_monitoring.parse_model_endpoint_store_prefix(tsdb_path)
557
+ ) = mlrun.common.model_monitoring.helpers.parse_model_endpoint_store_prefix(
558
+ tsdb_path
559
+ )
566
560
  # stream_path = template.format(project=self.project, kind="log_stream")
567
561
  stream_path = mlrun.mlconf.get_model_monitoring_file_target_path(
568
562
  project=self.project,
569
- kind=mlrun.common.model_monitoring.FileTargetKind.LOG_STREAM,
563
+ kind=mlrun.common.schemas.model_monitoring.FileTargetKind.LOG_STREAM,
570
564
  )
571
565
  (
572
566
  _,
573
567
  self.stream_container,
574
568
  self.stream_path,
575
- ) = mlrun.utils.model_monitoring.parse_model_endpoint_store_prefix(stream_path)
569
+ ) = mlrun.common.model_monitoring.helpers.parse_model_endpoint_store_prefix(
570
+ stream_path
571
+ )
576
572
 
577
573
  # Get the frames clients based on the v3io configuration
578
574
  # it will be used later for writing the results into the tsdb
@@ -619,24 +615,24 @@ class BatchProcessor:
619
615
 
620
616
  for endpoint in endpoints:
621
617
  if (
622
- endpoint[mlrun.common.model_monitoring.EventFieldType.ACTIVE]
618
+ endpoint[mlrun.common.schemas.model_monitoring.EventFieldType.ACTIVE]
623
619
  and endpoint[
624
- mlrun.common.model_monitoring.EventFieldType.MONITORING_MODE
620
+ mlrun.common.schemas.model_monitoring.EventFieldType.MONITORING_MODE
625
621
  ]
626
- == mlrun.common.model_monitoring.ModelMonitoringMode.enabled.value
622
+ == mlrun.common.schemas.model_monitoring.ModelMonitoringMode.enabled.value
627
623
  ):
628
624
  # Skip router endpoint:
629
625
  if (
630
626
  int(
631
627
  endpoint[
632
- mlrun.common.model_monitoring.EventFieldType.ENDPOINT_TYPE
628
+ mlrun.common.schemas.model_monitoring.EventFieldType.ENDPOINT_TYPE
633
629
  ]
634
630
  )
635
- == mlrun.common.model_monitoring.EndpointType.ROUTER
631
+ == mlrun.common.schemas.model_monitoring.EndpointType.ROUTER
636
632
  ):
637
633
  # Router endpoint has no feature stats
638
634
  logger.info(
639
- f"{endpoint[mlrun.common.model_monitoring.EventFieldType.UID]} is router skipping"
635
+ f"{endpoint[mlrun.common.schemas.model_monitoring.EventFieldType.UID]} is router skipping"
640
636
  )
641
637
  continue
642
638
  self.update_drift_metrics(endpoint=endpoint)
@@ -649,12 +645,14 @@ class BatchProcessor:
649
645
  serving_function_name,
650
646
  _,
651
647
  _,
652
- ) = mlrun.utils.helpers.parse_versioned_object_uri(
653
- endpoint[mlrun.common.model_monitoring.EventFieldType.FUNCTION_URI]
648
+ ) = mlrun.common.helpers.parse_versioned_object_uri(
649
+ endpoint[
650
+ mlrun.common.schemas.model_monitoring.EventFieldType.FUNCTION_URI
651
+ ]
654
652
  )
655
653
 
656
654
  model_name = endpoint[
657
- mlrun.common.model_monitoring.EventFieldType.MODEL
655
+ mlrun.common.schemas.model_monitoring.EventFieldType.MODEL
658
656
  ].replace(":", "-")
659
657
 
660
658
  m_fs = fstore.get_feature_set(
@@ -668,7 +666,7 @@ class BatchProcessor:
668
666
  df = m_fs.to_dataframe(
669
667
  start_time=start_time,
670
668
  end_time=end_time,
671
- time_column=mlrun.common.model_monitoring.EventFieldType.TIMESTAMP,
669
+ time_column=mlrun.common.schemas.model_monitoring.EventFieldType.TIMESTAMP,
672
670
  )
673
671
 
674
672
  if len(df) == 0:
@@ -676,7 +674,7 @@ class BatchProcessor:
676
674
  "Not enough model events since the beginning of the batch interval",
677
675
  parquet_target=m_fs.status.targets[0].path,
678
676
  endpoint=endpoint[
679
- mlrun.common.model_monitoring.EventFieldType.UID
677
+ mlrun.common.schemas.model_monitoring.EventFieldType.UID
680
678
  ],
681
679
  min_rqeuired_events=mlrun.mlconf.model_endpoint_monitoring.parquet_batching_max_events,
682
680
  start_time=str(
@@ -694,7 +692,9 @@ class BatchProcessor:
694
692
  logger.warn(
695
693
  "Parquet not found, probably due to not enough model events",
696
694
  parquet_target=m_fs.status.targets[0].path,
697
- endpoint=endpoint[mlrun.common.model_monitoring.EventFieldType.UID],
695
+ endpoint=endpoint[
696
+ mlrun.common.schemas.model_monitoring.EventFieldType.UID
697
+ ],
698
698
  min_rqeuired_events=mlrun.mlconf.model_endpoint_monitoring.parquet_batching_max_events,
699
699
  )
700
700
  return
@@ -706,14 +706,16 @@ class BatchProcessor:
706
706
 
707
707
  # Create DataFrame based on the input features
708
708
  stats_columns = [
709
- mlrun.common.model_monitoring.EventFieldType.TIMESTAMP,
709
+ mlrun.common.schemas.model_monitoring.EventFieldType.TIMESTAMP,
710
710
  *feature_names,
711
711
  ]
712
712
 
713
713
  # Add label names if provided
714
- if endpoint[mlrun.common.model_monitoring.EventFieldType.LABEL_NAMES]:
714
+ if endpoint[
715
+ mlrun.common.schemas.model_monitoring.EventFieldType.LABEL_NAMES
716
+ ]:
715
717
  labels = endpoint[
716
- mlrun.common.model_monitoring.EventFieldType.LABEL_NAMES
718
+ mlrun.common.schemas.model_monitoring.EventFieldType.LABEL_NAMES
717
719
  ]
718
720
  if isinstance(labels, str):
719
721
  labels = json.loads(labels)
@@ -731,13 +733,15 @@ class BatchProcessor:
731
733
  m_fs.save()
732
734
 
733
735
  # Get the timestamp of the latest request:
734
- timestamp = df[mlrun.common.model_monitoring.EventFieldType.TIMESTAMP].iloc[
735
- -1
736
- ]
736
+ timestamp = df[
737
+ mlrun.common.schemas.model_monitoring.EventFieldType.TIMESTAMP
738
+ ].iloc[-1]
737
739
 
738
740
  # Get the feature stats from the model endpoint for reference data
739
741
  feature_stats = json.loads(
740
- endpoint[mlrun.common.model_monitoring.EventFieldType.FEATURE_STATS]
742
+ endpoint[
743
+ mlrun.common.schemas.model_monitoring.EventFieldType.FEATURE_STATS
744
+ ]
741
745
  )
742
746
 
743
747
  # Get the current stats:
@@ -758,7 +762,7 @@ class BatchProcessor:
758
762
  monitor_configuration = (
759
763
  json.loads(
760
764
  endpoint[
761
- mlrun.common.model_monitoring.EventFieldType.MONITOR_CONFIGURATION
765
+ mlrun.common.schemas.model_monitoring.EventFieldType.MONITOR_CONFIGURATION
762
766
  ]
763
767
  )
764
768
  or {}
@@ -778,7 +782,9 @@ class BatchProcessor:
778
782
  )
779
783
  logger.info(
780
784
  "Drift status",
781
- endpoint_id=endpoint[mlrun.common.model_monitoring.EventFieldType.UID],
785
+ endpoint_id=endpoint[
786
+ mlrun.common.schemas.model_monitoring.EventFieldType.UID
787
+ ],
782
788
  drift_status=drift_status.value,
783
789
  drift_measure=drift_measure,
784
790
  )
@@ -790,40 +796,54 @@ class BatchProcessor:
790
796
  }
791
797
 
792
798
  self.db.update_model_endpoint(
793
- endpoint_id=endpoint[mlrun.common.model_monitoring.EventFieldType.UID],
799
+ endpoint_id=endpoint[
800
+ mlrun.common.schemas.model_monitoring.EventFieldType.UID
801
+ ],
794
802
  attributes=attributes,
795
803
  )
796
804
 
797
805
  if not mlrun.mlconf.is_ce_mode():
798
806
  # Update drift results in TSDB
799
- self._update_drift_in_input_stream(
807
+ self._update_drift_in_v3io_tsdb(
800
808
  endpoint_id=endpoint[
801
- mlrun.common.model_monitoring.EventFieldType.UID
809
+ mlrun.common.schemas.model_monitoring.EventFieldType.UID
802
810
  ],
803
811
  drift_status=drift_status,
804
812
  drift_measure=drift_measure,
805
813
  drift_result=drift_result,
806
814
  timestamp=timestamp,
807
815
  )
808
- logger.info(
809
- "Done updating drift measures",
816
+
817
+ else:
818
+ # Update drift results in Prometheus
819
+ self._update_drift_in_prometheus(
810
820
  endpoint_id=endpoint[
811
- mlrun.common.model_monitoring.EventFieldType.UID
821
+ mlrun.common.schemas.model_monitoring.EventFieldType.UID
812
822
  ],
823
+ drift_status=drift_status,
824
+ drift_result=drift_result,
813
825
  )
814
826
 
815
827
  except Exception as e:
816
828
  logger.error(
817
- f"Exception for endpoint {endpoint[mlrun.common.model_monitoring.EventFieldType.UID]}"
829
+ f"Exception for endpoint {endpoint[mlrun.common.schemas.model_monitoring.EventFieldType.UID]}"
818
830
  )
819
831
  self.exception = e
832
+ logger.info(
833
+ "Done updating drift measures",
834
+ endpoint_id=endpoint[
835
+ mlrun.common.schemas.model_monitoring.EventFieldType.UID
836
+ ],
837
+ )
820
838
 
821
839
  def _get_interval_range(self) -> Tuple[datetime.datetime, datetime.datetime]:
822
840
  """Getting batch interval time range"""
823
841
  minutes, hours, days = (
824
- self.batch_dict[mlrun.common.model_monitoring.EventFieldType.MINUTES],
825
- self.batch_dict[mlrun.common.model_monitoring.EventFieldType.HOURS],
826
- self.batch_dict[mlrun.common.model_monitoring.EventFieldType.DAYS],
842
+ self.batch_dict[
843
+ mlrun.common.schemas.model_monitoring.EventFieldType.MINUTES
844
+ ],
845
+ self.batch_dict[mlrun.common.schemas.model_monitoring.EventFieldType.HOURS],
846
+ self.batch_dict[mlrun.common.schemas.model_monitoring.EventFieldType.DAYS],
827
847
  )
828
848
  start_time = datetime.datetime.now() - datetime.timedelta(
829
849
  minutes=minutes, hours=hours, days=days
@@ -843,7 +863,7 @@ class BatchProcessor:
843
863
  pair_list = pair.split(":")
844
864
  self.batch_dict[pair_list[0]] = float(pair_list[1])
845
865
 
846
- def _update_drift_in_input_stream(
866
+ def _update_drift_in_v3io_tsdb(
847
867
  self,
848
868
  endpoint_id: str,
849
869
  drift_status: DriftStatus,
@@ -888,7 +908,7 @@ class BatchProcessor:
888
908
  "endpoint_id": endpoint_id,
889
909
  "timestamp": pd.to_datetime(
890
910
  timestamp,
891
- format=mlrun.common.model_monitoring.EventFieldType.TIME_FORMAT,
911
+ format=mlrun.common.schemas.model_monitoring.EventFieldType.TIME_FORMAT,
892
912
  ),
893
913
  "record_type": "drift_measures",
894
914
  "tvd_mean": drift_result["tvd_mean"],
@@ -911,6 +931,63 @@ class BatchProcessor:
911
931
  endpoint=endpoint_id,
912
932
  )
913
933
 
934
+ def _update_drift_in_prometheus(
935
+ self,
936
+ endpoint_id: str,
937
+ drift_status: DriftStatus,
938
+ drift_result: Dict[str, Dict[str, Any]],
939
+ ):
940
+ """Push drift metrics to Prometheus registry. Please note that the metrics are being pushed through HTTP
941
+ to the monitoring stream pod that writes them into a local registry. Afterwards, Prometheus wil scrape these
942
+ metrics that will be available in the Grafana charts.
943
+
944
+ :param endpoint_id: The unique id of the model endpoint.
945
+ :param drift_status: Drift status result. Possible values can be found under DriftStatus enum class.
946
+ :param drift_result: A dictionary that includes the drift results for each feature.
947
+
948
+
949
+ """
950
+ stream_http_path = (
951
+ mlrun.mlconf.model_endpoint_monitoring.default_http_sink.format(
952
+ project=self.project
953
+ )
954
+ )
955
+
956
+ statistical_metrics = ["hellinger_mean", "tvd_mean", "kld_mean"]
957
+ metrics = []
958
+ for metric in statistical_metrics:
959
+ metrics.append(
960
+ {
961
+ mlrun.common.schemas.model_monitoring.EventFieldType.ENDPOINT_ID: endpoint_id,
962
+ mlrun.common.schemas.model_monitoring.EventFieldType.METRIC: metric,
963
+ mlrun.common.schemas.model_monitoring.EventFieldType.VALUE: drift_result[
964
+ metric
965
+ ],
966
+ }
967
+ )
968
+
969
+ http_session = mlrun.utils.HTTPSessionWithRetry(
970
+ retry_on_post=True,
971
+ verbose=True,
972
+ )
973
+
974
+ http_session.request(
975
+ method="POST",
976
+ url=stream_http_path + "/monitoring-batch-metrics",
977
+ data=json.dumps(metrics),
978
+ )
979
+
980
+ drift_status_dict = {
981
+ mlrun.common.schemas.model_monitoring.EventFieldType.ENDPOINT_ID: endpoint_id,
982
+ mlrun.common.schemas.model_monitoring.EventFieldType.DRIFT_STATUS: drift_status.value,
983
+ }
984
+
985
+ http_session.request(
986
+ method="POST",
987
+ url=stream_http_path + "/monitoring-drift-status",
988
+ data=json.dumps(drift_status_dict),
989
+ )
990
+
914
991
 
915
992
  def handler(context: mlrun.run.MLClientCtx):
916
993
  batch_processor = BatchProcessor(