mlrun 1.8.0rc26__py3-none-any.whl → 1.8.0rc28__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic. Click here for more details.
- mlrun/__main__.py +3 -2
- mlrun/artifacts/document.py +9 -6
- mlrun/artifacts/model.py +19 -4
- mlrun/common/model_monitoring/helpers.py +2 -2
- mlrun/common/schemas/model_monitoring/constants.py +0 -1
- mlrun/common/schemas/serving.py +22 -0
- mlrun/config.py +22 -9
- mlrun/datastore/base.py +0 -7
- mlrun/datastore/s3.py +9 -2
- mlrun/db/base.py +2 -1
- mlrun/db/httpdb.py +17 -10
- mlrun/db/nopdb.py +2 -1
- mlrun/execution.py +15 -4
- mlrun/lists.py +4 -1
- mlrun/model.py +2 -0
- mlrun/model_monitoring/applications/_application_steps.py +1 -0
- mlrun/model_monitoring/applications/base.py +132 -21
- mlrun/model_monitoring/applications/context.py +2 -3
- mlrun/model_monitoring/controller.py +117 -57
- mlrun/model_monitoring/db/_schedules.py +8 -0
- mlrun/model_monitoring/db/tsdb/__init__.py +12 -5
- mlrun/model_monitoring/stream_processing.py +3 -2
- mlrun/projects/project.py +44 -7
- mlrun/runtimes/base.py +1 -1
- mlrun/runtimes/generators.py +1 -1
- mlrun/runtimes/nuclio/function.py +37 -0
- mlrun/runtimes/nuclio/serving.py +3 -0
- mlrun/runtimes/pod.py +1 -3
- mlrun/serving/routers.py +62 -17
- mlrun/serving/server.py +11 -0
- mlrun/serving/states.py +0 -4
- mlrun/serving/v2_serving.py +45 -10
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.8.0rc26.dist-info → mlrun-1.8.0rc28.dist-info}/METADATA +4 -2
- {mlrun-1.8.0rc26.dist-info → mlrun-1.8.0rc28.dist-info}/RECORD +39 -38
- {mlrun-1.8.0rc26.dist-info → mlrun-1.8.0rc28.dist-info}/LICENSE +0 -0
- {mlrun-1.8.0rc26.dist-info → mlrun-1.8.0rc28.dist-info}/WHEEL +0 -0
- {mlrun-1.8.0rc26.dist-info → mlrun-1.8.0rc28.dist-info}/entry_points.txt +0 -0
- {mlrun-1.8.0rc26.dist-info → mlrun-1.8.0rc28.dist-info}/top_level.txt +0 -0
|
@@ -215,6 +215,116 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
|
|
|
215
215
|
)
|
|
216
216
|
function.deploy()
|
|
217
217
|
|
|
218
|
+
@classmethod
|
|
219
|
+
def get_job_handler(cls, handler_to_class: str) -> str:
|
|
220
|
+
"""
|
|
221
|
+
A helper function to get the handler to the application job ``_handler``.
|
|
222
|
+
|
|
223
|
+
:param handler_to_class: The handler to the application class, e.g. ``my_package.sub_module1.MonitoringApp1``.
|
|
224
|
+
:returns: The handler to the job of the application class.
|
|
225
|
+
"""
|
|
226
|
+
return f"{handler_to_class}::{cls._handler.__name__}"
|
|
227
|
+
|
|
228
|
+
@classmethod
|
|
229
|
+
def to_job(
|
|
230
|
+
cls,
|
|
231
|
+
*,
|
|
232
|
+
class_handler: Optional[str] = None,
|
|
233
|
+
func_path: Optional[str] = None,
|
|
234
|
+
func_name: Optional[str] = None,
|
|
235
|
+
tag: Optional[str] = None,
|
|
236
|
+
image: Optional[str] = None,
|
|
237
|
+
with_repo: Optional[bool] = False,
|
|
238
|
+
requirements: Optional[Union[str, list[str]]] = None,
|
|
239
|
+
requirements_file: str = "",
|
|
240
|
+
project: Optional["mlrun.MlrunProject"] = None,
|
|
241
|
+
) -> mlrun.runtimes.KubejobRuntime:
|
|
242
|
+
"""
|
|
243
|
+
Get the application's :py:meth:`~mlrun.model_monitoring.applications.ModelMonitoringApplicationBase.do_tracking`
|
|
244
|
+
model monitoring logic as a :py:class:`~mlrun.runtimes.KubejobRuntime`.
|
|
245
|
+
|
|
246
|
+
The returned job can be run as any MLRun job with the relevant inputs and params to your application:
|
|
247
|
+
|
|
248
|
+
.. code-block:: python
|
|
249
|
+
|
|
250
|
+
job = ModelMonitoringApplicationBase.to_job(
|
|
251
|
+
class_handler="package.module.AppClass"
|
|
252
|
+
)
|
|
253
|
+
job.run(inputs={}, params={}, local=False) # Add the relevant inputs and params
|
|
254
|
+
|
|
255
|
+
Optional inputs:
|
|
256
|
+
|
|
257
|
+
* ``sample_data``, ``pd.DataFrame``
|
|
258
|
+
* ``reference_data``, ``pd.DataFrame``
|
|
259
|
+
|
|
260
|
+
Optional params:
|
|
261
|
+
|
|
262
|
+
* ``endpoints``, ``list[tuple[str, str]]``
|
|
263
|
+
* ``start``, ``datetime``
|
|
264
|
+
* ``end``, ``datetime``
|
|
265
|
+
* ``base_period``, ``int``
|
|
266
|
+
|
|
267
|
+
For Git sources, add the source archive to the returned job and change the handler:
|
|
268
|
+
|
|
269
|
+
.. code-block:: python
|
|
270
|
+
|
|
271
|
+
handler = ModelMonitoringApplicationBase.get_job_handler("module.AppClass")
|
|
272
|
+
job.with_source_archive(
|
|
273
|
+
"git://github.com/owner/repo.git#branch-category/specific-task",
|
|
274
|
+
workdir="path/to/application/folder",
|
|
275
|
+
handler=handler,
|
|
276
|
+
)
|
|
277
|
+
|
|
278
|
+
:param class_handler: The handler to the class, e.g. ``path.to.module::MonitoringApplication``,
|
|
279
|
+
useful when using Git sources or code from images.
|
|
280
|
+
If ``None``, the current class, deriving from
|
|
281
|
+
:py:class:`~mlrun.model_monitoring.applications.ModelMonitoringApplicationBase`,
|
|
282
|
+
is used.
|
|
283
|
+
:param func_path: The path to the function. If ``None``, the current notebook is used.
|
|
284
|
+
:param func_name: The name of the function. If ``None``, the class name is used.
|
|
285
|
+
:param tag: Tag for the function.
|
|
286
|
+
:param image: Docker image to run the job on (when running remotely).
|
|
287
|
+
:param with_repo: Whether to clone the current repo to the build source.
|
|
288
|
+
:param requirements: List of Python requirements to be installed in the image.
|
|
289
|
+
:param requirements_file: Path to a Python requirements file to be installed in the image.
|
|
290
|
+
:param project: The current project to set the function to. If not set, the current project is used.
|
|
291
|
+
|
|
292
|
+
:returns: The :py:class:`~mlrun.runtimes.KubejobRuntime` job that wraps the model monitoring application's
|
|
293
|
+
logic.
|
|
294
|
+
"""
|
|
295
|
+
project = project or cast("mlrun.MlrunProject", mlrun.get_current_project())
|
|
296
|
+
|
|
297
|
+
if not class_handler and cls == ModelMonitoringApplicationBase:
|
|
298
|
+
raise ValueError(
|
|
299
|
+
"You must provide a handler to the model monitoring application class"
|
|
300
|
+
)
|
|
301
|
+
|
|
302
|
+
handler_to_class = class_handler or cls.__name__
|
|
303
|
+
handler = cls.get_job_handler(handler_to_class)
|
|
304
|
+
|
|
305
|
+
if not class_handler:
|
|
306
|
+
class_name = cls.__name__
|
|
307
|
+
else:
|
|
308
|
+
class_name = handler_to_class.split(".")[-1].split("::")[-1]
|
|
309
|
+
|
|
310
|
+
job_name = func_name if func_name else class_name
|
|
311
|
+
|
|
312
|
+
job = cast(
|
|
313
|
+
mlrun.runtimes.KubejobRuntime,
|
|
314
|
+
project.set_function(
|
|
315
|
+
func=func_path,
|
|
316
|
+
name=job_name,
|
|
317
|
+
kind=mlrun.runtimes.KubejobRuntime.kind,
|
|
318
|
+
handler=handler,
|
|
319
|
+
tag=tag,
|
|
320
|
+
image=image,
|
|
321
|
+
with_repo=with_repo,
|
|
322
|
+
requirements=requirements,
|
|
323
|
+
requirements_file=requirements_file,
|
|
324
|
+
),
|
|
325
|
+
)
|
|
326
|
+
return job
|
|
327
|
+
|
|
218
328
|
@classmethod
|
|
219
329
|
def evaluate(
|
|
220
330
|
cls,
|
|
@@ -223,10 +333,12 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
|
|
|
223
333
|
*,
|
|
224
334
|
tag: Optional[str] = None,
|
|
225
335
|
run_local: bool = True,
|
|
336
|
+
auto_build: bool = True,
|
|
226
337
|
sample_data: Optional[pd.DataFrame] = None,
|
|
227
338
|
reference_data: Optional[pd.DataFrame] = None,
|
|
228
339
|
image: Optional[str] = None,
|
|
229
340
|
with_repo: Optional[bool] = False,
|
|
341
|
+
class_handler: Optional[str] = None,
|
|
230
342
|
requirements: Optional[Union[str, list[str]]] = None,
|
|
231
343
|
requirements_file: str = "",
|
|
232
344
|
endpoints: Optional[list[tuple[str, str]]] = None,
|
|
@@ -239,19 +351,21 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
|
|
|
239
351
|
:py:meth:`~mlrun.model_monitoring.applications.ModelMonitoringApplicationBase.do_tracking`
|
|
240
352
|
model monitoring logic as a :py:class:`~mlrun.runtimes.KubejobRuntime`, which is an MLRun function.
|
|
241
353
|
|
|
242
|
-
This
|
|
354
|
+
This function has default values for all of its arguments. You should change them when you want to pass
|
|
243
355
|
data to the application.
|
|
244
356
|
|
|
245
357
|
:param func_path: The path to the function. If ``None``, the current notebook is used.
|
|
246
358
|
:param func_name: The name of the function. If ``None``, the class name is used.
|
|
247
359
|
:param tag: Tag for the function.
|
|
248
360
|
:param run_local: Whether to run the function locally or remotely.
|
|
361
|
+
:param auto_build: Whether to auto build the function.
|
|
249
362
|
:param sample_data: Pandas data-frame as the current dataset.
|
|
250
363
|
When set, it replaces the data read from the model endpoint's offline source.
|
|
251
364
|
:param reference_data: Pandas data-frame of the reference dataset.
|
|
252
365
|
When set, its statistics override the model endpoint's feature statistics.
|
|
253
|
-
:param image: Docker image to run the job on.
|
|
366
|
+
:param image: Docker image to run the job on (when running remotely).
|
|
254
367
|
:param with_repo: Whether to clone the current repo to the build source.
|
|
368
|
+
:param class_handler: The relative path to the class, useful when using Git sources or code from images.
|
|
255
369
|
:param requirements: List of Python requirements to be installed in the image.
|
|
256
370
|
:param requirements_file: Path to a Python requirements file to be installed in the image.
|
|
257
371
|
:param endpoints: A list of tuples of the model endpoint (name, uid) to get the data from.
|
|
@@ -268,23 +382,17 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
|
|
|
268
382
|
method with the given parameters and inputs, wrapped in a :py:class:`~mlrun.model.RunObject`.
|
|
269
383
|
"""
|
|
270
384
|
project = cast("mlrun.MlrunProject", mlrun.get_current_project())
|
|
271
|
-
class_name = cls.__name__
|
|
272
|
-
job_name = func_name if func_name is not None else class_name
|
|
273
|
-
handler = f"{class_name}::{cls._handler.__name__}"
|
|
274
385
|
|
|
275
|
-
job =
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
requirements=requirements,
|
|
286
|
-
requirements_file=requirements_file,
|
|
287
|
-
),
|
|
386
|
+
job = cls.to_job(
|
|
387
|
+
func_path=func_path,
|
|
388
|
+
func_name=func_name,
|
|
389
|
+
class_handler=class_handler,
|
|
390
|
+
tag=tag,
|
|
391
|
+
image=image,
|
|
392
|
+
with_repo=with_repo,
|
|
393
|
+
requirements=requirements,
|
|
394
|
+
requirements_file=requirements_file,
|
|
395
|
+
project=project,
|
|
288
396
|
)
|
|
289
397
|
|
|
290
398
|
params: dict[str, Union[list[tuple[str, str]], datetime, int, None]] = {}
|
|
@@ -305,18 +413,21 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
|
|
|
305
413
|
(reference_data, "reference_data"),
|
|
306
414
|
]:
|
|
307
415
|
if data is not None:
|
|
308
|
-
key = f"{
|
|
416
|
+
key = f"{job.metadata.name}_{identifier}"
|
|
309
417
|
inputs[identifier] = project.log_dataset(
|
|
310
418
|
key,
|
|
311
419
|
data,
|
|
312
420
|
labels={
|
|
313
421
|
mlrun_constants.MLRunInternalLabels.runner_pod: socket.gethostname(),
|
|
314
422
|
mlrun_constants.MLRunInternalLabels.producer_type: "model-monitoring-job",
|
|
315
|
-
mlrun_constants.MLRunInternalLabels.app_name:
|
|
423
|
+
mlrun_constants.MLRunInternalLabels.app_name: func_name
|
|
424
|
+
or cls.__name__,
|
|
316
425
|
},
|
|
317
426
|
).uri
|
|
318
427
|
|
|
319
|
-
run_result = job.run(
|
|
428
|
+
run_result = job.run(
|
|
429
|
+
local=run_local, auto_build=auto_build, params=params, inputs=inputs
|
|
430
|
+
)
|
|
320
431
|
return run_result
|
|
321
432
|
|
|
322
433
|
@abstractmethod
|
|
@@ -169,9 +169,8 @@ class MonitoringApplicationContext:
|
|
|
169
169
|
sample_df: Optional[pd.DataFrame] = None,
|
|
170
170
|
feature_stats: Optional[FeatureStats] = None,
|
|
171
171
|
) -> "MonitoringApplicationContext":
|
|
172
|
-
project = mlrun.load_project(url=graph_context.project)
|
|
173
172
|
nuclio_logger = graph_context.logger
|
|
174
|
-
artifacts_logger =
|
|
173
|
+
artifacts_logger = graph_context.project_obj
|
|
175
174
|
logger = mlrun.utils.create_logger(
|
|
176
175
|
level=mlrun.mlconf.log_level,
|
|
177
176
|
formatter_kind=mlrun.mlconf.log_formatter,
|
|
@@ -180,7 +179,7 @@ class MonitoringApplicationContext:
|
|
|
180
179
|
return cls(
|
|
181
180
|
application_name=application_name,
|
|
182
181
|
event=event,
|
|
183
|
-
project=
|
|
182
|
+
project=graph_context.project_obj,
|
|
184
183
|
model_endpoint_dict=model_endpoint_dict,
|
|
185
184
|
logger=logger,
|
|
186
185
|
nuclio_logger=nuclio_logger,
|
|
@@ -12,6 +12,7 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
|
+
import concurrent.futures
|
|
15
16
|
import datetime
|
|
16
17
|
import json
|
|
17
18
|
import os
|
|
@@ -138,7 +139,9 @@ class _BatchWindow:
|
|
|
138
139
|
|
|
139
140
|
|
|
140
141
|
class _BatchWindowGenerator(AbstractContextManager):
|
|
141
|
-
def __init__(
|
|
142
|
+
def __init__(
|
|
143
|
+
self, project: str, endpoint_id: str, window_length: Optional[int] = None
|
|
144
|
+
) -> None:
|
|
142
145
|
"""
|
|
143
146
|
Initialize a batch window generator object that generates batch window objects
|
|
144
147
|
for the monitoring functions.
|
|
@@ -165,6 +168,12 @@ class _BatchWindowGenerator(AbstractContextManager):
|
|
|
165
168
|
exc_type=exc_type, exc_value=exc_value, traceback=traceback
|
|
166
169
|
)
|
|
167
170
|
|
|
171
|
+
def get_application_list(self) -> set[str]:
|
|
172
|
+
return self._schedules_file.get_application_list()
|
|
173
|
+
|
|
174
|
+
def get_min_last_analyzed(self) -> Optional[int]:
|
|
175
|
+
return self._schedules_file.get_min_timestamp()
|
|
176
|
+
|
|
168
177
|
@classmethod
|
|
169
178
|
def _get_last_updated_time(
|
|
170
179
|
cls, last_request: datetime.datetime, not_batch_endpoint: bool
|
|
@@ -234,8 +243,7 @@ class MonitoringApplicationController:
|
|
|
234
243
|
def __init__(self) -> None:
|
|
235
244
|
"""Initialize Monitoring Application Controller"""
|
|
236
245
|
self.project = cast(str, mlrun.mlconf.default_project)
|
|
237
|
-
self.project_obj = mlrun.
|
|
238
|
-
|
|
246
|
+
self.project_obj = mlrun.get_run_db().get_project(name=self.project)
|
|
239
247
|
logger.debug(f"Initializing {self.__class__.__name__}", project=self.project)
|
|
240
248
|
|
|
241
249
|
self._window_length = _get_window_length()
|
|
@@ -255,8 +263,10 @@ class MonitoringApplicationController:
|
|
|
255
263
|
return access_key
|
|
256
264
|
|
|
257
265
|
@staticmethod
|
|
258
|
-
def _should_monitor_endpoint(
|
|
259
|
-
|
|
266
|
+
def _should_monitor_endpoint(
|
|
267
|
+
endpoint: mlrun.common.schemas.ModelEndpoint, application_names: set
|
|
268
|
+
) -> bool:
|
|
269
|
+
if (
|
|
260
270
|
# Is the model endpoint monitored?
|
|
261
271
|
endpoint.status.monitoring_mode == mm_constants.ModelMonitoringMode.enabled
|
|
262
272
|
# Was the model endpoint called? I.e., are the first and last requests nonempty?
|
|
@@ -265,7 +275,40 @@ class MonitoringApplicationController:
|
|
|
265
275
|
# Is the model endpoint not a router endpoint? Router endpoint has no feature stats
|
|
266
276
|
and endpoint.metadata.endpoint_type.value
|
|
267
277
|
!= mm_constants.EndpointType.ROUTER.value
|
|
268
|
-
)
|
|
278
|
+
):
|
|
279
|
+
with _BatchWindowGenerator(
|
|
280
|
+
project=endpoint.metadata.project,
|
|
281
|
+
endpoint_id=endpoint.metadata.uid,
|
|
282
|
+
) as batch_window_generator:
|
|
283
|
+
if application_names != batch_window_generator.get_application_list():
|
|
284
|
+
return True
|
|
285
|
+
elif (
|
|
286
|
+
not batch_window_generator.get_min_last_analyzed()
|
|
287
|
+
or batch_window_generator.get_min_last_analyzed()
|
|
288
|
+
<= int(endpoint.status.last_request.timestamp())
|
|
289
|
+
):
|
|
290
|
+
return True
|
|
291
|
+
else:
|
|
292
|
+
logger.info(
|
|
293
|
+
"All the possible intervals were already analyzed, didn't push regular event",
|
|
294
|
+
endpoint_id=endpoint.metadata.uid,
|
|
295
|
+
last_analyzed=datetime.datetime.fromtimestamp(
|
|
296
|
+
batch_window_generator.get_min_last_analyzed(),
|
|
297
|
+
tz=datetime.timezone.utc,
|
|
298
|
+
),
|
|
299
|
+
last_request=endpoint.status.last_request,
|
|
300
|
+
)
|
|
301
|
+
else:
|
|
302
|
+
logger.info(
|
|
303
|
+
"Should not monitor model endpoint, didn't push regular event",
|
|
304
|
+
endpoint_id=endpoint.metadata.uid,
|
|
305
|
+
endpoint_name=endpoint.metadata.name,
|
|
306
|
+
last_request=endpoint.status.last_request,
|
|
307
|
+
first_request=endpoint.status.first_request,
|
|
308
|
+
endpoint_type=endpoint.metadata.endpoint_type,
|
|
309
|
+
feature_set_uri=endpoint.spec.monitoring_feature_set_uri,
|
|
310
|
+
)
|
|
311
|
+
return False
|
|
269
312
|
|
|
270
313
|
def run(self, event: nuclio_sdk.Event) -> None:
|
|
271
314
|
"""
|
|
@@ -314,7 +357,7 @@ class MonitoringApplicationController:
|
|
|
314
357
|
)
|
|
315
358
|
m_fs = fstore.get_feature_set(event[ControllerEvent.FEATURE_SET_URI])
|
|
316
359
|
logger.info(
|
|
317
|
-
"Starting analyzing for
|
|
360
|
+
"Starting analyzing for", timestamp=event[ControllerEvent.TIMESTAMP]
|
|
318
361
|
)
|
|
319
362
|
last_stream_timestamp = datetime.datetime.fromisoformat(
|
|
320
363
|
event[ControllerEvent.TIMESTAMP]
|
|
@@ -370,7 +413,7 @@ class MonitoringApplicationController:
|
|
|
370
413
|
current_time = mlrun.utils.datetime_now()
|
|
371
414
|
if (
|
|
372
415
|
current_time.timestamp()
|
|
373
|
-
- batch_window_generator.
|
|
416
|
+
- batch_window_generator.get_min_last_analyzed()
|
|
374
417
|
>= datetime.timedelta(minutes=base_period).total_seconds()
|
|
375
418
|
and event[ControllerEvent.KIND] != ControllerEventKind.NOP_EVENT
|
|
376
419
|
):
|
|
@@ -399,6 +442,9 @@ class MonitoringApplicationController:
|
|
|
399
442
|
event=event,
|
|
400
443
|
endpoint_id=endpoint_id,
|
|
401
444
|
)
|
|
445
|
+
logger.info(
|
|
446
|
+
"Finish analyze for", timestamp=event[ControllerEvent.TIMESTAMP]
|
|
447
|
+
)
|
|
402
448
|
|
|
403
449
|
except Exception:
|
|
404
450
|
logger.exception(
|
|
@@ -455,17 +501,14 @@ class MonitoringApplicationController:
|
|
|
455
501
|
[data]
|
|
456
502
|
)
|
|
457
503
|
|
|
458
|
-
def push_regular_event_to_controller_stream(self
|
|
504
|
+
def push_regular_event_to_controller_stream(self) -> None:
|
|
459
505
|
"""
|
|
460
506
|
pushes a regular event to the controller stream.
|
|
461
507
|
:param event: the nuclio trigger event
|
|
462
508
|
"""
|
|
463
509
|
logger.info("Starting monitoring controller chief")
|
|
464
510
|
applications_names = []
|
|
465
|
-
|
|
466
|
-
endpoints = db.list_model_endpoints(
|
|
467
|
-
project=self.project, tsdb_metrics=True
|
|
468
|
-
).endpoints
|
|
511
|
+
endpoints = self.project_obj.list_model_endpoints(tsdb_metrics=True).endpoints
|
|
469
512
|
if not endpoints:
|
|
470
513
|
logger.info("No model endpoints found", project=self.project)
|
|
471
514
|
return
|
|
@@ -505,48 +548,59 @@ class MonitoringApplicationController:
|
|
|
505
548
|
// 60
|
|
506
549
|
),
|
|
507
550
|
}
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
),
|
|
517
|
-
|
|
518
|
-
sep=" ", timespec="microseconds"
|
|
519
|
-
),
|
|
520
|
-
endpoint_type=endpoint.metadata.endpoint_type,
|
|
521
|
-
feature_set_uri=endpoint.spec.monitoring_feature_set_uri,
|
|
522
|
-
endpoint_policy=json.dumps(policy),
|
|
523
|
-
)
|
|
524
|
-
self.push_to_controller_stream(
|
|
525
|
-
kind=mm_constants.ControllerEventKind.REGULAR_EVENT,
|
|
526
|
-
project=self.project,
|
|
527
|
-
endpoint_id=endpoint.metadata.uid,
|
|
528
|
-
endpoint_name=endpoint.metadata.name,
|
|
529
|
-
stream_access_key=self.v3io_access_key,
|
|
530
|
-
timestamp=endpoint.status.last_request.isoformat(
|
|
531
|
-
sep=" ", timespec="microseconds"
|
|
532
|
-
),
|
|
533
|
-
first_request=endpoint.status.first_request.isoformat(
|
|
534
|
-
sep=" ", timespec="microseconds"
|
|
535
|
-
),
|
|
536
|
-
endpoint_type=endpoint.metadata.endpoint_type,
|
|
537
|
-
feature_set_uri=endpoint.spec.monitoring_feature_set_uri,
|
|
538
|
-
endpoint_policy=policy,
|
|
539
|
-
)
|
|
540
|
-
else:
|
|
541
|
-
logger.info(
|
|
542
|
-
"Should not monitor model endpoint, didn't push regular event",
|
|
543
|
-
endpoint_id=endpoint.metadata.uid,
|
|
544
|
-
endpoint_name=endpoint.metadata.name,
|
|
545
|
-
timestamp=endpoint.status.last_request,
|
|
546
|
-
first_request=endpoint.status.first_request,
|
|
547
|
-
endpoint_type=endpoint.metadata.endpoint_type,
|
|
548
|
-
feature_set_uri=endpoint.spec.monitoring_feature_set_uri,
|
|
551
|
+
with concurrent.futures.ThreadPoolExecutor(
|
|
552
|
+
max_workers=min(len(endpoints), 10)
|
|
553
|
+
) as pool:
|
|
554
|
+
for endpoint in endpoints:
|
|
555
|
+
pool.submit(
|
|
556
|
+
MonitoringApplicationController.endpoint_to_regular_event,
|
|
557
|
+
endpoint,
|
|
558
|
+
policy,
|
|
559
|
+
set(applications_names),
|
|
560
|
+
self.v3io_access_key,
|
|
549
561
|
)
|
|
562
|
+
logger.info("Finishing monitoring controller chief")
|
|
563
|
+
|
|
564
|
+
@staticmethod
|
|
565
|
+
def endpoint_to_regular_event(
|
|
566
|
+
endpoint: mlrun.common.schemas.ModelEndpoint,
|
|
567
|
+
policy: dict,
|
|
568
|
+
applications_names: set,
|
|
569
|
+
v3io_access_key: str,
|
|
570
|
+
) -> None:
|
|
571
|
+
if MonitoringApplicationController._should_monitor_endpoint(
|
|
572
|
+
endpoint, set(applications_names)
|
|
573
|
+
):
|
|
574
|
+
logger.info(
|
|
575
|
+
"Regular event is being pushed to controller stream for model endpoint",
|
|
576
|
+
endpoint_id=endpoint.metadata.uid,
|
|
577
|
+
endpoint_name=endpoint.metadata.name,
|
|
578
|
+
timestamp=endpoint.status.last_request.isoformat(
|
|
579
|
+
sep=" ", timespec="microseconds"
|
|
580
|
+
),
|
|
581
|
+
first_request=endpoint.status.first_request.isoformat(
|
|
582
|
+
sep=" ", timespec="microseconds"
|
|
583
|
+
),
|
|
584
|
+
endpoint_type=endpoint.metadata.endpoint_type,
|
|
585
|
+
feature_set_uri=endpoint.spec.monitoring_feature_set_uri,
|
|
586
|
+
endpoint_policy=json.dumps(policy),
|
|
587
|
+
)
|
|
588
|
+
MonitoringApplicationController.push_to_controller_stream(
|
|
589
|
+
kind=mm_constants.ControllerEventKind.REGULAR_EVENT,
|
|
590
|
+
project=endpoint.metadata.project,
|
|
591
|
+
endpoint_id=endpoint.metadata.uid,
|
|
592
|
+
endpoint_name=endpoint.metadata.name,
|
|
593
|
+
stream_access_key=v3io_access_key,
|
|
594
|
+
timestamp=endpoint.status.last_request.isoformat(
|
|
595
|
+
sep=" ", timespec="microseconds"
|
|
596
|
+
),
|
|
597
|
+
first_request=endpoint.status.first_request.isoformat(
|
|
598
|
+
sep=" ", timespec="microseconds"
|
|
599
|
+
),
|
|
600
|
+
endpoint_type=endpoint.metadata.endpoint_type.value,
|
|
601
|
+
feature_set_uri=endpoint.spec.monitoring_feature_set_uri,
|
|
602
|
+
endpoint_policy=policy,
|
|
603
|
+
)
|
|
550
604
|
|
|
551
605
|
@staticmethod
|
|
552
606
|
def push_to_controller_stream(
|
|
@@ -557,7 +611,7 @@ class MonitoringApplicationController:
|
|
|
557
611
|
stream_access_key: str,
|
|
558
612
|
timestamp: str,
|
|
559
613
|
first_request: str,
|
|
560
|
-
endpoint_type:
|
|
614
|
+
endpoint_type: int,
|
|
561
615
|
feature_set_uri: str,
|
|
562
616
|
endpoint_policy: dict[str, Any],
|
|
563
617
|
) -> None:
|
|
@@ -633,7 +687,13 @@ def handler(context: nuclio_sdk.Context, event: nuclio_sdk.Event) -> None:
|
|
|
633
687
|
|
|
634
688
|
if event.trigger.kind == "http":
|
|
635
689
|
# Runs controller chief:
|
|
636
|
-
|
|
690
|
+
context.user_data.monitor_app_controller.push_regular_event_to_controller_stream()
|
|
637
691
|
else:
|
|
638
692
|
# Runs controller worker:
|
|
639
|
-
|
|
693
|
+
context.user_data.monitor_app_controller.run(event)
|
|
694
|
+
|
|
695
|
+
|
|
696
|
+
def init_context(context):
|
|
697
|
+
monitor_app_controller = MonitoringApplicationController()
|
|
698
|
+
setattr(context.user_data, "monitor_app_controller", monitor_app_controller)
|
|
699
|
+
context.logger.info("Monitoring application controller initialized")
|
|
@@ -140,6 +140,14 @@ class ModelMonitoringSchedulesFile(AbstractContextManager):
|
|
|
140
140
|
self._check_open_schedules()
|
|
141
141
|
self._schedules[application] = timestamp
|
|
142
142
|
|
|
143
|
+
def get_application_list(self) -> set[str]:
|
|
144
|
+
self._check_open_schedules()
|
|
145
|
+
return set(self._schedules.keys())
|
|
146
|
+
|
|
147
|
+
def get_min_timestamp(self) -> Optional[int]:
|
|
148
|
+
self._check_open_schedules()
|
|
149
|
+
return min(self._schedules.values(), default=None)
|
|
150
|
+
|
|
143
151
|
|
|
144
152
|
def delete_model_monitoring_schedules_folder(project: str) -> None:
|
|
145
153
|
"""Delete the model monitoring schedules folder of the project"""
|
|
@@ -75,10 +75,11 @@ def get_tsdb_connector(
|
|
|
75
75
|
:param secret_provider: An optional secret provider to get the connection string secret.
|
|
76
76
|
:param profile: An optional profile to initialize the TSDB connector from.
|
|
77
77
|
|
|
78
|
-
:return:
|
|
78
|
+
:return: ``TSDBConnector`` object. The main goal of this object is to handle different operations on the
|
|
79
79
|
TSDB connector such as updating drift metrics or writing application record results.
|
|
80
|
-
:raise:
|
|
81
|
-
|
|
80
|
+
:raise: ``MLRunNotFoundError`` if the user didn't set the TSDB datastore profile and didn't provide it through
|
|
81
|
+
the ``profile`` parameter.
|
|
82
|
+
:raise: ``MLRunInvalidMMStoreTypeError`` if the TSDB datastore profile is of an invalid type.
|
|
82
83
|
"""
|
|
83
84
|
profile = profile or mlrun.model_monitoring.helpers._get_tsdb_profile(
|
|
84
85
|
project=project, secret_provider=secret_provider
|
|
@@ -93,9 +94,15 @@ def get_tsdb_connector(
|
|
|
93
94
|
tsdb_connector_type = mlrun.common.schemas.model_monitoring.TSDBTarget.TDEngine
|
|
94
95
|
kwargs["connection_string"] = profile.dsn()
|
|
95
96
|
else:
|
|
97
|
+
extra_message = (
|
|
98
|
+
""
|
|
99
|
+
if profile
|
|
100
|
+
else " by using `project.set_model_monitoring_credentials` API"
|
|
101
|
+
)
|
|
96
102
|
raise mlrun.errors.MLRunInvalidMMStoreTypeError(
|
|
97
|
-
"You must provide a valid
|
|
98
|
-
"
|
|
103
|
+
"You must provide a valid TSDB datastore profile"
|
|
104
|
+
f"{extra_message}. "
|
|
105
|
+
f"Found an unexpected profile of class: {type(profile)}"
|
|
99
106
|
)
|
|
100
107
|
|
|
101
108
|
# Get connector type value from ObjectTSDBFactory enum class
|
|
@@ -378,7 +378,9 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
|
|
|
378
378
|
def do(self, full_event):
|
|
379
379
|
event = full_event.body
|
|
380
380
|
if event.get(ControllerEvent.KIND, "") == ControllerEventKind.NOP_EVENT:
|
|
381
|
-
logger.
|
|
381
|
+
logger.debug(
|
|
382
|
+
"Skipped nop event inside of ProcessEndpointEvent", event=event
|
|
383
|
+
)
|
|
382
384
|
return storey.Event(body=[event])
|
|
383
385
|
# Getting model version and function uri from event
|
|
384
386
|
# and use them for retrieving the endpoint_id
|
|
@@ -637,7 +639,6 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
|
|
|
637
639
|
|
|
638
640
|
def do(self, event: dict):
|
|
639
641
|
if event.get(ControllerEvent.KIND, "") == ControllerEventKind.NOP_EVENT:
|
|
640
|
-
logger.info("Skipped nop event inside of MapFeatureNames", event=event)
|
|
641
642
|
return event
|
|
642
643
|
endpoint_id = event[EventFieldType.ENDPOINT_ID]
|
|
643
644
|
|