mlrun 1.10.0rc12__py3-none-any.whl → 1.10.0rc13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mlrun might be problematic.

mlrun/run.py CHANGED
@@ -895,7 +895,7 @@ def _run_pipeline(
 def retry_pipeline(
     run_id: str,
     project: str,
-) -> str:
+) -> typing.Union[str, dict[str, str]]:
     """Retry a pipeline run.
 
     This function retries a previously executed pipeline run using the specified run ID. If the run is not in a
@@ -914,10 +914,33 @@ def retry_pipeline(
             "Please set the dbpath URL."
         )
 
-    pipeline_run_id = mldb.retry_pipeline(
+    # Invoke retry pipeline run. Depending on the context, this call returns either:
+    # 1. A simple string of a workflow-id, for direct retries or non-remote workflows, or
+    # 2. A dict payload representing a WorkflowResponse when rerunning remote workflows.
+    rerun_response = mldb.retry_pipeline(
         run_id=run_id,
         project=project,
     )
+    if isinstance(rerun_response, str):
+        pipeline_run_id = rerun_response
+    else:
+        rerun_response = mlrun.common.schemas.WorkflowResponse(**rerun_response)
+
+        def _fetch_workflow_id():
+            rerun = mldb.read_run(rerun_response.run_id, project)
+            workflow_id = rerun["metadata"]["labels"].get("workflow-id")
+            if not workflow_id:
+                raise mlrun.errors.MLRunRuntimeError("workflow-id label not set yet")
+            return workflow_id
+
+        pipeline_run_id = mlrun.utils.helpers.retry_until_successful(
+            backoff=3,
+            timeout=int(mlrun.mlconf.workflows.timeouts.remote),
+            logger=logger,
+            verbose=False,
+            _function=_fetch_workflow_id,
+        )
+
     if pipeline_run_id == run_id:
         logger.info(
             f"Retried pipeline run ID={pipeline_run_id}, check UI for progress."
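The widened return type means callers can now receive either a plain workflow-id string or a dict payload for remote reruns. Below is a minimal sketch of how a caller might consume `retry_pipeline` after this change; the project and run-id values are placeholders, not taken from the diff.

```python
from mlrun.run import retry_pipeline

# Placeholder project and run ID, for illustration only.
result = retry_pipeline(run_id="<previous-pipeline-run-id>", project="<my-project>")

# The updated annotation allows either a workflow-id string or a dict payload,
# so a defensive caller checks the type before treating the result as an ID.
if isinstance(result, str):
    print(f"retried workflow: {result}")
else:
    print(f"retry response payload: {result}")
```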
mlrun/serving/server.py CHANGED
@@ -788,6 +788,7 @@ class GraphContext:
         self.verbose = False
         self.stream = None
         self.root = None
+        self.executor: Optional[storey.flow.RunnableExecutor] = None
 
         if nuclio_context:
             self.logger: NuclioLogger = nuclio_context.logger
mlrun/serving/states.py CHANGED
@@ -44,7 +44,6 @@ from mlrun.datastore.datastore_profile import (
     datastore_profile_read,
 )
 from mlrun.datastore.model_provider.model_provider import ModelProvider
-from mlrun.datastore.store_resources import get_store_resource
 from mlrun.datastore.storeytargets import KafkaStoreyTarget, StreamStoreyTarget
 from mlrun.utils import logger
 
@@ -518,7 +517,7 @@ class BaseStep(ModelObj):
                 "ModelRunnerStep can be added to 'Flow' topology graph only"
             )
         step_model_endpoints_names = list(
-            step.class_args[schemas.ModelRunnerStepData.MODELS].keys()
+            step.class_args.get(schemas.ModelRunnerStepData.MODELS, {}).keys()
         )
         # Get all model_endpoints names that are in both lists
         common_endpoints_names = list(
@@ -530,8 +529,77 @@ class BaseStep(ModelObj):
             raise GraphError(
                 f"The graph already contains the model endpoints named - {common_endpoints_names}."
             )
+
+        # Check if shared models are defined in the graph
+        self._verify_shared_models(root, step, step_model_endpoints_names)
+        # Update model endpoints names in the root step
         root.update_model_endpoints_names(step_model_endpoints_names)
 
+    @staticmethod
+    def _verify_shared_models(
+        root: "RootFlowStep",
+        step: "ModelRunnerStep",
+        step_model_endpoints_names: list[str],
+    ) -> None:
+        proxy_endpoints = [
+            name
+            for name in step_model_endpoints_names
+            if step.class_args.get(
+                schemas.ModelRunnerStepData.MODEL_TO_EXECUTION_MECHANISM, {}
+            ).get(name)
+            == ParallelExecutionMechanisms.shared_executor
+        ]
+        shared_models = []
+
+        for name in proxy_endpoints:
+            shared_runnable_name = (
+                step.class_args.get(schemas.ModelRunnerStepData.MODELS, {})
+                .get(name, ["", {}])[schemas.ModelsData.MODEL_PARAMETERS.value]
+                .get("shared_runnable_name")
+            )
+            model_artifact_uri = (
+                step.class_args.get(schemas.ModelRunnerStepData.MODELS, {})
+                .get(name, ["", {}])[schemas.ModelsData.MODEL_PARAMETERS.value]
+                .get("artifact_uri")
+            )
+            prefix, _ = mlrun.datastore.parse_store_uri(model_artifact_uri)
+            # if the model artifact is a prompt, we need to get the model URI
+            # to ensure that the shared runnable name is correct
+            if prefix == mlrun.utils.StorePrefix.LLMPrompt:
+                llm_artifact, _ = mlrun.store_manager.get_store_artifact(
+                    model_artifact_uri
+                )
+                model_artifact_uri = llm_artifact.spec.parent_uri
+            actual_shared_name = root.get_shared_model_name_by_artifact_uri(
+                model_artifact_uri
+            )
+
+            if not shared_runnable_name:
+                if not actual_shared_name:
+                    raise GraphError(
+                        f"Can't find shared model for {name} model endpoint"
+                    )
+                else:
+                    step.class_args[schemas.ModelRunnerStepData.MODELS][name][
+                        schemas.ModelsData.MODEL_PARAMETERS.value
+                    ]["shared_runnable_name"] = actual_shared_name
+                    shared_models.append(actual_shared_name)
+            elif actual_shared_name != shared_runnable_name:
+                raise GraphError(
+                    f"Model endpoint {name} shared runnable name mismatch: "
+                    f"expected {actual_shared_name}, got {shared_runnable_name}"
+                )
+            else:
+                shared_models.append(actual_shared_name)
+
+        undefined_shared_models = list(
+            set(shared_models) - set(root.shared_models.keys())
+        )
+        if undefined_shared_models:
+            raise GraphError(
+                f"The following shared models are not defined in the graph: {undefined_shared_models}."
+            )
+
 
 class TaskStep(BaseStep):
     """task execution step, runs a class or handler"""
@@ -1008,7 +1076,13 @@ class RouterStep(TaskStep):
 
 
 class Model(storey.ParallelExecutionRunnable, ModelObj):
-    _dict_fields = ["name", "raise_exception", "artifact_uri"]
+    _dict_fields = [
+        "name",
+        "raise_exception",
+        "artifact_uri",
+        "shared_runnable_name",
+    ]
+    kind = "model"
 
     def __init__(
         self,
@@ -1238,16 +1312,105 @@ class ModelRunnerStep(MonitoredStep):
         self.raise_exception = raise_exception
         self.shape = "folder"
 
+    def add_shared_model_proxy(
+        self,
+        endpoint_name: str,
+        model_artifact: Union[str, ModelArtifact, LLMPromptArtifact],
+        shared_model_name: Optional[str] = None,
+        labels: Optional[Union[list[str], dict[str, str]]] = None,
+        model_endpoint_creation_strategy: Optional[
+            schemas.ModelEndpointCreationStrategy
+        ] = schemas.ModelEndpointCreationStrategy.INPLACE,
+        inputs: Optional[list[str]] = None,
+        outputs: Optional[list[str]] = None,
+        input_path: Optional[str] = None,
+        result_path: Optional[str] = None,
+        override: bool = False,
+    ) -> None:
+        """
+        Add a proxy model to the ModelRunnerStep, which is a proxy for a model that is already defined as shared model
+        within the graph
+
+        :param endpoint_name: str, will identify the model in the ModelRunnerStep, and assign model endpoint name
+        :param model_artifact: model artifact or mlrun model artifact uri, according to the model artifact
+                               we will match the model endpoint to the correct shared model.
+        :param shared_model_name: str, the name of the shared model that is already defined within the graph
+        :param labels: model endpoint labels, should be list of str or mapping of str:str
+        :param model_endpoint_creation_strategy: Strategy for creating or updating the model endpoint:
+            * **overwrite**:
+                1. If model endpoints with the same name exist, delete the `latest` one.
+                2. Create a new model endpoint entry and set it as `latest`.
+            * **inplace** (default):
+                1. If model endpoints with the same name exist, update the `latest` entry.
+                2. Otherwise, create a new entry.
+            * **archive**:
+                1. If model endpoints with the same name exist, preserve them.
+                2. Create a new model endpoint with the same name and set it to `latest`.
+
+        :param inputs: list of the model inputs (e.g. features) ,if provided will override the inputs
+                       that been configured in the model artifact, please note that those inputs need to
+                       be equal in length and order to the inputs that model_class predict method expects
+        :param outputs: list of the model outputs (e.g. labels) ,if provided will override the outputs
+                        that been configured in the model artifact, please note that those outputs need to
+                        be equal to the model_class predict method outputs (length, and order)
+        :param input_path: input path inside the user event, expect scopes to be defined by dot notation
+                           (e.g "inputs.my_model_inputs"). expects list or dictionary type object in path.
+        :param result_path: result path inside the user output event, expect scopes to be defined by dot
+                            notation (e.g "outputs.my_model_outputs") expects list or dictionary type object
+                            in path.
+        :param override: bool allow override existing model on the current ModelRunnerStep.
+        """
+        model_class = Model(
+            name=endpoint_name,
+            shared_runnable_name=shared_model_name,
+        )
+        if isinstance(model_artifact, str):
+            model_artifact_uri = model_artifact
+        elif isinstance(model_artifact, ModelArtifact):
+            model_artifact_uri = model_artifact.uri
+        elif isinstance(model_artifact, LLMPromptArtifact):
+            model_artifact_uri = model_artifact.model_artifact.uri
+        else:
+            raise MLRunInvalidArgumentError(
+                "model_artifact must be a string, ModelArtifact or LLMPromptArtifact"
+            )
+        root = self._extract_root_step()
+        if isinstance(root, RootFlowStep):
+            shared_model_name = (
+                shared_model_name
+                or root.get_shared_model_name_by_artifact_uri(model_artifact_uri)
+            )
+            if not root.shared_models or (
+                root.shared_models
+                and shared_model_name
+                and shared_model_name not in root.shared_models.keys()
+            ):
+                raise GraphError(
+                    f"ModelRunnerStep can only add proxy models that were added to the root flow step, "
+                    f"model {shared_model_name} is not in the shared models."
+                )
+        self.add_model(
+            endpoint_name=endpoint_name,
+            model_class=model_class,
+            execution_mechanism=ParallelExecutionMechanisms.shared_executor,
+            model_artifact=model_artifact,
+            labels=labels,
+            model_endpoint_creation_strategy=model_endpoint_creation_strategy,
+            override=override,
+            inputs=inputs,
+            outputs=outputs,
+            input_path=input_path,
+            result_path=result_path,
+        )
+
     def add_model(
         self,
         endpoint_name: str,
         model_class: Union[str, Model],
         execution_mechanism: Union[str, ParallelExecutionMechanisms],
-        model_artifact: Optional[
-            Union[str, mlrun.artifacts.ModelArtifact, mlrun.artifacts.LLMPromptArtifact]
-        ] = None,
+        model_artifact: Optional[Union[str, ModelArtifact, LLMPromptArtifact]] = None,
         labels: Optional[Union[list[str], dict[str, str]]] = None,
-        creation_strategy: Optional[
+        model_endpoint_creation_strategy: Optional[
             schemas.ModelEndpointCreationStrategy
         ] = schemas.ModelEndpointCreationStrategy.INPLACE,
         inputs: Optional[list[str]] = None,
@@ -1285,7 +1448,7 @@ class ModelRunnerStep(MonitoredStep):
 
         :param model_artifact: model artifact or mlrun model artifact uri
         :param labels: model endpoint labels, should be list of str or mapping of str:str
-        :param creation_strategy: Strategy for creating or updating the model endpoint:
+        :param model_endpoint_creation_strategy: Strategy for creating or updating the model endpoint:
             * **overwrite**:
                 1. If model endpoints with the same name exist, delete the `latest` one.
                 2. Create a new model endpoint entry and set it as `latest`.
@@ -1310,7 +1473,6 @@ class ModelRunnerStep(MonitoredStep):
         :param override: bool allow override existing model on the current ModelRunnerStep.
         :param model_parameters: Parameters for model instantiation
         """
-
         if isinstance(model_class, Model) and model_parameters:
             raise mlrun.errors.MLRunInvalidArgumentError(
                 "Cannot provide a model object as argument to `model_class` and also provide `model_parameters`."
@@ -1319,10 +1481,20 @@ class ModelRunnerStep(MonitoredStep):
         model_parameters = model_parameters or (
             model_class.to_dict() if isinstance(model_class, Model) else {}
         )
-        if outputs is None and isinstance(
-            model_artifact, mlrun.artifacts.ModelArtifact
+
+        if isinstance(
+            model_artifact,
+            str,
         ):
-            outputs = [feature.name for feature in model_artifact.spec.outputs]
+            try:
+                model_artifact, _ = mlrun.store_manager.get_store_artifact(
+                    model_artifact
+                )
+            except mlrun.errors.MLRunNotFoundError:
+                raise mlrun.errors.MLRunInvalidArgumentError("Artifact not found.")
+
+        outputs = outputs or self._get_model_output_schema(model_artifact)
+
         model_artifact = (
             model_artifact.uri
             if isinstance(model_artifact, mlrun.artifacts.Artifact)
@@ -1369,7 +1541,7 @@ class ModelRunnerStep(MonitoredStep):
             schemas.MonitoringData.OUTPUTS: outputs,
             schemas.MonitoringData.INPUT_PATH: input_path,
             schemas.MonitoringData.RESULT_PATH: result_path,
-            schemas.MonitoringData.CREATION_STRATEGY: creation_strategy,
+            schemas.MonitoringData.CREATION_STRATEGY: model_endpoint_creation_strategy,
             schemas.MonitoringData.LABELS: labels,
             schemas.MonitoringData.MODEL_PATH: model_artifact,
             schemas.MonitoringData.MODEL_CLASS: model_class,
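As this hunk shows, the keyword `creation_strategy` is renamed to `model_endpoint_creation_strategy` throughout `add_model`, so call sites passing the old keyword need updating. A hedged sketch of an updated call, assuming `schemas` resolves to `mlrun.common.schemas` and that `model_runner` is a `ModelRunnerStep` already attached to a serving graph; the class name and artifact URI are placeholders.

```python
import mlrun.common.schemas as schemas

# `model_runner` is assumed to be an existing ModelRunnerStep in a flow graph.
model_runner.add_model(
    endpoint_name="my-endpoint",
    model_class="MyModelClass",
    execution_mechanism="thread_pool",
    model_artifact="store://models/<project>/<model>:latest",
    # renamed in this release; previously `creation_strategy`
    model_endpoint_creation_strategy=schemas.ModelEndpointCreationStrategy.INPLACE,
)
```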
@@ -1379,14 +1551,44 @@ class ModelRunnerStep(MonitoredStep):
 
     @staticmethod
     def _get_model_output_schema(
-        model: str, monitoring_data: dict[str, dict[str, str]]
+        model_artifact: Union[ModelArtifact, LLMPromptArtifact],
+    ) -> Optional[list[str]]:
+        if isinstance(
+            model_artifact,
+            ModelArtifact,
+        ):
+            return [feature.name for feature in model_artifact.spec.outputs]
+        elif isinstance(
+            model_artifact,
+            LLMPromptArtifact,
+        ):
+            _model_artifact = model_artifact.model_artifact
+            return [feature.name for feature in _model_artifact.spec.outputs]
+
+    @staticmethod
+    def _get_model_endpoint_output_schema(
+        name: str,
+        project: str,
+        uid: str,
     ) -> list[str]:
         output_schema = None
-        if monitoring_data[model].get(schemas.MonitoringData.MODEL_PATH) is not None:
-            artifact = get_store_resource(
-                monitoring_data[model].get(schemas.MonitoringData.MODEL_PATH)
+        try:
+            model_endpoint: mlrun.common.schemas.model_monitoring.ModelEndpoint = (
+                mlrun.db.get_run_db().get_model_endpoint(
+                    name=name,
+                    project=project,
+                    endpoint_id=uid,
+                    tsdb_metrics=False,
+                )
+            )
+            output_schema = model_endpoint.spec.label_names
+        except (
+            mlrun.errors.MLRunNotFoundError,
+            mlrun.errors.MLRunInvalidArgumentError,
+        ):
+            logger.warning(
+                f"Model endpoint not found, using default output schema for model {name}"
             )
-            output_schema = [feature.name for feature in artifact.spec.outputs]
         return output_schema
 
     @staticmethod
@@ -1407,8 +1609,14 @@ class ModelRunnerStep(MonitoredStep):
         if isinstance(monitoring_data, dict):
             for model in monitoring_data:
                 monitoring_data[model][schemas.MonitoringData.OUTPUTS] = (
-                    monitoring_data[model][schemas.MonitoringData.OUTPUTS]
-                    or self._get_model_output_schema(model, monitoring_data)
+                    monitoring_data.get(model, {}).get(schemas.MonitoringData.OUTPUTS)
+                    or self._get_model_endpoint_output_schema(
+                        name=model,
+                        project=self.context.project if self.context else None,
+                        uid=monitoring_data.get(model, {}).get(
+                            mlrun.common.schemas.MonitoringData.MODEL_ENDPOINT_UID
+                        ),
+                    )
                 )
                 # Prevent calling _get_model_output_schema for same model more than once
                 self.class_args[
@@ -1429,6 +1637,7 @@ class ModelRunnerStep(MonitoredStep):
         return monitoring_data
 
     def init_object(self, context, namespace, mode="sync", reset=False, **extra_kwargs):
+        self.context = context
         if not self._is_local_function(context):
             # skip init of non local functions
             return
@@ -1773,7 +1982,7 @@ class FlowStep(BaseStep):
         self._insert_all_error_handlers()
         self.check_and_process_graph()
 
-        for step in self._steps.values():
+        for step in self.steps.values():
             step.set_parent(self)
             step.init_object(context, namespace, mode, reset=reset)
         self._set_error_handler()
@@ -2136,6 +2345,11 @@ class RootFlowStep(FlowStep):
         "model_endpoints_names",
         "model_endpoints_routes_names",
         "track_models",
+        "shared_max_processes",
+        "shared_max_threads",
+        "shared_models",
+        "shared_models_mechanism",
+        "pool_factor",
     ]
 
     def __init__(
@@ -2156,6 +2370,140 @@ class RootFlowStep(FlowStep):
         self._models = set()
         self._route_models = set()
         self._track_models = False
+        self._shared_models: dict[str, tuple[str, dict]] = {}
+        self._shared_models_mechanism: dict[str, ParallelExecutionMechanisms] = {}
+        self._shared_max_processes = None
+        self._shared_max_threads = None
+        self._pool_factor = None
+
+    def add_shared_model(
+        self,
+        name: str,
+        model_class: Union[str, Model],
+        execution_mechanism: Union[str, ParallelExecutionMechanisms],
+        model_artifact: Optional[Union[str, ModelArtifact]],
+        override: bool = False,
+        **model_parameters,
+    ) -> None:
+        """
+        Add a shared model to the graph, this model will be available to all the ModelRunners in the graph
+        :param name: Name of the shared model (should be unique in the graph)
+        :param model_class: Model class name
+        :param execution_mechanism: Parallel execution mechanism to be used to execute this model. Must be one of:
+            * "process_pool" – To run in a separate process from a process pool. This is appropriate for CPU or GPU
+              intensive tasks as they would otherwise block the main process by holding Python's Global Interpreter
+              Lock (GIL).
+            * "dedicated_process" – To run in a separate dedicated process. This is appropriate for CPU or GPU intensive
+              tasks that also require significant Runnable-specific initialization (e.g. a large model).
+            * "thread_pool" – To run in a separate thread. This is appropriate for blocking I/O tasks, as they would
+              otherwise block the main event loop thread.
+            * "asyncio" – To run in an asyncio task. This is appropriate for I/O tasks that use asyncio, allowing the
+              event loop to continue running while waiting for a response.
+            * "shared_executor" – Reuses an external executor (typically managed by the flow or context) to execute the
+              runnable. Should be used only if you have multiply `ParallelExecution` in the same flow and especially
+              useful when:
+                - You want to share a heavy resource like a large model loaded onto a GPU.
+                - You want to centralize task scheduling or coordination for multiple lightweight tasks.
+                - You aim to minimize overhead from creating new executors or processes/threads per runnable.
+              The runnable is expected to be pre-initialized and reused across events, enabling efficient use of
+              memory and hardware accelerators.
+            * "naive" – To run in the main event loop. This is appropriate only for trivial computation and/or file I/O.
+              It means that the runnable will not actually be run in parallel to anything else.
+
+        :param model_artifact: model artifact or mlrun model artifact uri
+        :param override: bool allow override existing model on the current ModelRunnerStep.
+        :param model_parameters: Parameters for model instantiation
+        """
+        if isinstance(model_class, Model) and model_parameters:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "Cannot provide a model object as argument to `model_class` and also provide `model_parameters`."
+            )
+
+        if execution_mechanism == ParallelExecutionMechanisms.shared_executor:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "Cannot add a shared model with execution mechanism 'shared_executor'"
+            )
+        ParallelExecutionMechanisms.validate(execution_mechanism)
+
+        model_parameters = model_parameters or (
+            model_class.to_dict() if isinstance(model_class, Model) else {}
+        )
+        model_artifact = (
+            model_artifact.uri
+            if isinstance(model_artifact, mlrun.artifacts.Artifact)
+            else model_artifact
+        )
+        model_parameters["artifact_uri"] = model_parameters.get(
+            "artifact_uri", model_artifact
+        )
+
+        if model_parameters.get("name", name) != name or (
+            isinstance(model_class, Model) and model_class.name != name
+        ):
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "Inconsistent name for the added model."
+            )
+        model_parameters["name"] = name
+
+        if name in self.shared_models and not override:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                f"Model with name {name} already exists in this graph."
+            )
+
+        model_class = (
+            model_class
+            if isinstance(model_class, str)
+            else model_class.__class__.__name__
+        )
+        self.shared_models[name] = (model_class, model_parameters)
+        self.shared_models_mechanism[name] = execution_mechanism
+
+    def get_shared_model_name_by_artifact_uri(self, artifact_uri: str) -> Optional[str]:
+        """
+        Get a shared model by its artifact URI.
+        :param artifact_uri: The artifact URI of the model.
+        :return: A tuple of (model_class, model_parameters) if found, otherwise None.
+        """
+        for model_name, (model_class, model_params) in self.shared_models.items():
+            if model_params.get("artifact_uri") == artifact_uri:
+                return model_name
+        return None
+
+    def config_pool_resource(
+        self,
+        max_processes: Optional[int] = None,
+        max_threads: Optional[int] = None,
+        pool_factor: Optional[int] = None,
+    ) -> None:
+        """
+        Configure the resource limits for the shared models in the graph.
+        :param max_processes: Maximum number of processes to spawn (excluding dedicated processes).
+                              Defaults to the number of CPUs or 16 if undetectable.
+        :param max_threads: Maximum number of threads to spawn. Defaults to 32.
+        :param pool_factor: Multiplier to scale the number of process/thread workers per runnable. Defaults to 1.
+        """
+        self.shared_max_processes = max_processes
+        self.shared_max_threads = max_threads
+        self.pool_factor = pool_factor
+
+    def init_object(self, context, namespace, mode="sync", reset=False, **extra_kwargs):
+        self.context = context
+        if self.shared_models:
+            self.context.executor = storey.flow.RunnableExecutor(
+                max_processes=self.shared_max_processes,
+                max_threads=self.shared_max_threads,
+                pool_factor=self.pool_factor,
+            )
+
+            for model, model_params in self.shared_models.values():
+                model = get_class(model, namespace).from_dict(
+                    model_params, init_with_params=True
+                )
+                model._raise_exception = False
+                self.context.executor.add_runnable(
+                    model, self._shared_models_mechanism[model.name]
+                )
+        super().init_object(context, namespace, mode, reset=reset, **extra_kwargs)
 
     @property
     def model_endpoints_names(self) -> list[str]:
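Taken together, the new `RootFlowStep` APIs let a graph register a heavy model once and expose it through proxy endpoints on one or more ModelRunner steps. The sketch below illustrates the intended flow under stated assumptions: the function name, file name, class name, and artifact URI are placeholders, and `model_runner` is assumed to be a `ModelRunnerStep` that has already been added to this graph (the exact way the step is attached is not shown in this diff).

```python
import mlrun

# Placeholder serving function with a flow-topology graph; a sketch, not a verified recipe.
fn = mlrun.code_to_function("serving-example", kind="serving", filename="model_server.py")
graph = fn.set_topology("flow", engine="async")

# Register the heavy model once on the root step and cap the shared pool resources.
graph.add_shared_model(
    name="shared-llm",
    model_class="MyModelClass",
    execution_mechanism="dedicated_process",
    model_artifact="store://models/<project>/<model>:latest",
)
graph.config_pool_resource(max_processes=2, max_threads=8, pool_factor=1)

# `model_runner` is assumed to be a ModelRunnerStep already attached to this graph;
# the proxy endpoint resolves to the shared runnable by name or by artifact URI.
model_runner.add_shared_model_proxy(
    endpoint_name="chat-endpoint",
    model_artifact="store://models/<project>/<model>:latest",
    shared_model_name="shared-llm",
)
```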
@@ -2184,6 +2532,48 @@ class RootFlowStep(FlowStep):
     def track_models(self, track_models: bool):
         self._track_models = track_models
 
+    @property
+    def shared_models(self) -> dict[str, tuple[str, dict]]:
+        return self._shared_models
+
+    @shared_models.setter
+    def shared_models(self, shared_models: dict[str, tuple[str, dict]]):
+        self._shared_models = shared_models
+
+    @property
+    def shared_models_mechanism(self) -> dict[str, ParallelExecutionMechanisms]:
+        return self._shared_models_mechanism
+
+    @shared_models_mechanism.setter
+    def shared_models_mechanism(
+        self, shared_models_mechanism: dict[str, ParallelExecutionMechanisms]
+    ):
+        self._shared_models_mechanism = shared_models_mechanism
+
+    @property
+    def shared_max_processes(self) -> Optional[int]:
+        return self._shared_max_processes
+
+    @shared_max_processes.setter
+    def shared_max_processes(self, max_processes: Optional[int]):
+        self._shared_max_processes = max_processes
+
+    @property
+    def shared_max_threads(self) -> Optional[int]:
+        return self._shared_max_threads
+
+    @shared_max_threads.setter
+    def shared_max_threads(self, max_threads: Optional[int]):
+        self._shared_max_threads = max_threads
+
+    @property
+    def pool_factor(self) -> Optional[int]:
+        return self._pool_factor
+
+    @pool_factor.setter
+    def pool_factor(self, pool_factor: Optional[int]):
+        self._pool_factor = pool_factor
+
     def update_model_endpoints_routes_names(self, model_endpoints_names: list):
         self._route_models.update(model_endpoints_names)
 
@@ -150,12 +150,16 @@ class MonitoringPreProcessor(storey.MapClass):
     def do(self, event):
         monitoring_event_list = []
         model_runner_name = event._metadata.get("model_runner_name", "")
-        step = self.server.graph.steps[model_runner_name] if self.server else {}
+        step = self.server.graph.steps[model_runner_name] if self.server else None
+        if not step or not hasattr(step, "monitoring_data"):
+            raise mlrun.errors.MLRunRuntimeError(
+                f"ModelRunnerStep name {model_runner_name} is not found in the graph or does not have monitoring data"
+            )
         monitoring_data = step.monitoring_data
         logger.debug(
             "monitoring preprocessor started",
             event=event,
-            model_endpoints=monitoring_data,
+            monitoring_data=monitoring_data,
             metadata=event._metadata,
         )
         if len(monitoring_data) > 1:
@@ -1,4 +1,4 @@
 {
-  "git_commit": "557e2bbe23718b3eb2bdc5d6782704c14f9d5fc3",
-  "version": "1.10.0-rc12"
+  "git_commit": "9b3d4665fca9a019fc99b6902d8d71ebeff8d664",
+  "version": "1.10.0-rc13"
 }
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mlrun
-Version: 1.10.0rc12
+Version: 1.10.0rc13
 Summary: Tracking and config of machine learning runs
 Home-page: https://github.com/mlrun/mlrun
 Author: Yaron Haviv