mlrun 1.10.0rc42__py3-none-any.whl → 1.10.1rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.
mlrun/config.py CHANGED
@@ -66,7 +66,6 @@ default_config = {
  "nuclio_version": "",
  "default_nuclio_runtime": "python:3.11",
  "nest_asyncio_enabled": "", # enable import of nest_asyncio for corner cases with old jupyter, set "1"
- "ui_url": "", # remote/external mlrun UI url (for hyperlinks) (This is deprecated in favor of the ui block)
  "remote_host": "",
  "api_base_version": "v1",
  "version": "", # will be set to current version
@@ -304,7 +303,7 @@ default_config = {
  "application": {
  "default_sidecar_internal_port": 8050,
  "default_authentication_mode": mlrun.common.schemas.APIGatewayAuthenticationMode.none,
- "default_worker_number": 10000,
+ "default_worker_number": 100,
  },
  },
  # TODO: function defaults should be moved to the function spec config above
@@ -725,7 +724,7 @@ default_config = {
  # Set false to avoid creating a global source (for example in a dark site)
  "create": True,
  "name": "default",
- "description": "MLRun global function hub",
+ "description": "MLRun hub",
  "url": "https://mlrun.github.io/marketplace",
  "channel": "master",
  },
@@ -1280,10 +1279,7 @@ class Config:

  @staticmethod
  def resolve_ui_url():
- # ui_url is deprecated in favor of the ui.url (we created the ui block)
- # since the config class is used in a "recursive" way, we can't use property like we used in other places
- # since the property will need to be url, which exists in other structs as well
- return config.ui.url or config.ui_url
+ return config.ui.url

  def is_api_running_on_k8s(self):
  # determine if the API service is attached to K8s cluster
@@ -1570,7 +1566,6 @@ def read_env(env=None, prefix=env_prefix):
  "https://mlrun-api.", "https://framesd."
  )

- uisvc = env.get("MLRUN_UI_SERVICE_HOST")
  igz_domain = env.get("IGZ_NAMESPACE_DOMAIN")

  # workaround to try and detect IGZ domain
@@ -1596,10 +1591,6 @@ def read_env(env=None, prefix=env_prefix):
  if config.get("nuclio_dashboard_url") == "disabled":
  config["nuclio_dashboard_url"] = ""

- if uisvc and not config.get("ui_url"):
- if igz_domain:
- config["ui_url"] = f"https://mlrun-ui.{igz_domain}"
-
  if log_level := config.get("log_level"):
  import mlrun.utils.logger

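With the deprecated top-level ui_url key removed, the UI address is resolved only from the ui.url block. A minimal sketch of the new lookup, assuming the standard mlrun.mlconf accessor for this Config object (the URL value below is made up):

    import mlrun

    # MLRUN_UI_SERVICE_HOST auto-detection and the ui_url key no longer apply
    mlrun.mlconf.ui.url = "https://mlrun-ui.example.com"
    print(mlrun.mlconf.resolve_ui_url())  # -> https://mlrun-ui.example.com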
mlrun/datastore/base.py CHANGED
@@ -11,11 +11,14 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
+ import datetime
+ import os
+ import os.path
  import tempfile
  import urllib.parse
  from base64 import b64encode
  from copy import copy
- from os import path, remove
+ from types import ModuleType
  from typing import Optional, Union
  from urllib.parse import urlparse

@@ -156,6 +159,195 @@ class DataStore(BaseRemoteClient):
  def get_spark_options(self, path=None):
  return {}

+ @staticmethod
+ def _is_directory_in_range(
+ start_time: Optional[datetime.datetime],
+ end_time: Optional[datetime.datetime],
+ year: int,
+ month: Optional[int] = None,
+ day: Optional[int] = None,
+ hour: Optional[int] = None,
+ **kwargs,
+ ):
+ """Check if a partition directory (year=.., month=.., etc.) is in the time range."""
+ from dateutil.relativedelta import relativedelta
+
+ partition_start = datetime.datetime(
+ year=year,
+ month=month or 1,
+ day=day or 1,
+ hour=hour or 0,
+ tzinfo=start_time.tzinfo if start_time else end_time.tzinfo,
+ )
+ partition_end = (
+ partition_start
+ + relativedelta(
+ years=1 if month is None else 0,
+ months=1 if day is None and month is not None else 0,
+ days=1 if hour is None and day is not None else 0,
+ hours=1 if hour is not None else 0,
+ )
+ - datetime.timedelta(microseconds=1)
+ )
+
+ if (end_time and end_time < partition_start) or (
+ start_time and start_time > partition_end
+ ):
+ return False
+ return True
+
+ @staticmethod
+ def _list_partition_paths_helper(
+ paths: list[str],
+ start_time: Optional[datetime.datetime],
+ end_time: Optional[datetime.datetime],
+ current_path: str,
+ partition_level: str,
+ filesystem,
+ ):
+ directory_split = current_path.rsplit("/", 1)
+ time_unit = None
+ directory_start, directory_end = "", ""
+ if len(directory_split) == 2:
+ directory_start, directory_end = directory_split
+ time_unit = directory_end.split("=")[0] if "=" in directory_end else None
+
+ if not time_unit and directory_end.endswith((".parquet", ".pq")):
+ paths.append(directory_start.rstrip("/"))
+ return
+ elif time_unit and time_unit == partition_level:
+ paths.append(current_path.rstrip("/"))
+ return
+
+ directories = filesystem.ls(current_path, detail=True)
+ if len(directories) == 0:
+ return
+ for directory in directories:
+ current_path = directory["name"]
+ parts = [p for p in current_path.split("/") if "=" in p]
+ kwargs = {}
+ for part in parts:
+ key, value = part.split("=", 1)
+ if value.isdigit():
+ value = int(value)
+ kwargs[key] = value
+ if DataStore._is_directory_in_range(start_time, end_time, **kwargs):
+ DataStore._list_partition_paths_helper(
+ paths,
+ start_time,
+ end_time,
+ current_path,
+ partition_level,
+ filesystem,
+ )
+
+ @staticmethod
+ def _list_partitioned_paths(
+ base_url: str,
+ start_time: Optional[datetime.datetime],
+ end_time: Optional[datetime.datetime],
+ partition_level: str,
+ filesystem,
+ ):
+ paths = []
+ parsed_base_url = urlparse(base_url)
+ base_path = parsed_base_url.path
+
+ if parsed_base_url.scheme not in ["v3io", "v3ios"]:
+ base_path = parsed_base_url.netloc + base_path
+
+ DataStore._list_partition_paths_helper(
+ paths, start_time, end_time, base_path, partition_level, filesystem
+ )
+ paths = [
+ DataStore._reconstruct_path_from_base_url(parsed_base_url, path)
+ for path in paths
+ ]
+ return paths
+
+ @staticmethod
+ def _reconstruct_path_from_base_url(
+ parsed_base_url: urllib.parse.ParseResult, returned_path: str
+ ) -> str:
+ scheme = parsed_base_url.scheme
+ authority = parsed_base_url.netloc
+ returned_path = returned_path.lstrip("/")
+ if scheme == "v3io":
+ return f"{scheme}://{authority}/{returned_path}"
+ else:
+ return f"{scheme}://{returned_path}"
+
+ @staticmethod
+ def _clean_filters_for_partitions(
+ filters: list[list[tuple]],
+ partition_keys: list[str],
+ ):
+ """
+ Remove partition keys from filters.
+
+ :param filters: pandas-style filters
+ Example: [[('year','=',2025),('month','=',11),('timestamp','>',ts1)]]
+ :param partition_keys: partition columns handled via directory
+
+ :return list of list of tuples: cleaned filters without partition keys
+ """
+ cleaned_filters = []
+ for group in filters:
+ new_group = [f for f in group if f[0] not in partition_keys]
+ if new_group:
+ cleaned_filters.append(new_group)
+ return cleaned_filters
+
+ @staticmethod
+ def _read_partitioned_parquet(
+ base_url: str,
+ start_time: Optional[datetime.datetime],
+ end_time: Optional[datetime.datetime],
+ partition_keys: list[str],
+ df_module: ModuleType,
+ filesystem: fsspec.AbstractFileSystem,
+ **kwargs,
+ ):
+ """
+ Reads only the relevant partitions and concatenates the results.
+ Note that partition_keys cannot be empty.
+ """
+ logger.debug(f"Starting partition discovery process for {base_url}")
+
+ paths = DataStore._list_partitioned_paths(
+ base_url,
+ start_time,
+ end_time,
+ partition_keys[-1],
+ filesystem,
+ )
+
+ dfs = []
+ for current_path in paths:
+ try:
+ kwargs["filters"] = DataStore._clean_filters_for_partitions(
+ kwargs["filters"], partition_keys
+ )
+ df = df_module.read_parquet(current_path, **kwargs)
+ logger.debug(
+ "Finished reading DataFrame from subpath",
+ url=current_path,
+ )
+ dfs.append(df)
+ except FileNotFoundError as e:
+ # Skip partitions that don't exist or have no data
+ logger.warning(
+ "Failed to read DataFrame", url=current_path, exception=e
+ )
+
+ final_df = pd.concat(dfs) if dfs else pd.DataFrame()
+ logger.debug(
+ "Finished reading partitioned parquet files",
+ url=base_url,
+ columns=final_df.columns,
+ )
+ return final_df
+
  @staticmethod
  def _parquet_reader(
  df_module,
@@ -165,6 +357,7 @@ class DataStore(BaseRemoteClient):
  start_time,
  end_time,
  additional_filters,
+ optimize_discovery,
  ):
  from storey.utils import find_filters, find_partitions

@@ -203,7 +396,10 @@ class DataStore(BaseRemoteClient):
  )

  if start_time or end_time or additional_filters:
- partitions_time_attributes = find_partitions(url, file_system)
+ partitions_time_attributes, partitions = find_partitions(
+ url, file_system, True
+ )
+ logger.debug("Partitioned parquet read", partitions=partitions)
  set_filters(
  partitions_time_attributes,
  start_time,
@@ -211,8 +407,28 @@ class DataStore(BaseRemoteClient):
  additional_filters,
  kwargs,
  )
+
  try:
- return df_module.read_parquet(*args, **kwargs)
+ if (
+ optimize_discovery
+ and partitions_time_attributes
+ and DataStore._verify_path_partition_level(
+ urlparse(url).path, partitions
+ )
+ and (start_time or end_time)
+ ):
+ return DataStore._read_partitioned_parquet(
+ url,
+ start_time,
+ end_time,
+ partitions_time_attributes,
+ df_module,
+ file_system,
+ **kwargs,
+ )
+
+ else:
+ return df_module.read_parquet(*args, **kwargs)
  except pyarrow.lib.ArrowInvalid as ex:
  if not str(ex).startswith(
  "Cannot compare timestamp with timezone to timestamp without timezone"
@@ -238,7 +454,24 @@ class DataStore(BaseRemoteClient):
  additional_filters,
  kwargs,
  )
- return df_module.read_parquet(*args, **kwargs)
+ if (
+ optimize_discovery
+ and partitions_time_attributes
+ and DataStore._verify_path_partition_level(
+ urlparse(url).path, partitions
+ )
+ ):
+ return DataStore._read_partitioned_parquet(
+ url,
+ start_time_inner,
+ end_time_inner,
+ partitions_time_attributes,
+ df_module,
+ file_system,
+ **kwargs,
+ )
+ else:
+ return df_module.read_parquet(*args, **kwargs)
  else:
  return df_module.read_parquet(*args, **kwargs)

@@ -261,6 +494,10 @@ class DataStore(BaseRemoteClient):
  file_url = self._sanitize_url(url)
  is_csv, is_json, drop_time_column = False, False, False
  file_system = self.filesystem
+
+ # Feature flag optimize partition discovery by providing specific partition levels urls to the parquet reader
+ optimize_discovery = kwargs.pop("optimize_discovery", True)
+
  if file_url.endswith(".csv") or format == "csv":
  is_csv = True
  drop_time_column = False
@@ -322,6 +559,7 @@ class DataStore(BaseRemoteClient):
  start_time,
  end_time,
  additional_filters,
+ optimize_discovery,
  )

  elif file_url.endswith(".json") or format == "json":
@@ -347,7 +585,7 @@ class DataStore(BaseRemoteClient):
  temp_file = tempfile.NamedTemporaryFile(delete=False)
  self.download(self._join(subpath), temp_file.name)
  df = reader(temp_file.name, **kwargs)
- remove(temp_file.name)
+ os.remove(temp_file.name)

  if is_json or is_csv:
  # for parquet file the time filtering is executed in `reader`
@@ -387,6 +625,26 @@ class DataStore(BaseRemoteClient):
  except ImportError:
  return False

+ @staticmethod
+ def _verify_path_partition_level(base_path: str, partitions: list[str]) -> bool:
+ if not partitions:
+ return False
+
+ path_parts = base_path.strip("/").split("/")
+ path_parts = [part.split("=")[0] for part in path_parts if "=" in part]
+ if "hour" in partitions:
+ hour_index = partitions.index("hour")
+ else:
+ return False
+ for i, part in enumerate(partitions):
+ if not (
+ part in path_parts
+ or part in ["year", "month", "day", "hour"]
+ or i > hour_index
+ ):
+ return False
+ return True
+

  class DataItem:
  """Data input/output class abstracting access to various local/remote data sources
@@ -439,7 +697,7 @@ class DataItem:
  @property
  def suffix(self):
  """DataItem suffix (file extension) e.g. '.png'"""
- _, file_ext = path.splitext(self._path)
+ _, file_ext = os.path.splitext(self._path)
  return file_ext

  @property
@@ -548,7 +806,7 @@ class DataItem:
  return

  if self._local_path:
- remove(self._local_path)
+ os.remove(self._local_path)
  self._local_path = ""

  def as_df(
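The helper methods added above prune hive-style partition directories (year=/month=/day=/hour=) against the requested time range before any parquet file is opened, instead of handing the whole tree to read_parquet. A minimal usage sketch, assuming a time-partitioned parquet dataset at a made-up URL and the existing DataItem.as_df time-filter parameters; optimize_discovery is the new feature flag popped from kwargs above and is on by default:

    import datetime
    import mlrun

    di = mlrun.get_dataitem("s3://my-bucket/events/")  # hypothetical partitioned dataset
    df = di.as_df(
        start_time=datetime.datetime(2025, 11, 1, tzinfo=datetime.timezone.utc),
        end_time=datetime.datetime(2025, 11, 2, tzinfo=datetime.timezone.utc),
        time_column="timestamp",
        # optimize_discovery=False,  # forwarded via kwargs to fall back to a full read_parquet scan
    )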
@@ -47,7 +47,7 @@ from .v3io import V3ioStore
  in_memory_store = InMemoryStore()


- def schema_to_store(schema) -> DataStore.__subclasses__():
+ def schema_to_store(schema) -> type[DataStore]:
  # import store classes inside to enable making their dependencies optional (package extras)

  if not schema or schema in get_local_file_schema():
@@ -11,7 +11,7 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
-
+ import threading
  from typing import TYPE_CHECKING, Any, Optional, Union

  import mlrun
@@ -41,6 +41,9 @@ class HuggingFaceProvider(ModelProvider):
  into memory for inference. Ensure you have the required CPU/GPU and memory to use this operation.
  """

+ # locks for threading use cases
+ _client_lock = threading.Lock()
+
  def __init__(
  self,
  parent,
@@ -224,7 +227,8 @@ class HuggingFaceProvider(ModelProvider):

  self.options["model_kwargs"] = self.options.get("model_kwargs", {})
  self.options["model_kwargs"]["local_files_only"] = True
- self._client = pipeline(model=self.model, **self.options)
+ with self._client_lock:
+ self._client = pipeline(model=self.model, **self.options)
  self._expected_operation_type = Pipeline
  except ImportError as exc:
  raise ImportError("transformers package is not installed") from exc
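The class-level _client_lock serializes Hugging Face pipeline construction when several threads trigger the lazy client setup at once. A generic sketch of that guard-with-a-lock pattern, not the provider's actual code (the holder class and placeholder client below are illustrative only):

    import threading

    class LazyClientHolder:
        # one lock shared across instances, as with HuggingFaceProvider._client_lock
        _client_lock = threading.Lock()

        def __init__(self):
            self._client = None

        def get_client(self):
            if self._client is None:
                with self._client_lock:
                    if self._client is None:  # re-check after acquiring the lock
                        self._client = object()  # stands in for pipeline(model=..., **options)
            return self._client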
@@ -76,9 +76,9 @@ class ResourceCache:
  return self._tabels[uri]

  if uri.startswith("v3io://") or uri.startswith("v3ios://"):
- endpoint, uri = parse_path(uri)
+ endpoint, path = parse_path(uri)
  self._tabels[uri] = Table(
- uri,
+ path,
  V3ioDriver(webapi=endpoint or mlrun.mlconf.v3io_api),
  flush_interval_secs=mlrun.mlconf.feature_store.flush_interval,
  )
@@ -87,10 +87,10 @@
  if uri.startswith("redis://") or uri.startswith("rediss://"):
  from storey.redis_driver import RedisDriver

- endpoint, uri = parse_path(uri)
+ endpoint, path = parse_path(uri)
  endpoint = endpoint or mlrun.mlconf.redis.url
  self._tabels[uri] = Table(
- uri,
+ path,
  RedisDriver(redis_url=endpoint, key_prefix="/"),
  flush_interval_secs=mlrun.mlconf.feature_store.flush_interval,
  )
@@ -850,6 +850,11 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
  * ``base_period``, ``int``
  * ``write_output``, ``bool``
  * ``existing_data_handling``, ``str``
+ * ``_init_args``, ``dict`` - the arguments for the application class constructor
+ (equivalent to ``class_arguments``)
+
+ See :py:meth:`~ModelMonitoringApplicationBase.evaluate` for more details
+ about these inputs and params.

  For Git sources, add the source archive to the returned job and change the handler:

@@ -928,6 +933,7 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
  image: Optional[str] = None,
  with_repo: Optional[bool] = False,
  class_handler: Optional[str] = None,
+ class_arguments: Optional[dict[str, Any]] = None,
  requirements: Optional[Union[str, list[str]]] = None,
  requirements_file: str = "",
  endpoints: Union[list[tuple[str, str]], list[str], Literal["all"], None] = None,
@@ -963,7 +969,10 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
  You do not need to have a model endpoint to use this option.
  :param image: Docker image to run the job on (when running remotely).
  :param with_repo: Whether to clone the current repo to the build source.
- :param class_handler: The relative path to the class, useful when using Git sources or code from images.
+ :param class_handler: The relative path to the application class, useful when using Git sources or code
+ from images.
+ :param class_arguments: The arguments for the application class constructor. These are passed to the
+ class ``__init__``. The values must be JSON-serializable.
  :param requirements: List of Python requirements to be installed in the image.
  :param requirements_file: Path to a Python requirements file to be installed in the image.
  :param endpoints: The model endpoints to get the data from. The options are:
@@ -1041,7 +1050,9 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
  project=project,
  )

- params: dict[str, Union[list, str, int, None, ds_profile.DatastoreProfile]] = {}
+ params: dict[
+ str, Union[list, dict, str, int, None, ds_profile.DatastoreProfile]
+ ] = {}
  if endpoints:
  params["endpoints"] = endpoints
  if sample_data is None:
@@ -1077,6 +1088,9 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
  )
  params["stream_profile"] = stream_profile

+ if class_arguments:
+ params["_init_args"] = class_arguments
+
  inputs: dict[str, str] = {}
  for data, identifier in [
  (sample_data, "sample_data"),
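The new class_arguments option is forwarded to the job as params["_init_args"] and ends up in the application's constructor. A hedged sketch of how it could be used; the app class, its threshold argument, and the call site are illustrative only, and the name of the classmethod being extended is not visible in this hunk:

    from mlrun.model_monitoring.applications import ModelMonitoringApplicationBase

    class ThresholdApp(ModelMonitoringApplicationBase):
        def __init__(self, threshold: float = 0.5):  # hypothetical constructor argument
            self.threshold = threshold

        def do_tracking(self, monitoring_context):
            ...  # application logic using self.threshold

    # class_arguments must be JSON-serializable; it reaches __init__ via params["_init_args"], e.g.:
    # ThresholdApp.<classmethod shown above>(..., class_arguments={"threshold": 0.9})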
@@ -85,17 +85,17 @@ def run_function(
  ) -> Union[mlrun.model.RunObject, mlrun_pipelines.models.PipelineNodeWrapper]:
  """Run a local or remote task as part of a local/kubeflow pipeline

- run_function() allow you to execute a function locally, on a remote cluster, or as part of an automated workflow
- function can be specified as an object or by name (str), when the function is specified by name it is looked up
- in the current project eliminating the need to redefine/edit functions.
+ run_function() allows you to execute a function locally, on a remote cluster, or as part of an automated workflow.
+ The function can be specified as an object or by name (str). When the function is specified by name it is looked up
+ in the current project, eliminating the need to redefine/edit functions.

- when functions run as part of a workflow/pipeline (project.run()) some attributes can be set at the run level,
+ When functions run as part of a workflow/pipeline (project.run()) some attributes can be set at the run level,
  e.g. local=True will run all the functions locally, setting artifact_path will direct all outputs to the same path.
- project runs provide additional notifications/reporting and exception handling.
- inside a Kubeflow pipeline (KFP) run_function() generates KFP node (see PipelineNodeWrapper) which forms a DAG
- some behavior may differ between regular runs and deferred KFP runs.
+ Project runs provide additional notifications/reporting and exception handling.
+ Inside a Kubeflow pipeline (KFP) run_function() generates KFP node (see PipelineNodeWrapper) which forms a DAG.
+ Some behavior may differ between regular runs and deferred KFP runs.

- example (use with function object)::
+ Example (use with function object)::

  LABELS = "is_error"
  MODEL_CLASS = "sklearn.ensemble.RandomForestClassifier"
@@ -107,7 +107,7 @@ def run_function(
  inputs={"dataset": DATA_PATH},
  )

- example (use with project)::
+ Example (use with project)::

  # create a project with two functions (local and from hub)
  project = mlrun.new_project(project_name, "./proj)
@@ -119,7 +119,7 @@ def run_function(
  run2 = run_function("train", params={"label_columns": LABELS, "model_class": MODEL_CLASS},
  inputs={"dataset": run1.outputs["data"]})

- example (use in pipeline)::
+ Example (use in pipeline)::

  @dsl.pipeline(name="test pipeline", description="test")
  def my_pipe(url=""):