mlrun 1.10.0rc40__py3-none-any.whl → 1.11.0rc16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mlrun might be problematic.
Files changed (150)
  1. mlrun/__init__.py +3 -2
  2. mlrun/__main__.py +0 -4
  3. mlrun/artifacts/dataset.py +2 -2
  4. mlrun/artifacts/plots.py +1 -1
  5. mlrun/{model_monitoring/db/tsdb/tdengine → auth}/__init__.py +2 -3
  6. mlrun/auth/nuclio.py +89 -0
  7. mlrun/auth/providers.py +429 -0
  8. mlrun/auth/utils.py +415 -0
  9. mlrun/common/constants.py +7 -0
  10. mlrun/common/model_monitoring/helpers.py +41 -4
  11. mlrun/common/runtimes/constants.py +28 -0
  12. mlrun/common/schemas/__init__.py +13 -3
  13. mlrun/common/schemas/alert.py +2 -2
  14. mlrun/common/schemas/api_gateway.py +3 -0
  15. mlrun/common/schemas/auth.py +10 -10
  16. mlrun/common/schemas/client_spec.py +4 -0
  17. mlrun/common/schemas/constants.py +25 -0
  18. mlrun/common/schemas/frontend_spec.py +1 -8
  19. mlrun/common/schemas/function.py +24 -0
  20. mlrun/common/schemas/hub.py +3 -2
  21. mlrun/common/schemas/model_monitoring/__init__.py +1 -1
  22. mlrun/common/schemas/model_monitoring/constants.py +2 -2
  23. mlrun/common/schemas/secret.py +17 -2
  24. mlrun/common/secrets.py +95 -1
  25. mlrun/common/types.py +10 -10
  26. mlrun/config.py +53 -15
  27. mlrun/data_types/infer.py +2 -2
  28. mlrun/datastore/__init__.py +2 -3
  29. mlrun/datastore/base.py +274 -10
  30. mlrun/datastore/datastore.py +1 -1
  31. mlrun/datastore/datastore_profile.py +49 -17
  32. mlrun/datastore/model_provider/huggingface_provider.py +6 -2
  33. mlrun/datastore/model_provider/model_provider.py +2 -2
  34. mlrun/datastore/model_provider/openai_provider.py +2 -2
  35. mlrun/datastore/s3.py +15 -16
  36. mlrun/datastore/sources.py +1 -1
  37. mlrun/datastore/store_resources.py +4 -4
  38. mlrun/datastore/storeytargets.py +16 -10
  39. mlrun/datastore/targets.py +1 -1
  40. mlrun/datastore/utils.py +16 -3
  41. mlrun/datastore/v3io.py +1 -1
  42. mlrun/db/base.py +36 -12
  43. mlrun/db/httpdb.py +316 -101
  44. mlrun/db/nopdb.py +29 -11
  45. mlrun/errors.py +4 -2
  46. mlrun/execution.py +11 -12
  47. mlrun/feature_store/api.py +1 -1
  48. mlrun/feature_store/common.py +1 -1
  49. mlrun/feature_store/feature_vector_utils.py +1 -1
  50. mlrun/feature_store/steps.py +8 -6
  51. mlrun/frameworks/_common/utils.py +3 -3
  52. mlrun/frameworks/_dl_common/loggers/logger.py +1 -1
  53. mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +2 -1
  54. mlrun/frameworks/_ml_common/loggers/mlrun_logger.py +1 -1
  55. mlrun/frameworks/_ml_common/utils.py +2 -1
  56. mlrun/frameworks/auto_mlrun/auto_mlrun.py +4 -3
  57. mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +2 -1
  58. mlrun/frameworks/onnx/dataset.py +2 -1
  59. mlrun/frameworks/onnx/mlrun_interface.py +2 -1
  60. mlrun/frameworks/pytorch/callbacks/logging_callback.py +5 -4
  61. mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +2 -1
  62. mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +2 -1
  63. mlrun/frameworks/pytorch/utils.py +2 -1
  64. mlrun/frameworks/sklearn/metric.py +2 -1
  65. mlrun/frameworks/tf_keras/callbacks/logging_callback.py +5 -4
  66. mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +2 -1
  67. mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +2 -1
  68. mlrun/hub/__init__.py +37 -0
  69. mlrun/hub/base.py +142 -0
  70. mlrun/hub/module.py +67 -76
  71. mlrun/hub/step.py +113 -0
  72. mlrun/launcher/base.py +2 -1
  73. mlrun/launcher/local.py +2 -1
  74. mlrun/model.py +12 -2
  75. mlrun/model_monitoring/__init__.py +0 -1
  76. mlrun/model_monitoring/api.py +2 -2
  77. mlrun/model_monitoring/applications/base.py +20 -6
  78. mlrun/model_monitoring/applications/context.py +1 -0
  79. mlrun/model_monitoring/controller.py +7 -17
  80. mlrun/model_monitoring/db/_schedules.py +2 -16
  81. mlrun/model_monitoring/db/_stats.py +2 -13
  82. mlrun/model_monitoring/db/tsdb/__init__.py +9 -7
  83. mlrun/model_monitoring/db/tsdb/base.py +2 -4
  84. mlrun/model_monitoring/db/tsdb/preaggregate.py +234 -0
  85. mlrun/model_monitoring/db/tsdb/stream_graph_steps.py +63 -0
  86. mlrun/model_monitoring/db/tsdb/timescaledb/queries/timescaledb_metrics_queries.py +414 -0
  87. mlrun/model_monitoring/db/tsdb/timescaledb/queries/timescaledb_predictions_queries.py +376 -0
  88. mlrun/model_monitoring/db/tsdb/timescaledb/queries/timescaledb_results_queries.py +590 -0
  89. mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_connection.py +434 -0
  90. mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_connector.py +541 -0
  91. mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_operations.py +808 -0
  92. mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_schema.py +502 -0
  93. mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_stream.py +163 -0
  94. mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_stream_graph_steps.py +60 -0
  95. mlrun/model_monitoring/db/tsdb/timescaledb/utils/timescaledb_dataframe_processor.py +141 -0
  96. mlrun/model_monitoring/db/tsdb/timescaledb/utils/timescaledb_query_builder.py +585 -0
  97. mlrun/model_monitoring/db/tsdb/timescaledb/writer_graph_steps.py +73 -0
  98. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +4 -6
  99. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +147 -79
  100. mlrun/model_monitoring/features_drift_table.py +2 -1
  101. mlrun/model_monitoring/helpers.py +2 -1
  102. mlrun/model_monitoring/stream_processing.py +18 -16
  103. mlrun/model_monitoring/writer.py +4 -3
  104. mlrun/package/__init__.py +2 -1
  105. mlrun/platforms/__init__.py +0 -44
  106. mlrun/platforms/iguazio.py +1 -1
  107. mlrun/projects/operations.py +11 -10
  108. mlrun/projects/project.py +81 -82
  109. mlrun/run.py +4 -7
  110. mlrun/runtimes/__init__.py +2 -204
  111. mlrun/runtimes/base.py +89 -21
  112. mlrun/runtimes/constants.py +225 -0
  113. mlrun/runtimes/daskjob.py +4 -2
  114. mlrun/runtimes/databricks_job/databricks_runtime.py +2 -1
  115. mlrun/runtimes/mounts.py +5 -0
  116. mlrun/runtimes/nuclio/__init__.py +12 -8
  117. mlrun/runtimes/nuclio/api_gateway.py +36 -6
  118. mlrun/runtimes/nuclio/application/application.py +200 -32
  119. mlrun/runtimes/nuclio/function.py +154 -49
  120. mlrun/runtimes/nuclio/serving.py +55 -42
  121. mlrun/runtimes/pod.py +59 -10
  122. mlrun/secrets.py +46 -2
  123. mlrun/serving/__init__.py +2 -0
  124. mlrun/serving/remote.py +5 -5
  125. mlrun/serving/routers.py +3 -3
  126. mlrun/serving/server.py +46 -43
  127. mlrun/serving/serving_wrapper.py +6 -2
  128. mlrun/serving/states.py +554 -207
  129. mlrun/serving/steps.py +1 -1
  130. mlrun/serving/system_steps.py +42 -33
  131. mlrun/track/trackers/mlflow_tracker.py +29 -31
  132. mlrun/utils/helpers.py +89 -16
  133. mlrun/utils/http.py +9 -2
  134. mlrun/utils/notifications/notification/git.py +1 -1
  135. mlrun/utils/notifications/notification/mail.py +39 -16
  136. mlrun/utils/notifications/notification_pusher.py +2 -2
  137. mlrun/utils/version/version.json +2 -2
  138. mlrun/utils/version/version.py +3 -4
  139. {mlrun-1.10.0rc40.dist-info → mlrun-1.11.0rc16.dist-info}/METADATA +39 -49
  140. {mlrun-1.10.0rc40.dist-info → mlrun-1.11.0rc16.dist-info}/RECORD +144 -130
  141. mlrun/db/auth_utils.py +0 -152
  142. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +0 -343
  143. mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +0 -75
  144. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connection.py +0 -281
  145. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +0 -1368
  146. mlrun/model_monitoring/db/tsdb/tdengine/writer_graph_steps.py +0 -51
  147. {mlrun-1.10.0rc40.dist-info → mlrun-1.11.0rc16.dist-info}/WHEEL +0 -0
  148. {mlrun-1.10.0rc40.dist-info → mlrun-1.11.0rc16.dist-info}/entry_points.txt +0 -0
  149. {mlrun-1.10.0rc40.dist-info → mlrun-1.11.0rc16.dist-info}/licenses/LICENSE +0 -0
  150. {mlrun-1.10.0rc40.dist-info → mlrun-1.11.0rc16.dist-info}/top_level.txt +0 -0
mlrun/datastore/base.py CHANGED
@@ -11,11 +11,14 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
+ import datetime
+ import os
+ import os.path
  import tempfile
  import urllib.parse
  from base64 import b64encode
  from copy import copy
- from os import path, remove
+ from types import ModuleType
  from typing import Optional, Union
  from urllib.parse import urlparse

@@ -26,6 +29,7 @@ import pyarrow
  import pytz
  import requests

+ import mlrun.common.schemas
  import mlrun.config
  import mlrun.errors
  from mlrun.datastore.remote_client import BaseRemoteClient
@@ -156,6 +160,195 @@ class DataStore(BaseRemoteClient):
      def get_spark_options(self, path=None):
          return {}

+     @staticmethod
+     def _is_directory_in_range(
+         start_time: Optional[datetime.datetime],
+         end_time: Optional[datetime.datetime],
+         year: int,
+         month: Optional[int] = None,
+         day: Optional[int] = None,
+         hour: Optional[int] = None,
+         **kwargs,
+     ):
+         """Check if a partition directory (year=.., month=.., etc.) is in the time range."""
+         from dateutil.relativedelta import relativedelta
+
+         partition_start = datetime.datetime(
+             year=year,
+             month=month or 1,
+             day=day or 1,
+             hour=hour or 0,
+             tzinfo=start_time.tzinfo if start_time else end_time.tzinfo,
+         )
+         partition_end = (
+             partition_start
+             + relativedelta(
+                 years=1 if month is None else 0,
+                 months=1 if day is None and month is not None else 0,
+                 days=1 if hour is None and day is not None else 0,
+                 hours=1 if hour is not None else 0,
+             )
+             - datetime.timedelta(microseconds=1)
+         )
+
+         if (end_time and end_time < partition_start) or (
+             start_time and start_time > partition_end
+         ):
+             return False
+         return True
+
+     @staticmethod
+     def _list_partition_paths_helper(
+         paths: list[str],
+         start_time: Optional[datetime.datetime],
+         end_time: Optional[datetime.datetime],
+         current_path: str,
+         partition_level: str,
+         filesystem,
+     ):
+         directory_split = current_path.rsplit("/", 1)
+         time_unit = None
+         directory_start, directory_end = "", ""
+         if len(directory_split) == 2:
+             directory_start, directory_end = directory_split
+             time_unit = directory_end.split("=")[0] if "=" in directory_end else None
+
+         if not time_unit and directory_end.endswith((".parquet", ".pq")):
+             paths.append(directory_start.rstrip("/"))
+             return
+         elif time_unit and time_unit == partition_level:
+             paths.append(current_path.rstrip("/"))
+             return
+
+         directories = filesystem.ls(current_path, detail=True)
+         if len(directories) == 0:
+             return
+         for directory in directories:
+             current_path = directory["name"]
+             parts = [p for p in current_path.split("/") if "=" in p]
+             kwargs = {}
+             for part in parts:
+                 key, value = part.split("=", 1)
+                 if value.isdigit():
+                     value = int(value)
+                 kwargs[key] = value
+             if DataStore._is_directory_in_range(start_time, end_time, **kwargs):
+                 DataStore._list_partition_paths_helper(
+                     paths,
+                     start_time,
+                     end_time,
+                     current_path,
+                     partition_level,
+                     filesystem,
+                 )
+
+     @staticmethod
+     def _list_partitioned_paths(
+         base_url: str,
+         start_time: Optional[datetime.datetime],
+         end_time: Optional[datetime.datetime],
+         partition_level: str,
+         filesystem,
+     ):
+         paths = []
+         parsed_base_url = urlparse(base_url)
+         base_path = parsed_base_url.path
+
+         if parsed_base_url.scheme not in ["v3io", "v3ios"]:
+             base_path = parsed_base_url.netloc + base_path
+
+         DataStore._list_partition_paths_helper(
+             paths, start_time, end_time, base_path, partition_level, filesystem
+         )
+         paths = [
+             DataStore._reconstruct_path_from_base_url(parsed_base_url, path)
+             for path in paths
+         ]
+         return paths
+
+     @staticmethod
+     def _reconstruct_path_from_base_url(
+         parsed_base_url: urllib.parse.ParseResult, returned_path: str
+     ) -> str:
+         scheme = parsed_base_url.scheme
+         authority = parsed_base_url.netloc
+         returned_path = returned_path.lstrip("/")
+         if scheme == "v3io":
+             return f"{scheme}://{authority}/{returned_path}"
+         else:
+             return f"{scheme}://{returned_path}"
+
+     @staticmethod
+     def _clean_filters_for_partitions(
+         filters: list[list[tuple]],
+         partition_keys: list[str],
+     ):
+         """
+         Remove partition keys from filters.
+
+         :param filters: pandas-style filters
+             Example: [[('year','=',2025),('month','=',11),('timestamp','>',ts1)]]
+         :param partition_keys: partition columns handled via directory
+
+         :return list of list of tuples: cleaned filters without partition keys
+         """
+         cleaned_filters = []
+         for group in filters:
+             new_group = [f for f in group if f[0] not in partition_keys]
+             if new_group:
+                 cleaned_filters.append(new_group)
+         return cleaned_filters
+
+     @staticmethod
+     def _read_partitioned_parquet(
+         base_url: str,
+         start_time: Optional[datetime.datetime],
+         end_time: Optional[datetime.datetime],
+         partition_keys: list[str],
+         df_module: ModuleType,
+         filesystem: fsspec.AbstractFileSystem,
+         **kwargs,
+     ):
+         """
+         Reads only the relevant partitions and concatenates the results.
+         Note that partition_keys cannot be empty.
+         """
+         logger.debug(f"Starting partition discovery process for {base_url}")
+
+         paths = DataStore._list_partitioned_paths(
+             base_url,
+             start_time,
+             end_time,
+             partition_keys[-1],
+             filesystem,
+         )
+
+         dfs = []
+         for current_path in paths:
+             try:
+                 kwargs["filters"] = DataStore._clean_filters_for_partitions(
+                     kwargs["filters"], partition_keys
+                 )
+                 df = df_module.read_parquet(current_path, **kwargs)
+                 logger.debug(
+                     "Finished reading DataFrame from subpath",
+                     url=current_path,
+                 )
+                 dfs.append(df)
+             except FileNotFoundError as e:
+                 # Skip partitions that don't exist or have no data
+                 logger.warning(
+                     "Failed to read DataFrame", url=current_path, exception=e
+                 )
+
+         final_df = pd.concat(dfs) if dfs else pd.DataFrame()
+         logger.debug(
+             "Finished reading partitioned parquet files",
+             url=base_url,
+             columns=final_df.columns,
+         )
+         return final_df
+
      @staticmethod
      def _parquet_reader(
          df_module,
@@ -165,6 +358,7 @@ class DataStore(BaseRemoteClient):
          start_time,
          end_time,
          additional_filters,
+         optimize_discovery,
      ):
          from storey.utils import find_filters, find_partitions

@@ -203,7 +397,10 @@
                      )

              if start_time or end_time or additional_filters:
-                 partitions_time_attributes = find_partitions(url, file_system)
+                 partitions_time_attributes, partitions = find_partitions(
+                     url, file_system
+                 )
+                 logger.debug("Partitioned parquet read", partitions=partitions)
                  set_filters(
                      partitions_time_attributes,
                      start_time,
@@ -211,8 +408,28 @@
                      additional_filters,
                      kwargs,
                  )
+
                  try:
-                     return df_module.read_parquet(*args, **kwargs)
+                     if (
+                         optimize_discovery
+                         and partitions_time_attributes
+                         and DataStore._verify_path_partition_level(
+                             urlparse(url).path, partitions
+                         )
+                         and (start_time or end_time)
+                     ):
+                         return DataStore._read_partitioned_parquet(
+                             url,
+                             start_time,
+                             end_time,
+                             partitions_time_attributes,
+                             df_module,
+                             file_system,
+                             **kwargs,
+                         )
+
+                     else:
+                         return df_module.read_parquet(*args, **kwargs)
                  except pyarrow.lib.ArrowInvalid as ex:
                      if not str(ex).startswith(
                          "Cannot compare timestamp with timezone to timestamp without timezone"
@@ -238,7 +455,24 @@
                          additional_filters,
                          kwargs,
                      )
-                     return df_module.read_parquet(*args, **kwargs)
+                     if (
+                         optimize_discovery
+                         and partitions_time_attributes
+                         and DataStore._verify_path_partition_level(
+                             urlparse(url).path, partitions
+                         )
+                     ):
+                         return DataStore._read_partitioned_parquet(
+                             url,
+                             start_time_inner,
+                             end_time_inner,
+                             partitions_time_attributes,
+                             df_module,
+                             file_system,
+                             **kwargs,
+                         )
+                     else:
+                         return df_module.read_parquet(*args, **kwargs)
              else:
                  return df_module.read_parquet(*args, **kwargs)

@@ -261,6 +495,10 @@
          file_url = self._sanitize_url(url)
          is_csv, is_json, drop_time_column = False, False, False
          file_system = self.filesystem
+
+         # Feature flag optimize partition discovery by providing specific partition levels urls to the parquet reader
+         optimize_discovery = kwargs.pop("optimize_discovery", True)
+
          if file_url.endswith(".csv") or format == "csv":
              is_csv = True
              drop_time_column = False
@@ -322,6 +560,7 @@
                  start_time,
                  end_time,
                  additional_filters,
+                 optimize_discovery,
              )

          elif file_url.endswith(".json") or format == "json":
@@ -347,7 +586,7 @@
          temp_file = tempfile.NamedTemporaryFile(delete=False)
          self.download(self._join(subpath), temp_file.name)
          df = reader(temp_file.name, **kwargs)
-         remove(temp_file.name)
+         os.remove(temp_file.name)

          if is_json or is_csv:
              # for parquet file the time filtering is executed in `reader`
@@ -387,6 +626,26 @@
          except ImportError:
              return False

+     @staticmethod
+     def _verify_path_partition_level(base_path: str, partitions: list[str]) -> bool:
+         if not partitions:
+             return False
+
+         path_parts = base_path.strip("/").split("/")
+         path_parts = [part.split("=")[0] for part in path_parts if "=" in part]
+         if "hour" in partitions:
+             hour_index = partitions.index("hour")
+         else:
+             return False
+         for i, part in enumerate(partitions):
+             if not (
+                 part in path_parts
+                 or part in ["year", "month", "day", "hour"]
+                 or i > hour_index
+             ):
+                 return False
+         return True
+

  class DataItem:
      """Data input/output class abstracting access to various local/remote data sources
@@ -439,7 +698,7 @@
      @property
      def suffix(self):
          """DataItem suffix (file extension) e.g. '.png'"""
-         _, file_ext = path.splitext(self._path)
+         _, file_ext = os.path.splitext(self._path)
          return file_ext

      @property
@@ -548,7 +807,7 @@
              return

          if self._local_path:
-             remove(self._local_path)
+             os.remove(self._local_path)
              self._local_path = ""

      def as_df(
@@ -648,8 +907,10 @@
      username = user.encode("latin1")
      password = password.encode("latin1")
      base = b64encode(b":".join((username, password))).strip()
-     authstr = "Basic " + base.decode("ascii")
-     return {"Authorization": authstr}
+     authstr = mlrun.common.schemas.AuthorizationHeaderPrefixes.basic + base.decode(
+         "ascii"
+     )
+     return {mlrun.common.schemas.HeaderNames.authorization: authstr}


  class HttpStore(DataStore):
@@ -696,7 +957,10 @@
          token = self._get_secret_or_env("HTTPS_AUTH_TOKEN")
          if token:
              self._https_auth_token = token
-             self._headers.setdefault("Authorization", f"Bearer {token}")
+             self._headers.setdefault(
+                 mlrun.common.schemas.HeaderNames.authorization,
+                 f"{mlrun.common.schemas.AuthorizationHeaderPrefixes.bearer}{token}",
+             )

      def _validate_https_token(self):
          if self._https_auth_token and self._schema in ["http"]:
mlrun/datastore/datastore.py CHANGED
@@ -47,7 +47,7 @@ from .v3io import V3ioStore
  in_memory_store = InMemoryStore()


- def schema_to_store(schema) -> DataStore.__subclasses__():
+ def schema_to_store(schema) -> type[DataStore]:
      # import store classes inside to enable making their dependencies optional (package extras)

      if not schema or schema in get_local_file_schema():
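
The old annotation called DataStore.__subclasses__() at definition time, which yields a list of classes rather than a type; type[DataStore] is the correct way to say "returns some DataStore subclass". A toy illustration of the same pattern (toy classes, not mlrun's):

    class Base: ...
    class FileStore(Base): ...
    class HttpStore(Base): ...

    # A registry lookup that returns a class object is annotated with type[Base];
    # Base.__subclasses__() is a runtime call returning a list, not a type.
    def schema_to_store(schema: str) -> type[Base]:
        return {"file": FileStore, "http": HttpStore}[schema]

    store_cls = schema_to_store("file")
    print(store_cls is FileStore)  # True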
mlrun/datastore/datastore_profile.py CHANGED
@@ -16,7 +16,7 @@ import ast
  import base64
  import json
  import typing
- from urllib.parse import ParseResult, urlparse
+ from urllib.parse import ParseResult, quote, unquote, urlparse

  import pydantic.v1
  from deprecated import deprecated
@@ -283,8 +283,9 @@ class DatastoreProfileRedis(DatastoreProfile):

      def url_with_credentials(self):
          parsed_url = urlparse(self.endpoint_url)
-         username = self.username
-         password = self.password
+         # URL-encode username and password to handle special characters like @, :, /
+         username = quote(self.username, safe="") if self.username else None
+         password = quote(self.password, safe="") if self.password else None
          netloc = parsed_url.hostname
          if username:
              if password:
@@ -448,40 +449,71 @@ class DatastoreProfileHdfs(DatastoreProfile):
          return f"webhdfs://{self.host}:{self.http_port}{subpath}"


- class DatastoreProfileTDEngine(DatastoreProfile):
+ class DatastoreProfilePostgreSQL(DatastoreProfile):
      """
-     A profile that holds the required parameters for a TDEngine database, with the websocket scheme.
-     https://docs.tdengine.com/developer-guide/connecting-to-tdengine/#websocket-connection
+     A profile that holds the required parameters for a PostgreSQL database.
+     PostgreSQL uses standard PostgreSQL connection parameters.
      """

-     type: str = pydantic.v1.Field("taosws")
+     type: str = pydantic.v1.Field("postgresql")
      _private_attributes = ["password"]
      user: str
      # The password cannot be empty in real world scenarios. It's here just because of the profiles completion design.
      password: typing.Optional[str]
      host: str
      port: int
+     database: str = "postgres"  # Default PostgreSQL admin database

-     def dsn(self) -> str:
-         """Get the Data Source Name of the configured TDEngine profile."""
-         return f"{self.type}://{self.user}:{self.password}@{self.host}:{self.port}"
+     def dsn(self, database: typing.Optional[str] = None) -> str:
+         """
+         Get the Data Source Name of the configured PostgreSQL profile.
+
+         :param database: Optional database name to use instead of the configured one.
+                          If None, uses the configured database.
+         :return: The DSN string.
+         """
+         db = database or self.database
+         # URL-encode credentials and database to handle special characters
+         user = quote(self.user, safe="")
+         password = quote(self.password or "", safe="")
+         db_encoded = quote(db, safe="")
+         return f"{self.type}://{user}:{password}@{self.host}:{self.port}/{db_encoded}"
+
+     def admin_dsn(self) -> str:
+         """
+         Get DSN for administrative operations using the 'postgres' database.
+
+         Assumes the default 'postgres' database exists (standard PostgreSQL setup).
+         Used for admin tasks like creating/dropping databases.
+
+         :return: DSN pointing to the 'postgres' database.
+         """
+         return self.dsn(database="postgres")

      @classmethod
-     def from_dsn(cls, dsn: str, profile_name: str) -> "DatastoreProfileTDEngine":
+     def from_dsn(cls, dsn: str, profile_name: str) -> "DatastoreProfilePostgreSQL":
          """
-         Construct a TDEngine profile from DSN (connection string) and a name for the profile.
+         Construct a PostgreSQL profile from DSN (connection string) and a name for the profile.

-         :param dsn: The DSN (Data Source Name) of the TDEngine database, e.g.: ``"taosws://root:taosdata@localhost:6041"``.
+         :param dsn: The DSN (Data Source Name) of the PostgreSQL database,
+                     e.g.: ``"postgresql://user:password@localhost:5432/mydb"``.
          :param profile_name: The new profile's name.
-         :return: The TDEngine profile.
+         :return: The PostgreSQL profile.
          """
          parsed_url = urlparse(dsn)
+         # URL-decode username, password, and database (urlparse doesn't decode them)
+         username = unquote(parsed_url.username) if parsed_url.username else None
+         password = unquote(parsed_url.password) if parsed_url.password else None
+         database = (
+             unquote(parsed_url.path.lstrip("/")) if parsed_url.path else "postgres"
+         )
          return cls(
              name=profile_name,
-             user=parsed_url.username,
-             password=parsed_url.password,
+             user=username,
+             password=password,
              host=parsed_url.hostname,
              port=parsed_url.port,
+             database=database or "postgres",
          )


@@ -552,7 +584,7 @@ _DATASTORE_TYPE_TO_PROFILE_CLASS: dict[str, type[DatastoreProfile]] = {
      "gcs": DatastoreProfileGCS,
      "az": DatastoreProfileAzureBlob,
      "hdfs": DatastoreProfileHdfs,
-     "taosws": DatastoreProfileTDEngine,
+     "postgresql": DatastoreProfilePostgreSQL,
      "config": ConfigProfile,
      "openai": OpenAIProfile,
      "huggingface": HuggingFaceProfile,
mlrun/datastore/model_provider/huggingface_provider.py CHANGED
@@ -11,7 +11,7 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
-
+ import threading
  from typing import TYPE_CHECKING, Any, Optional, Union

  import mlrun
@@ -41,6 +41,9 @@ class HuggingFaceProvider(ModelProvider):
      into memory for inference. Ensure you have the required CPU/GPU and memory to use this operation.
      """

+     # locks for threading use cases
+     _client_lock = threading.Lock()
+
      def __init__(
          self,
          parent,
@@ -224,7 +227,8 @@

              self.options["model_kwargs"] = self.options.get("model_kwargs", {})
              self.options["model_kwargs"]["local_files_only"] = True
-             self._client = pipeline(model=self.model, **self.options)
+             with self._client_lock:
+                 self._client = pipeline(model=self.model, **self.options)
              self._expected_operation_type = Pipeline
          except ImportError as exc:
              raise ImportError("transformers package is not installed") from exc
mlrun/datastore/model_provider/model_provider.py CHANGED
@@ -11,8 +11,8 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
- from collections.abc import Awaitable
- from typing import Any, Callable, Optional, Union
+ from collections.abc import Awaitable, Callable
+ from typing import Any, Optional, Union

  import mlrun.errors
  from mlrun.common.types import StrEnum
mlrun/datastore/model_provider/openai_provider.py CHANGED
@@ -12,8 +12,8 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.
  import inspect
- from collections.abc import Awaitable
- from typing import TYPE_CHECKING, Any, Callable, Optional, Union
+ from collections.abc import Awaitable, Callable
+ from typing import TYPE_CHECKING, Any, Optional, Union

  import mlrun
  from mlrun.datastore.model_provider.model_provider import (
mlrun/datastore/s3.py CHANGED
@@ -18,12 +18,16 @@ from typing import Optional
  from urllib.parse import urlparse

  import boto3
+ import botocore.exceptions
  from boto3.s3.transfer import TransferConfig
  from fsspec.registry import get_filesystem_class

  import mlrun.errors

  from .base import DataStore, FileStats, make_datastore_schema_sanitizer
+ from .utils import parse_s3_bucket_and_key
+
+ __all__ = ["parse_s3_bucket_and_key"]


  class S3Store(DataStore):
@@ -225,9 +229,17 @@ class S3Store(DataStore):
      def get(self, key, size=None, offset=0):
          bucket, key = self.get_bucket_and_key(key)
          obj = self.s3.Object(bucket, key)
-         if size or offset:
-             return obj.get(Range=S3Store.get_range(size, offset))["Body"].read()
-         return obj.get()["Body"].read()
+         try:
+             if size or offset:
+                 return obj.get(Range=S3Store.get_range(size, offset))["Body"].read()
+             return obj.get()["Body"].read()
+
+         except botocore.exceptions.ClientError as exc:
+             if exc.response["Error"]["Code"] == "NoSuchKey":
+                 # "NoSuchKey" errors codes - equivalent to `FileNotFoundError`
+                 raise FileNotFoundError(f"s3://{bucket}/{key}") from exc
+             # Other errors are raised as-is
+             raise

      def put(self, key, data, append=False):
          data, _ = self._prepare_put_data(data, append)
@@ -259,16 +271,3 @@
          # In order to raise an error if there is connection error, ML-7056.
          self.filesystem.exists(path=path)
          self.filesystem.rm(path=path, recursive=recursive, maxdepth=maxdepth)
-
-
- def parse_s3_bucket_and_key(s3_path):
-     try:
-         path_parts = s3_path.replace("s3://", "").split("/")
-         bucket = path_parts.pop(0)
-         key = "/".join(path_parts)
-     except Exception as exc:
-         raise mlrun.errors.MLRunInvalidArgumentError(
-             "failed to parse s3 bucket and key"
-         ) from exc
-
-     return bucket, key
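
With this change a missing S3 object surfaces as the built-in FileNotFoundError instead of a raw botocore ClientError, so callers can use ordinary exception handling. A minimal illustration going through mlrun's data-item API (the bucket and key below are made up, and the exact error message format is the store's own):

    import mlrun

    item = mlrun.get_dataitem("s3://some-bucket/path/that/does/not/exist.parquet")
    try:
        payload = item.get()
    except FileNotFoundError as err:
        # Previously a missing key surfaced as botocore.exceptions.ClientError ("NoSuchKey");
        # S3Store.get() now translates that error code into FileNotFoundError.
        print(f"object is missing: {err}")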
mlrun/datastore/sources.py CHANGED
@@ -460,7 +460,7 @@ class ParquetSource(BaseSourceDriver):
              if not filter_tuple:
                  continue
              col_name, op, value = filter_tuple
-             if op.lower() in ("in", "not in") and isinstance(value, (list, tuple, set)):
+             if op.lower() in ("in", "not in") and isinstance(value, list | tuple | set):
                  none_exists = False
                  value = list(value)
                  for sub_value in value:
mlrun/datastore/store_resources.py CHANGED
@@ -76,9 +76,9 @@ class ResourceCache:
              return self._tabels[uri]

          if uri.startswith("v3io://") or uri.startswith("v3ios://"):
-             endpoint, uri = parse_path(uri)
+             endpoint, path = parse_path(uri)
              self._tabels[uri] = Table(
-                 uri,
+                 path,
                  V3ioDriver(webapi=endpoint or mlrun.mlconf.v3io_api),
                  flush_interval_secs=mlrun.mlconf.feature_store.flush_interval,
              )
@@ -87,10 +87,10 @@
          if uri.startswith("redis://") or uri.startswith("rediss://"):
              from storey.redis_driver import RedisDriver

-             endpoint, uri = parse_path(uri)
+             endpoint, path = parse_path(uri)
              endpoint = endpoint or mlrun.mlconf.redis.url
              self._tabels[uri] = Table(
-                 uri,
+                 path,
                  RedisDriver(redis_url=endpoint, key_prefix="/"),
                  flush_interval_secs=mlrun.mlconf.feature_store.flush_interval,
              )
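
The rename from uri to path in these two hunks is not cosmetic: the old code rebound uri to the parsed sub-path before using it as the cache key, so the earlier `if uri in self._tabels` lookup, which sees the original URI, would not match entries stored under the sub-path. A schematic sketch of the intended pattern (toy cache, not mlrun's ResourceCache; the storey Table construction is replaced by a placeholder string):

    from urllib.parse import urlparse

    _tables: dict[str, str] = {}

    def get_table(uri: str) -> str:
        # The cache stays keyed by the full, original URI...
        if uri in _tables:
            return _tables[uri]
        parsed = urlparse(uri)
        path = parsed.path  # ...while the driver only needs the container-relative path
        _tables[uri] = f"Table({path})"  # stand-in for Table(path, driver, ...)
        return _tables[uri]

    print(get_table("v3io://webapi.default.svc:8081/projects/my-table"))
    print(get_table("v3io://webapi.default.svc:8081/projects/my-table"))  # cache hit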