mlrun 1.10.0rc18__py3-none-any.whl → 1.11.0rc16__py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.

Files changed (167)
  1. mlrun/__init__.py +24 -3
  2. mlrun/__main__.py +0 -4
  3. mlrun/artifacts/dataset.py +2 -2
  4. mlrun/artifacts/document.py +6 -1
  5. mlrun/artifacts/llm_prompt.py +21 -15
  6. mlrun/artifacts/model.py +3 -3
  7. mlrun/artifacts/plots.py +1 -1
  8. mlrun/{model_monitoring/db/tsdb/tdengine → auth}/__init__.py +2 -3
  9. mlrun/auth/nuclio.py +89 -0
  10. mlrun/auth/providers.py +429 -0
  11. mlrun/auth/utils.py +415 -0
  12. mlrun/common/constants.py +14 -0
  13. mlrun/common/model_monitoring/helpers.py +123 -0
  14. mlrun/common/runtimes/constants.py +28 -0
  15. mlrun/common/schemas/__init__.py +14 -3
  16. mlrun/common/schemas/alert.py +2 -2
  17. mlrun/common/schemas/api_gateway.py +3 -0
  18. mlrun/common/schemas/auth.py +12 -10
  19. mlrun/common/schemas/client_spec.py +4 -0
  20. mlrun/common/schemas/constants.py +25 -0
  21. mlrun/common/schemas/frontend_spec.py +1 -8
  22. mlrun/common/schemas/function.py +34 -0
  23. mlrun/common/schemas/hub.py +33 -20
  24. mlrun/common/schemas/model_monitoring/__init__.py +2 -1
  25. mlrun/common/schemas/model_monitoring/constants.py +12 -15
  26. mlrun/common/schemas/model_monitoring/functions.py +13 -4
  27. mlrun/common/schemas/model_monitoring/model_endpoints.py +11 -0
  28. mlrun/common/schemas/pipeline.py +1 -1
  29. mlrun/common/schemas/secret.py +17 -2
  30. mlrun/common/secrets.py +95 -1
  31. mlrun/common/types.py +10 -10
  32. mlrun/config.py +69 -19
  33. mlrun/data_types/infer.py +2 -2
  34. mlrun/datastore/__init__.py +12 -5
  35. mlrun/datastore/azure_blob.py +162 -47
  36. mlrun/datastore/base.py +274 -10
  37. mlrun/datastore/datastore.py +7 -2
  38. mlrun/datastore/datastore_profile.py +84 -22
  39. mlrun/datastore/model_provider/huggingface_provider.py +225 -41
  40. mlrun/datastore/model_provider/mock_model_provider.py +87 -0
  41. mlrun/datastore/model_provider/model_provider.py +206 -74
  42. mlrun/datastore/model_provider/openai_provider.py +226 -66
  43. mlrun/datastore/s3.py +39 -18
  44. mlrun/datastore/sources.py +1 -1
  45. mlrun/datastore/store_resources.py +4 -4
  46. mlrun/datastore/storeytargets.py +17 -12
  47. mlrun/datastore/targets.py +1 -1
  48. mlrun/datastore/utils.py +25 -6
  49. mlrun/datastore/v3io.py +1 -1
  50. mlrun/db/base.py +63 -32
  51. mlrun/db/httpdb.py +373 -153
  52. mlrun/db/nopdb.py +54 -21
  53. mlrun/errors.py +4 -2
  54. mlrun/execution.py +66 -25
  55. mlrun/feature_store/api.py +1 -1
  56. mlrun/feature_store/common.py +1 -1
  57. mlrun/feature_store/feature_vector_utils.py +1 -1
  58. mlrun/feature_store/steps.py +8 -6
  59. mlrun/frameworks/_common/utils.py +3 -3
  60. mlrun/frameworks/_dl_common/loggers/logger.py +1 -1
  61. mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +2 -1
  62. mlrun/frameworks/_ml_common/loggers/mlrun_logger.py +1 -1
  63. mlrun/frameworks/_ml_common/utils.py +2 -1
  64. mlrun/frameworks/auto_mlrun/auto_mlrun.py +4 -3
  65. mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +2 -1
  66. mlrun/frameworks/onnx/dataset.py +2 -1
  67. mlrun/frameworks/onnx/mlrun_interface.py +2 -1
  68. mlrun/frameworks/pytorch/callbacks/logging_callback.py +5 -4
  69. mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +2 -1
  70. mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +2 -1
  71. mlrun/frameworks/pytorch/utils.py +2 -1
  72. mlrun/frameworks/sklearn/metric.py +2 -1
  73. mlrun/frameworks/tf_keras/callbacks/logging_callback.py +5 -4
  74. mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +2 -1
  75. mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +2 -1
  76. mlrun/hub/__init__.py +52 -0
  77. mlrun/hub/base.py +142 -0
  78. mlrun/hub/module.py +172 -0
  79. mlrun/hub/step.py +113 -0
  80. mlrun/k8s_utils.py +105 -16
  81. mlrun/launcher/base.py +15 -7
  82. mlrun/launcher/local.py +4 -1
  83. mlrun/model.py +14 -4
  84. mlrun/model_monitoring/__init__.py +0 -1
  85. mlrun/model_monitoring/api.py +65 -28
  86. mlrun/model_monitoring/applications/__init__.py +1 -1
  87. mlrun/model_monitoring/applications/base.py +299 -128
  88. mlrun/model_monitoring/applications/context.py +2 -4
  89. mlrun/model_monitoring/controller.py +132 -58
  90. mlrun/model_monitoring/db/_schedules.py +38 -29
  91. mlrun/model_monitoring/db/_stats.py +6 -16
  92. mlrun/model_monitoring/db/tsdb/__init__.py +9 -7
  93. mlrun/model_monitoring/db/tsdb/base.py +29 -9
  94. mlrun/model_monitoring/db/tsdb/preaggregate.py +234 -0
  95. mlrun/model_monitoring/db/tsdb/stream_graph_steps.py +63 -0
  96. mlrun/model_monitoring/db/tsdb/timescaledb/queries/timescaledb_metrics_queries.py +414 -0
  97. mlrun/model_monitoring/db/tsdb/timescaledb/queries/timescaledb_predictions_queries.py +376 -0
  98. mlrun/model_monitoring/db/tsdb/timescaledb/queries/timescaledb_results_queries.py +590 -0
  99. mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_connection.py +434 -0
  100. mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_connector.py +541 -0
  101. mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_operations.py +808 -0
  102. mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_schema.py +502 -0
  103. mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_stream.py +163 -0
  104. mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_stream_graph_steps.py +60 -0
  105. mlrun/model_monitoring/db/tsdb/timescaledb/utils/timescaledb_dataframe_processor.py +141 -0
  106. mlrun/model_monitoring/db/tsdb/timescaledb/utils/timescaledb_query_builder.py +585 -0
  107. mlrun/model_monitoring/db/tsdb/timescaledb/writer_graph_steps.py +73 -0
  108. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +20 -9
  109. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +235 -51
  110. mlrun/model_monitoring/features_drift_table.py +2 -1
  111. mlrun/model_monitoring/helpers.py +30 -6
  112. mlrun/model_monitoring/stream_processing.py +34 -28
  113. mlrun/model_monitoring/writer.py +224 -4
  114. mlrun/package/__init__.py +2 -1
  115. mlrun/platforms/__init__.py +0 -43
  116. mlrun/platforms/iguazio.py +8 -4
  117. mlrun/projects/operations.py +17 -11
  118. mlrun/projects/pipelines.py +2 -2
  119. mlrun/projects/project.py +187 -123
  120. mlrun/run.py +95 -21
  121. mlrun/runtimes/__init__.py +2 -186
  122. mlrun/runtimes/base.py +103 -25
  123. mlrun/runtimes/constants.py +225 -0
  124. mlrun/runtimes/daskjob.py +5 -2
  125. mlrun/runtimes/databricks_job/databricks_runtime.py +2 -1
  126. mlrun/runtimes/local.py +5 -2
  127. mlrun/runtimes/mounts.py +20 -2
  128. mlrun/runtimes/nuclio/__init__.py +12 -7
  129. mlrun/runtimes/nuclio/api_gateway.py +36 -6
  130. mlrun/runtimes/nuclio/application/application.py +339 -40
  131. mlrun/runtimes/nuclio/function.py +222 -72
  132. mlrun/runtimes/nuclio/serving.py +132 -42
  133. mlrun/runtimes/pod.py +213 -21
  134. mlrun/runtimes/utils.py +49 -9
  135. mlrun/secrets.py +99 -14
  136. mlrun/serving/__init__.py +2 -0
  137. mlrun/serving/remote.py +84 -11
  138. mlrun/serving/routers.py +26 -44
  139. mlrun/serving/server.py +138 -51
  140. mlrun/serving/serving_wrapper.py +6 -2
  141. mlrun/serving/states.py +997 -283
  142. mlrun/serving/steps.py +62 -0
  143. mlrun/serving/system_steps.py +149 -95
  144. mlrun/serving/v2_serving.py +9 -10
  145. mlrun/track/trackers/mlflow_tracker.py +29 -31
  146. mlrun/utils/helpers.py +292 -94
  147. mlrun/utils/http.py +9 -2
  148. mlrun/utils/notifications/notification/base.py +18 -0
  149. mlrun/utils/notifications/notification/git.py +3 -5
  150. mlrun/utils/notifications/notification/mail.py +39 -16
  151. mlrun/utils/notifications/notification/slack.py +2 -4
  152. mlrun/utils/notifications/notification/webhook.py +2 -5
  153. mlrun/utils/notifications/notification_pusher.py +3 -3
  154. mlrun/utils/version/version.json +2 -2
  155. mlrun/utils/version/version.py +3 -4
  156. {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/METADATA +63 -74
  157. {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/RECORD +161 -143
  158. mlrun/api/schemas/__init__.py +0 -259
  159. mlrun/db/auth_utils.py +0 -152
  160. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +0 -344
  161. mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +0 -75
  162. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connection.py +0 -281
  163. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +0 -1266
  164. {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/WHEEL +0 -0
  165. {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/entry_points.txt +0 -0
  166. {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/licenses/LICENSE +0 -0
  167. {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/top_level.txt +0 -0
mlrun/datastore/base.py CHANGED
@@ -11,11 +11,14 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
+ import datetime
+ import os
+ import os.path
  import tempfile
  import urllib.parse
  from base64 import b64encode
  from copy import copy
- from os import path, remove
+ from types import ModuleType
  from typing import Optional, Union
  from urllib.parse import urlparse

@@ -26,6 +29,7 @@ import pyarrow
  import pytz
  import requests

+ import mlrun.common.schemas
  import mlrun.config
  import mlrun.errors
  from mlrun.datastore.remote_client import BaseRemoteClient
@@ -156,6 +160,195 @@ class DataStore(BaseRemoteClient):
      def get_spark_options(self, path=None):
          return {}

+     @staticmethod
+     def _is_directory_in_range(
+         start_time: Optional[datetime.datetime],
+         end_time: Optional[datetime.datetime],
+         year: int,
+         month: Optional[int] = None,
+         day: Optional[int] = None,
+         hour: Optional[int] = None,
+         **kwargs,
+     ):
+         """Check if a partition directory (year=.., month=.., etc.) is in the time range."""
+         from dateutil.relativedelta import relativedelta
+
+         partition_start = datetime.datetime(
+             year=year,
+             month=month or 1,
+             day=day or 1,
+             hour=hour or 0,
+             tzinfo=start_time.tzinfo if start_time else end_time.tzinfo,
+         )
+         partition_end = (
+             partition_start
+             + relativedelta(
+                 years=1 if month is None else 0,
+                 months=1 if day is None and month is not None else 0,
+                 days=1 if hour is None and day is not None else 0,
+                 hours=1 if hour is not None else 0,
+             )
+             - datetime.timedelta(microseconds=1)
+         )
+
+         if (end_time and end_time < partition_start) or (
+             start_time and start_time > partition_end
+         ):
+             return False
+         return True
+
+     @staticmethod
+     def _list_partition_paths_helper(
+         paths: list[str],
+         start_time: Optional[datetime.datetime],
+         end_time: Optional[datetime.datetime],
+         current_path: str,
+         partition_level: str,
+         filesystem,
+     ):
+         directory_split = current_path.rsplit("/", 1)
+         time_unit = None
+         directory_start, directory_end = "", ""
+         if len(directory_split) == 2:
+             directory_start, directory_end = directory_split
+             time_unit = directory_end.split("=")[0] if "=" in directory_end else None
+
+         if not time_unit and directory_end.endswith((".parquet", ".pq")):
+             paths.append(directory_start.rstrip("/"))
+             return
+         elif time_unit and time_unit == partition_level:
+             paths.append(current_path.rstrip("/"))
+             return
+
+         directories = filesystem.ls(current_path, detail=True)
+         if len(directories) == 0:
+             return
+         for directory in directories:
+             current_path = directory["name"]
+             parts = [p for p in current_path.split("/") if "=" in p]
+             kwargs = {}
+             for part in parts:
+                 key, value = part.split("=", 1)
+                 if value.isdigit():
+                     value = int(value)
+                 kwargs[key] = value
+             if DataStore._is_directory_in_range(start_time, end_time, **kwargs):
+                 DataStore._list_partition_paths_helper(
+                     paths,
+                     start_time,
+                     end_time,
+                     current_path,
+                     partition_level,
+                     filesystem,
+                 )
+
+     @staticmethod
+     def _list_partitioned_paths(
+         base_url: str,
+         start_time: Optional[datetime.datetime],
+         end_time: Optional[datetime.datetime],
+         partition_level: str,
+         filesystem,
+     ):
+         paths = []
+         parsed_base_url = urlparse(base_url)
+         base_path = parsed_base_url.path
+
+         if parsed_base_url.scheme not in ["v3io", "v3ios"]:
+             base_path = parsed_base_url.netloc + base_path
+
+         DataStore._list_partition_paths_helper(
+             paths, start_time, end_time, base_path, partition_level, filesystem
+         )
+         paths = [
+             DataStore._reconstruct_path_from_base_url(parsed_base_url, path)
+             for path in paths
+         ]
+         return paths
+
+     @staticmethod
+     def _reconstruct_path_from_base_url(
+         parsed_base_url: urllib.parse.ParseResult, returned_path: str
+     ) -> str:
+         scheme = parsed_base_url.scheme
+         authority = parsed_base_url.netloc
+         returned_path = returned_path.lstrip("/")
+         if scheme == "v3io":
+             return f"{scheme}://{authority}/{returned_path}"
+         else:
+             return f"{scheme}://{returned_path}"
+
+     @staticmethod
+     def _clean_filters_for_partitions(
+         filters: list[list[tuple]],
+         partition_keys: list[str],
+     ):
+         """
+         Remove partition keys from filters.
+
+         :param filters: pandas-style filters
+             Example: [[('year','=',2025),('month','=',11),('timestamp','>',ts1)]]
+         :param partition_keys: partition columns handled via directory
+
+         :return list of list of tuples: cleaned filters without partition keys
+         """
+         cleaned_filters = []
+         for group in filters:
+             new_group = [f for f in group if f[0] not in partition_keys]
+             if new_group:
+                 cleaned_filters.append(new_group)
+         return cleaned_filters
+
+     @staticmethod
+     def _read_partitioned_parquet(
+         base_url: str,
+         start_time: Optional[datetime.datetime],
+         end_time: Optional[datetime.datetime],
+         partition_keys: list[str],
+         df_module: ModuleType,
+         filesystem: fsspec.AbstractFileSystem,
+         **kwargs,
+     ):
+         """
+         Reads only the relevant partitions and concatenates the results.
+         Note that partition_keys cannot be empty.
+         """
+         logger.debug(f"Starting partition discovery process for {base_url}")
+
+         paths = DataStore._list_partitioned_paths(
+             base_url,
+             start_time,
+             end_time,
+             partition_keys[-1],
+             filesystem,
+         )
+
+         dfs = []
+         for current_path in paths:
+             try:
+                 kwargs["filters"] = DataStore._clean_filters_for_partitions(
+                     kwargs["filters"], partition_keys
+                 )
+                 df = df_module.read_parquet(current_path, **kwargs)
+                 logger.debug(
+                     "Finished reading DataFrame from subpath",
+                     url=current_path,
+                 )
+                 dfs.append(df)
+             except FileNotFoundError as e:
+                 # Skip partitions that don't exist or have no data
+                 logger.warning(
+                     "Failed to read DataFrame", url=current_path, exception=e
+                 )
+
+         final_df = pd.concat(dfs) if dfs else pd.DataFrame()
+         logger.debug(
+             "Finished reading partitioned parquet files",
+             url=base_url,
+             columns=final_df.columns,
+         )
+         return final_df
+
      @staticmethod
      def _parquet_reader(
          df_module,
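Reviewer note: the hunk above is the core of this file's change. DataStore learns to prune Hive-style year=/month=/day=/hour= partition directories up front and hand only the relevant subpaths to read_parquet, instead of letting the reader scan the whole dataset. A minimal sketch of the window arithmetic that _is_directory_in_range applies (illustration only, not mlrun API):

    import datetime

    from dateutil.relativedelta import relativedelta

    def partition_window(year, month=None, day=None, hour=None):
        # A partition directory covers everything from its first instant up to
        # one microsecond before the next directory at the same level begins.
        start = datetime.datetime(year, month or 1, day or 1, hour or 0)
        end = (
            start
            + relativedelta(
                years=1 if month is None else 0,
                months=1 if day is None and month is not None else 0,
                days=1 if hour is None and day is not None else 0,
                hours=1 if hour is not None else 0,
            )
            - datetime.timedelta(microseconds=1)
        )
        return start, end

    # .../year=2025/month=11 covers all of November 2025...
    start, end = partition_window(2025, month=11)
    assert end == datetime.datetime(2025, 11, 30, 23, 59, 59, 999999)
    # ...so a query whose start_time is 2025-12-01 can skip that directory entirely.
    assert datetime.datetime(2025, 12, 1) > end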
@@ -165,6 +358,7 @@ class DataStore(BaseRemoteClient):
          start_time,
          end_time,
          additional_filters,
+         optimize_discovery,
      ):
          from storey.utils import find_filters, find_partitions

@@ -203,7 +397,10 @@ class DataStore(BaseRemoteClient):
          )

          if start_time or end_time or additional_filters:
-             partitions_time_attributes = find_partitions(url, file_system)
+             partitions_time_attributes, partitions = find_partitions(
+                 url, file_system
+             )
+             logger.debug("Partitioned parquet read", partitions=partitions)
              set_filters(
                  partitions_time_attributes,
                  start_time,
@@ -211,8 +408,28 @@ class DataStore(BaseRemoteClient):
                  additional_filters,
                  kwargs,
              )
+
              try:
-                 return df_module.read_parquet(*args, **kwargs)
+                 if (
+                     optimize_discovery
+                     and partitions_time_attributes
+                     and DataStore._verify_path_partition_level(
+                         urlparse(url).path, partitions
+                     )
+                     and (start_time or end_time)
+                 ):
+                     return DataStore._read_partitioned_parquet(
+                         url,
+                         start_time,
+                         end_time,
+                         partitions_time_attributes,
+                         df_module,
+                         file_system,
+                         **kwargs,
+                     )
+
+                 else:
+                     return df_module.read_parquet(*args, **kwargs)
              except pyarrow.lib.ArrowInvalid as ex:
                  if not str(ex).startswith(
                      "Cannot compare timestamp with timezone to timestamp without timezone"
@@ -238,7 +455,24 @@ class DataStore(BaseRemoteClient):
                      additional_filters,
                      kwargs,
                  )
-                 return df_module.read_parquet(*args, **kwargs)
+                 if (
+                     optimize_discovery
+                     and partitions_time_attributes
+                     and DataStore._verify_path_partition_level(
+                         urlparse(url).path, partitions
+                     )
+                 ):
+                     return DataStore._read_partitioned_parquet(
+                         url,
+                         start_time_inner,
+                         end_time_inner,
+                         partitions_time_attributes,
+                         df_module,
+                         file_system,
+                         **kwargs,
+                     )
+                 else:
+                     return df_module.read_parquet(*args, **kwargs)
          else:
              return df_module.read_parquet(*args, **kwargs)

@@ -261,6 +495,10 @@ class DataStore(BaseRemoteClient):
          file_url = self._sanitize_url(url)
          is_csv, is_json, drop_time_column = False, False, False
          file_system = self.filesystem
+
+         # Feature flag optimize partition discovery by providing specific partition levels urls to the parquet reader
+         optimize_discovery = kwargs.pop("optimize_discovery", True)
+
          if file_url.endswith(".csv") or format == "csv":
              is_csv = True
              drop_time_column = False
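Reviewer note: optimize_discovery is popped from kwargs with a default of True, so the new code path is on by default and can be disabled per call. A hypothetical opt-out (the parameter name comes from the hunk above; the URL and times are illustrative):

    import datetime

    import mlrun

    item = mlrun.get_dataitem("s3://some-bucket/sets/events/")  # hypothetical URL
    df = item.as_df(
        start_time=datetime.datetime(2025, 11, 1),
        end_time=datetime.datetime(2025, 11, 2),
        time_column="timestamp",
        optimize_discovery=False,  # force the plain read_parquet code path
    )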
@@ -322,6 +560,7 @@ class DataStore(BaseRemoteClient):
                  start_time,
                  end_time,
                  additional_filters,
+                 optimize_discovery,
              )

          elif file_url.endswith(".json") or format == "json":
@@ -347,7 +586,7 @@ class DataStore(BaseRemoteClient):
              temp_file = tempfile.NamedTemporaryFile(delete=False)
              self.download(self._join(subpath), temp_file.name)
              df = reader(temp_file.name, **kwargs)
-             remove(temp_file.name)
+             os.remove(temp_file.name)

          if is_json or is_csv:
              # for parquet file the time filtering is executed in `reader`
@@ -387,6 +626,26 @@ class DataStore(BaseRemoteClient):
          except ImportError:
              return False

+     @staticmethod
+     def _verify_path_partition_level(base_path: str, partitions: list[str]) -> bool:
+         if not partitions:
+             return False
+
+         path_parts = base_path.strip("/").split("/")
+         path_parts = [part.split("=")[0] for part in path_parts if "=" in part]
+         if "hour" in partitions:
+             hour_index = partitions.index("hour")
+         else:
+             return False
+         for i, part in enumerate(partitions):
+             if not (
+                 part in path_parts
+                 or part in ["year", "month", "day", "hour"]
+                 or i > hour_index
+             ):
+                 return False
+         return True
+

  class DataItem:
      """Data input/output class abstracting access to various local/remote data sources
@@ -439,7 +698,7 @@ class DataItem:
      @property
      def suffix(self):
          """DataItem suffix (file extension) e.g. '.png'"""
-         _, file_ext = path.splitext(self._path)
+         _, file_ext = os.path.splitext(self._path)
          return file_ext

      @property
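Reviewer note on _verify_path_partition_level (added in the +626 hunk above): it is the gate for the optimized path. As we read it, pruning only engages when the layout drills all the way down to hour=, and any non-time partition key must either be pinned in the base path already or sit below the hour level. A few worked cases (sketch, not tests from the repo):

    DataStore._verify_path_partition_level(
        "/proj/sets/events", ["year", "month", "day", "hour"]
    )  # True: pure time partitioning down to hour
    DataStore._verify_path_partition_level(
        "/proj/sets/events", ["year", "month", "day"]
    )  # False: no hour= level, so the plain read_parquet path is used
    DataStore._verify_path_partition_level(
        "/proj/sets/events/customer=acme", ["customer", "year", "month", "day", "hour"]
    )  # True: 'customer' is already fixed in the base path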
@@ -548,7 +807,7 @@ class DataItem:
              return

          if self._local_path:
-             remove(self._local_path)
+             os.remove(self._local_path)
              self._local_path = ""

      def as_df(
@@ -648,8 +907,10 @@ def basic_auth_header(user, password):
      username = user.encode("latin1")
      password = password.encode("latin1")
      base = b64encode(b":".join((username, password))).strip()
-     authstr = "Basic " + base.decode("ascii")
-     return {"Authorization": authstr}
+     authstr = mlrun.common.schemas.AuthorizationHeaderPrefixes.basic + base.decode(
+         "ascii"
+     )
+     return {mlrun.common.schemas.HeaderNames.authorization: authstr}


  class HttpStore(DataStore):
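Reviewer note: swapping the string literals for mlrun.common.schemas constants does not change the produced header. The result is still standard HTTP basic auth:

    from base64 import b64encode

    creds = b64encode(b"scott:tiger").decode("ascii")
    assert {"Authorization": f"Basic {creds}"} == {"Authorization": "Basic c2NvdHQ6dGlnZXI="}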
@@ -696,7 +957,10 @@ class HttpStore(DataStore):
          token = self._get_secret_or_env("HTTPS_AUTH_TOKEN")
          if token:
              self._https_auth_token = token
-             self._headers.setdefault("Authorization", f"Bearer {token}")
+             self._headers.setdefault(
+                 mlrun.common.schemas.HeaderNames.authorization,
+                 f"{mlrun.common.schemas.AuthorizationHeaderPrefixes.bearer}{token}",
+             )

      def _validate_https_token(self):
          if self._https_auth_token and self._schema in ["http"]:
mlrun/datastore/__init__.py CHANGED
@@ -39,6 +39,7 @@ from .base import DataItem, DataStore, HttpStore
  from .filestore import FileStore
  from .inmem import InMemoryStore
  from .model_provider.huggingface_provider import HuggingFaceProvider
+ from .model_provider.mock_model_provider import MockModelProvider
  from .model_provider.openai_provider import OpenAIProvider
  from .store_resources import get_store_resource, is_store_uri
  from .v3io import V3ioStore
@@ -46,7 +47,7 @@ from .v3io import V3ioStore
  in_memory_store = InMemoryStore()


- def schema_to_store(schema) -> DataStore.__subclasses__():
+ def schema_to_store(schema) -> type[DataStore]:
      # import store classes inside to enable making their dependencies optional (package extras)

      if not schema or schema in get_local_file_schema():
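Reviewer note: the old annotation was a genuine bug, not a style fix. DataStore.__subclasses__() is a call evaluated at import time that returns a list of classes, which is meaningless as a return type; type[DataStore] correctly says "this function returns some DataStore subclass (the class itself, not an instance)". Illustration:

    from mlrun.datastore.base import DataStore

    # The old "annotation" simply evaluated to this list at import time:
    assert isinstance(DataStore.__subclasses__(), list)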
@@ -103,7 +104,11 @@ def schema_to_store(schema) -> DataStore.__subclasses__():
  def schema_to_model_provider(
      schema: str, raise_missing_schema_exception=True
  ) -> type[ModelProvider]:
-     schema_dict = {"openai": OpenAIProvider, "huggingface": HuggingFaceProvider}
+     schema_dict = {
+         "openai": OpenAIProvider,
+         "huggingface": HuggingFaceProvider,
+         "mock": MockModelProvider,
+     }
      provider_class = schema_dict.get(schema, None)
      if not provider_class:
          if raise_missing_schema_exception:
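Reviewer note: with the "mock" schema registered, a mock provider now resolves through the same lookup as the real ones, presumably so serving graphs can be exercised without calling external model services (the provider itself lives in the new mock_model_provider.py, not shown here):

    from mlrun.datastore import schema_to_model_provider

    provider_class = schema_to_model_provider("mock")
    assert provider_class.__name__ == "MockModelProvider"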
mlrun/datastore/datastore_profile.py CHANGED
@@ -16,9 +16,10 @@ import ast
  import base64
  import json
  import typing
- from urllib.parse import ParseResult, urlparse
+ from urllib.parse import ParseResult, quote, unquote, urlparse

  import pydantic.v1
+ from deprecated import deprecated
  from mergedeep import merge

  import mlrun
@@ -138,6 +139,15 @@ class ConfigProfile(DatastoreProfile):
          return res


+ # TODO: Remove in 1.12.0
+ @deprecated(
+     version="1.10.0",
+     reason=(
+         "This class is deprecated from mlrun 1.10.0, and will be removed in 1.12.0. "
+         "Use `DatastoreProfileKafkaStream` instead."
+     ),
+     category=FutureWarning,
+ )
  class DatastoreProfileKafkaTarget(DatastoreProfile):
      type: str = pydantic.v1.Field("kafka_target")
      _private_attributes = "kwargs_private"
@@ -158,8 +168,8 @@ class DatastoreProfileKafkaTarget(DatastoreProfile):
          return attributes


- class DatastoreProfileKafkaSource(DatastoreProfile):
-     type: str = pydantic.v1.Field("kafka_source")
+ class DatastoreProfileKafkaStream(DatastoreProfile):
+     type: str = pydantic.v1.Field("kafka_stream")
      _private_attributes = ("kwargs_private", "sasl_user", "sasl_pass")
      brokers: typing.Union[str, list[str]]
      topics: typing.Union[str, list[str]]
@@ -198,6 +208,19 @@ class DatastoreProfileKafkaSource(DatastoreProfile):
          return attributes


+ # TODO: Remove in 1.12.0
+ @deprecated(
+     version="1.10.0",
+     reason=(
+         "This class is deprecated from mlrun 1.10.0, and will be removed in 1.12.0. "
+         "Use `DatastoreProfileKafkaStream` instead."
+     ),
+     category=FutureWarning,
+ )
+ class DatastoreProfileKafkaSource(DatastoreProfileKafkaStream):
+     type: str = pydantic.v1.Field("kafka_source")
+
+
  class DatastoreProfileV3io(DatastoreProfile):
      type: str = pydantic.v1.Field("v3io")
      v3io_access_key: typing.Optional[str] = None
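Reviewer note: the rename is backward compatible. DatastoreProfileKafkaSource survives as a thin, deprecated subclass of DatastoreProfileKafkaStream, so existing profiles keep loading but emit a FutureWarning. New code should construct the stream profile directly; a sketch with hypothetical values:

    from mlrun.datastore.datastore_profile import DatastoreProfileKafkaStream

    profile = DatastoreProfileKafkaStream(
        name="my-kafka",          # hypothetical profile name
        brokers="broker-1:9092",  # hypothetical broker address
        topics=["model-events"],  # hypothetical topic
    )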
@@ -232,7 +255,7 @@ class DatastoreProfileS3(DatastoreProfile):
          if self.secret_key:
              res["AWS_SECRET_ACCESS_KEY"] = self.secret_key
          if self.endpoint_url:
-             res["S3_ENDPOINT_URL"] = self.endpoint_url
+             res["AWS_ENDPOINT_URL_S3"] = self.endpoint_url
          if self.force_non_anonymous:
              res["S3_NON_ANONYMOUS"] = self.force_non_anonymous
          if self.profile_name:
@@ -260,8 +283,9 @@ class DatastoreProfileRedis(DatastoreProfile):

      def url_with_credentials(self):
          parsed_url = urlparse(self.endpoint_url)
-         username = self.username
-         password = self.password
+         # URL-encode username and password to handle special characters like @, :, /
+         username = quote(self.username, safe="") if self.username else None
+         password = quote(self.password, safe="") if self.password else None
          netloc = parsed_url.hostname
          if username:
              if password:
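Reviewer note: this matters when Redis credentials contain URL-reserved characters, which were previously spliced into the connection URL verbatim. The quoting now applied:

    from urllib.parse import quote

    assert quote("p@ss:w/rd", safe="") == "p%40ss%3Aw%2Frd"
    # so url_with_credentials() can emit redis://user:p%40ss%3Aw%2Frd@host:6379 safely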
@@ -333,7 +357,9 @@ class DatastoreProfileGCS(DatastoreProfile):
          # in gcs the path after schema is starts with bucket, wherefore it should not start with "/".
          subpath = subpath[1:]
          if self.bucket:
-             return f"gcs://{self.bucket}/{subpath}"
+             return (
+                 f"gcs://{self.bucket}/{subpath}" if subpath else f"gcs://{self.bucket}"
+             )
          else:
              return f"gcs://{subpath}"

@@ -370,7 +396,11 @@ class DatastoreProfileAzureBlob(DatastoreProfile):
          # in azure the path after schema is starts with container, wherefore it should not start with "/".
          subpath = subpath[1:]
          if self.container:
-             return f"az://{self.container}/{subpath}"
+             return (
+                 f"az://{self.container}/{subpath}"
+                 if subpath
+                 else f"az://{self.container}"
+             )
          else:
              return f"az://{subpath}"
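Reviewer note: this hunk and the GCS one above fix the same bug: an empty subpath used to produce a URL with a trailing slash, which object-store filesystems can treat as a different key than the bare bucket or container. Illustration:

    bucket, subpath = "my-bucket", ""
    old = f"gcs://{bucket}/{subpath}"  # 'gcs://my-bucket/' with a trailing slash
    new = f"gcs://{bucket}/{subpath}" if subpath else f"gcs://{bucket}"
    assert new == "gcs://my-bucket"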
 
@@ -419,40 +449,71 @@ class DatastoreProfileHdfs(DatastoreProfile):
          return f"webhdfs://{self.host}:{self.http_port}{subpath}"


- class DatastoreProfileTDEngine(DatastoreProfile):
+ class DatastoreProfilePostgreSQL(DatastoreProfile):
      """
-     A profile that holds the required parameters for a TDEngine database, with the websocket scheme.
-     https://docs.tdengine.com/developer-guide/connecting-to-tdengine/#websocket-connection
+     A profile that holds the required parameters for a PostgreSQL database.
+     PostgreSQL uses standard PostgreSQL connection parameters.
      """

-     type: str = pydantic.v1.Field("taosws")
+     type: str = pydantic.v1.Field("postgresql")
      _private_attributes = ["password"]
      user: str
      # The password cannot be empty in real world scenarios. It's here just because of the profiles completion design.
      password: typing.Optional[str]
      host: str
      port: int
+     database: str = "postgres"  # Default PostgreSQL admin database
+
+     def dsn(self, database: typing.Optional[str] = None) -> str:
+         """
+         Get the Data Source Name of the configured PostgreSQL profile.
+
+         :param database: Optional database name to use instead of the configured one.
+                          If None, uses the configured database.
+         :return: The DSN string.
+         """
+         db = database or self.database
+         # URL-encode credentials and database to handle special characters
+         user = quote(self.user, safe="")
+         password = quote(self.password or "", safe="")
+         db_encoded = quote(db, safe="")
+         return f"{self.type}://{user}:{password}@{self.host}:{self.port}/{db_encoded}"
+
+     def admin_dsn(self) -> str:
+         """
+         Get DSN for administrative operations using the 'postgres' database.

-     def dsn(self) -> str:
-         """Get the Data Source Name of the configured TDEngine profile."""
-         return f"{self.type}://{self.user}:{self.password}@{self.host}:{self.port}"
+         Assumes the default 'postgres' database exists (standard PostgreSQL setup).
+         Used for admin tasks like creating/dropping databases.
+
+         :return: DSN pointing to the 'postgres' database.
+         """
+         return self.dsn(database="postgres")

      @classmethod
-     def from_dsn(cls, dsn: str, profile_name: str) -> "DatastoreProfileTDEngine":
+     def from_dsn(cls, dsn: str, profile_name: str) -> "DatastoreProfilePostgreSQL":
          """
-         Construct a TDEngine profile from DSN (connection string) and a name for the profile.
+         Construct a PostgreSQL profile from DSN (connection string) and a name for the profile.

-         :param dsn: The DSN (Data Source Name) of the TDEngine database, e.g.: ``"taosws://root:taosdata@localhost:6041"``.
+         :param dsn: The DSN (Data Source Name) of the PostgreSQL database,
+                     e.g.: ``"postgresql://user:password@localhost:5432/mydb"``.
          :param profile_name: The new profile's name.
-         :return: The TDEngine profile.
+         :return: The PostgreSQL profile.
          """
          parsed_url = urlparse(dsn)
+         # URL-decode username, password, and database (urlparse doesn't decode them)
+         username = unquote(parsed_url.username) if parsed_url.username else None
+         password = unquote(parsed_url.password) if parsed_url.password else None
+         database = (
+             unquote(parsed_url.path.lstrip("/")) if parsed_url.path else "postgres"
+         )
          return cls(
              name=profile_name,
-             user=parsed_url.username,
-             password=parsed_url.password,
+             user=username,
+             password=password,
              host=parsed_url.hostname,
              port=parsed_url.port,
+             database=database or "postgres",
          )

@@ -518,11 +579,12 @@ _DATASTORE_TYPE_TO_PROFILE_CLASS: dict[str, type[DatastoreProfile]] = {
      "basic": DatastoreProfileBasic,
      "kafka_target": DatastoreProfileKafkaTarget,
      "kafka_source": DatastoreProfileKafkaSource,
+     "kafka_stream": DatastoreProfileKafkaStream,
      "dbfs": DatastoreProfileDBFS,
      "gcs": DatastoreProfileGCS,
      "az": DatastoreProfileAzureBlob,
      "hdfs": DatastoreProfileHdfs,
-     "taosws": DatastoreProfileTDEngine,
+     "postgresql": DatastoreProfilePostgreSQL,
      "config": ConfigProfile,
      "openai": OpenAIProfile,
      "huggingface": HuggingFaceProfile,