mlrun 1.10.0rc16__py3-none-any.whl → 1.10.1rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (101)
  1. mlrun/__init__.py +22 -2
  2. mlrun/artifacts/document.py +6 -1
  3. mlrun/artifacts/llm_prompt.py +21 -15
  4. mlrun/artifacts/model.py +3 -3
  5. mlrun/common/constants.py +9 -0
  6. mlrun/common/formatters/artifact.py +1 -0
  7. mlrun/common/model_monitoring/helpers.py +86 -0
  8. mlrun/common/schemas/__init__.py +2 -0
  9. mlrun/common/schemas/auth.py +2 -0
  10. mlrun/common/schemas/function.py +10 -0
  11. mlrun/common/schemas/hub.py +30 -18
  12. mlrun/common/schemas/model_monitoring/__init__.py +2 -0
  13. mlrun/common/schemas/model_monitoring/constants.py +30 -6
  14. mlrun/common/schemas/model_monitoring/functions.py +13 -4
  15. mlrun/common/schemas/model_monitoring/model_endpoints.py +11 -0
  16. mlrun/common/schemas/pipeline.py +1 -1
  17. mlrun/common/schemas/serving.py +3 -0
  18. mlrun/common/schemas/workflow.py +1 -0
  19. mlrun/common/secrets.py +22 -1
  20. mlrun/config.py +34 -21
  21. mlrun/datastore/__init__.py +11 -3
  22. mlrun/datastore/azure_blob.py +162 -47
  23. mlrun/datastore/base.py +265 -7
  24. mlrun/datastore/datastore.py +10 -5
  25. mlrun/datastore/datastore_profile.py +61 -5
  26. mlrun/datastore/model_provider/huggingface_provider.py +367 -0
  27. mlrun/datastore/model_provider/mock_model_provider.py +87 -0
  28. mlrun/datastore/model_provider/model_provider.py +211 -74
  29. mlrun/datastore/model_provider/openai_provider.py +243 -71
  30. mlrun/datastore/s3.py +24 -2
  31. mlrun/datastore/store_resources.py +4 -4
  32. mlrun/datastore/storeytargets.py +2 -3
  33. mlrun/datastore/utils.py +15 -3
  34. mlrun/db/base.py +27 -19
  35. mlrun/db/httpdb.py +57 -48
  36. mlrun/db/nopdb.py +25 -10
  37. mlrun/execution.py +55 -13
  38. mlrun/hub/__init__.py +15 -0
  39. mlrun/hub/module.py +181 -0
  40. mlrun/k8s_utils.py +105 -16
  41. mlrun/launcher/base.py +13 -6
  42. mlrun/launcher/local.py +2 -0
  43. mlrun/model.py +9 -3
  44. mlrun/model_monitoring/api.py +66 -27
  45. mlrun/model_monitoring/applications/__init__.py +1 -1
  46. mlrun/model_monitoring/applications/base.py +388 -138
  47. mlrun/model_monitoring/applications/context.py +2 -4
  48. mlrun/model_monitoring/applications/results.py +4 -7
  49. mlrun/model_monitoring/controller.py +239 -101
  50. mlrun/model_monitoring/db/_schedules.py +36 -13
  51. mlrun/model_monitoring/db/_stats.py +4 -3
  52. mlrun/model_monitoring/db/tsdb/base.py +29 -9
  53. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +4 -5
  54. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +154 -50
  55. mlrun/model_monitoring/db/tsdb/tdengine/writer_graph_steps.py +51 -0
  56. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +17 -4
  57. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +245 -51
  58. mlrun/model_monitoring/helpers.py +28 -5
  59. mlrun/model_monitoring/stream_processing.py +45 -14
  60. mlrun/model_monitoring/writer.py +220 -1
  61. mlrun/platforms/__init__.py +3 -2
  62. mlrun/platforms/iguazio.py +7 -3
  63. mlrun/projects/operations.py +16 -11
  64. mlrun/projects/pipelines.py +2 -2
  65. mlrun/projects/project.py +157 -69
  66. mlrun/run.py +97 -20
  67. mlrun/runtimes/__init__.py +18 -0
  68. mlrun/runtimes/base.py +14 -6
  69. mlrun/runtimes/daskjob.py +1 -0
  70. mlrun/runtimes/local.py +5 -2
  71. mlrun/runtimes/mounts.py +20 -2
  72. mlrun/runtimes/nuclio/__init__.py +1 -0
  73. mlrun/runtimes/nuclio/application/application.py +147 -17
  74. mlrun/runtimes/nuclio/function.py +72 -27
  75. mlrun/runtimes/nuclio/serving.py +102 -20
  76. mlrun/runtimes/pod.py +213 -21
  77. mlrun/runtimes/utils.py +49 -9
  78. mlrun/secrets.py +54 -13
  79. mlrun/serving/remote.py +79 -6
  80. mlrun/serving/routers.py +23 -41
  81. mlrun/serving/server.py +230 -40
  82. mlrun/serving/states.py +605 -232
  83. mlrun/serving/steps.py +62 -0
  84. mlrun/serving/system_steps.py +136 -81
  85. mlrun/serving/v2_serving.py +9 -10
  86. mlrun/utils/helpers.py +215 -83
  87. mlrun/utils/logger.py +3 -1
  88. mlrun/utils/notifications/notification/base.py +18 -0
  89. mlrun/utils/notifications/notification/git.py +2 -4
  90. mlrun/utils/notifications/notification/mail.py +38 -15
  91. mlrun/utils/notifications/notification/slack.py +2 -4
  92. mlrun/utils/notifications/notification/webhook.py +2 -5
  93. mlrun/utils/notifications/notification_pusher.py +1 -1
  94. mlrun/utils/version/version.json +2 -2
  95. {mlrun-1.10.0rc16.dist-info → mlrun-1.10.1rc4.dist-info}/METADATA +51 -50
  96. {mlrun-1.10.0rc16.dist-info → mlrun-1.10.1rc4.dist-info}/RECORD +100 -95
  97. mlrun/api/schemas/__init__.py +0 -259
  98. {mlrun-1.10.0rc16.dist-info → mlrun-1.10.1rc4.dist-info}/WHEEL +0 -0
  99. {mlrun-1.10.0rc16.dist-info → mlrun-1.10.1rc4.dist-info}/entry_points.txt +0 -0
  100. {mlrun-1.10.0rc16.dist-info → mlrun-1.10.1rc4.dist-info}/licenses/LICENSE +0 -0
  101. {mlrun-1.10.0rc16.dist-info → mlrun-1.10.1rc4.dist-info}/top_level.txt +0 -0
mlrun/datastore/base.py CHANGED
@@ -11,11 +11,14 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
+ import datetime
+ import os
+ import os.path
  import tempfile
  import urllib.parse
  from base64 import b64encode
  from copy import copy
- from os import path, remove
+ from types import ModuleType
  from typing import Optional, Union
  from urllib.parse import urlparse

@@ -156,6 +159,195 @@ class DataStore(BaseRemoteClient):
  def get_spark_options(self, path=None):
  return {}

+ @staticmethod
+ def _is_directory_in_range(
+ start_time: Optional[datetime.datetime],
+ end_time: Optional[datetime.datetime],
+ year: int,
+ month: Optional[int] = None,
+ day: Optional[int] = None,
+ hour: Optional[int] = None,
+ **kwargs,
+ ):
+ """Check if a partition directory (year=.., month=.., etc.) is in the time range."""
+ from dateutil.relativedelta import relativedelta
+
+ partition_start = datetime.datetime(
+ year=year,
+ month=month or 1,
+ day=day or 1,
+ hour=hour or 0,
+ tzinfo=start_time.tzinfo if start_time else end_time.tzinfo,
+ )
+ partition_end = (
+ partition_start
+ + relativedelta(
+ years=1 if month is None else 0,
+ months=1 if day is None and month is not None else 0,
+ days=1 if hour is None and day is not None else 0,
+ hours=1 if hour is not None else 0,
+ )
+ - datetime.timedelta(microseconds=1)
+ )
+
+ if (end_time and end_time < partition_start) or (
+ start_time and start_time > partition_end
+ ):
+ return False
+ return True
+
+ @staticmethod
+ def _list_partition_paths_helper(
+ paths: list[str],
+ start_time: Optional[datetime.datetime],
+ end_time: Optional[datetime.datetime],
+ current_path: str,
+ partition_level: str,
+ filesystem,
+ ):
+ directory_split = current_path.rsplit("/", 1)
+ time_unit = None
+ directory_start, directory_end = "", ""
+ if len(directory_split) == 2:
+ directory_start, directory_end = directory_split
+ time_unit = directory_end.split("=")[0] if "=" in directory_end else None
+
+ if not time_unit and directory_end.endswith((".parquet", ".pq")):
+ paths.append(directory_start.rstrip("/"))
+ return
+ elif time_unit and time_unit == partition_level:
+ paths.append(current_path.rstrip("/"))
+ return
+
+ directories = filesystem.ls(current_path, detail=True)
+ if len(directories) == 0:
+ return
+ for directory in directories:
+ current_path = directory["name"]
+ parts = [p for p in current_path.split("/") if "=" in p]
+ kwargs = {}
+ for part in parts:
+ key, value = part.split("=", 1)
+ if value.isdigit():
+ value = int(value)
+ kwargs[key] = value
+ if DataStore._is_directory_in_range(start_time, end_time, **kwargs):
+ DataStore._list_partition_paths_helper(
+ paths,
+ start_time,
+ end_time,
+ current_path,
+ partition_level,
+ filesystem,
+ )
+
+ @staticmethod
+ def _list_partitioned_paths(
+ base_url: str,
+ start_time: Optional[datetime.datetime],
+ end_time: Optional[datetime.datetime],
+ partition_level: str,
+ filesystem,
+ ):
+ paths = []
+ parsed_base_url = urlparse(base_url)
+ base_path = parsed_base_url.path
+
+ if parsed_base_url.scheme not in ["v3io", "v3ios"]:
+ base_path = parsed_base_url.netloc + base_path
+
+ DataStore._list_partition_paths_helper(
+ paths, start_time, end_time, base_path, partition_level, filesystem
+ )
+ paths = [
+ DataStore._reconstruct_path_from_base_url(parsed_base_url, path)
+ for path in paths
+ ]
+ return paths
+
+ @staticmethod
+ def _reconstruct_path_from_base_url(
+ parsed_base_url: urllib.parse.ParseResult, returned_path: str
+ ) -> str:
+ scheme = parsed_base_url.scheme
+ authority = parsed_base_url.netloc
+ returned_path = returned_path.lstrip("/")
+ if scheme == "v3io":
+ return f"{scheme}://{authority}/{returned_path}"
+ else:
+ return f"{scheme}://{returned_path}"
+
+ @staticmethod
+ def _clean_filters_for_partitions(
+ filters: list[list[tuple]],
+ partition_keys: list[str],
+ ):
+ """
+ Remove partition keys from filters.
+
+ :param filters: pandas-style filters
+ Example: [[('year','=',2025),('month','=',11),('timestamp','>',ts1)]]
+ :param partition_keys: partition columns handled via directory
+
+ :return list of list of tuples: cleaned filters without partition keys
+ """
+ cleaned_filters = []
+ for group in filters:
+ new_group = [f for f in group if f[0] not in partition_keys]
+ if new_group:
+ cleaned_filters.append(new_group)
+ return cleaned_filters
+
+ @staticmethod
+ def _read_partitioned_parquet(
+ base_url: str,
+ start_time: Optional[datetime.datetime],
+ end_time: Optional[datetime.datetime],
+ partition_keys: list[str],
+ df_module: ModuleType,
+ filesystem: fsspec.AbstractFileSystem,
+ **kwargs,
+ ):
+ """
+ Reads only the relevant partitions and concatenates the results.
+ Note that partition_keys cannot be empty.
+ """
+ logger.debug(f"Starting partition discovery process for {base_url}")
+
+ paths = DataStore._list_partitioned_paths(
+ base_url,
+ start_time,
+ end_time,
+ partition_keys[-1],
+ filesystem,
+ )
+
+ dfs = []
+ for current_path in paths:
+ try:
+ kwargs["filters"] = DataStore._clean_filters_for_partitions(
+ kwargs["filters"], partition_keys
+ )
+ df = df_module.read_parquet(current_path, **kwargs)
+ logger.debug(
+ "Finished reading DataFrame from subpath",
+ url=current_path,
+ )
+ dfs.append(df)
+ except FileNotFoundError as e:
+ # Skip partitions that don't exist or have no data
+ logger.warning(
+ "Failed to read DataFrame", url=current_path, exception=e
+ )
+
+ final_df = pd.concat(dfs) if dfs else pd.DataFrame()
+ logger.debug(
+ "Finished reading partitioned parquet files",
+ url=base_url,
+ columns=final_df.columns,
+ )
+ return final_df
+
  @staticmethod
  def _parquet_reader(
  df_module,
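
The block above adds hive-style partition pruning to the datastore parquet reader: _list_partitioned_paths walks year=/month=/day=/hour= directories, _is_directory_in_range keeps only directories that overlap the requested time window, and _clean_filters_for_partitions strips the pruned keys from the pandas-style filters. A small illustrative sketch of the two pure helpers (the sample values are invented, not taken from the diff):

    # Illustrative only -- exercises the static helpers added in the hunk above.
    import datetime

    start = datetime.datetime(2025, 11, 15)
    end = datetime.datetime(2025, 12, 1)

    # A month directory (year=2025/month=11) spans 2025-11-01 00:00:00 up to
    # 2025-11-30 23:59:59.999999; it overlaps [start, end], so it is kept.
    DataStore._is_directory_in_range(start, end, year=2025, month=11)  # True
    # year=2025/month=10 ends before `start`, so it is pruned.
    DataStore._is_directory_in_range(start, end, year=2025, month=10)  # False

    # Partition columns are already handled by directory pruning, so they are
    # removed from the pandas-style filters that reach read_parquet.
    filters = [[("year", "=", 2025), ("month", "=", 11), ("value", ">", 3)]]
    DataStore._clean_filters_for_partitions(filters, ["year", "month", "day", "hour"])
    # -> [[("value", ">", 3)]]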
@@ -165,6 +357,7 @@ class DataStore(BaseRemoteClient):
  start_time,
  end_time,
  additional_filters,
+ optimize_discovery,
  ):
  from storey.utils import find_filters, find_partitions

@@ -203,7 +396,10 @@
  )

  if start_time or end_time or additional_filters:
- partitions_time_attributes = find_partitions(url, file_system)
+ partitions_time_attributes, partitions = find_partitions(
+ url, file_system, True
+ )
+ logger.debug("Partitioned parquet read", partitions=partitions)
  set_filters(
  partitions_time_attributes,
  start_time,
@@ -211,8 +407,28 @@
  additional_filters,
  kwargs,
  )
+
  try:
- return df_module.read_parquet(*args, **kwargs)
+ if (
+ optimize_discovery
+ and partitions_time_attributes
+ and DataStore._verify_path_partition_level(
+ urlparse(url).path, partitions
+ )
+ and (start_time or end_time)
+ ):
+ return DataStore._read_partitioned_parquet(
+ url,
+ start_time,
+ end_time,
+ partitions_time_attributes,
+ df_module,
+ file_system,
+ **kwargs,
+ )
+
+ else:
+ return df_module.read_parquet(*args, **kwargs)
  except pyarrow.lib.ArrowInvalid as ex:
  if not str(ex).startswith(
  "Cannot compare timestamp with timezone to timestamp without timezone"
@@ -238,7 +454,24 @@
  additional_filters,
  kwargs,
  )
- return df_module.read_parquet(*args, **kwargs)
+ if (
+ optimize_discovery
+ and partitions_time_attributes
+ and DataStore._verify_path_partition_level(
+ urlparse(url).path, partitions
+ )
+ ):
+ return DataStore._read_partitioned_parquet(
+ url,
+ start_time_inner,
+ end_time_inner,
+ partitions_time_attributes,
+ df_module,
+ file_system,
+ **kwargs,
+ )
+ else:
+ return df_module.read_parquet(*args, **kwargs)
  else:
  return df_module.read_parquet(*args, **kwargs)

@@ -261,6 +494,10 @@
  file_url = self._sanitize_url(url)
  is_csv, is_json, drop_time_column = False, False, False
  file_system = self.filesystem
+
+ # Feature flag optimize partition discovery by providing specific partition levels urls to the parquet reader
+ optimize_discovery = kwargs.pop("optimize_discovery", True)
+
  if file_url.endswith(".csv") or format == "csv":
  is_csv = True
  drop_time_column = False
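
optimize_discovery is read (and removed) from the reader kwargs with a default of True, so the optimized partition discovery is on by default and can be switched off per call. A hedged sketch, assuming the extra keyword is forwarded unchanged from DataItem.as_df down to the datastore parquet reader:

    # Hedged sketch -- assumes **kwargs passed to as_df() reach the parquet reader.
    import datetime
    import mlrun

    item = mlrun.get_dataitem("s3://my-bucket/datasets/events/")  # hypothetical partitioned dataset

    # Default: partition directories are pruned by start/end time before reading.
    df = item.as_df(time_column="timestamp", start_time=datetime.datetime(2025, 11, 1))

    # Opt out and fall back to a single read_parquet over the base path.
    df = item.as_df(
        time_column="timestamp",
        start_time=datetime.datetime(2025, 11, 1),
        optimize_discovery=False,
    )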
@@ -322,6 +559,7 @@
  start_time,
  end_time,
  additional_filters,
+ optimize_discovery,
  )

  elif file_url.endswith(".json") or format == "json":
@@ -347,7 +585,7 @@
  temp_file = tempfile.NamedTemporaryFile(delete=False)
  self.download(self._join(subpath), temp_file.name)
  df = reader(temp_file.name, **kwargs)
- remove(temp_file.name)
+ os.remove(temp_file.name)

  if is_json or is_csv:
  # for parquet file the time filtering is executed in `reader`
@@ -387,6 +625,26 @@
  except ImportError:
  return False

+ @staticmethod
+ def _verify_path_partition_level(base_path: str, partitions: list[str]) -> bool:
+ if not partitions:
+ return False
+
+ path_parts = base_path.strip("/").split("/")
+ path_parts = [part.split("=")[0] for part in path_parts if "=" in part]
+ if "hour" in partitions:
+ hour_index = partitions.index("hour")
+ else:
+ return False
+ for i, part in enumerate(partitions):
+ if not (
+ part in path_parts
+ or part in ["year", "month", "day", "hour"]
+ or i > hour_index
+ ):
+ return False
+ return True
+

  class DataItem:
  """Data input/output class abstracting access to various local/remote data sources
@@ -439,7 +697,7 @@
  @property
  def suffix(self):
  """DataItem suffix (file extension) e.g. '.png'"""
- _, file_ext = path.splitext(self._path)
+ _, file_ext = os.path.splitext(self._path)
  return file_ext

  @property
@@ -548,7 +806,7 @@
  return

  if self._local_path:
- remove(self._local_path)
+ os.remove(self._local_path)
  self._local_path = ""

  def as_df(
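
_verify_path_partition_level (added in the +625 hunk above) is the guard for the optimized read path: it only accepts layouts whose partition keys are either the standard time units (year/month/day/hour, with an hour level present) or already pinned as key=value segments in the base path, since _is_directory_in_range only understands the time units. Illustrative calls with invented values:

    # Illustrative only -- which layouts allow the optimized partition discovery.
    DataStore._verify_path_partition_level(
        "/projects/demo/datasets/events", ["year", "month", "day", "hour"]
    )  # True: every level is a known time unit

    DataStore._verify_path_partition_level(
        "/projects/demo/datasets/events/region=eu",
        ["region", "year", "month", "day", "hour"],
    )  # True: "region" is already fixed in the base path

    DataStore._verify_path_partition_level(
        "/projects/demo/datasets/events", ["region", "year", "month", "day"]
    )  # False: no "hour" level, and "region" is neither a time unit nor in the path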
mlrun/datastore/datastore.py CHANGED
@@ -38,6 +38,8 @@ from ..utils import DB_SCHEMA, RunKeys
  from .base import DataItem, DataStore, HttpStore
  from .filestore import FileStore
  from .inmem import InMemoryStore
+ from .model_provider.huggingface_provider import HuggingFaceProvider
+ from .model_provider.mock_model_provider import MockModelProvider
  from .model_provider.openai_provider import OpenAIProvider
  from .store_resources import get_store_resource, is_store_uri
  from .v3io import V3ioStore
@@ -45,7 +47,7 @@ from .v3io import V3ioStore
  in_memory_store = InMemoryStore()


- def schema_to_store(schema) -> DataStore.__subclasses__():
+ def schema_to_store(schema) -> type[DataStore]:
  # import store classes inside to enable making their dependencies optional (package extras)

  if not schema or schema in get_local_file_schema():
@@ -102,8 +104,11 @@ def schema_to_store(schema) -> DataStore.__subclasses__():
  def schema_to_model_provider(
  schema: str, raise_missing_schema_exception=True
  ) -> type[ModelProvider]:
- # TODO add hugging face and http
- schema_dict = {"openai": OpenAIProvider}
+ schema_dict = {
+ "openai": OpenAIProvider,
+ "huggingface": HuggingFaceProvider,
+ "mock": MockModelProvider,
+ }
  provider_class = schema_dict.get(schema, None)
  if not provider_class:
  if raise_missing_schema_exception:
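
schema_to_model_provider now resolves the huggingface and mock schemes in addition to openai. A short hedged sketch, assuming the function lives in mlrun/datastore/datastore.py as this section suggests:

    # Hedged sketch of the schema-to-provider lookup shown above.
    from mlrun.datastore.datastore import schema_to_model_provider

    schema_to_model_provider("openai")       # -> OpenAIProvider
    schema_to_model_provider("huggingface")  # -> HuggingFaceProvider
    schema_to_model_provider("mock")         # -> MockModelProvider

    # With raise_missing_schema_exception=False an unknown scheme does not raise.
    schema_to_model_provider("unknown", raise_missing_schema_exception=False)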
@@ -247,7 +252,7 @@

  if schema == "ds":
  datastore_profile = datastore_profile_read(url, project_name, secrets)
- secrets = merge(secrets or {}, datastore_profile.secrets() or {})
+ secrets = merge({}, secrets or {}, datastore_profile.secrets() or {})
  url = datastore_profile.url(subpath)
  schema, endpoint, parsed_url = parse_url(url)
  subpath = parsed_url.path
@@ -281,7 +286,7 @@
  endpoint, subpath
  )
  remote_client = remote_client_class(
- self, schema, cache_key, parsed_url.netloc, secrets=secrets, **kwargs
+ self, schema, cache_key, endpoint, secrets=secrets, **kwargs
  )
  if not secrets and not mlrun.config.is_running_as_api():
  cache[cache_key] = remote_client
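
The merge change above is about ownership: mergedeep.merge(destination, *sources) updates the destination in place, so merging the profile secrets into the caller-supplied secrets dict mutated it; merging into a fresh dict leaves the caller's mapping untouched. A minimal sketch of the difference:

    from mergedeep import merge

    caller_secrets = {"A": "1"}
    profile_secrets = {"B": "2"}

    # New form: merge into a fresh dict; caller_secrets is not modified.
    combined = merge({}, caller_secrets, profile_secrets)
    assert caller_secrets == {"A": "1"}

    # Old form (for contrast): the first argument itself is updated in place.
    merge(caller_secrets, profile_secrets)
    assert caller_secrets == {"A": "1", "B": "2"}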
mlrun/datastore/datastore_profile.py CHANGED
@@ -19,6 +19,7 @@ import typing
  from urllib.parse import ParseResult, urlparse

  import pydantic.v1
+ from deprecated import deprecated
  from mergedeep import merge

  import mlrun
@@ -138,6 +139,15 @@ class ConfigProfile(DatastoreProfile):
  return res


+ # TODO: Remove in 1.12.0
+ @deprecated(
+ version="1.10.0",
+ reason=(
+ "This class is deprecated from mlrun 1.10.0, and will be removed in 1.12.0. "
+ "Use `DatastoreProfileKafkaStream` instead."
+ ),
+ category=FutureWarning,
+ )
  class DatastoreProfileKafkaTarget(DatastoreProfile):
  type: str = pydantic.v1.Field("kafka_target")
  _private_attributes = "kwargs_private"
@@ -158,8 +168,8 @@ class DatastoreProfileKafkaTarget(DatastoreProfile):
  return attributes


- class DatastoreProfileKafkaSource(DatastoreProfile):
- type: str = pydantic.v1.Field("kafka_source")
+ class DatastoreProfileKafkaStream(DatastoreProfile):
+ type: str = pydantic.v1.Field("kafka_stream")
  _private_attributes = ("kwargs_private", "sasl_user", "sasl_pass")
  brokers: typing.Union[str, list[str]]
  topics: typing.Union[str, list[str]]
@@ -198,6 +208,19 @@ class DatastoreProfileKafkaSource(DatastoreProfile):
  return attributes


+ # TODO: Remove in 1.12.0
+ @deprecated(
+ version="1.10.0",
+ reason=(
+ "This class is deprecated from mlrun 1.10.0, and will be removed in 1.12.0. "
+ "Use `DatastoreProfileKafkaStream` instead."
+ ),
+ category=FutureWarning,
+ )
+ class DatastoreProfileKafkaSource(DatastoreProfileKafkaStream):
+ type: str = pydantic.v1.Field("kafka_source")
+
+
  class DatastoreProfileV3io(DatastoreProfile):
  type: str = pydantic.v1.Field("v3io")
  v3io_access_key: typing.Optional[str] = None
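
DatastoreProfileKafkaSource is kept only as a deprecated subclass of the new DatastoreProfileKafkaStream (removal planned for 1.12.0). A hedged migration sketch using only the fields visible in this diff (brokers, topics); the name field is assumed from the DatastoreProfile base class:

    # Hedged sketch -- field names outside this diff (e.g. name) are assumptions.
    from mlrun.datastore.datastore_profile import (
        DatastoreProfileKafkaSource,  # deprecated, now emits a FutureWarning
        DatastoreProfileKafkaStream,
    )

    # Before (works until removal in 1.12.0, with a FutureWarning):
    old_profile = DatastoreProfileKafkaSource(
        name="events", brokers="broker:9092", topics=["events"]
    )

    # After:
    new_profile = DatastoreProfileKafkaStream(
        name="events", brokers="broker:9092", topics=["events"]
    )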
@@ -232,7 +255,7 @@ class DatastoreProfileS3(DatastoreProfile):
  if self.secret_key:
  res["AWS_SECRET_ACCESS_KEY"] = self.secret_key
  if self.endpoint_url:
- res["S3_ENDPOINT_URL"] = self.endpoint_url
+ res["AWS_ENDPOINT_URL_S3"] = self.endpoint_url
  if self.force_non_anonymous:
  res["S3_NON_ANONYMOUS"] = self.force_non_anonymous
  if self.profile_name:
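
The S3 profile now publishes a custom endpoint under AWS_ENDPOINT_URL_S3 instead of the previous S3_ENDPOINT_URL key. A hedged sketch of the resulting secrets mapping, limited to fields visible in this hunk (the name field is assumed from the DatastoreProfile base class):

    # Hedged sketch -- shows only keys visible in the hunk above.
    from mlrun.datastore.datastore_profile import DatastoreProfileS3

    profile = DatastoreProfileS3(
        name="minio",
        secret_key="minio-pass",
        endpoint_url="http://minio.default.svc:9000",
    )
    secrets = profile.secrets()
    # secrets now includes:
    #   "AWS_SECRET_ACCESS_KEY": "minio-pass"
    #   "AWS_ENDPOINT_URL_S3":   "http://minio.default.svc:9000"   (was "S3_ENDPOINT_URL")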
@@ -333,7 +356,9 @@ class DatastoreProfileGCS(DatastoreProfile):
  # in gcs the path after schema is starts with bucket, wherefore it should not start with "/".
  subpath = subpath[1:]
  if self.bucket:
- return f"gcs://{self.bucket}/{subpath}"
+ return (
+ f"gcs://{self.bucket}/{subpath}" if subpath else f"gcs://{self.bucket}"
+ )
  else:
  return f"gcs://{subpath}"

@@ -370,7 +395,11 @@ class DatastoreProfileAzureBlob(DatastoreProfile):
  # in azure the path after schema is starts with container, wherefore it should not start with "/".
  subpath = subpath[1:]
  if self.container:
- return f"az://{self.container}/{subpath}"
+ return (
+ f"az://{self.container}/{subpath}"
+ if subpath
+ else f"az://{self.container}"
+ )
  else:
  return f"az://{subpath}"

@@ -486,6 +515,31 @@ class OpenAIProfile(DatastoreProfile):
  return f"{self.type}://{subpath.lstrip('/')}"


+ class HuggingFaceProfile(DatastoreProfile):
+ type: str = pydantic.v1.Field("huggingface")
+ _private_attributes = ("token", "model_kwargs")
+ task: typing.Optional[str] = None
+ token: typing.Optional[str] = None
+ device: typing.Optional[typing.Union[int, str]] = None
+ device_map: typing.Union[str, dict[str, typing.Union[int, str]], None] = None
+ trust_remote_code: bool = None
+ model_kwargs: typing.Optional[dict[str, typing.Any]] = None
+
+ def secrets(self) -> dict:
+ keys = {
+ "HF_TASK": self.task,
+ "HF_TOKEN": self.token,
+ "HF_DEVICE": self.device,
+ "HF_DEVICE_MAP": self.device_map,
+ "HF_TRUST_REMOTE_CODE": self.trust_remote_code,
+ "HF_MODEL_KWARGS": self.model_kwargs,
+ }
+ return {k: v for k, v in keys.items() if v}
+
+ def url(self, subpath):
+ return f"{self.type}://{subpath.lstrip('/')}"
+
+
  _DATASTORE_TYPE_TO_PROFILE_CLASS: dict[str, type[DatastoreProfile]] = {
  "v3io": DatastoreProfileV3io,
  "s3": DatastoreProfileS3,
@@ -493,6 +547,7 @@ _DATASTORE_TYPE_TO_PROFILE_CLASS: dict[str, type[DatastoreProfile]] = {
  "basic": DatastoreProfileBasic,
  "kafka_target": DatastoreProfileKafkaTarget,
  "kafka_source": DatastoreProfileKafkaSource,
+ "kafka_stream": DatastoreProfileKafkaStream,
  "dbfs": DatastoreProfileDBFS,
  "gcs": DatastoreProfileGCS,
  "az": DatastoreProfileAzureBlob,
@@ -500,6 +555,7 @@ _DATASTORE_TYPE_TO_PROFILE_CLASS: dict[str, type[DatastoreProfile]] = {
  "taosws": DatastoreProfileTDEngine,
  "config": ConfigProfile,
  "openai": OpenAIProfile,
+ "huggingface": HuggingFaceProfile,
  }

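
The new HuggingFaceProfile, added above and registered under the huggingface type, maps its optional fields to HF_* secret keys and builds huggingface:// URLs. An illustrative sketch mirroring the class definition shown in this diff (the name field is assumed from the DatastoreProfile base class):

    # Illustrative only -- mirrors the HuggingFaceProfile definition above.
    from mlrun.datastore.datastore_profile import HuggingFaceProfile

    profile = HuggingFaceProfile(
        name="hf-gen",
        task="text-generation",
        token="hf_dummy_token",   # listed in _private_attributes above
        device_map="auto",
    )
    profile.secrets()
    # {"HF_TASK": "text-generation", "HF_TOKEN": "hf_dummy_token", "HF_DEVICE_MAP": "auto"}
    profile.url("gpt2")
    # "huggingface://gpt2"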