mlrun 1.10.0rc16__py3-none-any.whl → 1.10.1rc4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- mlrun/__init__.py +22 -2
- mlrun/artifacts/document.py +6 -1
- mlrun/artifacts/llm_prompt.py +21 -15
- mlrun/artifacts/model.py +3 -3
- mlrun/common/constants.py +9 -0
- mlrun/common/formatters/artifact.py +1 -0
- mlrun/common/model_monitoring/helpers.py +86 -0
- mlrun/common/schemas/__init__.py +2 -0
- mlrun/common/schemas/auth.py +2 -0
- mlrun/common/schemas/function.py +10 -0
- mlrun/common/schemas/hub.py +30 -18
- mlrun/common/schemas/model_monitoring/__init__.py +2 -0
- mlrun/common/schemas/model_monitoring/constants.py +30 -6
- mlrun/common/schemas/model_monitoring/functions.py +13 -4
- mlrun/common/schemas/model_monitoring/model_endpoints.py +11 -0
- mlrun/common/schemas/pipeline.py +1 -1
- mlrun/common/schemas/serving.py +3 -0
- mlrun/common/schemas/workflow.py +1 -0
- mlrun/common/secrets.py +22 -1
- mlrun/config.py +34 -21
- mlrun/datastore/__init__.py +11 -3
- mlrun/datastore/azure_blob.py +162 -47
- mlrun/datastore/base.py +265 -7
- mlrun/datastore/datastore.py +10 -5
- mlrun/datastore/datastore_profile.py +61 -5
- mlrun/datastore/model_provider/huggingface_provider.py +367 -0
- mlrun/datastore/model_provider/mock_model_provider.py +87 -0
- mlrun/datastore/model_provider/model_provider.py +211 -74
- mlrun/datastore/model_provider/openai_provider.py +243 -71
- mlrun/datastore/s3.py +24 -2
- mlrun/datastore/store_resources.py +4 -4
- mlrun/datastore/storeytargets.py +2 -3
- mlrun/datastore/utils.py +15 -3
- mlrun/db/base.py +27 -19
- mlrun/db/httpdb.py +57 -48
- mlrun/db/nopdb.py +25 -10
- mlrun/execution.py +55 -13
- mlrun/hub/__init__.py +15 -0
- mlrun/hub/module.py +181 -0
- mlrun/k8s_utils.py +105 -16
- mlrun/launcher/base.py +13 -6
- mlrun/launcher/local.py +2 -0
- mlrun/model.py +9 -3
- mlrun/model_monitoring/api.py +66 -27
- mlrun/model_monitoring/applications/__init__.py +1 -1
- mlrun/model_monitoring/applications/base.py +388 -138
- mlrun/model_monitoring/applications/context.py +2 -4
- mlrun/model_monitoring/applications/results.py +4 -7
- mlrun/model_monitoring/controller.py +239 -101
- mlrun/model_monitoring/db/_schedules.py +36 -13
- mlrun/model_monitoring/db/_stats.py +4 -3
- mlrun/model_monitoring/db/tsdb/base.py +29 -9
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +4 -5
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +154 -50
- mlrun/model_monitoring/db/tsdb/tdengine/writer_graph_steps.py +51 -0
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +17 -4
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +245 -51
- mlrun/model_monitoring/helpers.py +28 -5
- mlrun/model_monitoring/stream_processing.py +45 -14
- mlrun/model_monitoring/writer.py +220 -1
- mlrun/platforms/__init__.py +3 -2
- mlrun/platforms/iguazio.py +7 -3
- mlrun/projects/operations.py +16 -11
- mlrun/projects/pipelines.py +2 -2
- mlrun/projects/project.py +157 -69
- mlrun/run.py +97 -20
- mlrun/runtimes/__init__.py +18 -0
- mlrun/runtimes/base.py +14 -6
- mlrun/runtimes/daskjob.py +1 -0
- mlrun/runtimes/local.py +5 -2
- mlrun/runtimes/mounts.py +20 -2
- mlrun/runtimes/nuclio/__init__.py +1 -0
- mlrun/runtimes/nuclio/application/application.py +147 -17
- mlrun/runtimes/nuclio/function.py +72 -27
- mlrun/runtimes/nuclio/serving.py +102 -20
- mlrun/runtimes/pod.py +213 -21
- mlrun/runtimes/utils.py +49 -9
- mlrun/secrets.py +54 -13
- mlrun/serving/remote.py +79 -6
- mlrun/serving/routers.py +23 -41
- mlrun/serving/server.py +230 -40
- mlrun/serving/states.py +605 -232
- mlrun/serving/steps.py +62 -0
- mlrun/serving/system_steps.py +136 -81
- mlrun/serving/v2_serving.py +9 -10
- mlrun/utils/helpers.py +215 -83
- mlrun/utils/logger.py +3 -1
- mlrun/utils/notifications/notification/base.py +18 -0
- mlrun/utils/notifications/notification/git.py +2 -4
- mlrun/utils/notifications/notification/mail.py +38 -15
- mlrun/utils/notifications/notification/slack.py +2 -4
- mlrun/utils/notifications/notification/webhook.py +2 -5
- mlrun/utils/notifications/notification_pusher.py +1 -1
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.10.0rc16.dist-info → mlrun-1.10.1rc4.dist-info}/METADATA +51 -50
- {mlrun-1.10.0rc16.dist-info → mlrun-1.10.1rc4.dist-info}/RECORD +100 -95
- mlrun/api/schemas/__init__.py +0 -259
- {mlrun-1.10.0rc16.dist-info → mlrun-1.10.1rc4.dist-info}/WHEEL +0 -0
- {mlrun-1.10.0rc16.dist-info → mlrun-1.10.1rc4.dist-info}/entry_points.txt +0 -0
- {mlrun-1.10.0rc16.dist-info → mlrun-1.10.1rc4.dist-info}/licenses/LICENSE +0 -0
- {mlrun-1.10.0rc16.dist-info → mlrun-1.10.1rc4.dist-info}/top_level.txt +0 -0
mlrun/datastore/base.py
CHANGED
@@ -11,11 +11,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import datetime
+import os
+import os.path
 import tempfile
 import urllib.parse
 from base64 import b64encode
 from copy import copy
-from os import path, remove
+from types import ModuleType
 from typing import Optional, Union
 from urllib.parse import urlparse
 
@@ -156,6 +159,195 @@ class DataStore(BaseRemoteClient):
     def get_spark_options(self, path=None):
         return {}
 
+    @staticmethod
+    def _is_directory_in_range(
+        start_time: Optional[datetime.datetime],
+        end_time: Optional[datetime.datetime],
+        year: int,
+        month: Optional[int] = None,
+        day: Optional[int] = None,
+        hour: Optional[int] = None,
+        **kwargs,
+    ):
+        """Check if a partition directory (year=.., month=.., etc.) is in the time range."""
+        from dateutil.relativedelta import relativedelta
+
+        partition_start = datetime.datetime(
+            year=year,
+            month=month or 1,
+            day=day or 1,
+            hour=hour or 0,
+            tzinfo=start_time.tzinfo if start_time else end_time.tzinfo,
+        )
+        partition_end = (
+            partition_start
+            + relativedelta(
+                years=1 if month is None else 0,
+                months=1 if day is None and month is not None else 0,
+                days=1 if hour is None and day is not None else 0,
+                hours=1 if hour is not None else 0,
+            )
+            - datetime.timedelta(microseconds=1)
+        )
+
+        if (end_time and end_time < partition_start) or (
+            start_time and start_time > partition_end
+        ):
+            return False
+        return True
+
+    @staticmethod
+    def _list_partition_paths_helper(
+        paths: list[str],
+        start_time: Optional[datetime.datetime],
+        end_time: Optional[datetime.datetime],
+        current_path: str,
+        partition_level: str,
+        filesystem,
+    ):
+        directory_split = current_path.rsplit("/", 1)
+        time_unit = None
+        directory_start, directory_end = "", ""
+        if len(directory_split) == 2:
+            directory_start, directory_end = directory_split
+            time_unit = directory_end.split("=")[0] if "=" in directory_end else None
+
+        if not time_unit and directory_end.endswith((".parquet", ".pq")):
+            paths.append(directory_start.rstrip("/"))
+            return
+        elif time_unit and time_unit == partition_level:
+            paths.append(current_path.rstrip("/"))
+            return
+
+        directories = filesystem.ls(current_path, detail=True)
+        if len(directories) == 0:
+            return
+        for directory in directories:
+            current_path = directory["name"]
+            parts = [p for p in current_path.split("/") if "=" in p]
+            kwargs = {}
+            for part in parts:
+                key, value = part.split("=", 1)
+                if value.isdigit():
+                    value = int(value)
+                kwargs[key] = value
+            if DataStore._is_directory_in_range(start_time, end_time, **kwargs):
+                DataStore._list_partition_paths_helper(
+                    paths,
+                    start_time,
+                    end_time,
+                    current_path,
+                    partition_level,
+                    filesystem,
+                )
+
+    @staticmethod
+    def _list_partitioned_paths(
+        base_url: str,
+        start_time: Optional[datetime.datetime],
+        end_time: Optional[datetime.datetime],
+        partition_level: str,
+        filesystem,
+    ):
+        paths = []
+        parsed_base_url = urlparse(base_url)
+        base_path = parsed_base_url.path
+
+        if parsed_base_url.scheme not in ["v3io", "v3ios"]:
+            base_path = parsed_base_url.netloc + base_path
+
+        DataStore._list_partition_paths_helper(
+            paths, start_time, end_time, base_path, partition_level, filesystem
+        )
+        paths = [
+            DataStore._reconstruct_path_from_base_url(parsed_base_url, path)
+            for path in paths
+        ]
+        return paths
+
+    @staticmethod
+    def _reconstruct_path_from_base_url(
+        parsed_base_url: urllib.parse.ParseResult, returned_path: str
+    ) -> str:
+        scheme = parsed_base_url.scheme
+        authority = parsed_base_url.netloc
+        returned_path = returned_path.lstrip("/")
+        if scheme == "v3io":
+            return f"{scheme}://{authority}/{returned_path}"
+        else:
+            return f"{scheme}://{returned_path}"
+
+    @staticmethod
+    def _clean_filters_for_partitions(
+        filters: list[list[tuple]],
+        partition_keys: list[str],
+    ):
+        """
+        Remove partition keys from filters.
+
+        :param filters: pandas-style filters
+            Example: [[('year','=',2025),('month','=',11),('timestamp','>',ts1)]]
+        :param partition_keys: partition columns handled via directory
+
+        :return list of list of tuples: cleaned filters without partition keys
+        """
+        cleaned_filters = []
+        for group in filters:
+            new_group = [f for f in group if f[0] not in partition_keys]
+            if new_group:
+                cleaned_filters.append(new_group)
+        return cleaned_filters
+
+    @staticmethod
+    def _read_partitioned_parquet(
+        base_url: str,
+        start_time: Optional[datetime.datetime],
+        end_time: Optional[datetime.datetime],
+        partition_keys: list[str],
+        df_module: ModuleType,
+        filesystem: fsspec.AbstractFileSystem,
+        **kwargs,
+    ):
+        """
+        Reads only the relevant partitions and concatenates the results.
+        Note that partition_keys cannot be empty.
+        """
+        logger.debug(f"Starting partition discovery process for {base_url}")
+
+        paths = DataStore._list_partitioned_paths(
+            base_url,
+            start_time,
+            end_time,
+            partition_keys[-1],
+            filesystem,
+        )
+
+        dfs = []
+        for current_path in paths:
+            try:
+                kwargs["filters"] = DataStore._clean_filters_for_partitions(
+                    kwargs["filters"], partition_keys
+                )
+                df = df_module.read_parquet(current_path, **kwargs)
+                logger.debug(
+                    "Finished reading DataFrame from subpath",
+                    url=current_path,
+                )
+                dfs.append(df)
+            except FileNotFoundError as e:
+                # Skip partitions that don't exist or have no data
+                logger.warning(
+                    "Failed to read DataFrame", url=current_path, exception=e
+                )
+
+        final_df = pd.concat(dfs) if dfs else pd.DataFrame()
+        logger.debug(
+            "Finished reading partitioned parquet files",
+            url=base_url,
+            columns=final_df.columns,
+        )
+        return final_df
+
     @staticmethod
     def _parquet_reader(
         df_module,
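The two pure pieces of this logic are easiest to see on a concrete Hive-style layout. The standalone sketch below is not part of the package and deliberately avoids importing mlrun; it repeats the same boundary arithmetic as `_is_directory_in_range` and the same filter cleanup as `_clean_filters_for_partitions`, with all names and values purely illustrative.

import datetime
from dateutil.relativedelta import relativedelta

def partition_bounds(year, month=None, day=None, hour=None, tz=datetime.timezone.utc):
    # A year=/month=/day=/hour= directory covers one unit of its finest level,
    # ending one microsecond before the next partition starts.
    start = datetime.datetime(year, month or 1, day or 1, hour or 0, tzinfo=tz)
    end = start + relativedelta(
        years=1 if month is None else 0,
        months=1 if day is None and month is not None else 0,
        days=1 if hour is None and day is not None else 0,
        hours=1 if hour is not None else 0,
    ) - datetime.timedelta(microseconds=1)
    return start, end

start, end = partition_bounds(2025, 11)
print(start, "to", end)  # 2025-11-01 00:00:00+00:00 to 2025-11-30 23:59:59.999999+00:00

# Filter entries that the directory layout already guarantees are stripped before
# the remaining filters are handed to read_parquet.
filters = [[("year", "=", 2025), ("month", "=", 11), ("value", ">", 0.5)]]
partition_keys = ["year", "month", "day", "hour"]
print([[f for f in group if f[0] not in partition_keys] for group in filters])
# [[('value', '>', 0.5)]]

A directory such as year=2025/month=11 is kept whenever this span overlaps the requested start/end window; anything outside it is never listed further, which is what saves the recursive ls calls.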
@@ -165,6 +357,7 @@ class DataStore(BaseRemoteClient):
         start_time,
         end_time,
         additional_filters,
+        optimize_discovery,
     ):
         from storey.utils import find_filters, find_partitions
 
@@ -203,7 +396,10 @@ class DataStore(BaseRemoteClient):
                )
 
            if start_time or end_time or additional_filters:
-                partitions_time_attributes = find_partitions(url, file_system)
+                partitions_time_attributes, partitions = find_partitions(
+                    url, file_system, True
+                )
+                logger.debug("Partitioned parquet read", partitions=partitions)
                set_filters(
                    partitions_time_attributes,
                    start_time,
@@ -211,8 +407,28 @@ class DataStore(BaseRemoteClient):
                    additional_filters,
                    kwargs,
                )
+
                try:
-                    return df_module.read_parquet(*args, **kwargs)
+                    if (
+                        optimize_discovery
+                        and partitions_time_attributes
+                        and DataStore._verify_path_partition_level(
+                            urlparse(url).path, partitions
+                        )
+                        and (start_time or end_time)
+                    ):
+                        return DataStore._read_partitioned_parquet(
+                            url,
+                            start_time,
+                            end_time,
+                            partitions_time_attributes,
+                            df_module,
+                            file_system,
+                            **kwargs,
+                        )
+
+                    else:
+                        return df_module.read_parquet(*args, **kwargs)
                except pyarrow.lib.ArrowInvalid as ex:
                    if not str(ex).startswith(
                        "Cannot compare timestamp with timezone to timestamp without timezone"
@@ -238,7 +454,24 @@ class DataStore(BaseRemoteClient):
                        additional_filters,
                        kwargs,
                    )
-                    return df_module.read_parquet(*args, **kwargs)
+                    if (
+                        optimize_discovery
+                        and partitions_time_attributes
+                        and DataStore._verify_path_partition_level(
+                            urlparse(url).path, partitions
+                        )
+                    ):
+                        return DataStore._read_partitioned_parquet(
+                            url,
+                            start_time_inner,
+                            end_time_inner,
+                            partitions_time_attributes,
+                            df_module,
+                            file_system,
+                            **kwargs,
+                        )
+                    else:
+                        return df_module.read_parquet(*args, **kwargs)
            else:
                return df_module.read_parquet(*args, **kwargs)
 
@@ -261,6 +494,10 @@ class DataStore(BaseRemoteClient):
         file_url = self._sanitize_url(url)
         is_csv, is_json, drop_time_column = False, False, False
         file_system = self.filesystem
+
+        # Feature flag optimize partition discovery by providing specific partition levels urls to the parquet reader
+        optimize_discovery = kwargs.pop("optimize_discovery", True)
+
         if file_url.endswith(".csv") or format == "csv":
             is_csv = True
             drop_time_column = False
@@ -322,6 +559,7 @@ class DataStore(BaseRemoteClient):
                start_time,
                end_time,
                additional_filters,
+                optimize_discovery,
            )
 
        elif file_url.endswith(".json") or format == "json":
@@ -347,7 +585,7 @@ class DataStore(BaseRemoteClient):
            temp_file = tempfile.NamedTemporaryFile(delete=False)
            self.download(self._join(subpath), temp_file.name)
            df = reader(temp_file.name, **kwargs)
-            remove(temp_file.name)
+            os.remove(temp_file.name)
 
        if is_json or is_csv:
            # for parquet file the time filtering is executed in `reader`
@@ -387,6 +625,26 @@ class DataStore(BaseRemoteClient):
        except ImportError:
            return False
 
+    @staticmethod
+    def _verify_path_partition_level(base_path: str, partitions: list[str]) -> bool:
+        if not partitions:
+            return False
+
+        path_parts = base_path.strip("/").split("/")
+        path_parts = [part.split("=")[0] for part in path_parts if "=" in part]
+        if "hour" in partitions:
+            hour_index = partitions.index("hour")
+        else:
+            return False
+        for i, part in enumerate(partitions):
+            if not (
+                part in path_parts
+                or part in ["year", "month", "day", "hour"]
+                or i > hour_index
+            ):
+                return False
+        return True
+
 
 class DataItem:
     """Data input/output class abstracting access to various local/remote data sources
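Read together with the call sites above, this guard means the shortcut is only taken for datasets partitioned all the way down to hour, where every partition column up to hour is either a time unit or already pinned by a key=value segment in the base path. A standalone sketch of the same predicate, for illustration only (it mirrors the hunk above rather than importing mlrun):

def verify_path_partition_level(base_path: str, partitions: list[str]) -> bool:
    # Same rule as DataStore._verify_path_partition_level in the hunk above.
    if not partitions or "hour" not in partitions:
        return False
    fixed = [p.split("=")[0] for p in base_path.strip("/").split("/") if "=" in p]
    hour_index = partitions.index("hour")
    return all(
        part in fixed or part in ("year", "month", "day", "hour") or i > hour_index
        for i, part in enumerate(partitions)
    )

print(verify_path_partition_level("bucket/data", ["year", "month", "day", "hour"]))  # True
print(verify_path_partition_level("bucket/data/key=a", ["key", "year", "month", "day", "hour"]))  # True
print(verify_path_partition_level("bucket/data", ["year", "month", "day"]))  # False: not partitioned to hour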
@@ -439,7 +697,7 @@ class DataItem:
     @property
     def suffix(self):
         """DataItem suffix (file extension) e.g. '.png'"""
-        _, file_ext = path.splitext(self._path)
+        _, file_ext = os.path.splitext(self._path)
         return file_ext
 
     @property
@@ -548,7 +806,7 @@ class DataItem:
            return
 
        if self._local_path:
-            remove(self._local_path)
+            os.remove(self._local_path)
            self._local_path = ""
 
    def as_df(
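Taken together, the base.py changes let time-filtered reads of hour-partitioned parquet prune whole partition directories before anything is handed to the parquet engine. The behavior is gated by an optimize_discovery flag that the read path pops from its kwargs with a default of True; the sketch below shows how a caller could presumably switch it off through the normal DataItem read path. The URL and column names are placeholders and this has not been verified against the released wheel.

import datetime
import mlrun

# Hypothetical hour-partitioned parquet dataset (year=/month=/day=/hour= layout).
item = mlrun.get_dataitem("s3://my-bucket/metrics/")

window = dict(
    time_column="timestamp",
    start_time=datetime.datetime(2025, 11, 1, tzinfo=datetime.timezone.utc),
    end_time=datetime.datetime(2025, 11, 2, tzinfo=datetime.timezone.utc),
)

df_fast = item.as_df(**window)                            # pruned partition discovery (default)
df_slow = item.as_df(**window, optimize_discovery=False)  # plain read_parquet fallback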
mlrun/datastore/datastore.py
CHANGED
@@ -38,6 +38,8 @@ from ..utils import DB_SCHEMA, RunKeys
 from .base import DataItem, DataStore, HttpStore
 from .filestore import FileStore
 from .inmem import InMemoryStore
+from .model_provider.huggingface_provider import HuggingFaceProvider
+from .model_provider.mock_model_provider import MockModelProvider
 from .model_provider.openai_provider import OpenAIProvider
 from .store_resources import get_store_resource, is_store_uri
 from .v3io import V3ioStore
@@ -45,7 +47,7 @@ from .v3io import V3ioStore
 in_memory_store = InMemoryStore()
 
 
-def schema_to_store(schema) -> DataStore.__subclasses__():
+def schema_to_store(schema) -> type[DataStore]:
     # import store classes inside to enable making their dependencies optional (package extras)
 
     if not schema or schema in get_local_file_schema():
@@ -102,8 +104,11 @@ def schema_to_store(schema) -> DataStore.__subclasses__():
 def schema_to_model_provider(
     schema: str, raise_missing_schema_exception=True
 ) -> type[ModelProvider]:
-
-
+    schema_dict = {
+        "openai": OpenAIProvider,
+        "huggingface": HuggingFaceProvider,
+        "mock": MockModelProvider,
+    }
     provider_class = schema_dict.get(schema, None)
     if not provider_class:
         if raise_missing_schema_exception:
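With the mapping populated, resolving a model-provider class from a URL schema is a plain dict lookup. A rough usage sketch follows; the module path is taken from this diff, and the None return for an unknown schema with the flag disabled is an assumption rather than confirmed behavior.

from mlrun.datastore.datastore import schema_to_model_provider

hf_cls = schema_to_model_provider("huggingface")   # HuggingFaceProvider
mock_cls = schema_to_model_provider("mock")        # MockModelProvider, handy for tests

# With the exception disabled, an unrecognized schema is not treated as fatal here.
maybe_cls = schema_to_model_provider("no-such-provider", raise_missing_schema_exception=False)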
@@ -247,7 +252,7 @@ class StoreManager:
 
         if schema == "ds":
             datastore_profile = datastore_profile_read(url, project_name, secrets)
-            secrets = merge(secrets or {}, datastore_profile.secrets() or {})
+            secrets = merge({}, secrets or {}, datastore_profile.secrets() or {})
             url = datastore_profile.url(subpath)
             schema, endpoint, parsed_url = parse_url(url)
             subpath = parsed_url.path
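The StoreManager change is a mutation fix rather than a behavior change: mergedeep.merge writes into its first argument, so the old call quietly folded the profile's secrets into the caller's secrets dict. Merging into a fresh {} leaves both inputs untouched. A small demonstration of the difference, with illustrative key names:

from mergedeep import merge

caller_secrets = {"AWS_ACCESS_KEY_ID": "abc"}
profile_secrets = {"AWS_SECRET_ACCESS_KEY": "xyz"}

# Old pattern: the caller's dict is the merge destination, so it gets mutated.
merge(caller_secrets, profile_secrets)
print(caller_secrets)  # {'AWS_ACCESS_KEY_ID': 'abc', 'AWS_SECRET_ACCESS_KEY': 'xyz'}

# New pattern: merge into an empty dict and keep the original inputs as they were.
caller_secrets = {"AWS_ACCESS_KEY_ID": "abc"}
merged = merge({}, caller_secrets, profile_secrets)
print(caller_secrets)  # {'AWS_ACCESS_KEY_ID': 'abc'}
print(merged)          # {'AWS_ACCESS_KEY_ID': 'abc', 'AWS_SECRET_ACCESS_KEY': 'xyz'}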
@@ -281,7 +286,7 @@ class StoreManager:
                 endpoint, subpath
             )
             remote_client = remote_client_class(
-                self, schema, cache_key,
+                self, schema, cache_key, endpoint, secrets=secrets, **kwargs
             )
             if not secrets and not mlrun.config.is_running_as_api():
                 cache[cache_key] = remote_client
mlrun/datastore/datastore_profile.py
CHANGED

@@ -19,6 +19,7 @@ import typing
 from urllib.parse import ParseResult, urlparse
 
 import pydantic.v1
+from deprecated import deprecated
 from mergedeep import merge
 
 import mlrun
@@ -138,6 +139,15 @@ class ConfigProfile(DatastoreProfile):
         return res
 
 
+# TODO: Remove in 1.12.0
+@deprecated(
+    version="1.10.0",
+    reason=(
+        "This class is deprecated from mlrun 1.10.0, and will be removed in 1.12.0. "
+        "Use `DatastoreProfileKafkaStream` instead."
+    ),
+    category=FutureWarning,
+)
 class DatastoreProfileKafkaTarget(DatastoreProfile):
     type: str = pydantic.v1.Field("kafka_target")
     _private_attributes = "kwargs_private"
@@ -158,8 +168,8 @@ class DatastoreProfileKafkaTarget(DatastoreProfile):
         return attributes
 
 
-class DatastoreProfileKafkaSource(DatastoreProfile):
-    type: str = pydantic.v1.Field("kafka_source")
+class DatastoreProfileKafkaStream(DatastoreProfile):
+    type: str = pydantic.v1.Field("kafka_stream")
     _private_attributes = ("kwargs_private", "sasl_user", "sasl_pass")
     brokers: typing.Union[str, list[str]]
     topics: typing.Union[str, list[str]]
@@ -198,6 +208,19 @@ class DatastoreProfileKafkaSource(DatastoreProfile):
         return attributes
 
 
+# TODO: Remove in 1.12.0
+@deprecated(
+    version="1.10.0",
+    reason=(
+        "This class is deprecated from mlrun 1.10.0, and will be removed in 1.12.0. "
+        "Use `DatastoreProfileKafkaStream` instead."
+    ),
+    category=FutureWarning,
+)
+class DatastoreProfileKafkaSource(DatastoreProfileKafkaStream):
+    type: str = pydantic.v1.Field("kafka_source")
+
+
 class DatastoreProfileV3io(DatastoreProfile):
     type: str = pydantic.v1.Field("v3io")
     v3io_access_key: typing.Optional[str] = None
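The net effect of this block: DatastoreProfileKafkaTarget and DatastoreProfileKafkaSource now carry a FutureWarning and are scheduled for removal in 1.12.0, with DatastoreProfileKafkaStream as the single replacement; the source class is kept as a thin subclass so stored profiles keep loading. A migration sketch, assuming the constructor keywords match the fields shown in this diff and with illustrative broker and topic names:

from mlrun.datastore.datastore_profile import (
    DatastoreProfileKafkaSource,  # deprecated, emits FutureWarning
    DatastoreProfileKafkaStream,  # replacement
)

# Before (still works during the deprecation window, but warns):
old_profile = DatastoreProfileKafkaSource(
    name="events", brokers="broker:9092", topics=["monitoring-events"]
)

# After:
new_profile = DatastoreProfileKafkaStream(
    name="events", brokers="broker:9092", topics=["monitoring-events"]
)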
@@ -232,7 +255,7 @@ class DatastoreProfileS3(DatastoreProfile):
         if self.secret_key:
             res["AWS_SECRET_ACCESS_KEY"] = self.secret_key
         if self.endpoint_url:
-            res["
+            res["AWS_ENDPOINT_URL_S3"] = self.endpoint_url
         if self.force_non_anonymous:
             res["S3_NON_ANONYMOUS"] = self.force_non_anonymous
         if self.profile_name:
@@ -333,7 +356,9 @@ class DatastoreProfileGCS(DatastoreProfile):
             # in gcs the path after schema is starts with bucket, wherefore it should not start with "/".
             subpath = subpath[1:]
         if self.bucket:
-            return
+            return (
+                f"gcs://{self.bucket}/{subpath}" if subpath else f"gcs://{self.bucket}"
+            )
         else:
             return f"gcs://{subpath}"
@@ -370,7 +395,11 @@ class DatastoreProfileAzureBlob(DatastoreProfile):
             # in azure the path after schema is starts with container, wherefore it should not start with "/".
             subpath = subpath[1:]
         if self.container:
-            return
+            return (
+                f"az://{self.container}/{subpath}"
+                if subpath
+                else f"az://{self.container}"
+            )
         else:
             return f"az://{subpath}"
 
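Both url() fixes address the same edge case: when the profile already pins a bucket or container and the caller passes an empty subpath, the returned URL is now just the bucket or container root rather than a malformed path. Expected outputs under that reading, with profile and bucket names purely illustrative:

from mlrun.datastore.datastore_profile import DatastoreProfileAzureBlob, DatastoreProfileGCS

gcs = DatastoreProfileGCS(name="lake", bucket="my-bucket")
az = DatastoreProfileAzureBlob(name="blobs", container="my-container")

print(gcs.url("/data/part.parquet"))  # gcs://my-bucket/data/part.parquet
print(gcs.url(""))                    # gcs://my-bucket
print(az.url("/data/part.parquet"))   # az://my-container/data/part.parquet
print(az.url(""))                     # az://my-container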
@@ -486,6 +515,31 @@ class OpenAIProfile(DatastoreProfile):
         return f"{self.type}://{subpath.lstrip('/')}"
 
 
+class HuggingFaceProfile(DatastoreProfile):
+    type: str = pydantic.v1.Field("huggingface")
+    _private_attributes = ("token", "model_kwargs")
+    task: typing.Optional[str] = None
+    token: typing.Optional[str] = None
+    device: typing.Optional[typing.Union[int, str]] = None
+    device_map: typing.Union[str, dict[str, typing.Union[int, str]], None] = None
+    trust_remote_code: bool = None
+    model_kwargs: typing.Optional[dict[str, typing.Any]] = None
+
+    def secrets(self) -> dict:
+        keys = {
+            "HF_TASK": self.task,
+            "HF_TOKEN": self.token,
+            "HF_DEVICE": self.device,
+            "HF_DEVICE_MAP": self.device_map,
+            "HF_TRUST_REMOTE_CODE": self.trust_remote_code,
+            "HF_MODEL_KWARGS": self.model_kwargs,
+        }
+        return {k: v for k, v in keys.items() if v}
+
+    def url(self, subpath):
+        return f"{self.type}://{subpath.lstrip('/')}"
+
+
 _DATASTORE_TYPE_TO_PROFILE_CLASS: dict[str, type[DatastoreProfile]] = {
     "v3io": DatastoreProfileV3io,
     "s3": DatastoreProfileS3,
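HuggingFaceProfile follows the same shape as OpenAIProfile: whatever fields are set get exported as HF_* secrets for the provider to pick up, and url() only contributes the huggingface:// prefix. A usage sketch, with the profile name, token, and model id purely illustrative:

from mlrun.datastore.datastore_profile import HuggingFaceProfile

profile = HuggingFaceProfile(
    name="hf-text-gen",
    task="text-generation",
    token="hf_xxx",               # placeholder token
    device_map="auto",
    model_kwargs={"torch_dtype": "auto"},
)

print(profile.secrets())
# {'HF_TASK': 'text-generation', 'HF_TOKEN': 'hf_xxx',
#  'HF_DEVICE_MAP': 'auto', 'HF_MODEL_KWARGS': {'torch_dtype': 'auto'}}

print(profile.url("/gpt2"))  # huggingface://gpt2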
@@ -493,6 +547,7 @@ _DATASTORE_TYPE_TO_PROFILE_CLASS: dict[str, type[DatastoreProfile]] = {
     "basic": DatastoreProfileBasic,
     "kafka_target": DatastoreProfileKafkaTarget,
     "kafka_source": DatastoreProfileKafkaSource,
+    "kafka_stream": DatastoreProfileKafkaStream,
     "dbfs": DatastoreProfileDBFS,
     "gcs": DatastoreProfileGCS,
     "az": DatastoreProfileAzureBlob,
@@ -500,6 +555,7 @@ _DATASTORE_TYPE_TO_PROFILE_CLASS: dict[str, type[DatastoreProfile]] = {
     "taosws": DatastoreProfileTDEngine,
     "config": ConfigProfile,
     "openai": OpenAIProfile,
+    "huggingface": HuggingFaceProfile,
 }
 
 