mlrun-1.7.1rc10-py3-none-any.whl → mlrun-1.8.0rc11-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic. Click here for more details.
- mlrun/__init__.py +23 -21
- mlrun/__main__.py +3 -3
- mlrun/alerts/alert.py +148 -14
- mlrun/artifacts/__init__.py +2 -3
- mlrun/artifacts/base.py +55 -12
- mlrun/artifacts/dataset.py +16 -16
- mlrun/artifacts/document.py +378 -0
- mlrun/artifacts/manager.py +26 -17
- mlrun/artifacts/model.py +66 -53
- mlrun/common/constants.py +8 -0
- mlrun/common/formatters/__init__.py +1 -0
- mlrun/common/formatters/feature_set.py +1 -0
- mlrun/common/formatters/function.py +1 -0
- mlrun/{model_monitoring/db/stores/base/__init__.py → common/formatters/model_endpoint.py} +16 -1
- mlrun/common/formatters/pipeline.py +1 -2
- mlrun/common/formatters/project.py +9 -0
- mlrun/common/model_monitoring/__init__.py +0 -5
- mlrun/common/model_monitoring/helpers.py +1 -29
- mlrun/common/runtimes/constants.py +1 -2
- mlrun/common/schemas/__init__.py +6 -2
- mlrun/common/schemas/alert.py +111 -19
- mlrun/common/schemas/api_gateway.py +3 -3
- mlrun/common/schemas/artifact.py +11 -7
- mlrun/common/schemas/auth.py +6 -4
- mlrun/common/schemas/background_task.py +7 -7
- mlrun/common/schemas/client_spec.py +2 -3
- mlrun/common/schemas/clusterization_spec.py +2 -2
- mlrun/common/schemas/common.py +53 -3
- mlrun/common/schemas/constants.py +15 -0
- mlrun/common/schemas/datastore_profile.py +1 -1
- mlrun/common/schemas/feature_store.py +9 -9
- mlrun/common/schemas/frontend_spec.py +4 -4
- mlrun/common/schemas/function.py +10 -10
- mlrun/common/schemas/hub.py +1 -1
- mlrun/common/schemas/k8s.py +3 -3
- mlrun/common/schemas/memory_reports.py +3 -3
- mlrun/common/schemas/model_monitoring/__init__.py +2 -1
- mlrun/common/schemas/model_monitoring/constants.py +67 -14
- mlrun/common/schemas/model_monitoring/grafana.py +1 -1
- mlrun/common/schemas/model_monitoring/model_endpoints.py +92 -147
- mlrun/common/schemas/notification.py +24 -3
- mlrun/common/schemas/object.py +1 -1
- mlrun/common/schemas/pagination.py +4 -4
- mlrun/common/schemas/partition.py +137 -0
- mlrun/common/schemas/pipeline.py +2 -2
- mlrun/common/schemas/project.py +25 -17
- mlrun/common/schemas/runs.py +2 -2
- mlrun/common/schemas/runtime_resource.py +5 -5
- mlrun/common/schemas/schedule.py +1 -1
- mlrun/common/schemas/secret.py +1 -1
- mlrun/common/schemas/tag.py +3 -3
- mlrun/common/schemas/workflow.py +5 -5
- mlrun/config.py +68 -10
- mlrun/data_types/__init__.py +0 -2
- mlrun/data_types/data_types.py +1 -0
- mlrun/data_types/infer.py +3 -1
- mlrun/data_types/spark.py +5 -3
- mlrun/data_types/to_pandas.py +11 -2
- mlrun/datastore/__init__.py +2 -2
- mlrun/datastore/alibaba_oss.py +4 -1
- mlrun/datastore/azure_blob.py +4 -1
- mlrun/datastore/base.py +12 -4
- mlrun/datastore/datastore.py +9 -3
- mlrun/datastore/datastore_profile.py +79 -20
- mlrun/datastore/dbfs_store.py +4 -1
- mlrun/datastore/filestore.py +4 -1
- mlrun/datastore/google_cloud_storage.py +4 -1
- mlrun/datastore/hdfs.py +4 -1
- mlrun/datastore/inmem.py +4 -1
- mlrun/datastore/redis.py +4 -1
- mlrun/datastore/s3.py +4 -1
- mlrun/datastore/sources.py +52 -51
- mlrun/datastore/store_resources.py +7 -4
- mlrun/datastore/targets.py +23 -22
- mlrun/datastore/utils.py +2 -2
- mlrun/datastore/v3io.py +4 -1
- mlrun/datastore/vectorstore.py +229 -0
- mlrun/datastore/wasbfs/fs.py +13 -12
- mlrun/db/base.py +213 -83
- mlrun/db/factory.py +0 -3
- mlrun/db/httpdb.py +1265 -387
- mlrun/db/nopdb.py +205 -74
- mlrun/errors.py +2 -2
- mlrun/execution.py +136 -50
- mlrun/feature_store/__init__.py +0 -2
- mlrun/feature_store/api.py +41 -40
- mlrun/feature_store/common.py +9 -9
- mlrun/feature_store/feature_set.py +20 -18
- mlrun/feature_store/feature_vector.py +27 -24
- mlrun/feature_store/retrieval/base.py +14 -9
- mlrun/feature_store/retrieval/job.py +2 -1
- mlrun/feature_store/steps.py +2 -2
- mlrun/features.py +30 -13
- mlrun/frameworks/__init__.py +1 -2
- mlrun/frameworks/_common/__init__.py +1 -2
- mlrun/frameworks/_common/artifacts_library.py +2 -2
- mlrun/frameworks/_common/mlrun_interface.py +10 -6
- mlrun/frameworks/_common/model_handler.py +29 -27
- mlrun/frameworks/_common/producer.py +3 -1
- mlrun/frameworks/_dl_common/__init__.py +1 -2
- mlrun/frameworks/_dl_common/loggers/__init__.py +1 -2
- mlrun/frameworks/_dl_common/loggers/mlrun_logger.py +4 -4
- mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +3 -3
- mlrun/frameworks/_ml_common/__init__.py +1 -2
- mlrun/frameworks/_ml_common/loggers/__init__.py +1 -2
- mlrun/frameworks/_ml_common/model_handler.py +21 -21
- mlrun/frameworks/_ml_common/plans/__init__.py +1 -2
- mlrun/frameworks/_ml_common/plans/confusion_matrix_plan.py +3 -1
- mlrun/frameworks/_ml_common/plans/dataset_plan.py +3 -3
- mlrun/frameworks/_ml_common/plans/roc_curve_plan.py +4 -4
- mlrun/frameworks/auto_mlrun/__init__.py +1 -2
- mlrun/frameworks/auto_mlrun/auto_mlrun.py +22 -15
- mlrun/frameworks/huggingface/__init__.py +1 -2
- mlrun/frameworks/huggingface/model_server.py +9 -9
- mlrun/frameworks/lgbm/__init__.py +47 -44
- mlrun/frameworks/lgbm/callbacks/__init__.py +1 -2
- mlrun/frameworks/lgbm/callbacks/logging_callback.py +4 -2
- mlrun/frameworks/lgbm/callbacks/mlrun_logging_callback.py +4 -2
- mlrun/frameworks/lgbm/mlrun_interfaces/__init__.py +1 -2
- mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +5 -5
- mlrun/frameworks/lgbm/model_handler.py +15 -11
- mlrun/frameworks/lgbm/model_server.py +11 -7
- mlrun/frameworks/lgbm/utils.py +2 -2
- mlrun/frameworks/onnx/__init__.py +1 -2
- mlrun/frameworks/onnx/dataset.py +3 -3
- mlrun/frameworks/onnx/mlrun_interface.py +2 -2
- mlrun/frameworks/onnx/model_handler.py +7 -5
- mlrun/frameworks/onnx/model_server.py +8 -6
- mlrun/frameworks/parallel_coordinates.py +11 -11
- mlrun/frameworks/pytorch/__init__.py +22 -23
- mlrun/frameworks/pytorch/callbacks/__init__.py +1 -2
- mlrun/frameworks/pytorch/callbacks/callback.py +2 -1
- mlrun/frameworks/pytorch/callbacks/logging_callback.py +15 -8
- mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +19 -12
- mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +22 -15
- mlrun/frameworks/pytorch/callbacks_handler.py +36 -30
- mlrun/frameworks/pytorch/mlrun_interface.py +17 -17
- mlrun/frameworks/pytorch/model_handler.py +21 -17
- mlrun/frameworks/pytorch/model_server.py +13 -9
- mlrun/frameworks/sklearn/__init__.py +19 -18
- mlrun/frameworks/sklearn/estimator.py +2 -2
- mlrun/frameworks/sklearn/metric.py +3 -3
- mlrun/frameworks/sklearn/metrics_library.py +8 -6
- mlrun/frameworks/sklearn/mlrun_interface.py +3 -2
- mlrun/frameworks/sklearn/model_handler.py +4 -3
- mlrun/frameworks/tf_keras/__init__.py +11 -12
- mlrun/frameworks/tf_keras/callbacks/__init__.py +1 -2
- mlrun/frameworks/tf_keras/callbacks/logging_callback.py +17 -14
- mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +15 -12
- mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +21 -18
- mlrun/frameworks/tf_keras/model_handler.py +17 -13
- mlrun/frameworks/tf_keras/model_server.py +12 -8
- mlrun/frameworks/xgboost/__init__.py +19 -18
- mlrun/frameworks/xgboost/model_handler.py +13 -9
- mlrun/launcher/base.py +3 -4
- mlrun/launcher/local.py +1 -1
- mlrun/launcher/remote.py +1 -1
- mlrun/lists.py +4 -3
- mlrun/model.py +117 -46
- mlrun/model_monitoring/__init__.py +4 -4
- mlrun/model_monitoring/api.py +72 -59
- mlrun/model_monitoring/applications/_application_steps.py +17 -17
- mlrun/model_monitoring/applications/base.py +165 -6
- mlrun/model_monitoring/applications/context.py +88 -37
- mlrun/model_monitoring/applications/evidently_base.py +0 -1
- mlrun/model_monitoring/applications/histogram_data_drift.py +43 -21
- mlrun/model_monitoring/applications/results.py +55 -3
- mlrun/model_monitoring/controller.py +207 -239
- mlrun/model_monitoring/db/__init__.py +0 -2
- mlrun/model_monitoring/db/_schedules.py +156 -0
- mlrun/model_monitoring/db/_stats.py +189 -0
- mlrun/model_monitoring/db/tsdb/base.py +78 -25
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +61 -6
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +33 -0
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +255 -29
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +1 -0
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +78 -17
- mlrun/model_monitoring/helpers.py +151 -49
- mlrun/model_monitoring/stream_processing.py +99 -283
- mlrun/model_monitoring/tracking_policy.py +10 -3
- mlrun/model_monitoring/writer.py +48 -36
- mlrun/package/__init__.py +3 -6
- mlrun/package/context_handler.py +1 -1
- mlrun/package/packager.py +12 -9
- mlrun/package/packagers/__init__.py +0 -2
- mlrun/package/packagers/default_packager.py +14 -11
- mlrun/package/packagers/numpy_packagers.py +16 -7
- mlrun/package/packagers/pandas_packagers.py +18 -18
- mlrun/package/packagers/python_standard_library_packagers.py +25 -11
- mlrun/package/packagers_manager.py +31 -14
- mlrun/package/utils/__init__.py +0 -3
- mlrun/package/utils/_pickler.py +6 -6
- mlrun/platforms/__init__.py +47 -16
- mlrun/platforms/iguazio.py +4 -1
- mlrun/projects/operations.py +27 -27
- mlrun/projects/pipelines.py +71 -36
- mlrun/projects/project.py +890 -220
- mlrun/run.py +53 -10
- mlrun/runtimes/__init__.py +1 -3
- mlrun/runtimes/base.py +15 -11
- mlrun/runtimes/daskjob.py +9 -9
- mlrun/runtimes/generators.py +2 -1
- mlrun/runtimes/kubejob.py +4 -5
- mlrun/runtimes/mounts.py +572 -0
- mlrun/runtimes/mpijob/__init__.py +0 -2
- mlrun/runtimes/mpijob/abstract.py +7 -6
- mlrun/runtimes/nuclio/api_gateway.py +7 -7
- mlrun/runtimes/nuclio/application/application.py +11 -11
- mlrun/runtimes/nuclio/function.py +19 -17
- mlrun/runtimes/nuclio/serving.py +18 -13
- mlrun/runtimes/pod.py +154 -45
- mlrun/runtimes/remotesparkjob.py +3 -2
- mlrun/runtimes/sparkjob/__init__.py +0 -2
- mlrun/runtimes/sparkjob/spark3job.py +21 -11
- mlrun/runtimes/utils.py +6 -5
- mlrun/serving/merger.py +6 -4
- mlrun/serving/remote.py +18 -17
- mlrun/serving/routers.py +185 -172
- mlrun/serving/server.py +7 -1
- mlrun/serving/states.py +97 -78
- mlrun/serving/utils.py +13 -2
- mlrun/serving/v1_serving.py +3 -2
- mlrun/serving/v2_serving.py +105 -72
- mlrun/track/__init__.py +1 -1
- mlrun/track/tracker.py +2 -2
- mlrun/track/trackers/mlflow_tracker.py +6 -5
- mlrun/utils/async_http.py +1 -1
- mlrun/utils/clones.py +1 -1
- mlrun/utils/helpers.py +63 -19
- mlrun/utils/logger.py +106 -4
- mlrun/utils/notifications/notification/__init__.py +22 -19
- mlrun/utils/notifications/notification/base.py +33 -14
- mlrun/utils/notifications/notification/console.py +6 -6
- mlrun/utils/notifications/notification/git.py +11 -11
- mlrun/utils/notifications/notification/ipython.py +10 -9
- mlrun/utils/notifications/notification/mail.py +176 -0
- mlrun/utils/notifications/notification/slack.py +6 -6
- mlrun/utils/notifications/notification/webhook.py +6 -6
- mlrun/utils/notifications/notification_pusher.py +86 -44
- mlrun/utils/regex.py +11 -2
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.7.1rc10.dist-info → mlrun-1.8.0rc11.dist-info}/METADATA +29 -24
- mlrun-1.8.0rc11.dist-info/RECORD +347 -0
- mlrun/model_monitoring/db/stores/__init__.py +0 -136
- mlrun/model_monitoring/db/stores/base/store.py +0 -213
- mlrun/model_monitoring/db/stores/sqldb/__init__.py +0 -13
- mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +0 -71
- mlrun/model_monitoring/db/stores/sqldb/models/base.py +0 -190
- mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +0 -103
- mlrun/model_monitoring/db/stores/sqldb/models/sqlite.py +0 -40
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +0 -659
- mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +0 -13
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +0 -726
- mlrun/model_monitoring/model_endpoint.py +0 -118
- mlrun-1.7.1rc10.dist-info/RECORD +0 -351
- {mlrun-1.7.1rc10.dist-info → mlrun-1.8.0rc11.dist-info}/LICENSE +0 -0
- {mlrun-1.7.1rc10.dist-info → mlrun-1.8.0rc11.dist-info}/WHEEL +0 -0
- {mlrun-1.7.1rc10.dist-info → mlrun-1.8.0rc11.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.1rc10.dist-info → mlrun-1.8.0rc11.dist-info}/top_level.txt +0 -0
mlrun/datastore/targets.py
CHANGED
|
@@ -396,7 +396,7 @@ class BaseStoreTarget(DataTargetBase):
|
|
|
396
396
|
self,
|
|
397
397
|
name: str = "",
|
|
398
398
|
path=None,
|
|
399
|
-
attributes: dict[str, str] = None,
|
|
399
|
+
attributes: Optional[dict[str, str]] = None,
|
|
400
400
|
after_step=None,
|
|
401
401
|
columns=None,
|
|
402
402
|
partitioned: bool = False,
|
|
@@ -405,8 +405,8 @@ class BaseStoreTarget(DataTargetBase):
|
|
|
405
405
|
time_partitioning_granularity: Optional[str] = None,
|
|
406
406
|
max_events: Optional[int] = None,
|
|
407
407
|
flush_after_seconds: Optional[int] = None,
|
|
408
|
-
storage_options: dict[str, str] = None,
|
|
409
|
-
schema: dict[str, Any] = None,
|
|
408
|
+
storage_options: Optional[dict[str, str]] = None,
|
|
409
|
+
schema: Optional[dict[str, Any]] = None,
|
|
410
410
|
credentials_prefix=None,
|
|
411
411
|
):
|
|
412
412
|
super().__init__(
|
|
@@ -834,16 +834,16 @@ class ParquetTarget(BaseStoreTarget):
|
|
|
834
834
|
self,
|
|
835
835
|
name: str = "",
|
|
836
836
|
path=None,
|
|
837
|
-
attributes: dict[str, str] = None,
|
|
837
|
+
attributes: Optional[dict[str, str]] = None,
|
|
838
838
|
after_step=None,
|
|
839
839
|
columns=None,
|
|
840
|
-
partitioned: bool = None,
|
|
840
|
+
partitioned: Optional[bool] = None,
|
|
841
841
|
key_bucketing_number: Optional[int] = None,
|
|
842
842
|
partition_cols: Optional[list[str]] = None,
|
|
843
843
|
time_partitioning_granularity: Optional[str] = None,
|
|
844
844
|
max_events: Optional[int] = 10000,
|
|
845
845
|
flush_after_seconds: Optional[int] = 900,
|
|
846
|
-
storage_options: dict[str, str] = None,
|
|
846
|
+
storage_options: Optional[dict[str, str]] = None,
|
|
847
847
|
):
|
|
848
848
|
self.path = path
|
|
849
849
|
if partitioned is None:
|
|
@@ -1136,7 +1136,8 @@ class CSVTarget(BaseStoreTarget):
|
|
|
1136
1136
|
import pyspark.sql.functions as funcs
|
|
1137
1137
|
|
|
1138
1138
|
for col_name, col_type in df.dtypes:
|
|
1139
|
-
|
|
1139
|
+
# covers TimestampType and TimestampNTZType, which was added in PySpark 3.4.0
|
|
1140
|
+
if col_type.startswith("timestamp"):
|
|
1140
1141
|
# df.write.csv saves timestamps with millisecond precision, but we want microsecond precision
|
|
1141
1142
|
# for compatibility with storey.
|
|
1142
1143
|
df = df.withColumn(
|
|
@@ -1199,7 +1200,7 @@ class SnowflakeTarget(BaseStoreTarget):
|
|
|
1199
1200
|
self,
|
|
1200
1201
|
name: str = "",
|
|
1201
1202
|
path=None,
|
|
1202
|
-
attributes: dict[str, str] = None,
|
|
1203
|
+
attributes: Optional[dict[str, str]] = None,
|
|
1203
1204
|
after_step=None,
|
|
1204
1205
|
columns=None,
|
|
1205
1206
|
partitioned: bool = False,
|
|
@@ -1208,15 +1209,15 @@ class SnowflakeTarget(BaseStoreTarget):
|
|
|
1208
1209
|
time_partitioning_granularity: Optional[str] = None,
|
|
1209
1210
|
max_events: Optional[int] = None,
|
|
1210
1211
|
flush_after_seconds: Optional[int] = None,
|
|
1211
|
-
storage_options: dict[str, str] = None,
|
|
1212
|
-
schema: dict[str, Any] = None,
|
|
1212
|
+
storage_options: Optional[dict[str, str]] = None,
|
|
1213
|
+
schema: Optional[dict[str, Any]] = None,
|
|
1213
1214
|
credentials_prefix=None,
|
|
1214
|
-
url: str = None,
|
|
1215
|
-
user: str = None,
|
|
1216
|
-
db_schema: str = None,
|
|
1217
|
-
database: str = None,
|
|
1218
|
-
warehouse: str = None,
|
|
1219
|
-
table_name: str = None,
|
|
1215
|
+
url: Optional[str] = None,
|
|
1216
|
+
user: Optional[str] = None,
|
|
1217
|
+
db_schema: Optional[str] = None,
|
|
1218
|
+
database: Optional[str] = None,
|
|
1219
|
+
warehouse: Optional[str] = None,
|
|
1220
|
+
table_name: Optional[str] = None,
|
|
1220
1221
|
):
|
|
1221
1222
|
attributes = attributes or {}
|
|
1222
1223
|
if url:
|
|
@@ -1903,7 +1904,7 @@ class SQLTarget(BaseStoreTarget):
|
|
|
1903
1904
|
self,
|
|
1904
1905
|
name: str = "",
|
|
1905
1906
|
path=None,
|
|
1906
|
-
attributes: dict[str, str] = None,
|
|
1907
|
+
attributes: Optional[dict[str, str]] = None,
|
|
1907
1908
|
after_step=None,
|
|
1908
1909
|
partitioned: bool = False,
|
|
1909
1910
|
key_bucketing_number: Optional[int] = None,
|
|
@@ -1911,16 +1912,16 @@ class SQLTarget(BaseStoreTarget):
|
|
|
1911
1912
|
time_partitioning_granularity: Optional[str] = None,
|
|
1912
1913
|
max_events: Optional[int] = None,
|
|
1913
1914
|
flush_after_seconds: Optional[int] = None,
|
|
1914
|
-
storage_options: dict[str, str] = None,
|
|
1915
|
-
db_url: str = None,
|
|
1916
|
-
table_name: str = None,
|
|
1917
|
-
schema: dict[str, Any] = None,
|
|
1915
|
+
storage_options: Optional[dict[str, str]] = None,
|
|
1916
|
+
db_url: Optional[str] = None,
|
|
1917
|
+
table_name: Optional[str] = None,
|
|
1918
|
+
schema: Optional[dict[str, Any]] = None,
|
|
1918
1919
|
primary_key_column: str = "",
|
|
1919
1920
|
if_exists: str = "append",
|
|
1920
1921
|
create_table: bool = False,
|
|
1921
1922
|
# create_according_to_data: bool = False,
|
|
1922
1923
|
varchar_len: int = 50,
|
|
1923
|
-
parse_dates: list[str] = None,
|
|
1924
|
+
parse_dates: Optional[list[str]] = None,
|
|
1924
1925
|
):
|
|
1925
1926
|
"""
|
|
1926
1927
|
Write to SqlDB as output target for a flow.
|
mlrun/datastore/utils.py
CHANGED
|
@@ -26,7 +26,7 @@ import mlrun.datastore
|
|
|
26
26
|
|
|
27
27
|
|
|
28
28
|
def parse_kafka_url(
|
|
29
|
-
url: str, brokers: typing.Union[list, str] = None
|
|
29
|
+
url: str, brokers: typing.Optional[typing.Union[list, str]] = None
|
|
30
30
|
) -> tuple[str, list]:
|
|
31
31
|
"""Generating Kafka topic and adjusting a list of bootstrap servers.
|
|
32
32
|
|
|
@@ -71,7 +71,7 @@ def upload_tarball(source_dir, target, secrets=None):
|
|
|
71
71
|
|
|
72
72
|
def filter_df_start_end_time(
|
|
73
73
|
df: typing.Union[pd.DataFrame, typing.Iterator[pd.DataFrame]],
|
|
74
|
-
time_column: str = None,
|
|
74
|
+
time_column: typing.Optional[str] = None,
|
|
75
75
|
start_time: pd.Timestamp = None,
|
|
76
76
|
end_time: pd.Timestamp = None,
|
|
77
77
|
) -> typing.Union[pd.DataFrame, typing.Iterator[pd.DataFrame]]:
|
mlrun/datastore/v3io.py
CHANGED
|
@@ -14,6 +14,7 @@
|
|
|
14
14
|
|
|
15
15
|
import time
|
|
16
16
|
from datetime import datetime
|
|
17
|
+
from typing import Optional
|
|
17
18
|
|
|
18
19
|
import fsspec
|
|
19
20
|
import v3io
|
|
@@ -33,7 +34,9 @@ V3IO_DEFAULT_UPLOAD_CHUNK_SIZE = 1024 * 1024 * 10
|
|
|
33
34
|
|
|
34
35
|
|
|
35
36
|
class V3ioStore(DataStore):
|
|
36
|
-
def __init__(
|
|
37
|
+
def __init__(
|
|
38
|
+
self, parent, schema, name, endpoint="", secrets: Optional[dict] = None
|
|
39
|
+
):
|
|
37
40
|
super().__init__(parent, name, schema, endpoint, secrets=secrets)
|
|
38
41
|
self.endpoint = self.endpoint or mlrun.mlconf.v3io_api
|
|
39
42
|
|
|
@@ -0,0 +1,229 @@
|
|
|
1
|
+
# Copyright 2024 Iguazio
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import inspect
|
|
16
|
+
from collections.abc import Iterable
|
|
17
|
+
from typing import Optional, Union
|
|
18
|
+
|
|
19
|
+
from mlrun.artifacts import DocumentArtifact
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _extract_collection_name(vectorstore: "VectorStore") -> str: # noqa: F821
|
|
23
|
+
# List of possible attribute names for collection name
|
|
24
|
+
possible_attributes = ["collection_name", "_collection_name"]
|
|
25
|
+
|
|
26
|
+
for attr in possible_attributes:
|
|
27
|
+
if hasattr(vectorstore, attr):
|
|
28
|
+
collection_name = getattr(vectorstore, attr)
|
|
29
|
+
if collection_name:
|
|
30
|
+
return collection_name
|
|
31
|
+
|
|
32
|
+
store_class = vectorstore.__class__.__name__.lower()
|
|
33
|
+
if store_class == "mongodbatlasvectorsearch":
|
|
34
|
+
return vectorstore.collection.name
|
|
35
|
+
|
|
36
|
+
# If we get here, we couldn't find a valid collection name
|
|
37
|
+
raise ValueError(
|
|
38
|
+
"Failed to extract collection name from the vector store. "
|
|
39
|
+
"Please provide the collection name explicitly. "
|
|
40
|
+
)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class VectorStoreCollection:
    """
    A wrapper class for vector store collections with MLRun integration.

    This class wraps a vector store implementation (like Milvus, Chroma) and provides
    integration with MLRun context for document and artifact management. It delegates
    most operations to the underlying vector store while handling MLRun-specific
    functionality.

    The class implements attribute delegation through __getattr__ and __setattr__,
    allowing direct access to the underlying vector store's methods and attributes
    while maintaining MLRun integration.
    """

    # Attributes that live on the wrapper itself; every other attribute is delegated
    # to the wrapped vector store.
    _WRAPPER_ATTRIBUTES = ("_collection_impl", "_mlrun_context")

    def __init__(
        self,
        mlrun_context: Union["MlrunProject", "MLClientCtx"],  # noqa: F821
        vector_store: "VectorStore",  # noqa: F821
        collection_name: Optional[str] = None,
    ):
        """
        :param mlrun_context:   MLRun project or run context used to update artifacts.
                                When falsy, artifacts are not updated in MLRun.
        :param vector_store:    The vector store object to wrap.
        :param collection_name: Name of the collection. When not given, it is extracted
                                from the vector store (see _extract_collection_name).
        """
        self._collection_impl = vector_store
        self._mlrun_context = mlrun_context
        # NOTE(review): "collection_name" is not a wrapper-owned attribute, so __setattr__
        # forwards this assignment to the wrapped vector store - confirm this is intended.
        self.collection_name = collection_name or _extract_collection_name(vector_store)

    @property
    def __class__(self):
        # Make isinstance() check the wrapped object's class
        return self._collection_impl.__class__

    def __getattr__(self, name):
        # This method is called when an attribute is not found in the usual places.
        # If a wrapper-owned attribute is missing (e.g. the instance was created without
        # running __init__), fail fast instead of recursing through self._collection_impl.
        if name in ("_collection_impl", "_mlrun_context"):
            raise AttributeError(name)
        # Forward the attribute access to _collection_impl
        return getattr(self._collection_impl, name)

    def __setattr__(self, name, value):
        if name in self._WRAPPER_ATTRIBUTES or name in self.__dict__:
            # Use the base class method to avoid recursion
            super().__setattr__(name, value)
        else:
            # Forward the attribute setting to _collection_impl
            setattr(self._collection_impl, name, value)

    def delete(self, *args, **kwargs):
        """
        Delete from the underlying vector store.

        All arguments are passed as-is to the wrapped vector store's ``delete`` method.

        :return: Whatever the wrapped vector store's ``delete`` method returns.
        """
        return self._collection_impl.delete(*args, **kwargs)

    def add_documents(
        self,
        documents: list["Document"],  # noqa: F821
        **kwargs,
    ):
        """
        Add a list of documents to the collection.

        If the instance has an MLRun context, it will update the MLRun artifacts
        associated with the documents.

        Args:
            documents (list[Document]): A list of Document objects to be added.
            **kwargs: Additional keyword arguments to be passed to the underlying
                collection implementation.

        Returns:
            The result of the underlying collection implementation's add_documents method.
        """
        if self._mlrun_context:
            for document in documents:
                # Only documents that carry an MLRun artifact URI in their metadata
                # have an artifact to update.
                mlrun_uri = document.metadata.get(
                    DocumentArtifact.METADATA_ARTIFACT_URI_KEY
                )
                if mlrun_uri:
                    artifact = self._mlrun_context.get_store_resource(mlrun_uri)
                    artifact.collection_add(self.collection_name)
                    self._mlrun_context.update_artifact(artifact)

        return self._collection_impl.add_documents(documents, **kwargs)

    def add_artifacts(
        self, artifacts: list["DocumentArtifact"], splitter=None, **kwargs
    ):
        """
        Add a list of DocumentArtifact objects to the vector store collection.

        Converts artifacts to LangChain documents, adds them to the vector store, and
        updates the MLRun context. If documents are split, the IDs are handled appropriately.

        :param artifacts: List of DocumentArtifact objects to add
        :type artifacts: list[DocumentArtifact]
        :param splitter: Document splitter to break artifacts into smaller chunks.
                         If None, each artifact becomes a single document.
        :type splitter: TextSplitter, optional
        :param kwargs: Additional arguments passed to the underlying add_documents method.
                       Special handling for 'ids' kwarg (any iterable, one ID per artifact):

                       * If provided and document is split, IDs are generated as "{original_id}_{i}"
                         where i starts from 1 (e.g., "doc1_1", "doc1_2", etc.)
                       * If provided and document isn't split, original IDs are used as-is

        :return: List of IDs for all added documents. When no custom IDs are provided:

                 * Without splitting: Vector store generates IDs automatically
                 * With splitting: Vector store generates separate IDs for each chunk

                 When custom IDs are provided:

                 * Without splitting: Uses provided IDs directly
                 * With splitting: Generates sequential IDs as "{original_id}_{i}" for each chunk
        :rtype: list
        :raises ValueError: If 'ids' is not iterable, or its length differs from the number
                            of artifacts.
        """
        all_ids = []
        user_ids = kwargs.pop("ids", None)

        if user_ids:
            if not isinstance(user_ids, Iterable):
                raise ValueError("IDs must be an iterable collection")
            # Materialize so that any iterable (generator, set, ...) supports the len()
            # and positional indexing used below.
            user_ids = list(user_ids)
            if len(user_ids) != len(artifacts):
                raise ValueError(
                    "The number of IDs should match the number of artifacts"
                )
        for index, artifact in enumerate(artifacts):
            documents = artifact.to_langchain_documents(splitter)
            artifact.collection_add(self.collection_name)
            if self._mlrun_context:
                self._mlrun_context.update_artifact(artifact)
            if user_ids:
                num_of_documents = len(documents)
                if num_of_documents > 1:
                    # The artifact was split - derive one ID per chunk from the user's ID
                    ids_to_pass = [
                        f"{user_ids[index]}_{i}" for i in range(1, num_of_documents + 1)
                    ]
                else:
                    ids_to_pass = [user_ids[index]]
                kwargs["ids"] = ids_to_pass
            ids = self._collection_impl.add_documents(documents, **kwargs)
            all_ids.extend(ids)
        return all_ids

    def remove_from_artifact(self, artifact: "DocumentArtifact"):
        """
        Remove the current object from the given artifact's collection and update the artifact.

        Args:
            artifact (DocumentArtifact): The artifact from which the current object should be removed.
        """
        artifact.collection_remove(self.collection_name)
        if self._mlrun_context:
            self._mlrun_context.update_artifact(artifact)

    def _resolve_source_filter(self):
        """
        Pick how documents are selected by source when deleting from the wrapped store.

        Returns:
            A ``(kwarg_name, build_value)`` tuple: the keyword argument to pass to the wrapped
            store's ``delete`` method, and a callable that builds its value from a source.

        Raises:
            NotImplementedError: If the delete operation is not supported for the collection implementation.
        """
        store_class = self._collection_impl.__class__.__name__.lower()
        source_key = DocumentArtifact.METADATA_SOURCE_KEY

        if store_class == "milvus":
            # NOTE(review): the source is interpolated into the expression without escaping;
            # a source containing a single quote would presumably break it - confirm.
            return "expr", lambda source: f"{source_key} == '{source}'"
        if store_class == "chroma":
            return "where", lambda source: {source_key: source}

        # Generic fallback: any store whose delete() accepts a "filter" argument
        delete_method = getattr(self._collection_impl, "delete", None)
        if (
            delete_method is not None
            and "filter" in inspect.signature(delete_method).parameters
        ):
            return "filter", lambda source: {"metadata": {source_key: source}}

        raise NotImplementedError(
            f"delete_artifacts() operation not supported for {store_class}"
        )

    def delete_artifacts(self, artifacts: list["DocumentArtifact"]):
        """
        Delete a list of DocumentArtifact objects from the collection.

        This method removes the specified artifacts from the collection and updates the MLRun context.
        The deletion process varies depending on the type of the underlying collection implementation.
        Every artifact in the list is processed.

        Args:
            artifacts (list[DocumentArtifact]): A list of DocumentArtifact objects to be deleted.

        Returns:
            The value returned by the underlying collection implementation for the last
            deleted artifact, or None if ``artifacts`` is empty.

        Raises:
            NotImplementedError: If the delete operation is not supported for the collection implementation.
                Raised before any artifact is modified.
        """
        artifacts = list(artifacts)
        if not artifacts:
            return None

        # Resolve the deletion strategy up front so that an unsupported store fails
        # before any artifact (or the MLRun context) has been modified.
        kwarg_name, build_value = self._resolve_source_filter()

        result = None
        for artifact in artifacts:
            artifact.collection_remove(self.collection_name)
            if self._mlrun_context:
                self._mlrun_context.update_artifact(artifact)

            result = self._collection_impl.delete(
                **{kwarg_name: build_value(artifact.get_source())}
            )
        return result
|
mlrun/datastore/wasbfs/fs.py
CHANGED
|
@@ -12,6 +12,7 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
|
+
from typing import Optional
|
|
15
16
|
from urllib.parse import urlparse
|
|
16
17
|
|
|
17
18
|
from fsspec import AbstractFileSystem
|
|
@@ -22,23 +23,23 @@ class WasbFS(AbstractFileSystem):
|
|
|
22
23
|
|
|
23
24
|
def __init__(
|
|
24
25
|
self,
|
|
25
|
-
account_name: str = None,
|
|
26
|
-
account_key: str = None,
|
|
27
|
-
connection_string: str = None,
|
|
28
|
-
credential: str = None,
|
|
29
|
-
sas_token: str = None,
|
|
26
|
+
account_name: Optional[str] = None,
|
|
27
|
+
account_key: Optional[str] = None,
|
|
28
|
+
connection_string: Optional[str] = None,
|
|
29
|
+
credential: Optional[str] = None,
|
|
30
|
+
sas_token: Optional[str] = None,
|
|
30
31
|
request_session=None,
|
|
31
|
-
socket_timeout: int = None,
|
|
32
|
-
blocksize: int = None,
|
|
33
|
-
client_id: str = None,
|
|
34
|
-
client_secret: str = None,
|
|
35
|
-
tenant_id: str = None,
|
|
32
|
+
socket_timeout: Optional[int] = None,
|
|
33
|
+
blocksize: Optional[int] = None,
|
|
34
|
+
client_id: Optional[str] = None,
|
|
35
|
+
client_secret: Optional[str] = None,
|
|
36
|
+
tenant_id: Optional[str] = None,
|
|
36
37
|
anon: bool = True,
|
|
37
|
-
location_mode: str = None,
|
|
38
|
+
location_mode: Optional[str] = None,
|
|
38
39
|
loop=None,
|
|
39
40
|
asynchronous: bool = False,
|
|
40
41
|
default_fill_cache: bool = True,
|
|
41
|
-
default_cache_type: str = None,
|
|
42
|
+
default_cache_type: Optional[str] = None,
|
|
42
43
|
**kwargs,
|
|
43
44
|
):
|
|
44
45
|
from adlfs import AzureBlobFileSystem
|