mlrun 1.6.4rc2__py3-none-any.whl → 1.7.0rc20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlrun/__init__.py +11 -1
- mlrun/__main__.py +26 -112
- mlrun/alerts/__init__.py +15 -0
- mlrun/alerts/alert.py +144 -0
- mlrun/api/schemas/__init__.py +5 -4
- mlrun/artifacts/__init__.py +8 -3
- mlrun/artifacts/base.py +46 -257
- mlrun/artifacts/dataset.py +11 -192
- mlrun/artifacts/manager.py +47 -48
- mlrun/artifacts/model.py +31 -159
- mlrun/artifacts/plots.py +23 -380
- mlrun/common/constants.py +69 -0
- mlrun/common/db/sql_session.py +2 -3
- mlrun/common/formatters/__init__.py +19 -0
- mlrun/common/formatters/artifact.py +21 -0
- mlrun/common/formatters/base.py +78 -0
- mlrun/common/formatters/function.py +41 -0
- mlrun/common/formatters/pipeline.py +53 -0
- mlrun/common/formatters/project.py +51 -0
- mlrun/common/helpers.py +1 -2
- mlrun/common/model_monitoring/helpers.py +9 -5
- mlrun/{runtimes → common/runtimes}/constants.py +37 -9
- mlrun/common/schemas/__init__.py +24 -4
- mlrun/common/schemas/alert.py +203 -0
- mlrun/common/schemas/api_gateway.py +148 -0
- mlrun/common/schemas/artifact.py +18 -8
- mlrun/common/schemas/auth.py +11 -5
- mlrun/common/schemas/background_task.py +1 -1
- mlrun/common/schemas/client_spec.py +4 -1
- mlrun/common/schemas/feature_store.py +16 -16
- mlrun/common/schemas/frontend_spec.py +8 -7
- mlrun/common/schemas/function.py +5 -1
- mlrun/common/schemas/hub.py +11 -18
- mlrun/common/schemas/memory_reports.py +2 -2
- mlrun/common/schemas/model_monitoring/__init__.py +18 -3
- mlrun/common/schemas/model_monitoring/constants.py +83 -26
- mlrun/common/schemas/model_monitoring/grafana.py +13 -9
- mlrun/common/schemas/model_monitoring/model_endpoints.py +99 -16
- mlrun/common/schemas/notification.py +4 -4
- mlrun/common/schemas/object.py +2 -2
- mlrun/{runtimes/mpijob/v1alpha1.py → common/schemas/pagination.py} +10 -13
- mlrun/common/schemas/pipeline.py +1 -10
- mlrun/common/schemas/project.py +24 -23
- mlrun/common/schemas/runtime_resource.py +8 -12
- mlrun/common/schemas/schedule.py +3 -3
- mlrun/common/schemas/tag.py +1 -2
- mlrun/common/schemas/workflow.py +2 -2
- mlrun/common/types.py +7 -1
- mlrun/config.py +54 -17
- mlrun/data_types/to_pandas.py +10 -12
- mlrun/datastore/__init__.py +5 -8
- mlrun/datastore/alibaba_oss.py +130 -0
- mlrun/datastore/azure_blob.py +17 -5
- mlrun/datastore/base.py +62 -39
- mlrun/datastore/datastore.py +28 -9
- mlrun/datastore/datastore_profile.py +146 -20
- mlrun/datastore/filestore.py +0 -1
- mlrun/datastore/google_cloud_storage.py +6 -2
- mlrun/datastore/hdfs.py +56 -0
- mlrun/datastore/inmem.py +2 -2
- mlrun/datastore/redis.py +6 -2
- mlrun/datastore/s3.py +9 -0
- mlrun/datastore/snowflake_utils.py +43 -0
- mlrun/datastore/sources.py +201 -96
- mlrun/datastore/spark_utils.py +1 -2
- mlrun/datastore/store_resources.py +7 -7
- mlrun/datastore/targets.py +358 -104
- mlrun/datastore/utils.py +72 -58
- mlrun/datastore/v3io.py +5 -1
- mlrun/db/base.py +185 -35
- mlrun/db/factory.py +1 -1
- mlrun/db/httpdb.py +614 -179
- mlrun/db/nopdb.py +210 -26
- mlrun/errors.py +12 -1
- mlrun/execution.py +41 -24
- mlrun/feature_store/__init__.py +0 -2
- mlrun/feature_store/api.py +40 -72
- mlrun/feature_store/common.py +1 -1
- mlrun/feature_store/feature_set.py +76 -55
- mlrun/feature_store/feature_vector.py +28 -30
- mlrun/feature_store/ingestion.py +7 -6
- mlrun/feature_store/retrieval/base.py +16 -11
- mlrun/feature_store/retrieval/conversion.py +11 -13
- mlrun/feature_store/retrieval/dask_merger.py +2 -0
- mlrun/feature_store/retrieval/job.py +9 -3
- mlrun/feature_store/retrieval/local_merger.py +2 -0
- mlrun/feature_store/retrieval/spark_merger.py +34 -24
- mlrun/feature_store/steps.py +37 -34
- mlrun/features.py +9 -20
- mlrun/frameworks/_common/artifacts_library.py +9 -9
- mlrun/frameworks/_common/mlrun_interface.py +5 -5
- mlrun/frameworks/_common/model_handler.py +48 -48
- mlrun/frameworks/_common/plan.py +2 -3
- mlrun/frameworks/_common/producer.py +3 -4
- mlrun/frameworks/_common/utils.py +5 -5
- mlrun/frameworks/_dl_common/loggers/logger.py +6 -7
- mlrun/frameworks/_dl_common/loggers/mlrun_logger.py +9 -9
- mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +23 -47
- mlrun/frameworks/_ml_common/artifacts_library.py +1 -2
- mlrun/frameworks/_ml_common/loggers/logger.py +3 -4
- mlrun/frameworks/_ml_common/loggers/mlrun_logger.py +4 -5
- mlrun/frameworks/_ml_common/model_handler.py +24 -24
- mlrun/frameworks/_ml_common/pkl_model_server.py +2 -2
- mlrun/frameworks/_ml_common/plan.py +1 -1
- mlrun/frameworks/_ml_common/plans/calibration_curve_plan.py +2 -3
- mlrun/frameworks/_ml_common/plans/confusion_matrix_plan.py +2 -3
- mlrun/frameworks/_ml_common/plans/dataset_plan.py +3 -3
- mlrun/frameworks/_ml_common/plans/feature_importance_plan.py +3 -3
- mlrun/frameworks/_ml_common/plans/roc_curve_plan.py +4 -4
- mlrun/frameworks/_ml_common/utils.py +4 -4
- mlrun/frameworks/auto_mlrun/auto_mlrun.py +9 -9
- mlrun/frameworks/huggingface/model_server.py +4 -4
- mlrun/frameworks/lgbm/__init__.py +33 -33
- mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
- mlrun/frameworks/lgbm/callbacks/logging_callback.py +4 -5
- mlrun/frameworks/lgbm/callbacks/mlrun_logging_callback.py +4 -5
- mlrun/frameworks/lgbm/mlrun_interfaces/booster_mlrun_interface.py +1 -3
- mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +6 -6
- mlrun/frameworks/lgbm/model_handler.py +10 -10
- mlrun/frameworks/lgbm/model_server.py +6 -6
- mlrun/frameworks/lgbm/utils.py +5 -5
- mlrun/frameworks/onnx/dataset.py +8 -8
- mlrun/frameworks/onnx/mlrun_interface.py +3 -3
- mlrun/frameworks/onnx/model_handler.py +6 -6
- mlrun/frameworks/onnx/model_server.py +7 -7
- mlrun/frameworks/parallel_coordinates.py +4 -3
- mlrun/frameworks/pytorch/__init__.py +18 -18
- mlrun/frameworks/pytorch/callbacks/callback.py +4 -5
- mlrun/frameworks/pytorch/callbacks/logging_callback.py +17 -17
- mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +11 -11
- mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +23 -29
- mlrun/frameworks/pytorch/callbacks_handler.py +38 -38
- mlrun/frameworks/pytorch/mlrun_interface.py +20 -20
- mlrun/frameworks/pytorch/model_handler.py +17 -17
- mlrun/frameworks/pytorch/model_server.py +7 -7
- mlrun/frameworks/sklearn/__init__.py +13 -13
- mlrun/frameworks/sklearn/estimator.py +4 -4
- mlrun/frameworks/sklearn/metrics_library.py +14 -14
- mlrun/frameworks/sklearn/mlrun_interface.py +3 -6
- mlrun/frameworks/sklearn/model_handler.py +2 -2
- mlrun/frameworks/tf_keras/__init__.py +10 -7
- mlrun/frameworks/tf_keras/callbacks/logging_callback.py +15 -15
- mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +11 -11
- mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +19 -23
- mlrun/frameworks/tf_keras/mlrun_interface.py +9 -11
- mlrun/frameworks/tf_keras/model_handler.py +14 -14
- mlrun/frameworks/tf_keras/model_server.py +6 -6
- mlrun/frameworks/xgboost/__init__.py +13 -13
- mlrun/frameworks/xgboost/model_handler.py +6 -6
- mlrun/k8s_utils.py +14 -16
- mlrun/launcher/__init__.py +1 -1
- mlrun/launcher/base.py +16 -15
- mlrun/launcher/client.py +8 -6
- mlrun/launcher/factory.py +1 -1
- mlrun/launcher/local.py +17 -11
- mlrun/launcher/remote.py +16 -10
- mlrun/lists.py +7 -6
- mlrun/model.py +238 -73
- mlrun/model_monitoring/__init__.py +1 -1
- mlrun/model_monitoring/api.py +138 -315
- mlrun/model_monitoring/application.py +5 -296
- mlrun/model_monitoring/applications/__init__.py +24 -0
- mlrun/model_monitoring/applications/_application_steps.py +157 -0
- mlrun/model_monitoring/applications/base.py +282 -0
- mlrun/model_monitoring/applications/context.py +214 -0
- mlrun/model_monitoring/applications/evidently_base.py +211 -0
- mlrun/model_monitoring/applications/histogram_data_drift.py +349 -0
- mlrun/model_monitoring/applications/results.py +99 -0
- mlrun/model_monitoring/controller.py +104 -84
- mlrun/model_monitoring/controller_handler.py +13 -5
- mlrun/model_monitoring/db/__init__.py +18 -0
- mlrun/model_monitoring/{stores → db/stores}/__init__.py +43 -36
- mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
- mlrun/model_monitoring/{stores/model_endpoint_store.py → db/stores/base/store.py} +64 -40
- mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
- mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +71 -0
- mlrun/model_monitoring/{stores → db/stores/sqldb}/models/base.py +109 -5
- mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +88 -0
- mlrun/model_monitoring/{stores/models/mysql.py → db/stores/sqldb/models/sqlite.py} +19 -13
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +684 -0
- mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
- mlrun/model_monitoring/{stores/kv_model_endpoint_store.py → db/stores/v3io_kv/kv_store.py} +310 -165
- mlrun/model_monitoring/db/tsdb/__init__.py +100 -0
- mlrun/model_monitoring/db/tsdb/base.py +329 -0
- mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
- mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +240 -0
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +45 -0
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +397 -0
- mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +117 -0
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +630 -0
- mlrun/model_monitoring/evidently_application.py +6 -118
- mlrun/model_monitoring/features_drift_table.py +134 -106
- mlrun/model_monitoring/helpers.py +127 -28
- mlrun/model_monitoring/metrics/__init__.py +13 -0
- mlrun/model_monitoring/metrics/histogram_distance.py +127 -0
- mlrun/model_monitoring/model_endpoint.py +3 -2
- mlrun/model_monitoring/prometheus.py +1 -4
- mlrun/model_monitoring/stream_processing.py +62 -231
- mlrun/model_monitoring/tracking_policy.py +9 -2
- mlrun/model_monitoring/writer.py +152 -124
- mlrun/package/__init__.py +6 -6
- mlrun/package/context_handler.py +5 -5
- mlrun/package/packager.py +7 -7
- mlrun/package/packagers/default_packager.py +6 -6
- mlrun/package/packagers/numpy_packagers.py +15 -15
- mlrun/package/packagers/pandas_packagers.py +5 -5
- mlrun/package/packagers/python_standard_library_packagers.py +10 -10
- mlrun/package/packagers_manager.py +19 -23
- mlrun/package/utils/_formatter.py +6 -6
- mlrun/package/utils/_pickler.py +2 -2
- mlrun/package/utils/_supported_format.py +4 -4
- mlrun/package/utils/log_hint_utils.py +2 -2
- mlrun/package/utils/type_hint_utils.py +4 -9
- mlrun/platforms/__init__.py +11 -10
- mlrun/platforms/iguazio.py +24 -203
- mlrun/projects/operations.py +35 -21
- mlrun/projects/pipelines.py +68 -99
- mlrun/projects/project.py +830 -266
- mlrun/render.py +3 -11
- mlrun/run.py +162 -166
- mlrun/runtimes/__init__.py +62 -7
- mlrun/runtimes/base.py +39 -32
- mlrun/runtimes/daskjob.py +8 -8
- mlrun/runtimes/databricks_job/databricks_cancel_task.py +1 -1
- mlrun/runtimes/databricks_job/databricks_runtime.py +7 -7
- mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
- mlrun/runtimes/funcdoc.py +0 -28
- mlrun/runtimes/function_reference.py +1 -1
- mlrun/runtimes/kubejob.py +28 -122
- mlrun/runtimes/local.py +6 -3
- mlrun/runtimes/mpijob/__init__.py +0 -20
- mlrun/runtimes/mpijob/abstract.py +9 -10
- mlrun/runtimes/mpijob/v1.py +1 -1
- mlrun/{model_monitoring/stores/models/sqlite.py → runtimes/nuclio/__init__.py} +7 -9
- mlrun/runtimes/nuclio/api_gateway.py +709 -0
- mlrun/runtimes/nuclio/application/__init__.py +15 -0
- mlrun/runtimes/nuclio/application/application.py +523 -0
- mlrun/runtimes/nuclio/application/reverse_proxy.go +95 -0
- mlrun/runtimes/{function.py → nuclio/function.py} +112 -73
- mlrun/runtimes/{nuclio.py → nuclio/nuclio.py} +6 -6
- mlrun/runtimes/{serving.py → nuclio/serving.py} +45 -51
- mlrun/runtimes/pod.py +286 -88
- mlrun/runtimes/remotesparkjob.py +2 -2
- mlrun/runtimes/sparkjob/spark3job.py +51 -34
- mlrun/runtimes/utils.py +7 -75
- mlrun/secrets.py +9 -5
- mlrun/serving/remote.py +2 -7
- mlrun/serving/routers.py +13 -10
- mlrun/serving/server.py +22 -26
- mlrun/serving/states.py +99 -25
- mlrun/serving/utils.py +3 -3
- mlrun/serving/v1_serving.py +6 -7
- mlrun/serving/v2_serving.py +59 -20
- mlrun/track/tracker.py +2 -1
- mlrun/track/tracker_manager.py +3 -3
- mlrun/track/trackers/mlflow_tracker.py +1 -2
- mlrun/utils/async_http.py +5 -7
- mlrun/utils/azure_vault.py +1 -1
- mlrun/utils/clones.py +1 -2
- mlrun/utils/condition_evaluator.py +3 -3
- mlrun/utils/db.py +3 -3
- mlrun/utils/helpers.py +183 -197
- mlrun/utils/http.py +2 -5
- mlrun/utils/logger.py +76 -14
- mlrun/utils/notifications/notification/__init__.py +17 -12
- mlrun/utils/notifications/notification/base.py +14 -2
- mlrun/utils/notifications/notification/console.py +2 -0
- mlrun/utils/notifications/notification/git.py +3 -1
- mlrun/utils/notifications/notification/ipython.py +3 -1
- mlrun/utils/notifications/notification/slack.py +101 -21
- mlrun/utils/notifications/notification/webhook.py +11 -1
- mlrun/utils/notifications/notification_pusher.py +155 -30
- mlrun/utils/retryer.py +208 -0
- mlrun/utils/singleton.py +1 -1
- mlrun/utils/v3io_clients.py +2 -4
- mlrun/utils/version/version.json +2 -2
- mlrun/utils/version/version.py +2 -6
- {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/METADATA +31 -19
- mlrun-1.7.0rc20.dist-info/RECORD +353 -0
- mlrun/kfpops.py +0 -868
- mlrun/model_monitoring/batch.py +0 -1095
- mlrun/model_monitoring/stores/models/__init__.py +0 -27
- mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -384
- mlrun/platforms/other.py +0 -306
- mlrun-1.6.4rc2.dist-info/RECORD +0 -314
- {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/LICENSE +0 -0
- {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/WHEEL +0 -0
- {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/entry_points.txt +0 -0
- {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/top_level.txt +0 -0
mlrun/datastore/targets.py
CHANGED
@@ -17,9 +17,10 @@ import os
 import random
 import sys
 import time
+import warnings
 from collections import Counter
 from copy import copy
-from typing import Any,
+from typing import Any, Optional, Union
 from urllib.parse import urlparse

 import pandas as pd
@@ -28,8 +29,10 @@ from mergedeep import merge
 import mlrun
 import mlrun.utils.helpers
 from mlrun.config import config
+from mlrun.datastore.snowflake_utils import get_snowflake_spark_options
+from mlrun.datastore.utils import transform_list_filters_to_tuple
 from mlrun.model import DataSource, DataTarget, DataTargetBase, TargetPathObject
-from mlrun.utils import now_date
+from mlrun.utils import logger, now_date
 from mlrun.utils.helpers import to_parquet
 from mlrun.utils.v3io_clients import get_frames_client

@@ -43,7 +46,6 @@ from .utils import (
     filter_df_start_end_time,
     parse_kafka_url,
     select_columns_from_df,
-    store_path_to_spark,
 )


@@ -58,6 +60,7 @@ class TargetTypes:
     dataframe = "dataframe"
     custom = "custom"
     sql = "sql"
+    snowflake = "snowflake"

     @staticmethod
     def all():
@@ -72,6 +75,7 @@ class TargetTypes:
             TargetTypes.dataframe,
             TargetTypes.custom,
             TargetTypes.sql,
+            TargetTypes.snowflake,
         ]


@@ -79,11 +83,14 @@ def generate_target_run_id():
     return f"{round(time.time() * 1000)}_{random.randint(0, 999)}"


-def write_spark_dataframe_with_options(spark_options, df, mode):
+def write_spark_dataframe_with_options(spark_options, df, mode, write_format=None):
     non_hadoop_spark_options = spark_session_update_hadoop_options(
         df.sql_ctx.sparkSession, spark_options
     )
-
+    if write_format:
+        df.write.format(write_format).mode(mode).save(**non_hadoop_spark_options)
+    else:
+        df.write.mode(mode).save(**non_hadoop_spark_options)


 def default_target_names():
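Note: `write_spark_dataframe_with_options` now takes an optional `write_format`, which the Spark-backed targets pop out of their Spark options (the `"format"` key) before writing, as shown in the `write_dataframe` hunks further down. A minimal sketch of the two write paths it selects between, assuming a local SparkSession and a throwaway output path (both placeholders, not from the diff):

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").appName("write-format-sketch").getOrCreate()
df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "value"])

# Options roughly as a target would assemble them; the path is a placeholder.
options = {"path": "/tmp/write_format_sketch"}
write_format = "parquet"  # what the targets now pop from their spark options

if write_format:
    # explicit source format, as the new helper does when write_format is given
    df.write.format(write_format).mode("overwrite").save(**options)
else:
    # fall back to Spark's default source resolution
    df.write.mode("overwrite").save(**options)
```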
@@ -215,9 +222,8 @@ def validate_target_list(targets):
     ]
     if target_types_requiring_name:
         raise mlrun.errors.MLRunInvalidArgumentError(
-            "Only one default name per target type is allowed (please
-
-            )
+            "Only one default name per target type is allowed (please "
+            f"specify name for {target_types_requiring_name} target)"
         )

     target_names_count = Counter(
@@ -232,9 +238,8 @@ def validate_target_list(targets):

     if targets_with_same_name:
         raise mlrun.errors.MLRunInvalidArgumentError(
-            "Each target must have a unique name (more than one target with
-
-            )
+            "Each target must have a unique name (more than one target with "
+            f"those names found {targets_with_same_name})"
         )

     no_path_target_types_count = Counter(
@@ -252,9 +257,8 @@ def validate_target_list(targets):
     ]
     if target_types_requiring_path:
         raise mlrun.errors.MLRunInvalidArgumentError(
-            "Only one default path per target type is allowed (please specify
-
-            )
+            "Only one default path per target type is allowed (please specify "
+            f"path for {target_types_requiring_path} target)"
         )

     target_paths_count = Counter(
@@ -269,9 +273,8 @@ def validate_target_list(targets):

     if targets_with_same_path:
         raise mlrun.errors.MLRunInvalidArgumentError(
-            "Each target must have a unique path (more than one target
-
-            )
+            "Each target must have a unique path (more than one target "
+            f"with those names found {targets_with_same_path})"
         )


@@ -390,17 +393,17 @@ class BaseStoreTarget(DataTargetBase):
         self,
         name: str = "",
         path=None,
-        attributes:
+        attributes: dict[str, str] = None,
         after_step=None,
         columns=None,
         partitioned: bool = False,
         key_bucketing_number: Optional[int] = None,
-        partition_cols: Optional[
+        partition_cols: Optional[list[str]] = None,
         time_partitioning_granularity: Optional[str] = None,
         max_events: Optional[int] = None,
         flush_after_seconds: Optional[int] = None,
-        storage_options:
-        schema:
+        storage_options: dict[str, str] = None,
+        schema: dict[str, Any] = None,
         credentials_prefix=None,
     ):
         super().__init__(
@@ -452,14 +455,11 @@ class BaseStoreTarget(DataTargetBase):
             if self.credentials_prefix
             else None
         )
-        store, resolved_store_path = mlrun.store_manager.get_or_create_store(
+        store, resolved_store_path, url = mlrun.store_manager.get_or_create_store(
             self.get_target_path(),
             credentials_prefix_secrets,
         )
-
-        return store, store.url + resolved_store_path
-        else:
-            return store, self.get_target_path()
+        return store, resolved_store_path, url

     def _get_column_list(self, features, timestamp_key, key_columns, with_type=False):
         result = []
@@ -505,10 +505,13 @@ class BaseStoreTarget(DataTargetBase):
             options = self.get_spark_options(key_column, timestamp_key)
             options.update(kwargs)
             df = self.prepare_spark_df(df, key_column, timestamp_key, options)
-
+            write_format = options.pop("format", None)
+            write_spark_dataframe_with_options(
+                options, df, "overwrite", write_format=write_format
+            )
         elif hasattr(df, "dask"):
             dask_options = self.get_dask_options()
-            store, target_path = self._get_store_and_path()
+            store, path_in_store, target_path = self._get_store_and_path()
             storage_options = store.get_storage_options()
             df = df.repartition(partition_size="100MB")
             try:
@@ -529,10 +532,15 @@ class BaseStoreTarget(DataTargetBase):
             except Exception as exc:
                 raise RuntimeError("Failed to write Dask Dataframe") from exc
         else:
-            store, target_path = self._get_store_and_path()
+            store, path_in_store, target_path = self._get_store_and_path()
             target_path = generate_path_with_chunk(self, chunk_id, target_path)
             file_system = store.filesystem
-            if
+            if (
+                file_system.protocol == "file"
+                # fsspec 2023.10.0 changed protocol from "file" to ("file", "local")
+                or isinstance(file_system.protocol, (tuple, list))
+                and "file" in file_system.protocol
+            ):
                 dir = os.path.dirname(target_path)
                 if dir:
                     os.makedirs(dir, exist_ok=True)
@@ -649,6 +657,29 @@ class BaseStoreTarget(DataTargetBase):
     def _target_path_object(self):
         """return the actual/computed target path"""
         is_single_file = hasattr(self, "is_single_file") and self.is_single_file()
+
+        if self._resource and self.path:
+            parsed_url = urlparse(self.path)
+            # When the URL consists only from scheme and endpoint and no path,
+            # make a default path for DS and redis targets.
+            # Also ignore KafkaTarget when it uses the ds scheme (no default path for KafkaTarget)
+            if (
+                not isinstance(self, KafkaTarget)
+                and parsed_url.scheme in ["ds", "redis", "rediss"]
+                and (not parsed_url.path or parsed_url.path == "/")
+            ):
+                return TargetPathObject(
+                    _get_target_path(
+                        self,
+                        self._resource,
+                        self.run_id is not None,
+                        netloc=parsed_url.netloc,
+                        scheme=parsed_url.scheme,
+                    ),
+                    self.run_id,
+                    is_single_file,
+                )
+
         return self.get_path() or (
             TargetPathObject(
                 _get_target_path(self, self._resource, self.run_id is not None),
@@ -696,7 +727,7 @@ class BaseStoreTarget(DataTargetBase):
         raise NotImplementedError()

     def purge(self):
-        store, target_path = self._get_store_and_path()
+        store, path_in_store, target_path = self._get_store_and_path()
         store.rm(target_path, recursive=True)

     def as_df(
@@ -707,9 +738,13 @@ class BaseStoreTarget(DataTargetBase):
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
         **kwargs,
     ):
         """return the target data as dataframe"""
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
         return mlrun.get_dataitem(self.get_target_path()).as_df(
             columns=columns,
             df_module=df_module,
@@ -723,7 +758,7 @@ class BaseStoreTarget(DataTargetBase):
         # options used in spark.read.load(**options)
         raise NotImplementedError()

-    def prepare_spark_df(self, df, key_columns, timestamp_key=None, spark_options=
+    def prepare_spark_df(self, df, key_columns, timestamp_key=None, spark_options=None):
         return df

     def get_dask_options(self):
@@ -731,7 +766,7 @@ class BaseStoreTarget(DataTargetBase):


 class ParquetTarget(BaseStoreTarget):
-    """
+    """Parquet target storage driver, used to materialize feature set/vector data into parquet files.

     :param name: optional, target name. By default will be called ParquetTarget
     :param path: optional, Output path. Can be either a file or directory.
@@ -766,16 +801,16 @@ class ParquetTarget(BaseStoreTarget):
         self,
         name: str = "",
         path=None,
-        attributes:
+        attributes: dict[str, str] = None,
         after_step=None,
         columns=None,
         partitioned: bool = None,
         key_bucketing_number: Optional[int] = None,
-        partition_cols: Optional[
+        partition_cols: Optional[list[str]] = None,
         time_partitioning_granularity: Optional[str] = None,
         max_events: Optional[int] = 10000,
         flush_after_seconds: Optional[int] = 900,
-        storage_options:
+        storage_options: dict[str, str] = None,
     ):
         self.path = path
         if partitioned is None:
@@ -876,7 +911,7 @@ class ParquetTarget(BaseStoreTarget):
         for key_column in key_columns:
             tuple_key_columns.append((key_column.name, key_column.value_type))

-        store, target_path = self._get_store_and_path()
+        store, path_in_store, target_path = self._get_store_and_path()

         storage_options = store.get_storage_options()
         if storage_options and self.storage_options:
@@ -929,27 +964,19 @@ class ParquetTarget(BaseStoreTarget):
                 if unit == time_partitioning_granularity:
                     break

-
-
-
-
-
-            path = store.url + path
-            result = {
-                "path": store_path_to_spark(path, storage_spark_options),
-                "format": "parquet",
-            }
-            result = {**result, **storage_spark_options}
-        else:
-            result = {
-                "path": store_path_to_spark(self.get_target_path()),
+        store, path, url = self._get_store_and_path()
+        spark_options = store.get_spark_options()
+        spark_options.update(
+            {
+                "path": store.spark_url + path,
                 "format": "parquet",
             }
+        )
         for partition_col in self.partition_cols or []:
             partition_cols.append(partition_col)
         if partition_cols:
-
-        return
+            spark_options["partitionBy"] = partition_cols
+        return spark_options

     def get_dask_options(self):
         return {"format": "parquet"}
@@ -962,6 +989,7 @@ class ParquetTarget(BaseStoreTarget):
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
         **kwargs,
     ):
         """return the target data as dataframe"""
@@ -972,6 +1000,7 @@ class ParquetTarget(BaseStoreTarget):
             start_time=start_time,
             end_time=end_time,
             time_column=time_column,
+            additional_filters=transform_list_filters_to_tuple(additional_filters),
             **kwargs,
         )
         if not columns:
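Note: the `as_df` signatures in the hunks above gain an `additional_filters` argument. `ParquetTarget` passes it down (normalized by `transform_list_filters_to_tuple`), while several other targets only run it through `additional_filters_warning`. A hedged sketch of reading a parquet target back with filters; the target path and the pyarrow-style filter tuple are assumptions, not taken from the diff:

```python
from mlrun.datastore.targets import ParquetTarget

# Hypothetical target pointing at an existing parquet dataset; the path is a placeholder.
target = ParquetTarget(name="sales", path="v3io:///projects/demo/artifacts/sales.parquet")

# additional_filters is given as (column, op, value) tuples; the exact set of accepted
# operators depends on the underlying parquet reader.
df = target.as_df(
    columns=["customer_id", "amount"],
    additional_filters=[("amount", ">", 100)],
)
print(df.head())
```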
@@ -1056,7 +1085,7 @@ class CSVTarget(BaseStoreTarget):
         column_list = self._get_column_list(
             features=features, timestamp_key=timestamp_key, key_columns=key_columns
         )
-        store, target_path = self._get_store_and_path()
+        store, path_in_store, target_path = self._get_store_and_path()
         graph.add_step(
             name=self.name or "CSVTarget",
             after=after,
@@ -1071,24 +1100,16 @@ class CSVTarget(BaseStoreTarget):
         )

     def get_spark_options(self, key_column=None, timestamp_key=None, overwrite=True):
-
-
-
-
-
-            path = store.url + path
-            result = {
-                "path": store_path_to_spark(path, storage_spark_options),
-                "format": "csv",
-                "header": "true",
-            }
-            return {**result, **storage_spark_options}
-        else:
-            return {
-                "path": store_path_to_spark(self.get_target_path()),
+        store, path, url = self._get_store_and_path()
+        spark_options = store.get_spark_options()
+        spark_options.update(
+            {
+                "path": store.spark_url + path,
                 "format": "csv",
                 "header": "true",
             }
+        )
+        return spark_options

     def prepare_spark_df(self, df, key_columns, timestamp_key=None, spark_options=None):
         import pyspark.sql.functions as funcs
@@ -1110,8 +1131,12 @@ class CSVTarget(BaseStoreTarget):
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
         **kwargs,
     ):
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
         df = super().as_df(
             columns=columns,
             df_module=df_module,
@@ -1132,6 +1157,98 @@ class CSVTarget(BaseStoreTarget):
         return True


+class SnowflakeTarget(BaseStoreTarget):
+    """
+    :param attributes: A dictionary of attributes for Snowflake connection; will be overridden by database parameters
+        if they exist.
+    :param url: Snowflake hostname, in the format: <account_name>.<region>.snowflakecomputing.com
+    :param user: Snowflake user for login
+    :param db_schema: Database schema
+    :param database: Database name
+    :param warehouse: Snowflake warehouse name
+    :param table_name: Snowflake table name
+    """
+
+    support_spark = True
+    support_append = True
+    is_offline = True
+    kind = TargetTypes.snowflake
+
+    def __init__(
+        self,
+        name: str = "",
+        path=None,
+        attributes: dict[str, str] = None,
+        after_step=None,
+        columns=None,
+        partitioned: bool = False,
+        key_bucketing_number: Optional[int] = None,
+        partition_cols: Optional[list[str]] = None,
+        time_partitioning_granularity: Optional[str] = None,
+        max_events: Optional[int] = None,
+        flush_after_seconds: Optional[int] = None,
+        storage_options: dict[str, str] = None,
+        schema: dict[str, Any] = None,
+        credentials_prefix=None,
+        url: str = None,
+        user: str = None,
+        db_schema: str = None,
+        database: str = None,
+        warehouse: str = None,
+        table_name: str = None,
+    ):
+        attrs = {
+            "url": url,
+            "user": user,
+            "database": database,
+            "schema": db_schema,
+            "warehouse": warehouse,
+            "table": table_name,
+        }
+        extended_attrs = {
+            key: value for key, value in attrs.items() if value is not None
+        }
+        attributes = {} if not attributes else attributes
+        attributes.update(extended_attrs)
+        super().__init__(
+            name,
+            path,
+            attributes,
+            after_step,
+            list(schema.keys()) if schema else columns,
+            partitioned,
+            key_bucketing_number,
+            partition_cols,
+            time_partitioning_granularity,
+            max_events=max_events,
+            flush_after_seconds=flush_after_seconds,
+            storage_options=storage_options,
+            schema=schema,
+            credentials_prefix=credentials_prefix,
+        )
+
+    def get_spark_options(self, key_column=None, timestamp_key=None, overwrite=True):
+        spark_options = get_snowflake_spark_options(self.attributes)
+        spark_options["dbtable"] = self.attributes.get("table")
+        return spark_options
+
+    def purge(self):
+        pass
+
+    def as_df(
+        self,
+        columns=None,
+        df_module=None,
+        entities=None,
+        start_time=None,
+        end_time=None,
+        time_column=None,
+        additional_filters=None,
+        **kwargs,
+    ):
+        raise NotImplementedError()
+
+
 class NoSqlBaseTarget(BaseStoreTarget):
     is_table = True
     is_online = True
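Note: a usage sketch for the new `SnowflakeTarget` above. Only the constructor keywords come from the class definition in the hunk; the account hostname, database objects, and the ingestion call mentioned in the trailing comment are assumptions:

```python
from mlrun.datastore.targets import SnowflakeTarget

snowflake_target = SnowflakeTarget(
    name="snowflake",
    url="my_account.eu-west-1.snowflakecomputing.com",  # placeholder account hostname
    user="ingest_user",
    database="ANALYTICS",
    db_schema="PUBLIC",
    warehouse="COMPUTE_WH",
    table_name="SALES_FEATURES",
)

# The target is Spark-only: support_spark=True while as_df() raises NotImplementedError,
# so it would typically be handed to a Spark-based feature-set ingestion, e.g.
# feature_set.ingest(source, targets=[snowflake_target], run_config=spark_run_config).
```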
@@ -1193,7 +1310,17 @@ class NoSqlBaseTarget(BaseStoreTarget):
     def get_dask_options(self):
         return {"format": "csv"}

-    def as_df(
+    def as_df(
+        self,
+        columns=None,
+        df_module=None,
+        entities=None,
+        start_time=None,
+        end_time=None,
+        time_column=None,
+        additional_filters=None,
+        **kwargs,
+    ):
         raise NotImplementedError()

     def write_dataframe(
@@ -1203,7 +1330,10 @@ class NoSqlBaseTarget(BaseStoreTarget):
             options = self.get_spark_options(key_column, timestamp_key)
             options.update(kwargs)
             df = self.prepare_spark_df(df)
-
+            write_format = options.pop("format", None)
+            write_spark_dataframe_with_options(
+                options, df, "overwrite", write_format=write_format
+            )
         else:
             # To prevent modification of the original dataframe and make sure
             # that the last event of a key is the one being persisted
@@ -1213,7 +1343,11 @@ class NoSqlBaseTarget(BaseStoreTarget):
             df = df.copy(deep=False)
             access_key = self._get_credential("V3IO_ACCESS_KEY")

-
+            store, path_in_store, target_path = self._get_store_and_path()
+            storage_options = store.get_storage_options()
+            access_key = storage_options.get("v3io_access_key", access_key)
+
+            _, path_with_container = parse_path(target_path)
             container, path = split_path(path_with_container)

             frames_client = get_frames_client(
@@ -1231,17 +1365,31 @@ class NoSqlTarget(NoSqlBaseTarget):
     def get_table_object(self):
         from storey import Table, V3ioDriver

-
-        endpoint, uri = parse_path(
+        store, path_in_store, target_path = self._get_store_and_path()
+        endpoint, uri = parse_path(target_path)
+        storage_options = store.get_storage_options()
+        access_key = storage_options.get("v3io_access_key")
+
         return Table(
             uri,
-            V3ioDriver(webapi=endpoint or mlrun.mlconf.v3io_api),
+            V3ioDriver(webapi=endpoint or mlrun.mlconf.v3io_api, access_key=access_key),
             flush_interval_secs=mlrun.mlconf.feature_store.flush_interval,
         )

     def get_spark_options(self, key_column=None, timestamp_key=None, overwrite=True):
+        store, path_in_store, target_path = self._get_store_and_path()
+        storage_options = store.get_storage_options()
+        store_access_key = storage_options.get("v3io_access_key")
+        env_access_key = self._secrets.get(
+            "V3IO_ACCESS_KEY", os.getenv("V3IO_ACCESS_KEY")
+        )
+        if store_access_key and env_access_key and store_access_key != env_access_key:
+            logger.warning(
+                "The Spark v3io connector does not support access_key parameterization."
+                "Spark will disregard the store-provided key."
+            )
         spark_options = {
-            "path":
+            "path": store.spark_url + path_in_store,
             "format": "io.iguaz.v3io.spark.sql.kv",
         }
         if isinstance(key_column, list) and len(key_column) >= 1:
@@ -1334,10 +1482,10 @@ class RedisNoSqlTarget(NoSqlBaseTarget):
     def get_spark_options(self, key_column=None, timestamp_key=None, overwrite=True):
         endpoint, uri = self._get_server_endpoint()
         parsed_endpoint = urlparse(endpoint)
-
+        store, path_in_store, path = self._get_store_and_path()
         return {
             "key.column": "_spark_object_name",
-            "table": "{" +
+            "table": "{" + path_in_store,
             "format": "org.apache.spark.sql.redis",
             "host": parsed_endpoint.hostname,
             "port": parsed_endpoint.port,
@@ -1385,10 +1533,12 @@ class StreamTarget(BaseStoreTarget):
         from storey import V3ioDriver

         key_columns = list(key_columns.keys())
-        path = self.
+        store, path_in_store, path = self._get_store_and_path()
         if not path:
             raise mlrun.errors.MLRunInvalidArgumentError("StreamTarget requires a path")
         endpoint, uri = parse_path(path)
+        storage_options = store.get_storage_options()
+        access_key = storage_options.get("v3io_access_key")
         column_list = self._get_column_list(
             features=features, timestamp_key=timestamp_key, key_columns=key_columns
         )
@@ -1399,16 +1549,47 @@ class StreamTarget(BaseStoreTarget):
             graph_shape="cylinder",
             class_name="storey.StreamTarget",
             columns=column_list,
-            storage=V3ioDriver(
+            storage=V3ioDriver(
+                webapi=endpoint or mlrun.mlconf.v3io_api, access_key=access_key
+            ),
             stream_path=uri,
             **self.attributes,
         )

-    def as_df(
+    def as_df(
+        self,
+        columns=None,
+        df_module=None,
+        entities=None,
+        start_time=None,
+        end_time=None,
+        time_column=None,
+        additional_filters=None,
+        **kwargs,
+    ):
         raise NotImplementedError()


 class KafkaTarget(BaseStoreTarget):
+    """
+    Kafka target storage driver, used to write data into kafka topics.
+    example::
+        # define target
+        kafka_target = KafkaTarget(
+            name="kafka", path="my_topic", brokers="localhost:9092"
+        )
+        # ingest
+        stocks_set.ingest(stocks, [kafka_target])
+    :param name: target name
+    :param path: topic name e.g. "my_topic"
+    :param after_step: optional, after what step in the graph to add the target
+    :param columns: optional, which columns from data to write
+    :param bootstrap_servers: Deprecated. Use the brokers parameter instead
+    :param producer_options: additional configurations for kafka producer
+    :param brokers: kafka broker as represented by a host:port pair, or a list of kafka brokers, e.g.
+        "localhost:9092", or ["kafka-broker-1:9092", "kafka-broker-2:9092"]
+    """
+
     kind = TargetTypes.kafka
     is_table = False
     is_online = False
@@ -1421,11 +1602,27 @@ class KafkaTarget(BaseStoreTarget):
         *args,
         bootstrap_servers=None,
         producer_options=None,
+        brokers=None,
         **kwargs,
     ):
         attrs = {}
-
-
+
+        # TODO: Remove this in 1.9.0
+        if bootstrap_servers:
+            if brokers:
+                raise mlrun.errors.MLRunInvalidArgumentError(
+                    "KafkaTarget cannot be created with both the 'brokers' parameter and the deprecated "
+                    "'bootstrap_servers' parameter. Please use 'brokers' only."
+                )
+            warnings.warn(
+                "'bootstrap_servers' parameter is deprecated in 1.7.0 and will be removed in 1.9.0, "
+                "use 'brokers' instead.",
+                FutureWarning,
+            )
+            brokers = bootstrap_servers
+
+        if brokers:
+            attrs["brokers"] = brokers
         if producer_options is not None:
             attrs["producer_options"] = producer_options

@@ -1447,14 +1644,16 @@ class KafkaTarget(BaseStoreTarget):
         if self.path and self.path.startswith("ds://"):
             datastore_profile = datastore_profile_read(self.path)
             attributes = datastore_profile.attributes()
-
+            brokers = attributes.pop(
+                "brokers", attributes.pop("bootstrap_servers", None)
+            )
             topic = datastore_profile.topic
         else:
             attributes = copy(self.attributes)
-
-
-                self.get_target_path(), bootstrap_servers
+            brokers = attributes.pop(
+                "brokers", attributes.pop("bootstrap_servers", None)
             )
+            topic, brokers = parse_kafka_url(self.get_target_path(), brokers)

         if not topic:
             raise mlrun.errors.MLRunInvalidArgumentError(
@@ -1468,11 +1667,21 @@ class KafkaTarget(BaseStoreTarget):
             class_name="storey.KafkaTarget",
             columns=column_list,
             topic=topic,
-
+            brokers=brokers,
             **attributes,
         )

-    def as_df(
+    def as_df(
+        self,
+        columns=None,
+        df_module=None,
+        entities=None,
+        start_time=None,
+        end_time=None,
+        time_column=None,
+        additional_filters=None,
+        **kwargs,
+    ):
         raise NotImplementedError()

     def purge(self):
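Note: the deprecation handling above means code that still passes `bootstrap_servers` keeps working until 1.9.0 but emits a `FutureWarning`, while passing both parameters raises `MLRunInvalidArgumentError`. A sketch of the migration; topic name and broker addresses are placeholders:

```python
from mlrun.datastore.targets import KafkaTarget

# New style: pass brokers as a "host:port" string or a list of them.
kafka_target = KafkaTarget(
    name="kafka",
    path="my_topic",
    brokers=["kafka-broker-1:9092", "kafka-broker-2:9092"],
)

# Old style still works until 1.9.0, but warns and is mapped onto brokers internally.
legacy_target = KafkaTarget(name="kafka", path="my_topic", bootstrap_servers="localhost:9092")

# KafkaTarget(..., brokers=..., bootstrap_servers=...) raises MLRunInvalidArgumentError.
```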
@@ -1519,7 +1728,17 @@ class TSDBTarget(BaseStoreTarget):
|
|
|
1519
1728
|
**self.attributes,
|
|
1520
1729
|
)
|
|
1521
1730
|
|
|
1522
|
-
def as_df(
|
|
1731
|
+
def as_df(
|
|
1732
|
+
self,
|
|
1733
|
+
columns=None,
|
|
1734
|
+
df_module=None,
|
|
1735
|
+
entities=None,
|
|
1736
|
+
start_time=None,
|
|
1737
|
+
end_time=None,
|
|
1738
|
+
time_column=None,
|
|
1739
|
+
additional_filters=None,
|
|
1740
|
+
**kwargs,
|
|
1741
|
+
):
|
|
1523
1742
|
raise NotImplementedError()
|
|
1524
1743
|
|
|
1525
1744
|
def write_dataframe(
|
|
@@ -1535,7 +1754,11 @@ class TSDBTarget(BaseStoreTarget):
|
|
|
1535
1754
|
key_column = [key_column]
|
|
1536
1755
|
new_index.extend(key_column)
|
|
1537
1756
|
|
|
1538
|
-
|
|
1757
|
+
store, path_in_store, target_path = self._get_store_and_path()
|
|
1758
|
+
storage_options = store.get_storage_options()
|
|
1759
|
+
access_key = storage_options.get("v3io_access_key", access_key)
|
|
1760
|
+
|
|
1761
|
+
_, path_with_container = parse_path(target_path)
|
|
1539
1762
|
container, path = split_path(path_with_container)
|
|
1540
1763
|
|
|
1541
1764
|
frames_client = get_frames_client(
|
|
@@ -1626,11 +1849,16 @@ class DFTarget(BaseStoreTarget):
|
|
|
1626
1849
|
self,
|
|
1627
1850
|
columns=None,
|
|
1628
1851
|
df_module=None,
|
|
1852
|
+
entities=None,
|
|
1629
1853
|
start_time=None,
|
|
1630
1854
|
end_time=None,
|
|
1631
1855
|
time_column=None,
|
|
1856
|
+
additional_filters=None,
|
|
1632
1857
|
**kwargs,
|
|
1633
1858
|
):
|
|
1859
|
+
mlrun.utils.helpers.additional_filters_warning(
|
|
1860
|
+
additional_filters, self.__class__
|
|
1861
|
+
)
|
|
1634
1862
|
return select_columns_from_df(
|
|
1635
1863
|
filter_df_start_end_time(
|
|
1636
1864
|
self._df,
|
|
@@ -1652,24 +1880,24 @@ class SQLTarget(BaseStoreTarget):
|
|
|
1652
1880
|
self,
|
|
1653
1881
|
name: str = "",
|
|
1654
1882
|
path=None,
|
|
1655
|
-
attributes:
|
|
1883
|
+
attributes: dict[str, str] = None,
|
|
1656
1884
|
after_step=None,
|
|
1657
1885
|
partitioned: bool = False,
|
|
1658
1886
|
key_bucketing_number: Optional[int] = None,
|
|
1659
|
-
partition_cols: Optional[
|
|
1887
|
+
partition_cols: Optional[list[str]] = None,
|
|
1660
1888
|
time_partitioning_granularity: Optional[str] = None,
|
|
1661
1889
|
max_events: Optional[int] = None,
|
|
1662
1890
|
flush_after_seconds: Optional[int] = None,
|
|
1663
|
-
storage_options:
|
|
1891
|
+
storage_options: dict[str, str] = None,
|
|
1664
1892
|
db_url: str = None,
|
|
1665
1893
|
table_name: str = None,
|
|
1666
|
-
schema:
|
|
1894
|
+
schema: dict[str, Any] = None,
|
|
1667
1895
|
primary_key_column: str = "",
|
|
1668
1896
|
if_exists: str = "append",
|
|
1669
1897
|
create_table: bool = False,
|
|
1670
1898
|
# create_according_to_data: bool = False,
|
|
1671
1899
|
varchar_len: int = 50,
|
|
1672
|
-
parse_dates:
|
|
1900
|
+
parse_dates: list[str] = None,
|
|
1673
1901
|
):
|
|
1674
1902
|
"""
|
|
1675
1903
|
Write to SqlDB as output target for a flow.
|
|
@@ -1805,6 +2033,7 @@ class SQLTarget(BaseStoreTarget):
|
|
|
1805
2033
|
start_time=None,
|
|
1806
2034
|
end_time=None,
|
|
1807
2035
|
time_column=None,
|
|
2036
|
+
additional_filters=None,
|
|
1808
2037
|
**kwargs,
|
|
1809
2038
|
):
|
|
1810
2039
|
try:
|
|
@@ -1813,9 +2042,13 @@ class SQLTarget(BaseStoreTarget):
|
|
|
1813
2042
|
except (ModuleNotFoundError, ImportError) as exc:
|
|
1814
2043
|
self._raise_sqlalchemy_import_error(exc)
|
|
1815
2044
|
|
|
2045
|
+
mlrun.utils.helpers.additional_filters_warning(
|
|
2046
|
+
additional_filters, self.__class__
|
|
2047
|
+
)
|
|
2048
|
+
|
|
1816
2049
|
db_path, table_name, _, _, _, _ = self._parse_url()
|
|
1817
2050
|
engine = sqlalchemy.create_engine(db_path)
|
|
1818
|
-
parse_dates: Optional[
|
|
2051
|
+
parse_dates: Optional[list[str]] = self.attributes.get("parse_dates")
|
|
1819
2052
|
with engine.connect() as conn:
|
|
1820
2053
|
query, parse_dates = _generate_sql_query_with_time_filter(
|
|
1821
2054
|
table_name=table_name,
|
|
@@ -1902,7 +2135,7 @@ class SQLTarget(BaseStoreTarget):
|
|
|
1902
2135
|
raise ValueError(f"Table named {table_name} is not exist")
|
|
1903
2136
|
|
|
1904
2137
|
elif not table_exists and create_table:
|
|
1905
|
-
|
|
2138
|
+
type_to_sql_type = {
|
|
1906
2139
|
int: sqlalchemy.Integer,
|
|
1907
2140
|
str: sqlalchemy.String(self.attributes.get("varchar_len")),
|
|
1908
2141
|
datetime.datetime: sqlalchemy.dialects.mysql.DATETIME(fsp=6),
|
|
@@ -1915,12 +2148,16 @@ class SQLTarget(BaseStoreTarget):
|
|
|
1915
2148
|
# creat new table with the given name
|
|
1916
2149
|
columns = []
|
|
1917
2150
|
for col, col_type in self.schema.items():
|
|
1918
|
-
|
|
1919
|
-
if
|
|
1920
|
-
raise TypeError(
|
|
2151
|
+
col_type_sql = type_to_sql_type.get(col_type)
|
|
2152
|
+
if col_type_sql is None:
|
|
2153
|
+
raise TypeError(
|
|
2154
|
+
f"'{col_type}' unsupported type for column '{col}'"
|
|
2155
|
+
)
|
|
1921
2156
|
columns.append(
|
|
1922
2157
|
sqlalchemy.Column(
|
|
1923
|
-
col,
|
|
2158
|
+
col,
|
|
2159
|
+
col_type_sql,
|
|
2160
|
+
primary_key=(col in primary_key_for_check),
|
|
1924
2161
|
)
|
|
1925
2162
|
)
|
|
1926
2163
|
|
|
@@ -1951,10 +2188,11 @@ kind_to_driver = {
|
|
|
1951
2188
|
TargetTypes.tsdb: TSDBTarget,
|
|
1952
2189
|
TargetTypes.custom: CustomTarget,
|
|
1953
2190
|
TargetTypes.sql: SQLTarget,
|
|
2191
|
+
TargetTypes.snowflake: SnowflakeTarget,
|
|
1954
2192
|
}
|
|
1955
2193
|
|
|
1956
2194
|
|
|
1957
|
-
def _get_target_path(driver, resource, run_id_mode=False):
|
|
2195
|
+
def _get_target_path(driver, resource, run_id_mode=False, netloc=None, scheme=""):
|
|
1958
2196
|
"""return the default target path given the resource and target kind"""
|
|
1959
2197
|
kind = driver.kind
|
|
1960
2198
|
suffix = driver.suffix
|
|
@@ -1971,11 +2209,27 @@ def _get_target_path(driver, resource, run_id_mode=False):
|
|
|
1971
2209
|
)
|
|
1972
2210
|
name = resource.metadata.name
|
|
1973
2211
|
project = resource.metadata.project or mlrun.mlconf.default_project
|
|
1974
|
-
|
|
2212
|
+
|
|
2213
|
+
default_kind_name = kind
|
|
2214
|
+
if scheme == "ds":
|
|
2215
|
+
# "dsnosql" is not an actual target like Parquet or Redis; rather, it serves
|
|
2216
|
+
# as a placeholder that can be used in any specified target
|
|
2217
|
+
default_kind_name = "dsnosql"
|
|
2218
|
+
if scheme == "redis" or scheme == "rediss":
|
|
2219
|
+
default_kind_name = TargetTypes.redisnosql
|
|
2220
|
+
|
|
2221
|
+
netloc = netloc or ""
|
|
2222
|
+
data_prefix = get_default_prefix_for_target(default_kind_name).format(
|
|
2223
|
+
ds_profile_name=netloc, # In case of ds profile, set its the name
|
|
2224
|
+
authority=netloc, # In case of redis, replace {authority} with netloc
|
|
1975
2225
|
project=project,
|
|
1976
2226
|
kind=kind,
|
|
1977
2227
|
name=name,
|
|
1978
2228
|
)
|
|
2229
|
+
|
|
2230
|
+
if scheme == "rediss":
|
|
2231
|
+
data_prefix = data_prefix.replace("redis://", "rediss://", 1)
|
|
2232
|
+
|
|
1979
2233
|
# todo: handle ver tag changes, may need to copy files?
|
|
1980
2234
|
if not run_id_mode:
|
|
1981
2235
|
version = resource.metadata.tag
|