mlrun 1.6.4rc7__py3-none-any.whl → 1.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlrun/__init__.py +11 -1
- mlrun/__main__.py +40 -122
- mlrun/alerts/__init__.py +15 -0
- mlrun/alerts/alert.py +248 -0
- mlrun/api/schemas/__init__.py +5 -4
- mlrun/artifacts/__init__.py +8 -3
- mlrun/artifacts/base.py +47 -257
- mlrun/artifacts/dataset.py +11 -192
- mlrun/artifacts/manager.py +79 -47
- mlrun/artifacts/model.py +31 -159
- mlrun/artifacts/plots.py +23 -380
- mlrun/common/constants.py +74 -1
- mlrun/common/db/sql_session.py +5 -5
- mlrun/common/formatters/__init__.py +21 -0
- mlrun/common/formatters/artifact.py +45 -0
- mlrun/common/formatters/base.py +113 -0
- mlrun/common/formatters/feature_set.py +33 -0
- mlrun/common/formatters/function.py +46 -0
- mlrun/common/formatters/pipeline.py +53 -0
- mlrun/common/formatters/project.py +51 -0
- mlrun/common/formatters/run.py +29 -0
- mlrun/common/helpers.py +12 -3
- mlrun/common/model_monitoring/helpers.py +9 -5
- mlrun/{runtimes → common/runtimes}/constants.py +37 -9
- mlrun/common/schemas/__init__.py +31 -5
- mlrun/common/schemas/alert.py +202 -0
- mlrun/common/schemas/api_gateway.py +196 -0
- mlrun/common/schemas/artifact.py +25 -4
- mlrun/common/schemas/auth.py +16 -5
- mlrun/common/schemas/background_task.py +1 -1
- mlrun/common/schemas/client_spec.py +4 -2
- mlrun/common/schemas/common.py +7 -4
- mlrun/common/schemas/constants.py +3 -0
- mlrun/common/schemas/feature_store.py +74 -44
- mlrun/common/schemas/frontend_spec.py +15 -7
- mlrun/common/schemas/function.py +12 -1
- mlrun/common/schemas/hub.py +11 -18
- mlrun/common/schemas/memory_reports.py +2 -2
- mlrun/common/schemas/model_monitoring/__init__.py +20 -4
- mlrun/common/schemas/model_monitoring/constants.py +123 -42
- mlrun/common/schemas/model_monitoring/grafana.py +13 -9
- mlrun/common/schemas/model_monitoring/model_endpoints.py +101 -54
- mlrun/common/schemas/notification.py +71 -14
- mlrun/common/schemas/object.py +2 -2
- mlrun/{model_monitoring/controller_handler.py → common/schemas/pagination.py} +9 -12
- mlrun/common/schemas/pipeline.py +8 -1
- mlrun/common/schemas/project.py +69 -18
- mlrun/common/schemas/runs.py +7 -1
- mlrun/common/schemas/runtime_resource.py +8 -12
- mlrun/common/schemas/schedule.py +4 -4
- mlrun/common/schemas/tag.py +1 -2
- mlrun/common/schemas/workflow.py +12 -4
- mlrun/common/types.py +14 -1
- mlrun/config.py +154 -69
- mlrun/data_types/data_types.py +6 -1
- mlrun/data_types/spark.py +2 -2
- mlrun/data_types/to_pandas.py +67 -37
- mlrun/datastore/__init__.py +6 -8
- mlrun/datastore/alibaba_oss.py +131 -0
- mlrun/datastore/azure_blob.py +143 -42
- mlrun/datastore/base.py +102 -58
- mlrun/datastore/datastore.py +34 -13
- mlrun/datastore/datastore_profile.py +146 -20
- mlrun/datastore/dbfs_store.py +3 -7
- mlrun/datastore/filestore.py +1 -4
- mlrun/datastore/google_cloud_storage.py +97 -33
- mlrun/datastore/hdfs.py +56 -0
- mlrun/datastore/inmem.py +6 -3
- mlrun/datastore/redis.py +7 -2
- mlrun/datastore/s3.py +34 -12
- mlrun/datastore/snowflake_utils.py +45 -0
- mlrun/datastore/sources.py +303 -111
- mlrun/datastore/spark_utils.py +31 -2
- mlrun/datastore/store_resources.py +9 -7
- mlrun/datastore/storeytargets.py +151 -0
- mlrun/datastore/targets.py +453 -176
- mlrun/datastore/utils.py +72 -58
- mlrun/datastore/v3io.py +6 -1
- mlrun/db/base.py +274 -41
- mlrun/db/factory.py +1 -1
- mlrun/db/httpdb.py +893 -225
- mlrun/db/nopdb.py +291 -33
- mlrun/errors.py +36 -6
- mlrun/execution.py +115 -42
- mlrun/feature_store/__init__.py +0 -2
- mlrun/feature_store/api.py +65 -73
- mlrun/feature_store/common.py +7 -12
- mlrun/feature_store/feature_set.py +76 -55
- mlrun/feature_store/feature_vector.py +39 -31
- mlrun/feature_store/ingestion.py +7 -6
- mlrun/feature_store/retrieval/base.py +16 -11
- mlrun/feature_store/retrieval/dask_merger.py +2 -0
- mlrun/feature_store/retrieval/job.py +13 -4
- mlrun/feature_store/retrieval/local_merger.py +2 -0
- mlrun/feature_store/retrieval/spark_merger.py +24 -32
- mlrun/feature_store/steps.py +45 -34
- mlrun/features.py +11 -21
- mlrun/frameworks/_common/artifacts_library.py +9 -9
- mlrun/frameworks/_common/mlrun_interface.py +5 -5
- mlrun/frameworks/_common/model_handler.py +48 -48
- mlrun/frameworks/_common/plan.py +5 -6
- mlrun/frameworks/_common/producer.py +3 -4
- mlrun/frameworks/_common/utils.py +5 -5
- mlrun/frameworks/_dl_common/loggers/logger.py +6 -7
- mlrun/frameworks/_dl_common/loggers/mlrun_logger.py +9 -9
- mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +23 -47
- mlrun/frameworks/_ml_common/artifacts_library.py +1 -2
- mlrun/frameworks/_ml_common/loggers/logger.py +3 -4
- mlrun/frameworks/_ml_common/loggers/mlrun_logger.py +4 -5
- mlrun/frameworks/_ml_common/model_handler.py +24 -24
- mlrun/frameworks/_ml_common/pkl_model_server.py +2 -2
- mlrun/frameworks/_ml_common/plan.py +2 -2
- mlrun/frameworks/_ml_common/plans/calibration_curve_plan.py +2 -3
- mlrun/frameworks/_ml_common/plans/confusion_matrix_plan.py +2 -3
- mlrun/frameworks/_ml_common/plans/dataset_plan.py +3 -3
- mlrun/frameworks/_ml_common/plans/feature_importance_plan.py +3 -3
- mlrun/frameworks/_ml_common/plans/roc_curve_plan.py +4 -4
- mlrun/frameworks/_ml_common/utils.py +4 -4
- mlrun/frameworks/auto_mlrun/auto_mlrun.py +9 -9
- mlrun/frameworks/huggingface/model_server.py +4 -4
- mlrun/frameworks/lgbm/__init__.py +33 -33
- mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
- mlrun/frameworks/lgbm/callbacks/logging_callback.py +4 -5
- mlrun/frameworks/lgbm/callbacks/mlrun_logging_callback.py +4 -5
- mlrun/frameworks/lgbm/mlrun_interfaces/booster_mlrun_interface.py +1 -3
- mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +6 -6
- mlrun/frameworks/lgbm/model_handler.py +10 -10
- mlrun/frameworks/lgbm/model_server.py +6 -6
- mlrun/frameworks/lgbm/utils.py +5 -5
- mlrun/frameworks/onnx/dataset.py +8 -8
- mlrun/frameworks/onnx/mlrun_interface.py +3 -3
- mlrun/frameworks/onnx/model_handler.py +6 -6
- mlrun/frameworks/onnx/model_server.py +7 -7
- mlrun/frameworks/parallel_coordinates.py +6 -6
- mlrun/frameworks/pytorch/__init__.py +18 -18
- mlrun/frameworks/pytorch/callbacks/callback.py +4 -5
- mlrun/frameworks/pytorch/callbacks/logging_callback.py +17 -17
- mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +11 -11
- mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +23 -29
- mlrun/frameworks/pytorch/callbacks_handler.py +38 -38
- mlrun/frameworks/pytorch/mlrun_interface.py +20 -20
- mlrun/frameworks/pytorch/model_handler.py +17 -17
- mlrun/frameworks/pytorch/model_server.py +7 -7
- mlrun/frameworks/sklearn/__init__.py +13 -13
- mlrun/frameworks/sklearn/estimator.py +4 -4
- mlrun/frameworks/sklearn/metrics_library.py +14 -14
- mlrun/frameworks/sklearn/mlrun_interface.py +16 -9
- mlrun/frameworks/sklearn/model_handler.py +2 -2
- mlrun/frameworks/tf_keras/__init__.py +10 -7
- mlrun/frameworks/tf_keras/callbacks/logging_callback.py +15 -15
- mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +11 -11
- mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +19 -23
- mlrun/frameworks/tf_keras/mlrun_interface.py +9 -11
- mlrun/frameworks/tf_keras/model_handler.py +14 -14
- mlrun/frameworks/tf_keras/model_server.py +6 -6
- mlrun/frameworks/xgboost/__init__.py +13 -13
- mlrun/frameworks/xgboost/model_handler.py +6 -6
- mlrun/k8s_utils.py +61 -17
- mlrun/launcher/__init__.py +1 -1
- mlrun/launcher/base.py +16 -15
- mlrun/launcher/client.py +13 -11
- mlrun/launcher/factory.py +1 -1
- mlrun/launcher/local.py +23 -13
- mlrun/launcher/remote.py +17 -10
- mlrun/lists.py +7 -6
- mlrun/model.py +478 -103
- mlrun/model_monitoring/__init__.py +1 -1
- mlrun/model_monitoring/api.py +163 -371
- mlrun/{runtimes/mpijob/v1alpha1.py → model_monitoring/applications/__init__.py} +9 -15
- mlrun/model_monitoring/applications/_application_steps.py +188 -0
- mlrun/model_monitoring/applications/base.py +108 -0
- mlrun/model_monitoring/applications/context.py +341 -0
- mlrun/model_monitoring/{evidently_application.py → applications/evidently_base.py} +27 -22
- mlrun/model_monitoring/applications/histogram_data_drift.py +354 -0
- mlrun/model_monitoring/applications/results.py +99 -0
- mlrun/model_monitoring/controller.py +131 -278
- mlrun/model_monitoring/db/__init__.py +18 -0
- mlrun/model_monitoring/db/stores/__init__.py +136 -0
- mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
- mlrun/model_monitoring/db/stores/base/store.py +213 -0
- mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
- mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +71 -0
- mlrun/model_monitoring/db/stores/sqldb/models/base.py +190 -0
- mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +103 -0
- mlrun/model_monitoring/{stores/models/mysql.py → db/stores/sqldb/models/sqlite.py} +19 -13
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +659 -0
- mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +726 -0
- mlrun/model_monitoring/db/tsdb/__init__.py +105 -0
- mlrun/model_monitoring/db/tsdb/base.py +448 -0
- mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
- mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +279 -0
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +42 -0
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +507 -0
- mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +158 -0
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +849 -0
- mlrun/model_monitoring/features_drift_table.py +134 -106
- mlrun/model_monitoring/helpers.py +199 -55
- mlrun/model_monitoring/metrics/__init__.py +13 -0
- mlrun/model_monitoring/metrics/histogram_distance.py +127 -0
- mlrun/model_monitoring/model_endpoint.py +3 -2
- mlrun/model_monitoring/stream_processing.py +131 -398
- mlrun/model_monitoring/tracking_policy.py +9 -2
- mlrun/model_monitoring/writer.py +161 -125
- mlrun/package/__init__.py +6 -6
- mlrun/package/context_handler.py +5 -5
- mlrun/package/packager.py +7 -7
- mlrun/package/packagers/default_packager.py +8 -8
- mlrun/package/packagers/numpy_packagers.py +15 -15
- mlrun/package/packagers/pandas_packagers.py +5 -5
- mlrun/package/packagers/python_standard_library_packagers.py +10 -10
- mlrun/package/packagers_manager.py +19 -23
- mlrun/package/utils/_formatter.py +6 -6
- mlrun/package/utils/_pickler.py +2 -2
- mlrun/package/utils/_supported_format.py +4 -4
- mlrun/package/utils/log_hint_utils.py +2 -2
- mlrun/package/utils/type_hint_utils.py +4 -9
- mlrun/platforms/__init__.py +11 -10
- mlrun/platforms/iguazio.py +24 -203
- mlrun/projects/operations.py +52 -25
- mlrun/projects/pipelines.py +191 -197
- mlrun/projects/project.py +1227 -400
- mlrun/render.py +16 -19
- mlrun/run.py +209 -184
- mlrun/runtimes/__init__.py +83 -15
- mlrun/runtimes/base.py +51 -35
- mlrun/runtimes/daskjob.py +17 -10
- mlrun/runtimes/databricks_job/databricks_cancel_task.py +1 -1
- mlrun/runtimes/databricks_job/databricks_runtime.py +8 -7
- mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
- mlrun/runtimes/funcdoc.py +1 -29
- mlrun/runtimes/function_reference.py +1 -1
- mlrun/runtimes/kubejob.py +34 -128
- mlrun/runtimes/local.py +40 -11
- mlrun/runtimes/mpijob/__init__.py +0 -20
- mlrun/runtimes/mpijob/abstract.py +9 -10
- mlrun/runtimes/mpijob/v1.py +1 -1
- mlrun/{model_monitoring/stores/models/sqlite.py → runtimes/nuclio/__init__.py} +7 -9
- mlrun/runtimes/nuclio/api_gateway.py +769 -0
- mlrun/runtimes/nuclio/application/__init__.py +15 -0
- mlrun/runtimes/nuclio/application/application.py +758 -0
- mlrun/runtimes/nuclio/application/reverse_proxy.go +95 -0
- mlrun/runtimes/{function.py → nuclio/function.py} +200 -83
- mlrun/runtimes/{nuclio.py → nuclio/nuclio.py} +6 -6
- mlrun/runtimes/{serving.py → nuclio/serving.py} +65 -68
- mlrun/runtimes/pod.py +281 -101
- mlrun/runtimes/remotesparkjob.py +12 -9
- mlrun/runtimes/sparkjob/spark3job.py +67 -51
- mlrun/runtimes/utils.py +41 -75
- mlrun/secrets.py +9 -5
- mlrun/serving/__init__.py +8 -1
- mlrun/serving/remote.py +2 -7
- mlrun/serving/routers.py +85 -69
- mlrun/serving/server.py +69 -44
- mlrun/serving/states.py +209 -36
- mlrun/serving/utils.py +22 -14
- mlrun/serving/v1_serving.py +6 -7
- mlrun/serving/v2_serving.py +129 -54
- mlrun/track/tracker.py +2 -1
- mlrun/track/tracker_manager.py +3 -3
- mlrun/track/trackers/mlflow_tracker.py +6 -2
- mlrun/utils/async_http.py +6 -8
- mlrun/utils/azure_vault.py +1 -1
- mlrun/utils/clones.py +1 -2
- mlrun/utils/condition_evaluator.py +3 -3
- mlrun/utils/db.py +21 -3
- mlrun/utils/helpers.py +405 -225
- mlrun/utils/http.py +3 -6
- mlrun/utils/logger.py +112 -16
- mlrun/utils/notifications/notification/__init__.py +17 -13
- mlrun/utils/notifications/notification/base.py +50 -2
- mlrun/utils/notifications/notification/console.py +2 -0
- mlrun/utils/notifications/notification/git.py +24 -1
- mlrun/utils/notifications/notification/ipython.py +3 -1
- mlrun/utils/notifications/notification/slack.py +96 -21
- mlrun/utils/notifications/notification/webhook.py +59 -2
- mlrun/utils/notifications/notification_pusher.py +149 -30
- mlrun/utils/regex.py +9 -0
- mlrun/utils/retryer.py +208 -0
- mlrun/utils/singleton.py +1 -1
- mlrun/utils/v3io_clients.py +4 -6
- mlrun/utils/version/version.json +2 -2
- mlrun/utils/version/version.py +2 -6
- mlrun-1.7.0.dist-info/METADATA +378 -0
- mlrun-1.7.0.dist-info/RECORD +351 -0
- {mlrun-1.6.4rc7.dist-info → mlrun-1.7.0.dist-info}/WHEEL +1 -1
- mlrun/feature_store/retrieval/conversion.py +0 -273
- mlrun/kfpops.py +0 -868
- mlrun/model_monitoring/application.py +0 -310
- mlrun/model_monitoring/batch.py +0 -1095
- mlrun/model_monitoring/prometheus.py +0 -219
- mlrun/model_monitoring/stores/__init__.py +0 -111
- mlrun/model_monitoring/stores/kv_model_endpoint_store.py +0 -576
- mlrun/model_monitoring/stores/model_endpoint_store.py +0 -147
- mlrun/model_monitoring/stores/models/__init__.py +0 -27
- mlrun/model_monitoring/stores/models/base.py +0 -84
- mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -384
- mlrun/platforms/other.py +0 -306
- mlrun-1.6.4rc7.dist-info/METADATA +0 -272
- mlrun-1.6.4rc7.dist-info/RECORD +0 -314
- {mlrun-1.6.4rc7.dist-info → mlrun-1.7.0.dist-info}/LICENSE +0 -0
- {mlrun-1.6.4rc7.dist-info → mlrun-1.7.0.dist-info}/entry_points.txt +0 -0
- {mlrun-1.6.4rc7.dist-info → mlrun-1.7.0.dist-info}/top_level.txt +0 -0
mlrun/datastore/targets.py
CHANGED
@@ -17,9 +17,10 @@ import os
|
|
|
17
17
|
import random
|
|
18
18
|
import sys
|
|
19
19
|
import time
|
|
20
|
+
import warnings
|
|
20
21
|
from collections import Counter
|
|
21
22
|
from copy import copy
|
|
22
|
-
from typing import Any,
|
|
23
|
+
from typing import Any, Optional, Union
|
|
23
24
|
from urllib.parse import urlparse
|
|
24
25
|
|
|
25
26
|
import pandas as pd
|
|
@@ -28,8 +29,13 @@ from mergedeep import merge
|
|
|
28
29
|
import mlrun
|
|
29
30
|
import mlrun.utils.helpers
|
|
30
31
|
from mlrun.config import config
|
|
32
|
+
from mlrun.datastore.snowflake_utils import (
|
|
33
|
+
get_snowflake_password,
|
|
34
|
+
get_snowflake_spark_options,
|
|
35
|
+
)
|
|
36
|
+
from mlrun.datastore.utils import transform_list_filters_to_tuple
|
|
31
37
|
from mlrun.model import DataSource, DataTarget, DataTargetBase, TargetPathObject
|
|
32
|
-
from mlrun.utils import now_date
|
|
38
|
+
from mlrun.utils import logger, now_date
|
|
33
39
|
from mlrun.utils.helpers import to_parquet
|
|
34
40
|
from mlrun.utils.v3io_clients import get_frames_client
|
|
35
41
|
|
|
@@ -41,9 +47,7 @@ from .spark_utils import spark_session_update_hadoop_options
|
|
|
41
47
|
from .utils import (
|
|
42
48
|
_generate_sql_query_with_time_filter,
|
|
43
49
|
filter_df_start_end_time,
|
|
44
|
-
parse_kafka_url,
|
|
45
50
|
select_columns_from_df,
|
|
46
|
-
store_path_to_spark,
|
|
47
51
|
)
|
|
48
52
|
|
|
49
53
|
|
|
@@ -58,6 +62,7 @@ class TargetTypes:
|
|
|
58
62
|
dataframe = "dataframe"
|
|
59
63
|
custom = "custom"
|
|
60
64
|
sql = "sql"
|
|
65
|
+
snowflake = "snowflake"
|
|
61
66
|
|
|
62
67
|
@staticmethod
|
|
63
68
|
def all():
|
|
@@ -72,6 +77,7 @@ class TargetTypes:
|
|
|
72
77
|
TargetTypes.dataframe,
|
|
73
78
|
TargetTypes.custom,
|
|
74
79
|
TargetTypes.sql,
|
|
80
|
+
TargetTypes.snowflake,
|
|
75
81
|
]
|
|
76
82
|
|
|
77
83
|
|
|
@@ -79,11 +85,14 @@ def generate_target_run_id():
|
|
|
79
85
|
return f"{round(time.time() * 1000)}_{random.randint(0, 999)}"
|
|
80
86
|
|
|
81
87
|
|
|
82
|
-
def write_spark_dataframe_with_options(spark_options, df, mode):
|
|
88
|
+
def write_spark_dataframe_with_options(spark_options, df, mode, write_format=None):
|
|
83
89
|
non_hadoop_spark_options = spark_session_update_hadoop_options(
|
|
84
90
|
df.sql_ctx.sparkSession, spark_options
|
|
85
91
|
)
|
|
86
|
-
|
|
92
|
+
if write_format:
|
|
93
|
+
df.write.format(write_format).mode(mode).save(**non_hadoop_spark_options)
|
|
94
|
+
else:
|
|
95
|
+
df.write.mode(mode).save(**non_hadoop_spark_options)
|
|
87
96
|
|
|
88
97
|
|
|
89
98
|
def default_target_names():
|
|
@@ -215,9 +224,8 @@ def validate_target_list(targets):
|
|
|
215
224
|
]
|
|
216
225
|
if target_types_requiring_name:
|
|
217
226
|
raise mlrun.errors.MLRunInvalidArgumentError(
|
|
218
|
-
"Only one default name per target type is allowed (please
|
|
219
|
-
|
|
220
|
-
)
|
|
227
|
+
"Only one default name per target type is allowed (please "
|
|
228
|
+
f"specify name for {target_types_requiring_name} target)"
|
|
221
229
|
)
|
|
222
230
|
|
|
223
231
|
target_names_count = Counter(
|
|
@@ -232,9 +240,8 @@ def validate_target_list(targets):
|
|
|
232
240
|
|
|
233
241
|
if targets_with_same_name:
|
|
234
242
|
raise mlrun.errors.MLRunInvalidArgumentError(
|
|
235
|
-
"Each target must have a unique name (more than one target with
|
|
236
|
-
|
|
237
|
-
)
|
|
243
|
+
"Each target must have a unique name (more than one target with "
|
|
244
|
+
f"those names found {targets_with_same_name})"
|
|
238
245
|
)
|
|
239
246
|
|
|
240
247
|
no_path_target_types_count = Counter(
|
|
@@ -252,9 +259,8 @@ def validate_target_list(targets):
|
|
|
252
259
|
]
|
|
253
260
|
if target_types_requiring_path:
|
|
254
261
|
raise mlrun.errors.MLRunInvalidArgumentError(
|
|
255
|
-
"Only one default path per target type is allowed (please specify
|
|
256
|
-
|
|
257
|
-
)
|
|
262
|
+
"Only one default path per target type is allowed (please specify "
|
|
263
|
+
f"path for {target_types_requiring_path} target)"
|
|
258
264
|
)
|
|
259
265
|
|
|
260
266
|
target_paths_count = Counter(
|
|
@@ -269,9 +275,8 @@ def validate_target_list(targets):
|
|
|
269
275
|
|
|
270
276
|
if targets_with_same_path:
|
|
271
277
|
raise mlrun.errors.MLRunInvalidArgumentError(
|
|
272
|
-
"Each target must have a unique path (more than one target
|
|
273
|
-
|
|
274
|
-
)
|
|
278
|
+
"Each target must have a unique path (more than one target "
|
|
279
|
+
f"with those names found {targets_with_same_path})"
|
|
275
280
|
)
|
|
276
281
|
|
|
277
282
|
|
|
@@ -384,23 +389,24 @@ class BaseStoreTarget(DataTargetBase):
|
|
|
384
389
|
is_offline = False
|
|
385
390
|
support_spark = False
|
|
386
391
|
support_storey = False
|
|
392
|
+
support_pandas = False
|
|
387
393
|
support_append = False
|
|
388
394
|
|
|
389
395
|
def __init__(
|
|
390
396
|
self,
|
|
391
397
|
name: str = "",
|
|
392
398
|
path=None,
|
|
393
|
-
attributes:
|
|
399
|
+
attributes: dict[str, str] = None,
|
|
394
400
|
after_step=None,
|
|
395
401
|
columns=None,
|
|
396
402
|
partitioned: bool = False,
|
|
397
403
|
key_bucketing_number: Optional[int] = None,
|
|
398
|
-
partition_cols: Optional[
|
|
404
|
+
partition_cols: Optional[list[str]] = None,
|
|
399
405
|
time_partitioning_granularity: Optional[str] = None,
|
|
400
406
|
max_events: Optional[int] = None,
|
|
401
407
|
flush_after_seconds: Optional[int] = None,
|
|
402
|
-
storage_options:
|
|
403
|
-
schema:
|
|
408
|
+
storage_options: dict[str, str] = None,
|
|
409
|
+
schema: dict[str, Any] = None,
|
|
404
410
|
credentials_prefix=None,
|
|
405
411
|
):
|
|
406
412
|
super().__init__(
|
|
@@ -433,6 +439,12 @@ class BaseStoreTarget(DataTargetBase):
|
|
|
433
439
|
self.storage_options = storage_options
|
|
434
440
|
self.schema = schema or {}
|
|
435
441
|
self.credentials_prefix = credentials_prefix
|
|
442
|
+
if credentials_prefix:
|
|
443
|
+
warnings.warn(
|
|
444
|
+
"The 'credentials_prefix' parameter is deprecated and will be removed in "
|
|
445
|
+
"1.9.0. Please use datastore profiles instead.",
|
|
446
|
+
FutureWarning,
|
|
447
|
+
)
|
|
436
448
|
|
|
437
449
|
self._target = None
|
|
438
450
|
self._resource = None
|
|
@@ -452,14 +464,11 @@ class BaseStoreTarget(DataTargetBase):
|
|
|
452
464
|
if self.credentials_prefix
|
|
453
465
|
else None
|
|
454
466
|
)
|
|
455
|
-
store, resolved_store_path = mlrun.store_manager.get_or_create_store(
|
|
467
|
+
store, resolved_store_path, url = mlrun.store_manager.get_or_create_store(
|
|
456
468
|
self.get_target_path(),
|
|
457
469
|
credentials_prefix_secrets,
|
|
458
470
|
)
|
|
459
|
-
|
|
460
|
-
return store, store.url + resolved_store_path
|
|
461
|
-
else:
|
|
462
|
-
return store, self.get_target_path()
|
|
471
|
+
return store, resolved_store_path, url
|
|
463
472
|
|
|
464
473
|
def _get_column_list(self, features, timestamp_key, key_columns, with_type=False):
|
|
465
474
|
result = []
|
|
@@ -505,10 +514,13 @@ class BaseStoreTarget(DataTargetBase):
|
|
|
505
514
|
options = self.get_spark_options(key_column, timestamp_key)
|
|
506
515
|
options.update(kwargs)
|
|
507
516
|
df = self.prepare_spark_df(df, key_column, timestamp_key, options)
|
|
508
|
-
|
|
517
|
+
write_format = options.pop("format", None)
|
|
518
|
+
write_spark_dataframe_with_options(
|
|
519
|
+
options, df, "overwrite", write_format=write_format
|
|
520
|
+
)
|
|
509
521
|
elif hasattr(df, "dask"):
|
|
510
522
|
dask_options = self.get_dask_options()
|
|
511
|
-
store, target_path = self._get_store_and_path()
|
|
523
|
+
store, path_in_store, target_path = self._get_store_and_path()
|
|
512
524
|
storage_options = store.get_storage_options()
|
|
513
525
|
df = df.repartition(partition_size="100MB")
|
|
514
526
|
try:
|
|
@@ -529,18 +541,21 @@ class BaseStoreTarget(DataTargetBase):
|
|
|
529
541
|
except Exception as exc:
|
|
530
542
|
raise RuntimeError("Failed to write Dask Dataframe") from exc
|
|
531
543
|
else:
|
|
532
|
-
store, target_path = self._get_store_and_path()
|
|
544
|
+
store, path_in_store, target_path = self._get_store_and_path()
|
|
533
545
|
target_path = generate_path_with_chunk(self, chunk_id, target_path)
|
|
534
546
|
file_system = store.filesystem
|
|
535
|
-
if
|
|
547
|
+
if (
|
|
548
|
+
file_system.protocol == "file"
|
|
549
|
+
# fsspec 2023.10.0 changed protocol from "file" to ("file", "local")
|
|
550
|
+
or isinstance(file_system.protocol, (tuple, list))
|
|
551
|
+
and "file" in file_system.protocol
|
|
552
|
+
):
|
|
536
553
|
dir = os.path.dirname(target_path)
|
|
537
554
|
if dir:
|
|
538
555
|
os.makedirs(dir, exist_ok=True)
|
|
539
556
|
target_df = df
|
|
540
557
|
partition_cols = None # single parquet file
|
|
541
|
-
if not
|
|
542
|
-
".pq"
|
|
543
|
-
): # directory
|
|
558
|
+
if not mlrun.utils.helpers.is_parquet_file(target_path): # directory
|
|
544
559
|
partition_cols = []
|
|
545
560
|
if timestamp_key and (
|
|
546
561
|
self.partitioned or self.time_partitioning_granularity
|
|
@@ -649,6 +664,29 @@ class BaseStoreTarget(DataTargetBase):
|
|
|
649
664
|
def _target_path_object(self):
|
|
650
665
|
"""return the actual/computed target path"""
|
|
651
666
|
is_single_file = hasattr(self, "is_single_file") and self.is_single_file()
|
|
667
|
+
|
|
668
|
+
if self._resource and self.path:
|
|
669
|
+
parsed_url = urlparse(self.path)
|
|
670
|
+
# When the URL consists only from scheme and endpoint and no path,
|
|
671
|
+
# make a default path for DS and redis targets.
|
|
672
|
+
# Also ignore KafkaTarget when it uses the ds scheme (no default path for KafkaTarget)
|
|
673
|
+
if (
|
|
674
|
+
not isinstance(self, KafkaTarget)
|
|
675
|
+
and parsed_url.scheme in ["ds", "redis", "rediss"]
|
|
676
|
+
and (not parsed_url.path or parsed_url.path == "/")
|
|
677
|
+
):
|
|
678
|
+
return TargetPathObject(
|
|
679
|
+
_get_target_path(
|
|
680
|
+
self,
|
|
681
|
+
self._resource,
|
|
682
|
+
self.run_id is not None,
|
|
683
|
+
netloc=parsed_url.netloc,
|
|
684
|
+
scheme=parsed_url.scheme,
|
|
685
|
+
),
|
|
686
|
+
self.run_id,
|
|
687
|
+
is_single_file,
|
|
688
|
+
)
|
|
689
|
+
|
|
652
690
|
return self.get_path() or (
|
|
653
691
|
TargetPathObject(
|
|
654
692
|
_get_target_path(self, self._resource, self.run_id is not None),
|
|
@@ -665,6 +703,7 @@ class BaseStoreTarget(DataTargetBase):
|
|
|
665
703
|
self.kind, self.name, self.get_target_templated_path()
|
|
666
704
|
)
|
|
667
705
|
target = self._target
|
|
706
|
+
target.attributes = self.attributes
|
|
668
707
|
target.run_id = self.run_id
|
|
669
708
|
target.status = status or target.status or "created"
|
|
670
709
|
target.updated = now_date().isoformat()
|
|
@@ -693,11 +732,25 @@ class BaseStoreTarget(DataTargetBase):
|
|
|
693
732
|
timestamp_key=None,
|
|
694
733
|
featureset_status=None,
|
|
695
734
|
):
|
|
735
|
+
if not self.support_storey:
|
|
736
|
+
raise mlrun.errors.MLRunRuntimeError(
|
|
737
|
+
f"{type(self).__name__} does not support storey engine"
|
|
738
|
+
)
|
|
696
739
|
raise NotImplementedError()
|
|
697
740
|
|
|
698
741
|
def purge(self):
|
|
699
|
-
|
|
700
|
-
|
|
742
|
+
"""
|
|
743
|
+
Delete the files of the target.
|
|
744
|
+
|
|
745
|
+
Do not use this function directly from the sdk. Use FeatureSet.purge_targets.
|
|
746
|
+
"""
|
|
747
|
+
store, path_in_store, target_path = self._get_store_and_path()
|
|
748
|
+
if path_in_store not in ["", "/"]:
|
|
749
|
+
store.rm(path_in_store, recursive=True)
|
|
750
|
+
else:
|
|
751
|
+
raise mlrun.errors.MLRunInvalidArgumentError(
|
|
752
|
+
"Unable to delete target. Please Use purge_targets from FeatureSet object."
|
|
753
|
+
)
|
|
701
754
|
|
|
702
755
|
def as_df(
|
|
703
756
|
self,
|
|
@@ -707,9 +760,15 @@ class BaseStoreTarget(DataTargetBase):
|
|
|
707
760
|
start_time=None,
|
|
708
761
|
end_time=None,
|
|
709
762
|
time_column=None,
|
|
763
|
+
additional_filters=None,
|
|
710
764
|
**kwargs,
|
|
711
765
|
):
|
|
712
766
|
"""return the target data as dataframe"""
|
|
767
|
+
if not self.support_pandas:
|
|
768
|
+
raise NotImplementedError()
|
|
769
|
+
mlrun.utils.helpers.additional_filters_warning(
|
|
770
|
+
additional_filters, self.__class__
|
|
771
|
+
)
|
|
713
772
|
return mlrun.get_dataitem(self.get_target_path()).as_df(
|
|
714
773
|
columns=columns,
|
|
715
774
|
df_module=df_module,
|
|
@@ -721,17 +780,25 @@ class BaseStoreTarget(DataTargetBase):
|
|
|
721
780
|
|
|
722
781
|
def get_spark_options(self, key_column=None, timestamp_key=None, overwrite=True):
|
|
723
782
|
# options used in spark.read.load(**options)
|
|
783
|
+
if not self.support_spark:
|
|
784
|
+
raise mlrun.errors.MLRunRuntimeError(
|
|
785
|
+
f"{type(self).__name__} does not support spark engine"
|
|
786
|
+
)
|
|
724
787
|
raise NotImplementedError()
|
|
725
788
|
|
|
726
|
-
def prepare_spark_df(self, df, key_columns, timestamp_key=None, spark_options=
|
|
789
|
+
def prepare_spark_df(self, df, key_columns, timestamp_key=None, spark_options=None):
|
|
727
790
|
return df
|
|
728
791
|
|
|
729
792
|
def get_dask_options(self):
|
|
730
793
|
raise NotImplementedError()
|
|
731
794
|
|
|
795
|
+
@property
|
|
796
|
+
def source_spark_attributes(self) -> dict:
|
|
797
|
+
return {}
|
|
798
|
+
|
|
732
799
|
|
|
733
800
|
class ParquetTarget(BaseStoreTarget):
|
|
734
|
-
"""
|
|
801
|
+
"""Parquet target storage driver, used to materialize feature set/vector data into parquet files.
|
|
735
802
|
|
|
736
803
|
:param name: optional, target name. By default will be called ParquetTarget
|
|
737
804
|
:param path: optional, Output path. Can be either a file or directory.
|
|
@@ -760,22 +827,23 @@ class ParquetTarget(BaseStoreTarget):
|
|
|
760
827
|
support_spark = True
|
|
761
828
|
support_storey = True
|
|
762
829
|
support_dask = True
|
|
830
|
+
support_pandas = True
|
|
763
831
|
support_append = True
|
|
764
832
|
|
|
765
833
|
def __init__(
|
|
766
834
|
self,
|
|
767
835
|
name: str = "",
|
|
768
836
|
path=None,
|
|
769
|
-
attributes:
|
|
837
|
+
attributes: dict[str, str] = None,
|
|
770
838
|
after_step=None,
|
|
771
839
|
columns=None,
|
|
772
840
|
partitioned: bool = None,
|
|
773
841
|
key_bucketing_number: Optional[int] = None,
|
|
774
|
-
partition_cols: Optional[
|
|
842
|
+
partition_cols: Optional[list[str]] = None,
|
|
775
843
|
time_partitioning_granularity: Optional[str] = None,
|
|
776
844
|
max_events: Optional[int] = 10000,
|
|
777
845
|
flush_after_seconds: Optional[int] = 900,
|
|
778
|
-
storage_options:
|
|
846
|
+
storage_options: dict[str, str] = None,
|
|
779
847
|
):
|
|
780
848
|
self.path = path
|
|
781
849
|
if partitioned is None:
|
|
@@ -865,10 +933,9 @@ class ParquetTarget(BaseStoreTarget):
|
|
|
865
933
|
if time_unit == time_partitioning_granularity:
|
|
866
934
|
break
|
|
867
935
|
|
|
868
|
-
|
|
869
|
-
|
|
870
|
-
|
|
871
|
-
and not self.get_target_path().endswith(".pq")
|
|
936
|
+
target_path = self.get_target_path()
|
|
937
|
+
if not self.partitioned and not mlrun.utils.helpers.is_parquet_file(
|
|
938
|
+
target_path
|
|
872
939
|
):
|
|
873
940
|
partition_cols = []
|
|
874
941
|
|
|
@@ -876,25 +943,16 @@ class ParquetTarget(BaseStoreTarget):
|
|
|
876
943
|
for key_column in key_columns:
|
|
877
944
|
tuple_key_columns.append((key_column.name, key_column.value_type))
|
|
878
945
|
|
|
879
|
-
store, target_path = self._get_store_and_path()
|
|
880
|
-
|
|
881
|
-
storage_options = store.get_storage_options()
|
|
882
|
-
if storage_options and self.storage_options:
|
|
883
|
-
storage_options = merge(storage_options, self.storage_options)
|
|
884
|
-
else:
|
|
885
|
-
storage_options = storage_options or self.storage_options
|
|
886
|
-
|
|
887
946
|
step = graph.add_step(
|
|
888
947
|
name=self.name or "ParquetTarget",
|
|
889
948
|
after=after,
|
|
890
949
|
graph_shape="cylinder",
|
|
891
|
-
class_name="
|
|
950
|
+
class_name="mlrun.datastore.storeytargets.ParquetStoreyTarget",
|
|
892
951
|
path=target_path,
|
|
893
952
|
columns=column_list,
|
|
894
953
|
index_cols=tuple_key_columns,
|
|
895
954
|
partition_cols=partition_cols,
|
|
896
955
|
time_field=timestamp_key,
|
|
897
|
-
storage_options=storage_options,
|
|
898
956
|
max_events=self.max_events,
|
|
899
957
|
flush_after_seconds=self.flush_after_seconds,
|
|
900
958
|
update_last_written=featureset_status.update_last_written_for_target,
|
|
@@ -929,27 +987,19 @@ class ParquetTarget(BaseStoreTarget):
|
|
|
929
987
|
if unit == time_partitioning_granularity:
|
|
930
988
|
break
|
|
931
989
|
|
|
932
|
-
|
|
933
|
-
|
|
934
|
-
|
|
935
|
-
|
|
936
|
-
|
|
937
|
-
path = store.url + path
|
|
938
|
-
result = {
|
|
939
|
-
"path": store_path_to_spark(path, storage_spark_options),
|
|
940
|
-
"format": "parquet",
|
|
941
|
-
}
|
|
942
|
-
result = {**result, **storage_spark_options}
|
|
943
|
-
else:
|
|
944
|
-
result = {
|
|
945
|
-
"path": store_path_to_spark(self.get_target_path()),
|
|
990
|
+
store, path, url = self._get_store_and_path()
|
|
991
|
+
spark_options = store.get_spark_options()
|
|
992
|
+
spark_options.update(
|
|
993
|
+
{
|
|
994
|
+
"path": store.spark_url + path,
|
|
946
995
|
"format": "parquet",
|
|
947
996
|
}
|
|
997
|
+
)
|
|
948
998
|
for partition_col in self.partition_cols or []:
|
|
949
999
|
partition_cols.append(partition_col)
|
|
950
1000
|
if partition_cols:
|
|
951
|
-
|
|
952
|
-
return
|
|
1001
|
+
spark_options["partitionBy"] = partition_cols
|
|
1002
|
+
return spark_options
|
|
953
1003
|
|
|
954
1004
|
def get_dask_options(self):
|
|
955
1005
|
return {"format": "parquet"}
|
|
@@ -962,6 +1012,7 @@ class ParquetTarget(BaseStoreTarget):
|
|
|
962
1012
|
start_time=None,
|
|
963
1013
|
end_time=None,
|
|
964
1014
|
time_column=None,
|
|
1015
|
+
additional_filters=None,
|
|
965
1016
|
**kwargs,
|
|
966
1017
|
):
|
|
967
1018
|
"""return the target data as dataframe"""
|
|
@@ -972,6 +1023,7 @@ class ParquetTarget(BaseStoreTarget):
|
|
|
972
1023
|
start_time=start_time,
|
|
973
1024
|
end_time=end_time,
|
|
974
1025
|
time_column=time_column,
|
|
1026
|
+
additional_filters=transform_list_filters_to_tuple(additional_filters),
|
|
975
1027
|
**kwargs,
|
|
976
1028
|
)
|
|
977
1029
|
if not columns:
|
|
@@ -993,9 +1045,7 @@ class ParquetTarget(BaseStoreTarget):
|
|
|
993
1045
|
return result
|
|
994
1046
|
|
|
995
1047
|
def is_single_file(self):
|
|
996
|
-
|
|
997
|
-
return self.path.endswith(".parquet") or self.path.endswith(".pq")
|
|
998
|
-
return False
|
|
1048
|
+
return mlrun.utils.helpers.is_parquet_file(self.path)
|
|
999
1049
|
|
|
1000
1050
|
def prepare_spark_df(self, df, key_columns, timestamp_key=None, spark_options=None):
|
|
1001
1051
|
# If partitioning by time, add the necessary columns
|
|
@@ -1035,6 +1085,7 @@ class CSVTarget(BaseStoreTarget):
|
|
|
1035
1085
|
is_offline = True
|
|
1036
1086
|
support_spark = True
|
|
1037
1087
|
support_storey = True
|
|
1088
|
+
support_pandas = True
|
|
1038
1089
|
|
|
1039
1090
|
@staticmethod
|
|
1040
1091
|
def _write_dataframe(df, storage_options, target_path, partition_cols, **kwargs):
|
|
@@ -1056,39 +1107,30 @@ class CSVTarget(BaseStoreTarget):
|
|
|
1056
1107
|
column_list = self._get_column_list(
|
|
1057
1108
|
features=features, timestamp_key=timestamp_key, key_columns=key_columns
|
|
1058
1109
|
)
|
|
1059
|
-
|
|
1110
|
+
target_path = self.get_target_path()
|
|
1060
1111
|
graph.add_step(
|
|
1061
1112
|
name=self.name or "CSVTarget",
|
|
1062
1113
|
after=after,
|
|
1063
1114
|
graph_shape="cylinder",
|
|
1064
|
-
class_name="
|
|
1115
|
+
class_name="mlrun.datastore.storeytargets.CSVStoreyTarget",
|
|
1065
1116
|
path=target_path,
|
|
1066
1117
|
columns=column_list,
|
|
1067
1118
|
header=True,
|
|
1068
1119
|
index_cols=key_columns,
|
|
1069
|
-
storage_options=store.get_storage_options(),
|
|
1070
1120
|
**self.attributes,
|
|
1071
1121
|
)
|
|
1072
1122
|
|
|
1073
1123
|
def get_spark_options(self, key_column=None, timestamp_key=None, overwrite=True):
|
|
1074
|
-
|
|
1075
|
-
|
|
1076
|
-
|
|
1077
|
-
|
|
1078
|
-
|
|
1079
|
-
path = store.url + path
|
|
1080
|
-
result = {
|
|
1081
|
-
"path": store_path_to_spark(path, storage_spark_options),
|
|
1082
|
-
"format": "csv",
|
|
1083
|
-
"header": "true",
|
|
1084
|
-
}
|
|
1085
|
-
return {**result, **storage_spark_options}
|
|
1086
|
-
else:
|
|
1087
|
-
return {
|
|
1088
|
-
"path": store_path_to_spark(self.get_target_path()),
|
|
1124
|
+
store, path, url = self._get_store_and_path()
|
|
1125
|
+
spark_options = store.get_spark_options()
|
|
1126
|
+
spark_options.update(
|
|
1127
|
+
{
|
|
1128
|
+
"path": store.spark_url + path,
|
|
1089
1129
|
"format": "csv",
|
|
1090
1130
|
"header": "true",
|
|
1091
1131
|
}
|
|
1132
|
+
)
|
|
1133
|
+
return spark_options
|
|
1092
1134
|
|
|
1093
1135
|
def prepare_spark_df(self, df, key_columns, timestamp_key=None, spark_options=None):
|
|
1094
1136
|
import pyspark.sql.functions as funcs
|
|
@@ -1110,8 +1152,12 @@ class CSVTarget(BaseStoreTarget):
|
|
|
1110
1152
|
start_time=None,
|
|
1111
1153
|
end_time=None,
|
|
1112
1154
|
time_column=None,
|
|
1155
|
+
additional_filters=None,
|
|
1113
1156
|
**kwargs,
|
|
1114
1157
|
):
|
|
1158
|
+
mlrun.utils.helpers.additional_filters_warning(
|
|
1159
|
+
additional_filters, self.__class__
|
|
1160
|
+
)
|
|
1115
1161
|
df = super().as_df(
|
|
1116
1162
|
columns=columns,
|
|
1117
1163
|
df_module=df_module,
|
|
@@ -1132,6 +1178,134 @@ class CSVTarget(BaseStoreTarget):
|
|
|
1132
1178
|
return True
|
|
1133
1179
|
|
|
1134
1180
|
|
|
1181
|
+
class SnowflakeTarget(BaseStoreTarget):
|
|
1182
|
+
"""
|
|
1183
|
+
:param attributes: A dictionary of attributes for Snowflake connection; will be overridden by database parameters
|
|
1184
|
+
if they exist.
|
|
1185
|
+
:param url: Snowflake hostname, in the format: <account_name>.<region>.snowflakecomputing.com
|
|
1186
|
+
:param user: Snowflake user for login
|
|
1187
|
+
:param db_schema: Database schema
|
|
1188
|
+
:param database: Database name
|
|
1189
|
+
:param warehouse: Snowflake warehouse name
|
|
1190
|
+
:param table_name: Snowflake table name
|
|
1191
|
+
"""
|
|
1192
|
+
|
|
1193
|
+
support_spark = True
|
|
1194
|
+
support_append = True
|
|
1195
|
+
is_offline = True
|
|
1196
|
+
kind = TargetTypes.snowflake
|
|
1197
|
+
|
|
1198
|
+
def __init__(
|
|
1199
|
+
self,
|
|
1200
|
+
name: str = "",
|
|
1201
|
+
path=None,
|
|
1202
|
+
attributes: dict[str, str] = None,
|
|
1203
|
+
after_step=None,
|
|
1204
|
+
columns=None,
|
|
1205
|
+
partitioned: bool = False,
|
|
1206
|
+
key_bucketing_number: Optional[int] = None,
|
|
1207
|
+
partition_cols: Optional[list[str]] = None,
|
|
1208
|
+
time_partitioning_granularity: Optional[str] = None,
|
|
1209
|
+
max_events: Optional[int] = None,
|
|
1210
|
+
flush_after_seconds: Optional[int] = None,
|
|
1211
|
+
storage_options: dict[str, str] = None,
|
|
1212
|
+
schema: dict[str, Any] = None,
|
|
1213
|
+
credentials_prefix=None,
|
|
1214
|
+
url: str = None,
|
|
1215
|
+
user: str = None,
|
|
1216
|
+
db_schema: str = None,
|
|
1217
|
+
database: str = None,
|
|
1218
|
+
warehouse: str = None,
|
|
1219
|
+
table_name: str = None,
|
|
1220
|
+
):
|
|
1221
|
+
attributes = attributes or {}
|
|
1222
|
+
if url:
|
|
1223
|
+
attributes["url"] = url
|
|
1224
|
+
if user:
|
|
1225
|
+
attributes["user"] = user
|
|
1226
|
+
if database:
|
|
1227
|
+
attributes["database"] = database
|
|
1228
|
+
if db_schema:
|
|
1229
|
+
attributes["db_schema"] = db_schema
|
|
1230
|
+
if warehouse:
|
|
1231
|
+
attributes["warehouse"] = warehouse
|
|
1232
|
+
if table_name:
|
|
1233
|
+
attributes["table"] = table_name
|
|
1234
|
+
|
|
1235
|
+
super().__init__(
|
|
1236
|
+
name,
|
|
1237
|
+
path,
|
|
1238
|
+
attributes,
|
|
1239
|
+
after_step,
|
|
1240
|
+
list(schema.keys()) if schema else columns,
|
|
1241
|
+
partitioned,
|
|
1242
|
+
key_bucketing_number,
|
|
1243
|
+
partition_cols,
|
|
1244
|
+
time_partitioning_granularity,
|
|
1245
|
+
max_events=max_events,
|
|
1246
|
+
flush_after_seconds=flush_after_seconds,
|
|
1247
|
+
storage_options=storage_options,
|
|
1248
|
+
schema=schema,
|
|
1249
|
+
credentials_prefix=credentials_prefix,
|
|
1250
|
+
)
|
|
1251
|
+
|
|
1252
|
+
def get_spark_options(self, key_column=None, timestamp_key=None, overwrite=True):
|
|
1253
|
+
spark_options = get_snowflake_spark_options(self.attributes)
|
|
1254
|
+
spark_options["dbtable"] = self.attributes.get("table")
|
|
1255
|
+
return spark_options
|
|
1256
|
+
|
|
1257
|
+
def purge(self):
|
|
1258
|
+
import snowflake.connector
|
|
1259
|
+
|
|
1260
|
+
missing = [
|
|
1261
|
+
key
|
|
1262
|
+
for key in ["database", "db_schema", "table", "url", "user", "warehouse"]
|
|
1263
|
+
if self.attributes.get(key) is None
|
|
1264
|
+
]
|
|
1265
|
+
if missing:
|
|
1266
|
+
raise mlrun.errors.MLRunRuntimeError(
|
|
1267
|
+
f"Can't purge Snowflake target, "
|
|
1268
|
+
f"some attributes are missing: {', '.join(missing)}"
|
|
1269
|
+
)
|
|
1270
|
+
account = self.attributes["url"].replace(".snowflakecomputing.com", "")
|
|
1271
|
+
|
|
1272
|
+
with snowflake.connector.connect(
|
|
1273
|
+
account=account,
|
|
1274
|
+
user=self.attributes["user"],
|
|
1275
|
+
password=get_snowflake_password(),
|
|
1276
|
+
warehouse=self.attributes["warehouse"],
|
|
1277
|
+
) as snowflake_connector:
|
|
1278
|
+
drop_statement = (
|
|
1279
|
+
f"DROP TABLE IF EXISTS {self.attributes['database']}.{self.attributes['db_schema']}"
|
|
1280
|
+
f".{self.attributes['table']}"
|
|
1281
|
+
)
|
|
1282
|
+
snowflake_connector.execute_string(drop_statement)
|
|
1283
|
+
|
|
1284
|
+
def as_df(
|
|
1285
|
+
self,
|
|
1286
|
+
columns=None,
|
|
1287
|
+
df_module=None,
|
|
1288
|
+
entities=None,
|
|
1289
|
+
start_time=None,
|
|
1290
|
+
end_time=None,
|
|
1291
|
+
time_column=None,
|
|
1292
|
+
additional_filters=None,
|
|
1293
|
+
**kwargs,
|
|
1294
|
+
):
|
|
1295
|
+
raise mlrun.errors.MLRunRuntimeError(
|
|
1296
|
+
f"{type(self).__name__} does not support pandas engine"
|
|
1297
|
+
)
|
|
1298
|
+
|
|
1299
|
+
@property
|
|
1300
|
+
def source_spark_attributes(self) -> dict:
|
|
1301
|
+
keys = ["url", "user", "database", "db_schema", "warehouse"]
|
|
1302
|
+
attributes = self.attributes or {}
|
|
1303
|
+
snowflake_dict = {key: attributes.get(key) for key in keys}
|
|
1304
|
+
table = attributes.get("table")
|
|
1305
|
+
snowflake_dict["query"] = f"SELECT * from {table}" if table else None
|
|
1306
|
+
return snowflake_dict
|
|
1307
|
+
|
|
1308
|
+
|
|
1135
1309
|
class NoSqlBaseTarget(BaseStoreTarget):
|
|
1136
1310
|
is_table = True
|
|
1137
1311
|
is_online = True
|
|
@@ -1156,6 +1330,19 @@ class NoSqlBaseTarget(BaseStoreTarget):
|
|
|
1156
1330
|
timestamp_key=None,
|
|
1157
1331
|
featureset_status=None,
|
|
1158
1332
|
):
|
|
1333
|
+
table, column_list = self._get_table_and_columns(features, key_columns)
|
|
1334
|
+
|
|
1335
|
+
graph.add_step(
|
|
1336
|
+
name=self.name or self.writer_step_name,
|
|
1337
|
+
after=after,
|
|
1338
|
+
graph_shape="cylinder",
|
|
1339
|
+
class_name="mlrun.datastore.storeytargets.NoSqlStoreyTarget",
|
|
1340
|
+
columns=column_list,
|
|
1341
|
+
table=table,
|
|
1342
|
+
**self.attributes,
|
|
1343
|
+
)
|
|
1344
|
+
|
|
1345
|
+
def _get_table_and_columns(self, features, key_columns):
|
|
1159
1346
|
key_columns = list(key_columns.keys())
|
|
1160
1347
|
table = self._resource.uri
|
|
1161
1348
|
column_list = self._get_column_list(
|
|
@@ -1174,15 +1361,7 @@ class NoSqlBaseTarget(BaseStoreTarget):
|
|
|
1174
1361
|
col for col in column_list if col[0] not in aggregate_features
|
|
1175
1362
|
]
|
|
1176
1363
|
|
|
1177
|
-
|
|
1178
|
-
name=self.name or self.writer_step_name,
|
|
1179
|
-
after=after,
|
|
1180
|
-
graph_shape="cylinder",
|
|
1181
|
-
class_name="storey.NoSqlTarget",
|
|
1182
|
-
columns=column_list,
|
|
1183
|
-
table=table,
|
|
1184
|
-
**self.attributes,
|
|
1185
|
-
)
|
|
1364
|
+
return table, column_list
|
|
1186
1365
|
|
|
1187
1366
|
def prepare_spark_df(self, df, key_columns, timestamp_key=None, spark_options=None):
|
|
1188
1367
|
raise NotImplementedError()
|
|
@@ -1193,9 +1372,6 @@ class NoSqlBaseTarget(BaseStoreTarget):
|
|
|
1193
1372
|
def get_dask_options(self):
|
|
1194
1373
|
return {"format": "csv"}
|
|
1195
1374
|
|
|
1196
|
-
def as_df(self, columns=None, df_module=None, **kwargs):
|
|
1197
|
-
raise NotImplementedError()
|
|
1198
|
-
|
|
1199
1375
|
def write_dataframe(
|
|
1200
1376
|
self, df, key_column=None, timestamp_key=None, chunk_id=0, **kwargs
|
|
1201
1377
|
):
|
|
@@ -1203,7 +1379,10 @@ class NoSqlBaseTarget(BaseStoreTarget):
|
|
|
1203
1379
|
options = self.get_spark_options(key_column, timestamp_key)
|
|
1204
1380
|
options.update(kwargs)
|
|
1205
1381
|
df = self.prepare_spark_df(df)
|
|
1206
|
-
|
|
1382
|
+
write_format = options.pop("format", None)
|
|
1383
|
+
write_spark_dataframe_with_options(
|
|
1384
|
+
options, df, "overwrite", write_format=write_format
|
|
1385
|
+
)
|
|
1207
1386
|
else:
|
|
1208
1387
|
# To prevent modification of the original dataframe and make sure
|
|
1209
1388
|
# that the last event of a key is the one being persisted
|
|
@@ -1213,7 +1392,11 @@ class NoSqlBaseTarget(BaseStoreTarget):
|
|
|
1213
1392
|
df = df.copy(deep=False)
|
|
1214
1393
|
access_key = self._get_credential("V3IO_ACCESS_KEY")
|
|
1215
1394
|
|
|
1216
|
-
|
|
1395
|
+
store, path_in_store, target_path = self._get_store_and_path()
|
|
1396
|
+
storage_options = store.get_storage_options()
|
|
1397
|
+
access_key = storage_options.get("v3io_access_key", access_key)
|
|
1398
|
+
|
|
1399
|
+
_, path_with_container = parse_path(target_path)
|
|
1217
1400
|
container, path = split_path(path_with_container)
|
|
1218
1401
|
|
|
1219
1402
|
frames_client = get_frames_client(
|
|
@@ -1231,17 +1414,31 @@ class NoSqlTarget(NoSqlBaseTarget):
|
|
|
1231
1414
|
def get_table_object(self):
|
|
1232
1415
|
from storey import Table, V3ioDriver
|
|
1233
1416
|
|
|
1234
|
-
|
|
1235
|
-
endpoint, uri = parse_path(
|
|
1417
|
+
store, path_in_store, target_path = self._get_store_and_path()
|
|
1418
|
+
endpoint, uri = parse_path(target_path)
|
|
1419
|
+
storage_options = store.get_storage_options()
|
|
1420
|
+
access_key = storage_options.get("v3io_access_key")
|
|
1421
|
+
|
|
1236
1422
|
return Table(
|
|
1237
1423
|
uri,
|
|
1238
|
-
V3ioDriver(webapi=endpoint or mlrun.mlconf.v3io_api),
|
|
1424
|
+
V3ioDriver(webapi=endpoint or mlrun.mlconf.v3io_api, access_key=access_key),
|
|
1239
1425
|
flush_interval_secs=mlrun.mlconf.feature_store.flush_interval,
|
|
1240
1426
|
)
|
|
1241
1427
|
|
|
1242
1428
|
def get_spark_options(self, key_column=None, timestamp_key=None, overwrite=True):
|
|
1429
|
+
store, path_in_store, target_path = self._get_store_and_path()
|
|
1430
|
+
storage_options = store.get_storage_options()
|
|
1431
|
+
store_access_key = storage_options.get("v3io_access_key")
|
|
1432
|
+
env_access_key = self._secrets.get(
|
|
1433
|
+
"V3IO_ACCESS_KEY", os.getenv("V3IO_ACCESS_KEY")
|
|
1434
|
+
)
|
|
1435
|
+
if store_access_key and env_access_key and store_access_key != env_access_key:
|
|
1436
|
+
logger.warning(
|
|
1437
|
+
"The Spark v3io connector does not support access_key parameterization."
|
|
1438
|
+
"Spark will disregard the store-provided key."
|
|
1439
|
+
)
|
|
1243
1440
|
spark_options = {
|
|
1244
|
-
"path":
|
|
1441
|
+
"path": store.spark_url + path_in_store,
|
|
1245
1442
|
"format": "io.iguaz.v3io.spark.sql.kv",
|
|
1246
1443
|
}
|
|
1247
1444
|
if isinstance(key_column, list) and len(key_column) >= 1:
|
|
@@ -1287,11 +1484,9 @@ class RedisNoSqlTarget(NoSqlBaseTarget):
|
|
|
1287
1484
|
support_spark = True
|
|
1288
1485
|
writer_step_name = "RedisNoSqlTarget"
|
|
1289
1486
|
|
|
1290
|
-
|
|
1291
|
-
|
|
1292
|
-
|
|
1293
|
-
def _get_server_endpoint(self):
|
|
1294
|
-
endpoint, uri = parse_path(self.get_target_path())
|
|
1487
|
+
@staticmethod
|
|
1488
|
+
def get_server_endpoint(path, credentials_prefix=None):
|
|
1489
|
+
endpoint, uri = parse_path(path)
|
|
1295
1490
|
endpoint = endpoint or mlrun.mlconf.redis.url
|
|
1296
1491
|
if endpoint.startswith("ds://"):
|
|
1297
1492
|
datastore_profile = datastore_profile_read(endpoint)
|
|
@@ -1308,8 +1503,15 @@ class RedisNoSqlTarget(NoSqlBaseTarget):
|
|
|
1308
1503
|
raise mlrun.errors.MLRunInvalidArgumentError(
|
|
1309
1504
|
"Provide Redis username and password only via secrets"
|
|
1310
1505
|
)
|
|
1311
|
-
|
|
1312
|
-
|
|
1506
|
+
credentials_prefix = credentials_prefix or mlrun.get_secret_or_env(
|
|
1507
|
+
key="CREDENTIALS_PREFIX"
|
|
1508
|
+
)
|
|
1509
|
+
user = mlrun.get_secret_or_env(
|
|
1510
|
+
"REDIS_USER", default="", prefix=credentials_prefix
|
|
1511
|
+
)
|
|
1512
|
+
password = mlrun.get_secret_or_env(
|
|
1513
|
+
"REDIS_PASSWORD", default="", prefix=credentials_prefix
|
|
1514
|
+
)
|
|
1313
1515
|
host = parsed_endpoint.hostname
|
|
1314
1516
|
port = parsed_endpoint.port if parsed_endpoint.port else "6379"
|
|
1315
1517
|
scheme = parsed_endpoint.scheme
|
|
@@ -1323,7 +1525,9 @@ class RedisNoSqlTarget(NoSqlBaseTarget):
|
|
|
1323
1525
|
from storey import Table
|
|
1324
1526
|
from storey.redis_driver import RedisDriver
|
|
1325
1527
|
|
|
1326
|
-
endpoint, uri = self.
|
|
1528
|
+
endpoint, uri = self.get_server_endpoint(
|
|
1529
|
+
self.get_target_path(), self.credentials_prefix
|
|
1530
|
+
)
|
|
1327
1531
|
|
|
1328
1532
|
return Table(
|
|
1329
1533
|
uri,
|
|
@@ -1332,12 +1536,14 @@ class RedisNoSqlTarget(NoSqlBaseTarget):
|
|
|
1332
1536
|
)
|
|
1333
1537
|
|
|
1334
1538
|
def get_spark_options(self, key_column=None, timestamp_key=None, overwrite=True):
|
|
1335
|
-
endpoint, uri = self.
|
|
1539
|
+
endpoint, uri = self.get_server_endpoint(
|
|
1540
|
+
self.get_target_path(), self.credentials_prefix
|
|
1541
|
+
)
|
|
1336
1542
|
parsed_endpoint = urlparse(endpoint)
|
|
1337
|
-
|
|
1543
|
+
store, path_in_store, path = self._get_store_and_path()
|
|
1338
1544
|
return {
|
|
1339
1545
|
"key.column": "_spark_object_name",
|
|
1340
|
-
"table": "{" +
|
|
1546
|
+
"table": "{" + path_in_store,
|
|
1341
1547
|
"format": "org.apache.spark.sql.redis",
|
|
1342
1548
|
"host": parsed_endpoint.hostname,
|
|
1343
1549
|
"port": parsed_endpoint.port,
|
|
@@ -1364,6 +1570,29 @@ class RedisNoSqlTarget(NoSqlBaseTarget):
|
|
|
1364
1570
|
|
|
1365
1571
|
return df
|
|
1366
1572
|
|
|
1573
|
+
def add_writer_step(
|
|
1574
|
+
self,
|
|
1575
|
+
graph,
|
|
1576
|
+
after,
|
|
1577
|
+
features,
|
|
1578
|
+
key_columns=None,
|
|
1579
|
+
timestamp_key=None,
|
|
1580
|
+
featureset_status=None,
|
|
1581
|
+
):
|
|
1582
|
+
table, column_list = self._get_table_and_columns(features, key_columns)
|
|
1583
|
+
|
|
1584
|
+
graph.add_step(
|
|
1585
|
+
path=self.get_target_path(),
|
|
1586
|
+
name=self.name or self.writer_step_name,
|
|
1587
|
+
after=after,
|
|
1588
|
+
graph_shape="cylinder",
|
|
1589
|
+
class_name="mlrun.datastore.storeytargets.RedisNoSqlStoreyTarget",
|
|
1590
|
+
columns=column_list,
|
|
1591
|
+
table=table,
|
|
1592
|
+
credentials_prefix=self.credentials_prefix,
|
|
1593
|
+
**self.attributes,
|
|
1594
|
+
)
|
|
1595
|
+
|
|
1367
1596
|
|
|
1368
1597
|
class StreamTarget(BaseStoreTarget):
|
|
1369
1598
|
kind = TargetTypes.stream
|
|
@@ -1382,33 +1611,46 @@ class StreamTarget(BaseStoreTarget):
|
|
|
1382
1611
|
timestamp_key=None,
|
|
1383
1612
|
featureset_status=None,
|
|
1384
1613
|
):
|
|
1385
|
-
from storey import V3ioDriver
|
|
1386
|
-
|
|
1387
1614
|
key_columns = list(key_columns.keys())
|
|
1388
|
-
|
|
1389
|
-
if not path:
|
|
1390
|
-
raise mlrun.errors.MLRunInvalidArgumentError("StreamTarget requires a path")
|
|
1391
|
-
endpoint, uri = parse_path(path)
|
|
1615
|
+
|
|
1392
1616
|
column_list = self._get_column_list(
|
|
1393
1617
|
features=features, timestamp_key=timestamp_key, key_columns=key_columns
|
|
1394
1618
|
)
|
|
1619
|
+
stream_path = self.get_target_path()
|
|
1620
|
+
if not stream_path:
|
|
1621
|
+
raise mlrun.errors.MLRunInvalidArgumentError("StreamTarget requires a path")
|
|
1395
1622
|
|
|
1396
1623
|
graph.add_step(
|
|
1397
1624
|
name=self.name or "StreamTarget",
|
|
1398
1625
|
after=after,
|
|
1399
1626
|
graph_shape="cylinder",
|
|
1400
|
-
class_name="
|
|
1627
|
+
class_name="mlrun.datastore.storeytargets.StreamStoreyTarget",
|
|
1401
1628
|
columns=column_list,
|
|
1402
|
-
|
|
1403
|
-
stream_path=uri,
|
|
1629
|
+
stream_path=stream_path,
|
|
1404
1630
|
**self.attributes,
|
|
1405
1631
|
)
|
|
1406
1632
|
|
|
1407
|
-
def as_df(self, columns=None, df_module=None, **kwargs):
|
|
1408
|
-
raise NotImplementedError()
|
|
1409
|
-
|
|
1410
1633
|
|
|
1411
1634
|
class KafkaTarget(BaseStoreTarget):
|
|
1635
|
+
"""
|
|
1636
|
+
Kafka target storage driver, used to write data into kafka topics.
|
|
1637
|
+
example::
|
|
1638
|
+
# define target
|
|
1639
|
+
kafka_target = KafkaTarget(
|
|
1640
|
+
name="kafka", path="my_topic", brokers="localhost:9092"
|
|
1641
|
+
)
|
|
1642
|
+
# ingest
|
|
1643
|
+
stocks_set.ingest(stocks, [kafka_target])
|
|
1644
|
+
:param name: target name
|
|
1645
|
+
:param path: topic name e.g. "my_topic"
|
|
1646
|
+
:param after_step: optional, after what step in the graph to add the target
|
|
1647
|
+
:param columns: optional, which columns from data to write
|
|
1648
|
+
:param bootstrap_servers: Deprecated. Use the brokers parameter instead
|
|
1649
|
+
:param producer_options: additional configurations for kafka producer
|
|
1650
|
+
:param brokers: kafka broker as represented by a host:port pair, or a list of kafka brokers, e.g.
|
|
1651
|
+
"localhost:9092", or ["kafka-broker-1:9092", "kafka-broker-2:9092"]
|
|
1652
|
+
"""
|
|
1653
|
+
|
|
1412
1654
|
kind = TargetTypes.kafka
|
|
1413
1655
|
is_table = False
|
|
1414
1656
|
is_online = False
|
|
@@ -1421,11 +1663,27 @@ class KafkaTarget(BaseStoreTarget):
|
|
|
1421
1663
|
*args,
|
|
1422
1664
|
bootstrap_servers=None,
|
|
1423
1665
|
producer_options=None,
|
|
1666
|
+
brokers=None,
|
|
1424
1667
|
**kwargs,
|
|
1425
1668
|
):
|
|
1426
1669
|
attrs = {}
|
|
1427
|
-
|
|
1428
|
-
|
|
1670
|
+
|
|
1671
|
+
# TODO: Remove this in 1.9.0
|
|
1672
|
+
if bootstrap_servers:
|
|
1673
|
+
if brokers:
|
|
1674
|
+
raise mlrun.errors.MLRunInvalidArgumentError(
|
|
1675
|
+
"KafkaTarget cannot be created with both the 'brokers' parameter and the deprecated "
|
|
1676
|
+
"'bootstrap_servers' parameter. Please use 'brokers' only."
|
|
1677
|
+
)
|
|
1678
|
+
warnings.warn(
|
|
1679
|
+
"'bootstrap_servers' parameter is deprecated in 1.7.0 and will be removed in 1.9.0, "
|
|
1680
|
+
"use 'brokers' instead.",
|
|
1681
|
+
FutureWarning,
|
|
1682
|
+
)
|
|
1683
|
+
brokers = bootstrap_servers
|
|
1684
|
+
|
|
1685
|
+
if brokers:
|
|
1686
|
+
attrs["brokers"] = brokers
|
|
1429
1687
|
if producer_options is not None:
|
|
1430
1688
|
attrs["producer_options"] = producer_options
|
|
1431
1689
|
|
|
@@ -1444,37 +1702,21 @@ class KafkaTarget(BaseStoreTarget):
|
|
|
1444
1702
|
column_list = self._get_column_list(
|
|
1445
1703
|
features=features, timestamp_key=timestamp_key, key_columns=key_columns
|
|
1446
1704
|
)
|
|
1447
|
-
|
|
1448
|
-
datastore_profile = datastore_profile_read(self.path)
|
|
1449
|
-
attributes = datastore_profile.attributes()
|
|
1450
|
-
bootstrap_servers = attributes.pop("bootstrap_servers", None)
|
|
1451
|
-
topic = datastore_profile.topic
|
|
1452
|
-
else:
|
|
1453
|
-
attributes = copy(self.attributes)
|
|
1454
|
-
bootstrap_servers = attributes.pop("bootstrap_servers", None)
|
|
1455
|
-
topic, bootstrap_servers = parse_kafka_url(
|
|
1456
|
-
self.get_target_path(), bootstrap_servers
|
|
1457
|
-
)
|
|
1705
|
+
path = self.get_target_path()
|
|
1458
1706
|
|
|
1459
|
-
if not
|
|
1460
|
-
raise mlrun.errors.MLRunInvalidArgumentError(
|
|
1461
|
-
"KafkaTarget requires a path (topic)"
|
|
1462
|
-
)
|
|
1707
|
+
if not path:
|
|
1708
|
+
raise mlrun.errors.MLRunInvalidArgumentError("KafkaTarget requires a path")
|
|
1463
1709
|
|
|
1464
1710
|
graph.add_step(
|
|
1465
1711
|
name=self.name or "KafkaTarget",
|
|
1466
1712
|
after=after,
|
|
1467
1713
|
graph_shape="cylinder",
|
|
1468
|
-
class_name="
|
|
1714
|
+
class_name="mlrun.datastore.storeytargets.KafkaStoreyTarget",
|
|
1469
1715
|
columns=column_list,
|
|
1470
|
-
|
|
1471
|
-
|
|
1472
|
-
**attributes,
|
|
1716
|
+
path=path,
|
|
1717
|
+
attributes=self.attributes,
|
|
1473
1718
|
)
|
|
1474
1719
|
|
|
1475
|
-
def as_df(self, columns=None, df_module=None, **kwargs):
|
|
1476
|
-
raise NotImplementedError()
|
|
1477
|
-
|
|
1478
1720
|
def purge(self):
|
|
1479
1721
|
pass
|
|
1480
1722
|
|
|
@@ -1509,7 +1751,7 @@ class TSDBTarget(BaseStoreTarget):
|
|
|
1509
1751
|
|
|
1510
1752
|
graph.add_step(
|
|
1511
1753
|
name=self.name or "TSDBTarget",
|
|
1512
|
-
class_name="
|
|
1754
|
+
class_name="mlrun.datastore.storeytargets.TSDBStoreyTarget",
|
|
1513
1755
|
after=after,
|
|
1514
1756
|
graph_shape="cylinder",
|
|
1515
1757
|
path=uri,
|
|
@@ -1519,9 +1761,6 @@ class TSDBTarget(BaseStoreTarget):
|
|
|
1519
1761
|
**self.attributes,
|
|
1520
1762
|
)
|
|
1521
1763
|
|
|
1522
|
-
def as_df(self, columns=None, df_module=None, **kwargs):
|
|
1523
|
-
raise NotImplementedError()
|
|
1524
|
-
|
|
1525
1764
|
def write_dataframe(
|
|
1526
1765
|
self, df, key_column=None, timestamp_key=None, chunk_id=0, **kwargs
|
|
1527
1766
|
):
|
|
@@ -1535,7 +1774,11 @@ class TSDBTarget(BaseStoreTarget):
                 key_column = [key_column]
             new_index.extend(key_column)
 
-        _, path_with_container = parse_path(self.get_target_path())
+        store, path_in_store, target_path = self._get_store_and_path()
+        storage_options = store.get_storage_options()
+        access_key = storage_options.get("v3io_access_key", access_key)
+
+        _, path_with_container = parse_path(target_path)
         container, path = split_path(path_with_container)
 
         frames_client = get_frames_client(
@@ -1555,6 +1798,7 @@ class CustomTarget(BaseStoreTarget):
     is_online = False
     support_spark = False
     support_storey = True
+    support_pandas = True
 
     def __init__(
         self,
@@ -1590,6 +1834,7 @@ class CustomTarget(BaseStoreTarget):
 class DFTarget(BaseStoreTarget):
     kind = TargetTypes.dataframe
     support_storey = True
+    support_pandas = True
 
     def __init__(self, *args, name="dataframe", **kwargs):
         self._df = None
@@ -1626,11 +1871,16 @@ class DFTarget(BaseStoreTarget):
         self,
         columns=None,
         df_module=None,
+        entities=None,
        start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
         **kwargs,
     ):
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
         return select_columns_from_df(
             filter_df_start_end_time(
                 self._df,
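Both DFTarget.as_df (above) and SQLTarget.as_df (further below) now accept an `additional_filters` argument and route it through `mlrun.utils.helpers.additional_filters_warning`. A rough stand-in for what such a helper could do, assuming it simply warns and drops the filters for targets that cannot apply them; the real helper's behavior is only inferred from this diff:

```python
import warnings


def additional_filters_warning(additional_filters, cls):
    # Illustrative stand-in, not the mlrun implementation: targets that cannot
    # push the filters down just warn that they will be ignored.
    if additional_filters:
        warnings.warn(
            f"additional_filters is not supported by {cls.__name__} and will be ignored",
            UserWarning,
        )
```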
@@ -1647,29 +1897,30 @@ class SQLTarget(BaseStoreTarget):
     is_online = True
     support_spark = False
     support_storey = True
+    support_pandas = True
 
     def __init__(
         self,
         name: str = "",
         path=None,
-        attributes: Dict[str, str] = None,
+        attributes: dict[str, str] = None,
         after_step=None,
         partitioned: bool = False,
         key_bucketing_number: Optional[int] = None,
-        partition_cols: Optional[List[str]] = None,
+        partition_cols: Optional[list[str]] = None,
         time_partitioning_granularity: Optional[str] = None,
         max_events: Optional[int] = None,
         flush_after_seconds: Optional[int] = None,
-        storage_options: Dict[str, str] = None,
+        storage_options: dict[str, str] = None,
         db_url: str = None,
         table_name: str = None,
-        schema: Dict[str, Any] = None,
+        schema: dict[str, Any] = None,
         primary_key_column: str = "",
         if_exists: str = "append",
         create_table: bool = False,
         # create_according_to_data: bool = False,
         varchar_len: int = 50,
-        parse_dates: List[str] = None,
+        parse_dates: list[str] = None,
     ):
         """
         Write to SqlDB as output target for a flow.
@@ -1789,7 +2040,7 @@ class SQLTarget(BaseStoreTarget):
             name=self.name or "SqlTarget",
             after=after,
             graph_shape="cylinder",
-            class_name="storey.NoSqlTarget",
+            class_name="mlrun.datastore.storeytargets.NoSqlStoreyTarget",
             columns=column_list,
             header=True,
             table=table,
@@ -1805,6 +2056,7 @@ class SQLTarget(BaseStoreTarget):
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
         **kwargs,
     ):
         try:
@@ -1813,9 +2065,13 @@ class SQLTarget(BaseStoreTarget):
         except (ModuleNotFoundError, ImportError) as exc:
             self._raise_sqlalchemy_import_error(exc)
 
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
+
         db_path, table_name, _, _, _, _ = self._parse_url()
         engine = sqlalchemy.create_engine(db_path)
-        parse_dates: Optional[List[str]] = self.attributes.get("parse_dates")
+        parse_dates: Optional[list[str]] = self.attributes.get("parse_dates")
         with engine.connect() as conn:
             query, parse_dates = _generate_sql_query_with_time_filter(
                 table_name=table_name,
@@ -1902,7 +2158,7 @@ class SQLTarget(BaseStoreTarget):
                 raise ValueError(f"Table named {table_name} is not exist")
 
             elif not table_exists and create_table:
-                TYPE_TO_SQL_TYPE = {
+                type_to_sql_type = {
                     int: sqlalchemy.Integer,
                     str: sqlalchemy.String(self.attributes.get("varchar_len")),
                     datetime.datetime: sqlalchemy.dialects.mysql.DATETIME(fsp=6),
@@ -1915,12 +2171,16 @@ class SQLTarget(BaseStoreTarget):
                 # creat new table with the given name
                 columns = []
                 for col, col_type in self.schema.items():
-                    col_type = TYPE_TO_SQL_TYPE.get(col_type)
-                    if col_type is None:
-                        raise TypeError(f"{col_type} unsupported type for column {col}")
+                    col_type_sql = type_to_sql_type.get(col_type)
+                    if col_type_sql is None:
+                        raise TypeError(
+                            f"'{col_type}' unsupported type for column '{col}'"
+                        )
                     columns.append(
                         sqlalchemy.Column(
-                            col, col_type, primary_key=(col in primary_key_for_check)
+                            col,
+                            col_type_sql,
+                            primary_key=(col in primary_key_for_check),
                         )
                     )
 
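The two hunks above replace an unguarded column-type lookup with an explicit mapping and a clear TypeError for unsupported Python types. A self-contained sketch of the same pattern outside MLRun; the varchar length, example schema, and primary-key set are made-up values standing in for `self.attributes` and `self.schema`:

```python
import datetime

import sqlalchemy
from sqlalchemy.dialects.mysql import DATETIME

# Map Python types from the declared schema to SQLAlchemy column types.
type_to_sql_type = {
    int: sqlalchemy.Integer,
    str: sqlalchemy.String(50),  # stand-in for self.attributes.get("varchar_len")
    datetime.datetime: DATETIME(fsp=6),
}

schema = {"id": int, "name": str, "ts": datetime.datetime}
primary_key_for_check = {"id"}

columns = []
for col, col_type in schema.items():
    col_type_sql = type_to_sql_type.get(col_type)
    if col_type_sql is None:
        # Fail fast instead of letting SQLAlchemy error out later.
        raise TypeError(f"'{col_type}' unsupported type for column '{col}'")
    columns.append(
        sqlalchemy.Column(col, col_type_sql, primary_key=(col in primary_key_for_check))
    )
```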
@@ -1951,10 +2211,11 @@ kind_to_driver = {
     TargetTypes.tsdb: TSDBTarget,
     TargetTypes.custom: CustomTarget,
     TargetTypes.sql: SQLTarget,
+    TargetTypes.snowflake: SnowflakeTarget,
 }
 
 
-def _get_target_path(driver, resource, run_id_mode=False):
+def _get_target_path(driver, resource, run_id_mode=False, netloc=None, scheme=""):
     """return the default target path given the resource and target kind"""
     kind = driver.kind
     suffix = driver.suffix
@@ -1971,11 +2232,27 @@ def _get_target_path(driver, resource, run_id_mode=False):
         )
     name = resource.metadata.name
     project = resource.metadata.project or mlrun.mlconf.default_project
-    data_prefix = get_default_prefix_for_target(kind).format(
+
+    default_kind_name = kind
+    if scheme == "ds":
+        # "dsnosql" is not an actual target like Parquet or Redis; rather, it serves
+        # as a placeholder that can be used in any specified target
+        default_kind_name = "dsnosql"
+    if scheme == "redis" or scheme == "rediss":
+        default_kind_name = TargetTypes.redisnosql
+
+    netloc = netloc or ""
+    data_prefix = get_default_prefix_for_target(default_kind_name).format(
+        ds_profile_name=netloc,  # In case of ds profile, set its the name
+        authority=netloc,  # In case of redis, replace {authority} with netloc
         project=project,
         kind=kind,
         name=name,
     )
+
+    if scheme == "rediss":
+        data_prefix = data_prefix.replace("redis://", "rediss://", 1)
+
     # todo: handle ver tag changes, may need to copy files?
     if not run_id_mode:
         version = resource.metadata.tag
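The extended `_get_target_path` above derives the default data prefix from the target URL's scheme and netloc. A simplified sketch of that selection logic with made-up prefix templates; the real templates live in mlrun.mlconf (via get_default_prefix_for_target) and are not shown in this diff:

```python
# Hypothetical prefix templates standing in for the mlrun.mlconf defaults.
TEMPLATES = {
    "dsnosql": "ds://{ds_profile_name}/projects/{project}/{kind}/{name}",
    "redisnosql": "redis://{authority}/projects/{project}/{kind}/{name}",
    "default": "v3io:///projects/{project}/FeatureStore/{name}/{kind}",
}


def resolve_data_prefix(kind, project, name, netloc=None, scheme=""):
    # Mirrors the scheme handling added in the hunk above.
    default_kind_name = kind
    if scheme == "ds":
        default_kind_name = "dsnosql"
    if scheme in ("redis", "rediss"):
        default_kind_name = "redisnosql"

    netloc = netloc or ""
    template = TEMPLATES.get(default_kind_name, TEMPLATES["default"])
    data_prefix = template.format(
        ds_profile_name=netloc,  # ds profile name for ds:// targets
        authority=netloc,  # redis authority (host:port)
        project=project,
        kind=kind,
        name=name,
    )
    if scheme == "rediss":
        # Only the scheme differs for TLS-enabled Redis.
        data_prefix = data_prefix.replace("redis://", "rediss://", 1)
    return data_prefix
```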