mlrun 1.7.0rc4__py3-none-any.whl → 1.7.2__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- mlrun/__init__.py +11 -1
- mlrun/__main__.py +39 -121
- mlrun/{datastore/helpers.py → alerts/__init__.py} +2 -5
- mlrun/alerts/alert.py +248 -0
- mlrun/api/schemas/__init__.py +4 -3
- mlrun/artifacts/__init__.py +8 -3
- mlrun/artifacts/base.py +39 -254
- mlrun/artifacts/dataset.py +9 -190
- mlrun/artifacts/manager.py +73 -46
- mlrun/artifacts/model.py +30 -158
- mlrun/artifacts/plots.py +23 -380
- mlrun/common/constants.py +73 -1
- mlrun/common/db/sql_session.py +3 -2
- mlrun/common/formatters/__init__.py +21 -0
- mlrun/common/formatters/artifact.py +46 -0
- mlrun/common/formatters/base.py +113 -0
- mlrun/common/formatters/feature_set.py +44 -0
- mlrun/common/formatters/function.py +46 -0
- mlrun/common/formatters/pipeline.py +53 -0
- mlrun/common/formatters/project.py +51 -0
- mlrun/common/formatters/run.py +29 -0
- mlrun/common/helpers.py +11 -1
- mlrun/{runtimes → common/runtimes}/constants.py +32 -4
- mlrun/common/schemas/__init__.py +31 -4
- mlrun/common/schemas/alert.py +202 -0
- mlrun/common/schemas/api_gateway.py +196 -0
- mlrun/common/schemas/artifact.py +28 -1
- mlrun/common/schemas/auth.py +13 -2
- mlrun/common/schemas/client_spec.py +2 -1
- mlrun/common/schemas/common.py +7 -4
- mlrun/common/schemas/constants.py +3 -0
- mlrun/common/schemas/feature_store.py +58 -28
- mlrun/common/schemas/frontend_spec.py +8 -0
- mlrun/common/schemas/function.py +11 -0
- mlrun/common/schemas/hub.py +7 -9
- mlrun/common/schemas/model_monitoring/__init__.py +21 -4
- mlrun/common/schemas/model_monitoring/constants.py +136 -42
- mlrun/common/schemas/model_monitoring/grafana.py +9 -5
- mlrun/common/schemas/model_monitoring/model_endpoints.py +89 -41
- mlrun/common/schemas/notification.py +69 -12
- mlrun/{runtimes/mpijob/v1alpha1.py → common/schemas/pagination.py} +10 -13
- mlrun/common/schemas/pipeline.py +7 -0
- mlrun/common/schemas/project.py +67 -16
- mlrun/common/schemas/runs.py +17 -0
- mlrun/common/schemas/schedule.py +1 -1
- mlrun/common/schemas/workflow.py +10 -2
- mlrun/common/types.py +14 -1
- mlrun/config.py +233 -58
- mlrun/data_types/data_types.py +11 -1
- mlrun/data_types/spark.py +5 -4
- mlrun/data_types/to_pandas.py +75 -34
- mlrun/datastore/__init__.py +8 -10
- mlrun/datastore/alibaba_oss.py +131 -0
- mlrun/datastore/azure_blob.py +131 -43
- mlrun/datastore/base.py +107 -47
- mlrun/datastore/datastore.py +17 -7
- mlrun/datastore/datastore_profile.py +91 -7
- mlrun/datastore/dbfs_store.py +3 -7
- mlrun/datastore/filestore.py +1 -3
- mlrun/datastore/google_cloud_storage.py +92 -32
- mlrun/datastore/hdfs.py +5 -0
- mlrun/datastore/inmem.py +6 -3
- mlrun/datastore/redis.py +3 -2
- mlrun/datastore/s3.py +30 -12
- mlrun/datastore/snowflake_utils.py +45 -0
- mlrun/datastore/sources.py +274 -59
- mlrun/datastore/spark_utils.py +30 -0
- mlrun/datastore/store_resources.py +9 -7
- mlrun/datastore/storeytargets.py +151 -0
- mlrun/datastore/targets.py +387 -119
- mlrun/datastore/utils.py +68 -5
- mlrun/datastore/v3io.py +28 -50
- mlrun/db/auth_utils.py +152 -0
- mlrun/db/base.py +245 -20
- mlrun/db/factory.py +1 -4
- mlrun/db/httpdb.py +909 -231
- mlrun/db/nopdb.py +279 -14
- mlrun/errors.py +35 -5
- mlrun/execution.py +111 -38
- mlrun/feature_store/__init__.py +0 -2
- mlrun/feature_store/api.py +46 -53
- mlrun/feature_store/common.py +6 -11
- mlrun/feature_store/feature_set.py +48 -23
- mlrun/feature_store/feature_vector.py +13 -2
- mlrun/feature_store/ingestion.py +7 -6
- mlrun/feature_store/retrieval/base.py +9 -4
- mlrun/feature_store/retrieval/dask_merger.py +2 -0
- mlrun/feature_store/retrieval/job.py +13 -4
- mlrun/feature_store/retrieval/local_merger.py +2 -0
- mlrun/feature_store/retrieval/spark_merger.py +24 -32
- mlrun/feature_store/steps.py +38 -19
- mlrun/features.py +6 -14
- mlrun/frameworks/_common/plan.py +3 -3
- mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +7 -12
- mlrun/frameworks/_ml_common/plan.py +1 -1
- mlrun/frameworks/auto_mlrun/auto_mlrun.py +2 -2
- mlrun/frameworks/lgbm/__init__.py +1 -1
- mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
- mlrun/frameworks/lgbm/model_handler.py +1 -1
- mlrun/frameworks/parallel_coordinates.py +4 -4
- mlrun/frameworks/pytorch/__init__.py +2 -2
- mlrun/frameworks/sklearn/__init__.py +1 -1
- mlrun/frameworks/sklearn/mlrun_interface.py +13 -3
- mlrun/frameworks/tf_keras/__init__.py +5 -2
- mlrun/frameworks/tf_keras/callbacks/logging_callback.py +1 -1
- mlrun/frameworks/tf_keras/mlrun_interface.py +2 -2
- mlrun/frameworks/xgboost/__init__.py +1 -1
- mlrun/k8s_utils.py +57 -12
- mlrun/launcher/__init__.py +1 -1
- mlrun/launcher/base.py +6 -5
- mlrun/launcher/client.py +13 -11
- mlrun/launcher/factory.py +1 -1
- mlrun/launcher/local.py +15 -5
- mlrun/launcher/remote.py +10 -3
- mlrun/lists.py +6 -2
- mlrun/model.py +297 -48
- mlrun/model_monitoring/__init__.py +1 -1
- mlrun/model_monitoring/api.py +152 -357
- mlrun/model_monitoring/applications/__init__.py +10 -0
- mlrun/model_monitoring/applications/_application_steps.py +190 -0
- mlrun/model_monitoring/applications/base.py +108 -0
- mlrun/model_monitoring/applications/context.py +341 -0
- mlrun/model_monitoring/{evidently_application.py → applications/evidently_base.py} +27 -22
- mlrun/model_monitoring/applications/histogram_data_drift.py +227 -91
- mlrun/model_monitoring/applications/results.py +99 -0
- mlrun/model_monitoring/controller.py +130 -303
- mlrun/model_monitoring/{stores/models/sqlite.py → db/__init__.py} +5 -10
- mlrun/model_monitoring/db/stores/__init__.py +136 -0
- mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
- mlrun/model_monitoring/db/stores/base/store.py +213 -0
- mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
- mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +71 -0
- mlrun/model_monitoring/db/stores/sqldb/models/base.py +190 -0
- mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +103 -0
- mlrun/model_monitoring/{stores/models/mysql.py → db/stores/sqldb/models/sqlite.py} +19 -13
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +659 -0
- mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +726 -0
- mlrun/model_monitoring/db/tsdb/__init__.py +105 -0
- mlrun/model_monitoring/db/tsdb/base.py +448 -0
- mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
- mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +298 -0
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +42 -0
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +522 -0
- mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +158 -0
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +849 -0
- mlrun/model_monitoring/features_drift_table.py +34 -22
- mlrun/model_monitoring/helpers.py +177 -39
- mlrun/model_monitoring/model_endpoint.py +3 -2
- mlrun/model_monitoring/stream_processing.py +165 -398
- mlrun/model_monitoring/tracking_policy.py +7 -1
- mlrun/model_monitoring/writer.py +161 -125
- mlrun/package/packagers/default_packager.py +2 -2
- mlrun/package/packagers_manager.py +1 -0
- mlrun/package/utils/_formatter.py +2 -2
- mlrun/platforms/__init__.py +11 -10
- mlrun/platforms/iguazio.py +67 -228
- mlrun/projects/__init__.py +6 -1
- mlrun/projects/operations.py +47 -20
- mlrun/projects/pipelines.py +396 -249
- mlrun/projects/project.py +1176 -406
- mlrun/render.py +28 -22
- mlrun/run.py +208 -181
- mlrun/runtimes/__init__.py +76 -11
- mlrun/runtimes/base.py +54 -24
- mlrun/runtimes/daskjob.py +9 -2
- mlrun/runtimes/databricks_job/databricks_runtime.py +1 -0
- mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
- mlrun/runtimes/funcdoc.py +1 -29
- mlrun/runtimes/kubejob.py +34 -128
- mlrun/runtimes/local.py +39 -10
- mlrun/runtimes/mpijob/__init__.py +0 -20
- mlrun/runtimes/mpijob/abstract.py +8 -8
- mlrun/runtimes/mpijob/v1.py +1 -1
- mlrun/runtimes/nuclio/__init__.py +1 -0
- mlrun/runtimes/nuclio/api_gateway.py +769 -0
- mlrun/runtimes/nuclio/application/__init__.py +15 -0
- mlrun/runtimes/nuclio/application/application.py +758 -0
- mlrun/runtimes/nuclio/application/reverse_proxy.go +95 -0
- mlrun/runtimes/nuclio/function.py +188 -68
- mlrun/runtimes/nuclio/serving.py +57 -60
- mlrun/runtimes/pod.py +191 -58
- mlrun/runtimes/remotesparkjob.py +11 -8
- mlrun/runtimes/sparkjob/spark3job.py +17 -18
- mlrun/runtimes/utils.py +40 -73
- mlrun/secrets.py +6 -2
- mlrun/serving/__init__.py +8 -1
- mlrun/serving/remote.py +2 -3
- mlrun/serving/routers.py +89 -64
- mlrun/serving/server.py +54 -26
- mlrun/serving/states.py +187 -56
- mlrun/serving/utils.py +19 -11
- mlrun/serving/v2_serving.py +136 -63
- mlrun/track/tracker.py +2 -1
- mlrun/track/trackers/mlflow_tracker.py +5 -0
- mlrun/utils/async_http.py +26 -6
- mlrun/utils/db.py +18 -0
- mlrun/utils/helpers.py +375 -105
- mlrun/utils/http.py +2 -2
- mlrun/utils/logger.py +75 -9
- mlrun/utils/notifications/notification/__init__.py +14 -10
- mlrun/utils/notifications/notification/base.py +48 -0
- mlrun/utils/notifications/notification/console.py +2 -0
- mlrun/utils/notifications/notification/git.py +24 -1
- mlrun/utils/notifications/notification/ipython.py +2 -0
- mlrun/utils/notifications/notification/slack.py +96 -21
- mlrun/utils/notifications/notification/webhook.py +63 -2
- mlrun/utils/notifications/notification_pusher.py +146 -16
- mlrun/utils/regex.py +9 -0
- mlrun/utils/retryer.py +3 -2
- mlrun/utils/v3io_clients.py +2 -3
- mlrun/utils/version/version.json +2 -2
- mlrun-1.7.2.dist-info/METADATA +390 -0
- mlrun-1.7.2.dist-info/RECORD +351 -0
- {mlrun-1.7.0rc4.dist-info → mlrun-1.7.2.dist-info}/WHEEL +1 -1
- mlrun/feature_store/retrieval/conversion.py +0 -271
- mlrun/kfpops.py +0 -868
- mlrun/model_monitoring/application.py +0 -310
- mlrun/model_monitoring/batch.py +0 -974
- mlrun/model_monitoring/controller_handler.py +0 -37
- mlrun/model_monitoring/prometheus.py +0 -216
- mlrun/model_monitoring/stores/__init__.py +0 -111
- mlrun/model_monitoring/stores/kv_model_endpoint_store.py +0 -574
- mlrun/model_monitoring/stores/model_endpoint_store.py +0 -145
- mlrun/model_monitoring/stores/models/__init__.py +0 -27
- mlrun/model_monitoring/stores/models/base.py +0 -84
- mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -382
- mlrun/platforms/other.py +0 -305
- mlrun-1.7.0rc4.dist-info/METADATA +0 -269
- mlrun-1.7.0rc4.dist-info/RECORD +0 -321
- {mlrun-1.7.0rc4.dist-info → mlrun-1.7.2.dist-info}/LICENSE +0 -0
- {mlrun-1.7.0rc4.dist-info → mlrun-1.7.2.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.0rc4.dist-info → mlrun-1.7.2.dist-info}/top_level.txt +0 -0
mlrun/datastore/targets.py
CHANGED
@@ -17,6 +17,7 @@ import os
 import random
 import sys
 import time
+import warnings
 from collections import Counter
 from copy import copy
 from typing import Any, Optional, Union
@@ -28,6 +29,11 @@ from mergedeep import merge
 import mlrun
 import mlrun.utils.helpers
 from mlrun.config import config
+from mlrun.datastore.snowflake_utils import (
+    get_snowflake_password,
+    get_snowflake_spark_options,
+)
+from mlrun.datastore.utils import transform_list_filters_to_tuple
 from mlrun.model import DataSource, DataTarget, DataTargetBase, TargetPathObject
 from mlrun.utils import logger, now_date
 from mlrun.utils.helpers import to_parquet
@@ -41,7 +47,6 @@ from .spark_utils import spark_session_update_hadoop_options
 from .utils import (
     _generate_sql_query_with_time_filter,
     filter_df_start_end_time,
-    parse_kafka_url,
     select_columns_from_df,
 )
 
@@ -57,6 +62,7 @@ class TargetTypes:
     dataframe = "dataframe"
     custom = "custom"
     sql = "sql"
+    snowflake = "snowflake"
 
     @staticmethod
     def all():
@@ -71,6 +77,7 @@ class TargetTypes:
             TargetTypes.dataframe,
             TargetTypes.custom,
             TargetTypes.sql,
+            TargetTypes.snowflake,
         ]
 
 
@@ -78,11 +85,14 @@ def generate_target_run_id():
     return f"{round(time.time() * 1000)}_{random.randint(0, 999)}"
 
 
-def write_spark_dataframe_with_options(spark_options, df, mode):
+def write_spark_dataframe_with_options(spark_options, df, mode, write_format=None):
     non_hadoop_spark_options = spark_session_update_hadoop_options(
         df.sql_ctx.sparkSession, spark_options
     )
-    df.write.mode(mode).save(**non_hadoop_spark_options)
+    if write_format:
+        df.write.format(write_format).mode(mode).save(**non_hadoop_spark_options)
+    else:
+        df.write.mode(mode).save(**non_hadoop_spark_options)
 
 
 def default_target_names():
@@ -379,6 +389,7 @@ class BaseStoreTarget(DataTargetBase):
     is_offline = False
     support_spark = False
     support_storey = False
+    support_pandas = False
     support_append = False
 
     def __init__(
@@ -428,6 +439,12 @@ class BaseStoreTarget(DataTargetBase):
         self.storage_options = storage_options
         self.schema = schema or {}
         self.credentials_prefix = credentials_prefix
+        if credentials_prefix:
+            warnings.warn(
+                "The 'credentials_prefix' parameter is deprecated and will be removed in "
+                "1.9.0. Please use datastore profiles instead.",
+                FutureWarning,
+            )
 
         self._target = None
         self._resource = None
@@ -451,7 +468,7 @@ class BaseStoreTarget(DataTargetBase):
             self.get_target_path(),
             credentials_prefix_secrets,
         )
-        return store, url
+        return store, resolved_store_path, url
 
     def _get_column_list(self, features, timestamp_key, key_columns, with_type=False):
         result = []
@@ -497,10 +514,13 @@ class BaseStoreTarget(DataTargetBase):
             options = self.get_spark_options(key_column, timestamp_key)
             options.update(kwargs)
             df = self.prepare_spark_df(df, key_column, timestamp_key, options)
-            write_spark_dataframe_with_options(options, df, "overwrite")
+            write_format = options.pop("format", None)
+            write_spark_dataframe_with_options(
+                options, df, "overwrite", write_format=write_format
+            )
         elif hasattr(df, "dask"):
             dask_options = self.get_dask_options()
-            store, target_path = self._get_store_and_path()
+            store, path_in_store, target_path = self._get_store_and_path()
             storage_options = store.get_storage_options()
             df = df.repartition(partition_size="100MB")
             try:
@@ -521,18 +541,21 @@ class BaseStoreTarget(DataTargetBase):
             except Exception as exc:
                 raise RuntimeError("Failed to write Dask Dataframe") from exc
         else:
-            store, target_path = self._get_store_and_path()
+            store, path_in_store, target_path = self._get_store_and_path()
            target_path = generate_path_with_chunk(self, chunk_id, target_path)
            file_system = store.filesystem
-            if file_system.protocol == "file":
+            if (
+                file_system.protocol == "file"
+                # fsspec 2023.10.0 changed protocol from "file" to ("file", "local")
+                or isinstance(file_system.protocol, (tuple, list))
+                and "file" in file_system.protocol
+            ):
                 dir = os.path.dirname(target_path)
                 if dir:
                     os.makedirs(dir, exist_ok=True)
             target_df = df
             partition_cols = None  # single parquet file
-            if not target_path.endswith(".parquet") and not target_path.endswith(
-                ".pq"
-            ):  # directory
+            if not mlrun.utils.helpers.is_parquet_file(target_path):  # directory
                 partition_cols = []
                 if timestamp_key and (
                     self.partitioned or self.time_partitioning_granularity
@@ -641,6 +664,29 @@ class BaseStoreTarget(DataTargetBase):
     def _target_path_object(self):
         """return the actual/computed target path"""
         is_single_file = hasattr(self, "is_single_file") and self.is_single_file()
+
+        if self._resource and self.path:
+            parsed_url = urlparse(self.path)
+            # When the URL consists only from scheme and endpoint and no path,
+            # make a default path for DS and redis targets.
+            # Also ignore KafkaTarget when it uses the ds scheme (no default path for KafkaTarget)
+            if (
+                not isinstance(self, KafkaTarget)
+                and parsed_url.scheme in ["ds", "redis", "rediss"]
+                and (not parsed_url.path or parsed_url.path == "/")
+            ):
+                return TargetPathObject(
+                    _get_target_path(
+                        self,
+                        self._resource,
+                        self.run_id is not None,
+                        netloc=parsed_url.netloc,
+                        scheme=parsed_url.scheme,
+                    ),
+                    self.run_id,
+                    is_single_file,
+                )
+
         return self.get_path() or (
             TargetPathObject(
                 _get_target_path(self, self._resource, self.run_id is not None),
@@ -657,6 +703,7 @@ class BaseStoreTarget(DataTargetBase):
             self.kind, self.name, self.get_target_templated_path()
         )
         target = self._target
+        target.attributes = self.attributes
         target.run_id = self.run_id
         target.status = status or target.status or "created"
         target.updated = now_date().isoformat()
@@ -685,11 +732,25 @@ class BaseStoreTarget(DataTargetBase):
         timestamp_key=None,
         featureset_status=None,
     ):
+        if not self.support_storey:
+            raise mlrun.errors.MLRunRuntimeError(
+                f"{type(self).__name__} does not support storey engine"
+            )
         raise NotImplementedError()
 
     def purge(self):
-        store, target_path = self._get_store_and_path()
-        store.rm(target_path, recursive=True)
+        """
+        Delete the files of the target.
+
+        Do not use this function directly from the sdk. Use FeatureSet.purge_targets.
+        """
+        store, path_in_store, target_path = self._get_store_and_path()
+        if path_in_store not in ["", "/"]:
+            store.rm(path_in_store, recursive=True)
+        else:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "Unable to delete target. Please Use purge_targets from FeatureSet object."
+            )
 
     def as_df(
         self,
@@ -699,9 +760,15 @@ class BaseStoreTarget(DataTargetBase):
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
         **kwargs,
     ):
         """return the target data as dataframe"""
+        if not self.support_pandas:
+            raise NotImplementedError()
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
         return mlrun.get_dataitem(self.get_target_path()).as_df(
             columns=columns,
             df_module=df_module,
@@ -713,14 +780,22 @@ class BaseStoreTarget(DataTargetBase):
 
     def get_spark_options(self, key_column=None, timestamp_key=None, overwrite=True):
         # options used in spark.read.load(**options)
+        if not self.support_spark:
+            raise mlrun.errors.MLRunRuntimeError(
+                f"{type(self).__name__} does not support spark engine"
+            )
         raise NotImplementedError()
 
-    def prepare_spark_df(self, df, key_columns, timestamp_key=None, spark_options={}):
+    def prepare_spark_df(self, df, key_columns, timestamp_key=None, spark_options=None):
         return df
 
     def get_dask_options(self):
         raise NotImplementedError()
 
+    @property
+    def source_spark_attributes(self) -> dict:
+        return {}
+
 
 class ParquetTarget(BaseStoreTarget):
     """Parquet target storage driver, used to materialize feature set/vector data into parquet files.
@@ -752,6 +827,7 @@ class ParquetTarget(BaseStoreTarget):
     support_spark = True
     support_storey = True
     support_dask = True
+    support_pandas = True
     support_append = True
 
     def __init__(
@@ -857,10 +933,9 @@ class ParquetTarget(BaseStoreTarget):
                 if time_unit == time_partitioning_granularity:
                     break
 
-        if (
-            not self.partitioned
-            and not self.get_target_path().endswith(".parquet")
-            and not self.get_target_path().endswith(".pq")
+        target_path = self.get_target_path()
+        if not self.partitioned and not mlrun.utils.helpers.is_parquet_file(
+            target_path
         ):
             partition_cols = []
 
@@ -868,25 +943,16 @@ class ParquetTarget(BaseStoreTarget):
         for key_column in key_columns:
             tuple_key_columns.append((key_column.name, key_column.value_type))
 
-        store, target_path = self._get_store_and_path()
-
-        storage_options = store.get_storage_options()
-        if storage_options and self.storage_options:
-            storage_options = merge(storage_options, self.storage_options)
-        else:
-            storage_options = storage_options or self.storage_options
-
         step = graph.add_step(
             name=self.name or "ParquetTarget",
             after=after,
             graph_shape="cylinder",
-            class_name="storey.ParquetTarget",
+            class_name="mlrun.datastore.storeytargets.ParquetStoreyTarget",
             path=target_path,
             columns=column_list,
             index_cols=tuple_key_columns,
             partition_cols=partition_cols,
             time_field=timestamp_key,
-            storage_options=storage_options,
             max_events=self.max_events,
             flush_after_seconds=self.flush_after_seconds,
             update_last_written=featureset_status.update_last_written_for_target,
@@ -921,9 +987,7 @@ class ParquetTarget(BaseStoreTarget):
             if unit == time_partitioning_granularity:
                 break
 
-        store, path, url = mlrun.store_manager.get_or_create_store(
-            self.get_target_path()
-        )
+        store, path, url = self._get_store_and_path()
         spark_options = store.get_spark_options()
         spark_options.update(
             {
@@ -948,6 +1012,7 @@ class ParquetTarget(BaseStoreTarget):
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
         **kwargs,
     ):
         """return the target data as dataframe"""
@@ -958,6 +1023,7 @@ class ParquetTarget(BaseStoreTarget):
             start_time=start_time,
             end_time=end_time,
             time_column=time_column,
+            additional_filters=transform_list_filters_to_tuple(additional_filters),
             **kwargs,
         )
         if not columns:
@@ -979,9 +1045,7 @@ class ParquetTarget(BaseStoreTarget):
         return result
 
     def is_single_file(self):
-        if self.path:
-            return self.path.endswith(".parquet") or self.path.endswith(".pq")
-        return False
+        return mlrun.utils.helpers.is_parquet_file(self.path)
 
     def prepare_spark_df(self, df, key_columns, timestamp_key=None, spark_options=None):
         # If partitioning by time, add the necessary columns
@@ -1021,6 +1085,7 @@ class CSVTarget(BaseStoreTarget):
     is_offline = True
     support_spark = True
     support_storey = True
+    support_pandas = True
 
     @staticmethod
     def _write_dataframe(df, storage_options, target_path, partition_cols, **kwargs):
@@ -1042,24 +1107,21 @@ class CSVTarget(BaseStoreTarget):
         column_list = self._get_column_list(
             features=features, timestamp_key=timestamp_key, key_columns=key_columns
         )
-        store, target_path = self._get_store_and_path()
+        target_path = self.get_target_path()
         graph.add_step(
             name=self.name or "CSVTarget",
             after=after,
             graph_shape="cylinder",
-            class_name="storey.CSVTarget",
+            class_name="mlrun.datastore.storeytargets.CSVStoreyTarget",
             path=target_path,
             columns=column_list,
             header=True,
             index_cols=key_columns,
-            storage_options=store.get_storage_options(),
             **self.attributes,
         )
 
     def get_spark_options(self, key_column=None, timestamp_key=None, overwrite=True):
-        store, path, url = mlrun.store_manager.get_or_create_store(
-            self.get_target_path()
-        )
+        store, path, url = self._get_store_and_path()
         spark_options = store.get_spark_options()
         spark_options.update(
             {
@@ -1074,7 +1136,8 @@ class CSVTarget(BaseStoreTarget):
         import pyspark.sql.functions as funcs
 
         for col_name, col_type in df.dtypes:
-            if col_type == "timestamp":
+            # covers TimestampType and TimestampNTZType, which was added in PySpark 3.4.0
+            if col_type.startswith("timestamp"):
                 # df.write.csv saves timestamps with millisecond precision, but we want microsecond precision
                 # for compatibility with storey.
                 df = df.withColumn(
@@ -1090,8 +1153,12 @@ class CSVTarget(BaseStoreTarget):
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
         **kwargs,
     ):
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
         df = super().as_df(
             columns=columns,
             df_module=df_module,
@@ -1112,6 +1179,134 @@ class CSVTarget(BaseStoreTarget):
         return True
 
 
+class SnowflakeTarget(BaseStoreTarget):
+    """
+    :param attributes: A dictionary of attributes for Snowflake connection; will be overridden by database parameters
+        if they exist.
+    :param url: Snowflake hostname, in the format: <account_name>.<region>.snowflakecomputing.com
+    :param user: Snowflake user for login
+    :param db_schema: Database schema
+    :param database: Database name
+    :param warehouse: Snowflake warehouse name
+    :param table_name: Snowflake table name
+    """
+
+    support_spark = True
+    support_append = True
+    is_offline = True
+    kind = TargetTypes.snowflake
+
+    def __init__(
+        self,
+        name: str = "",
+        path=None,
+        attributes: dict[str, str] = None,
+        after_step=None,
+        columns=None,
+        partitioned: bool = False,
+        key_bucketing_number: Optional[int] = None,
+        partition_cols: Optional[list[str]] = None,
+        time_partitioning_granularity: Optional[str] = None,
+        max_events: Optional[int] = None,
+        flush_after_seconds: Optional[int] = None,
+        storage_options: dict[str, str] = None,
+        schema: dict[str, Any] = None,
+        credentials_prefix=None,
+        url: str = None,
+        user: str = None,
+        db_schema: str = None,
+        database: str = None,
+        warehouse: str = None,
+        table_name: str = None,
+    ):
+        attributes = attributes or {}
+        if url:
+            attributes["url"] = url
+        if user:
+            attributes["user"] = user
+        if database:
+            attributes["database"] = database
+        if db_schema:
+            attributes["db_schema"] = db_schema
+        if warehouse:
+            attributes["warehouse"] = warehouse
+        if table_name:
+            attributes["table"] = table_name
+
+        super().__init__(
+            name,
+            path,
+            attributes,
+            after_step,
+            list(schema.keys()) if schema else columns,
+            partitioned,
+            key_bucketing_number,
+            partition_cols,
+            time_partitioning_granularity,
+            max_events=max_events,
+            flush_after_seconds=flush_after_seconds,
+            storage_options=storage_options,
+            schema=schema,
+            credentials_prefix=credentials_prefix,
+        )
+
+    def get_spark_options(self, key_column=None, timestamp_key=None, overwrite=True):
+        spark_options = get_snowflake_spark_options(self.attributes)
+        spark_options["dbtable"] = self.attributes.get("table")
+        return spark_options
+
+    def purge(self):
+        import snowflake.connector
+
+        missing = [
+            key
+            for key in ["database", "db_schema", "table", "url", "user", "warehouse"]
+            if self.attributes.get(key) is None
+        ]
+        if missing:
+            raise mlrun.errors.MLRunRuntimeError(
+                f"Can't purge Snowflake target, "
+                f"some attributes are missing: {', '.join(missing)}"
+            )
+        account = self.attributes["url"].replace(".snowflakecomputing.com", "")
+
+        with snowflake.connector.connect(
+            account=account,
+            user=self.attributes["user"],
+            password=get_snowflake_password(),
+            warehouse=self.attributes["warehouse"],
+        ) as snowflake_connector:
+            drop_statement = (
+                f"DROP TABLE IF EXISTS {self.attributes['database']}.{self.attributes['db_schema']}"
+                f".{self.attributes['table']}"
+            )
+            snowflake_connector.execute_string(drop_statement)
+
+    def as_df(
+        self,
+        columns=None,
+        df_module=None,
+        entities=None,
+        start_time=None,
+        end_time=None,
+        time_column=None,
+        additional_filters=None,
+        **kwargs,
+    ):
+        raise mlrun.errors.MLRunRuntimeError(
+            f"{type(self).__name__} does not support pandas engine"
+        )
+
+    @property
+    def source_spark_attributes(self) -> dict:
+        keys = ["url", "user", "database", "db_schema", "warehouse"]
+        attributes = self.attributes or {}
+        snowflake_dict = {key: attributes.get(key) for key in keys}
+        table = attributes.get("table")
+        snowflake_dict["query"] = f"SELECT * from {table}" if table else None
+        return snowflake_dict
+
+
 class NoSqlBaseTarget(BaseStoreTarget):
     is_table = True
     is_online = True
@@ -1136,6 +1331,19 @@ class NoSqlBaseTarget(BaseStoreTarget):
         timestamp_key=None,
         featureset_status=None,
     ):
+        table, column_list = self._get_table_and_columns(features, key_columns)
+
+        graph.add_step(
+            name=self.name or self.writer_step_name,
+            after=after,
+            graph_shape="cylinder",
+            class_name="mlrun.datastore.storeytargets.NoSqlStoreyTarget",
+            columns=column_list,
+            table=table,
+            **self.attributes,
+        )
+
+    def _get_table_and_columns(self, features, key_columns):
         key_columns = list(key_columns.keys())
         table = self._resource.uri
         column_list = self._get_column_list(
@@ -1154,15 +1362,7 @@ class NoSqlBaseTarget(BaseStoreTarget):
             col for col in column_list if col[0] not in aggregate_features
         ]
 
-        graph.add_step(
-            name=self.name or self.writer_step_name,
-            after=after,
-            graph_shape="cylinder",
-            class_name="storey.NoSqlTarget",
-            columns=column_list,
-            table=table,
-            **self.attributes,
-        )
+        return table, column_list
 
     def prepare_spark_df(self, df, key_columns, timestamp_key=None, spark_options=None):
         raise NotImplementedError()
@@ -1173,9 +1373,6 @@ class NoSqlBaseTarget(BaseStoreTarget):
     def get_dask_options(self):
         return {"format": "csv"}
 
-    def as_df(self, columns=None, df_module=None, **kwargs):
-        raise NotImplementedError()
-
     def write_dataframe(
         self, df, key_column=None, timestamp_key=None, chunk_id=0, **kwargs
     ):
@@ -1183,7 +1380,10 @@ class NoSqlBaseTarget(BaseStoreTarget):
             options = self.get_spark_options(key_column, timestamp_key)
             options.update(kwargs)
             df = self.prepare_spark_df(df)
-            write_spark_dataframe_with_options(options, df, "overwrite")
+            write_format = options.pop("format", None)
+            write_spark_dataframe_with_options(
+                options, df, "overwrite", write_format=write_format
+            )
         else:
             # To prevent modification of the original dataframe and make sure
             # that the last event of a key is the one being persisted
@@ -1193,7 +1393,7 @@ class NoSqlBaseTarget(BaseStoreTarget):
             df = df.copy(deep=False)
             access_key = self._get_credential("V3IO_ACCESS_KEY")
 
-            store, target_path = self._get_store_and_path()
+            store, path_in_store, target_path = self._get_store_and_path()
             storage_options = store.get_storage_options()
             access_key = storage_options.get("v3io_access_key", access_key)
 
@@ -1215,7 +1415,7 @@ class NoSqlTarget(NoSqlBaseTarget):
    def get_table_object(self):
        from storey import Table, V3ioDriver
 
-        store, target_path = self._get_store_and_path()
+        store, path_in_store, target_path = self._get_store_and_path()
        endpoint, uri = parse_path(target_path)
        storage_options = store.get_storage_options()
        access_key = storage_options.get("v3io_access_key")
@@ -1227,7 +1427,7 @@ class NoSqlTarget(NoSqlBaseTarget):
         )
 
     def get_spark_options(self, key_column=None, timestamp_key=None, overwrite=True):
-        store, target_path = self._get_store_and_path()
+        store, path_in_store, target_path = self._get_store_and_path()
         storage_options = store.get_storage_options()
         store_access_key = storage_options.get("v3io_access_key")
         env_access_key = self._secrets.get(
@@ -1239,7 +1439,7 @@ class NoSqlTarget(NoSqlBaseTarget):
                 "Spark will disregard the store-provided key."
             )
         spark_options = {
-            "path": store.spark_url +
+            "path": store.spark_url + path_in_store,
             "format": "io.iguaz.v3io.spark.sql.kv",
         }
         if isinstance(key_column, list) and len(key_column) >= 1:
@@ -1285,11 +1485,9 @@ class RedisNoSqlTarget(NoSqlBaseTarget):
     support_spark = True
     writer_step_name = "RedisNoSqlTarget"
 
-
-
-
-    def _get_server_endpoint(self):
-        endpoint, uri = parse_path(self.get_target_path())
+    @staticmethod
+    def get_server_endpoint(path, credentials_prefix=None):
+        endpoint, uri = parse_path(path)
         endpoint = endpoint or mlrun.mlconf.redis.url
         if endpoint.startswith("ds://"):
             datastore_profile = datastore_profile_read(endpoint)
@@ -1306,8 +1504,15 @@ class RedisNoSqlTarget(NoSqlBaseTarget):
             raise mlrun.errors.MLRunInvalidArgumentError(
                 "Provide Redis username and password only via secrets"
             )
-
-
+        credentials_prefix = credentials_prefix or mlrun.get_secret_or_env(
+            key="CREDENTIALS_PREFIX"
+        )
+        user = mlrun.get_secret_or_env(
+            "REDIS_USER", default="", prefix=credentials_prefix
+        )
+        password = mlrun.get_secret_or_env(
+            "REDIS_PASSWORD", default="", prefix=credentials_prefix
+        )
         host = parsed_endpoint.hostname
         port = parsed_endpoint.port if parsed_endpoint.port else "6379"
         scheme = parsed_endpoint.scheme
@@ -1321,7 +1526,9 @@ class RedisNoSqlTarget(NoSqlBaseTarget):
         from storey import Table
         from storey.redis_driver import RedisDriver
 
-        endpoint, uri = self._get_server_endpoint()
+        endpoint, uri = self.get_server_endpoint(
+            self.get_target_path(), self.credentials_prefix
+        )
 
         return Table(
             uri,
@@ -1330,12 +1537,14 @@ class RedisNoSqlTarget(NoSqlBaseTarget):
         )
 
     def get_spark_options(self, key_column=None, timestamp_key=None, overwrite=True):
-        endpoint, uri = self._get_server_endpoint()
+        endpoint, uri = self.get_server_endpoint(
+            self.get_target_path(), self.credentials_prefix
+        )
         parsed_endpoint = urlparse(endpoint)
-        store, path = self._get_store_and_path()
+        store, path_in_store, path = self._get_store_and_path()
         return {
             "key.column": "_spark_object_name",
-            "table": "{" +
+            "table": "{" + path_in_store,
             "format": "org.apache.spark.sql.redis",
             "host": parsed_endpoint.hostname,
             "port": parsed_endpoint.port,
@@ -1362,6 +1571,29 @@ class RedisNoSqlTarget(NoSqlBaseTarget):
 
         return df
 
+    def add_writer_step(
+        self,
+        graph,
+        after,
+        features,
+        key_columns=None,
+        timestamp_key=None,
+        featureset_status=None,
+    ):
+        table, column_list = self._get_table_and_columns(features, key_columns)
+
+        graph.add_step(
+            path=self.get_target_path(),
+            name=self.name or self.writer_step_name,
+            after=after,
+            graph_shape="cylinder",
+            class_name="mlrun.datastore.storeytargets.RedisNoSqlStoreyTarget",
+            columns=column_list,
+            table=table,
+            credentials_prefix=self.credentials_prefix,
+            **self.attributes,
+        )
+
 
 class StreamTarget(BaseStoreTarget):
     kind = TargetTypes.stream
@@ -1380,37 +1612,46 @@ class StreamTarget(BaseStoreTarget):
         timestamp_key=None,
         featureset_status=None,
     ):
-        from storey import V3ioDriver
-
         key_columns = list(key_columns.keys())
-        store, path = self._get_store_and_path()
-        if not path:
-            raise mlrun.errors.MLRunInvalidArgumentError("StreamTarget requires a path")
-        endpoint, uri = parse_path(path)
-        storage_options = store.get_storage_options()
-        access_key = storage_options.get("v3io_access_key")
+
         column_list = self._get_column_list(
             features=features, timestamp_key=timestamp_key, key_columns=key_columns
         )
+        stream_path = self.get_target_path()
+        if not stream_path:
+            raise mlrun.errors.MLRunInvalidArgumentError("StreamTarget requires a path")
 
         graph.add_step(
             name=self.name or "StreamTarget",
             after=after,
             graph_shape="cylinder",
-            class_name="storey.StreamTarget",
+            class_name="mlrun.datastore.storeytargets.StreamStoreyTarget",
             columns=column_list,
-            storage=V3ioDriver(
-                webapi=endpoint or mlrun.mlconf.v3io_api, access_key=access_key
-            ),
-            stream_path=uri,
+            stream_path=stream_path,
             **self.attributes,
         )
 
-    def as_df(self, columns=None, df_module=None, **kwargs):
-        raise NotImplementedError()
-
 
 class KafkaTarget(BaseStoreTarget):
+    """
+    Kafka target storage driver, used to write data into kafka topics.
+    example::
+        # define target
+        kafka_target = KafkaTarget(
+            name="kafka", path="my_topic", brokers="localhost:9092"
+        )
+        # ingest
+        stocks_set.ingest(stocks, [kafka_target])
+    :param name: target name
+    :param path: topic name e.g. "my_topic"
+    :param after_step: optional, after what step in the graph to add the target
+    :param columns: optional, which columns from data to write
+    :param bootstrap_servers: Deprecated. Use the brokers parameter instead
+    :param producer_options: additional configurations for kafka producer
+    :param brokers: kafka broker as represented by a host:port pair, or a list of kafka brokers, e.g.
+        "localhost:9092", or ["kafka-broker-1:9092", "kafka-broker-2:9092"]
+    """
+
     kind = TargetTypes.kafka
     is_table = False
     is_online = False
@@ -1423,11 +1664,27 @@ class KafkaTarget(BaseStoreTarget):
         *args,
         bootstrap_servers=None,
         producer_options=None,
+        brokers=None,
         **kwargs,
     ):
         attrs = {}
-        if bootstrap_servers is not None:
-            attrs["bootstrap_servers"] = bootstrap_servers
+
+        # TODO: Remove this in 1.9.0
+        if bootstrap_servers:
+            if brokers:
+                raise mlrun.errors.MLRunInvalidArgumentError(
+                    "KafkaTarget cannot be created with both the 'brokers' parameter and the deprecated "
+                    "'bootstrap_servers' parameter. Please use 'brokers' only."
+                )
+            warnings.warn(
+                "'bootstrap_servers' parameter is deprecated in 1.7.0 and will be removed in 1.9.0, "
+                "use 'brokers' instead.",
+                FutureWarning,
+            )
+            brokers = bootstrap_servers
+
+        if brokers:
+            attrs["brokers"] = brokers
         if producer_options is not None:
             attrs["producer_options"] = producer_options
 
@@ -1446,37 +1703,21 @@ class KafkaTarget(BaseStoreTarget):
         column_list = self._get_column_list(
             features=features, timestamp_key=timestamp_key, key_columns=key_columns
         )
-        if self.path and self.path.startswith("ds://"):
-            datastore_profile = datastore_profile_read(self.path)
-            attributes = datastore_profile.attributes()
-            bootstrap_servers = attributes.pop("bootstrap_servers", None)
-            topic = datastore_profile.topic
-        else:
-            attributes = copy(self.attributes)
-            bootstrap_servers = attributes.pop("bootstrap_servers", None)
-            topic, bootstrap_servers = parse_kafka_url(
-                self.get_target_path(), bootstrap_servers
-            )
+        path = self.get_target_path()
 
-        if not topic:
-            raise mlrun.errors.MLRunInvalidArgumentError(
-                "KafkaTarget requires a path (topic)"
-            )
+        if not path:
+            raise mlrun.errors.MLRunInvalidArgumentError("KafkaTarget requires a path")
 
         graph.add_step(
             name=self.name or "KafkaTarget",
             after=after,
             graph_shape="cylinder",
-            class_name="storey.KafkaTarget",
+            class_name="mlrun.datastore.storeytargets.KafkaStoreyTarget",
             columns=column_list,
-            topic=topic,
-            bootstrap_servers=bootstrap_servers,
-            **attributes,
+            path=path,
+            attributes=self.attributes,
         )
 
-    def as_df(self, columns=None, df_module=None, **kwargs):
-        raise NotImplementedError()
-
     def purge(self):
         pass
 
@@ -1511,7 +1752,7 @@ class TSDBTarget(BaseStoreTarget):
 
         graph.add_step(
             name=self.name or "TSDBTarget",
-            class_name="storey.TSDBTarget",
+            class_name="mlrun.datastore.storeytargets.TSDBStoreyTarget",
             after=after,
             graph_shape="cylinder",
             path=uri,
@@ -1521,9 +1762,6 @@ class TSDBTarget(BaseStoreTarget):
             **self.attributes,
         )
 
-    def as_df(self, columns=None, df_module=None, **kwargs):
-        raise NotImplementedError()
-
     def write_dataframe(
         self, df, key_column=None, timestamp_key=None, chunk_id=0, **kwargs
     ):
@@ -1537,7 +1775,7 @@ class TSDBTarget(BaseStoreTarget):
             key_column = [key_column]
         new_index.extend(key_column)
 
-        store, target_path = self._get_store_and_path()
+        store, path_in_store, target_path = self._get_store_and_path()
         storage_options = store.get_storage_options()
         access_key = storage_options.get("v3io_access_key", access_key)
 
@@ -1561,6 +1799,7 @@ class CustomTarget(BaseStoreTarget):
     is_online = False
     support_spark = False
     support_storey = True
+    support_pandas = True
 
     def __init__(
         self,
@@ -1596,6 +1835,7 @@ class CustomTarget(BaseStoreTarget):
 class DFTarget(BaseStoreTarget):
     kind = TargetTypes.dataframe
     support_storey = True
+    support_pandas = True
 
     def __init__(self, *args, name="dataframe", **kwargs):
         self._df = None
@@ -1632,11 +1872,16 @@ class DFTarget(BaseStoreTarget):
         self,
         columns=None,
         df_module=None,
+        entities=None,
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
         **kwargs,
     ):
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
         return select_columns_from_df(
             filter_df_start_end_time(
                 self._df,
@@ -1653,6 +1898,7 @@ class SQLTarget(BaseStoreTarget):
     is_online = True
     support_spark = False
     support_storey = True
+    support_pandas = True
 
     def __init__(
         self,
@@ -1795,7 +2041,7 @@ class SQLTarget(BaseStoreTarget):
             name=self.name or "SqlTarget",
             after=after,
             graph_shape="cylinder",
-            class_name="storey.NoSqlTarget",
+            class_name="mlrun.datastore.storeytargets.NoSqlStoreyTarget",
             columns=column_list,
             header=True,
             table=table,
@@ -1811,6 +2057,7 @@ class SQLTarget(BaseStoreTarget):
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
         **kwargs,
     ):
        try:
@@ -1819,6 +2066,10 @@ class SQLTarget(BaseStoreTarget):
         except (ModuleNotFoundError, ImportError) as exc:
             self._raise_sqlalchemy_import_error(exc)
 
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
+
         db_path, table_name, _, _, _, _ = self._parse_url()
         engine = sqlalchemy.create_engine(db_path)
         parse_dates: Optional[list[str]] = self.attributes.get("parse_dates")
@@ -1908,7 +2159,7 @@ class SQLTarget(BaseStoreTarget):
             raise ValueError(f"Table named {table_name} is not exist")
 
         elif not table_exists and create_table:
-
+            type_to_sql_type = {
                 int: sqlalchemy.Integer,
                 str: sqlalchemy.String(self.attributes.get("varchar_len")),
                 datetime.datetime: sqlalchemy.dialects.mysql.DATETIME(fsp=6),
@@ -1921,7 +2172,7 @@ class SQLTarget(BaseStoreTarget):
             # creat new table with the given name
             columns = []
             for col, col_type in self.schema.items():
-                col_type_sql =
+                col_type_sql = type_to_sql_type.get(col_type)
                 if col_type_sql is None:
                     raise TypeError(
                         f"'{col_type}' unsupported type for column '{col}'"
@@ -1961,10 +2212,11 @@ kind_to_driver = {
     TargetTypes.tsdb: TSDBTarget,
     TargetTypes.custom: CustomTarget,
     TargetTypes.sql: SQLTarget,
+    TargetTypes.snowflake: SnowflakeTarget,
 }
 
 
-def _get_target_path(driver, resource, run_id_mode=False):
+def _get_target_path(driver, resource, run_id_mode=False, netloc=None, scheme=""):
     """return the default target path given the resource and target kind"""
     kind = driver.kind
     suffix = driver.suffix
@@ -1981,11 +2233,27 @@ def _get_target_path(driver, resource, run_id_mode=False):
         )
     name = resource.metadata.name
    project = resource.metadata.project or mlrun.mlconf.default_project
-    data_prefix = get_default_prefix_for_target(kind).format(
+
+    default_kind_name = kind
+    if scheme == "ds":
+        # "dsnosql" is not an actual target like Parquet or Redis; rather, it serves
+        # as a placeholder that can be used in any specified target
+        default_kind_name = "dsnosql"
+    if scheme == "redis" or scheme == "rediss":
+        default_kind_name = TargetTypes.redisnosql
+
+    netloc = netloc or ""
+    data_prefix = get_default_prefix_for_target(default_kind_name).format(
+        ds_profile_name=netloc,  # In case of ds profile, set its the name
+        authority=netloc,  # In case of redis, replace {authority} with netloc
         project=project,
         kind=kind,
         name=name,
     )
+
+    if scheme == "rediss":
+        data_prefix = data_prefix.replace("redis://", "rediss://", 1)
+
     # todo: handle ver tag changes, may need to copy files?
     if not run_id_mode:
         version = resource.metadata.tag