mlrun 1.7.0rc5__py3-none-any.whl → 1.7.2__py3-none-any.whl
This diff compares publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in their public registries.
Potentially problematic release.
- mlrun/__init__.py +11 -1
- mlrun/__main__.py +39 -121
- mlrun/{datastore/helpers.py → alerts/__init__.py} +2 -5
- mlrun/alerts/alert.py +248 -0
- mlrun/api/schemas/__init__.py +4 -3
- mlrun/artifacts/__init__.py +8 -3
- mlrun/artifacts/base.py +39 -254
- mlrun/artifacts/dataset.py +9 -190
- mlrun/artifacts/manager.py +73 -46
- mlrun/artifacts/model.py +30 -158
- mlrun/artifacts/plots.py +23 -380
- mlrun/common/constants.py +73 -2
- mlrun/common/db/sql_session.py +3 -2
- mlrun/common/formatters/__init__.py +21 -0
- mlrun/common/formatters/artifact.py +46 -0
- mlrun/common/formatters/base.py +113 -0
- mlrun/common/formatters/feature_set.py +44 -0
- mlrun/common/formatters/function.py +46 -0
- mlrun/common/formatters/pipeline.py +53 -0
- mlrun/common/formatters/project.py +51 -0
- mlrun/common/formatters/run.py +29 -0
- mlrun/common/helpers.py +11 -1
- mlrun/{runtimes → common/runtimes}/constants.py +32 -4
- mlrun/common/schemas/__init__.py +21 -4
- mlrun/common/schemas/alert.py +202 -0
- mlrun/common/schemas/api_gateway.py +113 -2
- mlrun/common/schemas/artifact.py +28 -1
- mlrun/common/schemas/auth.py +11 -0
- mlrun/common/schemas/client_spec.py +2 -1
- mlrun/common/schemas/common.py +7 -4
- mlrun/common/schemas/constants.py +3 -0
- mlrun/common/schemas/feature_store.py +58 -28
- mlrun/common/schemas/frontend_spec.py +8 -0
- mlrun/common/schemas/function.py +11 -0
- mlrun/common/schemas/hub.py +7 -9
- mlrun/common/schemas/model_monitoring/__init__.py +21 -4
- mlrun/common/schemas/model_monitoring/constants.py +136 -42
- mlrun/common/schemas/model_monitoring/grafana.py +9 -5
- mlrun/common/schemas/model_monitoring/model_endpoints.py +89 -41
- mlrun/common/schemas/notification.py +69 -12
- mlrun/{runtimes/mpijob/v1alpha1.py → common/schemas/pagination.py} +10 -13
- mlrun/common/schemas/pipeline.py +7 -0
- mlrun/common/schemas/project.py +67 -16
- mlrun/common/schemas/runs.py +17 -0
- mlrun/common/schemas/schedule.py +1 -1
- mlrun/common/schemas/workflow.py +10 -2
- mlrun/common/types.py +14 -1
- mlrun/config.py +224 -58
- mlrun/data_types/data_types.py +11 -1
- mlrun/data_types/spark.py +5 -4
- mlrun/data_types/to_pandas.py +75 -34
- mlrun/datastore/__init__.py +8 -10
- mlrun/datastore/alibaba_oss.py +131 -0
- mlrun/datastore/azure_blob.py +131 -43
- mlrun/datastore/base.py +107 -47
- mlrun/datastore/datastore.py +17 -7
- mlrun/datastore/datastore_profile.py +91 -7
- mlrun/datastore/dbfs_store.py +3 -7
- mlrun/datastore/filestore.py +1 -3
- mlrun/datastore/google_cloud_storage.py +92 -32
- mlrun/datastore/hdfs.py +5 -0
- mlrun/datastore/inmem.py +6 -3
- mlrun/datastore/redis.py +3 -2
- mlrun/datastore/s3.py +30 -12
- mlrun/datastore/snowflake_utils.py +45 -0
- mlrun/datastore/sources.py +274 -59
- mlrun/datastore/spark_utils.py +30 -0
- mlrun/datastore/store_resources.py +9 -7
- mlrun/datastore/storeytargets.py +151 -0
- mlrun/datastore/targets.py +374 -102
- mlrun/datastore/utils.py +68 -5
- mlrun/datastore/v3io.py +28 -50
- mlrun/db/auth_utils.py +152 -0
- mlrun/db/base.py +231 -22
- mlrun/db/factory.py +1 -4
- mlrun/db/httpdb.py +864 -228
- mlrun/db/nopdb.py +268 -16
- mlrun/errors.py +35 -5
- mlrun/execution.py +111 -38
- mlrun/feature_store/__init__.py +0 -2
- mlrun/feature_store/api.py +46 -53
- mlrun/feature_store/common.py +6 -11
- mlrun/feature_store/feature_set.py +48 -23
- mlrun/feature_store/feature_vector.py +13 -2
- mlrun/feature_store/ingestion.py +7 -6
- mlrun/feature_store/retrieval/base.py +9 -4
- mlrun/feature_store/retrieval/dask_merger.py +2 -0
- mlrun/feature_store/retrieval/job.py +13 -4
- mlrun/feature_store/retrieval/local_merger.py +2 -0
- mlrun/feature_store/retrieval/spark_merger.py +24 -32
- mlrun/feature_store/steps.py +38 -19
- mlrun/features.py +6 -14
- mlrun/frameworks/_common/plan.py +3 -3
- mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +7 -12
- mlrun/frameworks/_ml_common/plan.py +1 -1
- mlrun/frameworks/auto_mlrun/auto_mlrun.py +2 -2
- mlrun/frameworks/lgbm/__init__.py +1 -1
- mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
- mlrun/frameworks/lgbm/model_handler.py +1 -1
- mlrun/frameworks/parallel_coordinates.py +4 -4
- mlrun/frameworks/pytorch/__init__.py +2 -2
- mlrun/frameworks/sklearn/__init__.py +1 -1
- mlrun/frameworks/sklearn/mlrun_interface.py +13 -3
- mlrun/frameworks/tf_keras/__init__.py +5 -2
- mlrun/frameworks/tf_keras/callbacks/logging_callback.py +1 -1
- mlrun/frameworks/tf_keras/mlrun_interface.py +2 -2
- mlrun/frameworks/xgboost/__init__.py +1 -1
- mlrun/k8s_utils.py +57 -12
- mlrun/launcher/__init__.py +1 -1
- mlrun/launcher/base.py +6 -5
- mlrun/launcher/client.py +13 -11
- mlrun/launcher/factory.py +1 -1
- mlrun/launcher/local.py +15 -5
- mlrun/launcher/remote.py +10 -3
- mlrun/lists.py +6 -2
- mlrun/model.py +297 -48
- mlrun/model_monitoring/__init__.py +1 -1
- mlrun/model_monitoring/api.py +152 -357
- mlrun/model_monitoring/applications/__init__.py +10 -0
- mlrun/model_monitoring/applications/_application_steps.py +190 -0
- mlrun/model_monitoring/applications/base.py +108 -0
- mlrun/model_monitoring/applications/context.py +341 -0
- mlrun/model_monitoring/{evidently_application.py → applications/evidently_base.py} +27 -22
- mlrun/model_monitoring/applications/histogram_data_drift.py +227 -91
- mlrun/model_monitoring/applications/results.py +99 -0
- mlrun/model_monitoring/controller.py +130 -303
- mlrun/model_monitoring/{stores/models/sqlite.py → db/__init__.py} +5 -10
- mlrun/model_monitoring/db/stores/__init__.py +136 -0
- mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
- mlrun/model_monitoring/db/stores/base/store.py +213 -0
- mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
- mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +71 -0
- mlrun/model_monitoring/db/stores/sqldb/models/base.py +190 -0
- mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +103 -0
- mlrun/model_monitoring/{stores/models/mysql.py → db/stores/sqldb/models/sqlite.py} +19 -13
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +659 -0
- mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +726 -0
- mlrun/model_monitoring/db/tsdb/__init__.py +105 -0
- mlrun/model_monitoring/db/tsdb/base.py +448 -0
- mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
- mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +298 -0
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +42 -0
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +522 -0
- mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +158 -0
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +849 -0
- mlrun/model_monitoring/features_drift_table.py +34 -22
- mlrun/model_monitoring/helpers.py +177 -39
- mlrun/model_monitoring/model_endpoint.py +3 -2
- mlrun/model_monitoring/stream_processing.py +165 -398
- mlrun/model_monitoring/tracking_policy.py +7 -1
- mlrun/model_monitoring/writer.py +161 -125
- mlrun/package/packagers/default_packager.py +2 -2
- mlrun/package/packagers_manager.py +1 -0
- mlrun/package/utils/_formatter.py +2 -2
- mlrun/platforms/__init__.py +11 -10
- mlrun/platforms/iguazio.py +67 -228
- mlrun/projects/__init__.py +6 -1
- mlrun/projects/operations.py +47 -20
- mlrun/projects/pipelines.py +396 -249
- mlrun/projects/project.py +1125 -414
- mlrun/render.py +28 -22
- mlrun/run.py +207 -180
- mlrun/runtimes/__init__.py +76 -11
- mlrun/runtimes/base.py +40 -14
- mlrun/runtimes/daskjob.py +9 -2
- mlrun/runtimes/databricks_job/databricks_runtime.py +1 -0
- mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
- mlrun/runtimes/funcdoc.py +1 -29
- mlrun/runtimes/kubejob.py +34 -128
- mlrun/runtimes/local.py +39 -10
- mlrun/runtimes/mpijob/__init__.py +0 -20
- mlrun/runtimes/mpijob/abstract.py +8 -8
- mlrun/runtimes/mpijob/v1.py +1 -1
- mlrun/runtimes/nuclio/api_gateway.py +646 -177
- mlrun/runtimes/nuclio/application/__init__.py +15 -0
- mlrun/runtimes/nuclio/application/application.py +758 -0
- mlrun/runtimes/nuclio/application/reverse_proxy.go +95 -0
- mlrun/runtimes/nuclio/function.py +188 -68
- mlrun/runtimes/nuclio/serving.py +57 -60
- mlrun/runtimes/pod.py +191 -58
- mlrun/runtimes/remotesparkjob.py +11 -8
- mlrun/runtimes/sparkjob/spark3job.py +17 -18
- mlrun/runtimes/utils.py +40 -73
- mlrun/secrets.py +6 -2
- mlrun/serving/__init__.py +8 -1
- mlrun/serving/remote.py +2 -3
- mlrun/serving/routers.py +89 -64
- mlrun/serving/server.py +54 -26
- mlrun/serving/states.py +187 -56
- mlrun/serving/utils.py +19 -11
- mlrun/serving/v2_serving.py +136 -63
- mlrun/track/tracker.py +2 -1
- mlrun/track/trackers/mlflow_tracker.py +5 -0
- mlrun/utils/async_http.py +26 -6
- mlrun/utils/db.py +18 -0
- mlrun/utils/helpers.py +375 -105
- mlrun/utils/http.py +2 -2
- mlrun/utils/logger.py +75 -9
- mlrun/utils/notifications/notification/__init__.py +14 -10
- mlrun/utils/notifications/notification/base.py +48 -0
- mlrun/utils/notifications/notification/console.py +2 -0
- mlrun/utils/notifications/notification/git.py +24 -1
- mlrun/utils/notifications/notification/ipython.py +2 -0
- mlrun/utils/notifications/notification/slack.py +96 -21
- mlrun/utils/notifications/notification/webhook.py +63 -2
- mlrun/utils/notifications/notification_pusher.py +146 -16
- mlrun/utils/regex.py +9 -0
- mlrun/utils/retryer.py +3 -2
- mlrun/utils/v3io_clients.py +2 -3
- mlrun/utils/version/version.json +2 -2
- mlrun-1.7.2.dist-info/METADATA +390 -0
- mlrun-1.7.2.dist-info/RECORD +351 -0
- {mlrun-1.7.0rc5.dist-info → mlrun-1.7.2.dist-info}/WHEEL +1 -1
- mlrun/feature_store/retrieval/conversion.py +0 -271
- mlrun/kfpops.py +0 -868
- mlrun/model_monitoring/application.py +0 -310
- mlrun/model_monitoring/batch.py +0 -974
- mlrun/model_monitoring/controller_handler.py +0 -37
- mlrun/model_monitoring/prometheus.py +0 -216
- mlrun/model_monitoring/stores/__init__.py +0 -111
- mlrun/model_monitoring/stores/kv_model_endpoint_store.py +0 -574
- mlrun/model_monitoring/stores/model_endpoint_store.py +0 -145
- mlrun/model_monitoring/stores/models/__init__.py +0 -27
- mlrun/model_monitoring/stores/models/base.py +0 -84
- mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -382
- mlrun/platforms/other.py +0 -305
- mlrun-1.7.0rc5.dist-info/METADATA +0 -269
- mlrun-1.7.0rc5.dist-info/RECORD +0 -323
- {mlrun-1.7.0rc5.dist-info → mlrun-1.7.2.dist-info}/LICENSE +0 -0
- {mlrun-1.7.0rc5.dist-info → mlrun-1.7.2.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.0rc5.dist-info → mlrun-1.7.2.dist-info}/top_level.txt +0 -0
mlrun/datastore/targets.py
CHANGED
@@ -17,6 +17,7 @@ import os
 import random
 import sys
 import time
+import warnings
 from collections import Counter
 from copy import copy
 from typing import Any, Optional, Union
@@ -28,6 +29,11 @@ from mergedeep import merge
 import mlrun
 import mlrun.utils.helpers
 from mlrun.config import config
+from mlrun.datastore.snowflake_utils import (
+    get_snowflake_password,
+    get_snowflake_spark_options,
+)
+from mlrun.datastore.utils import transform_list_filters_to_tuple
 from mlrun.model import DataSource, DataTarget, DataTargetBase, TargetPathObject
 from mlrun.utils import logger, now_date
 from mlrun.utils.helpers import to_parquet
@@ -41,7 +47,6 @@ from .spark_utils import spark_session_update_hadoop_options
 from .utils import (
     _generate_sql_query_with_time_filter,
     filter_df_start_end_time,
-    parse_kafka_url,
     select_columns_from_df,
 )
 
@@ -57,6 +62,7 @@ class TargetTypes:
     dataframe = "dataframe"
     custom = "custom"
     sql = "sql"
+    snowflake = "snowflake"
 
     @staticmethod
     def all():
@@ -71,6 +77,7 @@ class TargetTypes:
             TargetTypes.dataframe,
             TargetTypes.custom,
             TargetTypes.sql,
+            TargetTypes.snowflake,
         ]
 
 
@@ -78,11 +85,14 @@ def generate_target_run_id():
     return f"{round(time.time() * 1000)}_{random.randint(0, 999)}"
 
 
-def write_spark_dataframe_with_options(spark_options, df, mode):
+def write_spark_dataframe_with_options(spark_options, df, mode, write_format=None):
     non_hadoop_spark_options = spark_session_update_hadoop_options(
         df.sql_ctx.sparkSession, spark_options
     )
-    df.write.mode(mode).save(**non_hadoop_spark_options)
+    if write_format:
+        df.write.format(write_format).mode(mode).save(**non_hadoop_spark_options)
+    else:
+        df.write.mode(mode).save(**non_hadoop_spark_options)
 
 
 def default_target_names():
@@ -379,6 +389,7 @@ class BaseStoreTarget(DataTargetBase):
     is_offline = False
     support_spark = False
     support_storey = False
+    support_pandas = False
     support_append = False
 
     def __init__(
@@ -428,6 +439,12 @@ class BaseStoreTarget(DataTargetBase):
         self.storage_options = storage_options
         self.schema = schema or {}
         self.credentials_prefix = credentials_prefix
+        if credentials_prefix:
+            warnings.warn(
+                "The 'credentials_prefix' parameter is deprecated and will be removed in "
+                "1.9.0. Please use datastore profiles instead.",
+                FutureWarning,
+            )
 
         self._target = None
         self._resource = None
@@ -497,7 +514,10 @@ class BaseStoreTarget(DataTargetBase):
             options = self.get_spark_options(key_column, timestamp_key)
             options.update(kwargs)
             df = self.prepare_spark_df(df, key_column, timestamp_key, options)
-            write_spark_dataframe_with_options(options, df, "overwrite")
+            write_format = options.pop("format", None)
+            write_spark_dataframe_with_options(
+                options, df, "overwrite", write_format=write_format
+            )
         elif hasattr(df, "dask"):
             dask_options = self.get_dask_options()
             store, path_in_store, target_path = self._get_store_and_path()
@@ -524,15 +544,18 @@ class BaseStoreTarget(DataTargetBase):
             store, path_in_store, target_path = self._get_store_and_path()
             target_path = generate_path_with_chunk(self, chunk_id, target_path)
             file_system = store.filesystem
-            if file_system.protocol == "file":
+            if (
+                file_system.protocol == "file"
+                # fsspec 2023.10.0 changed protocol from "file" to ("file", "local")
+                or isinstance(file_system.protocol, (tuple, list))
+                and "file" in file_system.protocol
+            ):
                 dir = os.path.dirname(target_path)
                 if dir:
                     os.makedirs(dir, exist_ok=True)
             target_df = df
             partition_cols = None  # single parquet file
-            if not target_path.endswith(
-                ".pq"
-            ):  # directory
+            if not mlrun.utils.helpers.is_parquet_file(target_path):  # directory
                 partition_cols = []
                 if timestamp_key and (
                     self.partitioned or self.time_partitioning_granularity
@@ -641,6 +664,29 @@ class BaseStoreTarget(DataTargetBase):
     def _target_path_object(self):
         """return the actual/computed target path"""
         is_single_file = hasattr(self, "is_single_file") and self.is_single_file()
+
+        if self._resource and self.path:
+            parsed_url = urlparse(self.path)
+            # When the URL consists only from scheme and endpoint and no path,
+            # make a default path for DS and redis targets.
+            # Also ignore KafkaTarget when it uses the ds scheme (no default path for KafkaTarget)
+            if (
+                not isinstance(self, KafkaTarget)
+                and parsed_url.scheme in ["ds", "redis", "rediss"]
+                and (not parsed_url.path or parsed_url.path == "/")
+            ):
+                return TargetPathObject(
+                    _get_target_path(
+                        self,
+                        self._resource,
+                        self.run_id is not None,
+                        netloc=parsed_url.netloc,
+                        scheme=parsed_url.scheme,
+                    ),
+                    self.run_id,
+                    is_single_file,
+                )
+
         return self.get_path() or (
             TargetPathObject(
                 _get_target_path(self, self._resource, self.run_id is not None),
@@ -657,6 +703,7 @@ class BaseStoreTarget(DataTargetBase):
             self.kind, self.name, self.get_target_templated_path()
         )
         target = self._target
+        target.attributes = self.attributes
         target.run_id = self.run_id
         target.status = status or target.status or "created"
         target.updated = now_date().isoformat()
@@ -685,11 +732,25 @@ class BaseStoreTarget(DataTargetBase):
         timestamp_key=None,
         featureset_status=None,
     ):
+        if not self.support_storey:
+            raise mlrun.errors.MLRunRuntimeError(
+                f"{type(self).__name__} does not support storey engine"
+            )
         raise NotImplementedError()
 
     def purge(self):
+        """
+        Delete the files of the target.
+
+        Do not use this function directly from the sdk. Use FeatureSet.purge_targets.
+        """
         store, path_in_store, target_path = self._get_store_and_path()
-        store.rm(path_in_store, recursive=True)
+        if path_in_store not in ["", "/"]:
+            store.rm(path_in_store, recursive=True)
+        else:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "Unable to delete target. Please Use purge_targets from FeatureSet object."
+            )
 
     def as_df(
         self,
@@ -699,9 +760,15 @@ class BaseStoreTarget(DataTargetBase):
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
         **kwargs,
     ):
         """return the target data as dataframe"""
+        if not self.support_pandas:
+            raise NotImplementedError()
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
         return mlrun.get_dataitem(self.get_target_path()).as_df(
             columns=columns,
             df_module=df_module,
@@ -713,14 +780,22 @@ class BaseStoreTarget(DataTargetBase):
 
     def get_spark_options(self, key_column=None, timestamp_key=None, overwrite=True):
         # options used in spark.read.load(**options)
+        if not self.support_spark:
+            raise mlrun.errors.MLRunRuntimeError(
+                f"{type(self).__name__} does not support spark engine"
+            )
         raise NotImplementedError()
 
-    def prepare_spark_df(self, df, key_columns, timestamp_key=None, spark_options={}):
+    def prepare_spark_df(self, df, key_columns, timestamp_key=None, spark_options=None):
         return df
 
     def get_dask_options(self):
         raise NotImplementedError()
 
+    @property
+    def source_spark_attributes(self) -> dict:
+        return {}
+
 
 class ParquetTarget(BaseStoreTarget):
     """Parquet target storage driver, used to materialize feature set/vector data into parquet files.
@@ -752,6 +827,7 @@ class ParquetTarget(BaseStoreTarget):
     support_spark = True
     support_storey = True
     support_dask = True
+    support_pandas = True
     support_append = True
 
     def __init__(
@@ -857,10 +933,9 @@ class ParquetTarget(BaseStoreTarget):
             if time_unit == time_partitioning_granularity:
                 break
 
-        if (
-            not self.partitioned
-            and not self.get_target_path().endswith(".parquet")
-            and not self.get_target_path().endswith(".pq")
+        target_path = self.get_target_path()
+        if not self.partitioned and not mlrun.utils.helpers.is_parquet_file(
+            target_path
         ):
             partition_cols = []
 
@@ -868,25 +943,16 @@ class ParquetTarget(BaseStoreTarget):
         for key_column in key_columns:
             tuple_key_columns.append((key_column.name, key_column.value_type))
 
-        store, path_in_store, target_path = self._get_store_and_path()
-
-        storage_options = store.get_storage_options()
-        if storage_options and self.storage_options:
-            storage_options = merge(storage_options, self.storage_options)
-        else:
-            storage_options = storage_options or self.storage_options
-
         step = graph.add_step(
             name=self.name or "ParquetTarget",
             after=after,
             graph_shape="cylinder",
-            class_name="storey.ParquetTarget",
+            class_name="mlrun.datastore.storeytargets.ParquetStoreyTarget",
             path=target_path,
             columns=column_list,
             index_cols=tuple_key_columns,
             partition_cols=partition_cols,
             time_field=timestamp_key,
-            storage_options=storage_options,
             max_events=self.max_events,
             flush_after_seconds=self.flush_after_seconds,
             update_last_written=featureset_status.update_last_written_for_target,
@@ -946,6 +1012,7 @@ class ParquetTarget(BaseStoreTarget):
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
         **kwargs,
     ):
         """return the target data as dataframe"""
@@ -956,6 +1023,7 @@ class ParquetTarget(BaseStoreTarget):
             start_time=start_time,
             end_time=end_time,
             time_column=time_column,
+            additional_filters=transform_list_filters_to_tuple(additional_filters),
             **kwargs,
         )
         if not columns:
@@ -977,9 +1045,7 @@ class ParquetTarget(BaseStoreTarget):
         return result
 
     def is_single_file(self):
-        if self.path:
-            return self.path.endswith(".parquet") or self.path.endswith(".pq")
-        return False
+        return mlrun.utils.helpers.is_parquet_file(self.path)
 
     def prepare_spark_df(self, df, key_columns, timestamp_key=None, spark_options=None):
         # If partitioning by time, add the necessary columns
@@ -1019,6 +1085,7 @@ class CSVTarget(BaseStoreTarget):
     is_offline = True
     support_spark = True
     support_storey = True
+    support_pandas = True
 
     @staticmethod
     def _write_dataframe(df, storage_options, target_path, partition_cols, **kwargs):
@@ -1040,17 +1107,16 @@ class CSVTarget(BaseStoreTarget):
         column_list = self._get_column_list(
             features=features, timestamp_key=timestamp_key, key_columns=key_columns
         )
-        store, path_in_store, target_path = self._get_store_and_path()
+        target_path = self.get_target_path()
         graph.add_step(
             name=self.name or "CSVTarget",
             after=after,
             graph_shape="cylinder",
-            class_name="storey.CSVTarget",
+            class_name="mlrun.datastore.storeytargets.CSVStoreyTarget",
             path=target_path,
             columns=column_list,
             header=True,
             index_cols=key_columns,
-            storage_options=store.get_storage_options(),
             **self.attributes,
         )
 
@@ -1070,7 +1136,8 @@ class CSVTarget(BaseStoreTarget):
         import pyspark.sql.functions as funcs
 
         for col_name, col_type in df.dtypes:
-            if col_type == "timestamp":
+            # covers TimestampType and TimestampNTZType, which was added in PySpark 3.4.0
+            if col_type.startswith("timestamp"):
                 # df.write.csv saves timestamps with millisecond precision, but we want microsecond precision
                 # for compatibility with storey.
                 df = df.withColumn(
@@ -1086,8 +1153,12 @@ class CSVTarget(BaseStoreTarget):
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
         **kwargs,
     ):
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
         df = super().as_df(
             columns=columns,
             df_module=df_module,
@@ -1108,6 +1179,134 @@ class CSVTarget(BaseStoreTarget):
         return True
 
 
+class SnowflakeTarget(BaseStoreTarget):
+    """
+    :param attributes: A dictionary of attributes for Snowflake connection; will be overridden by database parameters
+        if they exist.
+    :param url: Snowflake hostname, in the format: <account_name>.<region>.snowflakecomputing.com
+    :param user: Snowflake user for login
+    :param db_schema: Database schema
+    :param database: Database name
+    :param warehouse: Snowflake warehouse name
+    :param table_name: Snowflake table name
+    """
+
+    support_spark = True
+    support_append = True
+    is_offline = True
+    kind = TargetTypes.snowflake
+
+    def __init__(
+        self,
+        name: str = "",
+        path=None,
+        attributes: dict[str, str] = None,
+        after_step=None,
+        columns=None,
+        partitioned: bool = False,
+        key_bucketing_number: Optional[int] = None,
+        partition_cols: Optional[list[str]] = None,
+        time_partitioning_granularity: Optional[str] = None,
+        max_events: Optional[int] = None,
+        flush_after_seconds: Optional[int] = None,
+        storage_options: dict[str, str] = None,
+        schema: dict[str, Any] = None,
+        credentials_prefix=None,
+        url: str = None,
+        user: str = None,
+        db_schema: str = None,
+        database: str = None,
+        warehouse: str = None,
+        table_name: str = None,
+    ):
+        attributes = attributes or {}
+        if url:
+            attributes["url"] = url
+        if user:
+            attributes["user"] = user
+        if database:
+            attributes["database"] = database
+        if db_schema:
+            attributes["db_schema"] = db_schema
+        if warehouse:
+            attributes["warehouse"] = warehouse
+        if table_name:
+            attributes["table"] = table_name
+
+        super().__init__(
+            name,
+            path,
+            attributes,
+            after_step,
+            list(schema.keys()) if schema else columns,
+            partitioned,
+            key_bucketing_number,
+            partition_cols,
+            time_partitioning_granularity,
+            max_events=max_events,
+            flush_after_seconds=flush_after_seconds,
+            storage_options=storage_options,
+            schema=schema,
+            credentials_prefix=credentials_prefix,
+        )
+
+    def get_spark_options(self, key_column=None, timestamp_key=None, overwrite=True):
+        spark_options = get_snowflake_spark_options(self.attributes)
+        spark_options["dbtable"] = self.attributes.get("table")
+        return spark_options
+
+    def purge(self):
+        import snowflake.connector
+
+        missing = [
+            key
+            for key in ["database", "db_schema", "table", "url", "user", "warehouse"]
+            if self.attributes.get(key) is None
+        ]
+        if missing:
+            raise mlrun.errors.MLRunRuntimeError(
+                f"Can't purge Snowflake target, "
+                f"some attributes are missing: {', '.join(missing)}"
+            )
+        account = self.attributes["url"].replace(".snowflakecomputing.com", "")
+
+        with snowflake.connector.connect(
+            account=account,
+            user=self.attributes["user"],
+            password=get_snowflake_password(),
+            warehouse=self.attributes["warehouse"],
+        ) as snowflake_connector:
+            drop_statement = (
+                f"DROP TABLE IF EXISTS {self.attributes['database']}.{self.attributes['db_schema']}"
+                f".{self.attributes['table']}"
+            )
+            snowflake_connector.execute_string(drop_statement)
+
+    def as_df(
+        self,
+        columns=None,
+        df_module=None,
+        entities=None,
+        start_time=None,
+        end_time=None,
+        time_column=None,
+        additional_filters=None,
+        **kwargs,
+    ):
+        raise mlrun.errors.MLRunRuntimeError(
+            f"{type(self).__name__} does not support pandas engine"
+        )
+
+    @property
+    def source_spark_attributes(self) -> dict:
+        keys = ["url", "user", "database", "db_schema", "warehouse"]
+        attributes = self.attributes or {}
+        snowflake_dict = {key: attributes.get(key) for key in keys}
+        table = attributes.get("table")
+        snowflake_dict["query"] = f"SELECT * from {table}" if table else None
+        return snowflake_dict
+
+
 class NoSqlBaseTarget(BaseStoreTarget):
     is_table = True
     is_online = True
@@ -1132,6 +1331,19 @@ class NoSqlBaseTarget(BaseStoreTarget):
         timestamp_key=None,
         featureset_status=None,
     ):
+        table, column_list = self._get_table_and_columns(features, key_columns)
+
+        graph.add_step(
+            name=self.name or self.writer_step_name,
+            after=after,
+            graph_shape="cylinder",
+            class_name="mlrun.datastore.storeytargets.NoSqlStoreyTarget",
+            columns=column_list,
+            table=table,
+            **self.attributes,
+        )
+
+    def _get_table_and_columns(self, features, key_columns):
         key_columns = list(key_columns.keys())
         table = self._resource.uri
         column_list = self._get_column_list(
@@ -1150,15 +1362,7 @@ class NoSqlBaseTarget(BaseStoreTarget):
             col for col in column_list if col[0] not in aggregate_features
         ]
 
-        graph.add_step(
-            name=self.name or self.writer_step_name,
-            after=after,
-            graph_shape="cylinder",
-            class_name="storey.NoSqlTarget",
-            columns=column_list,
-            table=table,
-            **self.attributes,
-        )
+        return table, column_list
 
     def prepare_spark_df(self, df, key_columns, timestamp_key=None, spark_options=None):
         raise NotImplementedError()
@@ -1169,9 +1373,6 @@ class NoSqlBaseTarget(BaseStoreTarget):
     def get_dask_options(self):
         return {"format": "csv"}
 
-    def as_df(self, columns=None, df_module=None, **kwargs):
-        raise NotImplementedError()
-
     def write_dataframe(
         self, df, key_column=None, timestamp_key=None, chunk_id=0, **kwargs
     ):
@@ -1179,7 +1380,10 @@ class NoSqlBaseTarget(BaseStoreTarget):
             options = self.get_spark_options(key_column, timestamp_key)
             options.update(kwargs)
             df = self.prepare_spark_df(df)
-            write_spark_dataframe_with_options(options, df, "overwrite")
+            write_format = options.pop("format", None)
+            write_spark_dataframe_with_options(
+                options, df, "overwrite", write_format=write_format
+            )
         else:
             # To prevent modification of the original dataframe and make sure
             # that the last event of a key is the one being persisted
@@ -1281,11 +1485,9 @@ class RedisNoSqlTarget(NoSqlBaseTarget):
     support_spark = True
     writer_step_name = "RedisNoSqlTarget"
 
-
-
-
-    def _get_server_endpoint(self):
-        endpoint, uri = parse_path(self.get_target_path())
+    @staticmethod
+    def get_server_endpoint(path, credentials_prefix=None):
+        endpoint, uri = parse_path(path)
         endpoint = endpoint or mlrun.mlconf.redis.url
         if endpoint.startswith("ds://"):
             datastore_profile = datastore_profile_read(endpoint)
@@ -1302,8 +1504,15 @@ class RedisNoSqlTarget(NoSqlBaseTarget):
             raise mlrun.errors.MLRunInvalidArgumentError(
                 "Provide Redis username and password only via secrets"
             )
-        user = mlrun.get_secret_or_env("REDIS_USER", default="")
-        password = mlrun.get_secret_or_env("REDIS_PASSWORD", default="")
+        credentials_prefix = credentials_prefix or mlrun.get_secret_or_env(
+            key="CREDENTIALS_PREFIX"
+        )
+        user = mlrun.get_secret_or_env(
+            "REDIS_USER", default="", prefix=credentials_prefix
+        )
+        password = mlrun.get_secret_or_env(
+            "REDIS_PASSWORD", default="", prefix=credentials_prefix
+        )
         host = parsed_endpoint.hostname
         port = parsed_endpoint.port if parsed_endpoint.port else "6379"
         scheme = parsed_endpoint.scheme
@@ -1317,7 +1526,9 @@ class RedisNoSqlTarget(NoSqlBaseTarget):
         from storey import Table
         from storey.redis_driver import RedisDriver
 
-        endpoint, uri = self._get_server_endpoint()
+        endpoint, uri = self.get_server_endpoint(
+            self.get_target_path(), self.credentials_prefix
+        )
 
         return Table(
             uri,
@@ -1326,7 +1537,9 @@ class RedisNoSqlTarget(NoSqlBaseTarget):
         )
 
     def get_spark_options(self, key_column=None, timestamp_key=None, overwrite=True):
-        endpoint, uri = self._get_server_endpoint()
+        endpoint, uri = self.get_server_endpoint(
+            self.get_target_path(), self.credentials_prefix
+        )
         parsed_endpoint = urlparse(endpoint)
         store, path_in_store, path = self._get_store_and_path()
         return {
@@ -1358,6 +1571,29 @@ class RedisNoSqlTarget(NoSqlBaseTarget):
 
         return df
 
+    def add_writer_step(
+        self,
+        graph,
+        after,
+        features,
+        key_columns=None,
+        timestamp_key=None,
+        featureset_status=None,
+    ):
+        table, column_list = self._get_table_and_columns(features, key_columns)
+
+        graph.add_step(
+            path=self.get_target_path(),
+            name=self.name or self.writer_step_name,
+            after=after,
+            graph_shape="cylinder",
+            class_name="mlrun.datastore.storeytargets.RedisNoSqlStoreyTarget",
+            columns=column_list,
+            table=table,
+            credentials_prefix=self.credentials_prefix,
+            **self.attributes,
+        )
+
 
 class StreamTarget(BaseStoreTarget):
     kind = TargetTypes.stream
@@ -1376,37 +1612,46 @@ class StreamTarget(BaseStoreTarget):
         timestamp_key=None,
         featureset_status=None,
     ):
-        from storey import V3ioDriver
-
         key_columns = list(key_columns.keys())
-
-        if not path:
-            raise mlrun.errors.MLRunInvalidArgumentError("StreamTarget requires a path")
-        endpoint, uri = parse_path(path)
-        storage_options = store.get_storage_options()
-        access_key = storage_options.get("v3io_access_key")
+
         column_list = self._get_column_list(
             features=features, timestamp_key=timestamp_key, key_columns=key_columns
         )
+        stream_path = self.get_target_path()
+        if not stream_path:
+            raise mlrun.errors.MLRunInvalidArgumentError("StreamTarget requires a path")
 
         graph.add_step(
             name=self.name or "StreamTarget",
             after=after,
             graph_shape="cylinder",
-            class_name="storey.StreamTarget",
+            class_name="mlrun.datastore.storeytargets.StreamStoreyTarget",
             columns=column_list,
-            storage=V3ioDriver(
-                webapi=endpoint or mlrun.mlconf.v3io_api, access_key=access_key
-            ),
-            stream_path=uri,
+            stream_path=stream_path,
             **self.attributes,
         )
 
-    def as_df(self, columns=None, df_module=None, **kwargs):
-        raise NotImplementedError()
-
 
 class KafkaTarget(BaseStoreTarget):
+    """
+    Kafka target storage driver, used to write data into kafka topics.
+
+    example::
+
+        # define target
+        kafka_target = KafkaTarget(
+            name="kafka", path="my_topic", brokers="localhost:9092"
+        )
+
+        # ingest
+        stocks_set.ingest(stocks, [kafka_target])
+
+    :param name: target name
+    :param path: topic name e.g. "my_topic"
+    :param after_step: optional, after what step in the graph to add the target
+    :param columns: optional, which columns from data to write
+    :param bootstrap_servers: Deprecated. Use the brokers parameter instead
+    :param producer_options: additional configurations for kafka producer
+    :param brokers: kafka broker as represented by a host:port pair, or a list of kafka brokers, e.g.
+        "localhost:9092", or ["kafka-broker-1:9092", "kafka-broker-2:9092"]
+    """
+
     kind = TargetTypes.kafka
     is_table = False
     is_online = False
@@ -1419,11 +1664,27 @@ class KafkaTarget(BaseStoreTarget):
         *args,
         bootstrap_servers=None,
         producer_options=None,
+        brokers=None,
         **kwargs,
     ):
         attrs = {}
-        if bootstrap_servers:
-            attrs["bootstrap_servers"] = bootstrap_servers
+
+        # TODO: Remove this in 1.9.0
+        if bootstrap_servers:
+            if brokers:
+                raise mlrun.errors.MLRunInvalidArgumentError(
+                    "KafkaTarget cannot be created with both the 'brokers' parameter and the deprecated "
+                    "'bootstrap_servers' parameter. Please use 'brokers' only."
+                )
+            warnings.warn(
+                "'bootstrap_servers' parameter is deprecated in 1.7.0 and will be removed in 1.9.0, "
+                "use 'brokers' instead.",
+                FutureWarning,
+            )
+            brokers = bootstrap_servers
+
+        if brokers:
+            attrs["brokers"] = brokers
         if producer_options is not None:
             attrs["producer_options"] = producer_options
 
@@ -1442,37 +1703,21 @@ class KafkaTarget(BaseStoreTarget):
         column_list = self._get_column_list(
             features=features, timestamp_key=timestamp_key, key_columns=key_columns
         )
-        if self.path and self.path.startswith("ds://"):
-            datastore_profile = datastore_profile_read(self.path)
-            attributes = datastore_profile.attributes()
-            bootstrap_servers = attributes.pop("bootstrap_servers", None)
-            topic = datastore_profile.topic
-        else:
-            attributes = copy(self.attributes)
-            bootstrap_servers = attributes.pop("bootstrap_servers", None)
-            topic, bootstrap_servers = parse_kafka_url(
-                self.get_target_path(), bootstrap_servers
-            )
+        path = self.get_target_path()
 
-        if not topic:
-            raise mlrun.errors.MLRunInvalidArgumentError(
-                "KafkaTarget requires a path (topic)"
-            )
+        if not path:
+            raise mlrun.errors.MLRunInvalidArgumentError("KafkaTarget requires a path")
 
         graph.add_step(
             name=self.name or "KafkaTarget",
             after=after,
             graph_shape="cylinder",
-            class_name="storey.KafkaTarget",
+            class_name="mlrun.datastore.storeytargets.KafkaStoreyTarget",
             columns=column_list,
-            topic=topic,
-            brokers=bootstrap_servers,
-            **attributes,
+            path=path,
+            attributes=self.attributes,
         )
 
-    def as_df(self, columns=None, df_module=None, **kwargs):
-        raise NotImplementedError()
-
     def purge(self):
         pass
@@ -1507,7 +1752,7 @@ class TSDBTarget(BaseStoreTarget):
 
         graph.add_step(
             name=self.name or "TSDBTarget",
-            class_name="storey.TSDBTarget",
+            class_name="mlrun.datastore.storeytargets.TSDBStoreyTarget",
             after=after,
             graph_shape="cylinder",
             path=uri,
@@ -1517,9 +1762,6 @@ class TSDBTarget(BaseStoreTarget):
             **self.attributes,
         )
 
-    def as_df(self, columns=None, df_module=None, **kwargs):
-        raise NotImplementedError()
-
     def write_dataframe(
         self, df, key_column=None, timestamp_key=None, chunk_id=0, **kwargs
     ):
@@ -1557,6 +1799,7 @@ class CustomTarget(BaseStoreTarget):
     is_online = False
     support_spark = False
     support_storey = True
+    support_pandas = True
 
     def __init__(
         self,
@@ -1592,6 +1835,7 @@ class CustomTarget(BaseStoreTarget):
 class DFTarget(BaseStoreTarget):
     kind = TargetTypes.dataframe
     support_storey = True
+    support_pandas = True
 
     def __init__(self, *args, name="dataframe", **kwargs):
         self._df = None
@@ -1628,11 +1872,16 @@ class DFTarget(BaseStoreTarget):
         self,
         columns=None,
         df_module=None,
+        entities=None,
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
         **kwargs,
     ):
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
         return select_columns_from_df(
             filter_df_start_end_time(
                 self._df,
@@ -1649,6 +1898,7 @@ class SQLTarget(BaseStoreTarget):
     is_online = True
     support_spark = False
     support_storey = True
+    support_pandas = True
 
     def __init__(
         self,
@@ -1791,7 +2041,7 @@ class SQLTarget(BaseStoreTarget):
             name=self.name or "SqlTarget",
             after=after,
             graph_shape="cylinder",
-            class_name="storey.NoSqlTarget",
+            class_name="mlrun.datastore.storeytargets.NoSqlStoreyTarget",
             columns=column_list,
             header=True,
             table=table,
@@ -1807,6 +2057,7 @@ class SQLTarget(BaseStoreTarget):
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
         **kwargs,
     ):
         try:
@@ -1815,6 +2066,10 @@ class SQLTarget(BaseStoreTarget):
         except (ModuleNotFoundError, ImportError) as exc:
             self._raise_sqlalchemy_import_error(exc)
 
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
+
         db_path, table_name, _, _, _, _ = self._parse_url()
         engine = sqlalchemy.create_engine(db_path)
         parse_dates: Optional[list[str]] = self.attributes.get("parse_dates")
@@ -1904,7 +2159,7 @@ class SQLTarget(BaseStoreTarget):
                 raise ValueError(f"Table named {table_name} is not exist")
 
             elif not table_exists and create_table:
-
+                type_to_sql_type = {
                     int: sqlalchemy.Integer,
                     str: sqlalchemy.String(self.attributes.get("varchar_len")),
                     datetime.datetime: sqlalchemy.dialects.mysql.DATETIME(fsp=6),
@@ -1917,7 +2172,7 @@ class SQLTarget(BaseStoreTarget):
                 # creat new table with the given name
                 columns = []
                 for col, col_type in self.schema.items():
-                    col_type_sql =
+                    col_type_sql = type_to_sql_type.get(col_type)
                     if col_type_sql is None:
                         raise TypeError(
                             f"'{col_type}' unsupported type for column '{col}'"
@@ -1957,10 +2212,11 @@ kind_to_driver = {
     TargetTypes.tsdb: TSDBTarget,
     TargetTypes.custom: CustomTarget,
     TargetTypes.sql: SQLTarget,
+    TargetTypes.snowflake: SnowflakeTarget,
 }
 
 
-def _get_target_path(driver, resource, run_id_mode=False):
+def _get_target_path(driver, resource, run_id_mode=False, netloc=None, scheme=""):
     """return the default target path given the resource and target kind"""
     kind = driver.kind
     suffix = driver.suffix
@@ -1977,11 +2233,27 @@ def _get_target_path(driver, resource, run_id_mode=False):
     )
     name = resource.metadata.name
     project = resource.metadata.project or mlrun.mlconf.default_project
-    data_prefix = get_default_prefix_for_target(kind).format(
+
+    default_kind_name = kind
+    if scheme == "ds":
+        # "dsnosql" is not an actual target like Parquet or Redis; rather, it serves
+        # as a placeholder that can be used in any specified target
+        default_kind_name = "dsnosql"
+    if scheme == "redis" or scheme == "rediss":
+        default_kind_name = TargetTypes.redisnosql
+
+    netloc = netloc or ""
+    data_prefix = get_default_prefix_for_target(default_kind_name).format(
+        ds_profile_name=netloc,  # In case of ds profile, set its the name
+        authority=netloc,  # In case of redis, replace {authority} with netloc
         project=project,
         kind=kind,
         name=name,
     )
+
+    if scheme == "rediss":
+        data_prefix = data_prefix.replace("redis://", "rediss://", 1)
+
     # todo: handle ver tag changes, may need to copy files?
     if not run_id_mode:
         version = resource.metadata.tag