mlrun 1.7.0rc28__py3-none-any.whl → 1.7.0rc55__py3-none-any.whl
This diff compares publicly available versions of the package as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Potentially problematic release: this version of mlrun might be problematic.
- mlrun/__main__.py +4 -2
- mlrun/alerts/alert.py +75 -8
- mlrun/artifacts/base.py +1 -0
- mlrun/artifacts/manager.py +9 -2
- mlrun/common/constants.py +4 -1
- mlrun/common/db/sql_session.py +3 -2
- mlrun/common/formatters/__init__.py +1 -0
- mlrun/common/formatters/artifact.py +1 -0
- mlrun/{model_monitoring/application.py → common/formatters/feature_set.py} +20 -6
- mlrun/common/formatters/run.py +3 -0
- mlrun/common/helpers.py +0 -1
- mlrun/common/schemas/__init__.py +3 -1
- mlrun/common/schemas/alert.py +15 -12
- mlrun/common/schemas/api_gateway.py +6 -6
- mlrun/common/schemas/auth.py +5 -0
- mlrun/common/schemas/client_spec.py +0 -1
- mlrun/common/schemas/common.py +7 -4
- mlrun/common/schemas/frontend_spec.py +7 -0
- mlrun/common/schemas/function.py +7 -0
- mlrun/common/schemas/model_monitoring/__init__.py +4 -3
- mlrun/common/schemas/model_monitoring/constants.py +41 -26
- mlrun/common/schemas/model_monitoring/model_endpoints.py +23 -47
- mlrun/common/schemas/notification.py +69 -12
- mlrun/common/schemas/project.py +45 -12
- mlrun/common/schemas/workflow.py +10 -2
- mlrun/common/types.py +1 -0
- mlrun/config.py +91 -35
- mlrun/data_types/data_types.py +6 -1
- mlrun/data_types/spark.py +2 -2
- mlrun/data_types/to_pandas.py +57 -25
- mlrun/datastore/__init__.py +1 -0
- mlrun/datastore/alibaba_oss.py +3 -2
- mlrun/datastore/azure_blob.py +125 -37
- mlrun/datastore/base.py +42 -21
- mlrun/datastore/datastore.py +4 -2
- mlrun/datastore/datastore_profile.py +1 -1
- mlrun/datastore/dbfs_store.py +3 -7
- mlrun/datastore/filestore.py +1 -3
- mlrun/datastore/google_cloud_storage.py +85 -29
- mlrun/datastore/inmem.py +4 -1
- mlrun/datastore/redis.py +1 -0
- mlrun/datastore/s3.py +25 -12
- mlrun/datastore/sources.py +76 -4
- mlrun/datastore/spark_utils.py +30 -0
- mlrun/datastore/storeytargets.py +151 -0
- mlrun/datastore/targets.py +102 -131
- mlrun/datastore/v3io.py +1 -0
- mlrun/db/base.py +15 -6
- mlrun/db/httpdb.py +57 -28
- mlrun/db/nopdb.py +29 -5
- mlrun/errors.py +20 -3
- mlrun/execution.py +46 -5
- mlrun/feature_store/api.py +25 -1
- mlrun/feature_store/common.py +6 -11
- mlrun/feature_store/feature_vector.py +3 -1
- mlrun/feature_store/retrieval/job.py +4 -1
- mlrun/feature_store/retrieval/spark_merger.py +10 -39
- mlrun/feature_store/steps.py +8 -0
- mlrun/frameworks/_common/plan.py +3 -3
- mlrun/frameworks/_ml_common/plan.py +1 -1
- mlrun/frameworks/parallel_coordinates.py +2 -3
- mlrun/frameworks/sklearn/mlrun_interface.py +13 -3
- mlrun/k8s_utils.py +48 -2
- mlrun/launcher/client.py +6 -6
- mlrun/launcher/local.py +2 -2
- mlrun/model.py +215 -34
- mlrun/model_monitoring/api.py +38 -24
- mlrun/model_monitoring/applications/__init__.py +1 -2
- mlrun/model_monitoring/applications/_application_steps.py +60 -29
- mlrun/model_monitoring/applications/base.py +2 -174
- mlrun/model_monitoring/applications/context.py +197 -70
- mlrun/model_monitoring/applications/evidently_base.py +11 -85
- mlrun/model_monitoring/applications/histogram_data_drift.py +21 -16
- mlrun/model_monitoring/applications/results.py +4 -4
- mlrun/model_monitoring/controller.py +110 -282
- mlrun/model_monitoring/db/stores/__init__.py +8 -3
- mlrun/model_monitoring/db/stores/base/store.py +3 -0
- mlrun/model_monitoring/db/stores/sqldb/models/base.py +9 -7
- mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +18 -3
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +43 -23
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +48 -35
- mlrun/model_monitoring/db/tsdb/__init__.py +7 -2
- mlrun/model_monitoring/db/tsdb/base.py +147 -15
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +94 -55
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +0 -3
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +144 -38
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +44 -3
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +246 -57
- mlrun/model_monitoring/helpers.py +70 -50
- mlrun/model_monitoring/stream_processing.py +96 -195
- mlrun/model_monitoring/writer.py +13 -5
- mlrun/package/packagers/default_packager.py +2 -2
- mlrun/projects/operations.py +16 -8
- mlrun/projects/pipelines.py +126 -115
- mlrun/projects/project.py +286 -129
- mlrun/render.py +3 -3
- mlrun/run.py +38 -19
- mlrun/runtimes/__init__.py +19 -8
- mlrun/runtimes/base.py +4 -1
- mlrun/runtimes/daskjob.py +1 -1
- mlrun/runtimes/funcdoc.py +1 -1
- mlrun/runtimes/kubejob.py +6 -6
- mlrun/runtimes/local.py +12 -5
- mlrun/runtimes/nuclio/api_gateway.py +68 -8
- mlrun/runtimes/nuclio/application/application.py +307 -70
- mlrun/runtimes/nuclio/function.py +63 -14
- mlrun/runtimes/nuclio/serving.py +10 -10
- mlrun/runtimes/pod.py +25 -19
- mlrun/runtimes/remotesparkjob.py +2 -5
- mlrun/runtimes/sparkjob/spark3job.py +16 -17
- mlrun/runtimes/utils.py +34 -0
- mlrun/serving/routers.py +2 -5
- mlrun/serving/server.py +37 -19
- mlrun/serving/states.py +30 -3
- mlrun/serving/v2_serving.py +44 -35
- mlrun/track/trackers/mlflow_tracker.py +5 -0
- mlrun/utils/async_http.py +1 -1
- mlrun/utils/db.py +18 -0
- mlrun/utils/helpers.py +150 -36
- mlrun/utils/http.py +1 -1
- mlrun/utils/notifications/notification/__init__.py +0 -1
- mlrun/utils/notifications/notification/webhook.py +8 -1
- mlrun/utils/notifications/notification_pusher.py +1 -1
- mlrun/utils/v3io_clients.py +2 -2
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/METADATA +153 -66
- {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/RECORD +131 -134
- {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/WHEEL +1 -1
- mlrun/feature_store/retrieval/conversion.py +0 -271
- mlrun/model_monitoring/controller_handler.py +0 -37
- mlrun/model_monitoring/evidently_application.py +0 -20
- mlrun/model_monitoring/prometheus.py +0 -216
- {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/LICENSE +0 -0
- {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/top_level.txt +0 -0
mlrun/datastore/s3.py
CHANGED
@@ -15,11 +15,12 @@
 import time

 import boto3
+from boto3.s3.transfer import TransferConfig
 from fsspec.registry import get_filesystem_class

 import mlrun.errors

-from .base import DataStore, FileStats, get_range,
+from .base import DataStore, FileStats, get_range, make_datastore_schema_sanitizer


 class S3Store(DataStore):
@@ -35,11 +36,18 @@ class S3Store(DataStore):

         access_key_id = self._get_secret_or_env("AWS_ACCESS_KEY_ID")
         secret_key = self._get_secret_or_env("AWS_SECRET_ACCESS_KEY")
+        token_file = self._get_secret_or_env("AWS_CONTAINER_AUTHORIZATION_TOKEN_FILE")
         endpoint_url = self._get_secret_or_env("S3_ENDPOINT_URL")
         force_non_anonymous = self._get_secret_or_env("S3_NON_ANONYMOUS")
         profile_name = self._get_secret_or_env("AWS_PROFILE")
         assume_role_arn = self._get_secret_or_env("MLRUN_AWS_ROLE_ARN")

+        self.config = TransferConfig(
+            multipart_threshold=1024 * 1024 * 25,
+            max_concurrency=10,
+            multipart_chunksize=1024 * 1024 * 25,
+        )
+
         # If user asks to assume a role, this needs to go through the STS client and retrieve temporary creds
         if assume_role_arn:
             client = boto3.client(
@@ -87,14 +95,15 @@ class S3Store(DataStore):
             self.s3 = boto3.resource(
                 "s3", region_name=region, endpoint_url=endpoint_url
             )
-
-
-
-
-
-
-
-
+        if not token_file:
+            # If not using credentials, boto will still attempt to sign the requests, and will fail any operations
+            # due to no credentials found. These commands disable signing and allow anonymous mode (same as
+            # anon in the storage_options when working with fsspec).
+            from botocore.handlers import disable_signing
+
+            self.s3.meta.client.meta.events.register(
+                "choose-signer.s3.*", disable_signing
+            )

     def get_spark_options(self):
         res = {}
@@ -119,7 +128,7 @@ class S3Store(DataStore):
         except ImportError as exc:
             raise ImportError("AWS s3fs not installed") from exc
         filesystem_class = get_filesystem_class(protocol=self.kind)
-        self._filesystem =
+        self._filesystem = make_datastore_schema_sanitizer(
             filesystem_class,
             using_bucket=self.using_bucket,
             **self.get_storage_options(),
@@ -132,6 +141,7 @@ class S3Store(DataStore):
         endpoint_url = self._get_secret_or_env("S3_ENDPOINT_URL")
         access_key_id = self._get_secret_or_env("AWS_ACCESS_KEY_ID")
         secret = self._get_secret_or_env("AWS_SECRET_ACCESS_KEY")
+        token_file = self._get_secret_or_env("AWS_CONTAINER_AUTHORIZATION_TOKEN_FILE")

         if self._temp_credentials:
             access_key_id = self._temp_credentials["AccessKeyId"]
@@ -141,7 +151,7 @@ class S3Store(DataStore):
         token = None

         storage_options = dict(
-            anon=not (force_non_anonymous or (access_key_id and secret)),
+            anon=not (force_non_anonymous or (access_key_id and secret) or token_file),
             key=access_key_id,
             secret=secret,
             token=token,
@@ -166,7 +176,7 @@ class S3Store(DataStore):

     def upload(self, key, src_path):
         bucket, key = self.get_bucket_and_key(key)
-        self.s3.
+        self.s3.Bucket(bucket).upload_file(src_path, key, Config=self.config)

     def get(self, key, size=None, offset=0):
         bucket, key = self.get_bucket_and_key(key)
@@ -176,6 +186,7 @@ class S3Store(DataStore):
         return obj.get()["Body"].read()

     def put(self, key, data, append=False):
+        data, _ = self._prepare_put_data(data, append)
         bucket, key = self.get_bucket_and_key(key)
         self.s3.Object(bucket, key).put(Body=data)

@@ -201,6 +212,8 @@ class S3Store(DataStore):
     def rm(self, path, recursive=False, maxdepth=None):
         bucket, key = self.get_bucket_and_key(path)
         path = f"{bucket}/{key}"
+        # In order to raise an error if there is connection error, ML-7056.
+        self.filesystem.exists(path=path)
         self.filesystem.rm(path=path, recursive=recursive, maxdepth=maxdepth)

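For context, a minimal standalone sketch (not part of the package) of the two boto3 patterns this diff adds to S3Store; the bucket and file names below are placeholders:

# Illustrative sketch only: multipart upload tuning and anonymous access,
# mirroring the TransferConfig values and disable_signing call used above.
import boto3
from boto3.s3.transfer import TransferConfig
from botocore.handlers import disable_signing

# 25 MiB multipart threshold/chunk size, up to 10 concurrent parts.
config = TransferConfig(
    multipart_threshold=1024 * 1024 * 25,
    max_concurrency=10,
    multipart_chunksize=1024 * 1024 * 25,
)
s3 = boto3.resource("s3")
s3.Bucket("example-bucket").upload_file("/tmp/data.csv", "data.csv", Config=config)

# Without credentials (and without a container token file), boto would still try to
# sign requests and fail; disabling signing allows anonymous reads of public buckets.
anonymous_s3 = boto3.resource("s3")
anonymous_s3.meta.client.meta.events.register("choose-signer.s3.*", disable_signing)
print(anonymous_s3.Object("example-public-bucket", "data.csv").get()["Body"].read()[:100])
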
mlrun/datastore/sources.py
CHANGED
@@ -32,6 +32,7 @@ from mlrun.config import config
 from mlrun.datastore.snowflake_utils import get_snowflake_spark_options
 from mlrun.datastore.utils import transform_list_filters_to_tuple
 from mlrun.secrets import SecretsStore
+from mlrun.utils import logger

 from ..model import DataSource
 from ..platforms.iguazio import parse_path
@@ -85,7 +86,8 @@ class BaseSourceDriver(DataSource):
         )

         explicit_ack = (
-            is_explicit_ack_supported(context)
+            is_explicit_ack_supported(context)
+            and mlrun.mlconf.is_explicit_ack_enabled()
         )
         return storey.SyncEmitSource(
             context=context,
@@ -826,6 +828,20 @@ class SnowflakeSource(BaseSourceDriver):
         spark_options["query"] = self.attributes.get("query")
         return spark_options

+    def to_dataframe(
+        self,
+        columns=None,
+        df_module=None,
+        entities=None,
+        start_time=None,
+        end_time=None,
+        time_field=None,
+        additional_filters=None,
+    ):
+        raise mlrun.errors.MLRunRuntimeError(
+            f"{type(self).__name__} supports only spark engine"
+        )
+

 class CustomSource(BaseSourceDriver):
     kind = "custom"
@@ -930,7 +946,8 @@ class OnlineSource(BaseSourceDriver):

         source_args = self.attributes.get("source_args", {})
         explicit_ack = (
-            is_explicit_ack_supported(context)
+            is_explicit_ack_supported(context)
+            and mlrun.mlconf.is_explicit_ack_enabled()
         )
         # TODO: Change to AsyncEmitSource once we can drop support for nuclio<1.12.10
         src_class = storey.SyncEmitSource(
@@ -1015,7 +1032,8 @@ class StreamSource(OnlineSource):
         engine = "async"
         if hasattr(function.spec, "graph") and function.spec.graph.engine:
             engine = function.spec.graph.engine
-
+
+        if mlrun.mlconf.is_explicit_ack_enabled() and engine == "async":
             kwargs["explicit_ack_mode"] = "explicitOnly"
             kwargs["worker_allocation_mode"] = "static"

@@ -1102,7 +1120,8 @@ class KafkaSource(OnlineSource):
         engine = "async"
         if hasattr(function.spec, "graph") and function.spec.graph.engine:
             engine = function.spec.graph.engine
-
+
+        if mlrun.mlconf.is_explicit_ack_enabled() and engine == "async":
             explicit_ack_mode = "explicitOnly"
             extra_attributes["workerAllocationMode"] = extra_attributes.get(
                 "worker_allocation_mode", "static"
@@ -1145,6 +1164,59 @@ class KafkaSource(OnlineSource):
                 "to a Spark dataframe is not possible, as this operation is not supported by Spark"
             )

+    def create_topics(
+        self,
+        num_partitions: int = 4,
+        replication_factor: int = 1,
+        topics: list[str] = None,
+    ):
+        """
+        Create Kafka topics with the specified number of partitions and replication factor.
+
+        :param num_partitions: number of partitions for the topics
+        :param replication_factor: replication factor for the topics
+        :param topics: list of topic names to create, if None,
+                       the topics will be taken from the source attributes
+        """
+        from kafka.admin import KafkaAdminClient, NewTopic
+
+        brokers = self.attributes.get("brokers")
+        if not brokers:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "brokers must be specified in the KafkaSource attributes"
+            )
+        topics = topics or self.attributes.get("topics")
+        if not topics:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "topics must be specified in the KafkaSource attributes"
+            )
+        new_topics = [
+            NewTopic(topic, num_partitions, replication_factor) for topic in topics
+        ]
+        kafka_admin = KafkaAdminClient(
+            bootstrap_servers=brokers,
+            sasl_mechanism=self.attributes.get("sasl", {}).get("sasl_mechanism"),
+            sasl_plain_username=self.attributes.get("sasl", {}).get("username"),
+            sasl_plain_password=self.attributes.get("sasl", {}).get("password"),
+            sasl_kerberos_service_name=self.attributes.get("sasl", {}).get(
+                "sasl_kerberos_service_name", "kafka"
+            ),
+            sasl_kerberos_domain_name=self.attributes.get("sasl", {}).get(
+                "sasl_kerberos_domain_name"
+            ),
+            sasl_oauth_token_provider=self.attributes.get("sasl", {}).get("mechanism"),
+        )
+        try:
+            kafka_admin.create_topics(new_topics)
+        finally:
+            kafka_admin.close()
+        logger.info(
+            "Kafka topics created successfully",
+            topics=topics,
+            num_partitions=num_partitions,
+            replication_factor=replication_factor,
+        )
+

 class SQLSource(BaseSourceDriver):
     kind = "sqldb"
mlrun/datastore/spark_utils.py
CHANGED
@@ -13,7 +13,10 @@
 # limitations under the License.


+from typing import Union
+
 import mlrun
+from mlrun.features import Entity


 def spark_session_update_hadoop_options(session, spark_options) -> dict[str, str]:
@@ -35,3 +38,30 @@ def spark_session_update_hadoop_options(session, spark_options) -> dict[str, str
         else:
             non_hadoop_spark_options[key] = value
     return non_hadoop_spark_options
+
+
+def check_special_columns_exists(
+    spark_df, entities: list[Union[Entity, str]], timestamp_key: str, label_column: str
+):
+    columns = spark_df.columns
+    entities = entities or []
+    entities = [
+        entity.name if isinstance(entity, Entity) else entity for entity in entities
+    ]
+    missing_entities = [entity for entity in entities if entity not in columns]
+    cases_message = "Please check the letter cases (uppercase or lowercase)"
+    if missing_entities:
+        raise mlrun.errors.MLRunInvalidArgumentError(
+            f"There are missing entities from dataframe during ingestion. missing_entities: {missing_entities}."
+            f" {cases_message}"
+        )
+    if timestamp_key and timestamp_key not in columns:
+        raise mlrun.errors.MLRunInvalidArgumentError(
+            f"timestamp_key is missing from dataframe during ingestion. timestamp_key: {timestamp_key}."
+            f" {cases_message}"
+        )
+    if label_column and label_column not in columns:
+        raise mlrun.errors.MLRunInvalidArgumentError(
+            f"label_column is missing from dataframe during ingestion. label_column: {label_column}. "
+            f"{cases_message}"
+        )
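A minimal sketch of how the new check_special_columns_exists helper might be called during Spark ingestion; the Spark session, dataframe, and column names are illustrative:

from pyspark.sql import SparkSession

from mlrun.datastore.spark_utils import check_special_columns_exists

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(1, "2024-01-01 00:00:00", 0.7)], ["customer_id", "timestamp", "label"]
)

# Raises MLRunInvalidArgumentError if an entity, the timestamp_key, or the
# label_column is not present in the dataframe (e.g. a letter-case mismatch).
check_special_columns_exists(
    df, entities=["customer_id"], timestamp_key="timestamp", label_column="label"
)
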
mlrun/datastore/storeytargets.py
ADDED
@@ -0,0 +1,151 @@
+# Copyright 2024 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import storey
+from mergedeep import merge
+from storey import V3ioDriver
+
+import mlrun
+import mlrun.model_monitoring.helpers
+from mlrun.datastore.base import DataStore
+
+from ..platforms.iguazio import parse_path
+from .utils import (
+    parse_kafka_url,
+)
+
+"""
+Storey targets expect storage_options, which may contain credentials.
+To avoid passing it openly within the graph, we use wrapper classes.
+"""
+
+
+def get_url_and_storage_options(path, external_storage_options=None):
+    store, resolved_store_path, url = mlrun.store_manager.get_or_create_store(path)
+    storage_options = store.get_storage_options()
+    if storage_options and external_storage_options:
+        # merge external storage options with the store's storage options. storage_options takes precedence
+        storage_options = merge(external_storage_options, storage_options)
+    else:
+        storage_options = storage_options or external_storage_options
+    return url, DataStore._sanitize_storage_options(storage_options)
+
+
+class TDEngineStoreyTarget(storey.TDEngineTarget):
+    def __init__(self, *args, **kwargs):
+        kwargs["url"] = mlrun.model_monitoring.helpers.get_tsdb_connection_string()
+        super().__init__(*args, **kwargs)
+
+
+class StoreyTargetUtils:
+    @staticmethod
+    def process_args_and_kwargs(args, kwargs):
+        args = list(args)
+        path = args[0] if args else kwargs.get("path")
+        external_storage_options = kwargs.get("storage_options")
+
+        url, storage_options = get_url_and_storage_options(
+            path, external_storage_options
+        )
+
+        if storage_options:
+            kwargs["storage_options"] = storage_options
+        if args:
+            args[0] = url
+        if "path" in kwargs:
+            kwargs["path"] = url
+        return args, kwargs
+
+
+class ParquetStoreyTarget(storey.ParquetTarget):
+    def __init__(self, *args, **kwargs):
+        args, kwargs = StoreyTargetUtils.process_args_and_kwargs(args, kwargs)
+        super().__init__(*args, **kwargs)
+
+
+class CSVStoreyTarget(storey.CSVTarget):
+    def __init__(self, *args, **kwargs):
+        args, kwargs = StoreyTargetUtils.process_args_and_kwargs(args, kwargs)
+        super().__init__(*args, **kwargs)
+
+
+class StreamStoreyTarget(storey.StreamTarget):
+    def __init__(self, *args, **kwargs):
+        args = list(args)
+
+        uri = args[0] if args else kwargs.get("stream_path")
+
+        if not uri:
+            raise mlrun.errors.MLRunInvalidArgumentError("StreamTarget requires a path")
+
+        _, storage_options = get_url_and_storage_options(uri)
+        endpoint, path = parse_path(uri)
+
+        access_key = storage_options.get("v3io_access_key")
+        storage = V3ioDriver(
+            webapi=endpoint or mlrun.mlconf.v3io_api, access_key=access_key
+        )
+
+        if storage_options:
+            kwargs["storage"] = storage
+        if args:
+            args[0] = endpoint
+        if "stream_path" in kwargs:
+            kwargs["stream_path"] = path
+
+        super().__init__(*args, **kwargs)
+
+
+class KafkaStoreyTarget(storey.KafkaTarget):
+    def __init__(self, *args, **kwargs):
+        path = kwargs.pop("path")
+        attributes = kwargs.pop("attributes", None)
+        if path and path.startswith("ds://"):
+            datastore_profile = (
+                mlrun.datastore.datastore_profile.datastore_profile_read(path)
+            )
+            attributes = merge(attributes, datastore_profile.attributes())
+            brokers = attributes.pop(
+                "brokers", attributes.pop("bootstrap_servers", None)
+            )
+            topic = datastore_profile.topic
+        else:
+            brokers = attributes.pop(
+                "brokers", attributes.pop("bootstrap_servers", None)
+            )
+            topic, brokers = parse_kafka_url(path, brokers)
+
+        if not topic:
+            raise mlrun.errors.MLRunInvalidArgumentError("KafkaTarget requires a topic")
+        kwargs["brokers"] = brokers
+        kwargs["topic"] = topic
+        super().__init__(*args, **kwargs, **attributes)
+
+
+class NoSqlStoreyTarget(storey.NoSqlTarget):
+    pass
+
+
+class RedisNoSqlStoreyTarget(storey.NoSqlTarget):
+    def __init__(self, *args, **kwargs):
+        path = kwargs.pop("path")
+        endpoint, uri = mlrun.datastore.targets.RedisNoSqlTarget.get_server_endpoint(
+            path,
+            kwargs.pop("credentials_prefix", None),
+        )
+        kwargs["path"] = endpoint + "/" + uri
+        super().__init__(*args, **kwargs)
+
+
+class TSDBStoreyTarget(storey.TSDBTarget):
+    pass