mlrun 1.6.4rc2__py3-none-any.whl → 1.7.0rc20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of mlrun might be problematic.
- mlrun/__init__.py +11 -1
- mlrun/__main__.py +26 -112
- mlrun/alerts/__init__.py +15 -0
- mlrun/alerts/alert.py +144 -0
- mlrun/api/schemas/__init__.py +5 -4
- mlrun/artifacts/__init__.py +8 -3
- mlrun/artifacts/base.py +46 -257
- mlrun/artifacts/dataset.py +11 -192
- mlrun/artifacts/manager.py +47 -48
- mlrun/artifacts/model.py +31 -159
- mlrun/artifacts/plots.py +23 -380
- mlrun/common/constants.py +69 -0
- mlrun/common/db/sql_session.py +2 -3
- mlrun/common/formatters/__init__.py +19 -0
- mlrun/common/formatters/artifact.py +21 -0
- mlrun/common/formatters/base.py +78 -0
- mlrun/common/formatters/function.py +41 -0
- mlrun/common/formatters/pipeline.py +53 -0
- mlrun/common/formatters/project.py +51 -0
- mlrun/common/helpers.py +1 -2
- mlrun/common/model_monitoring/helpers.py +9 -5
- mlrun/{runtimes → common/runtimes}/constants.py +37 -9
- mlrun/common/schemas/__init__.py +24 -4
- mlrun/common/schemas/alert.py +203 -0
- mlrun/common/schemas/api_gateway.py +148 -0
- mlrun/common/schemas/artifact.py +18 -8
- mlrun/common/schemas/auth.py +11 -5
- mlrun/common/schemas/background_task.py +1 -1
- mlrun/common/schemas/client_spec.py +4 -1
- mlrun/common/schemas/feature_store.py +16 -16
- mlrun/common/schemas/frontend_spec.py +8 -7
- mlrun/common/schemas/function.py +5 -1
- mlrun/common/schemas/hub.py +11 -18
- mlrun/common/schemas/memory_reports.py +2 -2
- mlrun/common/schemas/model_monitoring/__init__.py +18 -3
- mlrun/common/schemas/model_monitoring/constants.py +83 -26
- mlrun/common/schemas/model_monitoring/grafana.py +13 -9
- mlrun/common/schemas/model_monitoring/model_endpoints.py +99 -16
- mlrun/common/schemas/notification.py +4 -4
- mlrun/common/schemas/object.py +2 -2
- mlrun/{runtimes/mpijob/v1alpha1.py → common/schemas/pagination.py} +10 -13
- mlrun/common/schemas/pipeline.py +1 -10
- mlrun/common/schemas/project.py +24 -23
- mlrun/common/schemas/runtime_resource.py +8 -12
- mlrun/common/schemas/schedule.py +3 -3
- mlrun/common/schemas/tag.py +1 -2
- mlrun/common/schemas/workflow.py +2 -2
- mlrun/common/types.py +7 -1
- mlrun/config.py +54 -17
- mlrun/data_types/to_pandas.py +10 -12
- mlrun/datastore/__init__.py +5 -8
- mlrun/datastore/alibaba_oss.py +130 -0
- mlrun/datastore/azure_blob.py +17 -5
- mlrun/datastore/base.py +62 -39
- mlrun/datastore/datastore.py +28 -9
- mlrun/datastore/datastore_profile.py +146 -20
- mlrun/datastore/filestore.py +0 -1
- mlrun/datastore/google_cloud_storage.py +6 -2
- mlrun/datastore/hdfs.py +56 -0
- mlrun/datastore/inmem.py +2 -2
- mlrun/datastore/redis.py +6 -2
- mlrun/datastore/s3.py +9 -0
- mlrun/datastore/snowflake_utils.py +43 -0
- mlrun/datastore/sources.py +201 -96
- mlrun/datastore/spark_utils.py +1 -2
- mlrun/datastore/store_resources.py +7 -7
- mlrun/datastore/targets.py +358 -104
- mlrun/datastore/utils.py +72 -58
- mlrun/datastore/v3io.py +5 -1
- mlrun/db/base.py +185 -35
- mlrun/db/factory.py +1 -1
- mlrun/db/httpdb.py +614 -179
- mlrun/db/nopdb.py +210 -26
- mlrun/errors.py +12 -1
- mlrun/execution.py +41 -24
- mlrun/feature_store/__init__.py +0 -2
- mlrun/feature_store/api.py +40 -72
- mlrun/feature_store/common.py +1 -1
- mlrun/feature_store/feature_set.py +76 -55
- mlrun/feature_store/feature_vector.py +28 -30
- mlrun/feature_store/ingestion.py +7 -6
- mlrun/feature_store/retrieval/base.py +16 -11
- mlrun/feature_store/retrieval/conversion.py +11 -13
- mlrun/feature_store/retrieval/dask_merger.py +2 -0
- mlrun/feature_store/retrieval/job.py +9 -3
- mlrun/feature_store/retrieval/local_merger.py +2 -0
- mlrun/feature_store/retrieval/spark_merger.py +34 -24
- mlrun/feature_store/steps.py +37 -34
- mlrun/features.py +9 -20
- mlrun/frameworks/_common/artifacts_library.py +9 -9
- mlrun/frameworks/_common/mlrun_interface.py +5 -5
- mlrun/frameworks/_common/model_handler.py +48 -48
- mlrun/frameworks/_common/plan.py +2 -3
- mlrun/frameworks/_common/producer.py +3 -4
- mlrun/frameworks/_common/utils.py +5 -5
- mlrun/frameworks/_dl_common/loggers/logger.py +6 -7
- mlrun/frameworks/_dl_common/loggers/mlrun_logger.py +9 -9
- mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +23 -47
- mlrun/frameworks/_ml_common/artifacts_library.py +1 -2
- mlrun/frameworks/_ml_common/loggers/logger.py +3 -4
- mlrun/frameworks/_ml_common/loggers/mlrun_logger.py +4 -5
- mlrun/frameworks/_ml_common/model_handler.py +24 -24
- mlrun/frameworks/_ml_common/pkl_model_server.py +2 -2
- mlrun/frameworks/_ml_common/plan.py +1 -1
- mlrun/frameworks/_ml_common/plans/calibration_curve_plan.py +2 -3
- mlrun/frameworks/_ml_common/plans/confusion_matrix_plan.py +2 -3
- mlrun/frameworks/_ml_common/plans/dataset_plan.py +3 -3
- mlrun/frameworks/_ml_common/plans/feature_importance_plan.py +3 -3
- mlrun/frameworks/_ml_common/plans/roc_curve_plan.py +4 -4
- mlrun/frameworks/_ml_common/utils.py +4 -4
- mlrun/frameworks/auto_mlrun/auto_mlrun.py +9 -9
- mlrun/frameworks/huggingface/model_server.py +4 -4
- mlrun/frameworks/lgbm/__init__.py +33 -33
- mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
- mlrun/frameworks/lgbm/callbacks/logging_callback.py +4 -5
- mlrun/frameworks/lgbm/callbacks/mlrun_logging_callback.py +4 -5
- mlrun/frameworks/lgbm/mlrun_interfaces/booster_mlrun_interface.py +1 -3
- mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +6 -6
- mlrun/frameworks/lgbm/model_handler.py +10 -10
- mlrun/frameworks/lgbm/model_server.py +6 -6
- mlrun/frameworks/lgbm/utils.py +5 -5
- mlrun/frameworks/onnx/dataset.py +8 -8
- mlrun/frameworks/onnx/mlrun_interface.py +3 -3
- mlrun/frameworks/onnx/model_handler.py +6 -6
- mlrun/frameworks/onnx/model_server.py +7 -7
- mlrun/frameworks/parallel_coordinates.py +4 -3
- mlrun/frameworks/pytorch/__init__.py +18 -18
- mlrun/frameworks/pytorch/callbacks/callback.py +4 -5
- mlrun/frameworks/pytorch/callbacks/logging_callback.py +17 -17
- mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +11 -11
- mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +23 -29
- mlrun/frameworks/pytorch/callbacks_handler.py +38 -38
- mlrun/frameworks/pytorch/mlrun_interface.py +20 -20
- mlrun/frameworks/pytorch/model_handler.py +17 -17
- mlrun/frameworks/pytorch/model_server.py +7 -7
- mlrun/frameworks/sklearn/__init__.py +13 -13
- mlrun/frameworks/sklearn/estimator.py +4 -4
- mlrun/frameworks/sklearn/metrics_library.py +14 -14
- mlrun/frameworks/sklearn/mlrun_interface.py +3 -6
- mlrun/frameworks/sklearn/model_handler.py +2 -2
- mlrun/frameworks/tf_keras/__init__.py +10 -7
- mlrun/frameworks/tf_keras/callbacks/logging_callback.py +15 -15
- mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +11 -11
- mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +19 -23
- mlrun/frameworks/tf_keras/mlrun_interface.py +9 -11
- mlrun/frameworks/tf_keras/model_handler.py +14 -14
- mlrun/frameworks/tf_keras/model_server.py +6 -6
- mlrun/frameworks/xgboost/__init__.py +13 -13
- mlrun/frameworks/xgboost/model_handler.py +6 -6
- mlrun/k8s_utils.py +14 -16
- mlrun/launcher/__init__.py +1 -1
- mlrun/launcher/base.py +16 -15
- mlrun/launcher/client.py +8 -6
- mlrun/launcher/factory.py +1 -1
- mlrun/launcher/local.py +17 -11
- mlrun/launcher/remote.py +16 -10
- mlrun/lists.py +7 -6
- mlrun/model.py +238 -73
- mlrun/model_monitoring/__init__.py +1 -1
- mlrun/model_monitoring/api.py +138 -315
- mlrun/model_monitoring/application.py +5 -296
- mlrun/model_monitoring/applications/__init__.py +24 -0
- mlrun/model_monitoring/applications/_application_steps.py +157 -0
- mlrun/model_monitoring/applications/base.py +282 -0
- mlrun/model_monitoring/applications/context.py +214 -0
- mlrun/model_monitoring/applications/evidently_base.py +211 -0
- mlrun/model_monitoring/applications/histogram_data_drift.py +349 -0
- mlrun/model_monitoring/applications/results.py +99 -0
- mlrun/model_monitoring/controller.py +104 -84
- mlrun/model_monitoring/controller_handler.py +13 -5
- mlrun/model_monitoring/db/__init__.py +18 -0
- mlrun/model_monitoring/{stores → db/stores}/__init__.py +43 -36
- mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
- mlrun/model_monitoring/{stores/model_endpoint_store.py → db/stores/base/store.py} +64 -40
- mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
- mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +71 -0
- mlrun/model_monitoring/{stores → db/stores/sqldb}/models/base.py +109 -5
- mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +88 -0
- mlrun/model_monitoring/{stores/models/mysql.py → db/stores/sqldb/models/sqlite.py} +19 -13
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +684 -0
- mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
- mlrun/model_monitoring/{stores/kv_model_endpoint_store.py → db/stores/v3io_kv/kv_store.py} +310 -165
- mlrun/model_monitoring/db/tsdb/__init__.py +100 -0
- mlrun/model_monitoring/db/tsdb/base.py +329 -0
- mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
- mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +240 -0
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +45 -0
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +397 -0
- mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +117 -0
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +630 -0
- mlrun/model_monitoring/evidently_application.py +6 -118
- mlrun/model_monitoring/features_drift_table.py +134 -106
- mlrun/model_monitoring/helpers.py +127 -28
- mlrun/model_monitoring/metrics/__init__.py +13 -0
- mlrun/model_monitoring/metrics/histogram_distance.py +127 -0
- mlrun/model_monitoring/model_endpoint.py +3 -2
- mlrun/model_monitoring/prometheus.py +1 -4
- mlrun/model_monitoring/stream_processing.py +62 -231
- mlrun/model_monitoring/tracking_policy.py +9 -2
- mlrun/model_monitoring/writer.py +152 -124
- mlrun/package/__init__.py +6 -6
- mlrun/package/context_handler.py +5 -5
- mlrun/package/packager.py +7 -7
- mlrun/package/packagers/default_packager.py +6 -6
- mlrun/package/packagers/numpy_packagers.py +15 -15
- mlrun/package/packagers/pandas_packagers.py +5 -5
- mlrun/package/packagers/python_standard_library_packagers.py +10 -10
- mlrun/package/packagers_manager.py +19 -23
- mlrun/package/utils/_formatter.py +6 -6
- mlrun/package/utils/_pickler.py +2 -2
- mlrun/package/utils/_supported_format.py +4 -4
- mlrun/package/utils/log_hint_utils.py +2 -2
- mlrun/package/utils/type_hint_utils.py +4 -9
- mlrun/platforms/__init__.py +11 -10
- mlrun/platforms/iguazio.py +24 -203
- mlrun/projects/operations.py +35 -21
- mlrun/projects/pipelines.py +68 -99
- mlrun/projects/project.py +830 -266
- mlrun/render.py +3 -11
- mlrun/run.py +162 -166
- mlrun/runtimes/__init__.py +62 -7
- mlrun/runtimes/base.py +39 -32
- mlrun/runtimes/daskjob.py +8 -8
- mlrun/runtimes/databricks_job/databricks_cancel_task.py +1 -1
- mlrun/runtimes/databricks_job/databricks_runtime.py +7 -7
- mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
- mlrun/runtimes/funcdoc.py +0 -28
- mlrun/runtimes/function_reference.py +1 -1
- mlrun/runtimes/kubejob.py +28 -122
- mlrun/runtimes/local.py +6 -3
- mlrun/runtimes/mpijob/__init__.py +0 -20
- mlrun/runtimes/mpijob/abstract.py +9 -10
- mlrun/runtimes/mpijob/v1.py +1 -1
- mlrun/{model_monitoring/stores/models/sqlite.py → runtimes/nuclio/__init__.py} +7 -9
- mlrun/runtimes/nuclio/api_gateway.py +709 -0
- mlrun/runtimes/nuclio/application/__init__.py +15 -0
- mlrun/runtimes/nuclio/application/application.py +523 -0
- mlrun/runtimes/nuclio/application/reverse_proxy.go +95 -0
- mlrun/runtimes/{function.py → nuclio/function.py} +112 -73
- mlrun/runtimes/{nuclio.py → nuclio/nuclio.py} +6 -6
- mlrun/runtimes/{serving.py → nuclio/serving.py} +45 -51
- mlrun/runtimes/pod.py +286 -88
- mlrun/runtimes/remotesparkjob.py +2 -2
- mlrun/runtimes/sparkjob/spark3job.py +51 -34
- mlrun/runtimes/utils.py +7 -75
- mlrun/secrets.py +9 -5
- mlrun/serving/remote.py +2 -7
- mlrun/serving/routers.py +13 -10
- mlrun/serving/server.py +22 -26
- mlrun/serving/states.py +99 -25
- mlrun/serving/utils.py +3 -3
- mlrun/serving/v1_serving.py +6 -7
- mlrun/serving/v2_serving.py +59 -20
- mlrun/track/tracker.py +2 -1
- mlrun/track/tracker_manager.py +3 -3
- mlrun/track/trackers/mlflow_tracker.py +1 -2
- mlrun/utils/async_http.py +5 -7
- mlrun/utils/azure_vault.py +1 -1
- mlrun/utils/clones.py +1 -2
- mlrun/utils/condition_evaluator.py +3 -3
- mlrun/utils/db.py +3 -3
- mlrun/utils/helpers.py +183 -197
- mlrun/utils/http.py +2 -5
- mlrun/utils/logger.py +76 -14
- mlrun/utils/notifications/notification/__init__.py +17 -12
- mlrun/utils/notifications/notification/base.py +14 -2
- mlrun/utils/notifications/notification/console.py +2 -0
- mlrun/utils/notifications/notification/git.py +3 -1
- mlrun/utils/notifications/notification/ipython.py +3 -1
- mlrun/utils/notifications/notification/slack.py +101 -21
- mlrun/utils/notifications/notification/webhook.py +11 -1
- mlrun/utils/notifications/notification_pusher.py +155 -30
- mlrun/utils/retryer.py +208 -0
- mlrun/utils/singleton.py +1 -1
- mlrun/utils/v3io_clients.py +2 -4
- mlrun/utils/version/version.json +2 -2
- mlrun/utils/version/version.py +2 -6
- {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/METADATA +31 -19
- mlrun-1.7.0rc20.dist-info/RECORD +353 -0
- mlrun/kfpops.py +0 -868
- mlrun/model_monitoring/batch.py +0 -1095
- mlrun/model_monitoring/stores/models/__init__.py +0 -27
- mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -384
- mlrun/platforms/other.py +0 -306
- mlrun-1.6.4rc2.dist-info/RECORD +0 -314
- {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/LICENSE +0 -0
- {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/WHEEL +0 -0
- {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/entry_points.txt +0 -0
- {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/top_level.txt +0 -0
mlrun/datastore/redis.py
CHANGED

@@ -31,7 +31,7 @@ class RedisStore(DataStore):
     """

     def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
-
+        redis_default_port = "6379"
         super().__init__(parent, name, schema, endpoint, secrets=secrets)
         self.headers = None

@@ -49,7 +49,7 @@ class RedisStore(DataStore):
         user = self._get_secret_or_env("REDIS_USER", "", credentials_prefix)
         password = self._get_secret_or_env("REDIS_PASSWORD", "", credentials_prefix)
         host = parsed_endpoint.hostname
-        port = parsed_endpoint.port if parsed_endpoint.port else
+        port = parsed_endpoint.port if parsed_endpoint.port else redis_default_port
         schema = parsed_endpoint.scheme
         if user or password:
             endpoint = f"{schema}://{user}:{password}@{host}:{port}"

@@ -163,3 +163,7 @@ class RedisStore(DataStore):
             self.redis.delete(k)
         else:
             self.redis.delete(key)
+
+    @property
+    def spark_url(self):
+        return ""
mlrun/datastore/s3.py
CHANGED

@@ -156,6 +156,10 @@ class S3Store(DataStore):

         return self._sanitize_storage_options(storage_options)

+    @property
+    def spark_url(self):
+        return f"s3a://{self.endpoint}"
+
     def get_bucket_and_key(self, key):
         path = self._join(key)[1:]
         return self.endpoint, path

@@ -194,6 +198,11 @@ class S3Store(DataStore):
         bucket = self.s3.Bucket(bucket)
         return [obj.key[key_length:] for obj in bucket.objects.filter(Prefix=key)]

+    def rm(self, path, recursive=False, maxdepth=None):
+        bucket, key = self.get_bucket_and_key(path)
+        path = f"{bucket}/{key}"
+        self.filesystem.rm(path=path, recursive=recursive, maxdepth=maxdepth)
+

 def parse_s3_bucket_and_key(s3_path):
     try:
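The new spark_url property ("s3a://<bucket>") is what the reworked get_spark_options in sources.py below composes with the object sub-path, and rm() delegates deletion to the store's fsspec filesystem. A minimal sketch of how the pieces fit together, assuming AWS credentials are already configured and using a made-up bucket and key:

    import mlrun

    # hypothetical S3 object; get_or_create_store now returns (store, sub-path, url)
    store, path, _ = mlrun.store_manager.get_or_create_store(
        "s3://my-bucket/data/measurements.csv"
    )

    # spark_url is "s3a://my-bucket", so the Spark read path becomes
    # "s3a://my-bucket/data/measurements.csv"
    spark_path = store.spark_url + path

    # rm() removes the object through the underlying fsspec filesystem
    store.rm(path, recursive=False)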
mlrun/datastore/snowflake_utils.py
ADDED

@@ -0,0 +1,43 @@
+# Copyright 2024 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import mlrun
+
+
+def get_snowflake_password():
+    key = "SNOWFLAKE_PASSWORD"
+    snowflake_password = mlrun.get_secret_or_env(key)
+
+    if not snowflake_password:
+        raise mlrun.errors.MLRunInvalidArgumentError(
+            f"No password provided. Set password using the {key} "
+            "project secret or environment variable."
+        )
+
+    return snowflake_password
+
+
+def get_snowflake_spark_options(attributes):
+    return {
+        "format": "net.snowflake.spark.snowflake",
+        "sfURL": attributes.get("url"),
+        "sfUser": attributes.get("user"),
+        "sfPassword": get_snowflake_password(),
+        "sfDatabase": attributes.get("database"),
+        "sfSchema": attributes.get("schema"),
+        "sfWarehouse": attributes.get("warehouse"),
+        "application": "iguazio_platform",
+        "TIMESTAMP_TYPE_MAPPING": "TIMESTAMP_LTZ",
+    }
mlrun/datastore/sources.py
CHANGED

@@ -12,12 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import json
+import operator
 import os
 import warnings
 from base64 import b64encode
 from copy import copy
 from datetime import datetime
-from typing import
+from typing import Optional, Union

 import pandas as pd
 import semver

@@ -27,9 +28,11 @@ from nuclio import KafkaTrigger
 from nuclio.config import split_path

 import mlrun
+from mlrun.config import config
+from mlrun.datastore.snowflake_utils import get_snowflake_spark_options
+from mlrun.datastore.utils import transform_list_filters_to_tuple
 from mlrun.secrets import SecretsStore

-from ..config import config
 from ..model import DataSource
 from ..platforms.iguazio import parse_path
 from ..utils import get_class, is_explicit_ack_supported

@@ -39,7 +42,6 @@ from .utils import (
     _generate_sql_query_with_time_filter,
     filter_df_start_end_time,
     select_columns_from_df,
-    store_path_to_spark,
 )


@@ -102,8 +104,12 @@ class BaseSourceDriver(DataSource):
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
         """return the source data as dataframe"""
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
         return mlrun.store_manager.object(url=self.path).as_df(
             columns=columns,
             df_module=df_module,

@@ -114,7 +120,11 @@ class BaseSourceDriver(DataSource):

     def to_spark_df(self, session, named_view=False, time_field=None, columns=None):
         if self.support_spark:
-
+            spark_options = self.get_spark_options()
+            spark_format = spark_options.pop("format", None)
+            df = load_spark_dataframe_with_options(
+                session, spark_options, format=spark_format
+            )
             if named_view:
                 df.createOrReplaceTempView(self.name)
             return self._filter_spark_df(df, time_field, columns)

@@ -170,10 +180,10 @@ class CSVSource(BaseSourceDriver):
         self,
         name: str = "",
         path: str = None,
-        attributes:
+        attributes: dict[str, object] = None,
         key_field: str = None,
         schedule: str = None,
-        parse_dates: Union[None, int, str,
+        parse_dates: Union[None, int, str, list[int], list[str]] = None,
         **kwargs,
     ):
         super().__init__(name, path, attributes, key_field, schedule=schedule, **kwargs)

@@ -193,14 +203,10 @@ class CSVSource(BaseSourceDriver):
             parse_dates.append(time_field)

         data_item = mlrun.store_manager.object(self.path)
-
-        store, path = mlrun.store_manager.get_or_create_store(self.path)
-        path = store.url + path
-        else:
-            path = data_item.url
+        store, path, url = mlrun.store_manager.get_or_create_store(self.path)

         return storey.CSVSource(
-            paths=
+            paths=url,  # unlike self.path, it already has store:// replaced
             build_dict=True,
             key_field=self.key_field or key_field,
             storage_options=data_item.store.get_storage_options(),

@@ -209,25 +215,17 @@ class CSVSource(BaseSourceDriver):
         )

     def get_spark_options(self):
-
-
-
-
-
-            "path": store_path_to_spark(path, storage_spark_options),
-            "format": "csv",
-            "header": "true",
-            "inferSchema": "true",
-        }
-
-        return {**result, **storage_spark_options}
-        else:
-            return {
-                "path": store_path_to_spark(self.path),
+        store, path, _ = mlrun.store_manager.get_or_create_store(self.path)
+        spark_options = store.get_spark_options()
+        spark_options.update(
+            {
+                "path": store.spark_url + path,
                 "format": "csv",
                 "header": "true",
                 "inferSchema": "true",
             }
+        )
+        return spark_options

     def to_spark_df(self, session, named_view=False, time_field=None, columns=None):
         import pyspark.sql.functions as funcs

@@ -253,7 +251,11 @@ class CSVSource(BaseSourceDriver):
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
         reader_args = self.attributes.get("reader_args", {})
         return mlrun.store_manager.object(url=self.path).as_df(
             columns=columns,

@@ -289,6 +291,12 @@ class ParquetSource(BaseSourceDriver):
     :parameter start_time: filters out data before this time
     :parameter end_time: filters out data after this time
     :parameter attributes: additional parameters to pass to storey.
+    :param additional_filters: List of additional_filter conditions as tuples.
+        Each tuple should be in the format (column_name, operator, value).
+        Supported operators: "=", ">=", "<=", ">", "<".
+        Example: [("Product", "=", "Computer")]
+        For all supported filters, please see:
+        https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetDataset.html
     """

     kind = "parquet"
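Putting the documented parameter to use, a short illustration with a made-up dataset path and columns (it extends the docstring's own example):

    from mlrun.datastore.sources import ParquetSource

    # hypothetical parquet dataset; the two filter tuples are ANDed together
    source = ParquetSource(
        "sales",
        path="s3://my-bucket/sales.parquet",
        additional_filters=[("Product", "=", "Computer"), ("Price", ">", 100)],
    )
    df = source.to_dataframe()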
@@ -299,13 +307,19 @@ class ParquetSource(BaseSourceDriver):
         self,
         name: str = "",
         path: str = None,
-        attributes:
+        attributes: dict[str, object] = None,
         key_field: str = None,
         time_field: str = None,
         schedule: str = None,
         start_time: Optional[Union[datetime, str]] = None,
         end_time: Optional[Union[datetime, str]] = None,
+        additional_filters: Optional[list[Union[tuple, list]]] = None,
     ):
+        if additional_filters:
+            attributes = copy(attributes) or {}
+            additional_filters = transform_list_filters_to_tuple(additional_filters)
+            attributes["additional_filters"] = additional_filters
+
         super().__init__(
             name,
             path,

@@ -333,6 +347,10 @@ class ParquetSource(BaseSourceDriver):
     def end_time(self, end_time):
         self._end_time = self._convert_to_datetime(end_time)

+    @property
+    def additional_filters(self):
+        return self.attributes.get("additional_filters")
+
     @staticmethod
     def _convert_to_datetime(time):
         if time and isinstance(time, str):

@@ -349,45 +367,48 @@ class ParquetSource(BaseSourceDriver):
         start_time=None,
         end_time=None,
         context=None,
+        additional_filters=None,
     ):
         import storey

-        attributes = self.attributes
+        attributes = copy(self.attributes)
+        attributes.pop("additional_filters", None)
         if context:
             attributes["context"] = context
-
+        additional_filters = transform_list_filters_to_tuple(additional_filters)
         data_item = mlrun.store_manager.object(self.path)
-
-        store, path = mlrun.store_manager.get_or_create_store(self.path)
-        path = store.url + path
-        else:
-            path = data_item.url
-
+        store, path, url = mlrun.store_manager.get_or_create_store(self.path)
         return storey.ParquetSource(
-            paths=
+            paths=url,  # unlike self.path, it already has store:// replaced
             key_field=self.key_field or key_field,
             storage_options=data_item.store.get_storage_options(),
             end_filter=self.end_time,
             start_filter=self.start_time,
             filter_column=self.time_field or time_field,
+            additional_filters=self.additional_filters or additional_filters,
             **attributes,
         )

+    @classmethod
+    def from_dict(cls, struct=None, fields=None, deprecated_fields: dict = None):
+        new_obj = super().from_dict(
+            struct=struct, fields=fields, deprecated_fields=deprecated_fields
+        )
+        new_obj.attributes["additional_filters"] = transform_list_filters_to_tuple(
+            new_obj.additional_filters
+        )
+        return new_obj
+
     def get_spark_options(self):
-
-
-
-
-
-            "path": store_path_to_spark(path, storage_spark_options),
-            "format": "parquet",
-        }
-        return {**result, **storage_spark_options}
-        else:
-            return {
-                "path": store_path_to_spark(self.path),
+        store, path, _ = mlrun.store_manager.get_or_create_store(self.path)
+        spark_options = store.get_spark_options()
+        spark_options.update(
+            {
+                "path": store.spark_url + path,
                 "format": "parquet",
             }
+        )
+        return spark_options

     def to_dataframe(
         self,

@@ -397,8 +418,10 @@ class ParquetSource(BaseSourceDriver):
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
         reader_args = self.attributes.get("reader_args", {})
+        additional_filters = transform_list_filters_to_tuple(additional_filters)
         return mlrun.store_manager.object(url=self.path).as_df(
             columns=columns,
             df_module=df_module,

@@ -406,9 +429,88 @@ class ParquetSource(BaseSourceDriver):
             end_time=end_time or self.end_time,
             time_column=time_field or self.time_field,
             format="parquet",
+            additional_filters=additional_filters or self.additional_filters,
             **reader_args,
         )

+    def _build_spark_additional_filters(self, column_types: dict):
+        if not self.additional_filters:
+            return None
+        from pyspark.sql.functions import col, isnan, lit
+
+        operators = {
+            "==": operator.eq,
+            "=": operator.eq,
+            ">": operator.gt,
+            "<": operator.lt,
+            ">=": operator.ge,
+            "<=": operator.le,
+            "!=": operator.ne,
+        }
+
+        spark_filter = None
+        new_filter = lit(True)
+        for filter_tuple in self.additional_filters:
+            if not filter_tuple:
+                continue
+            col_name, op, value = filter_tuple
+            if op.lower() in ("in", "not in") and isinstance(value, (list, tuple, set)):
+                none_exists = False
+                value = list(value)
+                for sub_value in value:
+                    if sub_value is None:
+                        value.remove(sub_value)
+                        none_exists = True
+                if none_exists:
+                    filter_nan = column_types[col_name] not in ("timestamp", "date")
+                    if value:
+                        if op.lower() == "in":
+                            new_filter = (
+                                col(col_name).isin(value) | col(col_name).isNull()
+                            )
+                            if filter_nan:
+                                new_filter = new_filter | isnan(col(col_name))
+
+                        else:
+                            new_filter = (
+                                ~col(col_name).isin(value) & ~col(col_name).isNull()
+                            )
+                            if filter_nan:
+                                new_filter = new_filter & ~isnan(col(col_name))
+                    else:
+                        if op.lower() == "in":
+                            new_filter = col(col_name).isNull()
+                            if filter_nan:
+                                new_filter = new_filter | isnan(col(col_name))
+                        else:
+                            new_filter = ~col(col_name).isNull()
+                            if filter_nan:
+                                new_filter = new_filter & ~isnan(col(col_name))
+            else:
+                if op.lower() == "in":
+                    new_filter = col(col_name).isin(value)
+                elif op.lower() == "not in":
+                    new_filter = ~col(col_name).isin(value)
+                elif op in operators:
+                    new_filter = operators[op](col(col_name), value)
+                else:
+                    raise mlrun.errors.MLRunInvalidArgumentError(
+                        f"unsupported filter operator: {op}"
+                    )
+            if spark_filter is not None:
+                spark_filter = spark_filter & new_filter
+            else:
+                spark_filter = new_filter
+        return spark_filter
+
+    def _filter_spark_df(self, df, time_field=None, columns=None):
+        spark_additional_filters = self._build_spark_additional_filters(
+            column_types=dict(df.dtypes)
+        )
+        if spark_additional_filters is not None:
+            df = df.filter(spark_additional_filters)
+        return super()._filter_spark_df(df=df, time_field=time_field, columns=columns)
+

 class BigQuerySource(BaseSourceDriver):
     """
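The filters assembled by _build_spark_additional_filters are combined with a logical AND. For reference, additional_filters=[("Product", "=", "Computer"), ("Price", ">", 100)] behaves roughly like the hand-written PySpark filter below (a sketch of the equivalent expression, not the generated code itself):

    from pyspark.sql.functions import col

    df = df.filter((col("Product") == "Computer") & (col("Price") > 100))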
@@ -423,12 +525,17 @@ class BigQuerySource(BaseSourceDriver):

     # use sql query
     query_string = "SELECT * FROM `the-psf.pypi.downloads20210328` LIMIT 5000"
-    source = BigQuerySource(
-
-
+    source = BigQuerySource(
+        "bq1",
+        query=query_string,
+        gcp_project="my_project",
+        materialization_dataset="dataviews",
+    )

     # read a table
-    source = BigQuerySource(
+    source = BigQuerySource(
+        "bq2", table="the-psf.pypi.downloads20210328", gcp_project="my_project"
+    )


     :parameter name: source name

@@ -531,10 +638,15 @@
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
         from google.cloud import bigquery
         from google.cloud.bigquery_storage_v1 import BigQueryReadClient

+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
+
         def schema_to_dtypes(schema):
             from mlrun.data_types.data_types import gbq_to_pandas_dtype

@@ -574,7 +686,6 @@ class BigQuerySource(BaseSourceDriver):
         else:
             df = rows_iterator.to_dataframe(dtypes=dtypes)

-        # TODO : filter as part of the query
         return select_columns_from_df(
             filter_df_start_end_time(
                 df,

@@ -695,32 +806,10 @@ class SnowflakeSource(BaseSourceDriver):
             **kwargs,
         )

-    def _get_password(self):
-        key = "SNOWFLAKE_PASSWORD"
-        snowflake_password = os.getenv(key) or os.getenv(
-            SecretsStore.k8s_env_variable_name_for_secret(key)
-        )
-
-        if not snowflake_password:
-            raise mlrun.errors.MLRunInvalidArgumentError(
-                "No password provided. Set password using the SNOWFLAKE_PASSWORD "
-                "project secret or environment variable."
-            )
-
-        return snowflake_password
-
     def get_spark_options(self):
-
-
-
-            "sfURL": self.attributes.get("url"),
-            "sfUser": self.attributes.get("user"),
-            "sfPassword": self._get_password(),
-            "sfDatabase": self.attributes.get("database"),
-            "sfSchema": self.attributes.get("schema"),
-            "sfWarehouse": self.attributes.get("warehouse"),
-            "application": "iguazio_platform",
-        }
+        spark_options = get_snowflake_spark_options(self.attributes)
+        spark_options["query"] = self.attributes.get("query")
+        return spark_options


 class CustomSource(BaseSourceDriver):

@@ -774,7 +863,19 @@ class DataFrameSource:
             context=self.context or context,
         )

-    def to_dataframe(
+    def to_dataframe(
+        self,
+        columns=None,
+        df_module=None,
+        entities=None,
+        start_time=None,
+        end_time=None,
+        time_field=None,
+        additional_filters=None,
+    ):
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
         return self._df

     def is_iterator(self):

@@ -800,7 +901,7 @@ class OnlineSource(BaseSourceDriver):
         self,
         name: str = None,
         path: str = None,
-        attributes:
+        attributes: dict[str, object] = None,
         key_field: str = None,
         time_field: str = None,
         workers: int = None,

@@ -812,16 +913,12 @@
     def to_step(self, key_field=None, time_field=None, context=None):
         import storey

-        source_class = (
-            storey.AsyncEmitSource
-            if config.datastore.async_source_mode == "enabled"
-            else storey.SyncEmitSource
-        )
         source_args = self.attributes.get("source_args", {})
         explicit_ack = (
             is_explicit_ack_supported(context) and mlrun.mlconf.is_explicit_ack()
         )
-
+        # TODO: Change to AsyncEmitSource once we can drop support for nuclio<1.12.10
+        src_class = storey.SyncEmitSource(
             context=context,
             key_field=self.key_field or key_field,
             full_event=True,

@@ -848,8 +945,6 @@ class HttpSource(OnlineSource):


 class StreamSource(OnlineSource):
-    """Sets stream source for the flow. If stream doesn't exist it will create it"""
-
     kind = "v3ioStream"

     def __init__(

@@ -863,7 +958,7 @@ class StreamSource(OnlineSource):
         **kwargs,
     ):
         """
-        Sets stream source for the flow. If stream doesn't exist it will create it
+        Sets the stream source for the flow. If the stream doesn't exist it will create it.

         :param name: stream name. Default "stream"
         :param group: consumer group. Default "serving"

@@ -882,8 +977,15 @@
         super().__init__(name, attributes=attrs, **kwargs)

     def add_nuclio_trigger(self, function):
-
-
+        store, _, url = mlrun.store_manager.get_or_create_store(self.path)
+        if store.kind != "v3io":
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "Only profiles that reference the v3io datastore can be used with StreamSource"
+            )
+        storage_options = store.get_storage_options()
+        access_key = storage_options.get("v3io_access_key")
+        endpoint, stream_path = parse_path(url)
+        v3io_client = v3io.dataplane.Client(endpoint=endpoint, access_key=access_key)
         container, stream_path = split_path(stream_path)
         res = v3io_client.stream.create(
             container=container,

@@ -903,7 +1005,7 @@ class StreamSource(OnlineSource):
             kwargs["worker_allocation_mode"] = "static"

         function.add_v3io_stream_trigger(
-
+            url,
             self.name,
             self.attributes["group"],
             self.attributes["seek_to"],

@@ -915,8 +1017,6 @@ class StreamSource(OnlineSource):


 class KafkaSource(OnlineSource):
-    """Sets kafka source for the flow"""
-
     kind = "kafka"

     def __init__(

@@ -970,6 +1070,7 @@ class KafkaSource(OnlineSource):
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
         raise mlrun.MLRunInvalidArgumentError(
             "KafkaSource does not support batch processing"

@@ -1047,7 +1148,7 @@ class SQLSource(BaseSourceDriver):
         db_url: str = None,
         table_name: str = None,
         spark_options: dict = None,
-        parse_dates:
+        parse_dates: list[str] = None,
         **kwargs,
     ):
         """

@@ -1110,9 +1211,13 @@
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
         import sqlalchemy as sqlalchemy

+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
         db_path = self.attributes.get("db_path")
         table_name = self.attributes.get("table_name")
         parse_dates = self.attributes.get("parse_dates")
mlrun/datastore/spark_utils.py
CHANGED

@@ -12,12 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import Dict

 import mlrun


-def spark_session_update_hadoop_options(session, spark_options) ->
+def spark_session_update_hadoop_options(session, spark_options) -> dict[str, str]:
     hadoop_conf = session.sparkContext._jsc.hadoopConfiguration()
     non_hadoop_spark_options = {}
