mlrun 1.6.4rc7__py3-none-any.whl → 1.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlrun/__init__.py +11 -1
- mlrun/__main__.py +40 -122
- mlrun/alerts/__init__.py +15 -0
- mlrun/alerts/alert.py +248 -0
- mlrun/api/schemas/__init__.py +5 -4
- mlrun/artifacts/__init__.py +8 -3
- mlrun/artifacts/base.py +47 -257
- mlrun/artifacts/dataset.py +11 -192
- mlrun/artifacts/manager.py +79 -47
- mlrun/artifacts/model.py +31 -159
- mlrun/artifacts/plots.py +23 -380
- mlrun/common/constants.py +74 -1
- mlrun/common/db/sql_session.py +5 -5
- mlrun/common/formatters/__init__.py +21 -0
- mlrun/common/formatters/artifact.py +45 -0
- mlrun/common/formatters/base.py +113 -0
- mlrun/common/formatters/feature_set.py +33 -0
- mlrun/common/formatters/function.py +46 -0
- mlrun/common/formatters/pipeline.py +53 -0
- mlrun/common/formatters/project.py +51 -0
- mlrun/common/formatters/run.py +29 -0
- mlrun/common/helpers.py +12 -3
- mlrun/common/model_monitoring/helpers.py +9 -5
- mlrun/{runtimes → common/runtimes}/constants.py +37 -9
- mlrun/common/schemas/__init__.py +31 -5
- mlrun/common/schemas/alert.py +202 -0
- mlrun/common/schemas/api_gateway.py +196 -0
- mlrun/common/schemas/artifact.py +25 -4
- mlrun/common/schemas/auth.py +16 -5
- mlrun/common/schemas/background_task.py +1 -1
- mlrun/common/schemas/client_spec.py +4 -2
- mlrun/common/schemas/common.py +7 -4
- mlrun/common/schemas/constants.py +3 -0
- mlrun/common/schemas/feature_store.py +74 -44
- mlrun/common/schemas/frontend_spec.py +15 -7
- mlrun/common/schemas/function.py +12 -1
- mlrun/common/schemas/hub.py +11 -18
- mlrun/common/schemas/memory_reports.py +2 -2
- mlrun/common/schemas/model_monitoring/__init__.py +20 -4
- mlrun/common/schemas/model_monitoring/constants.py +123 -42
- mlrun/common/schemas/model_monitoring/grafana.py +13 -9
- mlrun/common/schemas/model_monitoring/model_endpoints.py +101 -54
- mlrun/common/schemas/notification.py +71 -14
- mlrun/common/schemas/object.py +2 -2
- mlrun/{model_monitoring/controller_handler.py → common/schemas/pagination.py} +9 -12
- mlrun/common/schemas/pipeline.py +8 -1
- mlrun/common/schemas/project.py +69 -18
- mlrun/common/schemas/runs.py +7 -1
- mlrun/common/schemas/runtime_resource.py +8 -12
- mlrun/common/schemas/schedule.py +4 -4
- mlrun/common/schemas/tag.py +1 -2
- mlrun/common/schemas/workflow.py +12 -4
- mlrun/common/types.py +14 -1
- mlrun/config.py +154 -69
- mlrun/data_types/data_types.py +6 -1
- mlrun/data_types/spark.py +2 -2
- mlrun/data_types/to_pandas.py +67 -37
- mlrun/datastore/__init__.py +6 -8
- mlrun/datastore/alibaba_oss.py +131 -0
- mlrun/datastore/azure_blob.py +143 -42
- mlrun/datastore/base.py +102 -58
- mlrun/datastore/datastore.py +34 -13
- mlrun/datastore/datastore_profile.py +146 -20
- mlrun/datastore/dbfs_store.py +3 -7
- mlrun/datastore/filestore.py +1 -4
- mlrun/datastore/google_cloud_storage.py +97 -33
- mlrun/datastore/hdfs.py +56 -0
- mlrun/datastore/inmem.py +6 -3
- mlrun/datastore/redis.py +7 -2
- mlrun/datastore/s3.py +34 -12
- mlrun/datastore/snowflake_utils.py +45 -0
- mlrun/datastore/sources.py +303 -111
- mlrun/datastore/spark_utils.py +31 -2
- mlrun/datastore/store_resources.py +9 -7
- mlrun/datastore/storeytargets.py +151 -0
- mlrun/datastore/targets.py +453 -176
- mlrun/datastore/utils.py +72 -58
- mlrun/datastore/v3io.py +6 -1
- mlrun/db/base.py +274 -41
- mlrun/db/factory.py +1 -1
- mlrun/db/httpdb.py +893 -225
- mlrun/db/nopdb.py +291 -33
- mlrun/errors.py +36 -6
- mlrun/execution.py +115 -42
- mlrun/feature_store/__init__.py +0 -2
- mlrun/feature_store/api.py +65 -73
- mlrun/feature_store/common.py +7 -12
- mlrun/feature_store/feature_set.py +76 -55
- mlrun/feature_store/feature_vector.py +39 -31
- mlrun/feature_store/ingestion.py +7 -6
- mlrun/feature_store/retrieval/base.py +16 -11
- mlrun/feature_store/retrieval/dask_merger.py +2 -0
- mlrun/feature_store/retrieval/job.py +13 -4
- mlrun/feature_store/retrieval/local_merger.py +2 -0
- mlrun/feature_store/retrieval/spark_merger.py +24 -32
- mlrun/feature_store/steps.py +45 -34
- mlrun/features.py +11 -21
- mlrun/frameworks/_common/artifacts_library.py +9 -9
- mlrun/frameworks/_common/mlrun_interface.py +5 -5
- mlrun/frameworks/_common/model_handler.py +48 -48
- mlrun/frameworks/_common/plan.py +5 -6
- mlrun/frameworks/_common/producer.py +3 -4
- mlrun/frameworks/_common/utils.py +5 -5
- mlrun/frameworks/_dl_common/loggers/logger.py +6 -7
- mlrun/frameworks/_dl_common/loggers/mlrun_logger.py +9 -9
- mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +23 -47
- mlrun/frameworks/_ml_common/artifacts_library.py +1 -2
- mlrun/frameworks/_ml_common/loggers/logger.py +3 -4
- mlrun/frameworks/_ml_common/loggers/mlrun_logger.py +4 -5
- mlrun/frameworks/_ml_common/model_handler.py +24 -24
- mlrun/frameworks/_ml_common/pkl_model_server.py +2 -2
- mlrun/frameworks/_ml_common/plan.py +2 -2
- mlrun/frameworks/_ml_common/plans/calibration_curve_plan.py +2 -3
- mlrun/frameworks/_ml_common/plans/confusion_matrix_plan.py +2 -3
- mlrun/frameworks/_ml_common/plans/dataset_plan.py +3 -3
- mlrun/frameworks/_ml_common/plans/feature_importance_plan.py +3 -3
- mlrun/frameworks/_ml_common/plans/roc_curve_plan.py +4 -4
- mlrun/frameworks/_ml_common/utils.py +4 -4
- mlrun/frameworks/auto_mlrun/auto_mlrun.py +9 -9
- mlrun/frameworks/huggingface/model_server.py +4 -4
- mlrun/frameworks/lgbm/__init__.py +33 -33
- mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
- mlrun/frameworks/lgbm/callbacks/logging_callback.py +4 -5
- mlrun/frameworks/lgbm/callbacks/mlrun_logging_callback.py +4 -5
- mlrun/frameworks/lgbm/mlrun_interfaces/booster_mlrun_interface.py +1 -3
- mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +6 -6
- mlrun/frameworks/lgbm/model_handler.py +10 -10
- mlrun/frameworks/lgbm/model_server.py +6 -6
- mlrun/frameworks/lgbm/utils.py +5 -5
- mlrun/frameworks/onnx/dataset.py +8 -8
- mlrun/frameworks/onnx/mlrun_interface.py +3 -3
- mlrun/frameworks/onnx/model_handler.py +6 -6
- mlrun/frameworks/onnx/model_server.py +7 -7
- mlrun/frameworks/parallel_coordinates.py +6 -6
- mlrun/frameworks/pytorch/__init__.py +18 -18
- mlrun/frameworks/pytorch/callbacks/callback.py +4 -5
- mlrun/frameworks/pytorch/callbacks/logging_callback.py +17 -17
- mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +11 -11
- mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +23 -29
- mlrun/frameworks/pytorch/callbacks_handler.py +38 -38
- mlrun/frameworks/pytorch/mlrun_interface.py +20 -20
- mlrun/frameworks/pytorch/model_handler.py +17 -17
- mlrun/frameworks/pytorch/model_server.py +7 -7
- mlrun/frameworks/sklearn/__init__.py +13 -13
- mlrun/frameworks/sklearn/estimator.py +4 -4
- mlrun/frameworks/sklearn/metrics_library.py +14 -14
- mlrun/frameworks/sklearn/mlrun_interface.py +16 -9
- mlrun/frameworks/sklearn/model_handler.py +2 -2
- mlrun/frameworks/tf_keras/__init__.py +10 -7
- mlrun/frameworks/tf_keras/callbacks/logging_callback.py +15 -15
- mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +11 -11
- mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +19 -23
- mlrun/frameworks/tf_keras/mlrun_interface.py +9 -11
- mlrun/frameworks/tf_keras/model_handler.py +14 -14
- mlrun/frameworks/tf_keras/model_server.py +6 -6
- mlrun/frameworks/xgboost/__init__.py +13 -13
- mlrun/frameworks/xgboost/model_handler.py +6 -6
- mlrun/k8s_utils.py +61 -17
- mlrun/launcher/__init__.py +1 -1
- mlrun/launcher/base.py +16 -15
- mlrun/launcher/client.py +13 -11
- mlrun/launcher/factory.py +1 -1
- mlrun/launcher/local.py +23 -13
- mlrun/launcher/remote.py +17 -10
- mlrun/lists.py +7 -6
- mlrun/model.py +478 -103
- mlrun/model_monitoring/__init__.py +1 -1
- mlrun/model_monitoring/api.py +163 -371
- mlrun/{runtimes/mpijob/v1alpha1.py → model_monitoring/applications/__init__.py} +9 -15
- mlrun/model_monitoring/applications/_application_steps.py +188 -0
- mlrun/model_monitoring/applications/base.py +108 -0
- mlrun/model_monitoring/applications/context.py +341 -0
- mlrun/model_monitoring/{evidently_application.py → applications/evidently_base.py} +27 -22
- mlrun/model_monitoring/applications/histogram_data_drift.py +354 -0
- mlrun/model_monitoring/applications/results.py +99 -0
- mlrun/model_monitoring/controller.py +131 -278
- mlrun/model_monitoring/db/__init__.py +18 -0
- mlrun/model_monitoring/db/stores/__init__.py +136 -0
- mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
- mlrun/model_monitoring/db/stores/base/store.py +213 -0
- mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
- mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +71 -0
- mlrun/model_monitoring/db/stores/sqldb/models/base.py +190 -0
- mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +103 -0
- mlrun/model_monitoring/{stores/models/mysql.py → db/stores/sqldb/models/sqlite.py} +19 -13
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +659 -0
- mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +726 -0
- mlrun/model_monitoring/db/tsdb/__init__.py +105 -0
- mlrun/model_monitoring/db/tsdb/base.py +448 -0
- mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
- mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +279 -0
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +42 -0
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +507 -0
- mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +158 -0
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +849 -0
- mlrun/model_monitoring/features_drift_table.py +134 -106
- mlrun/model_monitoring/helpers.py +199 -55
- mlrun/model_monitoring/metrics/__init__.py +13 -0
- mlrun/model_monitoring/metrics/histogram_distance.py +127 -0
- mlrun/model_monitoring/model_endpoint.py +3 -2
- mlrun/model_monitoring/stream_processing.py +131 -398
- mlrun/model_monitoring/tracking_policy.py +9 -2
- mlrun/model_monitoring/writer.py +161 -125
- mlrun/package/__init__.py +6 -6
- mlrun/package/context_handler.py +5 -5
- mlrun/package/packager.py +7 -7
- mlrun/package/packagers/default_packager.py +8 -8
- mlrun/package/packagers/numpy_packagers.py +15 -15
- mlrun/package/packagers/pandas_packagers.py +5 -5
- mlrun/package/packagers/python_standard_library_packagers.py +10 -10
- mlrun/package/packagers_manager.py +19 -23
- mlrun/package/utils/_formatter.py +6 -6
- mlrun/package/utils/_pickler.py +2 -2
- mlrun/package/utils/_supported_format.py +4 -4
- mlrun/package/utils/log_hint_utils.py +2 -2
- mlrun/package/utils/type_hint_utils.py +4 -9
- mlrun/platforms/__init__.py +11 -10
- mlrun/platforms/iguazio.py +24 -203
- mlrun/projects/operations.py +52 -25
- mlrun/projects/pipelines.py +191 -197
- mlrun/projects/project.py +1227 -400
- mlrun/render.py +16 -19
- mlrun/run.py +209 -184
- mlrun/runtimes/__init__.py +83 -15
- mlrun/runtimes/base.py +51 -35
- mlrun/runtimes/daskjob.py +17 -10
- mlrun/runtimes/databricks_job/databricks_cancel_task.py +1 -1
- mlrun/runtimes/databricks_job/databricks_runtime.py +8 -7
- mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
- mlrun/runtimes/funcdoc.py +1 -29
- mlrun/runtimes/function_reference.py +1 -1
- mlrun/runtimes/kubejob.py +34 -128
- mlrun/runtimes/local.py +40 -11
- mlrun/runtimes/mpijob/__init__.py +0 -20
- mlrun/runtimes/mpijob/abstract.py +9 -10
- mlrun/runtimes/mpijob/v1.py +1 -1
- mlrun/{model_monitoring/stores/models/sqlite.py → runtimes/nuclio/__init__.py} +7 -9
- mlrun/runtimes/nuclio/api_gateway.py +769 -0
- mlrun/runtimes/nuclio/application/__init__.py +15 -0
- mlrun/runtimes/nuclio/application/application.py +758 -0
- mlrun/runtimes/nuclio/application/reverse_proxy.go +95 -0
- mlrun/runtimes/{function.py → nuclio/function.py} +200 -83
- mlrun/runtimes/{nuclio.py → nuclio/nuclio.py} +6 -6
- mlrun/runtimes/{serving.py → nuclio/serving.py} +65 -68
- mlrun/runtimes/pod.py +281 -101
- mlrun/runtimes/remotesparkjob.py +12 -9
- mlrun/runtimes/sparkjob/spark3job.py +67 -51
- mlrun/runtimes/utils.py +41 -75
- mlrun/secrets.py +9 -5
- mlrun/serving/__init__.py +8 -1
- mlrun/serving/remote.py +2 -7
- mlrun/serving/routers.py +85 -69
- mlrun/serving/server.py +69 -44
- mlrun/serving/states.py +209 -36
- mlrun/serving/utils.py +22 -14
- mlrun/serving/v1_serving.py +6 -7
- mlrun/serving/v2_serving.py +129 -54
- mlrun/track/tracker.py +2 -1
- mlrun/track/tracker_manager.py +3 -3
- mlrun/track/trackers/mlflow_tracker.py +6 -2
- mlrun/utils/async_http.py +6 -8
- mlrun/utils/azure_vault.py +1 -1
- mlrun/utils/clones.py +1 -2
- mlrun/utils/condition_evaluator.py +3 -3
- mlrun/utils/db.py +21 -3
- mlrun/utils/helpers.py +405 -225
- mlrun/utils/http.py +3 -6
- mlrun/utils/logger.py +112 -16
- mlrun/utils/notifications/notification/__init__.py +17 -13
- mlrun/utils/notifications/notification/base.py +50 -2
- mlrun/utils/notifications/notification/console.py +2 -0
- mlrun/utils/notifications/notification/git.py +24 -1
- mlrun/utils/notifications/notification/ipython.py +3 -1
- mlrun/utils/notifications/notification/slack.py +96 -21
- mlrun/utils/notifications/notification/webhook.py +59 -2
- mlrun/utils/notifications/notification_pusher.py +149 -30
- mlrun/utils/regex.py +9 -0
- mlrun/utils/retryer.py +208 -0
- mlrun/utils/singleton.py +1 -1
- mlrun/utils/v3io_clients.py +4 -6
- mlrun/utils/version/version.json +2 -2
- mlrun/utils/version/version.py +2 -6
- mlrun-1.7.0.dist-info/METADATA +378 -0
- mlrun-1.7.0.dist-info/RECORD +351 -0
- {mlrun-1.6.4rc7.dist-info → mlrun-1.7.0.dist-info}/WHEEL +1 -1
- mlrun/feature_store/retrieval/conversion.py +0 -273
- mlrun/kfpops.py +0 -868
- mlrun/model_monitoring/application.py +0 -310
- mlrun/model_monitoring/batch.py +0 -1095
- mlrun/model_monitoring/prometheus.py +0 -219
- mlrun/model_monitoring/stores/__init__.py +0 -111
- mlrun/model_monitoring/stores/kv_model_endpoint_store.py +0 -576
- mlrun/model_monitoring/stores/model_endpoint_store.py +0 -147
- mlrun/model_monitoring/stores/models/__init__.py +0 -27
- mlrun/model_monitoring/stores/models/base.py +0 -84
- mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -384
- mlrun/platforms/other.py +0 -306
- mlrun-1.6.4rc7.dist-info/METADATA +0 -272
- mlrun-1.6.4rc7.dist-info/RECORD +0 -314
- {mlrun-1.6.4rc7.dist-info → mlrun-1.7.0.dist-info}/LICENSE +0 -0
- {mlrun-1.6.4rc7.dist-info → mlrun-1.7.0.dist-info}/entry_points.txt +0 -0
- {mlrun-1.6.4rc7.dist-info → mlrun-1.7.0.dist-info}/top_level.txt +0 -0
mlrun/datastore/sources.py
CHANGED
@@ -12,12 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import json
+import operator
 import os
 import warnings
 from base64 import b64encode
 from copy import copy
 from datetime import datetime
-from typing import
+from typing import Optional, Union

 import pandas as pd
 import semver
@@ -27,9 +28,12 @@ from nuclio import KafkaTrigger
 from nuclio.config import split_path

 import mlrun
+from mlrun.config import config
+from mlrun.datastore.snowflake_utils import get_snowflake_spark_options
+from mlrun.datastore.utils import transform_list_filters_to_tuple
 from mlrun.secrets import SecretsStore
+from mlrun.utils import logger

-from ..config import config
 from ..model import DataSource
 from ..platforms.iguazio import parse_path
 from ..utils import get_class, is_explicit_ack_supported
@@ -39,7 +43,6 @@ from .utils import (
     _generate_sql_query_with_time_filter,
     filter_df_start_end_time,
     select_columns_from_df,
-    store_path_to_spark,
 )


@@ -83,7 +86,8 @@ class BaseSourceDriver(DataSource):
         )

         explicit_ack = (
-            is_explicit_ack_supported(context)
+            is_explicit_ack_supported(context)
+            and mlrun.mlconf.is_explicit_ack_enabled()
         )
         return storey.SyncEmitSource(
             context=context,
@@ -102,8 +106,12 @@ class BaseSourceDriver(DataSource):
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
         """return the source data as dataframe"""
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
         return mlrun.store_manager.object(url=self.path).as_df(
             columns=columns,
             df_module=df_module,
@@ -114,7 +122,11 @@ class BaseSourceDriver(DataSource):

     def to_spark_df(self, session, named_view=False, time_field=None, columns=None):
         if self.support_spark:
-
+            spark_options = self.get_spark_options()
+            spark_format = spark_options.pop("format", None)
+            df = load_spark_dataframe_with_options(
+                session, spark_options, format=spark_format
+            )
             if named_view:
                 df.createOrReplaceTempView(self.name)
             return self._filter_spark_df(df, time_field, columns)
@@ -170,10 +182,10 @@ class CSVSource(BaseSourceDriver):
         self,
         name: str = "",
         path: str = None,
-        attributes:
+        attributes: dict[str, object] = None,
         key_field: str = None,
         schedule: str = None,
-        parse_dates: Union[None, int, str,
+        parse_dates: Union[None, int, str, list[int], list[str]] = None,
         **kwargs,
     ):
         super().__init__(name, path, attributes, key_field, schedule=schedule, **kwargs)
@@ -193,14 +205,10 @@ class CSVSource(BaseSourceDriver):
             parse_dates.append(time_field)

         data_item = mlrun.store_manager.object(self.path)
-
-            store, path = mlrun.store_manager.get_or_create_store(self.path)
-            path = store.url + path
-        else:
-            path = data_item.url
+        store, path, url = mlrun.store_manager.get_or_create_store(self.path)

         return storey.CSVSource(
-            paths=
+            paths=url,  # unlike self.path, it already has store:// replaced
             build_dict=True,
             key_field=self.key_field or key_field,
             storage_options=data_item.store.get_storage_options(),
@@ -209,25 +217,17 @@ class CSVSource(BaseSourceDriver):
         )

     def get_spark_options(self):
-
-
-
-
-
-                "path": store_path_to_spark(path, storage_spark_options),
-                "format": "csv",
-                "header": "true",
-                "inferSchema": "true",
-            }
-
-            return {**result, **storage_spark_options}
-        else:
-            return {
-                "path": store_path_to_spark(self.path),
+        store, path, _ = mlrun.store_manager.get_or_create_store(self.path)
+        spark_options = store.get_spark_options()
+        spark_options.update(
+            {
+                "path": store.spark_url + path,
                 "format": "csv",
                 "header": "true",
                 "inferSchema": "true",
             }
+        )
+        return spark_options

     def to_spark_df(self, session, named_view=False, time_field=None, columns=None):
         import pyspark.sql.functions as funcs
@@ -253,7 +253,11 @@ class CSVSource(BaseSourceDriver):
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
         reader_args = self.attributes.get("reader_args", {})
         return mlrun.store_manager.object(url=self.path).as_df(
             columns=columns,
@@ -289,6 +293,12 @@ class ParquetSource(BaseSourceDriver):
     :parameter start_time: filters out data before this time
     :parameter end_time: filters out data after this time
     :parameter attributes: additional parameters to pass to storey.
+    :param additional_filters: List of additional_filter conditions as tuples.
+        Each tuple should be in the format (column_name, operator, value).
+        Supported operators: "=", ">=", "<=", ">", "<".
+        Example: [("Product", "=", "Computer")]
+        For all supported filters, please see:
+        https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetDataset.html
     """

     kind = "parquet"
@@ -299,13 +309,19 @@ class ParquetSource(BaseSourceDriver):
         self,
         name: str = "",
         path: str = None,
-        attributes:
+        attributes: dict[str, object] = None,
         key_field: str = None,
         time_field: str = None,
         schedule: str = None,
         start_time: Optional[Union[datetime, str]] = None,
         end_time: Optional[Union[datetime, str]] = None,
+        additional_filters: Optional[list[Union[tuple, list]]] = None,
     ):
+        if additional_filters:
+            attributes = copy(attributes) or {}
+            additional_filters = transform_list_filters_to_tuple(additional_filters)
+            attributes["additional_filters"] = additional_filters
+
         super().__init__(
             name,
             path,
@@ -333,6 +349,10 @@ class ParquetSource(BaseSourceDriver):
     def end_time(self, end_time):
         self._end_time = self._convert_to_datetime(end_time)

+    @property
+    def additional_filters(self):
+        return self.attributes.get("additional_filters")
+
     @staticmethod
     def _convert_to_datetime(time):
         if time and isinstance(time, str):
@@ -349,45 +369,48 @@ class ParquetSource(BaseSourceDriver):
         start_time=None,
         end_time=None,
         context=None,
+        additional_filters=None,
     ):
         import storey

-        attributes = self.attributes
+        attributes = copy(self.attributes)
+        attributes.pop("additional_filters", None)
         if context:
             attributes["context"] = context
-
+        additional_filters = transform_list_filters_to_tuple(additional_filters)
         data_item = mlrun.store_manager.object(self.path)
-
-            store, path = mlrun.store_manager.get_or_create_store(self.path)
-            path = store.url + path
-        else:
-            path = data_item.url
-
+        store, path, url = mlrun.store_manager.get_or_create_store(self.path)
         return storey.ParquetSource(
-            paths=
+            paths=url,  # unlike self.path, it already has store:// replaced
             key_field=self.key_field or key_field,
             storage_options=data_item.store.get_storage_options(),
             end_filter=self.end_time,
             start_filter=self.start_time,
             filter_column=self.time_field or time_field,
+            additional_filters=self.additional_filters or additional_filters,
             **attributes,
         )

+    @classmethod
+    def from_dict(cls, struct=None, fields=None, deprecated_fields: dict = None):
+        new_obj = super().from_dict(
+            struct=struct, fields=fields, deprecated_fields=deprecated_fields
+        )
+        new_obj.attributes["additional_filters"] = transform_list_filters_to_tuple(
+            new_obj.additional_filters
+        )
+        return new_obj
+
     def get_spark_options(self):
-
-
-
-
-
-                "path": store_path_to_spark(path, storage_spark_options),
-                "format": "parquet",
-            }
-            return {**result, **storage_spark_options}
-        else:
-            return {
-                "path": store_path_to_spark(self.path),
+        store, path, _ = mlrun.store_manager.get_or_create_store(self.path)
+        spark_options = store.get_spark_options()
+        spark_options.update(
+            {
+                "path": store.spark_url + path,
                 "format": "parquet",
             }
+        )
+        return spark_options

     def to_dataframe(
         self,
@@ -397,8 +420,10 @@ class ParquetSource(BaseSourceDriver):
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
         reader_args = self.attributes.get("reader_args", {})
+        additional_filters = transform_list_filters_to_tuple(additional_filters)
         return mlrun.store_manager.object(url=self.path).as_df(
             columns=columns,
             df_module=df_module,
@@ -406,9 +431,88 @@ class ParquetSource(BaseSourceDriver):
             end_time=end_time or self.end_time,
             time_column=time_field or self.time_field,
             format="parquet",
+            additional_filters=additional_filters or self.additional_filters,
             **reader_args,
         )

+    def _build_spark_additional_filters(self, column_types: dict):
+        if not self.additional_filters:
+            return None
+        from pyspark.sql.functions import col, isnan, lit
+
+        operators = {
+            "==": operator.eq,
+            "=": operator.eq,
+            ">": operator.gt,
+            "<": operator.lt,
+            ">=": operator.ge,
+            "<=": operator.le,
+            "!=": operator.ne,
+        }
+
+        spark_filter = None
+        new_filter = lit(True)
+        for filter_tuple in self.additional_filters:
+            if not filter_tuple:
+                continue
+            col_name, op, value = filter_tuple
+            if op.lower() in ("in", "not in") and isinstance(value, (list, tuple, set)):
+                none_exists = False
+                value = list(value)
+                for sub_value in value:
+                    if sub_value is None:
+                        value.remove(sub_value)
+                        none_exists = True
+                if none_exists:
+                    filter_nan = column_types[col_name] not in ("timestamp", "date")
+                    if value:
+                        if op.lower() == "in":
+                            new_filter = (
+                                col(col_name).isin(value) | col(col_name).isNull()
+                            )
+                            if filter_nan:
+                                new_filter = new_filter | isnan(col(col_name))
+
+                        else:
+                            new_filter = (
+                                ~col(col_name).isin(value) & ~col(col_name).isNull()
+                            )
+                            if filter_nan:
+                                new_filter = new_filter & ~isnan(col(col_name))
+                else:
+                    if op.lower() == "in":
+                        new_filter = col(col_name).isNull()
+                        if filter_nan:
+                            new_filter = new_filter | isnan(col(col_name))
+                    else:
+                        new_filter = ~col(col_name).isNull()
+                        if filter_nan:
+                            new_filter = new_filter & ~isnan(col(col_name))
+            else:
+                if op.lower() == "in":
+                    new_filter = col(col_name).isin(value)
+                elif op.lower() == "not in":
+                    new_filter = ~col(col_name).isin(value)
+                elif op in operators:
+                    new_filter = operators[op](col(col_name), value)
+                else:
+                    raise mlrun.errors.MLRunInvalidArgumentError(
+                        f"unsupported filter operator: {op}"
+                    )
+            if spark_filter is not None:
+                spark_filter = spark_filter & new_filter
+            else:
+                spark_filter = new_filter
+        return spark_filter
+
+    def _filter_spark_df(self, df, time_field=None, columns=None):
+        spark_additional_filters = self._build_spark_additional_filters(
+            column_types=dict(df.dtypes)
+        )
+        if spark_additional_filters is not None:
+            df = df.filter(spark_additional_filters)
+        return super()._filter_spark_df(df=df, time_field=time_field, columns=columns)
+

 class BigQuerySource(BaseSourceDriver):
     """
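Note: the `additional_filters` argument shown above is the main user-facing addition in this file. A minimal sketch of how it might be used (the source name and path below are placeholders, not taken from this diff):

    from mlrun.datastore.sources import ParquetSource

    source = ParquetSource(
        "sales",                                    # placeholder name
        path="s3://my-bucket/sales.parquet",        # placeholder path
        additional_filters=[("Product", "=", "Computer")],  # (column, operator, value) tuples
    )
    df = source.to_dataframe()  # rows are filtered while the parquet data is read

With the spark engine, the same tuples are translated by _build_spark_additional_filters into a pyspark Column expression and applied via df.filter, as shown in the hunk above.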
@@ -423,12 +527,17 @@ class BigQuerySource(BaseSourceDriver):

         # use sql query
         query_string = "SELECT * FROM `the-psf.pypi.downloads20210328` LIMIT 5000"
-        source = BigQuerySource(
-
-
+        source = BigQuerySource(
+            "bq1",
+            query=query_string,
+            gcp_project="my_project",
+            materialization_dataset="dataviews",
+        )

         # read a table
-        source = BigQuerySource(
+        source = BigQuerySource(
+            "bq2", table="the-psf.pypi.downloads20210328", gcp_project="my_project"
+        )


     :parameter name: source name
@@ -531,10 +640,15 @@ class BigQuerySource(BaseSourceDriver):
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
         from google.cloud import bigquery
         from google.cloud.bigquery_storage_v1 import BigQueryReadClient

+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
+
         def schema_to_dtypes(schema):
             from mlrun.data_types.data_types import gbq_to_pandas_dtype

@@ -574,7 +688,6 @@ class BigQuerySource(BaseSourceDriver):
         else:
             df = rows_iterator.to_dataframe(dtypes=dtypes)

-        # TODO : filter as part of the query
         return select_columns_from_df(
             filter_df_start_end_time(
                 df,
@@ -636,7 +749,7 @@ class SnowflakeSource(BaseSourceDriver):
             url="...",
             user="...",
             database="...",
-
+            db_schema="...",
             warehouse="...",
         )

@@ -651,7 +764,8 @@ class SnowflakeSource(BaseSourceDriver):
     :parameter url: URL of the snowflake cluster
     :parameter user: snowflake user
     :parameter database: snowflake database
-    :parameter schema: snowflake schema
+    :parameter schema: snowflake schema - deprecated, use db_schema
+    :parameter db_schema: snowflake schema
     :parameter warehouse: snowflake warehouse
     """

@@ -663,6 +777,7 @@ class SnowflakeSource(BaseSourceDriver):
         self,
         name: str = "",
         key_field: str = None,
+        attributes: dict[str, object] = None,
         time_field: str = None,
         schedule: str = None,
         start_time=None,
@@ -672,21 +787,34 @@ class SnowflakeSource(BaseSourceDriver):
         user: str = None,
         database: str = None,
         schema: str = None,
+        db_schema: str = None,
         warehouse: str = None,
         **kwargs,
     ):
-
-
-
-
-
-
-
-        }
+        # TODO: Remove in 1.9.0
+        if schema:
+            warnings.warn(
+                "schema is deprecated in 1.7.0, and will be removed in 1.9.0, please use db_schema"
+            )
+        db_schema = db_schema or schema  # TODO: Remove in 1.9.0
+
+        attributes = attributes or {}
+        if url:
+            attributes["url"] = url
+        if user:
+            attributes["user"] = user
+        if database:
+            attributes["database"] = database
+        if db_schema:
+            attributes["db_schema"] = db_schema
+        if warehouse:
+            attributes["warehouse"] = warehouse
+        if query:
+            attributes["query"] = query

         super().__init__(
             name,
-            attributes=
+            attributes=attributes,
             key_field=key_field,
             time_field=time_field,
             schedule=schedule,
@@ -695,32 +823,24 @@ class SnowflakeSource(BaseSourceDriver):
             **kwargs,
         )

-    def _get_password(self):
-        key = "SNOWFLAKE_PASSWORD"
-        snowflake_password = os.getenv(key) or os.getenv(
-            SecretsStore.k8s_env_variable_name_for_secret(key)
-        )
-
-        if not snowflake_password:
-            raise mlrun.errors.MLRunInvalidArgumentError(
-                "No password provided. Set password using the SNOWFLAKE_PASSWORD "
-                "project secret or environment variable."
-            )
-
-        return snowflake_password
-
     def get_spark_options(self):
-
-
-
-
-
-
-
-
-
-
-
+        spark_options = get_snowflake_spark_options(self.attributes)
+        spark_options["query"] = self.attributes.get("query")
+        return spark_options
+
+    def to_dataframe(
+        self,
+        columns=None,
+        df_module=None,
+        entities=None,
+        start_time=None,
+        end_time=None,
+        time_field=None,
+        additional_filters=None,
+    ):
+        raise mlrun.errors.MLRunRuntimeError(
+            f"{type(self).__name__} supports only spark engine"
+        )


 class CustomSource(BaseSourceDriver):
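Note: in the SnowflakeSource constructor above, `schema` is deprecated in favor of `db_schema`, and the connection details are now collected into `attributes`. A sketch of the 1.7.0-style call (all values are placeholders):

    from mlrun.datastore.sources import SnowflakeSource

    source = SnowflakeSource(
        "sf_source",
        query="select * from my_table",          # placeholder query
        url="<account>.snowflakecomputing.com",  # placeholder URL
        user="my_user",
        database="my_db",
        db_schema="PUBLIC",                      # replaces the deprecated `schema` argument
        warehouse="my_warehouse",
    )

Passing `schema=` still works in 1.7.0 but emits a deprecation warning and is slated for removal in 1.9.0.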
@@ -774,7 +894,19 @@ class DataFrameSource:
             context=self.context or context,
         )

-    def to_dataframe(
+    def to_dataframe(
+        self,
+        columns=None,
+        df_module=None,
+        entities=None,
+        start_time=None,
+        end_time=None,
+        time_field=None,
+        additional_filters=None,
+    ):
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
         return self._df

     def is_iterator(self):
@@ -800,7 +932,7 @@ class OnlineSource(BaseSourceDriver):
         self,
         name: str = None,
         path: str = None,
-        attributes:
+        attributes: dict[str, object] = None,
         key_field: str = None,
         time_field: str = None,
         workers: int = None,
@@ -812,16 +944,13 @@ class OnlineSource(BaseSourceDriver):
     def to_step(self, key_field=None, time_field=None, context=None):
         import storey

-        source_class = (
-            storey.AsyncEmitSource
-            if config.datastore.async_source_mode == "enabled"
-            else storey.SyncEmitSource
-        )
         source_args = self.attributes.get("source_args", {})
         explicit_ack = (
-            is_explicit_ack_supported(context)
+            is_explicit_ack_supported(context)
+            and mlrun.mlconf.is_explicit_ack_enabled()
         )
-
+        # TODO: Change to AsyncEmitSource once we can drop support for nuclio<1.12.10
+        src_class = storey.SyncEmitSource(
             context=context,
             key_field=self.key_field or key_field,
             full_event=True,
@@ -848,8 +977,6 @@ class HttpSource(OnlineSource):


 class StreamSource(OnlineSource):
-    """Sets stream source for the flow. If stream doesn't exist it will create it"""
-
     kind = "v3ioStream"

     def __init__(
@@ -863,7 +990,7 @@ class StreamSource(OnlineSource):
         **kwargs,
     ):
         """
-        Sets stream source for the flow. If stream doesn't exist it will create it
+        Sets the stream source for the flow. If the stream doesn't exist it will create it.

         :param name: stream name. Default "stream"
         :param group: consumer group. Default "serving"
@@ -882,8 +1009,15 @@ class StreamSource(OnlineSource):
         super().__init__(name, attributes=attrs, **kwargs)

     def add_nuclio_trigger(self, function):
-
-
+        store, _, url = mlrun.store_manager.get_or_create_store(self.path)
+        if store.kind != "v3io":
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "Only profiles that reference the v3io datastore can be used with StreamSource"
+            )
+        storage_options = store.get_storage_options()
+        access_key = storage_options.get("v3io_access_key")
+        endpoint, stream_path = parse_path(url)
+        v3io_client = v3io.dataplane.Client(endpoint=endpoint, access_key=access_key)
         container, stream_path = split_path(stream_path)
         res = v3io_client.stream.create(
             container=container,
@@ -898,12 +1032,13 @@ class StreamSource(OnlineSource):
         engine = "async"
         if hasattr(function.spec, "graph") and function.spec.graph.engine:
             engine = function.spec.graph.engine
-
+
+        if mlrun.mlconf.is_explicit_ack_enabled() and engine == "async":
             kwargs["explicit_ack_mode"] = "explicitOnly"
             kwargs["worker_allocation_mode"] = "static"

         function.add_v3io_stream_trigger(
-
+            url,
             self.name,
             self.attributes["group"],
             self.attributes["seek_to"],
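Note: `add_nuclio_trigger` above now resolves the stream path through the datastore layer and requires a v3io-backed store (a plain v3io:// path, or a ds:// profile that points at v3io). A rough sketch, assuming `path` is still accepted as a source keyword argument as in earlier releases (the path below is a placeholder):

    from mlrun.datastore.sources import StreamSource

    source = StreamSource(
        name="stream",
        group="serving",
        seek_to="earliest",
        path="v3io:///projects/demo/streams/my-stream",  # placeholder path
    )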
@@ -915,8 +1050,6 @@ class StreamSource(OnlineSource):


 class KafkaSource(OnlineSource):
-    """Sets kafka source for the flow"""
-
     kind = "kafka"

     def __init__(
@@ -970,6 +1103,7 @@ class KafkaSource(OnlineSource):
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
         raise mlrun.MLRunInvalidArgumentError(
             "KafkaSource does not support batch processing"
@@ -986,7 +1120,8 @@ class KafkaSource(OnlineSource):
         engine = "async"
         if hasattr(function.spec, "graph") and function.spec.graph.engine:
             engine = function.spec.graph.engine
-
+
+        if mlrun.mlconf.is_explicit_ack_enabled() and engine == "async":
             explicit_ack_mode = "explicitOnly"
             extra_attributes["workerAllocationMode"] = extra_attributes.get(
                 "worker_allocation_mode", "static"
@@ -1029,6 +1164,59 @@ class KafkaSource(OnlineSource):
             "to a Spark dataframe is not possible, as this operation is not supported by Spark"
         )

+    def create_topics(
+        self,
+        num_partitions: int = 4,
+        replication_factor: int = 1,
+        topics: list[str] = None,
+    ):
+        """
+        Create Kafka topics with the specified number of partitions and replication factor.
+
+        :param num_partitions: number of partitions for the topics
+        :param replication_factor: replication factor for the topics
+        :param topics: list of topic names to create, if None,
+            the topics will be taken from the source attributes
+        """
+        from kafka.admin import KafkaAdminClient, NewTopic
+
+        brokers = self.attributes.get("brokers")
+        if not brokers:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "brokers must be specified in the KafkaSource attributes"
+            )
+        topics = topics or self.attributes.get("topics")
+        if not topics:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "topics must be specified in the KafkaSource attributes"
+            )
+        new_topics = [
+            NewTopic(topic, num_partitions, replication_factor) for topic in topics
+        ]
+        kafka_admin = KafkaAdminClient(
+            bootstrap_servers=brokers,
+            sasl_mechanism=self.attributes.get("sasl", {}).get("sasl_mechanism"),
+            sasl_plain_username=self.attributes.get("sasl", {}).get("username"),
+            sasl_plain_password=self.attributes.get("sasl", {}).get("password"),
+            sasl_kerberos_service_name=self.attributes.get("sasl", {}).get(
+                "sasl_kerberos_service_name", "kafka"
+            ),
+            sasl_kerberos_domain_name=self.attributes.get("sasl", {}).get(
+                "sasl_kerberos_domain_name"
+            ),
+            sasl_oauth_token_provider=self.attributes.get("sasl", {}).get("mechanism"),
+        )
+        try:
+            kafka_admin.create_topics(new_topics)
+        finally:
+            kafka_admin.close()
+        logger.info(
+            "Kafka topics created successfully",
+            topics=topics,
+            num_partitions=num_partitions,
+            replication_factor=replication_factor,
+        )
+

 class SQLSource(BaseSourceDriver):
     kind = "sqldb"
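Note: `create_topics` above is new in 1.7.0 and uses kafka-python's admin client. A minimal sketch of calling it (broker address and topic name are placeholders):

    from mlrun.datastore.sources import KafkaSource

    source = KafkaSource(
        brokers="broker.default.svc:9092",   # placeholder broker
        topics=["monitoring-events"],        # placeholder topic
    )
    # creates the topics configured on the source; requires the kafka-python package
    source.create_topics(num_partitions=4, replication_factor=1)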
@@ -1047,7 +1235,7 @@ class SQLSource(BaseSourceDriver):
         db_url: str = None,
         table_name: str = None,
         spark_options: dict = None,
-        parse_dates:
+        parse_dates: list[str] = None,
         **kwargs,
     ):
         """
@@ -1110,9 +1298,13 @@ class SQLSource(BaseSourceDriver):
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
         import sqlalchemy as sqlalchemy

+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
         db_path = self.attributes.get("db_path")
         table_name = self.attributes.get("table_name")
         parse_dates = self.attributes.get("parse_dates")