mlrun 1.7.0rc4__py3-none-any.whl → 1.7.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlrun/__init__.py +11 -1
- mlrun/__main__.py +39 -121
- mlrun/{datastore/helpers.py → alerts/__init__.py} +2 -5
- mlrun/alerts/alert.py +248 -0
- mlrun/api/schemas/__init__.py +4 -3
- mlrun/artifacts/__init__.py +8 -3
- mlrun/artifacts/base.py +39 -254
- mlrun/artifacts/dataset.py +9 -190
- mlrun/artifacts/manager.py +73 -46
- mlrun/artifacts/model.py +30 -158
- mlrun/artifacts/plots.py +23 -380
- mlrun/common/constants.py +73 -1
- mlrun/common/db/sql_session.py +3 -2
- mlrun/common/formatters/__init__.py +21 -0
- mlrun/common/formatters/artifact.py +46 -0
- mlrun/common/formatters/base.py +113 -0
- mlrun/common/formatters/feature_set.py +44 -0
- mlrun/common/formatters/function.py +46 -0
- mlrun/common/formatters/pipeline.py +53 -0
- mlrun/common/formatters/project.py +51 -0
- mlrun/common/formatters/run.py +29 -0
- mlrun/common/helpers.py +11 -1
- mlrun/{runtimes → common/runtimes}/constants.py +32 -4
- mlrun/common/schemas/__init__.py +31 -4
- mlrun/common/schemas/alert.py +202 -0
- mlrun/common/schemas/api_gateway.py +196 -0
- mlrun/common/schemas/artifact.py +28 -1
- mlrun/common/schemas/auth.py +13 -2
- mlrun/common/schemas/client_spec.py +2 -1
- mlrun/common/schemas/common.py +7 -4
- mlrun/common/schemas/constants.py +3 -0
- mlrun/common/schemas/feature_store.py +58 -28
- mlrun/common/schemas/frontend_spec.py +8 -0
- mlrun/common/schemas/function.py +11 -0
- mlrun/common/schemas/hub.py +7 -9
- mlrun/common/schemas/model_monitoring/__init__.py +21 -4
- mlrun/common/schemas/model_monitoring/constants.py +136 -42
- mlrun/common/schemas/model_monitoring/grafana.py +9 -5
- mlrun/common/schemas/model_monitoring/model_endpoints.py +89 -41
- mlrun/common/schemas/notification.py +69 -12
- mlrun/{runtimes/mpijob/v1alpha1.py → common/schemas/pagination.py} +10 -13
- mlrun/common/schemas/pipeline.py +7 -0
- mlrun/common/schemas/project.py +67 -16
- mlrun/common/schemas/runs.py +17 -0
- mlrun/common/schemas/schedule.py +1 -1
- mlrun/common/schemas/workflow.py +10 -2
- mlrun/common/types.py +14 -1
- mlrun/config.py +233 -58
- mlrun/data_types/data_types.py +11 -1
- mlrun/data_types/spark.py +5 -4
- mlrun/data_types/to_pandas.py +75 -34
- mlrun/datastore/__init__.py +8 -10
- mlrun/datastore/alibaba_oss.py +131 -0
- mlrun/datastore/azure_blob.py +131 -43
- mlrun/datastore/base.py +107 -47
- mlrun/datastore/datastore.py +17 -7
- mlrun/datastore/datastore_profile.py +91 -7
- mlrun/datastore/dbfs_store.py +3 -7
- mlrun/datastore/filestore.py +1 -3
- mlrun/datastore/google_cloud_storage.py +92 -32
- mlrun/datastore/hdfs.py +5 -0
- mlrun/datastore/inmem.py +6 -3
- mlrun/datastore/redis.py +3 -2
- mlrun/datastore/s3.py +30 -12
- mlrun/datastore/snowflake_utils.py +45 -0
- mlrun/datastore/sources.py +274 -59
- mlrun/datastore/spark_utils.py +30 -0
- mlrun/datastore/store_resources.py +9 -7
- mlrun/datastore/storeytargets.py +151 -0
- mlrun/datastore/targets.py +387 -119
- mlrun/datastore/utils.py +68 -5
- mlrun/datastore/v3io.py +28 -50
- mlrun/db/auth_utils.py +152 -0
- mlrun/db/base.py +245 -20
- mlrun/db/factory.py +1 -4
- mlrun/db/httpdb.py +909 -231
- mlrun/db/nopdb.py +279 -14
- mlrun/errors.py +35 -5
- mlrun/execution.py +111 -38
- mlrun/feature_store/__init__.py +0 -2
- mlrun/feature_store/api.py +46 -53
- mlrun/feature_store/common.py +6 -11
- mlrun/feature_store/feature_set.py +48 -23
- mlrun/feature_store/feature_vector.py +13 -2
- mlrun/feature_store/ingestion.py +7 -6
- mlrun/feature_store/retrieval/base.py +9 -4
- mlrun/feature_store/retrieval/dask_merger.py +2 -0
- mlrun/feature_store/retrieval/job.py +13 -4
- mlrun/feature_store/retrieval/local_merger.py +2 -0
- mlrun/feature_store/retrieval/spark_merger.py +24 -32
- mlrun/feature_store/steps.py +38 -19
- mlrun/features.py +6 -14
- mlrun/frameworks/_common/plan.py +3 -3
- mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +7 -12
- mlrun/frameworks/_ml_common/plan.py +1 -1
- mlrun/frameworks/auto_mlrun/auto_mlrun.py +2 -2
- mlrun/frameworks/lgbm/__init__.py +1 -1
- mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
- mlrun/frameworks/lgbm/model_handler.py +1 -1
- mlrun/frameworks/parallel_coordinates.py +4 -4
- mlrun/frameworks/pytorch/__init__.py +2 -2
- mlrun/frameworks/sklearn/__init__.py +1 -1
- mlrun/frameworks/sklearn/mlrun_interface.py +13 -3
- mlrun/frameworks/tf_keras/__init__.py +5 -2
- mlrun/frameworks/tf_keras/callbacks/logging_callback.py +1 -1
- mlrun/frameworks/tf_keras/mlrun_interface.py +2 -2
- mlrun/frameworks/xgboost/__init__.py +1 -1
- mlrun/k8s_utils.py +57 -12
- mlrun/launcher/__init__.py +1 -1
- mlrun/launcher/base.py +6 -5
- mlrun/launcher/client.py +13 -11
- mlrun/launcher/factory.py +1 -1
- mlrun/launcher/local.py +15 -5
- mlrun/launcher/remote.py +10 -3
- mlrun/lists.py +6 -2
- mlrun/model.py +297 -48
- mlrun/model_monitoring/__init__.py +1 -1
- mlrun/model_monitoring/api.py +152 -357
- mlrun/model_monitoring/applications/__init__.py +10 -0
- mlrun/model_monitoring/applications/_application_steps.py +190 -0
- mlrun/model_monitoring/applications/base.py +108 -0
- mlrun/model_monitoring/applications/context.py +341 -0
- mlrun/model_monitoring/{evidently_application.py → applications/evidently_base.py} +27 -22
- mlrun/model_monitoring/applications/histogram_data_drift.py +227 -91
- mlrun/model_monitoring/applications/results.py +99 -0
- mlrun/model_monitoring/controller.py +130 -303
- mlrun/model_monitoring/{stores/models/sqlite.py → db/__init__.py} +5 -10
- mlrun/model_monitoring/db/stores/__init__.py +136 -0
- mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
- mlrun/model_monitoring/db/stores/base/store.py +213 -0
- mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
- mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +71 -0
- mlrun/model_monitoring/db/stores/sqldb/models/base.py +190 -0
- mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +103 -0
- mlrun/model_monitoring/{stores/models/mysql.py → db/stores/sqldb/models/sqlite.py} +19 -13
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +659 -0
- mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +726 -0
- mlrun/model_monitoring/db/tsdb/__init__.py +105 -0
- mlrun/model_monitoring/db/tsdb/base.py +448 -0
- mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
- mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +298 -0
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +42 -0
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +522 -0
- mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +158 -0
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +849 -0
- mlrun/model_monitoring/features_drift_table.py +34 -22
- mlrun/model_monitoring/helpers.py +177 -39
- mlrun/model_monitoring/model_endpoint.py +3 -2
- mlrun/model_monitoring/stream_processing.py +165 -398
- mlrun/model_monitoring/tracking_policy.py +7 -1
- mlrun/model_monitoring/writer.py +161 -125
- mlrun/package/packagers/default_packager.py +2 -2
- mlrun/package/packagers_manager.py +1 -0
- mlrun/package/utils/_formatter.py +2 -2
- mlrun/platforms/__init__.py +11 -10
- mlrun/platforms/iguazio.py +67 -228
- mlrun/projects/__init__.py +6 -1
- mlrun/projects/operations.py +47 -20
- mlrun/projects/pipelines.py +396 -249
- mlrun/projects/project.py +1176 -406
- mlrun/render.py +28 -22
- mlrun/run.py +208 -181
- mlrun/runtimes/__init__.py +76 -11
- mlrun/runtimes/base.py +54 -24
- mlrun/runtimes/daskjob.py +9 -2
- mlrun/runtimes/databricks_job/databricks_runtime.py +1 -0
- mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
- mlrun/runtimes/funcdoc.py +1 -29
- mlrun/runtimes/kubejob.py +34 -128
- mlrun/runtimes/local.py +39 -10
- mlrun/runtimes/mpijob/__init__.py +0 -20
- mlrun/runtimes/mpijob/abstract.py +8 -8
- mlrun/runtimes/mpijob/v1.py +1 -1
- mlrun/runtimes/nuclio/__init__.py +1 -0
- mlrun/runtimes/nuclio/api_gateway.py +769 -0
- mlrun/runtimes/nuclio/application/__init__.py +15 -0
- mlrun/runtimes/nuclio/application/application.py +758 -0
- mlrun/runtimes/nuclio/application/reverse_proxy.go +95 -0
- mlrun/runtimes/nuclio/function.py +188 -68
- mlrun/runtimes/nuclio/serving.py +57 -60
- mlrun/runtimes/pod.py +191 -58
- mlrun/runtimes/remotesparkjob.py +11 -8
- mlrun/runtimes/sparkjob/spark3job.py +17 -18
- mlrun/runtimes/utils.py +40 -73
- mlrun/secrets.py +6 -2
- mlrun/serving/__init__.py +8 -1
- mlrun/serving/remote.py +2 -3
- mlrun/serving/routers.py +89 -64
- mlrun/serving/server.py +54 -26
- mlrun/serving/states.py +187 -56
- mlrun/serving/utils.py +19 -11
- mlrun/serving/v2_serving.py +136 -63
- mlrun/track/tracker.py +2 -1
- mlrun/track/trackers/mlflow_tracker.py +5 -0
- mlrun/utils/async_http.py +26 -6
- mlrun/utils/db.py +18 -0
- mlrun/utils/helpers.py +375 -105
- mlrun/utils/http.py +2 -2
- mlrun/utils/logger.py +75 -9
- mlrun/utils/notifications/notification/__init__.py +14 -10
- mlrun/utils/notifications/notification/base.py +48 -0
- mlrun/utils/notifications/notification/console.py +2 -0
- mlrun/utils/notifications/notification/git.py +24 -1
- mlrun/utils/notifications/notification/ipython.py +2 -0
- mlrun/utils/notifications/notification/slack.py +96 -21
- mlrun/utils/notifications/notification/webhook.py +63 -2
- mlrun/utils/notifications/notification_pusher.py +146 -16
- mlrun/utils/regex.py +9 -0
- mlrun/utils/retryer.py +3 -2
- mlrun/utils/v3io_clients.py +2 -3
- mlrun/utils/version/version.json +2 -2
- mlrun-1.7.2.dist-info/METADATA +390 -0
- mlrun-1.7.2.dist-info/RECORD +351 -0
- {mlrun-1.7.0rc4.dist-info → mlrun-1.7.2.dist-info}/WHEEL +1 -1
- mlrun/feature_store/retrieval/conversion.py +0 -271
- mlrun/kfpops.py +0 -868
- mlrun/model_monitoring/application.py +0 -310
- mlrun/model_monitoring/batch.py +0 -974
- mlrun/model_monitoring/controller_handler.py +0 -37
- mlrun/model_monitoring/prometheus.py +0 -216
- mlrun/model_monitoring/stores/__init__.py +0 -111
- mlrun/model_monitoring/stores/kv_model_endpoint_store.py +0 -574
- mlrun/model_monitoring/stores/model_endpoint_store.py +0 -145
- mlrun/model_monitoring/stores/models/__init__.py +0 -27
- mlrun/model_monitoring/stores/models/base.py +0 -84
- mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -382
- mlrun/platforms/other.py +0 -305
- mlrun-1.7.0rc4.dist-info/METADATA +0 -269
- mlrun-1.7.0rc4.dist-info/RECORD +0 -321
- {mlrun-1.7.0rc4.dist-info → mlrun-1.7.2.dist-info}/LICENSE +0 -0
- {mlrun-1.7.0rc4.dist-info → mlrun-1.7.2.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.0rc4.dist-info → mlrun-1.7.2.dist-info}/top_level.txt +0 -0
mlrun/datastore/sources.py
CHANGED

@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import json
+import operator
 import os
 import warnings
 from base64 import b64encode
@@ -28,7 +29,10 @@ from nuclio.config import split_path

 import mlrun
 from mlrun.config import config
+from mlrun.datastore.snowflake_utils import get_snowflake_spark_options
+from mlrun.datastore.utils import transform_list_filters_to_tuple
 from mlrun.secrets import SecretsStore
+from mlrun.utils import logger

 from ..model import DataSource
 from ..platforms.iguazio import parse_path
@@ -82,7 +86,8 @@ class BaseSourceDriver(DataSource):
         )

         explicit_ack = (
-            is_explicit_ack_supported(context)
+            is_explicit_ack_supported(context)
+            and mlrun.mlconf.is_explicit_ack_enabled()
         )
         return storey.SyncEmitSource(
             context=context,
@@ -101,8 +106,12 @@ class BaseSourceDriver(DataSource):
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
         """return the source data as dataframe"""
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
         return mlrun.store_manager.object(url=self.path).as_df(
             columns=columns,
             df_module=df_module,
@@ -113,7 +122,11 @@ class BaseSourceDriver(DataSource):

     def to_spark_df(self, session, named_view=False, time_field=None, columns=None):
         if self.support_spark:
-
+            spark_options = self.get_spark_options()
+            spark_format = spark_options.pop("format", None)
+            df = load_spark_dataframe_with_options(
+                session, spark_options, format=spark_format
+            )
             if named_view:
                 df.createOrReplaceTempView(self.name)
             return self._filter_spark_df(df, time_field, columns)
@@ -169,7 +182,7 @@ class CSVSource(BaseSourceDriver):
         self,
         name: str = "",
         path: str = None,
-        attributes: dict[str,
+        attributes: dict[str, object] = None,
         key_field: str = None,
         schedule: str = None,
         parse_dates: Union[None, int, str, list[int], list[str]] = None,
@@ -204,11 +217,11 @@ class CSVSource(BaseSourceDriver):
         )

     def get_spark_options(self):
-        store, path,
+        store, path, _ = mlrun.store_manager.get_or_create_store(self.path)
         spark_options = store.get_spark_options()
         spark_options.update(
             {
-                "path":
+                "path": store.spark_url + path,
                 "format": "csv",
                 "header": "true",
                 "inferSchema": "true",
@@ -240,7 +253,11 @@ class CSVSource(BaseSourceDriver):
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
         reader_args = self.attributes.get("reader_args", {})
         return mlrun.store_manager.object(url=self.path).as_df(
             columns=columns,
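The hunks above thread a new `additional_filters` argument through `to_dataframe()` on every source, but only the Parquet source can push the filter down; other sources route it through the new `mlrun.utils.helpers.additional_filters_warning()` helper. A minimal caller-side sketch, assuming a local toy CSV file (the file name, column, and filter values are hypothetical, and the exact warning text is not part of this diff):

```python
import pandas as pd

from mlrun.datastore.sources import CSVSource

# hypothetical toy file standing in for a real dataset
pd.DataFrame({"status": ["ok", "error"]}).to_csv("events.csv", index=False)

source = CSVSource("events", path="events.csv")
# additional_filters is accepted for API symmetry, but per the hunks above CSVSource
# only calls additional_filters_warning(); the returned dataframe is not filtered here.
df = source.to_dataframe(additional_filters=[("status", "=", "ok")])
print(len(df))  # expected to still be 2 in this sketch
```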
@@ -276,6 +293,12 @@ class ParquetSource(BaseSourceDriver):
     :parameter start_time: filters out data before this time
     :parameter end_time: filters out data after this time
     :parameter attributes: additional parameters to pass to storey.
+    :param additional_filters: List of additional_filter conditions as tuples.
+        Each tuple should be in the format (column_name, operator, value).
+        Supported operators: "=", ">=", "<=", ">", "<".
+        Example: [("Product", "=", "Computer")]
+        For all supported filters, please see:
+        https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetDataset.html
     """

     kind = "parquet"
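As a usage sketch of the parameter documented above (the path and the extra price condition are hypothetical; the "Product" filter comes from the docstring example itself):

```python
from mlrun.datastore.sources import ParquetSource

source = ParquetSource(
    "sales",
    path="v3io:///projects/demo/sales.parquet",  # hypothetical location
    additional_filters=[
        ("Product", "=", "Computer"),  # the docstring example
        ("Price", ">=", 1000),         # hypothetical second condition
    ],
)

# Per the constructor hunk below, the filters are normalized with
# transform_list_filters_to_tuple and stored in attributes["additional_filters"],
# which the new additional_filters property then exposes.
print(source.additional_filters)
```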
@@ -286,13 +309,19 @@ class ParquetSource(BaseSourceDriver):
         self,
         name: str = "",
         path: str = None,
-        attributes: dict[str,
+        attributes: dict[str, object] = None,
         key_field: str = None,
         time_field: str = None,
         schedule: str = None,
         start_time: Optional[Union[datetime, str]] = None,
         end_time: Optional[Union[datetime, str]] = None,
+        additional_filters: Optional[list[Union[tuple, list]]] = None,
     ):
+        if additional_filters:
+            attributes = copy(attributes) or {}
+            additional_filters = transform_list_filters_to_tuple(additional_filters)
+            attributes["additional_filters"] = additional_filters
+
         super().__init__(
             name,
             path,
@@ -320,6 +349,10 @@ class ParquetSource(BaseSourceDriver):
     def end_time(self, end_time):
         self._end_time = self._convert_to_datetime(end_time)

+    @property
+    def additional_filters(self):
+        return self.attributes.get("additional_filters")
+
     @staticmethod
     def _convert_to_datetime(time):
         if time and isinstance(time, str):
@@ -336,16 +369,17 @@ class ParquetSource(BaseSourceDriver):
         start_time=None,
         end_time=None,
         context=None,
+        additional_filters=None,
     ):
         import storey

-        attributes = self.attributes
+        attributes = copy(self.attributes)
+        attributes.pop("additional_filters", None)
         if context:
             attributes["context"] = context
-
+        additional_filters = transform_list_filters_to_tuple(additional_filters)
         data_item = mlrun.store_manager.object(self.path)
         store, path, url = mlrun.store_manager.get_or_create_store(self.path)
-
         return storey.ParquetSource(
             paths=url,  # unlike self.path, it already has store:// replaced
             key_field=self.key_field or key_field,
@@ -353,11 +387,22 @@ class ParquetSource(BaseSourceDriver):
             end_filter=self.end_time,
             start_filter=self.start_time,
             filter_column=self.time_field or time_field,
+            additional_filters=self.additional_filters or additional_filters,
             **attributes,
         )

+    @classmethod
+    def from_dict(cls, struct=None, fields=None, deprecated_fields: dict = None):
+        new_obj = super().from_dict(
+            struct=struct, fields=fields, deprecated_fields=deprecated_fields
+        )
+        new_obj.attributes["additional_filters"] = transform_list_filters_to_tuple(
+            new_obj.additional_filters
+        )
+        return new_obj
+
     def get_spark_options(self):
-        store, path,
+        store, path, _ = mlrun.store_manager.get_or_create_store(self.path)
         spark_options = store.get_spark_options()
         spark_options.update(
             {
@@ -375,8 +420,10 @@ class ParquetSource(BaseSourceDriver):
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
         reader_args = self.attributes.get("reader_args", {})
+        additional_filters = transform_list_filters_to_tuple(additional_filters)
         return mlrun.store_manager.object(url=self.path).as_df(
             columns=columns,
             df_module=df_module,
@@ -384,9 +431,88 @@ class ParquetSource(BaseSourceDriver):
             end_time=end_time or self.end_time,
             time_column=time_field or self.time_field,
             format="parquet",
+            additional_filters=additional_filters or self.additional_filters,
             **reader_args,
         )

+    def _build_spark_additional_filters(self, column_types: dict):
+        if not self.additional_filters:
+            return None
+        from pyspark.sql.functions import col, isnan, lit
+
+        operators = {
+            "==": operator.eq,
+            "=": operator.eq,
+            ">": operator.gt,
+            "<": operator.lt,
+            ">=": operator.ge,
+            "<=": operator.le,
+            "!=": operator.ne,
+        }
+
+        spark_filter = None
+        new_filter = lit(True)
+        for filter_tuple in self.additional_filters:
+            if not filter_tuple:
+                continue
+            col_name, op, value = filter_tuple
+            if op.lower() in ("in", "not in") and isinstance(value, (list, tuple, set)):
+                none_exists = False
+                value = list(value)
+                for sub_value in value:
+                    if sub_value is None:
+                        value.remove(sub_value)
+                        none_exists = True
+                if none_exists:
+                    filter_nan = column_types[col_name] not in ("timestamp", "date")
+                    if value:
+                        if op.lower() == "in":
+                            new_filter = (
+                                col(col_name).isin(value) | col(col_name).isNull()
+                            )
+                            if filter_nan:
+                                new_filter = new_filter | isnan(col(col_name))
+
+                        else:
+                            new_filter = (
+                                ~col(col_name).isin(value) & ~col(col_name).isNull()
+                            )
+                            if filter_nan:
+                                new_filter = new_filter & ~isnan(col(col_name))
+                    else:
+                        if op.lower() == "in":
+                            new_filter = col(col_name).isNull()
+                            if filter_nan:
+                                new_filter = new_filter | isnan(col(col_name))
+                        else:
+                            new_filter = ~col(col_name).isNull()
+                            if filter_nan:
+                                new_filter = new_filter & ~isnan(col(col_name))
+            else:
+                if op.lower() == "in":
+                    new_filter = col(col_name).isin(value)
+                elif op.lower() == "not in":
+                    new_filter = ~col(col_name).isin(value)
+                elif op in operators:
+                    new_filter = operators[op](col(col_name), value)
+                else:
+                    raise mlrun.errors.MLRunInvalidArgumentError(
+                        f"unsupported filter operator: {op}"
+                    )
+            if spark_filter is not None:
+                spark_filter = spark_filter & new_filter
+            else:
+                spark_filter = new_filter
+        return spark_filter
+
+    def _filter_spark_df(self, df, time_field=None, columns=None):
+        spark_additional_filters = self._build_spark_additional_filters(
+            column_types=dict(df.dtypes)
+        )
+        if spark_additional_filters is not None:
+            df = df.filter(spark_additional_filters)
+        return super()._filter_spark_df(df=df, time_field=time_field, columns=columns)
+

 class BigQuerySource(BaseSourceDriver):
     """
@@ -401,12 +527,17 @@ class BigQuerySource(BaseSourceDriver):

         # use sql query
         query_string = "SELECT * FROM `the-psf.pypi.downloads20210328` LIMIT 5000"
-        source = BigQuerySource(
-
-
+        source = BigQuerySource(
+            "bq1",
+            query=query_string,
+            gcp_project="my_project",
+            materialization_dataset="dataviews",
+        )

         # read a table
-        source = BigQuerySource(
+        source = BigQuerySource(
+            "bq2", table="the-psf.pypi.downloads20210328", gcp_project="my_project"
+        )


     :parameter name: source name
@@ -509,10 +640,15 @@ class BigQuerySource(BaseSourceDriver):
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
         from google.cloud import bigquery
         from google.cloud.bigquery_storage_v1 import BigQueryReadClient

+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
+
         def schema_to_dtypes(schema):
             from mlrun.data_types.data_types import gbq_to_pandas_dtype

@@ -552,7 +688,6 @@ class BigQuerySource(BaseSourceDriver):
         else:
             df = rows_iterator.to_dataframe(dtypes=dtypes)

-        # TODO : filter as part of the query
         return select_columns_from_df(
             filter_df_start_end_time(
                 df,
@@ -614,7 +749,7 @@ class SnowflakeSource(BaseSourceDriver):
             url="...",
             user="...",
             database="...",
-
+            db_schema="...",
             warehouse="...",
         )

@@ -629,7 +764,8 @@ class SnowflakeSource(BaseSourceDriver):
     :parameter url: URL of the snowflake cluster
     :parameter user: snowflake user
     :parameter database: snowflake database
-    :parameter schema: snowflake schema
+    :parameter schema: snowflake schema - deprecated, use db_schema
+    :parameter db_schema: snowflake schema
     :parameter warehouse: snowflake warehouse
     """

@@ -641,6 +777,7 @@ class SnowflakeSource(BaseSourceDriver):
         self,
         name: str = "",
         key_field: str = None,
+        attributes: dict[str, object] = None,
         time_field: str = None,
         schedule: str = None,
         start_time=None,
@@ -650,21 +787,34 @@ class SnowflakeSource(BaseSourceDriver):
         user: str = None,
         database: str = None,
         schema: str = None,
+        db_schema: str = None,
         warehouse: str = None,
         **kwargs,
     ):
-
-
-
-
-
-
-
-        }
+        # TODO: Remove in 1.9.0
+        if schema:
+            warnings.warn(
+                "schema is deprecated in 1.7.0, and will be removed in 1.9.0, please use db_schema"
+            )
+        db_schema = db_schema or schema  # TODO: Remove in 1.9.0
+
+        attributes = attributes or {}
+        if url:
+            attributes["url"] = url
+        if user:
+            attributes["user"] = user
+        if database:
+            attributes["database"] = database
+        if db_schema:
+            attributes["db_schema"] = db_schema
+        if warehouse:
+            attributes["warehouse"] = warehouse
+        if query:
+            attributes["query"] = query

         super().__init__(
             name,
-            attributes=
+            attributes=attributes,
             key_field=key_field,
             time_field=time_field,
             schedule=schedule,
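The constructor change above renames `schema` to `db_schema` (the old keyword keeps working with a deprecation warning until its planned removal in 1.9.0) and now collects the connection settings into an `attributes` dict. A hedged migration sketch with placeholder connection values:

```python
from mlrun.datastore.sources import SnowflakeSource

source = SnowflakeSource(
    "snowflake_sales",
    query="select * from sales limit 1000",    # hypothetical query
    url="<account>.snowflakecomputing.com",    # placeholder connection details
    user="<user>",
    database="<database>",
    db_schema="<schema>",   # new keyword; schema=... still works but warns until 1.9.0
    warehouse="<warehouse>",
)
```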
@@ -673,32 +823,24 @@ class SnowflakeSource(BaseSourceDriver):
             **kwargs,
         )

-    def _get_password(self):
-        key = "SNOWFLAKE_PASSWORD"
-        snowflake_password = os.getenv(key) or os.getenv(
-            SecretsStore.k8s_env_variable_name_for_secret(key)
-        )
-
-        if not snowflake_password:
-            raise mlrun.errors.MLRunInvalidArgumentError(
-                "No password provided. Set password using the SNOWFLAKE_PASSWORD "
-                "project secret or environment variable."
-            )
-
-        return snowflake_password
-
     def get_spark_options(self):
-
-
-
-
-
-
-
-
-
-
-
+        spark_options = get_snowflake_spark_options(self.attributes)
+        spark_options["query"] = self.attributes.get("query")
+        return spark_options
+
+    def to_dataframe(
+        self,
+        columns=None,
+        df_module=None,
+        entities=None,
+        start_time=None,
+        end_time=None,
+        time_field=None,
+        additional_filters=None,
+    ):
+        raise mlrun.errors.MLRunRuntimeError(
+            f"{type(self).__name__} supports only spark engine"
+        )


 class CustomSource(BaseSourceDriver):
@@ -752,7 +894,19 @@ class DataFrameSource:
             context=self.context or context,
         )

-    def to_dataframe(
+    def to_dataframe(
+        self,
+        columns=None,
+        df_module=None,
+        entities=None,
+        start_time=None,
+        end_time=None,
+        time_field=None,
+        additional_filters=None,
+    ):
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
         return self._df

     def is_iterator(self):
@@ -792,9 +946,11 @@ class OnlineSource(BaseSourceDriver):

         source_args = self.attributes.get("source_args", {})
         explicit_ack = (
-            is_explicit_ack_supported(context)
+            is_explicit_ack_supported(context)
+            and mlrun.mlconf.is_explicit_ack_enabled()
         )
-
+        # TODO: Change to AsyncEmitSource once we can drop support for nuclio<1.12.10
+        src_class = storey.SyncEmitSource(
             context=context,
             key_field=self.key_field or key_field,
             full_event=True,
@@ -853,12 +1009,11 @@ class StreamSource(OnlineSource):
         super().__init__(name, attributes=attrs, **kwargs)

     def add_nuclio_trigger(self, function):
-        store,
+        store, _, url = mlrun.store_manager.get_or_create_store(self.path)
         if store.kind != "v3io":
             raise mlrun.errors.MLRunInvalidArgumentError(
                 "Only profiles that reference the v3io datastore can be used with StreamSource"
             )
-        path = "v3io:/" + path
         storage_options = store.get_storage_options()
         access_key = storage_options.get("v3io_access_key")
         endpoint, stream_path = parse_path(url)
@@ -877,12 +1032,13 @@ class StreamSource(OnlineSource):
         engine = "async"
         if hasattr(function.spec, "graph") and function.spec.graph.engine:
             engine = function.spec.graph.engine
-
+
+        if mlrun.mlconf.is_explicit_ack_enabled() and engine == "async":
             kwargs["explicit_ack_mode"] = "explicitOnly"
             kwargs["worker_allocation_mode"] = "static"

         function.add_v3io_stream_trigger(
-
+            url,
             self.name,
             self.attributes["group"],
             self.attributes["seek_to"],
@@ -947,6 +1103,7 @@ class KafkaSource(OnlineSource):
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
         raise mlrun.MLRunInvalidArgumentError(
             "KafkaSource does not support batch processing"
@@ -963,7 +1120,8 @@ class KafkaSource(OnlineSource):
         engine = "async"
         if hasattr(function.spec, "graph") and function.spec.graph.engine:
             engine = function.spec.graph.engine
-
+
+        if mlrun.mlconf.is_explicit_ack_enabled() and engine == "async":
             explicit_ack_mode = "explicitOnly"
             extra_attributes["workerAllocationMode"] = extra_attributes.get(
                 "worker_allocation_mode", "static"
@@ -1006,6 +1164,59 @@ class KafkaSource(OnlineSource):
             "to a Spark dataframe is not possible, as this operation is not supported by Spark"
         )

+    def create_topics(
+        self,
+        num_partitions: int = 4,
+        replication_factor: int = 1,
+        topics: list[str] = None,
+    ):
+        """
+        Create Kafka topics with the specified number of partitions and replication factor.
+
+        :param num_partitions: number of partitions for the topics
+        :param replication_factor: replication factor for the topics
+        :param topics: list of topic names to create, if None,
+                       the topics will be taken from the source attributes
+        """
+        from kafka.admin import KafkaAdminClient, NewTopic
+
+        brokers = self.attributes.get("brokers")
+        if not brokers:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "brokers must be specified in the KafkaSource attributes"
+            )
+        topics = topics or self.attributes.get("topics")
+        if not topics:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "topics must be specified in the KafkaSource attributes"
+            )
+        new_topics = [
+            NewTopic(topic, num_partitions, replication_factor) for topic in topics
+        ]
+        kafka_admin = KafkaAdminClient(
+            bootstrap_servers=brokers,
+            sasl_mechanism=self.attributes.get("sasl", {}).get("sasl_mechanism"),
+            sasl_plain_username=self.attributes.get("sasl", {}).get("username"),
+            sasl_plain_password=self.attributes.get("sasl", {}).get("password"),
+            sasl_kerberos_service_name=self.attributes.get("sasl", {}).get(
+                "sasl_kerberos_service_name", "kafka"
+            ),
+            sasl_kerberos_domain_name=self.attributes.get("sasl", {}).get(
+                "sasl_kerberos_domain_name"
+            ),
+            sasl_oauth_token_provider=self.attributes.get("sasl", {}).get("mechanism"),
+        )
+        try:
+            kafka_admin.create_topics(new_topics)
+        finally:
+            kafka_admin.close()
+        logger.info(
+            "Kafka topics created successfully",
+            topics=topics,
+            num_partitions=num_partitions,
+            replication_factor=replication_factor,
+        )
+

 class SQLSource(BaseSourceDriver):
     kind = "sqldb"
@@ -1087,9 +1298,13 @@ class SQLSource(BaseSourceDriver):
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
         import sqlalchemy as sqlalchemy

+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
         db_path = self.attributes.get("db_path")
         table_name = self.attributes.get("table_name")
         parse_dates = self.attributes.get("parse_dates")
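Beyond the `additional_filters` plumbing, the KafkaSource hunk above adds a `create_topics()` helper that provisions the configured topics through `kafka.admin.KafkaAdminClient`. A small sketch, assuming the kafka-python package is installed and using hypothetical broker and topic names:

```python
from mlrun.datastore.sources import KafkaSource

source = KafkaSource(
    brokers=["kafka-broker.default.svc:9092"],  # hypothetical broker
    topics=["model-monitoring-stream"],         # hypothetical topic
    group="serving",
)

# Creates the topics stored in the source attributes (or an explicit list passed in),
# defaulting to 4 partitions and replication factor 1, reusing any SASL settings
# configured on the source.
source.create_topics(num_partitions=4, replication_factor=1)
```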
mlrun/datastore/spark_utils.py
CHANGED

@@ -13,7 +13,10 @@
 # limitations under the License.


+from typing import Union
+
 import mlrun
+from mlrun.features import Entity


 def spark_session_update_hadoop_options(session, spark_options) -> dict[str, str]:
@@ -35,3 +38,30 @@ def spark_session_update_hadoop_options(session, spark_options) -> dict[str, str
         else:
             non_hadoop_spark_options[key] = value
     return non_hadoop_spark_options
+
+
+def check_special_columns_exists(
+    spark_df, entities: list[Union[Entity, str]], timestamp_key: str, label_column: str
+):
+    columns = spark_df.columns
+    entities = entities or []
+    entities = [
+        entity.name if isinstance(entity, Entity) else entity for entity in entities
+    ]
+    missing_entities = [entity for entity in entities if entity not in columns]
+    cases_message = "Please check the letter cases (uppercase or lowercase)"
+    if missing_entities:
+        raise mlrun.errors.MLRunInvalidArgumentError(
+            f"There are missing entities from dataframe during ingestion. missing_entities: {missing_entities}."
+            f" {cases_message}"
+        )
+    if timestamp_key and timestamp_key not in columns:
+        raise mlrun.errors.MLRunInvalidArgumentError(
+            f"timestamp_key is missing from dataframe during ingestion. timestamp_key: {timestamp_key}."
+            f" {cases_message}"
+        )
+    if label_column and label_column not in columns:
+        raise mlrun.errors.MLRunInvalidArgumentError(
+            f"label_column is missing from dataframe during ingestion. label_column: {label_column}. "
+            f"{cases_message}"
+        )
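The new `check_special_columns_exists()` helper validates, case-sensitively, that the entity, timestamp, and label columns referenced for ingestion actually exist in the Spark dataframe. A minimal sketch, assuming a local Spark session and a toy dataframe (all column names are hypothetical):

```python
from pyspark.sql import SparkSession

from mlrun.datastore.spark_utils import check_special_columns_exists

spark = SparkSession.builder.master("local[1]").appName("ingest-check").getOrCreate()
df = spark.createDataFrame(
    [("p1", "2024-01-01", 0.5)], ["patient_id", "timestamp", "label"]
)

# Passes silently when every referenced column is present; otherwise raises
# MLRunInvalidArgumentError with a hint about letter casing.
check_special_columns_exists(
    df, entities=["patient_id"], timestamp_key="timestamp", label_column="label"
)
```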
mlrun/datastore/store_resources.py
CHANGED

@@ -17,7 +17,7 @@
 import mlrun
 import mlrun.artifacts
 from mlrun.config import config
-from mlrun.utils.helpers import
+from mlrun.utils.helpers import parse_artifact_uri

 from ..common.helpers import parse_versioned_object_uri
 from ..platforms.iguazio import parse_path
@@ -27,6 +27,8 @@ from .targets import get_online_target

 def is_store_uri(url):
     """detect if the uri starts with the store schema prefix"""
+    if not url:
+        return False
     return url.startswith(DB_SCHEMA + "://")


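The guard added above makes `is_store_uri()` tolerate empty input instead of failing on `None.startswith()`; for example:

```python
from mlrun.datastore.store_resources import is_store_uri

assert is_store_uri("store://artifacts/my-project/my-artifact")  # store:// prefix
assert not is_store_uri(None)  # previously raised AttributeError
assert not is_store_uri("")
```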
@@ -146,7 +148,11 @@ def get_store_resource(

     db = db or mlrun.get_run_db(secrets=secrets)
     kind, uri = parse_store_uri(uri)
-    if kind
+    if not kind:
+        raise mlrun.errors.MLRunInvalidArgumentError(
+            f"Cannot get store resource from invalid URI: {uri}"
+        )
+    elif kind == StorePrefix.FeatureSet:
         project, name, tag, uid = parse_versioned_object_uri(
             uri, project or config.default_project
         )
@@ -167,11 +173,7 @@ def get_store_resource(
         )
         if resource.get("kind", "") == "link":
             # todo: support other link types (not just iter, move this to the db/api layer
-            link_iteration = (
-                resource.get("link_iteration", 0)
-                if is_legacy_artifact(resource)
-                else resource["spec"].get("link_iteration", 0)
-            )
+            link_iteration = resource["spec"].get("link_iteration", 0)

             resource = db.read_artifact(
                 key,