mlrun 1.7.0rc14__py3-none-any.whl → 1.7.0rc22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic.
- mlrun/__init__.py +10 -1
- mlrun/__main__.py +23 -111
- mlrun/alerts/__init__.py +15 -0
- mlrun/alerts/alert.py +169 -0
- mlrun/api/schemas/__init__.py +4 -3
- mlrun/artifacts/__init__.py +8 -3
- mlrun/artifacts/base.py +36 -253
- mlrun/artifacts/dataset.py +9 -190
- mlrun/artifacts/manager.py +46 -42
- mlrun/artifacts/model.py +9 -141
- mlrun/artifacts/plots.py +14 -375
- mlrun/common/constants.py +65 -3
- mlrun/common/formatters/__init__.py +19 -0
- mlrun/{runtimes/mpijob/v1alpha1.py → common/formatters/artifact.py} +6 -14
- mlrun/common/formatters/base.py +113 -0
- mlrun/common/formatters/function.py +46 -0
- mlrun/common/formatters/pipeline.py +53 -0
- mlrun/common/formatters/project.py +51 -0
- mlrun/{runtimes → common/runtimes}/constants.py +32 -4
- mlrun/common/schemas/__init__.py +10 -5
- mlrun/common/schemas/alert.py +92 -11
- mlrun/common/schemas/api_gateway.py +56 -0
- mlrun/common/schemas/artifact.py +15 -5
- mlrun/common/schemas/auth.py +2 -0
- mlrun/common/schemas/client_spec.py +1 -0
- mlrun/common/schemas/frontend_spec.py +1 -0
- mlrun/common/schemas/function.py +4 -0
- mlrun/common/schemas/model_monitoring/__init__.py +15 -3
- mlrun/common/schemas/model_monitoring/constants.py +58 -7
- mlrun/common/schemas/model_monitoring/grafana.py +9 -5
- mlrun/common/schemas/model_monitoring/model_endpoints.py +86 -2
- mlrun/common/schemas/pipeline.py +0 -9
- mlrun/common/schemas/project.py +5 -11
- mlrun/common/types.py +1 -0
- mlrun/config.py +30 -9
- mlrun/data_types/to_pandas.py +9 -9
- mlrun/datastore/base.py +41 -9
- mlrun/datastore/datastore.py +6 -2
- mlrun/datastore/datastore_profile.py +56 -4
- mlrun/datastore/inmem.py +2 -2
- mlrun/datastore/redis.py +2 -2
- mlrun/datastore/s3.py +5 -0
- mlrun/datastore/sources.py +147 -7
- mlrun/datastore/store_resources.py +7 -7
- mlrun/datastore/targets.py +110 -42
- mlrun/datastore/utils.py +42 -0
- mlrun/db/base.py +54 -10
- mlrun/db/httpdb.py +282 -79
- mlrun/db/nopdb.py +52 -10
- mlrun/errors.py +11 -0
- mlrun/execution.py +26 -9
- mlrun/feature_store/__init__.py +0 -2
- mlrun/feature_store/api.py +12 -47
- mlrun/feature_store/feature_set.py +9 -0
- mlrun/feature_store/feature_vector.py +8 -0
- mlrun/feature_store/ingestion.py +7 -6
- mlrun/feature_store/retrieval/base.py +9 -4
- mlrun/feature_store/retrieval/conversion.py +9 -9
- mlrun/feature_store/retrieval/dask_merger.py +2 -0
- mlrun/feature_store/retrieval/job.py +9 -3
- mlrun/feature_store/retrieval/local_merger.py +2 -0
- mlrun/feature_store/retrieval/spark_merger.py +16 -0
- mlrun/frameworks/__init__.py +6 -0
- mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +7 -12
- mlrun/frameworks/parallel_coordinates.py +2 -1
- mlrun/frameworks/tf_keras/__init__.py +4 -1
- mlrun/k8s_utils.py +10 -11
- mlrun/launcher/base.py +4 -3
- mlrun/launcher/client.py +5 -3
- mlrun/launcher/local.py +12 -2
- mlrun/launcher/remote.py +9 -2
- mlrun/lists.py +6 -2
- mlrun/model.py +47 -21
- mlrun/model_monitoring/__init__.py +1 -1
- mlrun/model_monitoring/api.py +42 -18
- mlrun/model_monitoring/application.py +5 -305
- mlrun/model_monitoring/applications/__init__.py +11 -0
- mlrun/model_monitoring/applications/_application_steps.py +157 -0
- mlrun/model_monitoring/applications/base.py +280 -0
- mlrun/model_monitoring/applications/context.py +214 -0
- mlrun/model_monitoring/applications/evidently_base.py +211 -0
- mlrun/model_monitoring/applications/histogram_data_drift.py +132 -91
- mlrun/model_monitoring/applications/results.py +99 -0
- mlrun/model_monitoring/controller.py +3 -1
- mlrun/model_monitoring/db/__init__.py +2 -0
- mlrun/model_monitoring/db/stores/__init__.py +0 -2
- mlrun/model_monitoring/db/stores/base/store.py +22 -37
- mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +43 -21
- mlrun/model_monitoring/db/stores/sqldb/models/base.py +39 -8
- mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +27 -7
- mlrun/model_monitoring/db/stores/sqldb/models/sqlite.py +5 -0
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +246 -224
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +232 -216
- mlrun/model_monitoring/db/tsdb/__init__.py +100 -0
- mlrun/model_monitoring/db/tsdb/base.py +316 -0
- mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
- mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +240 -0
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +45 -0
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +401 -0
- mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +117 -0
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +658 -0
- mlrun/model_monitoring/evidently_application.py +6 -118
- mlrun/model_monitoring/helpers.py +63 -1
- mlrun/model_monitoring/model_endpoint.py +3 -2
- mlrun/model_monitoring/stream_processing.py +57 -216
- mlrun/model_monitoring/writer.py +134 -124
- mlrun/package/__init__.py +13 -1
- mlrun/package/packagers/__init__.py +6 -1
- mlrun/package/utils/_formatter.py +2 -2
- mlrun/platforms/__init__.py +10 -9
- mlrun/platforms/iguazio.py +21 -202
- mlrun/projects/operations.py +24 -12
- mlrun/projects/pipelines.py +79 -102
- mlrun/projects/project.py +271 -103
- mlrun/render.py +15 -14
- mlrun/run.py +16 -46
- mlrun/runtimes/__init__.py +6 -3
- mlrun/runtimes/base.py +14 -7
- mlrun/runtimes/daskjob.py +1 -0
- mlrun/runtimes/databricks_job/databricks_runtime.py +1 -0
- mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
- mlrun/runtimes/funcdoc.py +0 -28
- mlrun/runtimes/kubejob.py +2 -1
- mlrun/runtimes/local.py +12 -3
- mlrun/runtimes/mpijob/__init__.py +0 -20
- mlrun/runtimes/mpijob/v1.py +1 -1
- mlrun/runtimes/nuclio/api_gateway.py +194 -84
- mlrun/runtimes/nuclio/application/application.py +170 -8
- mlrun/runtimes/nuclio/function.py +39 -49
- mlrun/runtimes/pod.py +16 -36
- mlrun/runtimes/remotesparkjob.py +9 -3
- mlrun/runtimes/sparkjob/spark3job.py +1 -1
- mlrun/runtimes/utils.py +6 -45
- mlrun/serving/__init__.py +8 -1
- mlrun/serving/server.py +2 -1
- mlrun/serving/states.py +51 -8
- mlrun/serving/utils.py +19 -11
- mlrun/serving/v2_serving.py +5 -1
- mlrun/track/tracker.py +2 -1
- mlrun/utils/async_http.py +25 -5
- mlrun/utils/helpers.py +157 -83
- mlrun/utils/logger.py +39 -7
- mlrun/utils/notifications/notification/__init__.py +14 -9
- mlrun/utils/notifications/notification/base.py +1 -1
- mlrun/utils/notifications/notification/slack.py +34 -7
- mlrun/utils/notifications/notification/webhook.py +1 -1
- mlrun/utils/notifications/notification_pusher.py +147 -16
- mlrun/utils/regex.py +9 -0
- mlrun/utils/v3io_clients.py +0 -1
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc22.dist-info}/METADATA +14 -6
- {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc22.dist-info}/RECORD +158 -138
- mlrun/kfpops.py +0 -865
- mlrun/platforms/other.py +0 -305
- {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc22.dist-info}/LICENSE +0 -0
- {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc22.dist-info}/WHEEL +0 -0
- {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc22.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc22.dist-info}/top_level.txt +0 -0
mlrun/datastore/datastore.py
CHANGED
@@ -223,6 +223,11 @@ class StoreManager:
             subpath = url[len("memory://") :]
             return in_memory_store, subpath, url
 
+        elif schema in get_local_file_schema():
+            # parse_url() will drop the windows drive-letter from the path for url like "c:\a\b".
+            # As a workaround, we set subpath to the url.
+            subpath = url.replace("file://", "", 1)
+
         if not schema and endpoint:
             if endpoint in self._stores.keys():
                 return self._stores[endpoint], subpath, url
@@ -241,8 +246,7 @@ class StoreManager:
         )
         if not secrets and not mlrun.config.is_running_as_api():
             self._stores[store_key] = store
-
-        return store, url if store.kind == "file" else subpath, url
+        return store, subpath, url
 
     def reset_secrets(self):
         self._secrets = {}
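
Note on the change above: the drive-letter comment refers to standard URL parsing, where a Windows path such as "c:\a\b" is read as having the scheme "c", so the drive letter disappears from the parsed path. A minimal sketch of that pitfall, assuming parse_url() follows urllib.parse semantics (this is illustrative, not code from the package):

    from urllib.parse import urlparse

    # "c:" looks like a URL scheme, so the drive letter is dropped from the path component
    parsed = urlparse(r"c:\a\b")
    print(parsed.scheme, parsed.path)  # -> scheme='c', path='\a\b'

    # the workaround keeps the full URL (minus an optional "file://" prefix) as the subpath
    subpath = r"file://c:\a\b".replace("file://", "", 1)
    print(subpath)  # -> c:\a\b
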
mlrun/datastore/datastore_profile.py
CHANGED
@@ -37,6 +37,7 @@ class DatastoreProfile(pydantic.BaseModel):
         extra = pydantic.Extra.forbid
 
     @pydantic.validator("name")
+    @classmethod
     def lower_case(cls, v):
         return v.lower()
 
@@ -185,6 +186,18 @@ class DatastoreProfileS3(DatastoreProfile):
     assume_role_arn: typing.Optional[str] = None
     access_key_id: typing.Optional[str] = None
     secret_key: typing.Optional[str] = None
+    bucket: typing.Optional[str] = None
+
+    @pydantic.validator("bucket")
+    @classmethod
+    def check_bucket(cls, v):
+        if not v:
+            warnings.warn(
+                "The 'bucket' attribute will be mandatory starting from version 1.9",
+                FutureWarning,
+                stacklevel=2,
+            )
+        return v
 
     def secrets(self) -> dict:
         res = {}
@@ -203,7 +216,13 @@ class DatastoreProfileS3(DatastoreProfile):
         return res
 
     def url(self, subpath):
-        return f"s3:/{subpath}"
+        # TODO: There is an inconsistency with DatastoreProfileGCS. In DatastoreProfileGCS,
+        # we assume that the subpath can begin without a '/' character,
+        # while here we assume it always starts with one.
+        if self.bucket:
+            return f"s3://{self.bucket}{subpath}"
+        else:
+            return f"s3:/{subpath}"
 
 
 class DatastoreProfileRedis(DatastoreProfile):
@@ -272,18 +291,36 @@ class DatastoreProfileGCS(DatastoreProfile):
     _private_attributes = ("gcp_credentials",)
     credentials_path: typing.Optional[str] = None # path to file.
     gcp_credentials: typing.Optional[typing.Union[str, dict]] = None
+    bucket: typing.Optional[str] = None
+
+    @pydantic.validator("bucket")
+    @classmethod
+    def check_bucket(cls, v):
+        if not v:
+            warnings.warn(
+                "The 'bucket' attribute will be mandatory starting from version 1.9",
+                FutureWarning,
+                stacklevel=2,
+            )
+        return v
 
     @pydantic.validator("gcp_credentials", pre=True, always=True)
+    @classmethod
     def convert_dict_to_json(cls, v):
         if isinstance(v, dict):
             return json.dumps(v)
         return v
 
     def url(self, subpath) -> str:
+        # TODO: but there's something wrong with the subpath being assumed to not start with a slash here,
+        # but the opposite assumption is made in S3.
         if subpath.startswith("/"):
             # in gcs the path after schema is starts with bucket, wherefore it should not start with "/".
             subpath = subpath[1:]
-        return f"gcs://{subpath}"
+        if self.bucket:
+            return f"gcs://{self.bucket}/{subpath}"
+        else:
+            return f"gcs://{subpath}"
 
     def secrets(self) -> dict:
         res = {}
@@ -311,12 +348,27 @@ class DatastoreProfileAzureBlob(DatastoreProfile):
     client_secret: typing.Optional[str] = None
     sas_token: typing.Optional[str] = None
     credential: typing.Optional[str] = None
+    container: typing.Optional[str] = None
+
+    @pydantic.validator("container")
+    @classmethod
+    def check_container(cls, v):
+        if not v:
+            warnings.warn(
+                "The 'container' attribute will be mandatory starting from version 1.9",
+                FutureWarning,
+                stacklevel=2,
+            )
+        return v
 
     def url(self, subpath) -> str:
         if subpath.startswith("/"):
-            # in azure the path after schema is starts with
+            # in azure the path after schema is starts with container, wherefore it should not start with "/".
             subpath = subpath[1:]
-        return f"az://{subpath}"
+        if self.container:
+            return f"az://{self.container}/{subpath}"
+        else:
+            return f"az://{subpath}"
 
     def secrets(self) -> dict:
         res = {}
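
The datastore-profile changes above add an optional bucket (or container) attribute to the S3, GCS, and Azure Blob profiles, fold it into the generated URL, and warn when it is missing. A rough usage sketch based on the added code; profile names and paths are illustrative, and it assumes the profile can be constructed with only the fields shown:

    from mlrun.datastore.datastore_profile import DatastoreProfileS3

    # with the new optional "bucket" field, the bucket is baked into the generated URL
    profile = DatastoreProfileS3(name="my-s3", bucket="my-bucket")
    print(profile.url("/data/set.parquet"))           # s3://my-bucket/data/set.parquet

    # omitting the bucket keeps the old behavior (the first path segment acts as the
    # bucket) but now emits a FutureWarning:
    # "The 'bucket' attribute will be mandatory starting from version 1.9"
    legacy = DatastoreProfileS3(name="legacy-s3")
    print(legacy.url("/my-bucket/data/set.parquet"))  # s3://my-bucket/data/set.parquet
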
mlrun/datastore/inmem.py
CHANGED
@@ -80,8 +80,8 @@ class InMemoryStore(DataStore):
             reader = df_module.read_json
         else:
             raise mlrun.errors.MLRunInvalidArgumentError(f"file type unhandled {url}")
-        # InMemoryStore store
-        for field in ["time_column", "start_time", "end_time"]:
+        # InMemoryStore store – don't pass filters
+        for field in ["time_column", "start_time", "end_time", "additional_filters"]:
             kwargs.pop(field, None)
 
         return reader(item, **kwargs)
mlrun/datastore/redis.py
CHANGED
@@ -31,7 +31,7 @@ class RedisStore(DataStore):
     """
 
     def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
-
+        redis_default_port = "6379"
         super().__init__(parent, name, schema, endpoint, secrets=secrets)
         self.headers = None
 
@@ -49,7 +49,7 @@ class RedisStore(DataStore):
         user = self._get_secret_or_env("REDIS_USER", "", credentials_prefix)
         password = self._get_secret_or_env("REDIS_PASSWORD", "", credentials_prefix)
         host = parsed_endpoint.hostname
-        port = parsed_endpoint.port if parsed_endpoint.port else
+        port = parsed_endpoint.port if parsed_endpoint.port else redis_default_port
         schema = parsed_endpoint.scheme
         if user or password:
             endpoint = f"{schema}://{user}:{password}@{host}:{port}"
mlrun/datastore/s3.py
CHANGED
@@ -198,6 +198,11 @@ class S3Store(DataStore):
         bucket = self.s3.Bucket(bucket)
         return [obj.key[key_length:] for obj in bucket.objects.filter(Prefix=key)]
 
+    def rm(self, path, recursive=False, maxdepth=None):
+        bucket, key = self.get_bucket_and_key(path)
+        path = f"{bucket}/{key}"
+        self.filesystem.rm(path=path, recursive=recursive, maxdepth=maxdepth)
+
 
 def parse_s3_bucket_and_key(s3_path):
     try:
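
The new S3Store.rm() rejoins the bucket and key and delegates to the underlying fsspec filesystem. A standalone sketch of the same pattern using s3fs directly; the bucket and prefix are placeholders, and this is not the package's own API surface:

    import s3fs  # fsspec-based S3 filesystem, the backend S3Store wraps

    fs = s3fs.S3FileSystem(anon=False)

    # roughly what S3Store.rm("s3://my-bucket/artifacts/run-1/", recursive=True) does:
    # strip the scheme, keep "bucket/key", and hand it to fsspec's rm()
    fs.rm("my-bucket/artifacts/run-1/", recursive=True, maxdepth=None)
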
mlrun/datastore/sources.py
CHANGED
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import json
+import operator
 import os
 import warnings
 from base64 import b64encode
@@ -29,6 +30,7 @@ from nuclio.config import split_path
 import mlrun
 from mlrun.config import config
 from mlrun.datastore.snowflake_utils import get_snowflake_spark_options
+from mlrun.datastore.utils import transform_list_filters_to_tuple
 from mlrun.secrets import SecretsStore
 
 from ..model import DataSource
@@ -102,8 +104,12 @@ class BaseSourceDriver(DataSource):
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
         """return the source data as dataframe"""
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
         return mlrun.store_manager.object(url=self.path).as_df(
             columns=columns,
             df_module=df_module,
@@ -174,7 +180,7 @@ class CSVSource(BaseSourceDriver):
         self,
         name: str = "",
         path: str = None,
-        attributes: dict[str,
+        attributes: dict[str, object] = None,
         key_field: str = None,
         schedule: str = None,
         parse_dates: Union[None, int, str, list[int], list[str]] = None,
@@ -245,7 +251,11 @@ class CSVSource(BaseSourceDriver):
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
         reader_args = self.attributes.get("reader_args", {})
         return mlrun.store_manager.object(url=self.path).as_df(
             columns=columns,
@@ -281,6 +291,12 @@ class ParquetSource(BaseSourceDriver):
     :parameter start_time: filters out data before this time
     :parameter end_time: filters out data after this time
     :parameter attributes: additional parameters to pass to storey.
+    :param additional_filters: List of additional_filter conditions as tuples.
+        Each tuple should be in the format (column_name, operator, value).
+        Supported operators: "=", ">=", "<=", ">", "<".
+        Example: [("Product", "=", "Computer")]
+        For all supported filters, please see:
+        https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetDataset.html
     """
 
     kind = "parquet"
@@ -291,13 +307,19 @@ class ParquetSource(BaseSourceDriver):
         self,
         name: str = "",
         path: str = None,
-        attributes: dict[str,
+        attributes: dict[str, object] = None,
         key_field: str = None,
         time_field: str = None,
         schedule: str = None,
         start_time: Optional[Union[datetime, str]] = None,
         end_time: Optional[Union[datetime, str]] = None,
+        additional_filters: Optional[list[Union[tuple, list]]] = None,
     ):
+        if additional_filters:
+            attributes = copy(attributes) or {}
+            additional_filters = transform_list_filters_to_tuple(additional_filters)
+            attributes["additional_filters"] = additional_filters
+
         super().__init__(
             name,
             path,
@@ -325,6 +347,10 @@ class ParquetSource(BaseSourceDriver):
     def end_time(self, end_time):
         self._end_time = self._convert_to_datetime(end_time)
 
+    @property
+    def additional_filters(self):
+        return self.attributes.get("additional_filters")
+
     @staticmethod
     def _convert_to_datetime(time):
         if time and isinstance(time, str):
@@ -341,16 +367,17 @@ class ParquetSource(BaseSourceDriver):
         start_time=None,
         end_time=None,
         context=None,
+        additional_filters=None,
     ):
         import storey
 
-        attributes = self.attributes
+        attributes = copy(self.attributes)
+        attributes.pop("additional_filters", None)
         if context:
             attributes["context"] = context
-
+        additional_filters = transform_list_filters_to_tuple(additional_filters)
         data_item = mlrun.store_manager.object(self.path)
         store, path, url = mlrun.store_manager.get_or_create_store(self.path)
-
         return storey.ParquetSource(
             paths=url, # unlike self.path, it already has store:// replaced
             key_field=self.key_field or key_field,
@@ -358,9 +385,20 @@ class ParquetSource(BaseSourceDriver):
             end_filter=self.end_time,
             start_filter=self.start_time,
             filter_column=self.time_field or time_field,
+            additional_filters=self.additional_filters or additional_filters,
             **attributes,
         )
 
+    @classmethod
+    def from_dict(cls, struct=None, fields=None, deprecated_fields: dict = None):
+        new_obj = super().from_dict(
+            struct=struct, fields=fields, deprecated_fields=deprecated_fields
+        )
+        new_obj.attributes["additional_filters"] = transform_list_filters_to_tuple(
+            new_obj.additional_filters
+        )
+        return new_obj
+
     def get_spark_options(self):
         store, path, _ = mlrun.store_manager.get_or_create_store(self.path)
         spark_options = store.get_spark_options()
@@ -380,8 +418,10 @@ class ParquetSource(BaseSourceDriver):
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
         reader_args = self.attributes.get("reader_args", {})
+        additional_filters = transform_list_filters_to_tuple(additional_filters)
         return mlrun.store_manager.object(url=self.path).as_df(
             columns=columns,
             df_module=df_module,
@@ -389,9 +429,88 @@ class ParquetSource(BaseSourceDriver):
             end_time=end_time or self.end_time,
             time_column=time_field or self.time_field,
             format="parquet",
+            additional_filters=additional_filters or self.additional_filters,
             **reader_args,
         )
 
+    def _build_spark_additional_filters(self, column_types: dict):
+        if not self.additional_filters:
+            return None
+        from pyspark.sql.functions import col, isnan, lit
+
+        operators = {
+            "==": operator.eq,
+            "=": operator.eq,
+            ">": operator.gt,
+            "<": operator.lt,
+            ">=": operator.ge,
+            "<=": operator.le,
+            "!=": operator.ne,
+        }
+
+        spark_filter = None
+        new_filter = lit(True)
+        for filter_tuple in self.additional_filters:
+            if not filter_tuple:
+                continue
+            col_name, op, value = filter_tuple
+            if op.lower() in ("in", "not in") and isinstance(value, (list, tuple, set)):
+                none_exists = False
+                value = list(value)
+                for sub_value in value:
+                    if sub_value is None:
+                        value.remove(sub_value)
+                        none_exists = True
+                if none_exists:
+                    filter_nan = column_types[col_name] not in ("timestamp", "date")
+                    if value:
+                        if op.lower() == "in":
+                            new_filter = (
+                                col(col_name).isin(value) | col(col_name).isNull()
+                            )
+                            if filter_nan:
+                                new_filter = new_filter | isnan(col(col_name))
+
+                        else:
+                            new_filter = (
+                                ~col(col_name).isin(value) & ~col(col_name).isNull()
+                            )
+                            if filter_nan:
+                                new_filter = new_filter & ~isnan(col(col_name))
+                    else:
+                        if op.lower() == "in":
+                            new_filter = col(col_name).isNull()
+                            if filter_nan:
+                                new_filter = new_filter | isnan(col(col_name))
+                        else:
+                            new_filter = ~col(col_name).isNull()
+                            if filter_nan:
+                                new_filter = new_filter & ~isnan(col(col_name))
+                else:
+                    if op.lower() == "in":
+                        new_filter = col(col_name).isin(value)
+                    elif op.lower() == "not in":
+                        new_filter = ~col(col_name).isin(value)
+            elif op in operators:
+                new_filter = operators[op](col(col_name), value)
+            else:
+                raise mlrun.errors.MLRunInvalidArgumentError(
+                    f"unsupported filter operator: {op}"
+                )
+            if spark_filter is not None:
+                spark_filter = spark_filter & new_filter
+            else:
+                spark_filter = new_filter
+        return spark_filter
+
+    def _filter_spark_df(self, df, time_field=None, columns=None):
+        spark_additional_filters = self._build_spark_additional_filters(
+            column_types=dict(df.dtypes)
+        )
+        if spark_additional_filters is not None:
+            df = df.filter(spark_additional_filters)
+        return super()._filter_spark_df(df=df, time_field=time_field, columns=columns)
+
 
 class BigQuerySource(BaseSourceDriver):
     """
@@ -519,10 +638,15 @@ class BigQuerySource(BaseSourceDriver):
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
         from google.cloud import bigquery
         from google.cloud.bigquery_storage_v1 import BigQueryReadClient
 
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
+
         def schema_to_dtypes(schema):
             from mlrun.data_types.data_types import gbq_to_pandas_dtype
 
@@ -562,7 +686,6 @@ class BigQuerySource(BaseSourceDriver):
         else:
             df = rows_iterator.to_dataframe(dtypes=dtypes)
 
-        # TODO : filter as part of the query
         return select_columns_from_df(
             filter_df_start_end_time(
                 df,
@@ -740,7 +863,19 @@ class DataFrameSource:
             context=self.context or context,
         )
 
-    def to_dataframe(
+    def to_dataframe(
+        self,
+        columns=None,
+        df_module=None,
+        entities=None,
+        start_time=None,
+        end_time=None,
+        time_field=None,
+        additional_filters=None,
+    ):
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
         return self._df
 
     def is_iterator(self):
@@ -935,6 +1070,7 @@ class KafkaSource(OnlineSource):
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
         raise mlrun.MLRunInvalidArgumentError(
             "KafkaSource does not support batch processing"
@@ -1075,9 +1211,13 @@ class SQLSource(BaseSourceDriver):
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
         import sqlalchemy as sqlalchemy
 
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
         db_path = self.attributes.get("db_path")
         table_name = self.attributes.get("table_name")
         parse_dates = self.attributes.get("parse_dates")
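
The ParquetSource changes above introduce the additional_filters parameter described in the new docstring: filters are normalized to tuples, pushed down for the pandas path via as_df(), and rebuilt as pyspark Column expressions for the Spark path. A short usage sketch based on the documented signature; the path is illustrative:

    from mlrun.datastore.sources import ParquetSource

    # push-down filters as (column, operator, value) tuples, per the new docstring;
    # "in"/"not in" with a list of values is handled by the new Spark filter builder
    source = ParquetSource(
        name="sales",
        path="s3://my-bucket/sales.parquet",
        additional_filters=[
            ("Product", "=", "Computer"),
            ("Quantity", ">=", 10),
        ],
    )

    # the filters are kept in the source attributes and exposed via the new property
    print(source.additional_filters)

    # pandas path: filters are forwarded when reading the parquet into a dataframe
    df = source.to_dataframe()
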
mlrun/datastore/store_resources.py
CHANGED
@@ -17,7 +17,7 @@
 import mlrun
 import mlrun.artifacts
 from mlrun.config import config
-from mlrun.utils.helpers import
+from mlrun.utils.helpers import parse_artifact_uri
 
 from ..common.helpers import parse_versioned_object_uri
 from ..platforms.iguazio import parse_path
@@ -146,7 +146,11 @@ def get_store_resource(
 
     db = db or mlrun.get_run_db(secrets=secrets)
     kind, uri = parse_store_uri(uri)
-    if kind
+    if not kind:
+        raise mlrun.errors.MLRunInvalidArgumentError(
+            f"Cannot get store resource from invalid URI: {uri}"
+        )
+    elif kind == StorePrefix.FeatureSet:
         project, name, tag, uid = parse_versioned_object_uri(
             uri, project or config.default_project
         )
@@ -167,11 +171,7 @@ def get_store_resource(
     )
     if resource.get("kind", "") == "link":
         # todo: support other link types (not just iter, move this to the db/api layer
-        link_iteration = (
-            resource.get("link_iteration", 0)
-            if is_legacy_artifact(resource)
-            else resource["spec"].get("link_iteration", 0)
-        )
+        link_iteration = resource["spec"].get("link_iteration", 0)
 
         resource = db.read_artifact(
             key,