mlrun 1.7.0rc13__py3-none-any.whl → 1.7.0rc21__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of mlrun might be problematic.
- mlrun/__init__.py +10 -1
- mlrun/__main__.py +23 -111
- mlrun/alerts/__init__.py +15 -0
- mlrun/alerts/alert.py +144 -0
- mlrun/api/schemas/__init__.py +4 -3
- mlrun/artifacts/__init__.py +8 -3
- mlrun/artifacts/base.py +36 -253
- mlrun/artifacts/dataset.py +9 -190
- mlrun/artifacts/manager.py +46 -42
- mlrun/artifacts/model.py +9 -141
- mlrun/artifacts/plots.py +14 -375
- mlrun/common/constants.py +65 -3
- mlrun/common/formatters/__init__.py +19 -0
- mlrun/{runtimes/mpijob/v1alpha1.py → common/formatters/artifact.py} +6 -14
- mlrun/common/formatters/base.py +113 -0
- mlrun/common/formatters/function.py +46 -0
- mlrun/common/formatters/pipeline.py +53 -0
- mlrun/common/formatters/project.py +51 -0
- mlrun/{runtimes → common/runtimes}/constants.py +32 -4
- mlrun/common/schemas/__init__.py +10 -5
- mlrun/common/schemas/alert.py +92 -11
- mlrun/common/schemas/api_gateway.py +56 -0
- mlrun/common/schemas/artifact.py +15 -5
- mlrun/common/schemas/auth.py +2 -0
- mlrun/common/schemas/client_spec.py +1 -0
- mlrun/common/schemas/frontend_spec.py +1 -0
- mlrun/common/schemas/function.py +4 -0
- mlrun/common/schemas/model_monitoring/__init__.py +15 -3
- mlrun/common/schemas/model_monitoring/constants.py +58 -7
- mlrun/common/schemas/model_monitoring/grafana.py +9 -5
- mlrun/common/schemas/model_monitoring/model_endpoints.py +86 -2
- mlrun/common/schemas/pipeline.py +0 -9
- mlrun/common/schemas/project.py +6 -11
- mlrun/common/types.py +1 -0
- mlrun/config.py +36 -8
- mlrun/data_types/to_pandas.py +9 -9
- mlrun/datastore/base.py +41 -9
- mlrun/datastore/datastore.py +6 -2
- mlrun/datastore/datastore_profile.py +56 -4
- mlrun/datastore/hdfs.py +5 -0
- mlrun/datastore/inmem.py +2 -2
- mlrun/datastore/redis.py +2 -2
- mlrun/datastore/s3.py +5 -0
- mlrun/datastore/sources.py +147 -7
- mlrun/datastore/store_resources.py +7 -7
- mlrun/datastore/targets.py +129 -9
- mlrun/datastore/utils.py +42 -0
- mlrun/datastore/v3io.py +1 -1
- mlrun/db/auth_utils.py +152 -0
- mlrun/db/base.py +55 -11
- mlrun/db/httpdb.py +346 -107
- mlrun/db/nopdb.py +52 -10
- mlrun/errors.py +11 -0
- mlrun/execution.py +24 -9
- mlrun/feature_store/__init__.py +0 -2
- mlrun/feature_store/api.py +12 -47
- mlrun/feature_store/feature_set.py +9 -0
- mlrun/feature_store/feature_vector.py +8 -0
- mlrun/feature_store/ingestion.py +7 -6
- mlrun/feature_store/retrieval/base.py +9 -4
- mlrun/feature_store/retrieval/conversion.py +9 -9
- mlrun/feature_store/retrieval/dask_merger.py +2 -0
- mlrun/feature_store/retrieval/job.py +9 -3
- mlrun/feature_store/retrieval/local_merger.py +2 -0
- mlrun/feature_store/retrieval/spark_merger.py +16 -0
- mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +7 -12
- mlrun/frameworks/parallel_coordinates.py +2 -1
- mlrun/frameworks/tf_keras/__init__.py +4 -1
- mlrun/k8s_utils.py +10 -11
- mlrun/launcher/base.py +4 -3
- mlrun/launcher/client.py +5 -3
- mlrun/launcher/local.py +8 -2
- mlrun/launcher/remote.py +8 -2
- mlrun/lists.py +6 -2
- mlrun/model.py +62 -20
- mlrun/model_monitoring/__init__.py +1 -1
- mlrun/model_monitoring/api.py +41 -18
- mlrun/model_monitoring/application.py +5 -305
- mlrun/model_monitoring/applications/__init__.py +11 -0
- mlrun/model_monitoring/applications/_application_steps.py +157 -0
- mlrun/model_monitoring/applications/base.py +280 -0
- mlrun/model_monitoring/applications/context.py +214 -0
- mlrun/model_monitoring/applications/evidently_base.py +211 -0
- mlrun/model_monitoring/applications/histogram_data_drift.py +132 -91
- mlrun/model_monitoring/applications/results.py +99 -0
- mlrun/model_monitoring/controller.py +3 -1
- mlrun/model_monitoring/db/__init__.py +2 -0
- mlrun/model_monitoring/db/stores/__init__.py +0 -2
- mlrun/model_monitoring/db/stores/base/store.py +22 -37
- mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +43 -21
- mlrun/model_monitoring/db/stores/sqldb/models/base.py +39 -8
- mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +27 -7
- mlrun/model_monitoring/db/stores/sqldb/models/sqlite.py +5 -0
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +246 -224
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +232 -216
- mlrun/model_monitoring/db/tsdb/__init__.py +100 -0
- mlrun/model_monitoring/db/tsdb/base.py +329 -0
- mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
- mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +240 -0
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +45 -0
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +397 -0
- mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +117 -0
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +636 -0
- mlrun/model_monitoring/evidently_application.py +6 -118
- mlrun/model_monitoring/helpers.py +46 -1
- mlrun/model_monitoring/model_endpoint.py +3 -2
- mlrun/model_monitoring/stream_processing.py +57 -216
- mlrun/model_monitoring/writer.py +134 -124
- mlrun/package/utils/_formatter.py +2 -2
- mlrun/platforms/__init__.py +10 -9
- mlrun/platforms/iguazio.py +21 -202
- mlrun/projects/operations.py +19 -12
- mlrun/projects/pipelines.py +103 -109
- mlrun/projects/project.py +377 -137
- mlrun/render.py +15 -14
- mlrun/run.py +16 -47
- mlrun/runtimes/__init__.py +6 -3
- mlrun/runtimes/base.py +8 -7
- mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
- mlrun/runtimes/funcdoc.py +0 -28
- mlrun/runtimes/kubejob.py +2 -1
- mlrun/runtimes/local.py +5 -2
- mlrun/runtimes/mpijob/__init__.py +0 -20
- mlrun/runtimes/mpijob/v1.py +1 -1
- mlrun/runtimes/nuclio/api_gateway.py +440 -208
- mlrun/runtimes/nuclio/application/application.py +170 -8
- mlrun/runtimes/nuclio/function.py +39 -49
- mlrun/runtimes/pod.py +21 -41
- mlrun/runtimes/remotesparkjob.py +9 -3
- mlrun/runtimes/sparkjob/spark3job.py +1 -1
- mlrun/runtimes/utils.py +6 -45
- mlrun/serving/server.py +2 -1
- mlrun/serving/states.py +53 -2
- mlrun/serving/v2_serving.py +5 -1
- mlrun/track/tracker.py +2 -1
- mlrun/utils/async_http.py +25 -5
- mlrun/utils/helpers.py +107 -75
- mlrun/utils/logger.py +39 -7
- mlrun/utils/notifications/notification/__init__.py +14 -9
- mlrun/utils/notifications/notification/base.py +1 -1
- mlrun/utils/notifications/notification/slack.py +61 -13
- mlrun/utils/notifications/notification/webhook.py +1 -1
- mlrun/utils/notifications/notification_pusher.py +147 -16
- mlrun/utils/regex.py +9 -0
- mlrun/utils/v3io_clients.py +0 -1
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.7.0rc13.dist-info → mlrun-1.7.0rc21.dist-info}/METADATA +14 -6
- {mlrun-1.7.0rc13.dist-info → mlrun-1.7.0rc21.dist-info}/RECORD +154 -133
- mlrun/kfpops.py +0 -865
- mlrun/platforms/other.py +0 -305
- {mlrun-1.7.0rc13.dist-info → mlrun-1.7.0rc21.dist-info}/LICENSE +0 -0
- {mlrun-1.7.0rc13.dist-info → mlrun-1.7.0rc21.dist-info}/WHEEL +0 -0
- {mlrun-1.7.0rc13.dist-info → mlrun-1.7.0rc21.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.0rc13.dist-info → mlrun-1.7.0rc21.dist-info}/top_level.txt +0 -0
mlrun/db/nopdb.py
CHANGED

@@ -16,6 +16,9 @@
 import datetime
 from typing import Optional, Union
 
+import mlrun.alerts
+import mlrun.common.formatters
+import mlrun.common.runtimes.constants
 import mlrun.common.schemas
 import mlrun.errors
 
@@ -79,7 +82,10 @@ class NopDB(RunDBInterface):
         uid: Optional[Union[str, list[str]]] = None,
         project: Optional[str] = None,
         labels: Optional[Union[str, list[str]]] = None,
-        state: Optional[
+        state: Optional[
+            mlrun.common.runtimes.constants.RunStates
+        ] = None,  # Backward compatibility
+        states: Optional[list[mlrun.common.runtimes.constants.RunStates]] = None,
         sort: bool = True,
         last: int = 0,
         iter: bool = False,

@@ -128,7 +134,18 @@ class NopDB(RunDBInterface):
     ):
         pass
 
-    def del_artifact(
+    def del_artifact(
+        self,
+        key,
+        tag="",
+        project="",
+        tree=None,
+        uid=None,
+        deletion_strategy: mlrun.common.schemas.artifact.ArtifactsDeletionStrategies = (
+            mlrun.common.schemas.artifact.ArtifactsDeletionStrategies.metadata_only
+        ),
+        secrets: dict = None,
+    ):
         pass
 
     def del_artifacts(self, name="", project="", tag="", labels=None):

@@ -196,7 +213,7 @@ class NopDB(RunDBInterface):
     def list_projects(
         self,
         owner: str = None,
-        format_: mlrun.common.
+        format_: mlrun.common.formatters.ProjectFormat = mlrun.common.formatters.ProjectFormat.name_only,
         labels: list[str] = None,
         state: mlrun.common.schemas.ProjectState = None,
     ) -> mlrun.common.schemas.ProjectsOutput:

@@ -351,8 +368,8 @@ class NopDB(RunDBInterface):
         namespace: str = None,
         timeout: int = 30,
         format_: Union[
-            str, mlrun.common.
-        ] = mlrun.common.
+            str, mlrun.common.formatters.PipelineFormat
+        ] = mlrun.common.formatters.PipelineFormat.summary,
         project: str = None,
     ):
         pass

@@ -365,8 +382,8 @@ class NopDB(RunDBInterface):
         page_token: str = "",
         filter_: str = "",
         format_: Union[
-            str, mlrun.common.
-        ] = mlrun.common.
+            str, mlrun.common.formatters.PipelineFormat
+        ] = mlrun.common.formatters.PipelineFormat.metadata_only,
         page_size: int = None,
     ) -> mlrun.common.schemas.PipelinesOutput:
         pass

@@ -508,8 +525,11 @@ class NopDB(RunDBInterface):
 
     def store_api_gateway(
         self,
-
-
+        api_gateway: Union[
+            mlrun.common.schemas.APIGateway,
+            mlrun.runtimes.nuclio.api_gateway.APIGateway,
+        ],
+        project: str = None,
     ) -> mlrun.common.schemas.APIGateway:
         pass
 
@@ -658,6 +678,22 @@ class NopDB(RunDBInterface):
     ) -> None:
         pass
 
+    def disable_model_monitoring(
+        self,
+        project: str,
+        delete_resources: bool = True,
+        delete_stream_function: bool = False,
+        delete_histogram_data_drift_app: bool = True,
+        delete_user_applications: bool = False,
+        user_application_list: list[str] = None,
+    ) -> bool:
+        pass
+
+    def delete_model_monitoring_function(
+        self, project: str, functions: list[str]
+    ) -> bool:
+        pass
+
     def deploy_histogram_data_drift_app(
         self, project: str, image: str = "mlrun/mlrun"
     ) -> None:

@@ -671,7 +707,7 @@ class NopDB(RunDBInterface):
     def store_alert_config(
         self,
         alert_name: str,
-        alert_data: Union[dict, mlrun.
+        alert_data: Union[dict, mlrun.alerts.alert.AlertConfig],
         project="",
     ):
         pass

@@ -687,3 +723,9 @@ class NopDB(RunDBInterface):
 
     def reset_alert_config(self, alert_name: str, project=""):
         pass
+
+    def get_alert_template(self, template_name: str):
+        pass
+
+    def list_alert_templates(self):
+        pass
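Note: NopDB only stubs the RunDBInterface, so the additions above mainly document the new client-side API surface (alert templates, artifact deletion strategies, model-monitoring teardown). A minimal sketch of how these entry points might be called on a configured DB handle; the project and artifact names are placeholders, and it assumes the HTTP client in mlrun/db/httpdb.py implements the same signatures:

    import mlrun
    import mlrun.common.schemas

    # Whichever RunDBInterface implementation is configured (HTTP client, or NopDB offline).
    db = mlrun.get_run_db()

    # New alert-template entry points added to the interface in this release.
    templates = db.list_alert_templates()
    drift_template = db.get_alert_template("drift-detected")  # hypothetical template name

    # del_artifact now accepts a deletion strategy (metadata only vs. full deletion).
    db.del_artifact(
        key="my-model",        # placeholder artifact key
        project="my-project",  # placeholder project name
        deletion_strategy=mlrun.common.schemas.artifact.ArtifactsDeletionStrategies.metadata_only,
    )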
mlrun/errors.py
CHANGED

@@ -155,6 +155,10 @@ class MLRunNotFoundError(MLRunHTTPStatusError):
     error_status_code = HTTPStatus.NOT_FOUND.value
 
 
+class MLRunPaginationEndOfResultsError(MLRunNotFoundError):
+    pass
+
+
 class MLRunBadRequestError(MLRunHTTPStatusError):
     error_status_code = HTTPStatus.BAD_REQUEST.value
 
@@ -183,6 +187,10 @@ class MLRunInternalServerError(MLRunHTTPStatusError):
     error_status_code = HTTPStatus.INTERNAL_SERVER_ERROR.value
 
 
+class MLRunNotImplementedServerError(MLRunHTTPStatusError):
+    error_status_code = HTTPStatus.NOT_IMPLEMENTED.value
+
+
 class MLRunServiceUnavailableError(MLRunHTTPStatusError):
     error_status_code = HTTPStatus.SERVICE_UNAVAILABLE.value
 
@@ -234,4 +242,7 @@ STATUS_ERRORS = {
     HTTPStatus.PRECONDITION_FAILED.value: MLRunPreconditionFailedError,
     HTTPStatus.INTERNAL_SERVER_ERROR.value: MLRunInternalServerError,
     HTTPStatus.SERVICE_UNAVAILABLE.value: MLRunServiceUnavailableError,
+    HTTPStatus.NOT_IMPLEMENTED.value: MLRunNotImplementedServerError,
 }
+
+EXPECTED_ERRORS = (MLRunPaginationEndOfResultsError,)
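The new exception classes slot into the existing mappings: STATUS_ERRORS now resolves HTTP 501 to MLRunNotImplementedServerError, and EXPECTED_ERRORS lets callers treat pagination exhaustion as a normal stop condition. A small illustrative snippet based only on the definitions above:

    from http import HTTPStatus

    import mlrun.errors

    # HTTP 501 now maps to the new server error class.
    exc_cls = mlrun.errors.STATUS_ERRORS[HTTPStatus.NOT_IMPLEMENTED.value]
    assert exc_cls is mlrun.errors.MLRunNotImplementedServerError

    # MLRunPaginationEndOfResultsError is listed in EXPECTED_ERRORS, so running out
    # of pages can be handled as an expected outcome rather than a failure.
    try:
        raise mlrun.errors.MLRunPaginationEndOfResultsError("no more results")
    except mlrun.errors.EXPECTED_ERRORS:
        pass  # end of pagination, not a real error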
mlrun/execution.py
CHANGED

@@ -22,6 +22,7 @@ import yaml
 from dateutil import parser
 
 import mlrun
+import mlrun.common.constants as mlrun_constants
 from mlrun.artifacts import ModelArtifact
 from mlrun.datastore.store_resources import get_store_resource
 from mlrun.errors import MLRunInvalidArgumentError

@@ -129,7 +130,9 @@ class MLClientCtx:
     @property
     def tag(self):
         """Run tag (uid or workflow id if exists)"""
-        return
+        return (
+            self._labels.get(mlrun_constants.MLRunInternalLabels.workflow) or self._uid
+        )
 
     @property
     def state(self):

@@ -329,8 +332,10 @@ class MLClientCtx:
             "uri": uri,
             "owner": get_in(self._labels, "owner"),
         }
-        if
-            resp[
+        if mlrun_constants.MLRunInternalLabels.workflow in self._labels:
+            resp[mlrun_constants.MLRunInternalLabels.workflow] = self._labels[
+                mlrun_constants.MLRunInternalLabels.workflow
+            ]
         return resp
 
     @classmethod

@@ -396,7 +401,7 @@ class MLClientCtx:
             self._set_input(k, v)
 
         if host and not is_api:
-            self.set_label(
+            self.set_label(mlrun_constants.MLRunInternalLabels.host, host)
 
         start = get_in(attrs, "status.start_time")
         if start:

@@ -990,10 +995,15 @@ class MLClientCtx:
         # If it's a OpenMPI job, get the global rank and compare to the logging rank (worker) set in MLRun's
         # configuration:
         labels = self.labels
-        if
+        if (
+            mlrun_constants.MLRunInternalLabels.host in labels
+            and labels.get(mlrun_constants.MLRunInternalLabels.kind, "job") == "mpijob"
+        ):
             # The host (pod name) of each worker is created by k8s, and by default it uses the rank number as the id in
             # the following template: ...-worker-<rank>
-            rank = int(
+            rank = int(
+                labels[mlrun_constants.MLRunInternalLabels.host].rsplit("-", 1)[1]
+            )
             return rank == mlrun.mlconf.packagers.logging_worker
 
         # Single worker is always the logging worker:

@@ -1029,9 +1039,14 @@ class MLClientCtx:
             "status.last_update": to_date_str(self._last_update),
         }
 
-        #
-        # multiple executions for a single run (e.g. mpi)
-
+        # Completion of runs is decided by the API runs monitoring as there may be
+        # multiple executions for a single run (e.g. mpi).
+        # For kinds that are not monitored by the API (local) we allow changing the state.
+        run_kind = self.labels.get(mlrun_constants.MLRunInternalLabels.kind, "")
+        if (
+            mlrun.runtimes.RuntimeKinds.is_local_runtime(run_kind)
+            or self._state != "completed"
+        ):
             struct["status.state"] = self._state
 
         if self.is_logging_worker():
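The reworked is_logging_worker() check reads the worker's pod name from the MLRunInternalLabels.host label and derives the OpenMPI rank from its suffix. The rank extraction itself is plain string handling, illustrated here with a made-up pod name:

    # OpenMPI worker pods are named "...-worker-<rank>" by k8s, so the rank is the
    # last dash-separated token of the host label (the pod name here is invented).
    pod_name = "train-mpijob-worker-3"
    rank = int(pod_name.rsplit("-", 1)[1])
    assert rank == 3

    # The context logs artifacts only on the configured logging worker
    # (mlrun.mlconf.packagers.logging_worker, typically rank 0).
    logging_worker = 0
    is_logging_worker = rank == logging_worker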
mlrun/feature_store/__init__.py
CHANGED

@@ -19,7 +19,6 @@ __all__ = [
     "get_online_feature_service",
     "ingest",
     "preview",
-    "deploy_ingestion_service",
     "deploy_ingestion_service_v2",
     "delete_feature_set",
     "delete_feature_vector",

@@ -41,7 +40,6 @@ from ..features import Entity, Feature
 from .api import (
     delete_feature_set,
     delete_feature_vector,
-    deploy_ingestion_service,
     deploy_ingestion_service_v2,
     get_feature_set,
     get_feature_vector,
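With the deprecated deploy_ingestion_service removed from the public API, deploy_ingestion_service_v2 is the remaining entry point. A hedged migration sketch, assuming v2 keeps the v1 parameters and, as _deploy_ingestion_service_v2 in api.py suggests, returns both the endpoint and the deployed function; the feature-set URI is a placeholder:

    import mlrun
    import mlrun.feature_store as fstore
    from mlrun.datastore.sources import HTTPSource

    my_set = fstore.get_feature_set("store://feature-sets/my-project/stocks")  # placeholder URI
    source = HTTPSource()
    func = mlrun.code_to_function("ingest", kind="serving")
    config = fstore.RunConfig(function=func)

    # v1 returned only the endpoint URL; v2 is assumed to return (endpoint, function).
    endpoint, function = fstore.deploy_ingestion_service_v2(
        my_set, source=source, run_config=config
    )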
mlrun/feature_store/api.py
CHANGED

@@ -113,6 +113,7 @@ def get_offline_features(
     order_by: Union[str, list[str]] = None,
     spark_service: str = None,
     timestamp_for_filtering: Union[str, dict[str, str]] = None,
+    additional_filters: list = None,
 ):
     """retrieve offline feature vector results

@@ -175,6 +176,13 @@ def get_offline_features(
                                      By default, the filter executes on the timestamp_key of each feature set.
                                      Note: the time filtering is performed on each feature set before the
                                      merge process using start_time and end_time params.
+    :param additional_filters: List of additional_filter conditions as tuples.
+                               Each tuple should be in the format (column_name, operator, value).
+                               Supported operators: "=", ">=", "<=", ">", "<".
+                               Example: [("Product", "=", "Computer")]
+                               For all supported filters, please see:
+                               https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetDataset.html
+
 
     """
     return _get_offline_features(

@@ -194,6 +202,7 @@ def get_offline_features(
         order_by,
         spark_service,
         timestamp_for_filtering,
+        additional_filters,
     )

@@ -214,6 +223,7 @@ def _get_offline_features(
     order_by: Union[str, list[str]] = None,
     spark_service: str = None,
     timestamp_for_filtering: Union[str, dict[str, str]] = None,
+    additional_filters=None,
 ) -> Union[OfflineVectorResponse, RemoteVectorResponse]:
     if entity_rows is None and entity_timestamp_column is not None:
         raise mlrun.errors.MLRunInvalidArgumentError(

@@ -252,6 +262,7 @@ def _get_offline_features(
         start_time=start_time,
         end_time=end_time,
         timestamp_for_filtering=timestamp_for_filtering,
+        additional_filters=additional_filters,
     )
 
     merger = merger_engine(feature_vector, **(engine_args or {}))

@@ -267,6 +278,7 @@ def _get_offline_features(
         update_stats=update_stats,
         query=query,
         order_by=order_by,
+        additional_filters=additional_filters,
     )

@@ -1005,53 +1017,6 @@ def _deploy_ingestion_service_v2(
     return function.deploy(), function
 
 
-@deprecated(
-    version="1.5.0",
-    reason="'deploy_ingestion_service' will be removed in 1.7.0, use 'deploy_ingestion_service_v2' instead",
-    category=FutureWarning,
-)
-def deploy_ingestion_service(
-    featureset: Union[FeatureSet, str],
-    source: DataSource = None,
-    targets: list[DataTargetBase] = None,
-    name: str = None,
-    run_config: RunConfig = None,
-    verbose=False,
-) -> str:
-    """Start real-time ingestion service using nuclio function
-
-    Deploy a real-time function implementing feature ingestion pipeline
-    the source maps to Nuclio event triggers (http, kafka, v3io stream, etc.)
-
-    the `run_config` parameter allow specifying the function and job configuration,
-    see: :py:class:`~mlrun.feature_store.RunConfig`
-
-    example::
-
-        source = HTTPSource()
-        func = mlrun.code_to_function("ingest", kind="serving").apply(mount_v3io())
-        config = RunConfig(function=func)
-        my_set.deploy_ingestion_service(source, run_config=config)
-
-    :param featureset: feature set object or uri
-    :param source: data source object describing the online or offline source
-    :param targets: list of data target objects
-    :param name: name for the job/function
-    :param run_config: service runtime configuration (function object/uri, resources, etc..)
-    :param verbose: verbose log
-
-    :return: URL to access the deployed ingestion service
-    """
-    endpoint, _ = featureset.deploy_ingestion_service(
-        source=source,
-        targets=targets,
-        name=name,
-        run_config=run_config,
-        verbose=verbose,
-    )
-    return endpoint
-
-
 def _ingest_with_spark(
     spark=None,
     featureset: Union[FeatureSet, str] = None,

mlrun/feature_store/feature_set.py
CHANGED

@@ -917,6 +917,7 @@ class FeatureSet(ModelObj):
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
         **kwargs,
     ):
         """return featureset (offline) data as dataframe

@@ -928,6 +929,12 @@ class FeatureSet(ModelObj):
         :param end_time: filter by end time
         :param time_column: specify the time column name in the file
         :param kwargs: additional reader (csv, parquet, ..) args
+        :param additional_filters: List of additional_filter conditions as tuples.
+                                   Each tuple should be in the format (column_name, operator, value).
+                                   Supported operators: "=", ">=", "<=", ">", "<".
+                                   Example: [("Product", "=", "Computer")]
+                                   For all supported filters, please see:
+                                   https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetDataset.html
         :return: DataFrame
         """
         entities = list(self.spec.entities.keys())

@@ -946,6 +953,7 @@ class FeatureSet(ModelObj):
             start_time=start_time,
             end_time=end_time,
             time_field=time_column,
+            additional_filters=additional_filters,
             **kwargs,
         )
         # to_dataframe() can sometimes return an iterator of dataframes instead of one dataframe

@@ -965,6 +973,7 @@ class FeatureSet(ModelObj):
             start_time=start_time,
             end_time=end_time,
             time_column=time_column,
+            additional_filters=additional_filters,
             **kwargs,
         )
         return result

mlrun/feature_store/feature_vector.py
CHANGED

@@ -741,6 +741,7 @@ class FeatureVector(ModelObj):
         order_by: Union[str, list[str]] = None,
         spark_service: str = None,
         timestamp_for_filtering: Union[str, dict[str, str]] = None,
+        additional_filters: list = None,
     ):
         """retrieve offline feature vector results

@@ -797,6 +798,12 @@ class FeatureVector(ModelObj):
                                          By default, the filter executes on the timestamp_key of each feature set.
                                          Note: the time filtering is performed on each feature set before the
                                          merge process using start_time and end_time params.
+        :param additional_filters: List of additional_filter conditions as tuples.
+                                   Each tuple should be in the format (column_name, operator, value).
+                                   Supported operators: "=", ">=", "<=", ">", "<".
+                                   Example: [("Product", "=", "Computer")]
+                                   For all supported filters, please see:
+                                   https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetDataset.html
 
         """

@@ -817,6 +824,7 @@ class FeatureVector(ModelObj):
             order_by,
             spark_service,
             timestamp_for_filtering,
+            additional_filters,
         )
 
     def get_online_feature_service(
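The additional_filters parameter added above threads pyarrow-style (column, operator, value) tuples from get_offline_features down to each feature set's reader, alongside the existing start_time/end_time filtering. A short usage sketch based on the docstring example; the feature-vector URI is a placeholder:

    import mlrun.feature_store as fstore

    vector = fstore.get_feature_vector("store://feature-vectors/my-project/sales-vec")  # placeholder URI
    resp = fstore.get_offline_features(
        vector,
        additional_filters=[("Product", "=", "Computer")],  # tuple format from the docstring
    )
    df = resp.to_dataframe()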
mlrun/feature_store/ingestion.py
CHANGED

@@ -17,6 +17,7 @@ import uuid
 import pandas as pd
 
 import mlrun
+import mlrun.common.constants as mlrun_constants
 from mlrun.datastore.sources import get_source_from_dict, get_source_step
 from mlrun.datastore.targets import (
     add_target_steps,

@@ -263,13 +264,13 @@ def run_ingestion_job(name, featureset, run_config, schedule=None, spark_service
         out_path=featureset.spec.output_path,
     )
     task.spec.secret_sources = run_config.secret_sources
-    task.set_label(
-        "feature-
-    )
+    task.set_label(
+        mlrun_constants.MLRunInternalLabels.job_type, "feature-ingest"
+    ).set_label("feature-set", featureset.uri)
     if run_config.owner:
-        task.set_label(
-
-        )
+        task.set_label(
+            mlrun_constants.MLRunInternalLabels.owner, run_config.owner
+        ).set_label(mlrun_constants.MLRunInternalLabels.v3io_user, run_config.owner)
 
     # set run UID and save in the feature set status (linking the features et to the job)
     task.metadata.uid = uuid.uuid4().hex

mlrun/feature_store/retrieval/base.py
CHANGED

@@ -88,6 +88,7 @@ class BaseMerger(abc.ABC):
         update_stats=None,
         query=None,
         order_by=None,
+        additional_filters=None,
     ):
         self._target = target

@@ -134,6 +135,7 @@ class BaseMerger(abc.ABC):
             timestamp_for_filtering=timestamp_for_filtering,
             query=query,
             order_by=order_by,
+            additional_filters=additional_filters,
         )
 
     def _write_to_offline_target(self, timestamp_key=None):

@@ -186,6 +188,7 @@ class BaseMerger(abc.ABC):
         timestamp_for_filtering=None,
         query=None,
         order_by=None,
+        additional_filters=None,
     ):
         self._create_engine_env()

@@ -212,7 +215,7 @@ class BaseMerger(abc.ABC):
             feature_sets.append(None)
             join_types.append(None)
 
-
+        timestamp_filtered = False
         for step in join_graph.steps:
             name = step.right_feature_set_name
             feature_set = feature_set_objects[name]

@@ -250,7 +253,7 @@ class BaseMerger(abc.ABC):
             if self._drop_indexes:
                 self._append_drop_column(time_column)
             if (start_time or end_time) and time_column:
-
+                timestamp_filtered = True
 
             df = self._get_engine_df(
                 feature_set,

@@ -259,6 +262,7 @@ class BaseMerger(abc.ABC):
                 start_time if time_column else None,
                 end_time if time_column else None,
                 time_column,
+                additional_filters,
             )
 
             fs_entities_and_timestamp = list(feature_set.spec.entities.keys())

@@ -302,8 +306,8 @@ class BaseMerger(abc.ABC):
                 new_columns.append((column, alias))
             self._update_alias(dictionary={name: alias for name, alias in new_columns})
 
-        # None of the feature sets was filtered as required
-        if not
+        # None of the feature sets was timestamp filtered as required
+        if not timestamp_filtered and (start_time or end_time):
             raise mlrun.errors.MLRunRuntimeError(
                 "start_time and end_time can only be provided in conjunction with "
                 "a timestamp column, or when the at least one feature_set has a timestamp key"

@@ -755,6 +759,7 @@ class BaseMerger(abc.ABC):
         start_time: typing.Union[str, datetime] = None,
         end_time: typing.Union[str, datetime] = None,
         time_column: typing.Optional[str] = None,
+        additional_filters=None,
     ):
         """
         Return the feature_set data frame according to the args

mlrun/data_types/to_pandas.py
CHANGED

@@ -79,10 +79,10 @@ class PandasConversionMixin:
                 msg = (
                     "toPandas attempted Arrow optimization because "
                     "'spark.sql.execution.arrow.pyspark.enabled' is set to true; however, "
-                    "failed by the reason below:\n
+                    f"failed by the reason below:\n {e}\n"
                     "Attempting non-optimization as "
                     "'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to "
-                    "true."
+                    "true."
                 )
                 warnings.warn(msg)
                 use_arrow = False

@@ -92,7 +92,7 @@ class PandasConversionMixin:
                     "'spark.sql.execution.arrow.pyspark.enabled' is set to true, but has "
                     "reached the error below and will not continue because automatic fallback "
                     "with 'spark.sql.execution.arrow.pyspark.fallback.enabled' has been set to "
-                    "false.\n
+                    f"false.\n {e}"
                 )
                 warnings.warn(msg)
                 raise

@@ -158,7 +158,7 @@ class PandasConversionMixin:
                     "reached the error below and can not continue. Note that "
                     "'spark.sql.execution.arrow.pyspark.fallback.enabled' does not have an "
                     "effect on failures in the middle of "
-                    "computation.\n
+                    f"computation.\n {e}"
                 )
                 warnings.warn(msg)
                 raise

@@ -168,10 +168,10 @@ class PandasConversionMixin:
         column_counter = Counter(self.columns)
 
         dtype = [None] * len(self.schema)
-        for
+        for field_idx, field in enumerate(self.schema):
             # For duplicate column name, we use `iloc` to access it.
             if column_counter[field.name] > 1:
-                pandas_col = pdf.iloc[:,
+                pandas_col = pdf.iloc[:, field_idx]
             else:
                 pandas_col = pdf[field.name]

@@ -187,12 +187,12 @@ class PandasConversionMixin:
                 and field.nullable
                 and pandas_col.isnull().any()
             ):
-                dtype[
+                dtype[field_idx] = pandas_type
             # Ensure we fall back to nullable numpy types, even when whole column is null:
             if isinstance(field.dataType, IntegralType) and pandas_col.isnull().any():
-                dtype[
+                dtype[field_idx] = np.float64
             if isinstance(field.dataType, BooleanType) and pandas_col.isnull().any():
-                dtype[
+                dtype[field_idx] = object
 
         df = pd.DataFrame()
         for index, t in enumerate(dtype):

mlrun/feature_store/retrieval/dask_merger.py
CHANGED

@@ -145,6 +145,7 @@ class DaskFeatureMerger(BaseMerger):
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
     ):
         import dask.dataframe as dd

@@ -155,6 +156,7 @@ class DaskFeatureMerger(BaseMerger):
             end_time=end_time,
             time_column=time_column,
             index=False,
+            additional_filters=additional_filters,
         )
 
         return self._reset_index(df).persist()

mlrun/feature_store/retrieval/job.py
CHANGED

@@ -15,6 +15,7 @@
 import uuid
 
 import mlrun
+import mlrun.common.constants as mlrun_constants
 from mlrun.config import config as mlconf
 from mlrun.model import DataTargetBase, new_task
 from mlrun.runtimes.function_reference import FunctionReference

@@ -42,6 +43,7 @@ def run_merge_job(
     start_time=None,
     end_time=None,
     timestamp_for_filtering=None,
+    additional_filters=None,
 ):
     name = vector.metadata.name
     if not target or not hasattr(target, "to_dict"):

@@ -116,11 +118,14 @@ def run_merge_job(
             "end_time": end_time,
             "timestamp_for_filtering": timestamp_for_filtering,
             "engine_args": engine_args,
+            "additional_filters": additional_filters,
         },
         inputs={"entity_rows": entity_rows} if entity_rows is not None else {},
     )
     task.spec.secret_sources = run_config.secret_sources
-    task.set_label(
+    task.set_label(
+        mlrun_constants.MLRunInternalLabels.job_type, "feature-merge"
+    ).set_label(mlrun_constants.MLRunInternalLabels.feature_vector, vector.uri)
     task.metadata.uid = uuid.uuid4().hex
     vector.status.run_uri = task.metadata.uid
     vector.save()

@@ -196,7 +201,8 @@ import mlrun.feature_store.retrieval
 from mlrun.datastore.targets import get_target_driver
 def merge_handler(context, vector_uri, target, entity_rows=None,
                   entity_timestamp_column=None, drop_columns=None, with_indexes=None, query=None,
-                  engine_args=None, order_by=None, start_time=None, end_time=None, timestamp_for_filtering=None
+                  engine_args=None, order_by=None, start_time=None, end_time=None, timestamp_for_filtering=None,
+                  additional_filters=None):
     vector = context.get_store_resource(vector_uri)
     store_target = get_target_driver(target, vector)
     if entity_rows:

@@ -206,7 +212,7 @@ def merge_handler(context, vector_uri, target, entity_rows=None,
     merger = mlrun.feature_store.retrieval.{{{engine}}}(vector, **(engine_args or {}))
     merger.start(entity_rows, entity_timestamp_column, store_target, drop_columns, with_indexes=with_indexes,
                  query=query, order_by=order_by, start_time=start_time, end_time=end_time,
-                 timestamp_for_filtering=timestamp_for_filtering)
+                 timestamp_for_filtering=timestamp_for_filtering, additional_filters=additional_filters)
 
     target = vector.status.targets[store_target.name].to_dict()
     context.log_result('feature_vector', vector.uri)

mlrun/feature_store/retrieval/local_merger.py
CHANGED

@@ -114,12 +114,14 @@ class LocalFeatureMerger(BaseMerger):
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
     ):
         df = feature_set.to_dataframe(
             columns=column_names,
             start_time=start_time,
             end_time=end_time,
             time_column=time_column,
+            additional_filters=additional_filters,
         )
         if df.index.names[0]:
             df.reset_index(inplace=True)