mlrun 1.6.4rc2__py3-none-any.whl → 1.7.0rc20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlrun/__init__.py +11 -1
- mlrun/__main__.py +26 -112
- mlrun/alerts/__init__.py +15 -0
- mlrun/alerts/alert.py +144 -0
- mlrun/api/schemas/__init__.py +5 -4
- mlrun/artifacts/__init__.py +8 -3
- mlrun/artifacts/base.py +46 -257
- mlrun/artifacts/dataset.py +11 -192
- mlrun/artifacts/manager.py +47 -48
- mlrun/artifacts/model.py +31 -159
- mlrun/artifacts/plots.py +23 -380
- mlrun/common/constants.py +69 -0
- mlrun/common/db/sql_session.py +2 -3
- mlrun/common/formatters/__init__.py +19 -0
- mlrun/common/formatters/artifact.py +21 -0
- mlrun/common/formatters/base.py +78 -0
- mlrun/common/formatters/function.py +41 -0
- mlrun/common/formatters/pipeline.py +53 -0
- mlrun/common/formatters/project.py +51 -0
- mlrun/common/helpers.py +1 -2
- mlrun/common/model_monitoring/helpers.py +9 -5
- mlrun/{runtimes → common/runtimes}/constants.py +37 -9
- mlrun/common/schemas/__init__.py +24 -4
- mlrun/common/schemas/alert.py +203 -0
- mlrun/common/schemas/api_gateway.py +148 -0
- mlrun/common/schemas/artifact.py +18 -8
- mlrun/common/schemas/auth.py +11 -5
- mlrun/common/schemas/background_task.py +1 -1
- mlrun/common/schemas/client_spec.py +4 -1
- mlrun/common/schemas/feature_store.py +16 -16
- mlrun/common/schemas/frontend_spec.py +8 -7
- mlrun/common/schemas/function.py +5 -1
- mlrun/common/schemas/hub.py +11 -18
- mlrun/common/schemas/memory_reports.py +2 -2
- mlrun/common/schemas/model_monitoring/__init__.py +18 -3
- mlrun/common/schemas/model_monitoring/constants.py +83 -26
- mlrun/common/schemas/model_monitoring/grafana.py +13 -9
- mlrun/common/schemas/model_monitoring/model_endpoints.py +99 -16
- mlrun/common/schemas/notification.py +4 -4
- mlrun/common/schemas/object.py +2 -2
- mlrun/{runtimes/mpijob/v1alpha1.py → common/schemas/pagination.py} +10 -13
- mlrun/common/schemas/pipeline.py +1 -10
- mlrun/common/schemas/project.py +24 -23
- mlrun/common/schemas/runtime_resource.py +8 -12
- mlrun/common/schemas/schedule.py +3 -3
- mlrun/common/schemas/tag.py +1 -2
- mlrun/common/schemas/workflow.py +2 -2
- mlrun/common/types.py +7 -1
- mlrun/config.py +54 -17
- mlrun/data_types/to_pandas.py +10 -12
- mlrun/datastore/__init__.py +5 -8
- mlrun/datastore/alibaba_oss.py +130 -0
- mlrun/datastore/azure_blob.py +17 -5
- mlrun/datastore/base.py +62 -39
- mlrun/datastore/datastore.py +28 -9
- mlrun/datastore/datastore_profile.py +146 -20
- mlrun/datastore/filestore.py +0 -1
- mlrun/datastore/google_cloud_storage.py +6 -2
- mlrun/datastore/hdfs.py +56 -0
- mlrun/datastore/inmem.py +2 -2
- mlrun/datastore/redis.py +6 -2
- mlrun/datastore/s3.py +9 -0
- mlrun/datastore/snowflake_utils.py +43 -0
- mlrun/datastore/sources.py +201 -96
- mlrun/datastore/spark_utils.py +1 -2
- mlrun/datastore/store_resources.py +7 -7
- mlrun/datastore/targets.py +358 -104
- mlrun/datastore/utils.py +72 -58
- mlrun/datastore/v3io.py +5 -1
- mlrun/db/base.py +185 -35
- mlrun/db/factory.py +1 -1
- mlrun/db/httpdb.py +614 -179
- mlrun/db/nopdb.py +210 -26
- mlrun/errors.py +12 -1
- mlrun/execution.py +41 -24
- mlrun/feature_store/__init__.py +0 -2
- mlrun/feature_store/api.py +40 -72
- mlrun/feature_store/common.py +1 -1
- mlrun/feature_store/feature_set.py +76 -55
- mlrun/feature_store/feature_vector.py +28 -30
- mlrun/feature_store/ingestion.py +7 -6
- mlrun/feature_store/retrieval/base.py +16 -11
- mlrun/feature_store/retrieval/conversion.py +11 -13
- mlrun/feature_store/retrieval/dask_merger.py +2 -0
- mlrun/feature_store/retrieval/job.py +9 -3
- mlrun/feature_store/retrieval/local_merger.py +2 -0
- mlrun/feature_store/retrieval/spark_merger.py +34 -24
- mlrun/feature_store/steps.py +37 -34
- mlrun/features.py +9 -20
- mlrun/frameworks/_common/artifacts_library.py +9 -9
- mlrun/frameworks/_common/mlrun_interface.py +5 -5
- mlrun/frameworks/_common/model_handler.py +48 -48
- mlrun/frameworks/_common/plan.py +2 -3
- mlrun/frameworks/_common/producer.py +3 -4
- mlrun/frameworks/_common/utils.py +5 -5
- mlrun/frameworks/_dl_common/loggers/logger.py +6 -7
- mlrun/frameworks/_dl_common/loggers/mlrun_logger.py +9 -9
- mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +23 -47
- mlrun/frameworks/_ml_common/artifacts_library.py +1 -2
- mlrun/frameworks/_ml_common/loggers/logger.py +3 -4
- mlrun/frameworks/_ml_common/loggers/mlrun_logger.py +4 -5
- mlrun/frameworks/_ml_common/model_handler.py +24 -24
- mlrun/frameworks/_ml_common/pkl_model_server.py +2 -2
- mlrun/frameworks/_ml_common/plan.py +1 -1
- mlrun/frameworks/_ml_common/plans/calibration_curve_plan.py +2 -3
- mlrun/frameworks/_ml_common/plans/confusion_matrix_plan.py +2 -3
- mlrun/frameworks/_ml_common/plans/dataset_plan.py +3 -3
- mlrun/frameworks/_ml_common/plans/feature_importance_plan.py +3 -3
- mlrun/frameworks/_ml_common/plans/roc_curve_plan.py +4 -4
- mlrun/frameworks/_ml_common/utils.py +4 -4
- mlrun/frameworks/auto_mlrun/auto_mlrun.py +9 -9
- mlrun/frameworks/huggingface/model_server.py +4 -4
- mlrun/frameworks/lgbm/__init__.py +33 -33
- mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
- mlrun/frameworks/lgbm/callbacks/logging_callback.py +4 -5
- mlrun/frameworks/lgbm/callbacks/mlrun_logging_callback.py +4 -5
- mlrun/frameworks/lgbm/mlrun_interfaces/booster_mlrun_interface.py +1 -3
- mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +6 -6
- mlrun/frameworks/lgbm/model_handler.py +10 -10
- mlrun/frameworks/lgbm/model_server.py +6 -6
- mlrun/frameworks/lgbm/utils.py +5 -5
- mlrun/frameworks/onnx/dataset.py +8 -8
- mlrun/frameworks/onnx/mlrun_interface.py +3 -3
- mlrun/frameworks/onnx/model_handler.py +6 -6
- mlrun/frameworks/onnx/model_server.py +7 -7
- mlrun/frameworks/parallel_coordinates.py +4 -3
- mlrun/frameworks/pytorch/__init__.py +18 -18
- mlrun/frameworks/pytorch/callbacks/callback.py +4 -5
- mlrun/frameworks/pytorch/callbacks/logging_callback.py +17 -17
- mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +11 -11
- mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +23 -29
- mlrun/frameworks/pytorch/callbacks_handler.py +38 -38
- mlrun/frameworks/pytorch/mlrun_interface.py +20 -20
- mlrun/frameworks/pytorch/model_handler.py +17 -17
- mlrun/frameworks/pytorch/model_server.py +7 -7
- mlrun/frameworks/sklearn/__init__.py +13 -13
- mlrun/frameworks/sklearn/estimator.py +4 -4
- mlrun/frameworks/sklearn/metrics_library.py +14 -14
- mlrun/frameworks/sklearn/mlrun_interface.py +3 -6
- mlrun/frameworks/sklearn/model_handler.py +2 -2
- mlrun/frameworks/tf_keras/__init__.py +10 -7
- mlrun/frameworks/tf_keras/callbacks/logging_callback.py +15 -15
- mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +11 -11
- mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +19 -23
- mlrun/frameworks/tf_keras/mlrun_interface.py +9 -11
- mlrun/frameworks/tf_keras/model_handler.py +14 -14
- mlrun/frameworks/tf_keras/model_server.py +6 -6
- mlrun/frameworks/xgboost/__init__.py +13 -13
- mlrun/frameworks/xgboost/model_handler.py +6 -6
- mlrun/k8s_utils.py +14 -16
- mlrun/launcher/__init__.py +1 -1
- mlrun/launcher/base.py +16 -15
- mlrun/launcher/client.py +8 -6
- mlrun/launcher/factory.py +1 -1
- mlrun/launcher/local.py +17 -11
- mlrun/launcher/remote.py +16 -10
- mlrun/lists.py +7 -6
- mlrun/model.py +238 -73
- mlrun/model_monitoring/__init__.py +1 -1
- mlrun/model_monitoring/api.py +138 -315
- mlrun/model_monitoring/application.py +5 -296
- mlrun/model_monitoring/applications/__init__.py +24 -0
- mlrun/model_monitoring/applications/_application_steps.py +157 -0
- mlrun/model_monitoring/applications/base.py +282 -0
- mlrun/model_monitoring/applications/context.py +214 -0
- mlrun/model_monitoring/applications/evidently_base.py +211 -0
- mlrun/model_monitoring/applications/histogram_data_drift.py +349 -0
- mlrun/model_monitoring/applications/results.py +99 -0
- mlrun/model_monitoring/controller.py +104 -84
- mlrun/model_monitoring/controller_handler.py +13 -5
- mlrun/model_monitoring/db/__init__.py +18 -0
- mlrun/model_monitoring/{stores → db/stores}/__init__.py +43 -36
- mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
- mlrun/model_monitoring/{stores/model_endpoint_store.py → db/stores/base/store.py} +64 -40
- mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
- mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +71 -0
- mlrun/model_monitoring/{stores → db/stores/sqldb}/models/base.py +109 -5
- mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +88 -0
- mlrun/model_monitoring/{stores/models/mysql.py → db/stores/sqldb/models/sqlite.py} +19 -13
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +684 -0
- mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
- mlrun/model_monitoring/{stores/kv_model_endpoint_store.py → db/stores/v3io_kv/kv_store.py} +310 -165
- mlrun/model_monitoring/db/tsdb/__init__.py +100 -0
- mlrun/model_monitoring/db/tsdb/base.py +329 -0
- mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
- mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +240 -0
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +45 -0
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +397 -0
- mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +117 -0
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +630 -0
- mlrun/model_monitoring/evidently_application.py +6 -118
- mlrun/model_monitoring/features_drift_table.py +134 -106
- mlrun/model_monitoring/helpers.py +127 -28
- mlrun/model_monitoring/metrics/__init__.py +13 -0
- mlrun/model_monitoring/metrics/histogram_distance.py +127 -0
- mlrun/model_monitoring/model_endpoint.py +3 -2
- mlrun/model_monitoring/prometheus.py +1 -4
- mlrun/model_monitoring/stream_processing.py +62 -231
- mlrun/model_monitoring/tracking_policy.py +9 -2
- mlrun/model_monitoring/writer.py +152 -124
- mlrun/package/__init__.py +6 -6
- mlrun/package/context_handler.py +5 -5
- mlrun/package/packager.py +7 -7
- mlrun/package/packagers/default_packager.py +6 -6
- mlrun/package/packagers/numpy_packagers.py +15 -15
- mlrun/package/packagers/pandas_packagers.py +5 -5
- mlrun/package/packagers/python_standard_library_packagers.py +10 -10
- mlrun/package/packagers_manager.py +19 -23
- mlrun/package/utils/_formatter.py +6 -6
- mlrun/package/utils/_pickler.py +2 -2
- mlrun/package/utils/_supported_format.py +4 -4
- mlrun/package/utils/log_hint_utils.py +2 -2
- mlrun/package/utils/type_hint_utils.py +4 -9
- mlrun/platforms/__init__.py +11 -10
- mlrun/platforms/iguazio.py +24 -203
- mlrun/projects/operations.py +35 -21
- mlrun/projects/pipelines.py +68 -99
- mlrun/projects/project.py +830 -266
- mlrun/render.py +3 -11
- mlrun/run.py +162 -166
- mlrun/runtimes/__init__.py +62 -7
- mlrun/runtimes/base.py +39 -32
- mlrun/runtimes/daskjob.py +8 -8
- mlrun/runtimes/databricks_job/databricks_cancel_task.py +1 -1
- mlrun/runtimes/databricks_job/databricks_runtime.py +7 -7
- mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
- mlrun/runtimes/funcdoc.py +0 -28
- mlrun/runtimes/function_reference.py +1 -1
- mlrun/runtimes/kubejob.py +28 -122
- mlrun/runtimes/local.py +6 -3
- mlrun/runtimes/mpijob/__init__.py +0 -20
- mlrun/runtimes/mpijob/abstract.py +9 -10
- mlrun/runtimes/mpijob/v1.py +1 -1
- mlrun/{model_monitoring/stores/models/sqlite.py → runtimes/nuclio/__init__.py} +7 -9
- mlrun/runtimes/nuclio/api_gateway.py +709 -0
- mlrun/runtimes/nuclio/application/__init__.py +15 -0
- mlrun/runtimes/nuclio/application/application.py +523 -0
- mlrun/runtimes/nuclio/application/reverse_proxy.go +95 -0
- mlrun/runtimes/{function.py → nuclio/function.py} +112 -73
- mlrun/runtimes/{nuclio.py → nuclio/nuclio.py} +6 -6
- mlrun/runtimes/{serving.py → nuclio/serving.py} +45 -51
- mlrun/runtimes/pod.py +286 -88
- mlrun/runtimes/remotesparkjob.py +2 -2
- mlrun/runtimes/sparkjob/spark3job.py +51 -34
- mlrun/runtimes/utils.py +7 -75
- mlrun/secrets.py +9 -5
- mlrun/serving/remote.py +2 -7
- mlrun/serving/routers.py +13 -10
- mlrun/serving/server.py +22 -26
- mlrun/serving/states.py +99 -25
- mlrun/serving/utils.py +3 -3
- mlrun/serving/v1_serving.py +6 -7
- mlrun/serving/v2_serving.py +59 -20
- mlrun/track/tracker.py +2 -1
- mlrun/track/tracker_manager.py +3 -3
- mlrun/track/trackers/mlflow_tracker.py +1 -2
- mlrun/utils/async_http.py +5 -7
- mlrun/utils/azure_vault.py +1 -1
- mlrun/utils/clones.py +1 -2
- mlrun/utils/condition_evaluator.py +3 -3
- mlrun/utils/db.py +3 -3
- mlrun/utils/helpers.py +183 -197
- mlrun/utils/http.py +2 -5
- mlrun/utils/logger.py +76 -14
- mlrun/utils/notifications/notification/__init__.py +17 -12
- mlrun/utils/notifications/notification/base.py +14 -2
- mlrun/utils/notifications/notification/console.py +2 -0
- mlrun/utils/notifications/notification/git.py +3 -1
- mlrun/utils/notifications/notification/ipython.py +3 -1
- mlrun/utils/notifications/notification/slack.py +101 -21
- mlrun/utils/notifications/notification/webhook.py +11 -1
- mlrun/utils/notifications/notification_pusher.py +155 -30
- mlrun/utils/retryer.py +208 -0
- mlrun/utils/singleton.py +1 -1
- mlrun/utils/v3io_clients.py +2 -4
- mlrun/utils/version/version.json +2 -2
- mlrun/utils/version/version.py +2 -6
- {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/METADATA +31 -19
- mlrun-1.7.0rc20.dist-info/RECORD +353 -0
- mlrun/kfpops.py +0 -868
- mlrun/model_monitoring/batch.py +0 -1095
- mlrun/model_monitoring/stores/models/__init__.py +0 -27
- mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -384
- mlrun/platforms/other.py +0 -306
- mlrun-1.6.4rc2.dist-info/RECORD +0 -314
- {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/LICENSE +0 -0
- {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/WHEEL +0 -0
- {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/entry_points.txt +0 -0
- {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/top_level.txt +0 -0
mlrun/feature_store/feature_vector.py
CHANGED

```diff
@@ -17,7 +17,7 @@ import typing
 from copy import copy
 from datetime import datetime
 from enum import Enum
-from typing import
+from typing import Union
 
 import numpy as np
 import pandas as pd
@@ -69,18 +69,16 @@ class FeatureVectorSpec(ModelObj):
         self._entity_fields: ObjectList = None
         self._entity_source: DataSource = None
         self._function: FunctionReference = None
-        self._relations:
+        self._relations: dict[str, ObjectDict] = None
         self._join_graph: JoinGraph = None
 
         self.description = description
-        self.features:
+        self.features: list[str] = features or []
         self.entity_source = entity_source
         self.entity_fields = entity_fields or []
         self.graph = graph
         self.join_graph = join_graph
-        self.relations:
-            relations or {}
-        )
+        self.relations: dict[str, dict[str, Union[Entity, str]]] = relations or {}
         self.timestamp_field = timestamp_field
         self.label_feature = label_feature
         self.with_indexes = with_indexes
@@ -97,12 +95,12 @@ class FeatureVectorSpec(ModelObj):
         self._entity_source = self._verify_dict(source, "entity_source", DataSource)
 
     @property
-    def entity_fields(self) ->
+    def entity_fields(self) -> list[Feature]:
         """the schema/metadata for the entity source fields"""
         return self._entity_fields
 
     @entity_fields.setter
-    def entity_fields(self, entity_fields:
+    def entity_fields(self, entity_fields: list[Feature]):
         self._entity_fields = ObjectList.from_list(Feature, entity_fields)
 
     @property
@@ -125,14 +123,12 @@ class FeatureVectorSpec(ModelObj):
         self._function = self._verify_dict(function, "function", FunctionReference)
 
     @property
-    def relations(self) ->
+    def relations(self) -> dict[str, ObjectDict]:
         """feature set relations dict"""
         return self._relations
 
     @relations.setter
-    def relations(
-        self, relations: typing.Dict[str, typing.Dict[str, Union[Entity, str]]]
-    ):
+    def relations(self, relations: dict[str, dict[str, Union[Entity, str]]]):
         temp_relations = {}
         for fs_name, relation in relations.items():
             for col, ent in relation.items():
@@ -179,29 +175,29 @@ class FeatureVectorStatus(ModelObj):
         self.stats = stats or {}
         self.index_keys = index_keys
         self.preview = preview or []
-        self.features:
+        self.features: list[Feature] = features or []
         self.run_uri = run_uri
         self.timestamp_key = timestamp_key
 
     @property
-    def targets(self) ->
+    def targets(self) -> list[DataTarget]:
         """list of material storage targets + their status/path"""
         return self._targets
 
     @targets.setter
-    def targets(self, targets:
+    def targets(self, targets: list[DataTarget]):
         self._targets = ObjectList.from_list(DataTarget, targets)
 
     def update_target(self, target: DataTarget):
         self._targets.update(target)
 
     @property
-    def features(self) ->
+    def features(self) -> list[Feature]:
         """list of features (result of joining features from the source feature sets)"""
         return self._features
 
     @features.setter
-    def features(self, features:
+    def features(self, features: list[Feature]):
         self._features = ObjectList.from_list(Feature, features)
 
 
@@ -378,7 +374,7 @@ class _JoinStep(ModelObj):
         name: str = None,
         left_step_name: str = None,
         right_step_name: str = None,
-        left_feature_set_names: Union[str,
+        left_feature_set_names: Union[str, list[str]] = None,
         right_feature_set_name: str = None,
         join_type: str = "inner",
         asof_join: bool = False,
@@ -388,7 +384,8 @@ class _JoinStep(ModelObj):
         self.right_step_name = right_step_name
         self.left_feature_set_names = (
             left_feature_set_names
-            if
+            if left_feature_set_names is None
+            or isinstance(left_feature_set_names, list)
             else [left_feature_set_names]
         )
         self.right_feature_set_name = right_feature_set_name
@@ -402,7 +399,7 @@ class _JoinStep(ModelObj):
         self,
         feature_set_objects: ObjectList,
         vector,
-        entity_rows_keys:
+        entity_rows_keys: list[str] = None,
     ):
         if feature_set_objects[self.right_feature_set_name].is_connectable_to_df(
             entity_rows_keys
@@ -482,21 +479,22 @@ class FeatureVector(ModelObj):
         description=None,
         with_indexes=None,
         join_graph: JoinGraph = None,
-        relations:
+        relations: dict[str, dict[str, Union[Entity, str]]] = None,
     ):
         """Feature vector, specify selected features, their metadata and material views
 
         example::
 
             import mlrun.feature_store as fstore
+
            features = ["quotes.bid", "quotes.asks_sum_5h as asks_5h", "stocks.*"]
            vector = fstore.FeatureVector("my-vec", features)
 
            # get the vector as a dataframe
-            df =
+            df = vector.get_offline_features().to_dataframe()
 
            # return an online/real-time feature service
-            svc =
+            svc = vector.get_online_feature_service(impute_policy={"*": "$mean"})
            resp = svc.get([{"stock": "GOOG"}])
 
         :param name: List of names of targets to delete (default: delete all ingested targets)
@@ -732,7 +730,7 @@ class FeatureVector(ModelObj):
         entity_timestamp_column: str = None,
         target: DataTargetBase = None,
         run_config: RunConfig = None,
-        drop_columns:
+        drop_columns: list[str] = None,
         start_time: Union[str, datetime] = None,
         end_time: Union[str, datetime] = None,
         with_indexes: bool = False,
@@ -740,9 +738,9 @@ class FeatureVector(ModelObj):
         engine: str = None,
         engine_args: dict = None,
         query: str = None,
-        order_by: Union[str,
+        order_by: Union[str, list[str]] = None,
         spark_service: str = None,
-        timestamp_for_filtering: Union[str,
+        timestamp_for_filtering: Union[str, dict[str, str]] = None,
     ):
         """retrieve offline feature vector results
 
@@ -827,7 +825,7 @@ class FeatureVector(ModelObj):
         fixed_window_type: FixedWindowType = FixedWindowType.LastClosedWindow,
         impute_policy: dict = None,
         update_stats: bool = False,
-        entity_keys:
+        entity_keys: list[str] = None,
     ):
         """initialize and return online feature vector service api,
         returns :py:class:`~mlrun.feature_store.OnlineVectorService`
@@ -855,7 +853,7 @@ class FeatureVector(ModelObj):
 
         Example::
 
-            svc = vector_uri.get_online_feature_service(entity_keys=[
+            svc = vector_uri.get_online_feature_service(entity_keys=["ticker"])
            try:
                resp = svc.get([{"ticker": "GOOG"}, {"ticker": "MSFT"}])
                print(resp)
@@ -910,7 +908,7 @@ class OnlineVectorService:
         graph,
         index_columns,
         impute_policy: dict = None,
-        requested_columns:
+        requested_columns: list[str] = None,
     ):
         self.vector = vector
         self.impute_policy = impute_policy or {}
@@ -966,7 +964,7 @@ class OnlineVectorService:
         """vector merger function status (ready, running, error)"""
         return "ready"
 
-    def get(self, entity_rows:
+    def get(self, entity_rows: list[Union[dict, list]], as_list=False):
         """get feature vector given the provided entity inputs
 
         take a list of input vectors/rows and return a list of enriched feature vectors
```
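The docstring changes above reflect that offline retrieval and the online feature service are now invoked as methods of the `FeatureVector` object rather than through module-level feature-store calls. A minimal sketch of the new-style usage, assuming the referenced feature sets (`quotes`, `stocks`) already exist in the project; the feature names and entity values are illustrative:

```python
import mlrun.feature_store as fstore

# define a vector over existing feature sets (feature names are illustrative)
features = ["quotes.bid", "quotes.asks_sum_5h as asks_5h", "stocks.*"]
vector = fstore.FeatureVector("my-vec", features)
vector.save()

# offline: materialize the vector into a dataframe via the method-based API
df = vector.get_offline_features().to_dataframe()

# online: open a real-time feature service, query it, then release it
svc = vector.get_online_feature_service(impute_policy={"*": "$mean"})
try:
    resp = svc.get([{"stock": "GOOG"}])
    print(resp)
finally:
    svc.close()
```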
mlrun/feature_store/ingestion.py
CHANGED

```diff
@@ -17,6 +17,7 @@ import uuid
 import pandas as pd
 
 import mlrun
+import mlrun.common.constants as mlrun_constants
 from mlrun.datastore.sources import get_source_from_dict, get_source_step
 from mlrun.datastore.targets import (
     add_target_steps,
@@ -263,13 +264,13 @@ def run_ingestion_job(name, featureset, run_config, schedule=None, spark_service
         out_path=featureset.spec.output_path,
     )
     task.spec.secret_sources = run_config.secret_sources
-    task.set_label(
-        "feature-
-    )
+    task.set_label(
+        mlrun_constants.MLRunInternalLabels.job_type, "feature-ingest"
+    ).set_label("feature-set", featureset.uri)
     if run_config.owner:
-        task.set_label(
-
-        )
+        task.set_label(
+            mlrun_constants.MLRunInternalLabels.owner, run_config.owner
+        ).set_label(mlrun_constants.MLRunInternalLabels.v3io_user, run_config.owner)
 
     # set run UID and save in the feature set status (linking the features et to the job)
     task.metadata.uid = uuid.uuid4().hex
```
mlrun/feature_store/retrieval/base.py
CHANGED

```diff
@@ -88,6 +88,7 @@ class BaseMerger(abc.ABC):
         update_stats=None,
         query=None,
         order_by=None,
+        additional_filters=None,
     ):
         self._target = target
 
@@ -134,6 +135,7 @@ class BaseMerger(abc.ABC):
             timestamp_for_filtering=timestamp_for_filtering,
             query=query,
             order_by=order_by,
+            additional_filters=additional_filters,
         )
 
     def _write_to_offline_target(self, timestamp_key=None):
@@ -186,6 +188,7 @@ class BaseMerger(abc.ABC):
         timestamp_for_filtering=None,
         query=None,
         order_by=None,
+        additional_filters=None,
     ):
         self._create_engine_env()
 
@@ -212,7 +215,7 @@ class BaseMerger(abc.ABC):
             feature_sets.append(None)
             join_types.append(None)
 
-
+        timestamp_filtered = False
         for step in join_graph.steps:
             name = step.right_feature_set_name
             feature_set = feature_set_objects[name]
@@ -250,7 +253,7 @@ class BaseMerger(abc.ABC):
             if self._drop_indexes:
                 self._append_drop_column(time_column)
             if (start_time or end_time) and time_column:
-
+                timestamp_filtered = True
 
             df = self._get_engine_df(
                 feature_set,
@@ -259,6 +262,7 @@ class BaseMerger(abc.ABC):
                 start_time if time_column else None,
                 end_time if time_column else None,
                 time_column,
+                additional_filters,
             )
 
             fs_entities_and_timestamp = list(feature_set.spec.entities.keys())
@@ -302,8 +306,8 @@ class BaseMerger(abc.ABC):
                 new_columns.append((column, alias))
             self._update_alias(dictionary={name: alias for name, alias in new_columns})
 
-        # None of the feature sets was filtered as required
-        if not
+        # None of the feature sets was timestamp filtered as required
+        if not timestamp_filtered and (start_time or end_time):
             raise mlrun.errors.MLRunRuntimeError(
                 "start_time and end_time can only be provided in conjunction with "
                 "a timestamp column, or when the at least one feature_set has a timestamp key"
@@ -540,8 +544,8 @@ class BaseMerger(abc.ABC):
         self,
         name: str,
         order: int,
-        left_keys:
-        right_keys:
+        left_keys: list[str] = None,
+        right_keys: list[str] = None,
     ):
         self.name = name
         self.left_keys = left_keys if left_keys is not None else []
@@ -750,11 +754,12 @@ class BaseMerger(abc.ABC):
     def _get_engine_df(
         self,
         feature_set: FeatureSet,
-        feature_set_name:
-        column_names:
+        feature_set_name: list[str],
+        column_names: list[str] = None,
         start_time: typing.Union[str, datetime] = None,
         end_time: typing.Union[str, datetime] = None,
         time_column: typing.Optional[str] = None,
+        additional_filters=None,
     ):
         """
         Return the feature_set data frame according to the args
@@ -773,8 +778,8 @@ class BaseMerger(abc.ABC):
     def _rename_columns_and_select(
         self,
         df,
-        rename_col_dict:
-        columns:
+        rename_col_dict: dict[str, str],
+        columns: list[str] = None,
     ):
         """
         rename the columns of the df according to rename_col_dict, and select only `columns` if it is not none
@@ -801,7 +806,7 @@ class BaseMerger(abc.ABC):
         """
         raise NotImplementedError
 
-    def _order_by(self, order_by_active:
+    def _order_by(self, order_by_active: list[str]):
         """
         Order by `order_by_active` along all axis.
 
```
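The new `additional_filters` argument is threaded from offline retrieval through `BaseMerger` down to each engine's `_get_engine_df`, so extra row-level filters can be pushed into the offline read alongside the existing time and query options. A minimal sketch of how it might be passed, assuming parquet-style `(column, op, value)` filter tuples; the vector URI, column name, and threshold are illustrative:

```python
import mlrun.feature_store as fstore

# load an existing feature vector (URI is illustrative)
vector = fstore.get_feature_vector("store://feature-vectors/my-project/my-vec")

# push an extra row filter into the offline merge, in addition to the usual
# start_time/end_time and query arguments (filter format assumed to follow
# parquet-style (column, op, value) tuples)
resp = vector.get_offline_features(
    additional_filters=[("bid", ">", 100)],
)
df = resp.to_dataframe()
```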
```diff
@@ -19,7 +19,7 @@ from collections import Counter
 # np.bool -> bool and np.object -> object fix backported from pyspark v3.3.3.
 
 
-class PandasConversionMixin
+class PandasConversionMixin:
     """
     Min-in for the conversion from Spark to pandas. Currently, only :class:`DataFrame`
     can use this class.
@@ -79,10 +79,10 @@ class PandasConversionMixin(object):
             msg = (
                 "toPandas attempted Arrow optimization because "
                 "'spark.sql.execution.arrow.pyspark.enabled' is set to true; however, "
-                "failed by the reason below:\n
+                f"failed by the reason below:\n {e}\n"
                 "Attempting non-optimization as "
                 "'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to "
-                "true."
+                "true."
             )
             warnings.warn(msg)
             use_arrow = False
@@ -92,7 +92,7 @@ class PandasConversionMixin(object):
                 "'spark.sql.execution.arrow.pyspark.enabled' is set to true, but has "
                 "reached the error below and will not continue because automatic fallback "
                 "with 'spark.sql.execution.arrow.pyspark.fallback.enabled' has been set to "
-                "false.\n
+                f"false.\n {e}"
             )
             warnings.warn(msg)
             raise
@@ -108,9 +108,7 @@ class PandasConversionMixin(object):
             )
 
             # Rename columns to avoid duplicated column names.
-            tmp_column_names = [
-                "col_{}".format(i) for i in range(len(self.columns))
-            ]
+            tmp_column_names = [f"col_{i}" for i in range(len(self.columns))]
             self_destruct = self.sql_ctx._conf.arrowPySparkSelfDestructEnabled()
             batches = self.toDF(*tmp_column_names)._collect_as_arrow(
                 split_batches=self_destruct
@@ -160,7 +158,7 @@ class PandasConversionMixin(object):
                     "reached the error below and can not continue. Note that "
                     "'spark.sql.execution.arrow.pyspark.fallback.enabled' does not have an "
                     "effect on failures in the middle of "
-                    "computation.\n
+                    f"computation.\n {e}"
                 )
                 warnings.warn(msg)
                 raise
@@ -170,10 +168,10 @@ class PandasConversionMixin(object):
         column_counter = Counter(self.columns)
 
         dtype = [None] * len(self.schema)
-        for
+        for field_idx, field in enumerate(self.schema):
             # For duplicate column name, we use `iloc` to access it.
             if column_counter[field.name] > 1:
-                pandas_col = pdf.iloc[:,
+                pandas_col = pdf.iloc[:, field_idx]
             else:
                 pandas_col = pdf[field.name]
 
@@ -189,12 +187,12 @@ class PandasConversionMixin(object):
                 and field.nullable
                 and pandas_col.isnull().any()
             ):
-                dtype[
+                dtype[field_idx] = pandas_type
             # Ensure we fall back to nullable numpy types, even when whole column is null:
             if isinstance(field.dataType, IntegralType) and pandas_col.isnull().any():
-                dtype[
+                dtype[field_idx] = np.float64
             if isinstance(field.dataType, BooleanType) and pandas_col.isnull().any():
-                dtype[
+                dtype[field_idx] = object
 
         df = pd.DataFrame()
         for index, t in enumerate(dtype):
```
mlrun/feature_store/retrieval/dask_merger.py
CHANGED

```diff
@@ -145,6 +145,7 @@ class DaskFeatureMerger(BaseMerger):
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
     ):
         import dask.dataframe as dd
 
@@ -155,6 +156,7 @@ class DaskFeatureMerger(BaseMerger):
             end_time=end_time,
             time_column=time_column,
             index=False,
+            additional_filters=additional_filters,
         )
 
         return self._reset_index(df).persist()
```
mlrun/feature_store/retrieval/job.py
CHANGED

```diff
@@ -15,6 +15,7 @@
 import uuid
 
 import mlrun
+import mlrun.common.constants as mlrun_constants
 from mlrun.config import config as mlconf
 from mlrun.model import DataTargetBase, new_task
 from mlrun.runtimes.function_reference import FunctionReference
@@ -42,6 +43,7 @@ def run_merge_job(
     start_time=None,
     end_time=None,
     timestamp_for_filtering=None,
+    additional_filters=None,
 ):
     name = vector.metadata.name
     if not target or not hasattr(target, "to_dict"):
@@ -116,11 +118,14 @@ def run_merge_job(
             "end_time": end_time,
             "timestamp_for_filtering": timestamp_for_filtering,
             "engine_args": engine_args,
+            "additional_filters": additional_filters,
         },
         inputs={"entity_rows": entity_rows} if entity_rows is not None else {},
     )
     task.spec.secret_sources = run_config.secret_sources
-    task.set_label(
+    task.set_label(
+        mlrun_constants.MLRunInternalLabels.job_type, "feature-merge"
+    ).set_label(mlrun_constants.MLRunInternalLabels.feature_vector, vector.uri)
     task.metadata.uid = uuid.uuid4().hex
     vector.status.run_uri = task.metadata.uid
     vector.save()
@@ -196,7 +201,8 @@ import mlrun.feature_store.retrieval
 from mlrun.datastore.targets import get_target_driver
 def merge_handler(context, vector_uri, target, entity_rows=None,
                   entity_timestamp_column=None, drop_columns=None, with_indexes=None, query=None,
-                  engine_args=None, order_by=None, start_time=None, end_time=None, timestamp_for_filtering=None
+                  engine_args=None, order_by=None, start_time=None, end_time=None, timestamp_for_filtering=None,
+                  additional_filters=None):
     vector = context.get_store_resource(vector_uri)
     store_target = get_target_driver(target, vector)
     if entity_rows:
@@ -206,7 +212,7 @@ def merge_handler(context, vector_uri, target, entity_rows=None,
     merger = mlrun.feature_store.retrieval.{{{engine}}}(vector, **(engine_args or {}))
     merger.start(entity_rows, entity_timestamp_column, store_target, drop_columns, with_indexes=with_indexes,
                  query=query, order_by=order_by, start_time=start_time, end_time=end_time,
-                 timestamp_for_filtering=timestamp_for_filtering)
+                 timestamp_for_filtering=timestamp_for_filtering, additional_filters=additional_filters)
 
     target = vector.status.targets[store_target.name].to_dict()
     context.log_result('feature_vector', vector.uri)
```
mlrun/feature_store/retrieval/local_merger.py
CHANGED

```diff
@@ -114,12 +114,14 @@ class LocalFeatureMerger(BaseMerger):
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
     ):
         df = feature_set.to_dataframe(
             columns=column_names,
             start_time=start_time,
             end_time=end_time,
             time_column=time_column,
+            additional_filters=additional_filters,
         )
         if df.index.names[0]:
             df.reset_index(inplace=True)
```
mlrun/feature_store/retrieval/spark_merger.py
CHANGED

```diff
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+
 import pandas as pd
 import semver
 
@@ -24,6 +25,32 @@ from .base import BaseMerger
 from .conversion import PandasConversionMixin
 
 
+def spark_df_to_pandas(spark_df):
+    # as of pyspark 3.2.3, toPandas fails to convert timestamps unless we work around the issue
+    # when we upgrade pyspark, we should check whether this workaround is still necessary
+    # see https://stackoverflow.com/questions/76389694/transforming-pyspark-to-pandas-dataframe
+    if semver.parse(pd.__version__)["major"] >= 2:
+        import pyspark.sql.functions as pyspark_functions
+
+        type_conversion_dict = {}
+        for field in spark_df.schema.fields:
+            if str(field.dataType) == "TimestampType":
+                spark_df = spark_df.withColumn(
+                    field.name,
+                    pyspark_functions.date_format(
+                        pyspark_functions.to_timestamp(field.name),
+                        "yyyy-MM-dd'T'HH:mm:ss.SSSSSSSSS",
+                    ),
+                )
+                type_conversion_dict[field.name] = "datetime64[ns]"
+        df = PandasConversionMixin.toPandas(spark_df)
+        if type_conversion_dict:
+            df = df.astype(type_conversion_dict)
+        return df
+    else:
+        return PandasConversionMixin.toPandas(spark_df)
+
+
 class SparkFeatureMerger(BaseMerger):
     engine = "spark"
     support_offline = True
@@ -166,29 +193,7 @@ class SparkFeatureMerger(BaseMerger):
     def get_df(self, to_pandas=True):
         if to_pandas:
             if self._pandas_df is None:
-                df = self._result_df
-                # as of pyspark 3.2.3, toPandas fails to convert timestamps unless we work around the issue
-                # when we upgrade pyspark, we should check whether this workaround is still necessary
-                # see https://stackoverflow.com/questions/76389694/transforming-pyspark-to-pandas-dataframe
-                if semver.parse(pd.__version__)["major"] >= 2:
-                    import pyspark.sql.functions as pyspark_functions
-
-                    type_conversion_dict = {}
-                    for field in df.schema.fields:
-                        if str(field.dataType) == "TimestampType":
-                            df = df.withColumn(
-                                field.name,
-                                pyspark_functions.date_format(
-                                    pyspark_functions.to_timestamp(field.name),
-                                    "yyyy-MM-dd'T'HH:mm:ss.SSSSSSSSS",
-                                ),
-                            )
-                            type_conversion_dict[field.name] = "datetime64[ns]"
-                    df = PandasConversionMixin.toPandas(df)
-                    if type_conversion_dict:
-                        df = df.astype(type_conversion_dict)
-                else:
-                    df = PandasConversionMixin.toPandas(df)
+                df = spark_df_to_pandas(self._result_df)
                 self._pandas_df = df
                 self._set_indexes(self._pandas_df)
             return self._pandas_df
@@ -221,7 +226,12 @@ class SparkFeatureMerger(BaseMerger):
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
     ):
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
+
         source_kwargs = {}
         if feature_set.spec.passthrough:
             if not feature_set.spec.source:
@@ -243,13 +253,13 @@ class SparkFeatureMerger(BaseMerger):
         # handling case where there are multiple feature sets and user creates vector where
         # entity_timestamp_column is from a specific feature set (can't be entity timestamp)
         source_driver = mlrun.datastore.sources.source_kind_to_driver[source_kind]
-
         source = source_driver(
             name=self.vector.metadata.name,
             path=source_path,
             time_field=time_column,
             start_time=start_time,
            end_time=end_time,
+            additional_filters=additional_filters,
             **source_kwargs,
         )
```
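The timestamp-conversion workaround that used to live inline in `SparkFeatureMerger.get_df` is now the module-level helper `spark_df_to_pandas`, so other Spark code paths can reuse it. A minimal sketch of calling it directly, assuming the import path matches the file shown above and that an active Spark session is available; the sample data is illustrative:

```python
import datetime

from pyspark.sql import SparkSession

from mlrun.feature_store.retrieval.spark_merger import spark_df_to_pandas

spark = SparkSession.builder.master("local[1]").appName("to-pandas-demo").getOrCreate()

# a tiny Spark dataframe with a timestamp column (illustrative data)
spark_df = spark.createDataFrame(
    [("GOOG", datetime.datetime(2024, 1, 1, 12, 30))],
    ["ticker", "time"],
)

# converts via the backported PandasConversionMixin; on pandas>=2 it round-trips
# TimestampType columns through strings so they land as datetime64[ns]
pdf = spark_df_to_pandas(spark_df)
print(pdf.dtypes)
```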