mlrun 1.6.4rc2__py3-none-any.whl → 1.7.0rc20__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as they appear in their public registries; it is provided for informational purposes only.
- mlrun/__init__.py +11 -1
- mlrun/__main__.py +26 -112
- mlrun/alerts/__init__.py +15 -0
- mlrun/alerts/alert.py +144 -0
- mlrun/api/schemas/__init__.py +5 -4
- mlrun/artifacts/__init__.py +8 -3
- mlrun/artifacts/base.py +46 -257
- mlrun/artifacts/dataset.py +11 -192
- mlrun/artifacts/manager.py +47 -48
- mlrun/artifacts/model.py +31 -159
- mlrun/artifacts/plots.py +23 -380
- mlrun/common/constants.py +69 -0
- mlrun/common/db/sql_session.py +2 -3
- mlrun/common/formatters/__init__.py +19 -0
- mlrun/common/formatters/artifact.py +21 -0
- mlrun/common/formatters/base.py +78 -0
- mlrun/common/formatters/function.py +41 -0
- mlrun/common/formatters/pipeline.py +53 -0
- mlrun/common/formatters/project.py +51 -0
- mlrun/common/helpers.py +1 -2
- mlrun/common/model_monitoring/helpers.py +9 -5
- mlrun/{runtimes → common/runtimes}/constants.py +37 -9
- mlrun/common/schemas/__init__.py +24 -4
- mlrun/common/schemas/alert.py +203 -0
- mlrun/common/schemas/api_gateway.py +148 -0
- mlrun/common/schemas/artifact.py +18 -8
- mlrun/common/schemas/auth.py +11 -5
- mlrun/common/schemas/background_task.py +1 -1
- mlrun/common/schemas/client_spec.py +4 -1
- mlrun/common/schemas/feature_store.py +16 -16
- mlrun/common/schemas/frontend_spec.py +8 -7
- mlrun/common/schemas/function.py +5 -1
- mlrun/common/schemas/hub.py +11 -18
- mlrun/common/schemas/memory_reports.py +2 -2
- mlrun/common/schemas/model_monitoring/__init__.py +18 -3
- mlrun/common/schemas/model_monitoring/constants.py +83 -26
- mlrun/common/schemas/model_monitoring/grafana.py +13 -9
- mlrun/common/schemas/model_monitoring/model_endpoints.py +99 -16
- mlrun/common/schemas/notification.py +4 -4
- mlrun/common/schemas/object.py +2 -2
- mlrun/{runtimes/mpijob/v1alpha1.py → common/schemas/pagination.py} +10 -13
- mlrun/common/schemas/pipeline.py +1 -10
- mlrun/common/schemas/project.py +24 -23
- mlrun/common/schemas/runtime_resource.py +8 -12
- mlrun/common/schemas/schedule.py +3 -3
- mlrun/common/schemas/tag.py +1 -2
- mlrun/common/schemas/workflow.py +2 -2
- mlrun/common/types.py +7 -1
- mlrun/config.py +54 -17
- mlrun/data_types/to_pandas.py +10 -12
- mlrun/datastore/__init__.py +5 -8
- mlrun/datastore/alibaba_oss.py +130 -0
- mlrun/datastore/azure_blob.py +17 -5
- mlrun/datastore/base.py +62 -39
- mlrun/datastore/datastore.py +28 -9
- mlrun/datastore/datastore_profile.py +146 -20
- mlrun/datastore/filestore.py +0 -1
- mlrun/datastore/google_cloud_storage.py +6 -2
- mlrun/datastore/hdfs.py +56 -0
- mlrun/datastore/inmem.py +2 -2
- mlrun/datastore/redis.py +6 -2
- mlrun/datastore/s3.py +9 -0
- mlrun/datastore/snowflake_utils.py +43 -0
- mlrun/datastore/sources.py +201 -96
- mlrun/datastore/spark_utils.py +1 -2
- mlrun/datastore/store_resources.py +7 -7
- mlrun/datastore/targets.py +358 -104
- mlrun/datastore/utils.py +72 -58
- mlrun/datastore/v3io.py +5 -1
- mlrun/db/base.py +185 -35
- mlrun/db/factory.py +1 -1
- mlrun/db/httpdb.py +614 -179
- mlrun/db/nopdb.py +210 -26
- mlrun/errors.py +12 -1
- mlrun/execution.py +41 -24
- mlrun/feature_store/__init__.py +0 -2
- mlrun/feature_store/api.py +40 -72
- mlrun/feature_store/common.py +1 -1
- mlrun/feature_store/feature_set.py +76 -55
- mlrun/feature_store/feature_vector.py +28 -30
- mlrun/feature_store/ingestion.py +7 -6
- mlrun/feature_store/retrieval/base.py +16 -11
- mlrun/feature_store/retrieval/conversion.py +11 -13
- mlrun/feature_store/retrieval/dask_merger.py +2 -0
- mlrun/feature_store/retrieval/job.py +9 -3
- mlrun/feature_store/retrieval/local_merger.py +2 -0
- mlrun/feature_store/retrieval/spark_merger.py +34 -24
- mlrun/feature_store/steps.py +37 -34
- mlrun/features.py +9 -20
- mlrun/frameworks/_common/artifacts_library.py +9 -9
- mlrun/frameworks/_common/mlrun_interface.py +5 -5
- mlrun/frameworks/_common/model_handler.py +48 -48
- mlrun/frameworks/_common/plan.py +2 -3
- mlrun/frameworks/_common/producer.py +3 -4
- mlrun/frameworks/_common/utils.py +5 -5
- mlrun/frameworks/_dl_common/loggers/logger.py +6 -7
- mlrun/frameworks/_dl_common/loggers/mlrun_logger.py +9 -9
- mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +23 -47
- mlrun/frameworks/_ml_common/artifacts_library.py +1 -2
- mlrun/frameworks/_ml_common/loggers/logger.py +3 -4
- mlrun/frameworks/_ml_common/loggers/mlrun_logger.py +4 -5
- mlrun/frameworks/_ml_common/model_handler.py +24 -24
- mlrun/frameworks/_ml_common/pkl_model_server.py +2 -2
- mlrun/frameworks/_ml_common/plan.py +1 -1
- mlrun/frameworks/_ml_common/plans/calibration_curve_plan.py +2 -3
- mlrun/frameworks/_ml_common/plans/confusion_matrix_plan.py +2 -3
- mlrun/frameworks/_ml_common/plans/dataset_plan.py +3 -3
- mlrun/frameworks/_ml_common/plans/feature_importance_plan.py +3 -3
- mlrun/frameworks/_ml_common/plans/roc_curve_plan.py +4 -4
- mlrun/frameworks/_ml_common/utils.py +4 -4
- mlrun/frameworks/auto_mlrun/auto_mlrun.py +9 -9
- mlrun/frameworks/huggingface/model_server.py +4 -4
- mlrun/frameworks/lgbm/__init__.py +33 -33
- mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
- mlrun/frameworks/lgbm/callbacks/logging_callback.py +4 -5
- mlrun/frameworks/lgbm/callbacks/mlrun_logging_callback.py +4 -5
- mlrun/frameworks/lgbm/mlrun_interfaces/booster_mlrun_interface.py +1 -3
- mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +6 -6
- mlrun/frameworks/lgbm/model_handler.py +10 -10
- mlrun/frameworks/lgbm/model_server.py +6 -6
- mlrun/frameworks/lgbm/utils.py +5 -5
- mlrun/frameworks/onnx/dataset.py +8 -8
- mlrun/frameworks/onnx/mlrun_interface.py +3 -3
- mlrun/frameworks/onnx/model_handler.py +6 -6
- mlrun/frameworks/onnx/model_server.py +7 -7
- mlrun/frameworks/parallel_coordinates.py +4 -3
- mlrun/frameworks/pytorch/__init__.py +18 -18
- mlrun/frameworks/pytorch/callbacks/callback.py +4 -5
- mlrun/frameworks/pytorch/callbacks/logging_callback.py +17 -17
- mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +11 -11
- mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +23 -29
- mlrun/frameworks/pytorch/callbacks_handler.py +38 -38
- mlrun/frameworks/pytorch/mlrun_interface.py +20 -20
- mlrun/frameworks/pytorch/model_handler.py +17 -17
- mlrun/frameworks/pytorch/model_server.py +7 -7
- mlrun/frameworks/sklearn/__init__.py +13 -13
- mlrun/frameworks/sklearn/estimator.py +4 -4
- mlrun/frameworks/sklearn/metrics_library.py +14 -14
- mlrun/frameworks/sklearn/mlrun_interface.py +3 -6
- mlrun/frameworks/sklearn/model_handler.py +2 -2
- mlrun/frameworks/tf_keras/__init__.py +10 -7
- mlrun/frameworks/tf_keras/callbacks/logging_callback.py +15 -15
- mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +11 -11
- mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +19 -23
- mlrun/frameworks/tf_keras/mlrun_interface.py +9 -11
- mlrun/frameworks/tf_keras/model_handler.py +14 -14
- mlrun/frameworks/tf_keras/model_server.py +6 -6
- mlrun/frameworks/xgboost/__init__.py +13 -13
- mlrun/frameworks/xgboost/model_handler.py +6 -6
- mlrun/k8s_utils.py +14 -16
- mlrun/launcher/__init__.py +1 -1
- mlrun/launcher/base.py +16 -15
- mlrun/launcher/client.py +8 -6
- mlrun/launcher/factory.py +1 -1
- mlrun/launcher/local.py +17 -11
- mlrun/launcher/remote.py +16 -10
- mlrun/lists.py +7 -6
- mlrun/model.py +238 -73
- mlrun/model_monitoring/__init__.py +1 -1
- mlrun/model_monitoring/api.py +138 -315
- mlrun/model_monitoring/application.py +5 -296
- mlrun/model_monitoring/applications/__init__.py +24 -0
- mlrun/model_monitoring/applications/_application_steps.py +157 -0
- mlrun/model_monitoring/applications/base.py +282 -0
- mlrun/model_monitoring/applications/context.py +214 -0
- mlrun/model_monitoring/applications/evidently_base.py +211 -0
- mlrun/model_monitoring/applications/histogram_data_drift.py +349 -0
- mlrun/model_monitoring/applications/results.py +99 -0
- mlrun/model_monitoring/controller.py +104 -84
- mlrun/model_monitoring/controller_handler.py +13 -5
- mlrun/model_monitoring/db/__init__.py +18 -0
- mlrun/model_monitoring/{stores → db/stores}/__init__.py +43 -36
- mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
- mlrun/model_monitoring/{stores/model_endpoint_store.py → db/stores/base/store.py} +64 -40
- mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
- mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +71 -0
- mlrun/model_monitoring/{stores → db/stores/sqldb}/models/base.py +109 -5
- mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +88 -0
- mlrun/model_monitoring/{stores/models/mysql.py → db/stores/sqldb/models/sqlite.py} +19 -13
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +684 -0
- mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
- mlrun/model_monitoring/{stores/kv_model_endpoint_store.py → db/stores/v3io_kv/kv_store.py} +310 -165
- mlrun/model_monitoring/db/tsdb/__init__.py +100 -0
- mlrun/model_monitoring/db/tsdb/base.py +329 -0
- mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
- mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +240 -0
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +45 -0
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +397 -0
- mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +117 -0
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +630 -0
- mlrun/model_monitoring/evidently_application.py +6 -118
- mlrun/model_monitoring/features_drift_table.py +134 -106
- mlrun/model_monitoring/helpers.py +127 -28
- mlrun/model_monitoring/metrics/__init__.py +13 -0
- mlrun/model_monitoring/metrics/histogram_distance.py +127 -0
- mlrun/model_monitoring/model_endpoint.py +3 -2
- mlrun/model_monitoring/prometheus.py +1 -4
- mlrun/model_monitoring/stream_processing.py +62 -231
- mlrun/model_monitoring/tracking_policy.py +9 -2
- mlrun/model_monitoring/writer.py +152 -124
- mlrun/package/__init__.py +6 -6
- mlrun/package/context_handler.py +5 -5
- mlrun/package/packager.py +7 -7
- mlrun/package/packagers/default_packager.py +6 -6
- mlrun/package/packagers/numpy_packagers.py +15 -15
- mlrun/package/packagers/pandas_packagers.py +5 -5
- mlrun/package/packagers/python_standard_library_packagers.py +10 -10
- mlrun/package/packagers_manager.py +19 -23
- mlrun/package/utils/_formatter.py +6 -6
- mlrun/package/utils/_pickler.py +2 -2
- mlrun/package/utils/_supported_format.py +4 -4
- mlrun/package/utils/log_hint_utils.py +2 -2
- mlrun/package/utils/type_hint_utils.py +4 -9
- mlrun/platforms/__init__.py +11 -10
- mlrun/platforms/iguazio.py +24 -203
- mlrun/projects/operations.py +35 -21
- mlrun/projects/pipelines.py +68 -99
- mlrun/projects/project.py +830 -266
- mlrun/render.py +3 -11
- mlrun/run.py +162 -166
- mlrun/runtimes/__init__.py +62 -7
- mlrun/runtimes/base.py +39 -32
- mlrun/runtimes/daskjob.py +8 -8
- mlrun/runtimes/databricks_job/databricks_cancel_task.py +1 -1
- mlrun/runtimes/databricks_job/databricks_runtime.py +7 -7
- mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
- mlrun/runtimes/funcdoc.py +0 -28
- mlrun/runtimes/function_reference.py +1 -1
- mlrun/runtimes/kubejob.py +28 -122
- mlrun/runtimes/local.py +6 -3
- mlrun/runtimes/mpijob/__init__.py +0 -20
- mlrun/runtimes/mpijob/abstract.py +9 -10
- mlrun/runtimes/mpijob/v1.py +1 -1
- mlrun/{model_monitoring/stores/models/sqlite.py → runtimes/nuclio/__init__.py} +7 -9
- mlrun/runtimes/nuclio/api_gateway.py +709 -0
- mlrun/runtimes/nuclio/application/__init__.py +15 -0
- mlrun/runtimes/nuclio/application/application.py +523 -0
- mlrun/runtimes/nuclio/application/reverse_proxy.go +95 -0
- mlrun/runtimes/{function.py → nuclio/function.py} +112 -73
- mlrun/runtimes/{nuclio.py → nuclio/nuclio.py} +6 -6
- mlrun/runtimes/{serving.py → nuclio/serving.py} +45 -51
- mlrun/runtimes/pod.py +286 -88
- mlrun/runtimes/remotesparkjob.py +2 -2
- mlrun/runtimes/sparkjob/spark3job.py +51 -34
- mlrun/runtimes/utils.py +7 -75
- mlrun/secrets.py +9 -5
- mlrun/serving/remote.py +2 -7
- mlrun/serving/routers.py +13 -10
- mlrun/serving/server.py +22 -26
- mlrun/serving/states.py +99 -25
- mlrun/serving/utils.py +3 -3
- mlrun/serving/v1_serving.py +6 -7
- mlrun/serving/v2_serving.py +59 -20
- mlrun/track/tracker.py +2 -1
- mlrun/track/tracker_manager.py +3 -3
- mlrun/track/trackers/mlflow_tracker.py +1 -2
- mlrun/utils/async_http.py +5 -7
- mlrun/utils/azure_vault.py +1 -1
- mlrun/utils/clones.py +1 -2
- mlrun/utils/condition_evaluator.py +3 -3
- mlrun/utils/db.py +3 -3
- mlrun/utils/helpers.py +183 -197
- mlrun/utils/http.py +2 -5
- mlrun/utils/logger.py +76 -14
- mlrun/utils/notifications/notification/__init__.py +17 -12
- mlrun/utils/notifications/notification/base.py +14 -2
- mlrun/utils/notifications/notification/console.py +2 -0
- mlrun/utils/notifications/notification/git.py +3 -1
- mlrun/utils/notifications/notification/ipython.py +3 -1
- mlrun/utils/notifications/notification/slack.py +101 -21
- mlrun/utils/notifications/notification/webhook.py +11 -1
- mlrun/utils/notifications/notification_pusher.py +155 -30
- mlrun/utils/retryer.py +208 -0
- mlrun/utils/singleton.py +1 -1
- mlrun/utils/v3io_clients.py +2 -4
- mlrun/utils/version/version.json +2 -2
- mlrun/utils/version/version.py +2 -6
- {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/METADATA +31 -19
- mlrun-1.7.0rc20.dist-info/RECORD +353 -0
- mlrun/kfpops.py +0 -868
- mlrun/model_monitoring/batch.py +0 -1095
- mlrun/model_monitoring/stores/models/__init__.py +0 -27
- mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -384
- mlrun/platforms/other.py +0 -306
- mlrun-1.6.4rc2.dist-info/RECORD +0 -314
- {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/LICENSE +0 -0
- {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/WHEEL +0 -0
- {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/entry_points.txt +0 -0
- {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/top_level.txt +0 -0
mlrun/feature_store/api.py
CHANGED
@@ -15,9 +15,8 @@ import copy
 import importlib.util
 import pathlib
 import sys
-import typing
 from datetime import datetime
-from typing import Any,
+from typing import Any, Optional, Union
 
 import pandas as pd
 from deprecated import deprecated
@@ -103,7 +102,7 @@ def get_offline_features(
     entity_timestamp_column: str = None,
     target: DataTargetBase = None,
     run_config: RunConfig = None,
-    drop_columns:
+    drop_columns: list[str] = None,
     start_time: Union[str, datetime] = None,
     end_time: Union[str, datetime] = None,
     with_indexes: bool = False,
@@ -111,9 +110,10 @@ def get_offline_features(
     engine: str = None,
     engine_args: dict = None,
     query: str = None,
-    order_by: Union[str,
+    order_by: Union[str, list[str]] = None,
     spark_service: str = None,
-    timestamp_for_filtering: Union[str,
+    timestamp_for_filtering: Union[str, dict[str, str]] = None,
+    additional_filters: list = None,
 ):
     """retrieve offline feature vector results
 
@@ -137,7 +137,10 @@ def get_offline_features(
         ]
         vector = FeatureVector(features=features)
         resp = get_offline_features(
-            vector,
+            vector,
+            entity_rows=trades,
+            entity_timestamp_column="time",
+            query="ticker in ['GOOG'] and bid>100",
         )
         print(resp.to_dataframe())
         print(vector.get_stats_table())
@@ -173,6 +176,13 @@ def get_offline_features(
                                      By default, the filter executes on the timestamp_key of each feature set.
                                      Note: the time filtering is performed on each feature set before the
                                      merge process using start_time and end_time params.
+    :param additional_filters: List of additional_filter conditions as tuples.
+                               Each tuple should be in the format (column_name, operator, value).
+                               Supported operators: "=", ">=", "<=", ">", "<".
+                               Example: [("Product", "=", "Computer")]
+                               For all supported filters, please see:
+                               https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetDataset.html
+
 
     """
     return _get_offline_features(
@@ -192,6 +202,7 @@ def get_offline_features(
         order_by,
         spark_service,
         timestamp_for_filtering,
+        additional_filters,
     )
 
 
@@ -201,7 +212,7 @@ def _get_offline_features(
     entity_timestamp_column: str = None,
     target: DataTargetBase = None,
     run_config: RunConfig = None,
-    drop_columns:
+    drop_columns: list[str] = None,
     start_time: Union[str, datetime] = None,
     end_time: Union[str, datetime] = None,
     with_indexes: bool = False,
@@ -209,9 +220,10 @@ def _get_offline_features(
     engine: str = None,
     engine_args: dict = None,
     query: str = None,
-    order_by: Union[str,
+    order_by: Union[str, list[str]] = None,
     spark_service: str = None,
-    timestamp_for_filtering: Union[str,
+    timestamp_for_filtering: Union[str, dict[str, str]] = None,
+    additional_filters=None,
 ) -> Union[OfflineVectorResponse, RemoteVectorResponse]:
     if entity_rows is None and entity_timestamp_column is not None:
         raise mlrun.errors.MLRunInvalidArgumentError(
@@ -250,6 +262,7 @@ def _get_offline_features(
         start_time=start_time,
         end_time=end_time,
         timestamp_for_filtering=timestamp_for_filtering,
+        additional_filters=additional_filters,
     )
 
     merger = merger_engine(feature_vector, **(engine_args or {}))
@@ -265,6 +278,7 @@ def _get_offline_features(
         update_stats=update_stats,
         query=query,
         order_by=order_by,
+        additional_filters=additional_filters,
     )
 
 
@@ -280,7 +294,7 @@ def get_online_feature_service(
     fixed_window_type: FixedWindowType = FixedWindowType.LastClosedWindow,
     impute_policy: dict = None,
     update_stats: bool = False,
-    entity_keys:
+    entity_keys: list[str] = None,
 ):
     """initialize and return online feature vector service api,
     returns :py:class:`~mlrun.feature_store.OnlineVectorService`
@@ -308,7 +322,7 @@ def get_online_feature_service(
 
     Example::
 
-        svc = get_online_feature_service(vector_uri, entity_keys=[
+        svc = get_online_feature_service(vector_uri, entity_keys=["ticker"])
         try:
             resp = svc.get([{"ticker": "GOOG"}, {"ticker": "MSFT"}])
             print(resp)
@@ -361,7 +375,7 @@ def _get_online_feature_service(
     fixed_window_type: FixedWindowType = FixedWindowType.LastClosedWindow,
     impute_policy: dict = None,
     update_stats: bool = False,
-    entity_keys:
+    entity_keys: list[str] = None,
 ) -> OnlineVectorService:
     if isinstance(feature_vector, FeatureVector):
         update_stats = True
@@ -413,7 +427,7 @@ def _rename_source_dataframe_columns(df: pd.DataFrame) -> pd.DataFrame:
     return df
 
 
-def _get_namespace(run_config: RunConfig) ->
+def _get_namespace(run_config: RunConfig) -> dict[str, Any]:
     # if running locally, we need to import the file dynamically to get its namespace
     if run_config and run_config.local and run_config.function:
         filename = run_config.function.spec.filename
@@ -431,7 +445,7 @@ def _get_namespace(run_config: RunConfig) -> Dict[str, Any]:
 def ingest(
     featureset: Union[FeatureSet, str] = None,
     source=None,
-    targets:
+    targets: list[DataTargetBase] = None,
     namespace=None,
     return_df: bool = True,
     infer_options: InferOptions = InferOptions.default(),
@@ -457,7 +471,7 @@ def ingest(
         df = ingest(stocks_set, stocks, infer_options=fstore.InferOptions.default())
 
         # for running as remote job
-        config = RunConfig(image=
+        config = RunConfig(image="mlrun/mlrun")
         df = ingest(stocks_set, stocks, run_config=config)
 
         # specify source and targets
@@ -511,7 +525,7 @@ def ingest(
 def _ingest(
     featureset: Union[FeatureSet, str] = None,
     source=None,
-    targets:
+    targets: list[DataTargetBase] = None,
    namespace=None,
     return_df: bool = True,
     infer_options: InferOptions = InferOptions.default(),
@@ -876,7 +890,7 @@ def _preview(
 def _run_ingestion_job(
     featureset: Union[FeatureSet, str],
     source: DataSource = None,
-    targets:
+    targets: list[DataTargetBase] = None,
     name: str = None,
     infer_options: InferOptions = InferOptions.default(),
     run_config: RunConfig = None,
@@ -901,11 +915,11 @@ def _run_ingestion_job(
 def deploy_ingestion_service_v2(
     featureset: Union[FeatureSet, str],
     source: DataSource = None,
-    targets:
+    targets: list[DataTargetBase] = None,
     name: str = None,
     run_config: RunConfig = None,
     verbose=False,
-) ->
+) -> tuple[str, BaseRuntime]:
     """Start real-time ingestion service using nuclio function
 
     Deploy a real-time function implementing feature ingestion pipeline
@@ -944,11 +958,11 @@ def deploy_ingestion_service_v2(
 def _deploy_ingestion_service_v2(
     featureset: Union[FeatureSet, str],
     source: DataSource = None,
-    targets:
+    targets: list[DataTargetBase] = None,
     name: str = None,
     run_config: RunConfig = None,
     verbose=False,
-) ->
+) -> tuple[str, BaseRuntime]:
     if isinstance(featureset, str):
         featureset = get_feature_set_by_uri(featureset)
 
@@ -1003,58 +1017,11 @@ def _deploy_ingestion_service_v2(
     return function.deploy(), function
 
 
-@deprecated(
-    version="1.5.0",
-    reason="'deploy_ingestion_service' will be removed in 1.7.0, use 'deploy_ingestion_service_v2' instead",
-    category=FutureWarning,
-)
-def deploy_ingestion_service(
-    featureset: Union[FeatureSet, str],
-    source: DataSource = None,
-    targets: List[DataTargetBase] = None,
-    name: str = None,
-    run_config: RunConfig = None,
-    verbose=False,
-) -> str:
-    """Start real-time ingestion service using nuclio function
-
-    Deploy a real-time function implementing feature ingestion pipeline
-    the source maps to Nuclio event triggers (http, kafka, v3io stream, etc.)
-
-    the `run_config` parameter allow specifying the function and job configuration,
-    see: :py:class:`~mlrun.feature_store.RunConfig`
-
-    example::
-
-        source = HTTPSource()
-        func = mlrun.code_to_function("ingest", kind="serving").apply(mount_v3io())
-        config = RunConfig(function=func)
-        my_set.deploy_ingestion_service(source, run_config=config)
-
-    :param featureset: feature set object or uri
-    :param source: data source object describing the online or offline source
-    :param targets: list of data target objects
-    :param name: name for the job/function
-    :param run_config: service runtime configuration (function object/uri, resources, etc..)
-    :param verbose: verbose log
-
-    :return: URL to access the deployed ingestion service
-    """
-    endpoint, _ = featureset.deploy_ingestion_service(
-        source=source,
-        targets=targets,
-        name=name,
-        run_config=run_config,
-        verbose=verbose,
-    )
-    return endpoint
-
-
 def _ingest_with_spark(
     spark=None,
     featureset: Union[FeatureSet, str] = None,
     source: BaseSourceDriver = None,
-    targets:
+    targets: list[BaseStoreTarget] = None,
     infer_options: InferOptions = InferOptions.default(),
     mlrun_context=None,
     namespace=None,
@@ -1122,9 +1089,10 @@ def _ingest_with_spark(
             df_to_write = target.prepare_spark_df(
                 df_to_write, key_columns, timestamp_key, spark_options
             )
+            write_format = spark_options.pop("format", None)
             if overwrite:
                 write_spark_dataframe_with_options(
-                    spark_options, df_to_write, "overwrite"
+                    spark_options, df_to_write, "overwrite", write_format=write_format
                 )
             else:
                 # appending an empty dataframe may cause an empty file to be created (e.g. when writing to parquet)
@@ -1132,7 +1100,7 @@
                 df_to_write.persist()
                 if df_to_write.count() > 0:
                     write_spark_dataframe_with_options(
-                        spark_options, df_to_write, "append"
+                        spark_options, df_to_write, "append", write_format=write_format
                     )
             target.update_resource_status("ready")
 
@@ -1207,7 +1175,7 @@ def _infer_from_static_df(
 def set_task_params(
     featureset: FeatureSet,
     source: DataSource = None,
-    targets:
+    targets: list[DataTargetBase] = None,
     parameters: dict = None,
     infer_options: InferOptions = InferOptions.Null,
     overwrite=None,
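The substantive change in this file is the new `additional_filters` argument, threaded from `get_offline_features` through `_get_offline_features` into the feature-vector spec and the merger. Based on the docstring added above, a minimal usage sketch (the feature-vector URI and column names here are hypothetical)::

    import mlrun.feature_store as fstore

    vector = fstore.get_feature_vector("store://feature-vectors/my-project/sales")

    # additional_filters takes (column_name, operator, value) tuples; the new
    # docstring lists "=", ">=", "<=", ">", "<" as supported operators, and the
    # filters are pushed down to pyarrow's ParquetDataset reader.
    resp = fstore.get_offline_features(
        vector,
        additional_filters=[("Product", "=", "Computer")],
    )
    df = resp.to_dataframe()

This file also drops the module-level `deploy_ingestion_service` wrapper (deprecated since 1.5.0); `deploy_ingestion_service_v2` and `FeatureSet.deploy_ingestion_service` remain, and both now annotate their return value as an `(endpoint, function)` tuple rather than a bare endpoint URL.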
mlrun/feature_store/common.py
CHANGED
@@ -192,7 +192,7 @@ class RunConfig:
         owner=None,
         credentials: typing.Optional[mlrun.model.Credentials] = None,
         code: str = None,
-        requirements: typing.Union[str,
+        requirements: typing.Union[str, list[str]] = None,
         extra_spec: dict = None,
         auth_info=None,
     ):
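The `requirements` annotation on `RunConfig` is rewritten with the builtin `list[str]` generic (most of this release's edits to these files are that `typing.List`/`Dict`/`Tuple` style migration). A sketch of the two forms the parameter accepts, with an illustrative image and package pins::

    from mlrun.feature_store import RunConfig

    # either a path to a requirements file...
    config = RunConfig(image="mlrun/mlrun", requirements="requirements.txt")
    # ...or an inline list of package specifiers
    config = RunConfig(image="mlrun/mlrun", requirements=["pandas>=2.0", "pyarrow"])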
mlrun/feature_store/feature_set.py
CHANGED
@@ -13,7 +13,7 @@
 # limitations under the License.
 import warnings
 from datetime import datetime
-from typing import
+from typing import Optional, Union
 
 import pandas as pd
 from storey import EmitEveryEvent, EmitPolicy
@@ -119,9 +119,9 @@ class FeatureSetSpec(ModelObj):
 
         self.owner = owner
         self.description = description
-        self.entities:
-        self.relations:
-        self.features:
+        self.entities: list[Union[Entity, str]] = entities or []
+        self.relations: dict[str, Union[Entity, str]] = relations or {}
+        self.features: list[Feature] = features or []
         self.partition_keys = partition_keys or []
         self.timestamp_key = timestamp_key
         self.source = source
@@ -136,12 +136,12 @@ class FeatureSetSpec(ModelObj):
         self.with_default_targets = True
 
     @property
-    def entities(self) ->
+    def entities(self) -> list[Entity]:
         """feature set entities (indexes)"""
         return self._entities
 
     @entities.setter
-    def entities(self, entities:
+    def entities(self, entities: list[Union[Entity, str]]):
         if entities:
             # if the entity is a string, convert it to Entity class
             for i, entity in enumerate(entities):
@@ -163,21 +163,21 @@ class FeatureSetSpec(ModelObj):
         self._entities = ObjectList.from_list(Entity, entities)
 
     @property
-    def features(self) ->
+    def features(self) -> list[Feature]:
         """feature set features list"""
         return self._features
 
     @features.setter
-    def features(self, features:
+    def features(self, features: list[Feature]):
         self._features = ObjectList.from_list(Feature, features)
 
     @property
-    def targets(self) ->
+    def targets(self) -> list[DataTargetBase]:
         """list of desired targets (material storage)"""
         return self._targets
 
     @targets.setter
-    def targets(self, targets:
+    def targets(self, targets: list[DataTargetBase]):
         self._targets = ObjectList.from_list(DataTargetBase, targets)
 
     @property
@@ -230,12 +230,12 @@ class FeatureSetSpec(ModelObj):
         self._source = source
 
     @property
-    def relations(self) ->
+    def relations(self) -> dict[str, Entity]:
         """feature set relations dict"""
         return self._relations
 
     @relations.setter
-    def relations(self, relations:
+    def relations(self, relations: dict[str, Entity]):
         for col, ent in relations.items():
             if isinstance(ent, str):
                 relations[col] = Entity(ent)
@@ -284,12 +284,12 @@ class FeatureSetStatus(ModelObj):
         self.run_uri = run_uri
 
     @property
-    def targets(self) ->
+    def targets(self) -> list[DataTarget]:
         """list of material storage targets + their status/path"""
         return self._targets
 
     @targets.setter
-    def targets(self, targets:
+    def targets(self, targets: list[DataTarget]):
         self._targets = ObjectList.from_list(DataTarget, targets)
 
     def update_target(self, target: DataTarget):
@@ -318,8 +318,6 @@ def emit_policy_to_dict(policy: EmitPolicy):
 
 
 class FeatureSet(ModelObj):
-    """Feature set object, defines a set of features and their data pipeline"""
-
     kind = mlrun.common.schemas.ObjectKind.feature_set.value
     _dict_fields = ["kind", "metadata", "spec", "status"]
 
@@ -327,11 +325,11 @@ class FeatureSet(ModelObj):
         self,
         name: str = None,
         description: str = None,
-        entities:
+        entities: list[Union[Entity, str]] = None,
         timestamp_key: str = None,
         engine: str = None,
         label_column: str = None,
-        relations:
+        relations: dict[str, Union[Entity, str]] = None,
         passthrough: bool = None,
     ):
         """Feature set object, defines a set of features and their data pipeline
@@ -339,7 +337,10 @@ class FeatureSet(ModelObj):
         example::
 
             import mlrun.feature_store as fstore
-
+
+            ticks = fstore.FeatureSet(
+                "ticks", entities=["stock"], timestamp_key="timestamp"
+            )
             ticks.ingest(df)
 
         :param name: name of the feature set
@@ -532,7 +533,7 @@ class FeatureSet(ModelObj):
             self, **(class_args if class_args is not None else {})
         )
 
-    def purge_targets(self, target_names:
+    def purge_targets(self, target_names: list[str] = None, silent: bool = False):
         """Delete data of specific targets
         :param target_names: List of names of targets to delete (default: delete all ingested targets)
         :param silent: Fail silently if target doesn't exist in featureset status"""
@@ -560,7 +561,7 @@ class FeatureSet(ModelObj):
 
     def update_targets_for_ingest(
         self,
-        targets:
+        targets: list[DataTargetBase],
         overwrite: bool = None,
     ):
         if not targets:
@@ -581,7 +582,7 @@ class FeatureSet(ModelObj):
         update_targets_run_id_for_ingest(overwrite, targets, status_targets)
 
     def _reload_and_get_status_targets(
-        self, target_names:
+        self, target_names: list[str] = None, silent: bool = False
     ):
         try:
             self.reload(update_spec=False)
@@ -602,9 +603,7 @@ class FeatureSet(ModelObj):
                         pass
                     else:
                         raise mlrun.errors.MLRunNotFoundError(
-                            "Target not found in status (fset={
-                            self.metadata.name, target_name
-                            )
+                            f"Target not found in status (fset={self.metadata.name}, target={target_name})"
                         )
             else:
                 targets = self.status.targets
@@ -621,7 +620,7 @@ class FeatureSet(ModelObj):
         name: str,
         value_type: mlrun.data_types.ValueType = None,
         description: str = None,
-        labels: Optional[
+        labels: Optional[dict[str, str]] = None,
     ):
         """add/set an entity (dataset index)
 
@@ -629,12 +628,12 @@
 
             import mlrun.feature_store as fstore
 
-            ticks = fstore.FeatureSet(
-
-
-            ticks.add_entity(
-
-
+            ticks = fstore.FeatureSet(
+                "ticks", entities=["stock"], timestamp_key="timestamp"
+            )
+            ticks.add_entity(
+                "country", mlrun.data_types.ValueType.STRING, description="stock country"
+            )
             ticks.add_entity("year", mlrun.data_types.ValueType.INT16)
             ticks.save()
 
@@ -654,13 +653,23 @@
             import mlrun.feature_store as fstore
             from mlrun.features import Feature
 
-            ticks = fstore.FeatureSet(
-
-
-            ticks.add_feature(
-
-
-
+            ticks = fstore.FeatureSet(
+                "ticks", entities=["stock"], timestamp_key="timestamp"
+            )
+            ticks.add_feature(
+                Feature(
+                    value_type=mlrun.data_types.ValueType.STRING,
+                    description="client consistency",
+                ),
+                "ABC01",
+            )
+            ticks.add_feature(
+                Feature(
+                    value_type=mlrun.data_types.ValueType.FLOAT,
+                    description="client volatility",
+                ),
+                "SAB",
+            )
             ticks.save()
 
         :param feature: setting of Feature
@@ -864,15 +873,18 @@
         example::
 
             import mlrun.feature_store as fstore
+
             ...
-            ticks = fstore.FeatureSet(
-
-
-            ticks.add_aggregation(
-
-
-
-
+            ticks = fstore.FeatureSet(
+                "ticks", entities=["stock"], timestamp_key="timestamp"
+            )
+            ticks.add_aggregation(
+                name="priceN",
+                column="price",
+                operations=["avg"],
+                windows=["1d"],
+                period="1h",
+            )
             ticks.plot(rankdir="LR", with_targets=True)
 
         :param filename: target filepath for the graph image (None for the notebook)
@@ -905,6 +917,7 @@ class FeatureSet(ModelObj):
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
         **kwargs,
     ):
         """return featureset (offline) data as dataframe
@@ -916,6 +929,12 @@
         :param end_time: filter by end time
         :param time_column: specify the time column name in the file
         :param kwargs: additional reader (csv, parquet, ..) args
+        :param additional_filters: List of additional_filter conditions as tuples.
+                                   Each tuple should be in the format (column_name, operator, value).
+                                   Supported operators: "=", ">=", "<=", ">", "<".
+                                   Example: [("Product", "=", "Computer")]
+                                   For all supported filters, please see:
+                                   https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetDataset.html
         :return: DataFrame
         """
         entities = list(self.spec.entities.keys())
@@ -934,6 +953,7 @@
                 start_time=start_time,
                 end_time=end_time,
                 time_field=time_column,
+                additional_filters=additional_filters,
                 **kwargs,
             )
             # to_dataframe() can sometimes return an iterator of dataframes instead of one dataframe
@@ -953,6 +973,7 @@
                 start_time=start_time,
                 end_time=end_time,
                 time_column=time_column,
+                additional_filters=additional_filters,
                 **kwargs,
            )
         return result
@@ -983,7 +1004,7 @@
     def ingest(
         self,
         source=None,
-        targets:
+        targets: list[DataTargetBase] = None,
         namespace=None,
         return_df: bool = True,
         infer_options: InferOptions = InferOptions.default(),
@@ -1009,7 +1030,7 @@
             df = stocks_set.ingest(stocks, infer_options=fstore.InferOptions.default())
 
             # for running as remote job
-            config = RunConfig(image=
+            config = RunConfig(image="mlrun/mlrun")
             df = ingest(stocks_set, stocks, run_config=config)
 
             # specify source and targets
@@ -1085,11 +1106,11 @@
     def deploy_ingestion_service(
         self,
         source: DataSource = None,
-        targets:
+        targets: list[DataTargetBase] = None,
         name: str = None,
         run_config: RunConfig = None,
         verbose=False,
-    ) ->
+    ) -> tuple[str, BaseRuntime]:
         """Start real-time ingestion service using nuclio function
 
         Deploy a real-time function implementing feature ingestion pipeline
@@ -1122,7 +1143,7 @@
     def extract_relation_keys(
         self,
         other_feature_set,
-        relations:
+        relations: dict[str, Union[str, Entity]] = None,
     ) -> list[str]:
         """
         Checks whether a feature set can be merged to the right of this feature set.
@@ -1189,10 +1210,10 @@ class SparkAggregateByKey(StepToDict):
 
     def __init__(
         self,
-        key_columns:
+        key_columns: list[str],
         time_column: str,
-        aggregates:
-        emit_policy: Union[EmitPolicy,
+        aggregates: list[dict],
+        emit_policy: Union[EmitPolicy, dict] = None,
     ):
         self.key_columns = key_columns
         self.time_column = time_column
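With the docstring examples above repaired, they chain into one sequence; a combined sketch (the ingested dataframe `df` is hypothetical and would need "stock", "timestamp", and "price" columns)::

    import mlrun.feature_store as fstore

    ticks = fstore.FeatureSet("ticks", entities=["stock"], timestamp_key="timestamp")
    # rolling 1-day average of "price", updated every hour, per the new example
    ticks.add_aggregation(
        name="priceN", column="price", operations=["avg"], windows=["1d"], period="1h"
    )
    ticks.ingest(df)  # df: a pandas DataFrame with the columns above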