mlrun 1.6.4rc8__py3-none-any.whl → 1.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic. Click here for more details.
- mlrun/__init__.py +11 -1
- mlrun/__main__.py +40 -122
- mlrun/alerts/__init__.py +15 -0
- mlrun/alerts/alert.py +248 -0
- mlrun/api/schemas/__init__.py +5 -4
- mlrun/artifacts/__init__.py +8 -3
- mlrun/artifacts/base.py +47 -257
- mlrun/artifacts/dataset.py +11 -192
- mlrun/artifacts/manager.py +79 -47
- mlrun/artifacts/model.py +31 -159
- mlrun/artifacts/plots.py +23 -380
- mlrun/common/constants.py +74 -1
- mlrun/common/db/sql_session.py +5 -5
- mlrun/common/formatters/__init__.py +21 -0
- mlrun/common/formatters/artifact.py +45 -0
- mlrun/common/formatters/base.py +113 -0
- mlrun/common/formatters/feature_set.py +33 -0
- mlrun/common/formatters/function.py +46 -0
- mlrun/common/formatters/pipeline.py +53 -0
- mlrun/common/formatters/project.py +51 -0
- mlrun/common/formatters/run.py +29 -0
- mlrun/common/helpers.py +12 -3
- mlrun/common/model_monitoring/helpers.py +9 -5
- mlrun/{runtimes → common/runtimes}/constants.py +37 -9
- mlrun/common/schemas/__init__.py +31 -5
- mlrun/common/schemas/alert.py +202 -0
- mlrun/common/schemas/api_gateway.py +196 -0
- mlrun/common/schemas/artifact.py +25 -4
- mlrun/common/schemas/auth.py +16 -5
- mlrun/common/schemas/background_task.py +1 -1
- mlrun/common/schemas/client_spec.py +4 -2
- mlrun/common/schemas/common.py +7 -4
- mlrun/common/schemas/constants.py +3 -0
- mlrun/common/schemas/feature_store.py +74 -44
- mlrun/common/schemas/frontend_spec.py +15 -7
- mlrun/common/schemas/function.py +12 -1
- mlrun/common/schemas/hub.py +11 -18
- mlrun/common/schemas/memory_reports.py +2 -2
- mlrun/common/schemas/model_monitoring/__init__.py +20 -4
- mlrun/common/schemas/model_monitoring/constants.py +123 -42
- mlrun/common/schemas/model_monitoring/grafana.py +13 -9
- mlrun/common/schemas/model_monitoring/model_endpoints.py +101 -54
- mlrun/common/schemas/notification.py +71 -14
- mlrun/common/schemas/object.py +2 -2
- mlrun/{model_monitoring/controller_handler.py → common/schemas/pagination.py} +9 -12
- mlrun/common/schemas/pipeline.py +8 -1
- mlrun/common/schemas/project.py +69 -18
- mlrun/common/schemas/runs.py +7 -1
- mlrun/common/schemas/runtime_resource.py +8 -12
- mlrun/common/schemas/schedule.py +4 -4
- mlrun/common/schemas/tag.py +1 -2
- mlrun/common/schemas/workflow.py +12 -4
- mlrun/common/types.py +14 -1
- mlrun/config.py +154 -69
- mlrun/data_types/data_types.py +6 -1
- mlrun/data_types/spark.py +2 -2
- mlrun/data_types/to_pandas.py +67 -37
- mlrun/datastore/__init__.py +6 -8
- mlrun/datastore/alibaba_oss.py +131 -0
- mlrun/datastore/azure_blob.py +143 -42
- mlrun/datastore/base.py +102 -58
- mlrun/datastore/datastore.py +34 -13
- mlrun/datastore/datastore_profile.py +146 -20
- mlrun/datastore/dbfs_store.py +3 -7
- mlrun/datastore/filestore.py +1 -4
- mlrun/datastore/google_cloud_storage.py +97 -33
- mlrun/datastore/hdfs.py +56 -0
- mlrun/datastore/inmem.py +6 -3
- mlrun/datastore/redis.py +7 -2
- mlrun/datastore/s3.py +34 -12
- mlrun/datastore/snowflake_utils.py +45 -0
- mlrun/datastore/sources.py +303 -111
- mlrun/datastore/spark_utils.py +31 -2
- mlrun/datastore/store_resources.py +9 -7
- mlrun/datastore/storeytargets.py +151 -0
- mlrun/datastore/targets.py +453 -176
- mlrun/datastore/utils.py +72 -58
- mlrun/datastore/v3io.py +6 -1
- mlrun/db/base.py +274 -41
- mlrun/db/factory.py +1 -1
- mlrun/db/httpdb.py +893 -225
- mlrun/db/nopdb.py +291 -33
- mlrun/errors.py +36 -6
- mlrun/execution.py +115 -42
- mlrun/feature_store/__init__.py +0 -2
- mlrun/feature_store/api.py +65 -73
- mlrun/feature_store/common.py +7 -12
- mlrun/feature_store/feature_set.py +76 -55
- mlrun/feature_store/feature_vector.py +39 -31
- mlrun/feature_store/ingestion.py +7 -6
- mlrun/feature_store/retrieval/base.py +16 -11
- mlrun/feature_store/retrieval/dask_merger.py +2 -0
- mlrun/feature_store/retrieval/job.py +13 -4
- mlrun/feature_store/retrieval/local_merger.py +2 -0
- mlrun/feature_store/retrieval/spark_merger.py +24 -32
- mlrun/feature_store/steps.py +45 -34
- mlrun/features.py +11 -21
- mlrun/frameworks/_common/artifacts_library.py +9 -9
- mlrun/frameworks/_common/mlrun_interface.py +5 -5
- mlrun/frameworks/_common/model_handler.py +48 -48
- mlrun/frameworks/_common/plan.py +5 -6
- mlrun/frameworks/_common/producer.py +3 -4
- mlrun/frameworks/_common/utils.py +5 -5
- mlrun/frameworks/_dl_common/loggers/logger.py +6 -7
- mlrun/frameworks/_dl_common/loggers/mlrun_logger.py +9 -9
- mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +23 -47
- mlrun/frameworks/_ml_common/artifacts_library.py +1 -2
- mlrun/frameworks/_ml_common/loggers/logger.py +3 -4
- mlrun/frameworks/_ml_common/loggers/mlrun_logger.py +4 -5
- mlrun/frameworks/_ml_common/model_handler.py +24 -24
- mlrun/frameworks/_ml_common/pkl_model_server.py +2 -2
- mlrun/frameworks/_ml_common/plan.py +2 -2
- mlrun/frameworks/_ml_common/plans/calibration_curve_plan.py +2 -3
- mlrun/frameworks/_ml_common/plans/confusion_matrix_plan.py +2 -3
- mlrun/frameworks/_ml_common/plans/dataset_plan.py +3 -3
- mlrun/frameworks/_ml_common/plans/feature_importance_plan.py +3 -3
- mlrun/frameworks/_ml_common/plans/roc_curve_plan.py +4 -4
- mlrun/frameworks/_ml_common/utils.py +4 -4
- mlrun/frameworks/auto_mlrun/auto_mlrun.py +9 -9
- mlrun/frameworks/huggingface/model_server.py +4 -4
- mlrun/frameworks/lgbm/__init__.py +33 -33
- mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
- mlrun/frameworks/lgbm/callbacks/logging_callback.py +4 -5
- mlrun/frameworks/lgbm/callbacks/mlrun_logging_callback.py +4 -5
- mlrun/frameworks/lgbm/mlrun_interfaces/booster_mlrun_interface.py +1 -3
- mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +6 -6
- mlrun/frameworks/lgbm/model_handler.py +10 -10
- mlrun/frameworks/lgbm/model_server.py +6 -6
- mlrun/frameworks/lgbm/utils.py +5 -5
- mlrun/frameworks/onnx/dataset.py +8 -8
- mlrun/frameworks/onnx/mlrun_interface.py +3 -3
- mlrun/frameworks/onnx/model_handler.py +6 -6
- mlrun/frameworks/onnx/model_server.py +7 -7
- mlrun/frameworks/parallel_coordinates.py +6 -6
- mlrun/frameworks/pytorch/__init__.py +18 -18
- mlrun/frameworks/pytorch/callbacks/callback.py +4 -5
- mlrun/frameworks/pytorch/callbacks/logging_callback.py +17 -17
- mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +11 -11
- mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +23 -29
- mlrun/frameworks/pytorch/callbacks_handler.py +38 -38
- mlrun/frameworks/pytorch/mlrun_interface.py +20 -20
- mlrun/frameworks/pytorch/model_handler.py +17 -17
- mlrun/frameworks/pytorch/model_server.py +7 -7
- mlrun/frameworks/sklearn/__init__.py +13 -13
- mlrun/frameworks/sklearn/estimator.py +4 -4
- mlrun/frameworks/sklearn/metrics_library.py +14 -14
- mlrun/frameworks/sklearn/mlrun_interface.py +16 -9
- mlrun/frameworks/sklearn/model_handler.py +2 -2
- mlrun/frameworks/tf_keras/__init__.py +10 -7
- mlrun/frameworks/tf_keras/callbacks/logging_callback.py +15 -15
- mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +11 -11
- mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +19 -23
- mlrun/frameworks/tf_keras/mlrun_interface.py +9 -11
- mlrun/frameworks/tf_keras/model_handler.py +14 -14
- mlrun/frameworks/tf_keras/model_server.py +6 -6
- mlrun/frameworks/xgboost/__init__.py +13 -13
- mlrun/frameworks/xgboost/model_handler.py +6 -6
- mlrun/k8s_utils.py +61 -17
- mlrun/launcher/__init__.py +1 -1
- mlrun/launcher/base.py +16 -15
- mlrun/launcher/client.py +13 -11
- mlrun/launcher/factory.py +1 -1
- mlrun/launcher/local.py +23 -13
- mlrun/launcher/remote.py +17 -10
- mlrun/lists.py +7 -6
- mlrun/model.py +478 -103
- mlrun/model_monitoring/__init__.py +1 -1
- mlrun/model_monitoring/api.py +163 -371
- mlrun/{runtimes/mpijob/v1alpha1.py → model_monitoring/applications/__init__.py} +9 -15
- mlrun/model_monitoring/applications/_application_steps.py +188 -0
- mlrun/model_monitoring/applications/base.py +108 -0
- mlrun/model_monitoring/applications/context.py +341 -0
- mlrun/model_monitoring/{evidently_application.py → applications/evidently_base.py} +27 -22
- mlrun/model_monitoring/applications/histogram_data_drift.py +354 -0
- mlrun/model_monitoring/applications/results.py +99 -0
- mlrun/model_monitoring/controller.py +131 -278
- mlrun/model_monitoring/db/__init__.py +18 -0
- mlrun/model_monitoring/db/stores/__init__.py +136 -0
- mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
- mlrun/model_monitoring/db/stores/base/store.py +213 -0
- mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
- mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +71 -0
- mlrun/model_monitoring/db/stores/sqldb/models/base.py +190 -0
- mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +103 -0
- mlrun/model_monitoring/{stores/models/mysql.py → db/stores/sqldb/models/sqlite.py} +19 -13
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +659 -0
- mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +726 -0
- mlrun/model_monitoring/db/tsdb/__init__.py +105 -0
- mlrun/model_monitoring/db/tsdb/base.py +448 -0
- mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
- mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +279 -0
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +42 -0
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +507 -0
- mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +158 -0
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +849 -0
- mlrun/model_monitoring/features_drift_table.py +134 -106
- mlrun/model_monitoring/helpers.py +199 -55
- mlrun/model_monitoring/metrics/__init__.py +13 -0
- mlrun/model_monitoring/metrics/histogram_distance.py +127 -0
- mlrun/model_monitoring/model_endpoint.py +3 -2
- mlrun/model_monitoring/stream_processing.py +134 -398
- mlrun/model_monitoring/tracking_policy.py +9 -2
- mlrun/model_monitoring/writer.py +161 -125
- mlrun/package/__init__.py +6 -6
- mlrun/package/context_handler.py +5 -5
- mlrun/package/packager.py +7 -7
- mlrun/package/packagers/default_packager.py +8 -8
- mlrun/package/packagers/numpy_packagers.py +15 -15
- mlrun/package/packagers/pandas_packagers.py +5 -5
- mlrun/package/packagers/python_standard_library_packagers.py +10 -10
- mlrun/package/packagers_manager.py +19 -23
- mlrun/package/utils/_formatter.py +6 -6
- mlrun/package/utils/_pickler.py +2 -2
- mlrun/package/utils/_supported_format.py +4 -4
- mlrun/package/utils/log_hint_utils.py +2 -2
- mlrun/package/utils/type_hint_utils.py +4 -9
- mlrun/platforms/__init__.py +11 -10
- mlrun/platforms/iguazio.py +24 -203
- mlrun/projects/operations.py +52 -25
- mlrun/projects/pipelines.py +191 -197
- mlrun/projects/project.py +1227 -400
- mlrun/render.py +16 -19
- mlrun/run.py +209 -184
- mlrun/runtimes/__init__.py +83 -15
- mlrun/runtimes/base.py +51 -35
- mlrun/runtimes/daskjob.py +17 -10
- mlrun/runtimes/databricks_job/databricks_cancel_task.py +1 -1
- mlrun/runtimes/databricks_job/databricks_runtime.py +8 -7
- mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
- mlrun/runtimes/funcdoc.py +1 -29
- mlrun/runtimes/function_reference.py +1 -1
- mlrun/runtimes/kubejob.py +34 -128
- mlrun/runtimes/local.py +40 -11
- mlrun/runtimes/mpijob/__init__.py +0 -20
- mlrun/runtimes/mpijob/abstract.py +9 -10
- mlrun/runtimes/mpijob/v1.py +1 -1
- mlrun/{model_monitoring/stores/models/sqlite.py → runtimes/nuclio/__init__.py} +7 -9
- mlrun/runtimes/nuclio/api_gateway.py +769 -0
- mlrun/runtimes/nuclio/application/__init__.py +15 -0
- mlrun/runtimes/nuclio/application/application.py +758 -0
- mlrun/runtimes/nuclio/application/reverse_proxy.go +95 -0
- mlrun/runtimes/{function.py → nuclio/function.py} +200 -83
- mlrun/runtimes/{nuclio.py → nuclio/nuclio.py} +6 -6
- mlrun/runtimes/{serving.py → nuclio/serving.py} +65 -68
- mlrun/runtimes/pod.py +281 -101
- mlrun/runtimes/remotesparkjob.py +12 -9
- mlrun/runtimes/sparkjob/spark3job.py +67 -51
- mlrun/runtimes/utils.py +41 -75
- mlrun/secrets.py +9 -5
- mlrun/serving/__init__.py +8 -1
- mlrun/serving/remote.py +2 -7
- mlrun/serving/routers.py +85 -69
- mlrun/serving/server.py +69 -44
- mlrun/serving/states.py +209 -36
- mlrun/serving/utils.py +22 -14
- mlrun/serving/v1_serving.py +6 -7
- mlrun/serving/v2_serving.py +133 -54
- mlrun/track/tracker.py +2 -1
- mlrun/track/tracker_manager.py +3 -3
- mlrun/track/trackers/mlflow_tracker.py +6 -2
- mlrun/utils/async_http.py +6 -8
- mlrun/utils/azure_vault.py +1 -1
- mlrun/utils/clones.py +1 -2
- mlrun/utils/condition_evaluator.py +3 -3
- mlrun/utils/db.py +21 -3
- mlrun/utils/helpers.py +405 -225
- mlrun/utils/http.py +3 -6
- mlrun/utils/logger.py +112 -16
- mlrun/utils/notifications/notification/__init__.py +17 -13
- mlrun/utils/notifications/notification/base.py +50 -2
- mlrun/utils/notifications/notification/console.py +2 -0
- mlrun/utils/notifications/notification/git.py +24 -1
- mlrun/utils/notifications/notification/ipython.py +3 -1
- mlrun/utils/notifications/notification/slack.py +96 -21
- mlrun/utils/notifications/notification/webhook.py +59 -2
- mlrun/utils/notifications/notification_pusher.py +149 -30
- mlrun/utils/regex.py +9 -0
- mlrun/utils/retryer.py +208 -0
- mlrun/utils/singleton.py +1 -1
- mlrun/utils/v3io_clients.py +4 -6
- mlrun/utils/version/version.json +2 -2
- mlrun/utils/version/version.py +2 -6
- mlrun-1.7.0.dist-info/METADATA +378 -0
- mlrun-1.7.0.dist-info/RECORD +351 -0
- {mlrun-1.6.4rc8.dist-info → mlrun-1.7.0.dist-info}/WHEEL +1 -1
- mlrun/feature_store/retrieval/conversion.py +0 -273
- mlrun/kfpops.py +0 -868
- mlrun/model_monitoring/application.py +0 -310
- mlrun/model_monitoring/batch.py +0 -1095
- mlrun/model_monitoring/prometheus.py +0 -219
- mlrun/model_monitoring/stores/__init__.py +0 -111
- mlrun/model_monitoring/stores/kv_model_endpoint_store.py +0 -576
- mlrun/model_monitoring/stores/model_endpoint_store.py +0 -147
- mlrun/model_monitoring/stores/models/__init__.py +0 -27
- mlrun/model_monitoring/stores/models/base.py +0 -84
- mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -384
- mlrun/platforms/other.py +0 -306
- mlrun-1.6.4rc8.dist-info/METADATA +0 -272
- mlrun-1.6.4rc8.dist-info/RECORD +0 -314
- {mlrun-1.6.4rc8.dist-info → mlrun-1.7.0.dist-info}/LICENSE +0 -0
- {mlrun-1.6.4rc8.dist-info → mlrun-1.7.0.dist-info}/entry_points.txt +0 -0
- {mlrun-1.6.4rc8.dist-info → mlrun-1.7.0.dist-info}/top_level.txt +0 -0
mlrun/feature_store/api.py
CHANGED
|
@@ -15,9 +15,8 @@ import copy
|
|
|
15
15
|
import importlib.util
|
|
16
16
|
import pathlib
|
|
17
17
|
import sys
|
|
18
|
-
import typing
|
|
19
18
|
from datetime import datetime
|
|
20
|
-
from typing import Any,
|
|
19
|
+
from typing import Any, Optional, Union
|
|
21
20
|
|
|
22
21
|
import pandas as pd
|
|
23
22
|
from deprecated import deprecated
|
|
@@ -103,7 +102,7 @@ def get_offline_features(
|
|
|
103
102
|
entity_timestamp_column: str = None,
|
|
104
103
|
target: DataTargetBase = None,
|
|
105
104
|
run_config: RunConfig = None,
|
|
106
|
-
drop_columns:
|
|
105
|
+
drop_columns: list[str] = None,
|
|
107
106
|
start_time: Union[str, datetime] = None,
|
|
108
107
|
end_time: Union[str, datetime] = None,
|
|
109
108
|
with_indexes: bool = False,
|
|
@@ -111,9 +110,10 @@ def get_offline_features(
|
|
|
111
110
|
engine: str = None,
|
|
112
111
|
engine_args: dict = None,
|
|
113
112
|
query: str = None,
|
|
114
|
-
order_by: Union[str,
|
|
113
|
+
order_by: Union[str, list[str]] = None,
|
|
115
114
|
spark_service: str = None,
|
|
116
|
-
timestamp_for_filtering: Union[str,
|
|
115
|
+
timestamp_for_filtering: Union[str, dict[str, str]] = None,
|
|
116
|
+
additional_filters: list = None,
|
|
117
117
|
):
|
|
118
118
|
"""retrieve offline feature vector results
|
|
119
119
|
|
|
@@ -137,7 +137,10 @@ def get_offline_features(
|
|
|
137
137
|
]
|
|
138
138
|
vector = FeatureVector(features=features)
|
|
139
139
|
resp = get_offline_features(
|
|
140
|
-
vector,
|
|
140
|
+
vector,
|
|
141
|
+
entity_rows=trades,
|
|
142
|
+
entity_timestamp_column="time",
|
|
143
|
+
query="ticker in ['GOOG'] and bid>100",
|
|
141
144
|
)
|
|
142
145
|
print(resp.to_dataframe())
|
|
143
146
|
print(vector.get_stats_table())
|
|
@@ -173,6 +176,13 @@ def get_offline_features(
|
|
|
173
176
|
By default, the filter executes on the timestamp_key of each feature set.
|
|
174
177
|
Note: the time filtering is performed on each feature set before the
|
|
175
178
|
merge process using start_time and end_time params.
|
|
179
|
+
:param additional_filters: List of additional_filter conditions as tuples.
|
|
180
|
+
Each tuple should be in the format (column_name, operator, value).
|
|
181
|
+
Supported operators: "=", ">=", "<=", ">", "<".
|
|
182
|
+
Example: [("Product", "=", "Computer")]
|
|
183
|
+
For all supported filters, please see:
|
|
184
|
+
https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetDataset.html
|
|
185
|
+
|
|
176
186
|
|
|
177
187
|
"""
|
|
178
188
|
return _get_offline_features(
|
|
@@ -192,6 +202,7 @@ def get_offline_features(
|
|
|
192
202
|
order_by,
|
|
193
203
|
spark_service,
|
|
194
204
|
timestamp_for_filtering,
|
|
205
|
+
additional_filters,
|
|
195
206
|
)
|
|
196
207
|
|
|
197
208
|
|
|
@@ -201,7 +212,7 @@ def _get_offline_features(
|
|
|
201
212
|
entity_timestamp_column: str = None,
|
|
202
213
|
target: DataTargetBase = None,
|
|
203
214
|
run_config: RunConfig = None,
|
|
204
|
-
drop_columns:
|
|
215
|
+
drop_columns: list[str] = None,
|
|
205
216
|
start_time: Union[str, datetime] = None,
|
|
206
217
|
end_time: Union[str, datetime] = None,
|
|
207
218
|
with_indexes: bool = False,
|
|
@@ -209,15 +220,21 @@ def _get_offline_features(
|
|
|
209
220
|
engine: str = None,
|
|
210
221
|
engine_args: dict = None,
|
|
211
222
|
query: str = None,
|
|
212
|
-
order_by: Union[str,
|
|
223
|
+
order_by: Union[str, list[str]] = None,
|
|
213
224
|
spark_service: str = None,
|
|
214
|
-
timestamp_for_filtering: Union[str,
|
|
225
|
+
timestamp_for_filtering: Union[str, dict[str, str]] = None,
|
|
226
|
+
additional_filters=None,
|
|
215
227
|
) -> Union[OfflineVectorResponse, RemoteVectorResponse]:
|
|
216
228
|
if entity_rows is None and entity_timestamp_column is not None:
|
|
217
229
|
raise mlrun.errors.MLRunInvalidArgumentError(
|
|
218
230
|
"entity_timestamp_column param "
|
|
219
231
|
"can not be specified without entity_rows param"
|
|
220
232
|
)
|
|
233
|
+
if isinstance(target, BaseStoreTarget) and not target.support_pandas:
|
|
234
|
+
raise mlrun.errors.MLRunInvalidArgumentError(
|
|
235
|
+
f"get_offline_features does not support targets that do not support pandas engine."
|
|
236
|
+
f" Target kind: {target.kind}"
|
|
237
|
+
)
|
|
221
238
|
|
|
222
239
|
if isinstance(feature_vector, FeatureVector):
|
|
223
240
|
update_stats = True
|
|
@@ -250,6 +267,7 @@ def _get_offline_features(
|
|
|
250
267
|
start_time=start_time,
|
|
251
268
|
end_time=end_time,
|
|
252
269
|
timestamp_for_filtering=timestamp_for_filtering,
|
|
270
|
+
additional_filters=additional_filters,
|
|
253
271
|
)
|
|
254
272
|
|
|
255
273
|
merger = merger_engine(feature_vector, **(engine_args or {}))
|
|
@@ -265,6 +283,7 @@ def _get_offline_features(
|
|
|
265
283
|
update_stats=update_stats,
|
|
266
284
|
query=query,
|
|
267
285
|
order_by=order_by,
|
|
286
|
+
additional_filters=additional_filters,
|
|
268
287
|
)
|
|
269
288
|
|
|
270
289
|
|
|
@@ -280,7 +299,7 @@ def get_online_feature_service(
|
|
|
280
299
|
fixed_window_type: FixedWindowType = FixedWindowType.LastClosedWindow,
|
|
281
300
|
impute_policy: dict = None,
|
|
282
301
|
update_stats: bool = False,
|
|
283
|
-
entity_keys:
|
|
302
|
+
entity_keys: list[str] = None,
|
|
284
303
|
):
|
|
285
304
|
"""initialize and return online feature vector service api,
|
|
286
305
|
returns :py:class:`~mlrun.feature_store.OnlineVectorService`
|
|
@@ -308,7 +327,7 @@ def get_online_feature_service(
|
|
|
308
327
|
|
|
309
328
|
Example::
|
|
310
329
|
|
|
311
|
-
svc = get_online_feature_service(vector_uri, entity_keys=[
|
|
330
|
+
svc = get_online_feature_service(vector_uri, entity_keys=["ticker"])
|
|
312
331
|
try:
|
|
313
332
|
resp = svc.get([{"ticker": "GOOG"}, {"ticker": "MSFT"}])
|
|
314
333
|
print(resp)
|
|
@@ -361,7 +380,7 @@ def _get_online_feature_service(
|
|
|
361
380
|
fixed_window_type: FixedWindowType = FixedWindowType.LastClosedWindow,
|
|
362
381
|
impute_policy: dict = None,
|
|
363
382
|
update_stats: bool = False,
|
|
364
|
-
entity_keys:
|
|
383
|
+
entity_keys: list[str] = None,
|
|
365
384
|
) -> OnlineVectorService:
|
|
366
385
|
if isinstance(feature_vector, FeatureVector):
|
|
367
386
|
update_stats = True
|
|
@@ -413,7 +432,7 @@ def _rename_source_dataframe_columns(df: pd.DataFrame) -> pd.DataFrame:
|
|
|
413
432
|
return df
|
|
414
433
|
|
|
415
434
|
|
|
416
|
-
def _get_namespace(run_config: RunConfig) ->
|
|
435
|
+
def _get_namespace(run_config: RunConfig) -> dict[str, Any]:
|
|
417
436
|
# if running locally, we need to import the file dynamically to get its namespace
|
|
418
437
|
if run_config and run_config.local and run_config.function:
|
|
419
438
|
filename = run_config.function.spec.filename
|
|
@@ -431,7 +450,7 @@ def _get_namespace(run_config: RunConfig) -> Dict[str, Any]:
|
|
|
431
450
|
def ingest(
|
|
432
451
|
featureset: Union[FeatureSet, str] = None,
|
|
433
452
|
source=None,
|
|
434
|
-
targets:
|
|
453
|
+
targets: list[DataTargetBase] = None,
|
|
435
454
|
namespace=None,
|
|
436
455
|
return_df: bool = True,
|
|
437
456
|
infer_options: InferOptions = InferOptions.default(),
|
|
@@ -457,7 +476,7 @@ def ingest(
|
|
|
457
476
|
df = ingest(stocks_set, stocks, infer_options=fstore.InferOptions.default())
|
|
458
477
|
|
|
459
478
|
# for running as remote job
|
|
460
|
-
config = RunConfig(image=
|
|
479
|
+
config = RunConfig(image="mlrun/mlrun")
|
|
461
480
|
df = ingest(stocks_set, stocks, run_config=config)
|
|
462
481
|
|
|
463
482
|
# specify source and targets
|
|
@@ -511,7 +530,7 @@ def ingest(
|
|
|
511
530
|
def _ingest(
|
|
512
531
|
featureset: Union[FeatureSet, str] = None,
|
|
513
532
|
source=None,
|
|
514
|
-
targets:
|
|
533
|
+
targets: list[DataTargetBase] = None,
|
|
515
534
|
namespace=None,
|
|
516
535
|
return_df: bool = True,
|
|
517
536
|
infer_options: InferOptions = InferOptions.default(),
|
|
@@ -876,7 +895,7 @@ def _preview(
|
|
|
876
895
|
def _run_ingestion_job(
|
|
877
896
|
featureset: Union[FeatureSet, str],
|
|
878
897
|
source: DataSource = None,
|
|
879
|
-
targets:
|
|
898
|
+
targets: list[DataTargetBase] = None,
|
|
880
899
|
name: str = None,
|
|
881
900
|
infer_options: InferOptions = InferOptions.default(),
|
|
882
901
|
run_config: RunConfig = None,
|
|
@@ -901,11 +920,11 @@ def _run_ingestion_job(
|
|
|
901
920
|
def deploy_ingestion_service_v2(
|
|
902
921
|
featureset: Union[FeatureSet, str],
|
|
903
922
|
source: DataSource = None,
|
|
904
|
-
targets:
|
|
923
|
+
targets: list[DataTargetBase] = None,
|
|
905
924
|
name: str = None,
|
|
906
925
|
run_config: RunConfig = None,
|
|
907
926
|
verbose=False,
|
|
908
|
-
) ->
|
|
927
|
+
) -> tuple[str, BaseRuntime]:
|
|
909
928
|
"""Start real-time ingestion service using nuclio function
|
|
910
929
|
|
|
911
930
|
Deploy a real-time function implementing feature ingestion pipeline
|
|
@@ -944,11 +963,11 @@ def deploy_ingestion_service_v2(
|
|
|
944
963
|
def _deploy_ingestion_service_v2(
|
|
945
964
|
featureset: Union[FeatureSet, str],
|
|
946
965
|
source: DataSource = None,
|
|
947
|
-
targets:
|
|
966
|
+
targets: list[DataTargetBase] = None,
|
|
948
967
|
name: str = None,
|
|
949
968
|
run_config: RunConfig = None,
|
|
950
969
|
verbose=False,
|
|
951
|
-
) ->
|
|
970
|
+
) -> tuple[str, BaseRuntime]:
|
|
952
971
|
if isinstance(featureset, str):
|
|
953
972
|
featureset = get_feature_set_by_uri(featureset)
|
|
954
973
|
|
|
@@ -1003,58 +1022,11 @@ def _deploy_ingestion_service_v2(
|
|
|
1003
1022
|
return function.deploy(), function
|
|
1004
1023
|
|
|
1005
1024
|
|
|
1006
|
-
@deprecated(
|
|
1007
|
-
version="1.5.0",
|
|
1008
|
-
reason="'deploy_ingestion_service' will be removed in 1.7.0, use 'deploy_ingestion_service_v2' instead",
|
|
1009
|
-
category=FutureWarning,
|
|
1010
|
-
)
|
|
1011
|
-
def deploy_ingestion_service(
|
|
1012
|
-
featureset: Union[FeatureSet, str],
|
|
1013
|
-
source: DataSource = None,
|
|
1014
|
-
targets: List[DataTargetBase] = None,
|
|
1015
|
-
name: str = None,
|
|
1016
|
-
run_config: RunConfig = None,
|
|
1017
|
-
verbose=False,
|
|
1018
|
-
) -> str:
|
|
1019
|
-
"""Start real-time ingestion service using nuclio function
|
|
1020
|
-
|
|
1021
|
-
Deploy a real-time function implementing feature ingestion pipeline
|
|
1022
|
-
the source maps to Nuclio event triggers (http, kafka, v3io stream, etc.)
|
|
1023
|
-
|
|
1024
|
-
the `run_config` parameter allow specifying the function and job configuration,
|
|
1025
|
-
see: :py:class:`~mlrun.feature_store.RunConfig`
|
|
1026
|
-
|
|
1027
|
-
example::
|
|
1028
|
-
|
|
1029
|
-
source = HTTPSource()
|
|
1030
|
-
func = mlrun.code_to_function("ingest", kind="serving").apply(mount_v3io())
|
|
1031
|
-
config = RunConfig(function=func)
|
|
1032
|
-
my_set.deploy_ingestion_service(source, run_config=config)
|
|
1033
|
-
|
|
1034
|
-
:param featureset: feature set object or uri
|
|
1035
|
-
:param source: data source object describing the online or offline source
|
|
1036
|
-
:param targets: list of data target objects
|
|
1037
|
-
:param name: name for the job/function
|
|
1038
|
-
:param run_config: service runtime configuration (function object/uri, resources, etc..)
|
|
1039
|
-
:param verbose: verbose log
|
|
1040
|
-
|
|
1041
|
-
:return: URL to access the deployed ingestion service
|
|
1042
|
-
"""
|
|
1043
|
-
endpoint, _ = featureset.deploy_ingestion_service(
|
|
1044
|
-
source=source,
|
|
1045
|
-
targets=targets,
|
|
1046
|
-
name=name,
|
|
1047
|
-
run_config=run_config,
|
|
1048
|
-
verbose=verbose,
|
|
1049
|
-
)
|
|
1050
|
-
return endpoint
|
|
1051
|
-
|
|
1052
|
-
|
|
1053
1025
|
def _ingest_with_spark(
|
|
1054
1026
|
spark=None,
|
|
1055
1027
|
featureset: Union[FeatureSet, str] = None,
|
|
1056
1028
|
source: BaseSourceDriver = None,
|
|
1057
|
-
targets:
|
|
1029
|
+
targets: list[BaseStoreTarget] = None,
|
|
1058
1030
|
infer_options: InferOptions = InferOptions.default(),
|
|
1059
1031
|
mlrun_context=None,
|
|
1060
1032
|
namespace=None,
|
|
@@ -1065,6 +1037,8 @@ def _ingest_with_spark(
|
|
|
1065
1037
|
try:
|
|
1066
1038
|
import pyspark.sql
|
|
1067
1039
|
|
|
1040
|
+
from mlrun.datastore.spark_utils import check_special_columns_exists
|
|
1041
|
+
|
|
1068
1042
|
if spark is None or spark is True:
|
|
1069
1043
|
# create spark context
|
|
1070
1044
|
|
|
@@ -1077,13 +1051,13 @@ def _ingest_with_spark(
|
|
|
1077
1051
|
|
|
1078
1052
|
spark = (
|
|
1079
1053
|
pyspark.sql.SparkSession.builder.appName(session_name)
|
|
1054
|
+
.config("spark.driver.memory", "2g")
|
|
1080
1055
|
.config("spark.sql.session.timeZone", "UTC")
|
|
1081
1056
|
.getOrCreate()
|
|
1082
1057
|
)
|
|
1083
1058
|
created_spark_context = True
|
|
1084
1059
|
|
|
1085
1060
|
timestamp_key = featureset.spec.timestamp_key
|
|
1086
|
-
|
|
1087
1061
|
if isinstance(source, pd.DataFrame):
|
|
1088
1062
|
df = spark.createDataFrame(source)
|
|
1089
1063
|
elif isinstance(source, pyspark.sql.DataFrame):
|
|
@@ -1113,6 +1087,12 @@ def _ingest_with_spark(
|
|
|
1113
1087
|
target = get_target_driver(target, featureset)
|
|
1114
1088
|
target.set_resource(featureset)
|
|
1115
1089
|
if featureset.spec.passthrough and target.is_offline:
|
|
1090
|
+
check_special_columns_exists(
|
|
1091
|
+
spark_df=df,
|
|
1092
|
+
entities=featureset.spec.entities,
|
|
1093
|
+
timestamp_key=timestamp_key,
|
|
1094
|
+
label_column=featureset.spec.label_column,
|
|
1095
|
+
)
|
|
1116
1096
|
continue
|
|
1117
1097
|
spark_options = target.get_spark_options(
|
|
1118
1098
|
key_columns, timestamp_key, overwrite
|
|
@@ -1122,9 +1102,21 @@ def _ingest_with_spark(
|
|
|
1122
1102
|
df_to_write = target.prepare_spark_df(
|
|
1123
1103
|
df_to_write, key_columns, timestamp_key, spark_options
|
|
1124
1104
|
)
|
|
1105
|
+
write_format = spark_options.pop("format", None)
|
|
1106
|
+
# We can get to this point if the column exists in different letter cases,
|
|
1107
|
+
# so PySpark will be able to read it, but we still have to raise an exception for it.
|
|
1108
|
+
|
|
1109
|
+
# This check is here and not in to_spark_df because in spark_merger we can have a target
|
|
1110
|
+
# that has different letter cases than the source, like in SnowflakeTarget.
|
|
1111
|
+
check_special_columns_exists(
|
|
1112
|
+
spark_df=df_to_write,
|
|
1113
|
+
entities=featureset.spec.entities,
|
|
1114
|
+
timestamp_key=timestamp_key,
|
|
1115
|
+
label_column=featureset.spec.label_column,
|
|
1116
|
+
)
|
|
1125
1117
|
if overwrite:
|
|
1126
1118
|
write_spark_dataframe_with_options(
|
|
1127
|
-
spark_options, df_to_write, "overwrite"
|
|
1119
|
+
spark_options, df_to_write, "overwrite", write_format=write_format
|
|
1128
1120
|
)
|
|
1129
1121
|
else:
|
|
1130
1122
|
# appending an empty dataframe may cause an empty file to be created (e.g. when writing to parquet)
|
|
@@ -1132,7 +1124,7 @@ def _ingest_with_spark(
|
|
|
1132
1124
|
df_to_write.persist()
|
|
1133
1125
|
if df_to_write.count() > 0:
|
|
1134
1126
|
write_spark_dataframe_with_options(
|
|
1135
|
-
spark_options, df_to_write, "append"
|
|
1127
|
+
spark_options, df_to_write, "append", write_format=write_format
|
|
1136
1128
|
)
|
|
1137
1129
|
target.update_resource_status("ready")
|
|
1138
1130
|
|
|
@@ -1207,7 +1199,7 @@ def _infer_from_static_df(
|
|
|
1207
1199
|
def set_task_params(
|
|
1208
1200
|
featureset: FeatureSet,
|
|
1209
1201
|
source: DataSource = None,
|
|
1210
|
-
targets:
|
|
1202
|
+
targets: list[DataTargetBase] = None,
|
|
1211
1203
|
parameters: dict = None,
|
|
1212
1204
|
infer_options: InferOptions = InferOptions.Null,
|
|
1213
1205
|
overwrite=None,
|
mlrun/feature_store/common.py
CHANGED
|
@@ -37,17 +37,12 @@ def parse_feature_string(feature):
|
|
|
37
37
|
raise mlrun.errors.MLRunInvalidArgumentError(
|
|
38
38
|
f"feature {feature} must be {expected_message}"
|
|
39
39
|
)
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
feature_set
|
|
46
|
-
feature_name = splitted[1]
|
|
47
|
-
splitted = feature_name.split(" as ")
|
|
48
|
-
if len(splitted) > 1:
|
|
49
|
-
return feature_set.strip(), splitted[0].strip(), splitted[1].strip()
|
|
50
|
-
return feature_set.strip(), feature_name.strip(), None
|
|
40
|
+
feature_set, feature_name = feature.rsplit(feature_separator, 1)
|
|
41
|
+
feature_set = feature_set.strip()
|
|
42
|
+
split_result = feature_name.split(" as ", 1)
|
|
43
|
+
feature_name = split_result[0].strip()
|
|
44
|
+
alias = split_result[1].strip() if len(split_result) > 1 else None
|
|
45
|
+
return feature_set, feature_name, alias
|
|
51
46
|
|
|
52
47
|
|
|
53
48
|
def parse_project_name_from_feature_string(feature):
|
|
@@ -192,7 +187,7 @@ class RunConfig:
|
|
|
192
187
|
owner=None,
|
|
193
188
|
credentials: typing.Optional[mlrun.model.Credentials] = None,
|
|
194
189
|
code: str = None,
|
|
195
|
-
requirements: typing.Union[str,
|
|
190
|
+
requirements: typing.Union[str, list[str]] = None,
|
|
196
191
|
extra_spec: dict = None,
|
|
197
192
|
auth_info=None,
|
|
198
193
|
):
|