mlrun 1.6.0rc26__py3-none-any.whl → 1.6.3rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- mlrun/artifacts/manager.py +6 -0
- mlrun/artifacts/model.py +28 -22
- mlrun/common/db/sql_session.py +3 -0
- mlrun/common/model_monitoring/helpers.py +4 -2
- mlrun/common/schemas/__init__.py +2 -0
- mlrun/common/schemas/common.py +40 -0
- mlrun/common/schemas/model_monitoring/__init__.py +1 -0
- mlrun/common/schemas/model_monitoring/constants.py +21 -5
- mlrun/common/schemas/project.py +2 -0
- mlrun/config.py +43 -17
- mlrun/data_types/data_types.py +4 -0
- mlrun/datastore/azure_blob.py +9 -9
- mlrun/datastore/base.py +22 -44
- mlrun/datastore/datastore.py +7 -3
- mlrun/datastore/datastore_profile.py +15 -3
- mlrun/datastore/google_cloud_storage.py +7 -7
- mlrun/datastore/sources.py +17 -4
- mlrun/datastore/targets.py +3 -1
- mlrun/datastore/utils.py +11 -1
- mlrun/datastore/v3io.py +70 -46
- mlrun/db/base.py +18 -0
- mlrun/db/httpdb.py +41 -36
- mlrun/execution.py +3 -3
- mlrun/feature_store/api.py +133 -132
- mlrun/feature_store/feature_set.py +89 -0
- mlrun/feature_store/feature_vector.py +120 -0
- mlrun/frameworks/tf_keras/callbacks/logging_callback.py +3 -3
- mlrun/frameworks/tf_keras/model_handler.py +7 -7
- mlrun/k8s_utils.py +56 -0
- mlrun/kfpops.py +19 -10
- mlrun/model.py +6 -0
- mlrun/model_monitoring/api.py +8 -8
- mlrun/model_monitoring/batch.py +1 -1
- mlrun/model_monitoring/controller.py +0 -7
- mlrun/model_monitoring/stores/kv_model_endpoint_store.py +13 -13
- mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -1
- mlrun/model_monitoring/stream_processing.py +52 -38
- mlrun/package/packagers/pandas_packagers.py +3 -3
- mlrun/package/utils/_archiver.py +3 -1
- mlrun/platforms/iguazio.py +6 -65
- mlrun/projects/pipelines.py +29 -12
- mlrun/projects/project.py +100 -61
- mlrun/run.py +2 -0
- mlrun/runtimes/base.py +24 -1
- mlrun/runtimes/function.py +14 -15
- mlrun/runtimes/kubejob.py +5 -3
- mlrun/runtimes/local.py +2 -2
- mlrun/runtimes/mpijob/abstract.py +6 -6
- mlrun/runtimes/pod.py +3 -3
- mlrun/runtimes/serving.py +7 -14
- mlrun/runtimes/sparkjob/spark3job.py +3 -3
- mlrun/serving/remote.py +4 -2
- mlrun/serving/routers.py +14 -8
- mlrun/utils/async_http.py +3 -3
- mlrun/utils/helpers.py +59 -3
- mlrun/utils/http.py +3 -3
- mlrun/utils/logger.py +2 -2
- mlrun/utils/notifications/notification_pusher.py +6 -6
- mlrun/utils/regex.py +5 -1
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.6.0rc26.dist-info → mlrun-1.6.3rc1.dist-info}/METADATA +21 -23
- {mlrun-1.6.0rc26.dist-info → mlrun-1.6.3rc1.dist-info}/RECORD +66 -65
- {mlrun-1.6.0rc26.dist-info → mlrun-1.6.3rc1.dist-info}/WHEEL +1 -1
- {mlrun-1.6.0rc26.dist-info → mlrun-1.6.3rc1.dist-info}/LICENSE +0 -0
- {mlrun-1.6.0rc26.dist-info → mlrun-1.6.3rc1.dist-info}/entry_points.txt +0 -0
- {mlrun-1.6.0rc26.dist-info → mlrun-1.6.3rc1.dist-info}/top_level.txt +0 -0
mlrun/feature_store/api.py
CHANGED

@@ -93,7 +93,8 @@ def _features_to_vector_and_check_permissions(features, update_stats):
 
 @deprecated(
     version="1.6.0",
-    reason="
+    reason="get_offline_features() will be removed in 1.8.0, please instead use "
+    "get_feature_vector('store://feature_vector_name').get_offline_features()",
     category=FutureWarning,
 )
 def get_offline_features(
@@ -114,44 +115,6 @@ def get_offline_features(
     spark_service: str = None,
     timestamp_for_filtering: Union[str, Dict[str, str]] = None,
 ):
-    return _get_offline_features(
-        feature_vector,
-        entity_rows,
-        entity_timestamp_column,
-        target,
-        run_config,
-        drop_columns,
-        start_time,
-        end_time,
-        with_indexes,
-        update_stats,
-        engine,
-        engine_args,
-        query,
-        order_by,
-        spark_service,
-        timestamp_for_filtering,
-    )
-
-
-def _get_offline_features(
-    feature_vector: Union[str, FeatureVector],
-    entity_rows=None,
-    entity_timestamp_column: str = None,
-    target: DataTargetBase = None,
-    run_config: RunConfig = None,
-    drop_columns: List[str] = None,
-    start_time: Union[str, datetime] = None,
-    end_time: Union[str, datetime] = None,
-    with_indexes: bool = False,
-    update_stats: bool = False,
-    engine: str = None,
-    engine_args: dict = None,
-    query: str = None,
-    order_by: Union[str, List[str]] = None,
-    spark_service: str = None,
-    timestamp_for_filtering: Union[str, Dict[str, str]] = None,
-) -> Union[OfflineVectorResponse, RemoteVectorResponse]:
     """retrieve offline feature vector results
 
     specify a feature vector object/uri and retrieve the desired features, their metadata
@@ -212,6 +175,44 @@ def _get_offline_features(
                                     merge process using start_time and end_time params.
 
     """
+    return _get_offline_features(
+        feature_vector,
+        entity_rows,
+        entity_timestamp_column,
+        target,
+        run_config,
+        drop_columns,
+        start_time,
+        end_time,
+        with_indexes,
+        update_stats,
+        engine,
+        engine_args,
+        query,
+        order_by,
+        spark_service,
+        timestamp_for_filtering,
+    )
+
+
+def _get_offline_features(
+    feature_vector: Union[str, FeatureVector],
+    entity_rows=None,
+    entity_timestamp_column: str = None,
+    target: DataTargetBase = None,
+    run_config: RunConfig = None,
+    drop_columns: List[str] = None,
+    start_time: Union[str, datetime] = None,
+    end_time: Union[str, datetime] = None,
+    with_indexes: bool = False,
+    update_stats: bool = False,
+    engine: str = None,
+    engine_args: dict = None,
+    query: str = None,
+    order_by: Union[str, List[str]] = None,
+    spark_service: str = None,
+    timestamp_for_filtering: Union[str, Dict[str, str]] = None,
+) -> Union[OfflineVectorResponse, RemoteVectorResponse]:
     if entity_rows is None and entity_timestamp_column is not None:
         raise mlrun.errors.MLRunInvalidArgumentError(
             "entity_timestamp_column param "
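The new deprecation message spells out the migration path. A minimal before/after sketch (the store URI is illustrative; `get_feature_vector` and `to_dataframe` are existing mlrun APIs)::

    import mlrun.feature_store as fstore

    # deprecated since 1.6.0, slated for removal in 1.8.0 (emits a FutureWarning):
    resp = fstore.get_offline_features("store://feature-vectors/default/my-vector")

    # replacement: resolve the vector object, then call the method on it
    vector = fstore.get_feature_vector("store://feature-vectors/default/my-vector")
    resp = vector.get_offline_features()
    df = resp.to_dataframe()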
@@ -269,8 +270,8 @@ def _get_offline_features(
 
 @deprecated(
     version="1.6.0",
-    reason="
-    "
+    reason="get_online_feature_service() will be removed in 1.8.0, please instead use "
+    "get_feature_vector('store://feature_vector_name').get_online_feature_service()",
     category=FutureWarning,
 )
 def get_online_feature_service(
@@ -281,24 +282,6 @@ def get_online_feature_service(
     update_stats: bool = False,
     entity_keys: List[str] = None,
 ):
-    return _get_online_feature_service(
-        feature_vector,
-        run_config,
-        fixed_window_type,
-        impute_policy,
-        update_stats,
-        entity_keys,
-    )
-
-
-def _get_online_feature_service(
-    feature_vector: Union[str, FeatureVector],
-    run_config: RunConfig = None,
-    fixed_window_type: FixedWindowType = FixedWindowType.LastClosedWindow,
-    impute_policy: dict = None,
-    update_stats: bool = False,
-    entity_keys: List[str] = None,
-) -> OnlineVectorService:
     """initialize and return online feature vector service api,
     returns :py:class:`~mlrun.feature_store.OnlineVectorService`
 
@@ -362,6 +345,24 @@ def _get_online_feature_service(
     :return: Initialize the `OnlineVectorService`.
              Will be used in subclasses where `support_online=True`.
     """
+    return _get_online_feature_service(
+        feature_vector,
+        run_config,
+        fixed_window_type,
+        impute_policy,
+        update_stats,
+        entity_keys,
+    )
+
+
+def _get_online_feature_service(
+    feature_vector: Union[str, FeatureVector],
+    run_config: RunConfig = None,
+    fixed_window_type: FixedWindowType = FixedWindowType.LastClosedWindow,
+    impute_policy: dict = None,
+    update_stats: bool = False,
+    entity_keys: List[str] = None,
+) -> OnlineVectorService:
     if isinstance(feature_vector, FeatureVector):
         update_stats = True
         feature_vector = _features_to_vector_and_check_permissions(
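`get_online_feature_service()` follows the same pattern; the method-based replacement, shown with the context-manager usage from the docstrings further down (entity values illustrative)::

    import mlrun.feature_store as fstore

    vector = fstore.get_feature_vector("store://feature-vectors/default/stocks-vec")
    # the context manager closes the online session on exit
    with vector.get_online_feature_service(entity_keys=["ticker"]) as svc:
        resp = svc.get([{"ticker": "GOOG"}, {"ticker": "MSFT"}])
        print(resp)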
@@ -438,40 +439,6 @@ def ingest(
     mlrun_context=None,
     spark_context=None,
     overwrite=None,
-) -> Optional[pd.DataFrame]:
-    if mlrun_context is None:
-        deprecated(
-            version="1.6.0",
-            reason="Calling 'ingest' with mlrun_context=None is deprecated and will be removed in 1.8.0,\
-            use 'FeatureSet.ingest()' instead",
-            category=FutureWarning,
-        )
-
-    return _ingest(
-        featureset,
-        source,
-        targets,
-        namespace,
-        return_df,
-        infer_options,
-        run_config,
-        mlrun_context,
-        spark_context,
-        overwrite,
-    )
-
-
-def _ingest(
-    featureset: Union[FeatureSet, str] = None,
-    source=None,
-    targets: List[DataTargetBase] = None,
-    namespace=None,
-    return_df: bool = True,
-    infer_options: InferOptions = InferOptions.default(),
-    run_config: RunConfig = None,
-    mlrun_context=None,
-    spark_context=None,
-    overwrite=None,
 ) -> Optional[pd.DataFrame]:
     """Read local DataFrame, file, URL, or source into the feature store
     Ingest reads from the source, run the graph transformations, infers metadata and stats
@@ -519,6 +486,40 @@ def _ingest(
                       False for scheduled ingest - does not delete the target)
     :return: if return_df is True, a dataframe will be returned based on the graph
     """
+    if mlrun_context is None:
+        deprecated(
+            version="1.6.0",
+            reason="Calling 'ingest' with mlrun_context=None is deprecated and will be removed in 1.8.0,\
+            use 'FeatureSet.ingest()' instead",
+            category=FutureWarning,
+        )
+
+    return _ingest(
+        featureset,
+        source,
+        targets,
+        namespace,
+        return_df,
+        infer_options,
+        run_config,
+        mlrun_context,
+        spark_context,
+        overwrite,
+    )
+
+
+def _ingest(
+    featureset: Union[FeatureSet, str] = None,
+    source=None,
+    targets: List[DataTargetBase] = None,
+    namespace=None,
+    return_df: bool = True,
+    infer_options: InferOptions = InferOptions.default(),
+    run_config: RunConfig = None,
+    mlrun_context=None,
+    spark_context=None,
+    overwrite=None,
+) -> Optional[pd.DataFrame]:
     if isinstance(source, pd.DataFrame):
         source = _rename_source_dataframe_columns(source)
 
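Note that for `ingest()` the deprecation fires only for direct calls (`mlrun_context=None`); runs launched as jobs pass a context and are unaffected. The replacement is the `FeatureSet.ingest()` method, as in the docstring example added later in this diff::

    import pandas as pd
    import mlrun.feature_store as fstore
    from mlrun.feature_store import Entity, FeatureSet

    stocks_set = FeatureSet("stocks", entities=[Entity("ticker")])
    stocks = pd.read_csv("stocks.csv")

    # deprecated: df = fstore.ingest(stocks_set, stocks)
    df = stocks_set.ingest(stocks, infer_options=fstore.InferOptions.default())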
@@ -768,26 +769,6 @@ def preview(
     options: InferOptions = None,
     verbose: bool = False,
     sample_size: int = None,
-) -> pd.DataFrame:
-    return _preview(
-        featureset,
-        source,
-        entity_columns,
-        namespace,
-        options,
-        verbose,
-        sample_size,
-    )
-
-
-def _preview(
-    featureset: FeatureSet,
-    source,
-    entity_columns: list = None,
-    namespace=None,
-    options: InferOptions = None,
-    verbose: bool = False,
-    sample_size: int = None,
 ) -> pd.DataFrame:
     """run the ingestion pipeline with local DataFrame/file data and infer features schema and stats
 
@@ -811,6 +792,26 @@ def _preview(
     :param verbose: verbose log
     :param sample_size: num of rows to sample from the dataset (for large datasets)
     """
+    return _preview(
+        featureset,
+        source,
+        entity_columns,
+        namespace,
+        options,
+        verbose,
+        sample_size,
+    )
+
+
+def _preview(
+    featureset: FeatureSet,
+    source,
+    entity_columns: list = None,
+    namespace=None,
+    options: InferOptions = None,
+    verbose: bool = False,
+    sample_size: int = None,
+) -> pd.DataFrame:
     if isinstance(source, pd.DataFrame):
         source = _rename_source_dataframe_columns(source)
 
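`preview()` gets the same wrapper/worker split; its method-based form, taken from the `FeatureSet.preview` docstring added later in this diff (`quotes_df` is an illustrative DataFrame)::

    quotes_set = FeatureSet("stock-quotes", entities=[Entity("ticker")])
    quotes_set.add_aggregation("ask", ["sum", "max"], ["1h", "5h"], "10m")
    df = quotes_set.preview(quotes_df, entity_columns=["ticker"])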
@@ -904,24 +905,6 @@ def deploy_ingestion_service_v2(
     name: str = None,
     run_config: RunConfig = None,
     verbose=False,
-) -> typing.Tuple[str, BaseRuntime]:
-    return _deploy_ingestion_service_v2(
-        featureset,
-        source,
-        targets,
-        name,
-        run_config,
-        verbose,
-    )
-
-
-def _deploy_ingestion_service_v2(
-    featureset: Union[FeatureSet, str],
-    source: DataSource = None,
-    targets: List[DataTargetBase] = None,
-    name: str = None,
-    run_config: RunConfig = None,
-    verbose=False,
 ) -> typing.Tuple[str, BaseRuntime]:
     """Start real-time ingestion service using nuclio function
 
@@ -936,7 +919,7 @@ def _deploy_ingestion_service_v2(
         source = HTTPSource()
         func = mlrun.code_to_function("ingest", kind="serving").apply(mount_v3io())
         config = RunConfig(function=func)
-        my_set
+        deploy_ingestion_service_v2(my_set, source, run_config=config)
 
     :param featureset: feature set object or uri
     :param source: data source object describing the online or offline source
@@ -948,6 +931,24 @@ def _deploy_ingestion_service_v2(
     :return: URL to access the deployed ingestion service, and the function that was deployed (which will
              differ from the function passed in via the run_config parameter).
     """
+    return _deploy_ingestion_service_v2(
+        featureset,
+        source,
+        targets,
+        name,
+        run_config,
+        verbose,
+    )
+
+
+def _deploy_ingestion_service_v2(
+    featureset: Union[FeatureSet, str],
+    source: DataSource = None,
+    targets: List[DataTargetBase] = None,
+    name: str = None,
+    run_config: RunConfig = None,
+    verbose=False,
+) -> typing.Tuple[str, BaseRuntime]:
     if isinstance(featureset, str):
         featureset = get_feature_set_by_uri(featureset)
 
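All five api.py changes above apply one refactoring pattern: the docstring moves from the private worker onto the public wrapper, and the delegation call moves below it, so `help()` and the generated docs keep showing documentation on the function users actually call. Schematically (illustrative names, not mlrun API)::

    def public_api(arg=None):
        """user-facing docstring now lives on the wrapper"""
        return _public_api(arg)


    def _public_api(arg=None):
        # real implementation; internal callers use this directly and
        # never trigger the FutureWarning attached to the public wrapper
        ...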
mlrun/feature_store/feature_set.py
CHANGED

@@ -992,6 +992,50 @@ class FeatureSet(ModelObj):
         spark_context=None,
         overwrite=None,
     ) -> Optional[pd.DataFrame]:
+        """Read local DataFrame, file, URL, or source into the feature store
+        Ingest reads from the source, run the graph transformations, infers metadata and stats
+        and writes the results to the default of specified targets
+
+        when targets are not specified data is stored in the configured default targets
+        (will usually be NoSQL for real-time and Parquet for offline).
+
+        the `run_config` parameter allow specifying the function and job configuration,
+        see: :py:class:`~mlrun.feature_store.RunConfig`
+
+        example::
+
+            stocks_set = FeatureSet("stocks", entities=[Entity("ticker")])
+            stocks = pd.read_csv("stocks.csv")
+            df = stocks_set.ingest(stocks, infer_options=fstore.InferOptions.default())
+
+            # for running as remote job
+            config = RunConfig(image='mlrun/mlrun')
+            df = ingest(stocks_set, stocks, run_config=config)
+
+            # specify source and targets
+            source = CSVSource("mycsv", path="measurements.csv")
+            targets = [CSVTarget("mycsv", path="./mycsv.csv")]
+            ingest(measurements, source, targets)
+
+        :param source: source dataframe or other sources (e.g. parquet source see:
+                       :py:class:`~mlrun.datastore.ParquetSource` and other classes in mlrun.datastore with suffix
+                       Source)
+        :param targets: optional list of data target objects
+        :param namespace: namespace or module containing graph classes
+        :param return_df: indicate if to return a dataframe with the graph results
+        :param infer_options: schema (for discovery of entities, features in featureset), index, stats,
+                              histogram and preview infer options (:py:class:`~mlrun.feature_store.InferOptions`)
+        :param run_config: function and/or run configuration for remote jobs,
+                           see :py:class:`~mlrun.feature_store.RunConfig`
+        :param mlrun_context: mlrun context (when running as a job), for internal use !
+        :param spark_context: local spark session for spark ingestion, example for creating the spark context:
+                              `spark = SparkSession.builder.appName("Spark function").getOrCreate()`
+                              For remote spark ingestion, this should contain the remote spark service name
+        :param overwrite: delete the targets' data prior to ingestion
+                          (default: True for non scheduled ingest - deletes the targets that are about to be ingested.
+                          False for scheduled ingest - does not delete the target)
+        :return: if return_df is True, a dataframe will be returned based on the graph
+        """
         return mlrun.feature_store.api._ingest(
             self,
             source,
@@ -1014,6 +1058,26 @@ class FeatureSet(ModelObj):
         verbose: bool = False,
         sample_size: int = None,
     ) -> pd.DataFrame:
+        """run the ingestion pipeline with local DataFrame/file data and infer features schema and stats
+
+        example::
+
+            quotes_set = FeatureSet("stock-quotes", entities=[Entity("ticker")])
+            quotes_set.add_aggregation("ask", ["sum", "max"], ["1h", "5h"], "10m")
+            quotes_set.add_aggregation("bid", ["min", "max"], ["1h"], "10m")
+            df = quotes_set.preview(
+                quotes_df,
+                entity_columns=["ticker"],
+            )
+
+        :param source: source dataframe or csv/parquet file path
+        :param entity_columns: list of entity (index) column names
+        :param namespace: namespace or module containing graph classes
+        :param options: schema (for discovery of entities, features in featureset), index, stats,
+                        histogram and preview infer options (:py:class:`~mlrun.feature_store.InferOptions`)
+        :param verbose: verbose log
+        :param sample_size: num of rows to sample from the dataset (for large datasets)
+        """
         return mlrun.feature_store.api._preview(
             self, source, entity_columns, namespace, options, verbose, sample_size
         )
@@ -1026,6 +1090,31 @@ class FeatureSet(ModelObj):
         run_config: RunConfig = None,
         verbose=False,
     ) -> Tuple[str, BaseRuntime]:
+        """Start real-time ingestion service using nuclio function
+
+        Deploy a real-time function implementing feature ingestion pipeline
+        the source maps to Nuclio event triggers (http, kafka, v3io stream, etc.)
+
+        the `run_config` parameter allow specifying the function and job configuration,
+        see: :py:class:`~mlrun.feature_store.RunConfig`
+
+        example::
+
+            source = HTTPSource()
+            func = mlrun.code_to_function("ingest", kind="serving").apply(mount_v3io())
+            config = RunConfig(function=func)
+            my_set.deploy_ingestion_service(source, run_config=config)
+
+        :param source: data source object describing the online or offline source
+        :param targets: list of data target objects
+        :param name: name for the job/function
+        :param run_config: service runtime configuration (function object/uri, resources, etc..)
+        :param verbose: verbose log
+
+        :return: URL to access the deployed ingestion service, and the function that was deployed (which will
+                 differ from the function passed in via the run_config parameter).
+        """
+
         return mlrun.feature_store.api._deploy_ingestion_service_v2(
             self, source, targets, name, run_config, verbose
         )
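As documented above, the deploy method returns both the service URL and the deployed function object; a usage sketch with illustrative names::

    url, function = my_set.deploy_ingestion_service(source, run_config=config)
    print(f"ingestion endpoint: {url}")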
mlrun/feature_store/feature_vector.py
CHANGED

@@ -744,6 +744,64 @@ class FeatureVector(ModelObj):
         spark_service: str = None,
         timestamp_for_filtering: Union[str, Dict[str, str]] = None,
     ):
+        """retrieve offline feature vector results
+
+        specify a feature vector object/uri and retrieve the desired features, their metadata
+        and statistics. returns :py:class:`~mlrun.feature_store.OfflineVectorResponse`,
+        results can be returned as a dataframe or written to a target
+
+        The start_time and end_time attributes allow filtering the data to a given time range, they accept
+        string values or pandas `Timestamp` objects, string values can also be relative, for example:
+        "now", "now - 1d2h", "now+5m", where a valid pandas Timedelta string follows the verb "now",
+        for time alignment you can use the verb "floor" e.g. "now -1d floor 1H" will align the time to the last hour
+        (the floor string is passed to pandas.Timestamp.floor(), can use D, H, T, S for day, hour, min, sec alignment).
+        Another option to filter the data is by the `query` argument - can be seen in the example.
+        example::
+
+            features = [
+                "stock-quotes.bid",
+                "stock-quotes.asks_sum_5h",
+                "stock-quotes.ask as mycol",
+                "stocks.*",
+            ]
+            vector = FeatureVector(features=features)
+            vector.get_offline_features(entity_rows=trades, entity_timestamp_column="time", query="ticker in ['GOOG']
+            and bid>100")
+            print(resp.to_dataframe())
+            print(vector.get_stats_table())
+            resp.to_parquet("./out.parquet")
+
+        :param entity_rows: dataframe with entity rows to join with
+        :param target: where to write the results to
+        :param drop_columns: list of columns to drop from the final result
+        :param entity_timestamp_column: timestamp column name in the entity rows dataframe. can be specified
+                                        only if param entity_rows was specified.
+        :param run_config: function and/or run configuration
+                           see :py:class:`~mlrun.feature_store.RunConfig`
+        :param start_time: datetime, low limit of time needed to be filtered. Optional.
+        :param end_time: datetime, high limit of time needed to be filtered. Optional.
+        :param with_indexes: Return vector with/without the entities and the timestamp_key of the feature
+                             sets and with/without entity_timestamp_column and timestamp_for_filtering
+                             columns. This property can be specified also in the feature vector spec
+                             (feature_vector.spec.with_indexes)
+                             (default False)
+        :param update_stats: update features statistics from the requested feature sets on the vector.
+                             (default False).
+        :param engine: processing engine kind ("local", "dask", or "spark")
+        :param engine_args: kwargs for the processing engine
+        :param query: The query string used to filter rows on the output
+        :param spark_service: Name of the spark service to be used (when using a remote-spark runtime)
+        :param order_by: Name or list of names to order by. The name or the names in the list can be the
+                         feature name or the alias of the feature you pass in the feature list.
+        :param timestamp_for_filtering: name of the column to filter by, can be str for all the feature sets or a
+                                        dictionary ({<feature set name>: <timestamp column name>, ...})
+                                        that indicates the timestamp column name for each feature set. Optional.
+                                        By default, the filter executes on the timestamp_key of each feature set.
+                                        Note: the time filtering is performed on each feature set before the
+                                        merge process using start_time and end_time params.
+
+        """
+
         return mlrun.feature_store.api._get_offline_features(
             self,
             entity_rows,
@@ -771,6 +829,68 @@ class FeatureVector(ModelObj):
         update_stats: bool = False,
         entity_keys: List[str] = None,
     ):
+        """initialize and return online feature vector service api,
+        returns :py:class:`~mlrun.feature_store.OnlineVectorService`
+
+        :**usage**:
+            There are two ways to use the function:
+
+            1. As context manager
+
+               Example::
+
+                   with vector_uri.get_online_feature_service() as svc:
+                       resp = svc.get([{"ticker": "GOOG"}, {"ticker": "MSFT"}])
+                       print(resp)
+                       resp = svc.get([{"ticker": "AAPL"}], as_list=True)
+                       print(resp)
+
+               Example with imputing::
+
+                   with vector_uri.get_online_feature_service(entity_keys=['id'],
+                                                              impute_policy={"*": "$mean", "amount": 0)) as svc:
+                       resp = svc.get([{"id": "C123487"}])
+
+            2. as simple function, note that in that option you need to close the session.
+
+               Example::
+
+                   svc = vector_uri.get_online_feature_service(entity_keys=['ticker'])
+                   try:
+                       resp = svc.get([{"ticker": "GOOG"}, {"ticker": "MSFT"}])
+                       print(resp)
+                       resp = svc.get([{"ticker": "AAPL"}], as_list=True)
+                       print(resp)
+
+                   finally:
+                       svc.close()
+
+               Example with imputing::
+
+                   svc = vector_uri.get_online_feature_service(entity_keys=['id'],
+                                                               impute_policy={"*": "$mean", "amount": 0))
+                   try:
+                       resp = svc.get([{"id": "C123487"}])
+                   except Exception as e:
+                       handling exception...
+                   finally:
+                       svc.close()
+
+        :param run_config: function and/or run configuration for remote jobs/services
+        :param impute_policy: a dict with `impute_policy` per feature, the dict key is the feature name and the
+                              dict value indicate which value will be used in case the feature is NaN/empty, the
+                              replaced value can be fixed number for constants or $mean, $max, $min, $std, $count
+                              for statistical values.
+                              "*" is used to specify the default for all features, example: `{"*": "$mean"}`
+        :param fixed_window_type: determines how to query the fixed window values which were previously inserted by
+                                  ingest
+        :param update_stats: update features statistics from the requested feature sets on the vector.
+                             Default: False.
+        :param entity_keys: Entity list of the first feature_set in the vector.
+                            The indexes that are used to query the online service.
+        :return: Initialize the `OnlineVectorService`.
+                 Will be used in subclasses where `support_online=True`.
+        """
         return mlrun.feature_store.api._get_online_feature_service(
             self,
             run_config,
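Both `impute_policy` snippets in the docstring above carry a syntax slip from the source (a dict opened with `{` is closed with `))`); a well-formed version of that call would be::

    with vector.get_online_feature_service(
        entity_keys=["id"], impute_policy={"*": "$mean", "amount": 0}
    ) as svc:
        resp = svc.get([{"id": "C123487"}])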
mlrun/frameworks/tf_keras/callbacks/logging_callback.py
CHANGED

@@ -389,9 +389,9 @@ class LoggingCallback(Callback):
         ):
             try:
                 self._get_hyperparameter(key_chain=learning_rate_key_chain)
-                self._dynamic_hyperparameters_keys[
-                    learning_rate_key
-                ] = learning_rate_key_chain
+                self._dynamic_hyperparameters_keys[learning_rate_key] = (
+                    learning_rate_key_chain
+                )
             except (KeyError, IndexError, ValueError):
                 pass
 
mlrun/frameworks/tf_keras/model_handler.py
CHANGED

@@ -263,13 +263,13 @@ class TFKerasModelHandler(DLModelHandler):
         # Update the paths and log artifacts if context is available:
         if self._weights_file is not None:
             if self._context is not None:
-                artifacts[
-                    self._get_weights_file_artifact_name()
-                ] = self._context.log_artifact(
-                    self._weights_file,
-                    local_path=self._weights_file,
-                    artifact_path=output_path,
-                    db_key=False,
+                artifacts[self._get_weights_file_artifact_name()] = (
+                    self._context.log_artifact(
+                        self._weights_file,
+                        local_path=self._weights_file,
+                        artifact_path=output_path,
+                        db_key=False,
+                    )
                 )
 
         return artifacts if self._context is not None else None
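Both tf_keras hunks are behavior-preserving reformats: a subscript assignment whose right-hand side spans several lines is now wrapped in parentheses after the `=` (the style recent Black releases emit) instead of breaking inside the subscript. Schematically (illustrative names)::

    # before
    mapping[
        some_key
    ] = build_value()

    # after
    mapping[some_key] = (
        build_value()
    )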