mlrun 1.7.0rc7__py3-none-any.whl → 1.7.0rc11__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
- mlrun/__init__.py +1 -0
- mlrun/__main__.py +2 -0
- mlrun/artifacts/model.py +29 -25
- mlrun/common/schemas/__init__.py +4 -0
- mlrun/common/schemas/alert.py +122 -0
- mlrun/common/schemas/api_gateway.py +8 -1
- mlrun/common/schemas/auth.py +4 -0
- mlrun/common/schemas/client_spec.py +1 -0
- mlrun/common/schemas/hub.py +7 -9
- mlrun/common/schemas/model_monitoring/constants.py +4 -2
- mlrun/{datastore/helpers.py → common/schemas/pagination.py} +11 -3
- mlrun/common/schemas/project.py +15 -10
- mlrun/config.py +35 -13
- mlrun/datastore/__init__.py +3 -7
- mlrun/datastore/base.py +6 -5
- mlrun/datastore/datastore_profile.py +19 -1
- mlrun/datastore/snowflake_utils.py +43 -0
- mlrun/datastore/sources.py +18 -30
- mlrun/datastore/targets.py +140 -12
- mlrun/datastore/utils.py +10 -5
- mlrun/datastore/v3io.py +27 -50
- mlrun/db/base.py +88 -2
- mlrun/db/httpdb.py +314 -41
- mlrun/db/nopdb.py +142 -0
- mlrun/execution.py +21 -14
- mlrun/feature_store/api.py +9 -5
- mlrun/feature_store/feature_set.py +39 -23
- mlrun/feature_store/feature_vector.py +2 -1
- mlrun/feature_store/retrieval/spark_merger.py +27 -23
- mlrun/feature_store/steps.py +30 -19
- mlrun/features.py +4 -13
- mlrun/frameworks/auto_mlrun/auto_mlrun.py +2 -2
- mlrun/frameworks/lgbm/__init__.py +1 -1
- mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
- mlrun/frameworks/lgbm/model_handler.py +1 -1
- mlrun/frameworks/pytorch/__init__.py +2 -2
- mlrun/frameworks/sklearn/__init__.py +1 -1
- mlrun/frameworks/tf_keras/__init__.py +1 -1
- mlrun/frameworks/tf_keras/callbacks/logging_callback.py +1 -1
- mlrun/frameworks/tf_keras/mlrun_interface.py +2 -2
- mlrun/frameworks/xgboost/__init__.py +1 -1
- mlrun/kfpops.py +2 -5
- mlrun/launcher/base.py +1 -1
- mlrun/launcher/client.py +2 -2
- mlrun/model.py +2 -2
- mlrun/model_monitoring/application.py +11 -2
- mlrun/model_monitoring/applications/histogram_data_drift.py +3 -3
- mlrun/model_monitoring/controller.py +2 -3
- mlrun/model_monitoring/helpers.py +3 -1
- mlrun/model_monitoring/stream_processing.py +0 -1
- mlrun/model_monitoring/writer.py +32 -0
- mlrun/package/packagers_manager.py +1 -0
- mlrun/platforms/__init__.py +1 -1
- mlrun/platforms/other.py +1 -1
- mlrun/projects/operations.py +11 -4
- mlrun/projects/pipelines.py +1 -1
- mlrun/projects/project.py +180 -73
- mlrun/run.py +77 -41
- mlrun/runtimes/__init__.py +16 -0
- mlrun/runtimes/base.py +4 -1
- mlrun/runtimes/kubejob.py +26 -121
- mlrun/runtimes/mpijob/abstract.py +8 -8
- mlrun/runtimes/nuclio/api_gateway.py +58 -8
- mlrun/runtimes/nuclio/application/application.py +79 -1
- mlrun/runtimes/nuclio/application/reverse_proxy.go +9 -1
- mlrun/runtimes/nuclio/function.py +20 -13
- mlrun/runtimes/nuclio/serving.py +11 -10
- mlrun/runtimes/pod.py +148 -3
- mlrun/runtimes/utils.py +0 -28
- mlrun/secrets.py +6 -2
- mlrun/serving/remote.py +2 -3
- mlrun/serving/routers.py +7 -4
- mlrun/serving/server.py +1 -1
- mlrun/serving/states.py +14 -38
- mlrun/serving/v2_serving.py +8 -7
- mlrun/utils/helpers.py +1 -1
- mlrun/utils/http.py +1 -1
- mlrun/utils/notifications/notification/base.py +12 -0
- mlrun/utils/notifications/notification/console.py +2 -0
- mlrun/utils/notifications/notification/git.py +3 -1
- mlrun/utils/notifications/notification/ipython.py +2 -0
- mlrun/utils/notifications/notification/slack.py +41 -13
- mlrun/utils/notifications/notification/webhook.py +11 -1
- mlrun/utils/retryer.py +3 -2
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.7.0rc7.dist-info → mlrun-1.7.0rc11.dist-info}/METADATA +15 -15
- {mlrun-1.7.0rc7.dist-info → mlrun-1.7.0rc11.dist-info}/RECORD +91 -89
- {mlrun-1.7.0rc7.dist-info → mlrun-1.7.0rc11.dist-info}/LICENSE +0 -0
- {mlrun-1.7.0rc7.dist-info → mlrun-1.7.0rc11.dist-info}/WHEEL +0 -0
- {mlrun-1.7.0rc7.dist-info → mlrun-1.7.0rc11.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.0rc7.dist-info → mlrun-1.7.0rc11.dist-info}/top_level.txt +0 -0
mlrun/db/nopdb.py
CHANGED

@@ -528,6 +528,75 @@ class NopDB(RunDBInterface):
     ):
         pass
 
+    def remote_builder(
+        self,
+        func: "mlrun.runtimes.BaseRuntime",
+        with_mlrun: bool,
+        mlrun_version_specifier: Optional[str] = None,
+        skip_deployed: bool = False,
+        builder_env: Optional[dict] = None,
+        force_build: bool = False,
+    ):
+        pass
+
+    def deploy_nuclio_function(
+        self,
+        func: "mlrun.runtimes.RemoteRuntime",
+        builder_env: Optional[dict] = None,
+    ):
+        pass
+
+    def get_builder_status(
+        self,
+        func: "mlrun.runtimes.BaseRuntime",
+        offset: int = 0,
+        logs: bool = True,
+        last_log_timestamp: float = 0.0,
+        verbose: bool = False,
+    ):
+        pass
+
+    def get_nuclio_deploy_status(
+        self,
+        func: "mlrun.runtimes.RemoteRuntime",
+        last_log_timestamp: float = 0.0,
+        verbose: bool = False,
+    ):
+        pass
+
+    def set_run_notifications(
+        self,
+        project: str,
+        runs: list[mlrun.model.RunObject],
+        notifications: list[mlrun.model.Notification],
+    ):
+        pass
+
+    def store_run_notifications(
+        self,
+        notification_objects: list[mlrun.model.Notification],
+        run_uid: str,
+        project: str = None,
+        mask_params: bool = True,
+    ):
+        pass
+
+    def store_alert_notifications(
+        self,
+        session,
+        notification_objects: list[mlrun.model.Notification],
+        alert_id: str,
+        project: str,
+        mask_params: bool = True,
+    ):
+        pass
+
+    def get_log_size(self, uid, project=""):
+        pass
+
+    def watch_log(self, uid, project="", watch=True, offset=0):
+        pass
+
     def get_datastore_profile(
         self, name: str, project: str
     ) -> Optional[mlrun.common.schemas.DatastoreProfile]:
@@ -545,3 +614,76 @@ class NopDB(RunDBInterface):
         self, profile: mlrun.common.schemas.DatastoreProfile, project: str
     ):
         pass
+
+    def function_status(self, project, name, kind, selector):
+        pass
+
+    def start_function(
+        self, func_url: str = None, function: "mlrun.runtimes.BaseRuntime" = None
+    ):
+        pass
+
+    def submit_workflow(
+        self,
+        project: str,
+        name: str,
+        workflow_spec: Union[
+            "mlrun.projects.pipelines.WorkflowSpec",
+            "mlrun.common.schemas.WorkflowSpec",
+            dict,
+        ],
+        arguments: Optional[dict] = None,
+        artifact_path: Optional[str] = None,
+        source: Optional[str] = None,
+        run_name: Optional[str] = None,
+        namespace: Optional[str] = None,
+        notifications: list["mlrun.model.Notification"] = None,
+    ) -> "mlrun.common.schemas.WorkflowResponse":
+        pass
+
+    def update_model_monitoring_controller(
+        self,
+        project: str,
+        base_period: int = 10,
+        image: str = "mlrun/mlrun",
+    ):
+        pass
+
+    def enable_model_monitoring(
+        self,
+        project: str,
+        base_period: int = 10,
+        image: str = "mlrun/mlrun",
+        deploy_histogram_data_drift_app: bool = True,
+    ) -> None:
+        pass
+
+    def deploy_histogram_data_drift_app(
+        self, project: str, image: str = "mlrun/mlrun"
+    ) -> None:
+        raise NotImplementedError
+
+    def generate_event(
+        self, name: str, event_data: Union[dict, mlrun.common.schemas.Event], project=""
+    ):
+        pass
+
+    def store_alert_config(
+        self,
+        alert_name: str,
+        alert_data: Union[dict, mlrun.common.schemas.AlertConfig],
+        project="",
+    ):
+        pass
+
+    def get_alert_config(self, alert_name: str, project=""):
+        pass
+
+    def list_alerts_configs(self, project=""):
+        pass
+
+    def delete_alert_config(self, alert_name: str, project=""):
+        pass
+
+    def reset_alert_config(self, alert_name: str, project=""):
+        pass
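Note: these additions keep NopDB in step with the methods added to RunDBInterface in this release (builds, Nuclio deployment, notifications, workflows, monitoring, alerts), so code that runs without a configured API server degrades to no-ops instead of raising. A minimal sketch of the pattern, with hypothetical names::

    from abc import ABC, abstractmethod


    class RunDBSketch(ABC):
        # hypothetical stand-in for mlrun.db.base.RunDBInterface
        @abstractmethod
        def get_log_size(self, uid: str, project: str = ""): ...


    class NopDBSketch(RunDBSketch):
        # every interface method exists but silently does nothing
        def get_log_size(self, uid: str, project: str = ""):
            pass


    NopDBSketch().get_log_size("some-uid")  # returns None rather than raising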
mlrun/execution.py
CHANGED

@@ -224,12 +224,12 @@ class MLClientCtx:
             with context.get_child_context(myparam=param) as child:
                 accuracy = child_handler(child, df, **child.parameters)
                 accuracy_sum += accuracy
-                child.log_result(
+                child.log_result("accuracy", accuracy)
                 if accuracy > best_accuracy:
                     child.mark_as_best()
                     best_accuracy = accuracy

-            context.log_result(
+            context.log_result("avg_accuracy", accuracy_sum / len(param_list))

         :param params: Extra (or override) params to parent context
         :param with_parent_params: Child will copy the parent parameters and add to them
@@ -289,7 +289,9 @@ class MLClientCtx:

         Example::

-            feature_vector = context.get_store_resource(
+            feature_vector = context.get_store_resource(
+                "store://feature-vectors/default/myvec"
+            )
             dataset = context.get_store_resource("store://artifacts/default/mydata")

         :param url: Store resource uri/path, store://<type>/<project>/<name>:<version>
@@ -421,7 +423,7 @@ class MLClientCtx:

         Example::

-            data_path=context.artifact_subpath(
+            data_path = context.artifact_subpath("data")

         """
         return os.path.join(self.artifact_path, *subpaths)
@@ -525,7 +527,7 @@ class MLClientCtx:

         Example::

-            context.log_result(
+            context.log_result("accuracy", 0.85)

         :param key: Result key
         :param value: Result value
@@ -539,7 +541,7 @@ class MLClientCtx:

         Example::

-            context.log_results({
+            context.log_results({"accuracy": 0.85, "loss": 0.2})

         :param results: Key/value dict or results
         :param commit: Commit (write to DB now vs wait for the end of the run)
@@ -674,7 +676,9 @@ class MLClientCtx:
                 "age": [42, 52, 36, 24, 73],
                 "testScore": [25, 94, 57, 62, 70],
             }
-            df = pd.DataFrame(
+            df = pd.DataFrame(
+                raw_data, columns=["first_name", "last_name", "age", "testScore"]
+            )
             context.log_dataset("mydf", df=df, stats=True)

         :param key: Artifact key
@@ -752,13 +756,16 @@ class MLClientCtx:

         Example::

-            context.log_model(
-
-
-
-
-
-
+            context.log_model(
+                "model",
+                body=dumps(model),
+                model_file="model.pkl",
+                metrics=context.results,
+                training_set=training_df,
+                label_column="label",
+                feature_vector=feature_vector_uri,
+                labels={"app": "fraud"},
+            )

         :param key: Artifact key or artifact class ()
         :param body: Will use the body as the artifact content
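Note: the repaired docstring examples compose naturally inside a handler; a small sketch using the calls above (metric values are illustrative)::

    import mlrun


    def trainer(context: mlrun.MLClientCtx):
        context.log_result("accuracy", 0.85)
        context.log_results({"accuracy": 0.85, "loss": 0.2})
        # artifact_subpath joins the given segments under context.artifact_path
        data_path = context.artifact_subpath("data")
        context.logger.info(f"artifacts will be stored under {data_path}")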
mlrun/feature_store/api.py
CHANGED

@@ -136,7 +136,10 @@ def get_offline_features(
         ]
         vector = FeatureVector(features=features)
         resp = get_offline_features(
-            vector,
+            vector,
+            entity_rows=trades,
+            entity_timestamp_column="time",
+            query="ticker in ['GOOG'] and bid>100",
         )
         print(resp.to_dataframe())
         print(vector.get_stats_table())
@@ -307,7 +310,7 @@ def get_online_feature_service(

    Example::

-        svc = get_online_feature_service(vector_uri, entity_keys=[
+        svc = get_online_feature_service(vector_uri, entity_keys=["ticker"])
         try:
             resp = svc.get([{"ticker": "GOOG"}, {"ticker": "MSFT"}])
             print(resp)
@@ -456,7 +459,7 @@ def ingest(
         df = ingest(stocks_set, stocks, infer_options=fstore.InferOptions.default())

         # for running as remote job
-        config = RunConfig(image=
+        config = RunConfig(image="mlrun/mlrun")
         df = ingest(stocks_set, stocks, run_config=config)

         # specify source and targets
@@ -1121,9 +1124,10 @@ def _ingest_with_spark(
             df_to_write = target.prepare_spark_df(
                 df_to_write, key_columns, timestamp_key, spark_options
             )
+            write_format = spark_options.pop("format", None)
             if overwrite:
                 write_spark_dataframe_with_options(
-                    spark_options, df_to_write, "overwrite"
+                    spark_options, df_to_write, "overwrite", write_format=write_format
                 )
             else:
                 # appending an empty dataframe may cause an empty file to be created (e.g. when writing to parquet)
@@ -1131,7 +1135,7 @@ def _ingest_with_spark(
                 df_to_write.persist()
                 if df_to_write.count() > 0:
                     write_spark_dataframe_with_options(
-                        spark_options, df_to_write, "append"
+                        spark_options, df_to_write, "append", write_format=write_format
                     )
             target.update_resource_status("ready")

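Note: the completed get_online_feature_service example opens a try block, but the hunk cuts off before the cleanup; a fuller sketch of the same pattern, assuming an illustrative vector URI and the service's close() method::

    import mlrun.feature_store as fstore

    svc = fstore.get_online_feature_service(
        "store://feature-vectors/default/myvec", entity_keys=["ticker"]
    )
    try:
        resp = svc.get([{"ticker": "GOOG"}, {"ticker": "MSFT"}])
        print(resp)
    finally:
        svc.close()  # release the service's underlying connections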
mlrun/feature_store/feature_set.py
CHANGED

@@ -337,7 +337,10 @@ class FeatureSet(ModelObj):
        example::

            import mlrun.feature_store as fstore
-
+
+            ticks = fstore.FeatureSet(
+                "ticks", entities=["stock"], timestamp_key="timestamp"
+            )
            ticks.ingest(df)

        :param name: name of the feature set
@@ -625,12 +628,12 @@ class FeatureSet(ModelObj):

            import mlrun.feature_store as fstore

-            ticks = fstore.FeatureSet(
-
-
-            ticks.add_entity(
-
-
+            ticks = fstore.FeatureSet(
+                "ticks", entities=["stock"], timestamp_key="timestamp"
+            )
+            ticks.add_entity(
+                "country", mlrun.data_types.ValueType.STRING, description="stock country"
+            )
            ticks.add_entity("year", mlrun.data_types.ValueType.INT16)
            ticks.save()
@@ -650,13 +653,23 @@ class FeatureSet(ModelObj):
            import mlrun.feature_store as fstore
            from mlrun.features import Feature

-            ticks = fstore.FeatureSet(
-
-
-            ticks.add_feature(
-
-
-
+            ticks = fstore.FeatureSet(
+                "ticks", entities=["stock"], timestamp_key="timestamp"
+            )
+            ticks.add_feature(
+                Feature(
+                    value_type=mlrun.data_types.ValueType.STRING,
+                    description="client consistency",
+                ),
+                "ABC01",
+            )
+            ticks.add_feature(
+                Feature(
+                    value_type=mlrun.data_types.ValueType.FLOAT,
+                    description="client volatility",
+                ),
+                "SAB",
+            )
            ticks.save()
@@ -860,15 +873,18 @@ class FeatureSet(ModelObj):
        example::

            import mlrun.feature_store as fstore
+
            ...
-            ticks = fstore.FeatureSet(
-
-
-            ticks.add_aggregation(
-
-
-
-
+            ticks = fstore.FeatureSet(
+                "ticks", entities=["stock"], timestamp_key="timestamp"
+            )
+            ticks.add_aggregation(
+                name="priceN",
+                column="price",
+                operations=["avg"],
+                windows=["1d"],
+                period="1h",
+            )
            ticks.plot(rankdir="LR", with_targets=True)

        :param filename: target filepath for the graph image (None for the notebook)
@@ -1005,7 +1021,7 @@ class FeatureSet(ModelObj):
            df = stocks_set.ingest(stocks, infer_options=fstore.InferOptions.default())

            # for running as remote job
-            config = RunConfig(image=
+            config = RunConfig(image="mlrun/mlrun")
            df = ingest(stocks_set, stocks, run_config=config)

            # specify source and targets
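Note: pieced together, the repaired examples form one flow; a sketch with an illustrative single-row DataFrame::

    import pandas as pd
    import mlrun.feature_store as fstore

    ticks = fstore.FeatureSet("ticks", entities=["stock"], timestamp_key="timestamp")
    ticks.add_aggregation(
        name="priceN", column="price", operations=["avg"], windows=["1d"], period="1h"
    )
    df = pd.DataFrame(
        {"stock": ["GOOG"], "price": [100.0], "timestamp": [pd.Timestamp.now()]}
    )
    ticks.ingest(df)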
mlrun/feature_store/feature_vector.py
CHANGED

@@ -486,6 +486,7 @@ class FeatureVector(ModelObj):
        example::

            import mlrun.feature_store as fstore
+
            features = ["quotes.bid", "quotes.asks_sum_5h as asks_5h", "stocks.*"]
            vector = fstore.FeatureVector("my-vec", features)

@@ -852,7 +853,7 @@ class FeatureVector(ModelObj):

        Example::

-            svc = vector_uri.get_online_feature_service(entity_keys=[
+            svc = vector_uri.get_online_feature_service(entity_keys=["ticker"])
            try:
                resp = svc.get([{"ticker": "GOOG"}, {"ticker": "MSFT"}])
                print(resp)
mlrun/feature_store/retrieval/spark_merger.py
CHANGED

@@ -24,6 +24,32 @@ from .base import BaseMerger
 from .conversion import PandasConversionMixin


+def spark_df_to_pandas(spark_df):
+    # as of pyspark 3.2.3, toPandas fails to convert timestamps unless we work around the issue
+    # when we upgrade pyspark, we should check whether this workaround is still necessary
+    # see https://stackoverflow.com/questions/76389694/transforming-pyspark-to-pandas-dataframe
+    if semver.parse(pd.__version__)["major"] >= 2:
+        import pyspark.sql.functions as pyspark_functions
+
+        type_conversion_dict = {}
+        for field in spark_df.schema.fields:
+            if str(field.dataType) == "TimestampType":
+                spark_df = spark_df.withColumn(
+                    field.name,
+                    pyspark_functions.date_format(
+                        pyspark_functions.to_timestamp(field.name),
+                        "yyyy-MM-dd'T'HH:mm:ss.SSSSSSSSS",
+                    ),
+                )
+                type_conversion_dict[field.name] = "datetime64[ns]"
+        df = PandasConversionMixin.toPandas(spark_df)
+        if type_conversion_dict:
+            df = df.astype(type_conversion_dict)
+        return df
+    else:
+        return PandasConversionMixin.toPandas(spark_df)
+
+
 class SparkFeatureMerger(BaseMerger):
     engine = "spark"
     support_offline = True
@@ -166,29 +192,7 @@ class SparkFeatureMerger(BaseMerger):
     def get_df(self, to_pandas=True):
         if to_pandas:
             if self._pandas_df is None:
-                df = self._result_df
-                # as of pyspark 3.2.3, toPandas fails to convert timestamps unless we work around the issue
-                # when we upgrade pyspark, we should check whether this workaround is still necessary
-                # see https://stackoverflow.com/questions/76389694/transforming-pyspark-to-pandas-dataframe
-                if semver.parse(pd.__version__)["major"] >= 2:
-                    import pyspark.sql.functions as pyspark_functions
-
-                    type_conversion_dict = {}
-                    for field in df.schema.fields:
-                        if str(field.dataType) == "TimestampType":
-                            df = df.withColumn(
-                                field.name,
-                                pyspark_functions.date_format(
-                                    pyspark_functions.to_timestamp(field.name),
-                                    "yyyy-MM-dd'T'HH:mm:ss.SSSSSSSSS",
-                                ),
-                            )
-                            type_conversion_dict[field.name] = "datetime64[ns]"
-                    df = PandasConversionMixin.toPandas(df)
-                    if type_conversion_dict:
-                        df = df.astype(type_conversion_dict)
-                else:
-                    df = PandasConversionMixin.toPandas(df)
+                df = spark_df_to_pandas(self._result_df)
             self._pandas_df = df
             self._set_indexes(self._pandas_df)
         return self._pandas_df
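Note: the refactor only lifts the pandas>=2 timestamp workaround out of get_df into a reusable module-level helper; behavior is unchanged. A standalone sketch of the same trick (session setup and column name are illustrative)::

    import pyspark.sql.functions as F
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.master("local[1]").getOrCreate()
    sdf = spark.sql("select timestamp'2024-01-01 00:00:00' as ts")
    # format timestamps as strings in Spark, then cast back in pandas,
    # sidestepping toPandas() timestamp conversion failures with pandas >= 2
    sdf = sdf.withColumn(
        "ts", F.date_format(F.to_timestamp("ts"), "yyyy-MM-dd'T'HH:mm:ss.SSSSSSSSS")
    )
    pdf = sdf.toPandas().astype({"ts": "datetime64[ns]"})
    print(pdf.dtypes)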
mlrun/feature_store/steps.py
CHANGED

@@ -162,13 +162,19 @@ class MapValues(StepToDict, MLRunStep):
        example::

            # replace the value "U" with '0' in the age column
-            graph.to(MapValues(mapping={
+            graph.to(MapValues(mapping={"age": {"U": "0"}}, with_original_features=True))

            # replace integers, example
-            graph.to(MapValues(mapping={
+            graph.to(MapValues(mapping={"not": {0: 1, 1: 0}}))

            # replace by range, use -inf and inf for extended range
-            graph.to(
+            graph.to(
+                MapValues(
+                    mapping={
+                        "numbers": {"ranges": {"negative": [-inf, 0], "positive": [0, inf]}}
+                    }
+                )
+            )

        :param mapping: a dict with entry per column and the associated old/new values map
        :param with_original_features: set to True to keep the original features
@@ -424,8 +430,10 @@ class OneHotEncoder(StepToDict, MLRunStep):

        example::

-            mapping = {
-
+            mapping = {
+                "category": ["food", "health", "transportation"],
+                "gender": ["male", "female"],
+            }
            graph.to(OneHotEncoder(mapping=one_hot_encoder_mapping))

        :param mapping: a dict of per column categories (to map to binary fields)
@@ -542,10 +550,12 @@ class DateExtractor(StepToDict, MLRunStep):

            # (taken from the fraud-detection end-to-end feature store demo)
            # Define the Transactions FeatureSet
-            transaction_set = fstore.FeatureSet(
-
-
-
+            transaction_set = fstore.FeatureSet(
+                "transactions",
+                entities=[fstore.Entity("source")],
+                timestamp_key="timestamp",
+                description="transactions feature set",
+            )

            # Get FeatureSet computation graph
            transaction_graph = transaction_set.graph
@@ -553,11 +563,11 @@ class DateExtractor(StepToDict, MLRunStep):
            # Add the custom `DateExtractor` step
            # to the computation graph
            transaction_graph.to(
-
-
-
-
-
+                class_name="DateExtractor",
+                name="Extract Dates",
+                parts=["hour", "day_of_week"],
+                timestamp_col="timestamp",
+            )

        :param parts: list of pandas style date-time parts you want to extract.
        :param timestamp_col: The name of the column containing the timestamps to extract from,
@@ -694,11 +704,12 @@ class DropFeatures(StepToDict, MLRunStep):

        example::

-            feature_set = fstore.FeatureSet(
-
-
-
-
+            feature_set = fstore.FeatureSet(
+                "fs-new",
+                entities=[fstore.Entity("id")],
+                description="feature set",
+                engine="pandas",
+            )
            # Pre-processing graph steps
            feature_set.graph.to(DropFeatures(features=["age"]))
            df_pandas = feature_set.ingest(data)
mlrun/features.py
CHANGED

@@ -238,10 +238,7 @@ class Validator(ModelObj):
            from mlrun.features import Validator

            # Add validator to the feature 'bid' with check type
-            quotes_set["bid"].validator = Validator(
-                check_type=True,
-                severity="info"
-            )
+            quotes_set["bid"].validator = Validator(check_type=True, severity="info")

        :param check_type: check feature type e.g. True, False
        :param severity: severity name e.g. info, warning, etc.
@@ -280,10 +277,7 @@ class MinMaxValidator(Validator):

            # Add validator to the feature 'bid', where valid
            # minimal value is 52
-            quotes_set["bid"].validator = MinMaxValidator(
-                min=52,
-                severity="info"
-            )
+            quotes_set["bid"].validator = MinMaxValidator(min=52, severity="info")

        :param check_type: check feature type e.g. True, False
        :param severity: severity name e.g. info, warning, etc.
@@ -344,9 +338,7 @@ class MinMaxLenValidator(Validator):
            # Add length validator to the feature 'ticker', where valid
            # minimal length is 1 and maximal length is 10
            quotes_set["ticker"].validator = MinMaxLenValidator(
-                min=1,
-                max=10,
-                severity="info"
+                min=1, max=10, severity="info"
            )

        :param check_type: check feature type e.g. True, False
@@ -408,8 +400,7 @@ class RegexValidator(Validator):
            # expression '(\b[A-Za-z]{1}[0-9]{7}\b)' where valid values are
            # e.g. A1234567, z9874563, etc.
            quotes_set["name"].validator = RegexValidator(
-                regex=r"(\b[A-Za-z]{1}[0-9]{7}\b)",
-                severity="info"
+                regex=r"(\b[A-Za-z]{1}[0-9]{7}\b)", severity="info"
            )

        :param check_type: check feature type e.g. True, False
mlrun/frameworks/auto_mlrun/auto_mlrun.py
CHANGED

@@ -363,7 +363,7 @@ class AutoMLRun:

                {
                    "/.../custom_model.py": "MyModel",
-                    "/.../custom_objects.py": ["object1", "object2"]
+                    "/.../custom_objects.py": ["object1", "object2"],
                }

            All the paths will be accessed from the given 'custom_objects_directory',
@@ -464,7 +464,7 @@ class AutoMLRun:

                {
                    "/.../custom_model.py": "MyModel",
-                    "/.../custom_objects.py": ["object1", "object2"]
+                    "/.../custom_objects.py": ["object1", "object2"],
                }

            All the paths will be accessed from the given 'custom_objects_directory',
mlrun/frameworks/lgbm/__init__.py
CHANGED

@@ -241,7 +241,7 @@ def apply_mlrun(

                {
                    "/.../custom_model.py": "MyModel",
-                    "/.../custom_objects.py": ["object1", "object2"]
+                    "/.../custom_objects.py": ["object1", "object2"],
                }

            All the paths will be accessed from the given 'custom_objects_directory', meaning

mlrun/frameworks/lgbm/callbacks/callback.py
CHANGED

@@ -63,11 +63,9 @@ class Callback(ABC):
            def on_train_end(self):
                print("{self.name}: Done training!")

+
        apply_mlrun()
-        lgb.train(
-            ...,
-            callbacks=[ExampleCallback(name="Example")]
-        )
+        lgb.train(..., callbacks=[ExampleCallback(name="Example")])
    """

    def __init__(self, order: int = 10, before_iteration: bool = False):
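Note: a runnable sketch of the one-line pattern the Callback docstring now shows; the toy data and training parameters are illustrative, and apply_mlrun is assumed to hook LightGBM's training API into MLRun logging here::

    import lightgbm as lgb
    import numpy as np
    from mlrun.frameworks.lgbm import apply_mlrun

    apply_mlrun()  # per the docstring above: enable MLRun's LightGBM interface
    X = np.random.rand(100, 4)
    y = (X[:, 0] > 0.5).astype(int)
    booster = lgb.train({"objective": "binary"}, lgb.Dataset(X, label=y))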
mlrun/frameworks/lgbm/model_handler.py
CHANGED

@@ -103,7 +103,7 @@ class LGBMModelHandler(MLModelHandler):

                {
                    "/.../custom_model.py": "MyModel",
-                    "/.../custom_objects.py": ["object1", "object2"]
+                    "/.../custom_objects.py": ["object1", "object2"],
                }

            All the paths will be accessed from the given 'custom_objects_directory',
|