mlrun 1.3.3__py3-none-any.whl → 1.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of mlrun has been flagged as possibly problematic.
- mlrun/__init__.py +3 -3
- mlrun/__main__.py +79 -37
- mlrun/api/__init__.py +1 -1
- mlrun/api/api/__init__.py +1 -1
- mlrun/api/api/api.py +4 -4
- mlrun/api/api/deps.py +10 -21
- mlrun/api/api/endpoints/__init__.py +1 -1
- mlrun/api/api/endpoints/artifacts.py +64 -36
- mlrun/api/api/endpoints/auth.py +4 -4
- mlrun/api/api/endpoints/background_tasks.py +11 -11
- mlrun/api/api/endpoints/client_spec.py +5 -5
- mlrun/api/api/endpoints/clusterization_spec.py +6 -4
- mlrun/api/api/endpoints/feature_store.py +124 -115
- mlrun/api/api/endpoints/files.py +22 -14
- mlrun/api/api/endpoints/frontend_spec.py +28 -21
- mlrun/api/api/endpoints/functions.py +142 -87
- mlrun/api/api/endpoints/grafana_proxy.py +89 -442
- mlrun/api/api/endpoints/healthz.py +20 -7
- mlrun/api/api/endpoints/hub.py +320 -0
- mlrun/api/api/endpoints/internal/__init__.py +1 -1
- mlrun/api/api/endpoints/internal/config.py +1 -1
- mlrun/api/api/endpoints/internal/memory_reports.py +9 -9
- mlrun/api/api/endpoints/logs.py +11 -11
- mlrun/api/api/endpoints/model_endpoints.py +74 -70
- mlrun/api/api/endpoints/operations.py +13 -9
- mlrun/api/api/endpoints/pipelines.py +93 -88
- mlrun/api/api/endpoints/projects.py +35 -35
- mlrun/api/api/endpoints/runs.py +69 -27
- mlrun/api/api/endpoints/runtime_resources.py +28 -28
- mlrun/api/api/endpoints/schedules.py +98 -41
- mlrun/api/api/endpoints/secrets.py +37 -32
- mlrun/api/api/endpoints/submit.py +12 -12
- mlrun/api/api/endpoints/tags.py +20 -22
- mlrun/api/api/utils.py +251 -42
- mlrun/api/constants.py +1 -1
- mlrun/api/crud/__init__.py +18 -15
- mlrun/api/crud/artifacts.py +10 -10
- mlrun/api/crud/client_spec.py +4 -4
- mlrun/api/crud/clusterization_spec.py +3 -3
- mlrun/api/crud/feature_store.py +54 -46
- mlrun/api/crud/functions.py +3 -3
- mlrun/api/crud/hub.py +312 -0
- mlrun/api/crud/logs.py +11 -9
- mlrun/api/crud/model_monitoring/__init__.py +3 -3
- mlrun/api/crud/model_monitoring/grafana.py +435 -0
- mlrun/api/crud/model_monitoring/model_endpoints.py +352 -129
- mlrun/api/crud/notifications.py +149 -0
- mlrun/api/crud/pipelines.py +67 -52
- mlrun/api/crud/projects.py +51 -23
- mlrun/api/crud/runs.py +7 -5
- mlrun/api/crud/runtime_resources.py +13 -13
- mlrun/api/{db/filedb → crud/runtimes}/__init__.py +1 -1
- mlrun/api/crud/runtimes/nuclio/__init__.py +14 -0
- mlrun/api/crud/runtimes/nuclio/function.py +505 -0
- mlrun/api/crud/runtimes/nuclio/helpers.py +310 -0
- mlrun/api/crud/secrets.py +88 -46
- mlrun/api/crud/tags.py +5 -5
- mlrun/api/db/__init__.py +1 -1
- mlrun/api/db/base.py +102 -54
- mlrun/api/db/init_db.py +2 -3
- mlrun/api/db/session.py +4 -12
- mlrun/api/db/sqldb/__init__.py +1 -1
- mlrun/api/db/sqldb/db.py +439 -196
- mlrun/api/db/sqldb/helpers.py +1 -1
- mlrun/api/db/sqldb/models/__init__.py +3 -3
- mlrun/api/db/sqldb/models/models_mysql.py +82 -64
- mlrun/api/db/sqldb/models/models_sqlite.py +76 -64
- mlrun/api/db/sqldb/session.py +27 -20
- mlrun/api/initial_data.py +82 -24
- mlrun/api/launcher.py +196 -0
- mlrun/api/main.py +91 -22
- mlrun/api/middlewares.py +6 -5
- mlrun/api/migrations_mysql/env.py +1 -1
- mlrun/api/migrations_mysql/versions/28383af526f3_market_place_to_hub.py +40 -0
- mlrun/api/migrations_mysql/versions/32bae1b0e29c_increase_timestamp_fields_precision.py +1 -1
- mlrun/api/migrations_mysql/versions/4903aef6a91d_tag_foreign_key_and_cascades.py +1 -1
- mlrun/api/migrations_mysql/versions/5f1351c88a19_adding_background_tasks_table.py +1 -1
- mlrun/api/migrations_mysql/versions/88e656800d6a_add_requested_logs_column_and_index_to_.py +1 -1
- mlrun/api/migrations_mysql/versions/9d16de5f03a7_adding_data_versions_table.py +1 -1
- mlrun/api/migrations_mysql/versions/b86f5b53f3d7_adding_name_and_updated_to_runs_table.py +1 -1
- mlrun/api/migrations_mysql/versions/c4af40b0bf61_init.py +1 -1
- mlrun/api/migrations_mysql/versions/c905d15bd91d_notifications.py +72 -0
- mlrun/api/migrations_mysql/versions/ee041e8fdaa0_adding_next_run_time_column_to_schedule_.py +1 -1
- mlrun/api/migrations_sqlite/env.py +1 -1
- mlrun/api/migrations_sqlite/versions/11f8dd2dc9fe_init.py +1 -1
- mlrun/api/migrations_sqlite/versions/1c954f8cb32d_schedule_last_run_uri.py +1 -1
- mlrun/api/migrations_sqlite/versions/2b6d23c715aa_adding_feature_sets.py +1 -1
- mlrun/api/migrations_sqlite/versions/4acd9430b093_market_place_to_hub.py +77 -0
- mlrun/api/migrations_sqlite/versions/6401142f2d7c_adding_next_run_time_column_to_schedule_.py +1 -1
- mlrun/api/migrations_sqlite/versions/64d90a1a69bc_adding_background_tasks_table.py +1 -1
- mlrun/api/migrations_sqlite/versions/803438ecd005_add_requested_logs_column_to_runs.py +1 -1
- mlrun/api/migrations_sqlite/versions/863114f0c659_refactoring_feature_set.py +1 -1
- mlrun/api/migrations_sqlite/versions/959ae00528ad_notifications.py +63 -0
- mlrun/api/migrations_sqlite/versions/accf9fc83d38_adding_data_versions_table.py +1 -1
- mlrun/api/migrations_sqlite/versions/b68e8e897a28_schedule_labels.py +1 -1
- mlrun/api/migrations_sqlite/versions/bcd0c1f9720c_adding_project_labels.py +1 -1
- mlrun/api/migrations_sqlite/versions/cf21882f938e_schedule_id.py +1 -1
- mlrun/api/migrations_sqlite/versions/d781f58f607f_tag_object_name_string.py +1 -1
- mlrun/api/migrations_sqlite/versions/deac06871ace_adding_marketplace_sources_table.py +1 -1
- mlrun/api/migrations_sqlite/versions/e1dd5983c06b_schedule_concurrency_limit.py +1 -1
- mlrun/api/migrations_sqlite/versions/e5594ed3ab53_adding_name_and_updated_to_runs_table.py +1 -1
- mlrun/api/migrations_sqlite/versions/f4249b4ba6fa_adding_feature_vectors.py +1 -1
- mlrun/api/migrations_sqlite/versions/f7b5a1a03629_adding_feature_labels.py +1 -1
- mlrun/api/schemas/__init__.py +216 -138
- mlrun/api/utils/__init__.py +1 -1
- mlrun/api/utils/asyncio.py +1 -1
- mlrun/api/utils/auth/__init__.py +1 -1
- mlrun/api/utils/auth/providers/__init__.py +1 -1
- mlrun/api/utils/auth/providers/base.py +7 -7
- mlrun/api/utils/auth/providers/nop.py +6 -7
- mlrun/api/utils/auth/providers/opa.py +17 -17
- mlrun/api/utils/auth/verifier.py +36 -34
- mlrun/api/utils/background_tasks.py +24 -24
- mlrun/{builder.py → api/utils/builder.py} +216 -123
- mlrun/api/utils/clients/__init__.py +1 -1
- mlrun/api/utils/clients/chief.py +19 -4
- mlrun/api/utils/clients/iguazio.py +106 -60
- mlrun/api/utils/clients/log_collector.py +1 -1
- mlrun/api/utils/clients/nuclio.py +23 -23
- mlrun/api/utils/clients/protocols/grpc.py +2 -2
- mlrun/api/utils/db/__init__.py +1 -1
- mlrun/api/utils/db/alembic.py +1 -1
- mlrun/api/utils/db/backup.py +1 -1
- mlrun/api/utils/db/mysql.py +24 -25
- mlrun/api/utils/db/sql_collation.py +1 -1
- mlrun/api/utils/db/sqlite_migration.py +2 -2
- mlrun/api/utils/events/__init__.py +14 -0
- mlrun/api/utils/events/base.py +57 -0
- mlrun/api/utils/events/events_factory.py +41 -0
- mlrun/api/utils/events/iguazio.py +217 -0
- mlrun/api/utils/events/nop.py +55 -0
- mlrun/api/utils/helpers.py +16 -13
- mlrun/api/utils/memory_reports.py +1 -1
- mlrun/api/utils/periodic.py +6 -3
- mlrun/api/utils/projects/__init__.py +1 -1
- mlrun/api/utils/projects/follower.py +33 -33
- mlrun/api/utils/projects/leader.py +36 -34
- mlrun/api/utils/projects/member.py +27 -27
- mlrun/api/utils/projects/remotes/__init__.py +1 -1
- mlrun/api/utils/projects/remotes/follower.py +13 -13
- mlrun/api/utils/projects/remotes/leader.py +10 -10
- mlrun/api/utils/projects/remotes/nop_follower.py +27 -21
- mlrun/api/utils/projects/remotes/nop_leader.py +17 -16
- mlrun/api/utils/scheduler.py +140 -51
- mlrun/api/utils/singletons/__init__.py +1 -1
- mlrun/api/utils/singletons/db.py +9 -15
- mlrun/api/utils/singletons/k8s.py +677 -5
- mlrun/api/utils/singletons/logs_dir.py +1 -1
- mlrun/api/utils/singletons/project_member.py +1 -1
- mlrun/api/utils/singletons/scheduler.py +1 -1
- mlrun/artifacts/__init__.py +2 -2
- mlrun/artifacts/base.py +8 -2
- mlrun/artifacts/dataset.py +5 -3
- mlrun/artifacts/manager.py +7 -1
- mlrun/artifacts/model.py +15 -4
- mlrun/artifacts/plots.py +1 -1
- mlrun/common/__init__.py +1 -1
- mlrun/common/constants.py +15 -0
- mlrun/common/model_monitoring.py +209 -0
- mlrun/common/schemas/__init__.py +167 -0
- mlrun/{api → common}/schemas/artifact.py +13 -14
- mlrun/{api → common}/schemas/auth.py +10 -8
- mlrun/{api → common}/schemas/background_task.py +3 -3
- mlrun/{api → common}/schemas/client_spec.py +1 -1
- mlrun/{api → common}/schemas/clusterization_spec.py +3 -3
- mlrun/{api → common}/schemas/constants.py +21 -8
- mlrun/common/schemas/events.py +36 -0
- mlrun/{api → common}/schemas/feature_store.py +2 -1
- mlrun/{api → common}/schemas/frontend_spec.py +7 -6
- mlrun/{api → common}/schemas/function.py +5 -5
- mlrun/{api → common}/schemas/http.py +3 -3
- mlrun/common/schemas/hub.py +134 -0
- mlrun/{api → common}/schemas/k8s.py +3 -3
- mlrun/{api → common}/schemas/memory_reports.py +1 -1
- mlrun/common/schemas/model_endpoints.py +342 -0
- mlrun/common/schemas/notification.py +57 -0
- mlrun/{api → common}/schemas/object.py +6 -6
- mlrun/{api → common}/schemas/pipeline.py +3 -3
- mlrun/{api → common}/schemas/project.py +6 -5
- mlrun/common/schemas/regex.py +24 -0
- mlrun/common/schemas/runs.py +30 -0
- mlrun/{api → common}/schemas/runtime_resource.py +3 -3
- mlrun/{api → common}/schemas/schedule.py +19 -7
- mlrun/{api → common}/schemas/secret.py +3 -3
- mlrun/{api → common}/schemas/tag.py +2 -2
- mlrun/common/types.py +25 -0
- mlrun/config.py +152 -20
- mlrun/data_types/__init__.py +7 -2
- mlrun/data_types/data_types.py +4 -2
- mlrun/data_types/infer.py +1 -1
- mlrun/data_types/spark.py +10 -3
- mlrun/datastore/__init__.py +10 -3
- mlrun/datastore/azure_blob.py +1 -1
- mlrun/datastore/base.py +185 -53
- mlrun/datastore/datastore.py +1 -1
- mlrun/datastore/filestore.py +1 -1
- mlrun/datastore/google_cloud_storage.py +1 -1
- mlrun/datastore/inmem.py +4 -1
- mlrun/datastore/redis.py +1 -1
- mlrun/datastore/s3.py +1 -1
- mlrun/datastore/sources.py +192 -70
- mlrun/datastore/spark_udf.py +44 -0
- mlrun/datastore/store_resources.py +4 -4
- mlrun/datastore/targets.py +115 -45
- mlrun/datastore/utils.py +127 -5
- mlrun/datastore/v3io.py +1 -1
- mlrun/datastore/wasbfs/__init__.py +1 -1
- mlrun/datastore/wasbfs/fs.py +1 -1
- mlrun/db/__init__.py +7 -5
- mlrun/db/base.py +112 -68
- mlrun/db/httpdb.py +445 -277
- mlrun/db/nopdb.py +491 -0
- mlrun/db/sqldb.py +112 -65
- mlrun/errors.py +6 -1
- mlrun/execution.py +44 -22
- mlrun/feature_store/__init__.py +1 -1
- mlrun/feature_store/api.py +143 -95
- mlrun/feature_store/common.py +16 -20
- mlrun/feature_store/feature_set.py +42 -12
- mlrun/feature_store/feature_vector.py +32 -21
- mlrun/feature_store/ingestion.py +9 -12
- mlrun/feature_store/retrieval/__init__.py +3 -2
- mlrun/feature_store/retrieval/base.py +388 -66
- mlrun/feature_store/retrieval/dask_merger.py +63 -151
- mlrun/feature_store/retrieval/job.py +30 -12
- mlrun/feature_store/retrieval/local_merger.py +40 -133
- mlrun/feature_store/retrieval/spark_merger.py +129 -127
- mlrun/feature_store/retrieval/storey_merger.py +173 -0
- mlrun/feature_store/steps.py +132 -15
- mlrun/features.py +8 -3
- mlrun/frameworks/__init__.py +1 -1
- mlrun/frameworks/_common/__init__.py +1 -1
- mlrun/frameworks/_common/artifacts_library.py +1 -1
- mlrun/frameworks/_common/mlrun_interface.py +1 -1
- mlrun/frameworks/_common/model_handler.py +1 -1
- mlrun/frameworks/_common/plan.py +1 -1
- mlrun/frameworks/_common/producer.py +1 -1
- mlrun/frameworks/_common/utils.py +1 -1
- mlrun/frameworks/_dl_common/__init__.py +1 -1
- mlrun/frameworks/_dl_common/loggers/__init__.py +1 -1
- mlrun/frameworks/_dl_common/loggers/logger.py +1 -1
- mlrun/frameworks/_dl_common/loggers/mlrun_logger.py +1 -1
- mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +1 -1
- mlrun/frameworks/_dl_common/model_handler.py +1 -1
- mlrun/frameworks/_dl_common/utils.py +1 -1
- mlrun/frameworks/_ml_common/__init__.py +1 -1
- mlrun/frameworks/_ml_common/artifacts_library.py +1 -1
- mlrun/frameworks/_ml_common/loggers/__init__.py +1 -1
- mlrun/frameworks/_ml_common/loggers/logger.py +1 -1
- mlrun/frameworks/_ml_common/loggers/mlrun_logger.py +1 -1
- mlrun/frameworks/_ml_common/model_handler.py +1 -1
- mlrun/frameworks/_ml_common/pkl_model_server.py +13 -1
- mlrun/frameworks/_ml_common/plan.py +1 -1
- mlrun/frameworks/_ml_common/plans/__init__.py +1 -1
- mlrun/frameworks/_ml_common/plans/calibration_curve_plan.py +1 -6
- mlrun/frameworks/_ml_common/plans/confusion_matrix_plan.py +1 -1
- mlrun/frameworks/_ml_common/plans/dataset_plan.py +1 -1
- mlrun/frameworks/_ml_common/plans/feature_importance_plan.py +1 -1
- mlrun/frameworks/_ml_common/plans/roc_curve_plan.py +1 -1
- mlrun/frameworks/_ml_common/producer.py +1 -1
- mlrun/frameworks/_ml_common/utils.py +1 -1
- mlrun/frameworks/auto_mlrun/__init__.py +1 -1
- mlrun/frameworks/auto_mlrun/auto_mlrun.py +1 -1
- mlrun/frameworks/huggingface/__init__.py +1 -1
- mlrun/frameworks/huggingface/model_server.py +1 -1
- mlrun/frameworks/lgbm/__init__.py +1 -1
- mlrun/frameworks/lgbm/callbacks/__init__.py +1 -1
- mlrun/frameworks/lgbm/callbacks/callback.py +1 -1
- mlrun/frameworks/lgbm/callbacks/logging_callback.py +1 -1
- mlrun/frameworks/lgbm/callbacks/mlrun_logging_callback.py +1 -1
- mlrun/frameworks/lgbm/mlrun_interfaces/__init__.py +1 -1
- mlrun/frameworks/lgbm/mlrun_interfaces/booster_mlrun_interface.py +1 -1
- mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +1 -1
- mlrun/frameworks/lgbm/mlrun_interfaces/model_mlrun_interface.py +1 -1
- mlrun/frameworks/lgbm/model_handler.py +1 -1
- mlrun/frameworks/lgbm/model_server.py +1 -1
- mlrun/frameworks/lgbm/utils.py +1 -1
- mlrun/frameworks/onnx/__init__.py +1 -1
- mlrun/frameworks/onnx/dataset.py +1 -1
- mlrun/frameworks/onnx/mlrun_interface.py +1 -1
- mlrun/frameworks/onnx/model_handler.py +1 -1
- mlrun/frameworks/onnx/model_server.py +1 -1
- mlrun/frameworks/parallel_coordinates.py +1 -1
- mlrun/frameworks/pytorch/__init__.py +1 -1
- mlrun/frameworks/pytorch/callbacks/__init__.py +1 -1
- mlrun/frameworks/pytorch/callbacks/callback.py +1 -1
- mlrun/frameworks/pytorch/callbacks/logging_callback.py +1 -1
- mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +1 -1
- mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +1 -1
- mlrun/frameworks/pytorch/callbacks_handler.py +1 -1
- mlrun/frameworks/pytorch/mlrun_interface.py +1 -1
- mlrun/frameworks/pytorch/model_handler.py +1 -1
- mlrun/frameworks/pytorch/model_server.py +1 -1
- mlrun/frameworks/pytorch/utils.py +1 -1
- mlrun/frameworks/sklearn/__init__.py +1 -1
- mlrun/frameworks/sklearn/estimator.py +1 -1
- mlrun/frameworks/sklearn/metric.py +1 -1
- mlrun/frameworks/sklearn/metrics_library.py +1 -1
- mlrun/frameworks/sklearn/mlrun_interface.py +1 -1
- mlrun/frameworks/sklearn/model_handler.py +1 -1
- mlrun/frameworks/sklearn/utils.py +1 -1
- mlrun/frameworks/tf_keras/__init__.py +1 -1
- mlrun/frameworks/tf_keras/callbacks/__init__.py +1 -1
- mlrun/frameworks/tf_keras/callbacks/logging_callback.py +1 -1
- mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +1 -1
- mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +1 -1
- mlrun/frameworks/tf_keras/mlrun_interface.py +1 -1
- mlrun/frameworks/tf_keras/model_handler.py +1 -1
- mlrun/frameworks/tf_keras/model_server.py +1 -1
- mlrun/frameworks/tf_keras/utils.py +1 -1
- mlrun/frameworks/xgboost/__init__.py +1 -1
- mlrun/frameworks/xgboost/mlrun_interface.py +1 -1
- mlrun/frameworks/xgboost/model_handler.py +1 -1
- mlrun/frameworks/xgboost/utils.py +1 -1
- mlrun/k8s_utils.py +14 -765
- mlrun/kfpops.py +14 -17
- mlrun/launcher/__init__.py +13 -0
- mlrun/launcher/base.py +406 -0
- mlrun/launcher/client.py +159 -0
- mlrun/launcher/factory.py +50 -0
- mlrun/launcher/local.py +276 -0
- mlrun/launcher/remote.py +178 -0
- mlrun/lists.py +10 -2
- mlrun/mlutils/__init__.py +1 -1
- mlrun/mlutils/data.py +1 -1
- mlrun/mlutils/models.py +1 -1
- mlrun/mlutils/plots.py +1 -1
- mlrun/model.py +252 -14
- mlrun/model_monitoring/__init__.py +41 -0
- mlrun/model_monitoring/features_drift_table.py +1 -1
- mlrun/model_monitoring/helpers.py +123 -38
- mlrun/model_monitoring/model_endpoint.py +144 -0
- mlrun/model_monitoring/model_monitoring_batch.py +310 -259
- mlrun/model_monitoring/stores/__init__.py +106 -0
- mlrun/model_monitoring/stores/kv_model_endpoint_store.py +448 -0
- mlrun/model_monitoring/stores/model_endpoint_store.py +147 -0
- mlrun/model_monitoring/stores/models/__init__.py +23 -0
- mlrun/model_monitoring/stores/models/base.py +18 -0
- mlrun/model_monitoring/stores/models/mysql.py +100 -0
- mlrun/model_monitoring/stores/models/sqlite.py +98 -0
- mlrun/model_monitoring/stores/sql_model_endpoint_store.py +370 -0
- mlrun/model_monitoring/stream_processing_fs.py +239 -271
- mlrun/package/__init__.py +163 -0
- mlrun/package/context_handler.py +325 -0
- mlrun/package/errors.py +47 -0
- mlrun/package/packager.py +298 -0
- mlrun/{runtimes/package → package/packagers}/__init__.py +3 -1
- mlrun/package/packagers/default_packager.py +422 -0
- mlrun/package/packagers/numpy_packagers.py +612 -0
- mlrun/package/packagers/pandas_packagers.py +968 -0
- mlrun/package/packagers/python_standard_library_packagers.py +616 -0
- mlrun/package/packagers_manager.py +786 -0
- mlrun/package/utils/__init__.py +53 -0
- mlrun/package/utils/_archiver.py +226 -0
- mlrun/package/utils/_formatter.py +211 -0
- mlrun/package/utils/_pickler.py +234 -0
- mlrun/package/utils/_supported_format.py +71 -0
- mlrun/package/utils/log_hint_utils.py +93 -0
- mlrun/package/utils/type_hint_utils.py +298 -0
- mlrun/platforms/__init__.py +1 -1
- mlrun/platforms/iguazio.py +34 -2
- mlrun/platforms/other.py +1 -1
- mlrun/projects/__init__.py +1 -1
- mlrun/projects/operations.py +14 -9
- mlrun/projects/pipelines.py +31 -13
- mlrun/projects/project.py +762 -238
- mlrun/render.py +49 -19
- mlrun/run.py +57 -326
- mlrun/runtimes/__init__.py +3 -9
- mlrun/runtimes/base.py +247 -784
- mlrun/runtimes/constants.py +1 -1
- mlrun/runtimes/daskjob.py +45 -41
- mlrun/runtimes/funcdoc.py +43 -7
- mlrun/runtimes/function.py +66 -656
- mlrun/runtimes/function_reference.py +1 -1
- mlrun/runtimes/generators.py +1 -1
- mlrun/runtimes/kubejob.py +99 -116
- mlrun/runtimes/local.py +59 -66
- mlrun/runtimes/mpijob/__init__.py +1 -1
- mlrun/runtimes/mpijob/abstract.py +13 -15
- mlrun/runtimes/mpijob/v1.py +3 -1
- mlrun/runtimes/mpijob/v1alpha1.py +1 -1
- mlrun/runtimes/nuclio.py +1 -1
- mlrun/runtimes/pod.py +51 -26
- mlrun/runtimes/remotesparkjob.py +3 -1
- mlrun/runtimes/serving.py +12 -4
- mlrun/runtimes/sparkjob/__init__.py +1 -2
- mlrun/runtimes/sparkjob/abstract.py +44 -31
- mlrun/runtimes/sparkjob/spark3job.py +11 -9
- mlrun/runtimes/utils.py +61 -42
- mlrun/secrets.py +16 -18
- mlrun/serving/__init__.py +3 -2
- mlrun/serving/merger.py +1 -1
- mlrun/serving/remote.py +1 -1
- mlrun/serving/routers.py +39 -42
- mlrun/serving/server.py +23 -13
- mlrun/serving/serving_wrapper.py +1 -1
- mlrun/serving/states.py +172 -39
- mlrun/serving/utils.py +1 -1
- mlrun/serving/v1_serving.py +1 -1
- mlrun/serving/v2_serving.py +29 -21
- mlrun/utils/__init__.py +1 -2
- mlrun/utils/async_http.py +8 -1
- mlrun/utils/azure_vault.py +1 -1
- mlrun/utils/clones.py +2 -2
- mlrun/utils/condition_evaluator.py +65 -0
- mlrun/utils/db.py +52 -0
- mlrun/utils/helpers.py +188 -13
- mlrun/utils/http.py +89 -54
- mlrun/utils/logger.py +48 -8
- mlrun/utils/model_monitoring.py +132 -100
- mlrun/utils/notifications/__init__.py +1 -1
- mlrun/utils/notifications/notification/__init__.py +8 -6
- mlrun/utils/notifications/notification/base.py +20 -14
- mlrun/utils/notifications/notification/console.py +7 -4
- mlrun/utils/notifications/notification/git.py +36 -19
- mlrun/utils/notifications/notification/ipython.py +10 -8
- mlrun/utils/notifications/notification/slack.py +18 -13
- mlrun/utils/notifications/notification_pusher.py +377 -56
- mlrun/utils/regex.py +6 -1
- mlrun/utils/singleton.py +1 -1
- mlrun/utils/v3io_clients.py +1 -1
- mlrun/utils/vault.py +270 -269
- mlrun/utils/version/__init__.py +1 -1
- mlrun/utils/version/version.json +2 -2
- mlrun/utils/version/version.py +1 -1
- {mlrun-1.3.3.dist-info → mlrun-1.4.0.dist-info}/METADATA +16 -10
- mlrun-1.4.0.dist-info/RECORD +434 -0
- mlrun/api/api/endpoints/marketplace.py +0 -257
- mlrun/api/crud/marketplace.py +0 -221
- mlrun/api/crud/model_monitoring/model_endpoint_store.py +0 -847
- mlrun/api/db/filedb/db.py +0 -518
- mlrun/api/schemas/marketplace.py +0 -128
- mlrun/api/schemas/model_endpoints.py +0 -185
- mlrun/db/filedb.py +0 -891
- mlrun/feature_store/retrieval/online.py +0 -92
- mlrun/model_monitoring/constants.py +0 -67
- mlrun/runtimes/package/context_handler.py +0 -711
- mlrun/runtimes/sparkjob/spark2job.py +0 -59
- mlrun-1.3.3.dist-info/RECORD +0 -381
- {mlrun-1.3.3.dist-info → mlrun-1.4.0.dist-info}/LICENSE +0 -0
- {mlrun-1.3.3.dist-info → mlrun-1.4.0.dist-info}/WHEEL +0 -0
- {mlrun-1.3.3.dist-info → mlrun-1.4.0.dist-info}/entry_points.txt +0 -0
- {mlrun-1.3.3.dist-info → mlrun-1.4.0.dist-info}/top_level.txt +0 -0
Expanded file diff — mlrun/feature_store/retrieval/base.py (+388 -66):

--- a/mlrun/feature_store/retrieval/base.py
+++ b/mlrun/feature_store/retrieval/base.py
@@ -1,4 +1,4 @@
-# Copyright
+# Copyright 2023 Iguazio
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,16 +13,30 @@
 # limitations under the License.
 #
 import abc
+import typing
+from datetime import datetime
+
+import dask.dataframe as dd
+import pandas as pd
 
 import mlrun
 from mlrun.datastore.targets import CSVTarget, ParquetTarget
+from mlrun.feature_store.feature_set import FeatureSet
+from mlrun.feature_store.feature_vector import Feature
 
-from ...utils import logger
+from ...utils import logger, str_to_timestamp
+from ..feature_vector import OfflineVectorResponse
 
 
 class BaseMerger(abc.ABC):
     """abstract feature merger class"""
 
+    # In order to be an online merger, the merger should implement `init_online_vector_service` function.
+    support_online = False
+
+    # In order to be an offline merger, the merger should implement
+    # `_order_by`, `_filter`, `_drop_columns_from_result`, `_rename_columns_and_select`, `_get_engine_df` functions.
+    support_offline = False
     engine = None
 
     def __init__(self, vector, **engine_args):
@@ -36,6 +50,8 @@ class BaseMerger(abc.ABC):
         self._drop_indexes = True
         self._target = None
         self._alias = dict()
+        self._origin_alias = dict()
+        self._entity_rows_node_name = "__mlrun__$entity_rows$"
 
     def _append_drop_column(self, key):
         if key and key not in self._drop_columns:
@@ -67,22 +83,19 @@ class BaseMerger(abc.ABC):
         drop_columns=None,
         start_time=None,
         end_time=None,
+        timestamp_for_filtering=None,
         with_indexes=None,
         update_stats=None,
         query=None,
-
+        order_by=None,
     ):
         self._target = target
-        self._join_type = join_type
 
         # calculate the index columns and columns we need to drop
         self._drop_columns = drop_columns or self._drop_columns
         if self.vector.spec.with_indexes or with_indexes:
             self._drop_indexes = False
 
-        if entity_timestamp_column and self._drop_indexes:
-            self._append_drop_column(entity_timestamp_column)
-
         # retrieve the feature set objects/fields needed for the vector
         feature_set_objects, feature_set_fields = self.vector.parse_features(
             update_stats=update_stats
@@ -96,23 +109,34 @@ class BaseMerger(abc.ABC):
             # update the feature vector objects with refreshed stats
             self.vector.save()
 
+        if self._drop_indexes and entity_timestamp_column:
+            self._append_drop_column(entity_timestamp_column)
+
         for feature_set in feature_set_objects.values():
-            if
+            if self._drop_indexes:
                 self._append_drop_column(feature_set.spec.timestamp_key)
             for key in feature_set.spec.entities.keys():
                 self._append_index(key)
 
-
+        start_time = str_to_timestamp(start_time)
+        end_time = str_to_timestamp(end_time)
+        if start_time and not end_time:
+            # if end_time is not specified set it to now()
+            end_time = pd.Timestamp.now()
+
+        return self._generate_offline_vector(
             entity_rows,
             entity_timestamp_column,
             feature_set_objects=feature_set_objects,
            feature_set_fields=feature_set_fields,
             start_time=start_time,
             end_time=end_time,
+            timestamp_for_filtering=timestamp_for_filtering,
             query=query,
+            order_by=order_by,
         )
 
-    def
+    def _write_to_offline_target(self):
         if self._target:
             is_persistent_vector = self.vector.metadata.name is not None
             if not self._target.path and not is_persistent_vector:
@@ -125,6 +149,14 @@ class BaseMerger(abc.ABC):
             target_status = self._target.update_resource_status("ready", size=size)
             logger.info(f"wrote target: {target_status}")
             self.vector.save()
+        if not self._drop_indexes:
+            self.vector.spec.entity_fields = [
+                Feature(name=feature, value_type=self._result_df[feature].dtype)
+                if self._result_df[feature].dtype.name != "object"
+                else Feature(name=feature, value_type="str")
+                for feature in self._index_columns
+            ]
+            self.vector.save()
 
     def _set_indexes(self, df):
         if self._index_columns and not self._drop_indexes:
@@ -134,29 +166,16 @@ class BaseMerger(abc.ABC):
                 if index not in df.columns:
                     index_columns_missing.append(index)
             if not index_columns_missing:
-
-                df.set_index(self._index_columns, inplace=True)
-            elif self.engine == "dask":
-                if len(self._index_columns) == 1:
-                    return df.set_index(self._index_columns[0])
-                elif len(self._index_columns) != 1:
-                    return self._reset_index(self._result_df)
-                else:
-                    logger.info(
-                        "The entities will stay as columns because "
-                        "Dask dataframe does not yet support multi-indexes"
-                    )
-                    return self._result_df
+                df.set_index(self._index_columns, inplace=True)
             else:
                 logger.warn(
                     f"Can't set index, not all index columns found: {index_columns_missing}. "
                     f"It is possible that column was already indexed."
                 )
-
-
+        else:
+            df.reset_index(drop=True, inplace=True)
 
-
-    def _generate_vector(
+    def _generate_offline_vector(
         self,
         entity_rows,
         entity_timestamp_column,
@@ -164,9 +183,203 @@ class BaseMerger(abc.ABC):
         feature_set_fields,
         start_time=None,
         end_time=None,
+        timestamp_for_filtering=None,
         query=None,
+        order_by=None,
     ):
-
+        self._create_engine_env()
+
+        feature_sets = []
+        dfs = []
+        keys = (
+            []
+        )  # the struct of key is [[[],[]], ..] So that each record indicates which way the corresponding
+        # featureset is connected to the previous one, and within each record the left keys are indicated in index 0
+        # and the right keys in index 1, this keys will be the keys that will be used in this join
+
+        fs_link_list = self._create_linked_relation_list(
+            feature_set_objects, feature_set_fields
+        )
+
+        filtered = False
+        for node in fs_link_list:
+            name = node.name
+            feature_set = feature_set_objects[name]
+            feature_sets.append(feature_set)
+            columns = feature_set_fields[name]
+            self._origin_alias.update({name: alias for name, alias in columns})
+            column_names = [name for name, _ in columns]
+
+            for column in node.data["save_cols"]:
+                if column not in column_names:
+                    column_names.append(column)
+                if column not in self._index_columns:
+                    self._append_drop_column(column)
+
+            if isinstance(timestamp_for_filtering, dict):
+                time_column = timestamp_for_filtering.get(
+                    name, feature_set.spec.timestamp_key
+                )
+            elif isinstance(timestamp_for_filtering, str):
+                time_column = timestamp_for_filtering
+            else:
+                time_column = feature_set.spec.timestamp_key
+
+            if time_column != feature_set.spec.timestamp_key and time_column not in [
+                feature.name for feature in feature_set.spec.features
+            ]:
+                raise mlrun.errors.MLRunInvalidArgumentError(
+                    f"Feature set `{name}` "
+                    f"does not have a column named `{time_column}` to filter on."
+                )
+
+            if self._drop_indexes:
+                self._append_drop_column(time_column)
+            if (start_time or end_time) and time_column:
+                filtered = True
+
+            df = self._get_engine_df(
+                feature_set,
+                name,
+                column_names,
+                start_time if time_column else None,
+                end_time if time_column else None,
+                time_column,
+            )
+
+            column_names += node.data["save_index"]
+            node.data["save_cols"] += node.data["save_index"]
+            fs_entities_and_timestamp = list(feature_set.spec.entities.keys())
+            if feature_set.spec.timestamp_key:
+                column_names.append(feature_set.spec.timestamp_key)
+                node.data["save_cols"].append(feature_set.spec.timestamp_key)
+                fs_entities_and_timestamp.append(feature_set.spec.timestamp_key)
+
+            # rename columns to be unique for each feature set and select if needed
+            rename_col_dict = {
+                column: f"{column}_{name}"
+                for column in column_names
+                if column not in node.data["save_cols"]
+            }
+            df_temp = self._rename_columns_and_select(
+                df,
+                rename_col_dict,
+                columns=list(set(column_names + fs_entities_and_timestamp)),
+            )
+
+            if df_temp is not None:
+                df = df_temp
+                del df_temp
+
+            dfs.append(df)
+            del df
+
+            keys.append([node.data["left_keys"], node.data["right_keys"]])
+
+            # update alias according to the unique column name
+            new_columns = []
+            if not self._drop_indexes:
+                new_columns.extend([(ind, ind) for ind in fs_entities_and_timestamp])
+            for column, alias in columns:
+                if column in rename_col_dict:
+                    new_columns.append((rename_col_dict[column], alias or column))
+                else:
+                    new_columns.append((column, alias))
+            self._update_alias(dictionary={name: alias for name, alias in new_columns})
+
+        # None of the feature sets was filtered as required
+        if not filtered and (start_time or end_time):
+            raise mlrun.errors.MLRunRuntimeError(
+                "start_time and end_time can only be provided in conjunction with "
+                "a timestamp column, or when the at least one feature_set has a timestamp key"
+            )
+        # convert pandas entity_rows to spark\dask DF if needed
+        if (
+            entity_rows is not None
+            and not hasattr(entity_rows, "rdd")
+            and self.engine == "spark"
+        ):
+            entity_rows = self.spark.createDataFrame(entity_rows)
+        elif (
+            entity_rows is not None
+            and not hasattr(entity_rows, "dask")
+            and self.engine == "dask"
+        ):
+            entity_rows = dd.from_pandas(
+                entity_rows, npartitions=len(entity_rows.columns)
+            )
+
+        # join the feature data frames
+        result_timestamp = self.merge(
+            entity_df=entity_rows,
+            entity_timestamp_column=entity_timestamp_column
+            if entity_rows is not None
+            else None,
+            featuresets=feature_sets,
+            featureset_dfs=dfs,
+            keys=keys,
+        )
+
+        all_columns = None
+        if not self._drop_indexes and result_timestamp:
+            if result_timestamp not in self._alias.values():
+                self._update_alias(key=result_timestamp, val=result_timestamp)
+            all_columns = list(self._alias.keys())
+
+        df_temp = self._rename_columns_and_select(
+            self._result_df, self._alias, columns=all_columns
+        )
+        if df_temp is not None:
+            self._result_df = df_temp
+            del df_temp
+
+        df_temp = self._drop_columns_from_result()
+        if df_temp is not None:
+            self._result_df = df_temp
+            del df_temp
+
+        if self.vector.status.label_column:
+            self._result_df = self._result_df.dropna(
+                subset=[self.vector.status.label_column]
+            )
+        # filter joined data frame by the query param
+        if query:
+            self._filter(query)
+
+        if order_by:
+            if isinstance(order_by, str):
+                order_by = [order_by]
+            order_by_active = [
+                order_col
+                if order_col in self._result_df.columns
+                else self._origin_alias.get(order_col, None)
+                for order_col in order_by
+            ]
+            if None in order_by_active:
+                raise mlrun.errors.MLRunInvalidArgumentError(
+                    f"Result dataframe contains {self._result_df.columns} "
+                    f"columns and can't order by {order_by}"
+                )
+            self._order_by(order_by_active)
+
+        self._write_to_offline_target()
+        return OfflineVectorResponse(self)
+
+    def init_online_vector_service(
+        self, entity_keys, fixed_window_type, update_stats=False
+    ):
+        """
+        initialize the `OnlineVectorService`
+
+        :param entity_keys:       list of the feature_vector indexes.
+        :param fixed_window_type: determines how to query the fixed window values which were previously
+                                  inserted by ingest
+        :param update_stats:      update features statistics from the requested feature sets on the vector.
+                                  Default: False.
+
+        :return: `OnlineVectorService`
+        """
+        raise NotImplementedError
 
     def _unpersist_df(self, df):
         pass
@@ -178,7 +391,6 @@ class BaseMerger(abc.ABC):
         featuresets: list,
         featureset_dfs: list,
         keys: list = None,
-        all_columns: list = None,
     ):
         """join the entities and feature set features into a result dataframe"""
         merged_df = entity_df
@@ -190,10 +402,6 @@ class BaseMerger(abc.ABC):
         else:
             # keys can be multiple keys on each side of the join
             keys = [[[], []]] * len(featureset_dfs)
-        if all_columns is not None:
-            all_columns.pop(0)
-        else:
-            all_columns = [[]] * len(featureset_dfs)
         entity_timestamp_column = (
             entity_timestamp_column or featureset.spec.timestamp_key
         )
@@ -203,16 +411,9 @@ class BaseMerger(abc.ABC):
        # and it can join only by the entities of the first `featureset`
         keys[0][0] = keys[0][1] = list(featuresets[0].spec.entities.keys())
 
-        for featureset, featureset_df, lr_key
-
-        ):
-            if featureset.spec.timestamp_key:
+        for featureset, featureset_df, lr_key in zip(featuresets, featureset_dfs, keys):
+            if featureset.spec.timestamp_key and entity_timestamp_column:
                 merge_func = self._asof_join
-                if self._join_type != "inner":
-                    logger.warn(
-                        "Merge all the features with as_of_join and don't "
-                        "take into account the join_type that was given"
-                    )
             else:
                 merge_func = self._join
 
@@ -223,7 +424,9 @@ class BaseMerger(abc.ABC):
                 featureset_df,
                 lr_key[0],
                 lr_key[1],
-
+            )
+            entity_timestamp_column = (
+                entity_timestamp_column or featureset.spec.timestamp_key
             )
 
             # unpersist as required by the implementation (e.g. spark) and delete references
@@ -232,8 +435,8 @@ class BaseMerger(abc.ABC):
             del featureset_df
 
         self._result_df = merged_df
+        return entity_timestamp_column
 
-    @abc.abstractmethod
     def _asof_join(
         self,
         entity_df,
@@ -242,11 +445,9 @@ class BaseMerger(abc.ABC):
         featureset_df,
         left_keys: list,
         right_keys: list,
-        columns: list,
     ):
         raise NotImplementedError("_asof_join() operation not implemented in class")
 
-    @abc.abstractmethod
     def _join(
         self,
         entity_df,
@@ -255,7 +456,6 @@ class BaseMerger(abc.ABC):
         featureset_df,
         left_keys: list,
         right_keys: list,
-        columns: list,
     ):
         raise NotImplementedError("_join() operation not implemented in class")
 
@@ -267,6 +467,7 @@ class BaseMerger(abc.ABC):
 
     def get_df(self, to_pandas=True):
         """return the result as a dataframe (pandas by default)"""
+        self._set_indexes(self._result_df)
         return self._result_df
 
     def to_parquet(self, target_path, **kw):
@@ -293,6 +494,9 @@ class BaseMerger(abc.ABC):
         def __eq__(self, other):
             return self.name == other.name
 
+        def __copy__(self):
+            return BaseMerger._Node(self.name, self.order, self.data.copy())
+
     class _LinkedList:
         def __init__(self, head=None):
             self.head = head
@@ -313,6 +517,19 @@ class BaseMerger(abc.ABC):
                 yield node
                 node = node.next
 
+        def __copy__(self):
+            ll = BaseMerger._LinkedList()
+            prev_node = None
+            for node in self:
+                new_node = node.__copy__()
+                if ll.head is None:
+                    ll.head = new_node
+                else:
+                    prev_node.next = new_node
+                prev_node = new_node
+            ll.len = self.len
+            return ll
+
         def add_first(self, node):
             node.next = self.head
             self.head = node
@@ -325,7 +542,9 @@ class BaseMerger(abc.ABC):
             for current_node in self:
                 pass
             current_node.next = node
-
+            while node:
+                self.len += 1
+                node = node.next
 
         def add_after(self, target_node, new_node):
             new_node.next = target_node.next
@@ -346,7 +565,9 @@ class BaseMerger(abc.ABC):
             node = self.find_node(other_head.name)
             if node is None:
                 return
-
+            for col in other_head.data["save_cols"]:
+                if col not in node.data["save_cols"]:
+                    node.data["save_cols"].append(col)
             for other_node in other_iter:
                 if self.find_node(other_node.name) is None:
                     while node is not None and other_node.order > node.order:
@@ -357,10 +578,11 @@ class BaseMerger(abc.ABC):
                     self.add_last(other_node)
                 node = other_node
 
-
-
+    def _create_linked_relation_list(
+        self, feature_set_objects, feature_set_fields, entity_rows_keys=None
+    ):
         feature_set_names = list(feature_set_fields.keys())
-        if len(feature_set_names) == 1:
+        if len(feature_set_names) == 1 and not entity_rows_keys:
             return BaseMerger._LinkedList(
                 head=BaseMerger._Node(
                     name=feature_set_names[0],
@@ -420,10 +642,9 @@ class BaseMerger(abc.ABC):
                 )
             )
 
-
-
-
-            if relation_wise:
+            if all(
+                curr_col_relation_list
+            ):  # checking if feature_set have relation with feature_set_in
                # add to the link list feature set according to the defined relation
                linked_list_relation.add_last(
                    BaseMerger._Node(
@@ -437,8 +658,8 @@ class BaseMerger(abc.ABC):
                         order=name_in_order,
                     )
                 )
-                linked_list_relation.head.data["save_cols"].
-
+                linked_list_relation.head.data["save_cols"].extend(
+                    curr_col_relation_list
                 )
             elif name_in_order > head_order and sorted(
                 feature_set_in_entity_list_names
@@ -460,26 +681,127 @@ class BaseMerger(abc.ABC):
             linked_list_relation.head.data["save_index"] = keys
             return linked_list_relation
 
+        def _build_entity_rows_relation(entity_rows_relation, fs_name, fs_order):
+            feature_set_entity_list = feature_set_entity_list_dict[fs_name]
+            feature_set_entity_list_names = list(feature_set_entity_list.keys())
+
+            if all([ent in entity_rows_keys for ent in feature_set_entity_list_names]):
+                # add to the link list feature set according to indexes match,
+                # only if all entities in the feature set exist in the entity rows
+                keys = feature_set_entity_list_names
+                entity_rows_relation.add_last(
+                    BaseMerger._Node(
+                        fs_name,
+                        data={
+                            "left_keys": keys,
+                            "right_keys": keys,
+                            "save_cols": [],
+                            "save_index": keys,
+                        },
+                        order=fs_order,
+                    )
+                )
+                entity_rows_relation.head.data["save_index"] = keys
+
+        if entity_rows_keys is not None:
+            entity_rows_linked_relation = _create_relation(
+                self._entity_rows_node_name, -1
+            )
+            relation_linked_lists.append(entity_rows_linked_relation)
+            linked_list_len_goal = len(feature_set_objects) + 1
+        else:
+            entity_rows_linked_relation = None
+            linked_list_len_goal = len(feature_set_objects)
+
         for i, name in enumerate(feature_set_names):
             linked_relation = _create_relation(name, i)
+            if entity_rows_linked_relation is not None:
+                _build_entity_rows_relation(entity_rows_linked_relation, name, i)
             for j, name_in in enumerate(feature_set_names):
                 if name != name_in:
                     linked_relation = _build_relation(name_in, j, linked_relation, i)
             relation_linked_lists.append(linked_relation)
 
         # concat all the link lists to one, for the merging process
-
-
-
-
-
-
+        for i in range(len(relation_linked_lists)):
+            return_relation = relation_linked_lists[i].__copy__()
+            for relation_list in relation_linked_lists:
+                return_relation.concat(relation_list)
+            if return_relation.len == linked_list_len_goal:
+                return return_relation
 
-
+        raise mlrun.errors.MLRunRuntimeError("Failed to merge")
 
-    @classmethod
     def get_default_image(cls, kind):
         return mlrun.mlconf.feature_store.default_job_image
 
     def _reset_index(self, _result_df):
         raise NotImplementedError
+
+    def _create_engine_env(self):
+        """
+        initialize engine env if needed
+        """
+        raise NotImplementedError
+
+    def _get_engine_df(
+        self,
+        feature_set: FeatureSet,
+        feature_set_name: typing.List[str],
+        column_names: typing.List[str] = None,
+        start_time: typing.Union[str, datetime] = None,
+        end_time: typing.Union[str, datetime] = None,
+        time_column: typing.Optional[str] = None,
+    ):
+        """
+        Return the feature_set data frame according to the args
+
+        :param feature_set:      current feature_set to extract from the data frame
+        :param feature_set_name: the name of the current feature_set
+        :param column_names:     list of columns to select (if not all)
+        :param start_time:       filter by start time
+        :param end_time:         filter by end time
+        :param time_column:      specify the time column name to filter on
+
+        :return: Data frame of the current engine
+        """
+        raise NotImplementedError
+
+    def _rename_columns_and_select(
+        self,
+        df,
+        rename_col_dict: typing.Dict[str, str],
+        columns: typing.List[str] = None,
+    ):
+        """
+        rename the columns of the df according to rename_col_dict, and select only `columns` if it is not none
+
+        :param df:              the data frame to change
+        :param rename_col_dict: the renaming dictionary - {<current_column_name>: <new_column_name>, ...}
+        :param columns:         list of columns to select (if not all)
+
+        :return: the data frame after the transformation or None if the transformation were preformed inplace
+        """
+        raise NotImplementedError
+
+    def _drop_columns_from_result(self):
+        """
+        drop `self._drop_columns` from `self._result_df`
+        """
+        raise NotImplementedError
+
+    def _filter(self, query: str):
+        """
+        filter `self._result_df` by `query`
+
+        :param query: The query string used to filter rows
+        """
+        raise NotImplementedError
+
+    def _order_by(self, order_by_active: typing.List[str]):
+        """
+        Order by `order_by_active` along all axis.

        :param order_by_active: list of names to sort by.
        """
        raise NotImplementedError