oracle-ads 2.13.9rc0__py3-none-any.whl → 2.13.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ads/aqua/__init__.py +40 -0
- ads/aqua/app.py +507 -0
- ads/aqua/cli.py +96 -0
- ads/aqua/client/__init__.py +3 -0
- ads/aqua/client/client.py +836 -0
- ads/aqua/client/openai_client.py +305 -0
- ads/aqua/common/__init__.py +5 -0
- ads/aqua/common/decorator.py +125 -0
- ads/aqua/common/entities.py +274 -0
- ads/aqua/common/enums.py +134 -0
- ads/aqua/common/errors.py +109 -0
- ads/aqua/common/utils.py +1295 -0
- ads/aqua/config/__init__.py +4 -0
- ads/aqua/config/container_config.py +246 -0
- ads/aqua/config/evaluation/__init__.py +4 -0
- ads/aqua/config/evaluation/evaluation_service_config.py +147 -0
- ads/aqua/config/utils/__init__.py +4 -0
- ads/aqua/config/utils/serializer.py +339 -0
- ads/aqua/constants.py +116 -0
- ads/aqua/data.py +14 -0
- ads/aqua/dummy_data/icon.txt +1 -0
- ads/aqua/dummy_data/oci_model_deployments.json +56 -0
- ads/aqua/dummy_data/oci_models.json +1 -0
- ads/aqua/dummy_data/readme.md +26 -0
- ads/aqua/evaluation/__init__.py +8 -0
- ads/aqua/evaluation/constants.py +53 -0
- ads/aqua/evaluation/entities.py +186 -0
- ads/aqua/evaluation/errors.py +70 -0
- ads/aqua/evaluation/evaluation.py +1814 -0
- ads/aqua/extension/__init__.py +42 -0
- ads/aqua/extension/aqua_ws_msg_handler.py +76 -0
- ads/aqua/extension/base_handler.py +90 -0
- ads/aqua/extension/common_handler.py +121 -0
- ads/aqua/extension/common_ws_msg_handler.py +36 -0
- ads/aqua/extension/deployment_handler.py +381 -0
- ads/aqua/extension/deployment_ws_msg_handler.py +54 -0
- ads/aqua/extension/errors.py +30 -0
- ads/aqua/extension/evaluation_handler.py +129 -0
- ads/aqua/extension/evaluation_ws_msg_handler.py +61 -0
- ads/aqua/extension/finetune_handler.py +96 -0
- ads/aqua/extension/model_handler.py +390 -0
- ads/aqua/extension/models/__init__.py +0 -0
- ads/aqua/extension/models/ws_models.py +145 -0
- ads/aqua/extension/models_ws_msg_handler.py +50 -0
- ads/aqua/extension/ui_handler.py +300 -0
- ads/aqua/extension/ui_websocket_handler.py +130 -0
- ads/aqua/extension/utils.py +133 -0
- ads/aqua/finetuning/__init__.py +7 -0
- ads/aqua/finetuning/constants.py +23 -0
- ads/aqua/finetuning/entities.py +181 -0
- ads/aqua/finetuning/finetuning.py +749 -0
- ads/aqua/model/__init__.py +8 -0
- ads/aqua/model/constants.py +60 -0
- ads/aqua/model/entities.py +385 -0
- ads/aqua/model/enums.py +32 -0
- ads/aqua/model/model.py +2134 -0
- ads/aqua/model/utils.py +52 -0
- ads/aqua/modeldeployment/__init__.py +6 -0
- ads/aqua/modeldeployment/constants.py +10 -0
- ads/aqua/modeldeployment/deployment.py +1315 -0
- ads/aqua/modeldeployment/entities.py +653 -0
- ads/aqua/modeldeployment/utils.py +543 -0
- ads/aqua/resources/gpu_shapes_index.json +94 -0
- ads/aqua/server/__init__.py +4 -0
- ads/aqua/server/__main__.py +24 -0
- ads/aqua/server/app.py +47 -0
- ads/aqua/server/aqua_spec.yml +1291 -0
- ads/aqua/training/__init__.py +4 -0
- ads/aqua/training/exceptions.py +476 -0
- ads/aqua/ui.py +519 -0
- ads/automl/__init__.py +9 -0
- ads/automl/driver.py +330 -0
- ads/automl/provider.py +975 -0
- ads/bds/__init__.py +5 -0
- ads/bds/auth.py +127 -0
- ads/bds/big_data_service.py +255 -0
- ads/catalog/__init__.py +19 -0
- ads/catalog/model.py +1576 -0
- ads/catalog/notebook.py +461 -0
- ads/catalog/project.py +468 -0
- ads/catalog/summary.py +178 -0
- ads/common/__init__.py +11 -0
- ads/common/analyzer.py +65 -0
- ads/common/artifact/.model-ignore +63 -0
- ads/common/artifact/__init__.py +10 -0
- ads/common/auth.py +1122 -0
- ads/common/card_identifier.py +83 -0
- ads/common/config.py +647 -0
- ads/common/data.py +165 -0
- ads/common/decorator/__init__.py +9 -0
- ads/common/decorator/argument_to_case.py +88 -0
- ads/common/decorator/deprecate.py +69 -0
- ads/common/decorator/require_nonempty_arg.py +65 -0
- ads/common/decorator/runtime_dependency.py +178 -0
- ads/common/decorator/threaded.py +97 -0
- ads/common/decorator/utils.py +35 -0
- ads/common/dsc_file_system.py +303 -0
- ads/common/error.py +14 -0
- ads/common/extended_enum.py +81 -0
- ads/common/function/__init__.py +5 -0
- ads/common/function/fn_util.py +142 -0
- ads/common/function/func_conf.yaml +25 -0
- ads/common/ipython.py +76 -0
- ads/common/model.py +679 -0
- ads/common/model_artifact.py +1759 -0
- ads/common/model_artifact_schema.json +107 -0
- ads/common/model_export_util.py +664 -0
- ads/common/model_metadata.py +24 -0
- ads/common/object_storage_details.py +296 -0
- ads/common/oci_client.py +179 -0
- ads/common/oci_datascience.py +46 -0
- ads/common/oci_logging.py +1144 -0
- ads/common/oci_mixin.py +957 -0
- ads/common/oci_resource.py +136 -0
- ads/common/serializer.py +559 -0
- ads/common/utils.py +1852 -0
- ads/common/word_lists.py +1491 -0
- ads/common/work_request.py +189 -0
- ads/config.py +1 -0
- ads/data_labeling/__init__.py +13 -0
- ads/data_labeling/boundingbox.py +253 -0
- ads/data_labeling/constants.py +47 -0
- ads/data_labeling/data_labeling_service.py +244 -0
- ads/data_labeling/interface/__init__.py +5 -0
- ads/data_labeling/interface/loader.py +16 -0
- ads/data_labeling/interface/parser.py +16 -0
- ads/data_labeling/interface/reader.py +23 -0
- ads/data_labeling/loader/__init__.py +5 -0
- ads/data_labeling/loader/file_loader.py +241 -0
- ads/data_labeling/metadata.py +110 -0
- ads/data_labeling/mixin/__init__.py +5 -0
- ads/data_labeling/mixin/data_labeling.py +232 -0
- ads/data_labeling/ner.py +129 -0
- ads/data_labeling/parser/__init__.py +5 -0
- ads/data_labeling/parser/dls_record_parser.py +388 -0
- ads/data_labeling/parser/export_metadata_parser.py +94 -0
- ads/data_labeling/parser/export_record_parser.py +473 -0
- ads/data_labeling/reader/__init__.py +5 -0
- ads/data_labeling/reader/dataset_reader.py +574 -0
- ads/data_labeling/reader/dls_record_reader.py +121 -0
- ads/data_labeling/reader/export_record_reader.py +62 -0
- ads/data_labeling/reader/jsonl_reader.py +75 -0
- ads/data_labeling/reader/metadata_reader.py +203 -0
- ads/data_labeling/reader/record_reader.py +263 -0
- ads/data_labeling/record.py +52 -0
- ads/data_labeling/visualizer/__init__.py +5 -0
- ads/data_labeling/visualizer/image_visualizer.py +525 -0
- ads/data_labeling/visualizer/text_visualizer.py +357 -0
- ads/database/__init__.py +5 -0
- ads/database/connection.py +338 -0
- ads/dataset/__init__.py +10 -0
- ads/dataset/capabilities.md +51 -0
- ads/dataset/classification_dataset.py +339 -0
- ads/dataset/correlation.py +226 -0
- ads/dataset/correlation_plot.py +563 -0
- ads/dataset/dask_series.py +173 -0
- ads/dataset/dataframe_transformer.py +110 -0
- ads/dataset/dataset.py +1979 -0
- ads/dataset/dataset_browser.py +360 -0
- ads/dataset/dataset_with_target.py +995 -0
- ads/dataset/exception.py +25 -0
- ads/dataset/factory.py +987 -0
- ads/dataset/feature_engineering_transformer.py +35 -0
- ads/dataset/feature_selection.py +107 -0
- ads/dataset/forecasting_dataset.py +26 -0
- ads/dataset/helper.py +1450 -0
- ads/dataset/label_encoder.py +99 -0
- ads/dataset/mixin/__init__.py +5 -0
- ads/dataset/mixin/dataset_accessor.py +134 -0
- ads/dataset/pipeline.py +58 -0
- ads/dataset/plot.py +710 -0
- ads/dataset/progress.py +86 -0
- ads/dataset/recommendation.py +297 -0
- ads/dataset/recommendation_transformer.py +502 -0
- ads/dataset/regression_dataset.py +14 -0
- ads/dataset/sampled_dataset.py +1050 -0
- ads/dataset/target.py +98 -0
- ads/dataset/timeseries.py +18 -0
- ads/dbmixin/__init__.py +5 -0
- ads/dbmixin/db_pandas_accessor.py +153 -0
- ads/environment/__init__.py +9 -0
- ads/environment/ml_runtime.py +66 -0
- ads/evaluations/README.md +14 -0
- ads/evaluations/__init__.py +109 -0
- ads/evaluations/evaluation_plot.py +983 -0
- ads/evaluations/evaluator.py +1334 -0
- ads/evaluations/statistical_metrics.py +543 -0
- ads/experiments/__init__.py +9 -0
- ads/experiments/capabilities.md +0 -0
- ads/explanations/__init__.py +21 -0
- ads/explanations/base_explainer.py +142 -0
- ads/explanations/capabilities.md +83 -0
- ads/explanations/explainer.py +190 -0
- ads/explanations/mlx_global_explainer.py +1050 -0
- ads/explanations/mlx_interface.py +386 -0
- ads/explanations/mlx_local_explainer.py +287 -0
- ads/explanations/mlx_whatif_explainer.py +201 -0
- ads/feature_engineering/__init__.py +20 -0
- ads/feature_engineering/accessor/__init__.py +5 -0
- ads/feature_engineering/accessor/dataframe_accessor.py +535 -0
- ads/feature_engineering/accessor/mixin/__init__.py +5 -0
- ads/feature_engineering/accessor/mixin/correlation.py +166 -0
- ads/feature_engineering/accessor/mixin/eda_mixin.py +266 -0
- ads/feature_engineering/accessor/mixin/eda_mixin_series.py +85 -0
- ads/feature_engineering/accessor/mixin/feature_types_mixin.py +211 -0
- ads/feature_engineering/accessor/mixin/utils.py +65 -0
- ads/feature_engineering/accessor/series_accessor.py +431 -0
- ads/feature_engineering/adsimage/__init__.py +5 -0
- ads/feature_engineering/adsimage/image.py +192 -0
- ads/feature_engineering/adsimage/image_reader.py +170 -0
- ads/feature_engineering/adsimage/interface/__init__.py +5 -0
- ads/feature_engineering/adsimage/interface/reader.py +19 -0
- ads/feature_engineering/adsstring/__init__.py +7 -0
- ads/feature_engineering/adsstring/oci_language/__init__.py +8 -0
- ads/feature_engineering/adsstring/string/__init__.py +8 -0
- ads/feature_engineering/data_schema.json +57 -0
- ads/feature_engineering/dataset/__init__.py +5 -0
- ads/feature_engineering/dataset/zip_code_data.py +42062 -0
- ads/feature_engineering/exceptions.py +40 -0
- ads/feature_engineering/feature_type/__init__.py +133 -0
- ads/feature_engineering/feature_type/address.py +184 -0
- ads/feature_engineering/feature_type/adsstring/__init__.py +5 -0
- ads/feature_engineering/feature_type/adsstring/common_regex_mixin.py +164 -0
- ads/feature_engineering/feature_type/adsstring/oci_language.py +93 -0
- ads/feature_engineering/feature_type/adsstring/parsers/__init__.py +5 -0
- ads/feature_engineering/feature_type/adsstring/parsers/base.py +47 -0
- ads/feature_engineering/feature_type/adsstring/parsers/nltk_parser.py +96 -0
- ads/feature_engineering/feature_type/adsstring/parsers/spacy_parser.py +221 -0
- ads/feature_engineering/feature_type/adsstring/string.py +258 -0
- ads/feature_engineering/feature_type/base.py +58 -0
- ads/feature_engineering/feature_type/boolean.py +183 -0
- ads/feature_engineering/feature_type/category.py +146 -0
- ads/feature_engineering/feature_type/constant.py +137 -0
- ads/feature_engineering/feature_type/continuous.py +151 -0
- ads/feature_engineering/feature_type/creditcard.py +314 -0
- ads/feature_engineering/feature_type/datetime.py +190 -0
- ads/feature_engineering/feature_type/discrete.py +134 -0
- ads/feature_engineering/feature_type/document.py +43 -0
- ads/feature_engineering/feature_type/gis.py +251 -0
- ads/feature_engineering/feature_type/handler/__init__.py +5 -0
- ads/feature_engineering/feature_type/handler/feature_validator.py +524 -0
- ads/feature_engineering/feature_type/handler/feature_warning.py +319 -0
- ads/feature_engineering/feature_type/handler/warnings.py +128 -0
- ads/feature_engineering/feature_type/integer.py +142 -0
- ads/feature_engineering/feature_type/ip_address.py +144 -0
- ads/feature_engineering/feature_type/ip_address_v4.py +138 -0
- ads/feature_engineering/feature_type/ip_address_v6.py +138 -0
- ads/feature_engineering/feature_type/lat_long.py +256 -0
- ads/feature_engineering/feature_type/object.py +43 -0
- ads/feature_engineering/feature_type/ordinal.py +132 -0
- ads/feature_engineering/feature_type/phone_number.py +135 -0
- ads/feature_engineering/feature_type/string.py +171 -0
- ads/feature_engineering/feature_type/text.py +93 -0
- ads/feature_engineering/feature_type/unknown.py +43 -0
- ads/feature_engineering/feature_type/zip_code.py +164 -0
- ads/feature_engineering/feature_type_manager.py +406 -0
- ads/feature_engineering/schema.py +795 -0
- ads/feature_engineering/utils.py +245 -0
- ads/feature_store/.readthedocs.yaml +19 -0
- ads/feature_store/README.md +65 -0
- ads/feature_store/__init__.py +9 -0
- ads/feature_store/common/__init__.py +0 -0
- ads/feature_store/common/enums.py +339 -0
- ads/feature_store/common/exceptions.py +18 -0
- ads/feature_store/common/spark_session_singleton.py +125 -0
- ads/feature_store/common/utils/__init__.py +0 -0
- ads/feature_store/common/utils/base64_encoder_decoder.py +72 -0
- ads/feature_store/common/utils/feature_schema_mapper.py +283 -0
- ads/feature_store/common/utils/transformation_utils.py +82 -0
- ads/feature_store/common/utils/utility.py +403 -0
- ads/feature_store/data_validation/__init__.py +0 -0
- ads/feature_store/data_validation/great_expectation.py +129 -0
- ads/feature_store/dataset.py +1230 -0
- ads/feature_store/dataset_job.py +530 -0
- ads/feature_store/docs/Dockerfile +7 -0
- ads/feature_store/docs/Makefile +44 -0
- ads/feature_store/docs/conf.py +28 -0
- ads/feature_store/docs/requirements.txt +14 -0
- ads/feature_store/docs/source/ads.feature_store.query.rst +20 -0
- ads/feature_store/docs/source/cicd.rst +137 -0
- ads/feature_store/docs/source/conf.py +86 -0
- ads/feature_store/docs/source/data_versioning.rst +33 -0
- ads/feature_store/docs/source/dataset.rst +388 -0
- ads/feature_store/docs/source/dataset_job.rst +27 -0
- ads/feature_store/docs/source/demo.rst +70 -0
- ads/feature_store/docs/source/entity.rst +78 -0
- ads/feature_store/docs/source/feature_group.rst +624 -0
- ads/feature_store/docs/source/feature_group_job.rst +29 -0
- ads/feature_store/docs/source/feature_store.rst +122 -0
- ads/feature_store/docs/source/feature_store_class.rst +123 -0
- ads/feature_store/docs/source/feature_validation.rst +66 -0
- ads/feature_store/docs/source/figures/cicd.png +0 -0
- ads/feature_store/docs/source/figures/data_validation.png +0 -0
- ads/feature_store/docs/source/figures/data_versioning.png +0 -0
- ads/feature_store/docs/source/figures/dataset.gif +0 -0
- ads/feature_store/docs/source/figures/dataset.png +0 -0
- ads/feature_store/docs/source/figures/dataset_lineage.png +0 -0
- ads/feature_store/docs/source/figures/dataset_statistics.png +0 -0
- ads/feature_store/docs/source/figures/dataset_statistics_viz.png +0 -0
- ads/feature_store/docs/source/figures/dataset_validation_results.png +0 -0
- ads/feature_store/docs/source/figures/dataset_validation_summary.png +0 -0
- ads/feature_store/docs/source/figures/drift_monitoring.png +0 -0
- ads/feature_store/docs/source/figures/entity.png +0 -0
- ads/feature_store/docs/source/figures/feature_group.png +0 -0
- ads/feature_store/docs/source/figures/feature_group_lineage.png +0 -0
- ads/feature_store/docs/source/figures/feature_group_statistics_viz.png +0 -0
- ads/feature_store/docs/source/figures/feature_store_deployment.png +0 -0
- ads/feature_store/docs/source/figures/feature_store_overview.png +0 -0
- ads/feature_store/docs/source/figures/featuregroup.gif +0 -0
- ads/feature_store/docs/source/figures/lineage_d1.png +0 -0
- ads/feature_store/docs/source/figures/lineage_d2.png +0 -0
- ads/feature_store/docs/source/figures/lineage_fg.png +0 -0
- ads/feature_store/docs/source/figures/logo-dark-mode.png +0 -0
- ads/feature_store/docs/source/figures/logo-light-mode.png +0 -0
- ads/feature_store/docs/source/figures/overview.png +0 -0
- ads/feature_store/docs/source/figures/resource_manager.png +0 -0
- ads/feature_store/docs/source/figures/resource_manager_feature_store_stack.png +0 -0
- ads/feature_store/docs/source/figures/resource_manager_home.png +0 -0
- ads/feature_store/docs/source/figures/stats_1.png +0 -0
- ads/feature_store/docs/source/figures/stats_2.png +0 -0
- ads/feature_store/docs/source/figures/stats_d.png +0 -0
- ads/feature_store/docs/source/figures/stats_fg.png +0 -0
- ads/feature_store/docs/source/figures/transformation.png +0 -0
- ads/feature_store/docs/source/figures/transformations.gif +0 -0
- ads/feature_store/docs/source/figures/validation.png +0 -0
- ads/feature_store/docs/source/figures/validation_fg.png +0 -0
- ads/feature_store/docs/source/figures/validation_results.png +0 -0
- ads/feature_store/docs/source/figures/validation_summary.png +0 -0
- ads/feature_store/docs/source/index.rst +81 -0
- ads/feature_store/docs/source/module.rst +8 -0
- ads/feature_store/docs/source/notebook.rst +94 -0
- ads/feature_store/docs/source/overview.rst +47 -0
- ads/feature_store/docs/source/quickstart.rst +176 -0
- ads/feature_store/docs/source/release_notes.rst +194 -0
- ads/feature_store/docs/source/setup_feature_store.rst +81 -0
- ads/feature_store/docs/source/statistics.rst +58 -0
- ads/feature_store/docs/source/transformation.rst +199 -0
- ads/feature_store/docs/source/ui.rst +65 -0
- ads/feature_store/docs/source/user_guides.setup.feature_store_operator.rst +66 -0
- ads/feature_store/docs/source/user_guides.setup.helm_chart.rst +192 -0
- ads/feature_store/docs/source/user_guides.setup.terraform.rst +338 -0
- ads/feature_store/entity.py +718 -0
- ads/feature_store/execution_strategy/__init__.py +0 -0
- ads/feature_store/execution_strategy/delta_lake/__init__.py +0 -0
- ads/feature_store/execution_strategy/delta_lake/delta_lake_service.py +375 -0
- ads/feature_store/execution_strategy/engine/__init__.py +0 -0
- ads/feature_store/execution_strategy/engine/spark_engine.py +316 -0
- ads/feature_store/execution_strategy/execution_strategy.py +113 -0
- ads/feature_store/execution_strategy/execution_strategy_provider.py +47 -0
- ads/feature_store/execution_strategy/spark/__init__.py +0 -0
- ads/feature_store/execution_strategy/spark/spark_execution.py +618 -0
- ads/feature_store/feature.py +192 -0
- ads/feature_store/feature_group.py +1494 -0
- ads/feature_store/feature_group_expectation.py +346 -0
- ads/feature_store/feature_group_job.py +602 -0
- ads/feature_store/feature_lineage/__init__.py +0 -0
- ads/feature_store/feature_lineage/graphviz_service.py +180 -0
- ads/feature_store/feature_option_details.py +50 -0
- ads/feature_store/feature_statistics/__init__.py +0 -0
- ads/feature_store/feature_statistics/statistics_service.py +99 -0
- ads/feature_store/feature_store.py +699 -0
- ads/feature_store/feature_store_registrar.py +518 -0
- ads/feature_store/input_feature_detail.py +149 -0
- ads/feature_store/mixin/__init__.py +4 -0
- ads/feature_store/mixin/oci_feature_store.py +145 -0
- ads/feature_store/model_details.py +73 -0
- ads/feature_store/query/__init__.py +0 -0
- ads/feature_store/query/filter.py +266 -0
- ads/feature_store/query/generator/__init__.py +0 -0
- ads/feature_store/query/generator/query_generator.py +298 -0
- ads/feature_store/query/join.py +161 -0
- ads/feature_store/query/query.py +403 -0
- ads/feature_store/query/validator/__init__.py +0 -0
- ads/feature_store/query/validator/query_validator.py +57 -0
- ads/feature_store/response/__init__.py +0 -0
- ads/feature_store/response/response_builder.py +68 -0
- ads/feature_store/service/__init__.py +0 -0
- ads/feature_store/service/oci_dataset.py +139 -0
- ads/feature_store/service/oci_dataset_job.py +199 -0
- ads/feature_store/service/oci_entity.py +125 -0
- ads/feature_store/service/oci_feature_group.py +164 -0
- ads/feature_store/service/oci_feature_group_job.py +214 -0
- ads/feature_store/service/oci_feature_store.py +182 -0
- ads/feature_store/service/oci_lineage.py +87 -0
- ads/feature_store/service/oci_transformation.py +104 -0
- ads/feature_store/statistics/__init__.py +0 -0
- ads/feature_store/statistics/abs_feature_value.py +49 -0
- ads/feature_store/statistics/charts/__init__.py +0 -0
- ads/feature_store/statistics/charts/abstract_feature_plot.py +37 -0
- ads/feature_store/statistics/charts/box_plot.py +148 -0
- ads/feature_store/statistics/charts/frequency_distribution.py +65 -0
- ads/feature_store/statistics/charts/probability_distribution.py +68 -0
- ads/feature_store/statistics/charts/top_k_frequent_elements.py +98 -0
- ads/feature_store/statistics/feature_stat.py +126 -0
- ads/feature_store/statistics/generic_feature_value.py +33 -0
- ads/feature_store/statistics/statistics.py +41 -0
- ads/feature_store/statistics_config.py +101 -0
- ads/feature_store/templates/feature_store_template.yaml +45 -0
- ads/feature_store/transformation.py +499 -0
- ads/feature_store/validation_output.py +57 -0
- ads/hpo/__init__.py +9 -0
- ads/hpo/_imports.py +91 -0
- ads/hpo/ads_search_space.py +439 -0
- ads/hpo/distributions.py +325 -0
- ads/hpo/objective.py +280 -0
- ads/hpo/search_cv.py +1657 -0
- ads/hpo/stopping_criterion.py +75 -0
- ads/hpo/tuner_artifact.py +413 -0
- ads/hpo/utils.py +91 -0
- ads/hpo/validation.py +140 -0
- ads/hpo/visualization/__init__.py +5 -0
- ads/hpo/visualization/_contour.py +23 -0
- ads/hpo/visualization/_edf.py +20 -0
- ads/hpo/visualization/_intermediate_values.py +21 -0
- ads/hpo/visualization/_optimization_history.py +25 -0
- ads/hpo/visualization/_parallel_coordinate.py +169 -0
- ads/hpo/visualization/_param_importances.py +26 -0
- ads/jobs/__init__.py +53 -0
- ads/jobs/ads_job.py +663 -0
- ads/jobs/builders/__init__.py +5 -0
- ads/jobs/builders/base.py +156 -0
- ads/jobs/builders/infrastructure/__init__.py +6 -0
- ads/jobs/builders/infrastructure/base.py +165 -0
- ads/jobs/builders/infrastructure/dataflow.py +1252 -0
- ads/jobs/builders/infrastructure/dsc_job.py +1894 -0
- ads/jobs/builders/infrastructure/dsc_job_runtime.py +1233 -0
- ads/jobs/builders/infrastructure/utils.py +65 -0
- ads/jobs/builders/runtimes/__init__.py +5 -0
- ads/jobs/builders/runtimes/artifact.py +338 -0
- ads/jobs/builders/runtimes/base.py +325 -0
- ads/jobs/builders/runtimes/container_runtime.py +242 -0
- ads/jobs/builders/runtimes/python_runtime.py +1016 -0
- ads/jobs/builders/runtimes/pytorch_runtime.py +204 -0
- ads/jobs/cli.py +104 -0
- ads/jobs/env_var_parser.py +131 -0
- ads/jobs/extension.py +160 -0
- ads/jobs/schema/__init__.py +5 -0
- ads/jobs/schema/infrastructure_schema.json +116 -0
- ads/jobs/schema/job_schema.json +42 -0
- ads/jobs/schema/runtime_schema.json +183 -0
- ads/jobs/schema/validator.py +141 -0
- ads/jobs/serializer.py +296 -0
- ads/jobs/templates/__init__.py +5 -0
- ads/jobs/templates/container.py +6 -0
- ads/jobs/templates/driver_notebook.py +177 -0
- ads/jobs/templates/driver_oci.py +500 -0
- ads/jobs/templates/driver_python.py +48 -0
- ads/jobs/templates/driver_pytorch.py +852 -0
- ads/jobs/templates/driver_utils.py +615 -0
- ads/jobs/templates/hostname_from_env.c +55 -0
- ads/jobs/templates/oci_metrics.py +181 -0
- ads/jobs/utils.py +104 -0
- ads/llm/__init__.py +28 -0
- ads/llm/autogen/__init__.py +2 -0
- ads/llm/autogen/constants.py +15 -0
- ads/llm/autogen/reports/__init__.py +2 -0
- ads/llm/autogen/reports/base.py +67 -0
- ads/llm/autogen/reports/data.py +103 -0
- ads/llm/autogen/reports/session.py +526 -0
- ads/llm/autogen/reports/templates/chat_box.html +13 -0
- ads/llm/autogen/reports/templates/chat_box_lt.html +5 -0
- ads/llm/autogen/reports/templates/chat_box_rt.html +6 -0
- ads/llm/autogen/reports/utils.py +56 -0
- ads/llm/autogen/v02/__init__.py +4 -0
- ads/llm/autogen/v02/client.py +295 -0
- ads/llm/autogen/v02/log_handlers/__init__.py +2 -0
- ads/llm/autogen/v02/log_handlers/oci_file_handler.py +83 -0
- ads/llm/autogen/v02/loggers/__init__.py +6 -0
- ads/llm/autogen/v02/loggers/metric_logger.py +320 -0
- ads/llm/autogen/v02/loggers/session_logger.py +580 -0
- ads/llm/autogen/v02/loggers/utils.py +86 -0
- ads/llm/autogen/v02/runtime_logging.py +163 -0
- ads/llm/chain.py +268 -0
- ads/llm/chat_template.py +31 -0
- ads/llm/deploy.py +63 -0
- ads/llm/guardrails/__init__.py +5 -0
- ads/llm/guardrails/base.py +442 -0
- ads/llm/guardrails/huggingface.py +44 -0
- ads/llm/langchain/__init__.py +5 -0
- ads/llm/langchain/plugins/__init__.py +5 -0
- ads/llm/langchain/plugins/chat_models/__init__.py +5 -0
- ads/llm/langchain/plugins/chat_models/oci_data_science.py +1027 -0
- ads/llm/langchain/plugins/embeddings/__init__.py +4 -0
- ads/llm/langchain/plugins/embeddings/oci_data_science_model_deployment_endpoint.py +184 -0
- ads/llm/langchain/plugins/llms/__init__.py +5 -0
- ads/llm/langchain/plugins/llms/oci_data_science_model_deployment_endpoint.py +979 -0
- ads/llm/requirements.txt +3 -0
- ads/llm/serialize.py +219 -0
- ads/llm/serializers/__init__.py +0 -0
- ads/llm/serializers/retrieval_qa.py +153 -0
- ads/llm/serializers/runnable_parallel.py +27 -0
- ads/llm/templates/score_chain.jinja2 +155 -0
- ads/llm/templates/tool_chat_template_hermes.jinja +130 -0
- ads/llm/templates/tool_chat_template_mistral_parallel.jinja +94 -0
- ads/model/__init__.py +52 -0
- ads/model/artifact.py +573 -0
- ads/model/artifact_downloader.py +254 -0
- ads/model/artifact_uploader.py +267 -0
- ads/model/base_properties.py +238 -0
- ads/model/common/.model-ignore +66 -0
- ads/model/common/__init__.py +5 -0
- ads/model/common/utils.py +142 -0
- ads/model/datascience_model.py +2635 -0
- ads/model/deployment/__init__.py +20 -0
- ads/model/deployment/common/__init__.py +5 -0
- ads/model/deployment/common/utils.py +308 -0
- ads/model/deployment/model_deployer.py +466 -0
- ads/model/deployment/model_deployment.py +1846 -0
- ads/model/deployment/model_deployment_infrastructure.py +671 -0
- ads/model/deployment/model_deployment_properties.py +493 -0
- ads/model/deployment/model_deployment_runtime.py +838 -0
- ads/model/extractor/__init__.py +5 -0
- ads/model/extractor/automl_extractor.py +74 -0
- ads/model/extractor/embedding_onnx_extractor.py +80 -0
- ads/model/extractor/huggingface_extractor.py +88 -0
- ads/model/extractor/keras_extractor.py +84 -0
- ads/model/extractor/lightgbm_extractor.py +93 -0
- ads/model/extractor/model_info_extractor.py +114 -0
- ads/model/extractor/model_info_extractor_factory.py +105 -0
- ads/model/extractor/pytorch_extractor.py +87 -0
- ads/model/extractor/sklearn_extractor.py +112 -0
- ads/model/extractor/spark_extractor.py +89 -0
- ads/model/extractor/tensorflow_extractor.py +85 -0
- ads/model/extractor/xgboost_extractor.py +94 -0
- ads/model/framework/__init__.py +5 -0
- ads/model/framework/automl_model.py +178 -0
- ads/model/framework/embedding_onnx_model.py +438 -0
- ads/model/framework/huggingface_model.py +399 -0
- ads/model/framework/lightgbm_model.py +266 -0
- ads/model/framework/pytorch_model.py +266 -0
- ads/model/framework/sklearn_model.py +250 -0
- ads/model/framework/spark_model.py +326 -0
- ads/model/framework/tensorflow_model.py +254 -0
- ads/model/framework/xgboost_model.py +258 -0
- ads/model/generic_model.py +3518 -0
- ads/model/model_artifact_boilerplate/README.md +381 -0
- ads/model/model_artifact_boilerplate/__init__.py +5 -0
- ads/model/model_artifact_boilerplate/artifact_introspection_test/__init__.py +5 -0
- ads/model/model_artifact_boilerplate/artifact_introspection_test/model_artifact_validate.py +427 -0
- ads/model/model_artifact_boilerplate/artifact_introspection_test/requirements.txt +2 -0
- ads/model/model_artifact_boilerplate/runtime.yaml +7 -0
- ads/model/model_artifact_boilerplate/score.py +61 -0
- ads/model/model_file_description_schema.json +68 -0
- ads/model/model_introspect.py +331 -0
- ads/model/model_metadata.py +1810 -0
- ads/model/model_metadata_mixin.py +460 -0
- ads/model/model_properties.py +63 -0
- ads/model/model_version_set.py +739 -0
- ads/model/runtime/__init__.py +5 -0
- ads/model/runtime/env_info.py +306 -0
- ads/model/runtime/model_deployment_details.py +37 -0
- ads/model/runtime/model_provenance_details.py +58 -0
- ads/model/runtime/runtime_info.py +81 -0
- ads/model/runtime/schemas/inference_env_info_schema.yaml +16 -0
- ads/model/runtime/schemas/model_provenance_schema.yaml +36 -0
- ads/model/runtime/schemas/training_env_info_schema.yaml +16 -0
- ads/model/runtime/utils.py +201 -0
- ads/model/serde/__init__.py +5 -0
- ads/model/serde/common.py +40 -0
- ads/model/serde/model_input.py +547 -0
- ads/model/serde/model_serializer.py +1184 -0
- ads/model/service/__init__.py +5 -0
- ads/model/service/oci_datascience_model.py +1076 -0
- ads/model/service/oci_datascience_model_deployment.py +500 -0
- ads/model/service/oci_datascience_model_version_set.py +176 -0
- ads/model/transformer/__init__.py +5 -0
- ads/model/transformer/onnx_transformer.py +324 -0
- ads/mysqldb/__init__.py +5 -0
- ads/mysqldb/mysql_db.py +227 -0
- ads/opctl/__init__.py +18 -0
- ads/opctl/anomaly_detection.py +11 -0
- ads/opctl/backend/__init__.py +5 -0
- ads/opctl/backend/ads_dataflow.py +353 -0
- ads/opctl/backend/ads_ml_job.py +710 -0
- ads/opctl/backend/ads_ml_pipeline.py +164 -0
- ads/opctl/backend/ads_model_deployment.py +209 -0
- ads/opctl/backend/base.py +146 -0
- ads/opctl/backend/local.py +1053 -0
- ads/opctl/backend/marketplace/__init__.py +9 -0
- ads/opctl/backend/marketplace/helm_helper.py +173 -0
- ads/opctl/backend/marketplace/local_marketplace.py +271 -0
- ads/opctl/backend/marketplace/marketplace_backend_runner.py +71 -0
- ads/opctl/backend/marketplace/marketplace_operator_interface.py +44 -0
- ads/opctl/backend/marketplace/marketplace_operator_runner.py +24 -0
- ads/opctl/backend/marketplace/marketplace_utils.py +212 -0
- ads/opctl/backend/marketplace/models/__init__.py +5 -0
- ads/opctl/backend/marketplace/models/bearer_token.py +94 -0
- ads/opctl/backend/marketplace/models/marketplace_type.py +70 -0
- ads/opctl/backend/marketplace/models/ocir_details.py +56 -0
- ads/opctl/backend/marketplace/prerequisite_checker.py +238 -0
- ads/opctl/cli.py +707 -0
- ads/opctl/cmds.py +869 -0
- ads/opctl/conda/__init__.py +5 -0
- ads/opctl/conda/cli.py +193 -0
- ads/opctl/conda/cmds.py +749 -0
- ads/opctl/conda/config.yaml +34 -0
- ads/opctl/conda/manifest_template.yaml +13 -0
- ads/opctl/conda/multipart_uploader.py +188 -0
- ads/opctl/conda/pack.py +89 -0
- ads/opctl/config/__init__.py +5 -0
- ads/opctl/config/base.py +57 -0
- ads/opctl/config/diagnostics/__init__.py +5 -0
- ads/opctl/config/diagnostics/distributed/default_requirements_config.yaml +62 -0
- ads/opctl/config/merger.py +255 -0
- ads/opctl/config/resolver.py +297 -0
- ads/opctl/config/utils.py +79 -0
- ads/opctl/config/validator.py +17 -0
- ads/opctl/config/versioner.py +68 -0
- ads/opctl/config/yaml_parsers/__init__.py +7 -0
- ads/opctl/config/yaml_parsers/base.py +58 -0
- ads/opctl/config/yaml_parsers/distributed/__init__.py +7 -0
- ads/opctl/config/yaml_parsers/distributed/yaml_parser.py +201 -0
- ads/opctl/constants.py +66 -0
- ads/opctl/decorator/__init__.py +5 -0
- ads/opctl/decorator/common.py +129 -0
- ads/opctl/diagnostics/__init__.py +5 -0
- ads/opctl/diagnostics/__main__.py +25 -0
- ads/opctl/diagnostics/check_distributed_job_requirements.py +212 -0
- ads/opctl/diagnostics/check_requirements.py +144 -0
- ads/opctl/diagnostics/requirement_exception.py +9 -0
- ads/opctl/distributed/README.md +109 -0
- ads/opctl/distributed/__init__.py +5 -0
- ads/opctl/distributed/certificates.py +32 -0
- ads/opctl/distributed/cli.py +207 -0
- ads/opctl/distributed/cmds.py +731 -0
- ads/opctl/distributed/common/__init__.py +5 -0
- ads/opctl/distributed/common/abstract_cluster_provider.py +449 -0
- ads/opctl/distributed/common/abstract_framework_spec_builder.py +88 -0
- ads/opctl/distributed/common/cluster_config_helper.py +103 -0
- ads/opctl/distributed/common/cluster_provider_factory.py +21 -0
- ads/opctl/distributed/common/cluster_runner.py +54 -0
- ads/opctl/distributed/common/framework_factory.py +29 -0
- ads/opctl/docker/Dockerfile.job +103 -0
- ads/opctl/docker/Dockerfile.job.arm +107 -0
- ads/opctl/docker/Dockerfile.job.gpu +175 -0
- ads/opctl/docker/base-env.yaml +13 -0
- ads/opctl/docker/cuda.repo +6 -0
- ads/opctl/docker/operator/.dockerignore +0 -0
- ads/opctl/docker/operator/Dockerfile +41 -0
- ads/opctl/docker/operator/Dockerfile.gpu +85 -0
- ads/opctl/docker/operator/cuda.repo +6 -0
- ads/opctl/docker/operator/environment.yaml +8 -0
- ads/opctl/forecast.py +11 -0
- ads/opctl/index.yaml +3 -0
- ads/opctl/model/__init__.py +5 -0
- ads/opctl/model/cli.py +65 -0
- ads/opctl/model/cmds.py +73 -0
- ads/opctl/operator/README.md +4 -0
- ads/opctl/operator/__init__.py +31 -0
- ads/opctl/operator/cli.py +344 -0
- ads/opctl/operator/cmd.py +596 -0
- ads/opctl/operator/common/__init__.py +5 -0
- ads/opctl/operator/common/backend_factory.py +460 -0
- ads/opctl/operator/common/const.py +27 -0
- ads/opctl/operator/common/data/synthetic.csv +16001 -0
- ads/opctl/operator/common/dictionary_merger.py +148 -0
- ads/opctl/operator/common/errors.py +42 -0
- ads/opctl/operator/common/operator_config.py +99 -0
- ads/opctl/operator/common/operator_loader.py +811 -0
- ads/opctl/operator/common/operator_schema.yaml +130 -0
- ads/opctl/operator/common/operator_yaml_generator.py +152 -0
- ads/opctl/operator/common/utils.py +208 -0
- ads/opctl/operator/lowcode/__init__.py +5 -0
- ads/opctl/operator/lowcode/anomaly/MLoperator +16 -0
- ads/opctl/operator/lowcode/anomaly/README.md +207 -0
- ads/opctl/operator/lowcode/anomaly/__init__.py +5 -0
- ads/opctl/operator/lowcode/anomaly/__main__.py +103 -0
- ads/opctl/operator/lowcode/anomaly/cmd.py +35 -0
- ads/opctl/operator/lowcode/anomaly/const.py +167 -0
- ads/opctl/operator/lowcode/anomaly/environment.yaml +10 -0
- ads/opctl/operator/lowcode/anomaly/model/__init__.py +5 -0
- ads/opctl/operator/lowcode/anomaly/model/anomaly_dataset.py +146 -0
- ads/opctl/operator/lowcode/anomaly/model/anomaly_merlion.py +162 -0
- ads/opctl/operator/lowcode/anomaly/model/automlx.py +99 -0
- ads/opctl/operator/lowcode/anomaly/model/autots.py +115 -0
- ads/opctl/operator/lowcode/anomaly/model/base_model.py +404 -0
- ads/opctl/operator/lowcode/anomaly/model/factory.py +110 -0
- ads/opctl/operator/lowcode/anomaly/model/isolationforest.py +78 -0
- ads/opctl/operator/lowcode/anomaly/model/oneclasssvm.py +78 -0
- ads/opctl/operator/lowcode/anomaly/model/randomcutforest.py +120 -0
- ads/opctl/operator/lowcode/anomaly/model/tods.py +119 -0
- ads/opctl/operator/lowcode/anomaly/operator_config.py +127 -0
- ads/opctl/operator/lowcode/anomaly/schema.yaml +401 -0
- ads/opctl/operator/lowcode/anomaly/utils.py +88 -0
- ads/opctl/operator/lowcode/common/__init__.py +5 -0
- ads/opctl/operator/lowcode/common/const.py +10 -0
- ads/opctl/operator/lowcode/common/data.py +116 -0
- ads/opctl/operator/lowcode/common/errors.py +47 -0
- ads/opctl/operator/lowcode/common/transformations.py +296 -0
- ads/opctl/operator/lowcode/common/utils.py +384 -0
- ads/opctl/operator/lowcode/feature_store_marketplace/MLoperator +13 -0
- ads/opctl/operator/lowcode/feature_store_marketplace/README.md +30 -0
- ads/opctl/operator/lowcode/feature_store_marketplace/__init__.py +5 -0
- ads/opctl/operator/lowcode/feature_store_marketplace/__main__.py +116 -0
- ads/opctl/operator/lowcode/feature_store_marketplace/cmd.py +85 -0
- ads/opctl/operator/lowcode/feature_store_marketplace/const.py +15 -0
- ads/opctl/operator/lowcode/feature_store_marketplace/environment.yaml +0 -0
- ads/opctl/operator/lowcode/feature_store_marketplace/models/__init__.py +4 -0
- ads/opctl/operator/lowcode/feature_store_marketplace/models/apigw_config.py +32 -0
- ads/opctl/operator/lowcode/feature_store_marketplace/models/db_config.py +43 -0
- ads/opctl/operator/lowcode/feature_store_marketplace/models/mysql_config.py +120 -0
- ads/opctl/operator/lowcode/feature_store_marketplace/models/serializable_yaml_model.py +34 -0
- ads/opctl/operator/lowcode/feature_store_marketplace/operator_utils.py +386 -0
- ads/opctl/operator/lowcode/feature_store_marketplace/schema.yaml +160 -0
- ads/opctl/operator/lowcode/forecast/MLoperator +25 -0
- ads/opctl/operator/lowcode/forecast/README.md +209 -0
- ads/opctl/operator/lowcode/forecast/__init__.py +5 -0
- ads/opctl/operator/lowcode/forecast/__main__.py +89 -0
- ads/opctl/operator/lowcode/forecast/cmd.py +40 -0
- ads/opctl/operator/lowcode/forecast/const.py +92 -0
- ads/opctl/operator/lowcode/forecast/environment.yaml +20 -0
- ads/opctl/operator/lowcode/forecast/errors.py +26 -0
- ads/opctl/operator/lowcode/forecast/model/__init__.py +5 -0
- ads/opctl/operator/lowcode/forecast/model/arima.py +279 -0
- ads/opctl/operator/lowcode/forecast/model/automlx.py +553 -0
- ads/opctl/operator/lowcode/forecast/model/autots.py +312 -0
- ads/opctl/operator/lowcode/forecast/model/base_model.py +875 -0
- ads/opctl/operator/lowcode/forecast/model/factory.py +106 -0
- ads/opctl/operator/lowcode/forecast/model/forecast_datasets.py +492 -0
- ads/opctl/operator/lowcode/forecast/model/ml_forecast.py +243 -0
- ads/opctl/operator/lowcode/forecast/model/neuralprophet.py +482 -0
- ads/opctl/operator/lowcode/forecast/model/prophet.py +450 -0
- ads/opctl/operator/lowcode/forecast/model_evaluator.py +244 -0
- ads/opctl/operator/lowcode/forecast/operator_config.py +234 -0
- ads/opctl/operator/lowcode/forecast/schema.yaml +506 -0
- ads/opctl/operator/lowcode/forecast/utils.py +397 -0
- ads/opctl/operator/lowcode/forecast/whatifserve/__init__.py +7 -0
- ads/opctl/operator/lowcode/forecast/whatifserve/deployment_manager.py +285 -0
- ads/opctl/operator/lowcode/forecast/whatifserve/score.py +246 -0
- ads/opctl/operator/lowcode/pii/MLoperator +17 -0
- ads/opctl/operator/lowcode/pii/README.md +208 -0
- ads/opctl/operator/lowcode/pii/__init__.py +5 -0
- ads/opctl/operator/lowcode/pii/__main__.py +78 -0
- ads/opctl/operator/lowcode/pii/cmd.py +39 -0
- ads/opctl/operator/lowcode/pii/constant.py +84 -0
- ads/opctl/operator/lowcode/pii/environment.yaml +17 -0
- ads/opctl/operator/lowcode/pii/errors.py +27 -0
- ads/opctl/operator/lowcode/pii/model/__init__.py +5 -0
- ads/opctl/operator/lowcode/pii/model/factory.py +82 -0
- ads/opctl/operator/lowcode/pii/model/guardrails.py +167 -0
- ads/opctl/operator/lowcode/pii/model/pii.py +145 -0
- ads/opctl/operator/lowcode/pii/model/processor/__init__.py +34 -0
- ads/opctl/operator/lowcode/pii/model/processor/email_replacer.py +34 -0
- ads/opctl/operator/lowcode/pii/model/processor/mbi_replacer.py +35 -0
- ads/opctl/operator/lowcode/pii/model/processor/name_replacer.py +225 -0
- ads/opctl/operator/lowcode/pii/model/processor/number_replacer.py +73 -0
- ads/opctl/operator/lowcode/pii/model/processor/remover.py +26 -0
- ads/opctl/operator/lowcode/pii/model/report.py +487 -0
- ads/opctl/operator/lowcode/pii/operator_config.py +95 -0
- ads/opctl/operator/lowcode/pii/schema.yaml +108 -0
- ads/opctl/operator/lowcode/pii/utils.py +43 -0
- ads/opctl/operator/lowcode/recommender/MLoperator +16 -0
- ads/opctl/operator/lowcode/recommender/README.md +206 -0
- ads/opctl/operator/lowcode/recommender/__init__.py +5 -0
- ads/opctl/operator/lowcode/recommender/__main__.py +82 -0
- ads/opctl/operator/lowcode/recommender/cmd.py +33 -0
- ads/opctl/operator/lowcode/recommender/constant.py +30 -0
- ads/opctl/operator/lowcode/recommender/environment.yaml +11 -0
- ads/opctl/operator/lowcode/recommender/model/base_model.py +212 -0
- ads/opctl/operator/lowcode/recommender/model/factory.py +56 -0
- ads/opctl/operator/lowcode/recommender/model/recommender_dataset.py +25 -0
- ads/opctl/operator/lowcode/recommender/model/svd.py +106 -0
- ads/opctl/operator/lowcode/recommender/operator_config.py +81 -0
- ads/opctl/operator/lowcode/recommender/schema.yaml +265 -0
- ads/opctl/operator/lowcode/recommender/utils.py +13 -0
- ads/opctl/operator/runtime/__init__.py +5 -0
- ads/opctl/operator/runtime/const.py +17 -0
- ads/opctl/operator/runtime/container_runtime_schema.yaml +50 -0
- ads/opctl/operator/runtime/marketplace_runtime.py +50 -0
- ads/opctl/operator/runtime/python_marketplace_runtime_schema.yaml +21 -0
- ads/opctl/operator/runtime/python_runtime_schema.yaml +21 -0
- ads/opctl/operator/runtime/runtime.py +115 -0
- ads/opctl/schema.yaml.yml +36 -0
- ads/opctl/script.py +40 -0
- ads/opctl/spark/__init__.py +5 -0
- ads/opctl/spark/cli.py +43 -0
- ads/opctl/spark/cmds.py +147 -0
- ads/opctl/templates/diagnostic_report_template.jinja2 +102 -0
- ads/opctl/utils.py +344 -0
- ads/oracledb/__init__.py +5 -0
- ads/oracledb/oracle_db.py +346 -0
- ads/pipeline/__init__.py +39 -0
- ads/pipeline/ads_pipeline.py +2279 -0
- ads/pipeline/ads_pipeline_run.py +772 -0
- ads/pipeline/ads_pipeline_step.py +605 -0
- ads/pipeline/builders/__init__.py +5 -0
- ads/pipeline/builders/infrastructure/__init__.py +5 -0
- ads/pipeline/builders/infrastructure/custom_script.py +32 -0
- ads/pipeline/cli.py +119 -0
- ads/pipeline/extension.py +291 -0
- ads/pipeline/schema/__init__.py +5 -0
- ads/pipeline/schema/cs_step_schema.json +35 -0
- ads/pipeline/schema/ml_step_schema.json +31 -0
- ads/pipeline/schema/pipeline_schema.json +71 -0
- ads/pipeline/visualizer/__init__.py +5 -0
- ads/pipeline/visualizer/base.py +570 -0
- ads/pipeline/visualizer/graph_renderer.py +272 -0
- ads/pipeline/visualizer/text_renderer.py +84 -0
- ads/secrets/__init__.py +11 -0
- ads/secrets/adb.py +386 -0
- ads/secrets/auth_token.py +86 -0
- ads/secrets/big_data_service.py +365 -0
- ads/secrets/mysqldb.py +149 -0
- ads/secrets/oracledb.py +160 -0
- ads/secrets/secrets.py +407 -0
- ads/telemetry/__init__.py +7 -0
- ads/telemetry/base.py +69 -0
- ads/telemetry/client.py +122 -0
- ads/telemetry/telemetry.py +257 -0
- ads/templates/dataflow_pyspark.jinja2 +13 -0
- ads/templates/dataflow_sparksql.jinja2 +22 -0
- ads/templates/func.jinja2 +20 -0
- ads/templates/schemas/openapi.json +1740 -0
- ads/templates/score-pkl.jinja2 +173 -0
- ads/templates/score.jinja2 +322 -0
- ads/templates/score_embedding_onnx.jinja2 +202 -0
- ads/templates/score_generic.jinja2 +165 -0
- ads/templates/score_huggingface_pipeline.jinja2 +217 -0
- ads/templates/score_lightgbm.jinja2 +185 -0
- ads/templates/score_onnx.jinja2 +407 -0
- ads/templates/score_onnx_new.jinja2 +473 -0
- ads/templates/score_oracle_automl.jinja2 +185 -0
- ads/templates/score_pyspark.jinja2 +154 -0
- ads/templates/score_pytorch.jinja2 +219 -0
- ads/templates/score_scikit-learn.jinja2 +184 -0
- ads/templates/score_tensorflow.jinja2 +184 -0
- ads/templates/score_xgboost.jinja2 +178 -0
- ads/text_dataset/__init__.py +5 -0
- ads/text_dataset/backends.py +211 -0
- ads/text_dataset/dataset.py +445 -0
- ads/text_dataset/extractor.py +207 -0
- ads/text_dataset/options.py +53 -0
- ads/text_dataset/udfs.py +22 -0
- ads/text_dataset/utils.py +49 -0
- ads/type_discovery/__init__.py +9 -0
- ads/type_discovery/abstract_detector.py +21 -0
- ads/type_discovery/constant_detector.py +41 -0
- ads/type_discovery/continuous_detector.py +54 -0
- ads/type_discovery/credit_card_detector.py +99 -0
- ads/type_discovery/datetime_detector.py +92 -0
- ads/type_discovery/discrete_detector.py +118 -0
- ads/type_discovery/document_detector.py +146 -0
- ads/type_discovery/ip_detector.py +68 -0
- ads/type_discovery/latlon_detector.py +90 -0
- ads/type_discovery/phone_number_detector.py +63 -0
- ads/type_discovery/type_discovery_driver.py +87 -0
- ads/type_discovery/typed_feature.py +594 -0
- ads/type_discovery/unknown_detector.py +41 -0
- ads/type_discovery/zipcode_detector.py +48 -0
- ads/vault/__init__.py +7 -0
- ads/vault/vault.py +237 -0
- {oracle_ads-2.13.9rc0.dist-info → oracle_ads-2.13.10.dist-info}/METADATA +150 -149
- oracle_ads-2.13.10.dist-info/RECORD +858 -0
- {oracle_ads-2.13.9rc0.dist-info → oracle_ads-2.13.10.dist-info}/WHEEL +1 -2
- {oracle_ads-2.13.9rc0.dist-info → oracle_ads-2.13.10.dist-info}/entry_points.txt +2 -1
- oracle_ads-2.13.9rc0.dist-info/RECORD +0 -9
- oracle_ads-2.13.9rc0.dist-info/top_level.txt +0 -1
- {oracle_ads-2.13.9rc0.dist-info → oracle_ads-2.13.10.dist-info}/licenses/LICENSE.txt +0 -0
ads/dataset/dataset.py
ADDED
@@ -0,0 +1,1979 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
# -*- coding: utf-8 -*--
|
3
|
+
|
4
|
+
# Copyright (c) 2020, 2024 Oracle and/or its affiliates.
|
5
|
+
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
|
6
|
+
|
7
|
+
from __future__ import print_function, absolute_import, division
|
8
|
+
|
9
|
+
import copy
|
10
|
+
import datetime
|
11
|
+
import fsspec
|
12
|
+
import numpy as np
|
13
|
+
import os
|
14
|
+
import pandas as pd
|
15
|
+
import uuid
|
16
|
+
|
17
|
+
from collections import Counter
|
18
|
+
from sklearn.preprocessing import FunctionTransformer
|
19
|
+
from typing import Iterable, Tuple, Union
|
20
|
+
|
21
|
+
from ads import set_documentation_mode
|
22
|
+
from ads.common import utils
|
23
|
+
from ads.common.decorator.deprecate import deprecated
|
24
|
+
from ads.dataset import helper, logger
|
25
|
+
from ads.dataset.dataframe_transformer import DataFrameTransformer
|
26
|
+
from ads.dataset.exception import ValidationError
|
27
|
+
from ads.dataset.helper import (
|
28
|
+
convert_columns,
|
29
|
+
fix_column_names,
|
30
|
+
generate_sample,
|
31
|
+
DatasetDefaults,
|
32
|
+
deprecate_default_value,
|
33
|
+
deprecate_variable,
|
34
|
+
get_dataset,
|
35
|
+
infer_target_type,
|
36
|
+
)
|
37
|
+
from ads.dataset.label_encoder import DataFrameLabelEncoder
|
38
|
+
from ads.dataset.pipeline import TransformerPipeline
|
39
|
+
from ads.dataset.progress import DummyProgressBar
|
40
|
+
from ads.dataset.sampled_dataset import PandasDataset
|
41
|
+
from ads.type_discovery.type_discovery_driver import TypeDiscoveryDriver
|
42
|
+
from ads.dataset.helper import get_feature_type
|
43
|
+
from ads.dataset.correlation_plot import plot_correlation_heatmap
|
44
|
+
from ads.dataset.correlation import (
|
45
|
+
_cat_vs_cts,
|
46
|
+
_cat_vs_cat,
|
47
|
+
_get_columns_by_type,
|
48
|
+
_validate_correlation_methods,
|
49
|
+
)
|
50
|
+
from ads.common.decorator.runtime_dependency import (
|
51
|
+
runtime_dependency,
|
52
|
+
OptionalDependency,
|
53
|
+
)
|
54
|
+
|
55
|
+
N_Features_Wide_Dataset = 64
|
56
|
+
|
57
|
+
|
58
|
+
pd.set_option("display.max_colwidth", None)
|
59
|
+
|
60
|
+
|
61
|
+
class ADSDataset(PandasDataset):
|
62
|
+
"""
|
63
|
+
An ADSDataset Object.
|
64
|
+
|
65
|
+
The ADSDataset object cannot be used for classification or regression problems until a
|
66
|
+
target has been set using `set_target`. To see some rows in the data use any of the usual
|
67
|
+
Pandas functions like `head()`. There are also a variety of converters, to_dask,
|
68
|
+
to_pandas, to_h2o, to_xgb, to_csv, to_parquet, to_json & to_hdf .
|
69
|
+
"""
|
70
|
+
|
71
|
+
df_read_functions = ["head", "describe", "_get_numeric_data"]
|
72
|
+
|
73
|
+
def __init__(
|
74
|
+
self,
|
75
|
+
df,
|
76
|
+
sampled_df=None,
|
77
|
+
shape=None,
|
78
|
+
name="",
|
79
|
+
description=None,
|
80
|
+
type_discovery=True,
|
81
|
+
types={},
|
82
|
+
metadata=None,
|
83
|
+
progress=DummyProgressBar(),
|
84
|
+
transformer_pipeline=None,
|
85
|
+
interactive=False,
|
86
|
+
**kwargs,
|
87
|
+
):
|
88
|
+
#
|
89
|
+
# to keep performance high and linear no matter the size of the distributed dataset we
|
90
|
+
# create a pandas df that's used internally because this has a fixed upper size.
|
91
|
+
#
|
92
|
+
if shape is None:
|
93
|
+
shape = df.shape
|
94
|
+
|
95
|
+
if sampled_df is None:
|
96
|
+
sampled_df = generate_sample(
|
97
|
+
df,
|
98
|
+
shape[0],
|
99
|
+
DatasetDefaults.sampling_confidence_level,
|
100
|
+
DatasetDefaults.sampling_confidence_interval,
|
101
|
+
**kwargs,
|
102
|
+
)
|
103
|
+
super().__init__(
|
104
|
+
sampled_df,
|
105
|
+
type_discovery=type_discovery,
|
106
|
+
types=types,
|
107
|
+
metadata=metadata,
|
108
|
+
progress=progress,
|
109
|
+
)
|
110
|
+
self.df = fix_column_names(df)
|
111
|
+
|
112
|
+
self.name = name
|
113
|
+
self.description = description
|
114
|
+
self.shape = shape
|
115
|
+
# store these args to reapply when building a new dataset for delegate operations on dataframe
|
116
|
+
self.init_kwargs = {**kwargs, "type_discovery": type_discovery}
|
117
|
+
if transformer_pipeline is None:
|
118
|
+
# Update transformer pipeline to convert column types and fix names
|
119
|
+
self.transformer_pipeline = TransformerPipeline(
|
120
|
+
steps=[
|
121
|
+
(
|
122
|
+
"prepare",
|
123
|
+
FunctionTransformer(func=fix_column_names, validate=False),
|
124
|
+
)
|
125
|
+
]
|
126
|
+
)
|
127
|
+
self.transformer_pipeline = self._update_transformer_pipeline(
|
128
|
+
steps=[
|
129
|
+
(
|
130
|
+
"type_discovery",
|
131
|
+
FunctionTransformer(
|
132
|
+
func=convert_columns,
|
133
|
+
validate=False,
|
134
|
+
kw_args={"dtypes": self.sampled_df.dtypes},
|
135
|
+
),
|
136
|
+
)
|
137
|
+
]
|
138
|
+
)
|
139
|
+
else:
|
140
|
+
self.transformer_pipeline = transformer_pipeline
|
141
|
+
|
142
|
+
def __repr__(self):
|
143
|
+
rows, cols = self.shape
|
144
|
+
return f"{rows:,} rows, {cols:,} columns"
|
145
|
+
|
146
|
+
def __len__(self):
|
147
|
+
return self.shape[0]
|
148
|
+
|
149
|
+
@staticmethod
|
150
|
+
def from_dataframe(
|
151
|
+
df,
|
152
|
+
sampled_df=None,
|
153
|
+
shape=None,
|
154
|
+
name="",
|
155
|
+
description=None,
|
156
|
+
type_discovery=True,
|
157
|
+
types={},
|
158
|
+
metadata=None,
|
159
|
+
progress=DummyProgressBar(),
|
160
|
+
transformer_pipeline=None,
|
161
|
+
interactive=False,
|
162
|
+
**kwargs,
|
163
|
+
) -> "ADSDataset":
|
164
|
+
return ADSDataset(
|
165
|
+
df=df,
|
166
|
+
sampled_df=sampled_df,
|
167
|
+
shape=shape,
|
168
|
+
name=name,
|
169
|
+
description=description,
|
170
|
+
type_discovery=type_discovery,
|
171
|
+
types=types,
|
172
|
+
metadata=metadata,
|
173
|
+
progress=progress,
|
174
|
+
transformer_pipeline=transformer_pipeline,
|
175
|
+
interactive=interactive,
|
176
|
+
**kwargs,
|
177
|
+
)
|
178
|
+
|
179
|
+
@property
|
180
|
+
@deprecated(
|
181
|
+
"2.5.2", details="The ddf attribute is deprecated. Use the df attribute."
|
182
|
+
)
|
183
|
+
def ddf(self):
|
184
|
+
return self.df
|
185
|
+
|
186
|
+
@deprecated(
|
187
|
+
"2.5.2", details="The compute method is deprecated. Use the df attribute."
|
188
|
+
)
|
189
|
+
def compute(self):
|
190
|
+
return self.df
|
191
|
+
|
192
|
+
@runtime_dependency(
|
193
|
+
module="ipywidgets", object="HTML", install_from=OptionalDependency.NOTEBOOK
|
194
|
+
)
|
195
|
+
@runtime_dependency(module="IPython", install_from=OptionalDependency.NOTEBOOK)
|
196
|
+
def _repr_html_(self):
|
197
|
+
from IPython.core.display import display, HTML
|
198
|
+
|
199
|
+
display(
|
200
|
+
HTML(
|
201
|
+
utils.horizontal_scrollable_div(
|
202
|
+
self.sampled_df.head(5)
|
203
|
+
.style.set_table_styles(utils.get_dataframe_styles())
|
204
|
+
.set_table_attributes("class=table")
|
205
|
+
.hide()
|
206
|
+
.to_html()
|
207
|
+
)
|
208
|
+
)
|
209
|
+
)
|
210
|
+
|
211
|
+
def _head(self, n=5):
|
212
|
+
"""
|
213
|
+
Return the first `n` rows of the dataset.
|
214
|
+
|
215
|
+
Parameters
|
216
|
+
----------
|
217
|
+
n : int, default 5
|
218
|
+
Number of rows to select.
|
219
|
+
|
220
|
+
Returns
|
221
|
+
-------
|
222
|
+
dataset_head : pandas.DataFrame
|
223
|
+
The first `n` rows of the dataset
|
224
|
+
|
225
|
+
Examples
|
226
|
+
--------
|
227
|
+
>>> import pandas as pd
|
228
|
+
>>> ds = ADSDataset.from_dataframe(pd.read_csv("classfication_data.csv"))
|
229
|
+
>>> ds.head()
|
230
|
+
* displays the first 5 rows of the dataset, just as the traditional head() function would *
|
231
|
+
"""
|
232
|
+
df = self.df.head(n=n)
|
233
|
+
|
234
|
+
#
|
235
|
+
# we could just return the above but, jupyterlab doesn't render these well
|
236
|
+
# when the width exceeds the screen area. To address that we wrap the dataframe
|
237
|
+
# with a class that has an optimized _repr_html_ handler, this object
|
238
|
+
# extends the pandas dataframe so it can still be used as-a dataframe
|
239
|
+
#
|
240
|
+
class FormattedDataFrame(pd.DataFrame):
|
241
|
+
def __init__(self, *args, **kwargs):
|
242
|
+
super(FormattedDataFrame, self).__init__(*args, **kwargs)
|
243
|
+
|
244
|
+
@property
|
245
|
+
def _constructor(self):
|
246
|
+
return FormattedDataFrame
|
247
|
+
|
248
|
+
@runtime_dependency(
|
249
|
+
module="ipywidgets",
|
250
|
+
object="HTML",
|
251
|
+
install_from=OptionalDependency.NOTEBOOK,
|
252
|
+
)
|
253
|
+
@runtime_dependency(
|
254
|
+
module="IPython", install_from=OptionalDependency.NOTEBOOK
|
255
|
+
)
|
256
|
+
def _repr_html_(self):
|
257
|
+
from IPython.core.display import display, HTML
|
258
|
+
|
259
|
+
display(
|
260
|
+
HTML(
|
261
|
+
utils.horizontal_scrollable_div(
|
262
|
+
self.style.set_table_styles(utils.get_dataframe_styles())
|
263
|
+
.set_table_attributes("class=table")
|
264
|
+
.hide()
|
265
|
+
.to_html()
|
266
|
+
)
|
267
|
+
)
|
268
|
+
)
|
269
|
+
return None
|
270
|
+
|
271
|
+
def __repr__(self):
|
272
|
+
return "{} rows, {} columns".format(*self.shape)
|
273
|
+
|
274
|
+
return FormattedDataFrame(df)
|
275
|
+
|
276
|
+
def call(self, func, *args, sample_size=None, **kwargs):
|
277
|
+
r"""
|
278
|
+
Runs a custom function on dataframe
|
279
|
+
|
280
|
+
func will receive the pandas dataframe (which represents the dataset) as an argument named 'df' by default.
|
281
|
+
This can be overridden by specifying the dataframe argument name in a tuple (func, dataframe_name).
|
282
|
+
|
283
|
+
Parameters
|
284
|
+
----------
|
285
|
+
func: Union[callable, tuple]
|
286
|
+
Custom function that takes pandas dataframe as input
|
287
|
+
Alternatively a (callable, data) tuple where data is a string indicating the keyword of callable
|
288
|
+
that expects the dataframe name
|
289
|
+
args: iterable, optional
|
290
|
+
Positional arguments passed into func
|
291
|
+
sample_size: int, Optional
|
292
|
+
To use a sampled dataframe
|
293
|
+
kwargs: mapping, optional
|
294
|
+
A dictionary of keyword arguments passed into func
|
295
|
+
|
296
|
+
Returns
|
297
|
+
-------
|
298
|
+
func: function
|
299
|
+
a plotting function that contains `*args` and `**kwargs`
|
300
|
+
|
301
|
+
Examples
|
302
|
+
--------
|
303
|
+
>>> import pandas as pd
|
304
|
+
>>> ds = ADSDataset.from_dataframe(pd.read_csv("classfication_data.csv"))
|
305
|
+
>>> def f1(df):
|
306
|
+
... return(sum(df), axis=0)
|
307
|
+
>>> sum_ds = ds.call(f1)
|
308
|
+
"""
|
309
|
+
|
310
|
+
data = "df"
|
311
|
+
if isinstance(func, tuple):
|
312
|
+
func, data = func
|
313
|
+
if data in kwargs:
|
314
|
+
raise ValueError(
|
315
|
+
"'%s' is both the data argument and a keyword argument" % data
|
316
|
+
)
|
317
|
+
|
318
|
+
if sample_size is None:
|
319
|
+
# user has asked not to do sampling
|
320
|
+
df = self.df.copy()
|
321
|
+
else:
|
322
|
+
df = self.df.sample(n=sample_size)
|
323
|
+
kwargs[data] = df
|
324
|
+
return func(*args, **kwargs)
|
325
|
+
|
326
|
+
def set_target(self, target, type_discovery=True, target_type=None):
|
327
|
+
"""
|
328
|
+
Returns a dataset tagged based on the type of target.
|
329
|
+
|
330
|
+
Parameters
|
331
|
+
----------
|
332
|
+
target: str
|
333
|
+
name of the feature to use as target.
|
334
|
+
type_discovery: bool
|
335
|
+
This is set as True by default.
|
336
|
+
target_type: type
|
337
|
+
If provided, then the target will be typed with the provided value.
|
338
|
+
|
339
|
+
Returns
|
340
|
+
-------
|
341
|
+
ds: ADSDataset
|
342
|
+
tagged according to the type of the target column.
|
343
|
+
|
344
|
+
Examples
|
345
|
+
--------
|
346
|
+
>>> import pandas as pd
|
347
|
+
>>> ds = ADSDataset.from_dataframe(pd.read_csv("classfication_data.csv"))
|
348
|
+
>>> ds_with_target= ds.set_target("target_class")
|
349
|
+
"""
|
350
|
+
if target_type:
|
351
|
+
target_series = self.sampled_df[target].astype(target_type)
|
352
|
+
else:
|
353
|
+
target_series = self.sampled_df[target]
|
354
|
+
return get_dataset(
|
355
|
+
self.df,
|
356
|
+
self.sampled_df,
|
357
|
+
target,
|
358
|
+
infer_target_type(target, target_series, type_discovery),
|
359
|
+
self.shape,
|
360
|
+
**self.init_kwargs,
|
361
|
+
)
|
362
|
+
|
363
|
+
@deprecated("2.5.2", details="Instead use `to_pandas`.")
|
364
|
+
def to_pandas_dataframe(
|
365
|
+
self, filter=None, frac=None, include_transformer_pipeline=False
|
366
|
+
):
|
367
|
+
return self.to_pandas(
|
368
|
+
filter=filter,
|
369
|
+
frac=frac,
|
370
|
+
include_transformer_pipeline=include_transformer_pipeline,
|
371
|
+
)
|
372
|
+
|
373
|
+
def to_pandas(self, filter=None, frac=None, include_transformer_pipeline=False):
|
374
|
+
"""
|
375
|
+
Returns a copy of the data as pandas.DataFrame, and a sklearn pipeline optionally that holds the
|
376
|
+
transformations run so far on the data.
|
377
|
+
|
378
|
+
The pipeline returned can be updated with the transformations done offline and passed along with the
|
379
|
+
dataframe to Dataset.open API if the transformations need to be reproduced at the time of scoring.
|
380
|
+
|
381
|
+
Parameters
|
382
|
+
----------
|
383
|
+
filter: str, optional
|
384
|
+
The query string to filter the dataframe, for example
|
385
|
+
ds.to_pandas(filter="age > 50 and location == 'san francisco")
|
386
|
+
See also https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.query.html
|
387
|
+
frac: float, optional
|
388
|
+
fraction of original data to return.
|
389
|
+
include_transformer_pipeline: bool, default: False
|
390
|
+
If True, (dataframe, transformer_pipeline) is returned as a tuple
|
391
|
+
|
392
|
+
Returns
|
393
|
+
-------
|
394
|
+
dataframe : pandas.DataFrame
|
395
|
+
if include_transformer_pipeline is False.
|
396
|
+
(data, transformer_pipeline): tuple of pandas.DataFrame and dataset.pipeline.TransformerPipeline
|
397
|
+
if include_transformer_pipeline is True.
|
398
|
+
|
399
|
+
Examples
|
400
|
+
--------
|
401
|
+
>>> import pandas as pd
|
402
|
+
>>> ds = ADSDataset.from_dataframe(pd.read_csv("data.csv"))
|
403
|
+
>>> ds_as_df = ds.to_pandas()
|
404
|
+
|
405
|
+
Notes
|
406
|
+
-----
|
407
|
+
See also https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html#sklearn.pipeline.Pipeline
|
408
|
+
"""
|
409
|
+
df = self.df.query(filter) if filter is not None else self.df.copy()
|
410
|
+
if frac is not None:
|
411
|
+
df = df.sample(frac=frac)
|
412
|
+
return (
|
413
|
+
(df, copy.deepcopy(self.transformer_pipeline))
|
414
|
+
if include_transformer_pipeline
|
415
|
+
else df
|
416
|
+
)
|
417
|
+
|
418
|
+
@deprecated("2.5.2", details="Instead use `to_dask`.")
|
419
|
+
def to_dask_dataframe(
|
420
|
+
self,
|
421
|
+
filter=None,
|
422
|
+
frac=None,
|
423
|
+
npartitions=None,
|
424
|
+
include_transformer_pipeline=False,
|
425
|
+
):
|
426
|
+
return self.to_dask(
|
427
|
+
filter=filter,
|
428
|
+
frac=frac,
|
429
|
+
npartitions=npartitions,
|
430
|
+
include_transformer_pipeline=include_transformer_pipeline,
|
431
|
+
)
|
432
|
+
|
433
|
+
@runtime_dependency(module="dask.dataframe", short_name="dd")
|
434
|
+
def to_dask(
|
435
|
+
self,
|
436
|
+
filter=None,
|
437
|
+
frac=None,
|
438
|
+
npartitions=None,
|
439
|
+
include_transformer_pipeline=False,
|
440
|
+
):
|
441
|
+
"""
|
442
|
+
Returns a copy of the data as dask.dataframe.core.DataFrame, and a sklearn pipeline optionally that holds the
|
443
|
+
transformations run so far on the data.
|
444
|
+
|
445
|
+
The pipeline returned can be updated with the transformations done offline and passed along with the
|
446
|
+
dataframe to Dataset.open API if the transformations need to be reproduced at the time of scoring.
|
447
|
+
|
448
|
+
Parameters
|
449
|
+
----------
|
450
|
+
filter: str, optional
|
451
|
+
The query string to filter the dataframe, for example
|
452
|
+
ds.to_dask(filter="age > 50 and location == 'san francisco")
|
453
|
+
See also https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.query.html
|
454
|
+
frac: float, optional
|
455
|
+
fraction of original data to return.
|
456
|
+
include_transformer_pipeline: bool, default: False
|
457
|
+
If True, (dataframe, transformer_pipeline) is returned as a tuple.
|
458
|
+
|
459
|
+
Returns
|
460
|
+
-------
|
461
|
+
dataframe : dask.dataframe.core.DataFrame
|
462
|
+
if include_transformer_pipeline is False.
|
463
|
+
(data, transformer_pipeline): tuple of dask.dataframe.core.DataFrame and dataset.pipeline.TransformerPipeline
|
464
|
+
if include_transformer_pipeline is True.
|
465
|
+
|
466
|
+
Examples
|
467
|
+
--------
|
468
|
+
>>> import pandas as pd
|
469
|
+
>>> ds = ADSDataset.from_dataframe(pd.read_csv("data.csv"))
|
470
|
+
>>> ds_dask = ds.to_dask()
|
471
|
+
|
472
|
+
Notes
|
473
|
+
-----
|
474
|
+
See also http://docs.dask.org/en/latest/dataframe-api.html#dataframe and
|
475
|
+
https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html#sklearn.pipeline.Pipeline
|
476
|
+
|
477
|
+
"""
|
478
|
+
res = self.to_pandas(
|
479
|
+
filter=filter,
|
480
|
+
frac=frac,
|
481
|
+
include_transformer_pipeline=include_transformer_pipeline,
|
482
|
+
)
|
483
|
+
return (
|
484
|
+
(dd.from_pandas(res[0], npartitions=npartitions), res[1])
|
485
|
+
if include_transformer_pipeline
|
486
|
+
else dd.from_pandas(res, npartitions=npartitions)
|
487
|
+
)
|
488
|
+
|
489
|
+
@deprecated("2.5.2", details="Instead use `to_h2o`.")
|
490
|
+
def to_h2o_dataframe(
|
491
|
+
self, filter=None, frac=None, include_transformer_pipeline=False
|
492
|
+
):
|
493
|
+
return self.to_h2o(
|
494
|
+
filter=filter,
|
495
|
+
frac=frac,
|
496
|
+
include_transformer_pipeline=include_transformer_pipeline,
|
497
|
+
)
|
498
|
+
|
499
|
+
@runtime_dependency(module="h2o")
|
500
|
+
def to_h2o(self, filter=None, frac=None, include_transformer_pipeline=False):
|
501
|
+
"""
|
502
|
+
Returns a copy of the data as h2o.H2OFrame, and a sklearn pipeline optionally that holds the
|
503
|
+
transformations run so far on the data.
|
504
|
+
|
505
|
+
The pipeline returned can be updated with the transformations done offline and passed along with the
|
506
|
+
dataframe to Dataset.open API if the transformations need to be reproduced at the time of scoring.
|
507
|
+
|
508
|
+
Parameters
|
509
|
+
----------
|
510
|
+
filter: str, optional
|
511
|
+
The query string to filter the dataframe, for example
|
512
|
+
ds.to_h2o(filter="age > 50 and location == 'san francisco")
|
513
|
+
See also https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.query.html
|
514
|
+
frac: float, optional
|
515
|
+
fraction of original data to return.
|
516
|
+
include_transformer_pipeline: bool, default: False
|
517
|
+
If True, (dataframe, transformer_pipeline) is returned as a tuple.
|
518
|
+
|
519
|
+
Returns
|
520
|
+
-------
|
521
|
+
dataframe : h2o.H2OFrame
|
522
|
+
if include_transformer_pipeline is False.
|
523
|
+
(data, transformer_pipeline): tuple of h2o.H2OFrame and dataset.pipeline.TransformerPipeline
|
524
|
+
if include_transformer_pipeline is True.
|
525
|
+
|
526
|
+
Examples
|
527
|
+
--------
|
528
|
+
>>> import pandas as pd
|
529
|
+
>>> ds = ADSDataset.from_dataframe(pd.read_csv("data.csv"))
|
530
|
+
>>> ds_as_h2o = ds.to_h2o()
|
531
|
+
|
532
|
+
Notes
|
533
|
+
-----
|
534
|
+
See also https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html#sklearn.pipeline.Pipeline
|
535
|
+
"""
|
536
|
+
res = self.to_pandas(
|
537
|
+
filter=filter,
|
538
|
+
frac=frac,
|
539
|
+
include_transformer_pipeline=include_transformer_pipeline,
|
540
|
+
)
|
541
|
+
return (
|
542
|
+
(h2o.H2OFrame(res[0]), res[1])
|
543
|
+
if include_transformer_pipeline
|
544
|
+
else h2o.H2OFrame(res)
|
545
|
+
)
|
546
|
+
|
547
|
+
@deprecated("2.5.2", details="Instead use `to_xgb`.")
|
548
|
+
def to_xgb_dmatrix(
|
549
|
+
self, filter=None, frac=None, include_transformer_pipeline=False
|
550
|
+
):
|
551
|
+
return self.to_xgb(
|
552
|
+
filter=filter,
|
553
|
+
frac=frac,
|
554
|
+
include_transformer_pipeline=include_transformer_pipeline,
|
555
|
+
)
|
556
|
+
|
557
|
+
@runtime_dependency(module="xgboost", install_from=OptionalDependency.BOOSTED)
|
558
|
+
def to_xgb(self, filter=None, frac=None, include_transformer_pipeline=False):
|
559
|
+
"""
|
560
|
+
Returns a copy of the data as xgboost.DMatrix, and a sklearn pipeline optionally that holds the
|
561
|
+
transformations run so far on the data.
|
562
|
+
|
563
|
+
The pipeline returned can be updated with the transformations done offline and passed along with the
|
564
|
+
dataframe to Dataset.open API if the transformations need to be reproduced at the time of scoring.
|
565
|
+
|
566
|
+
Parameters
|
567
|
+
----------
|
568
|
+
filter: str, optional
|
569
|
+
The query string to filter the dataframe, for example
|
570
|
+
ds.to_xgb(filter="age > 50 and location == 'san francisco")
|
571
|
+
See also https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.query.html
|
572
|
+
frac: float, optional
|
573
|
+
fraction of original data to return.
|
574
|
+
include_transformer_pipeline: bool, default: False
|
575
|
+
If True, (dataframe, transformer_pipeline) is returned as a tuple.
|
576
|
+
|
577
|
+
Returns
|
578
|
+
-------
|
579
|
+
dataframe : xgboost.DMatrix
|
580
|
+
if include_transformer_pipeline is False.
|
581
|
+
(data, transformer_pipeline): tuple of xgboost.DMatrix and dataset.pipeline.TransformerPipeline
|
582
|
+
if include_transformer_pipeline is True.
|
583
|
+
|
584
|
+
Examples
|
585
|
+
--------
|
586
|
+
>>> import pandas as pd
|
587
|
+
>>> ds = ADSDataset.from_dataframe(pd.read_csv("data.csv"))
|
588
|
+
>>> xgb_dmat = ds.to_xgb()
|
589
|
+
|
590
|
+
Notes
|
591
|
+
-----
|
592
|
+
See also https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html#sklearn.pipeline.Pipeline
|
593
|
+
"""
|
594
|
+
res = self.to_pandas(
|
595
|
+
filter=filter,
|
596
|
+
frac=frac,
|
597
|
+
include_transformer_pipeline=include_transformer_pipeline,
|
598
|
+
)
|
599
|
+
df = res[0] if include_transformer_pipeline else res
|
600
|
+
le = DataFrameLabelEncoder()
|
601
|
+
df = le.fit_transform(df)
|
602
|
+
if include_transformer_pipeline:
|
603
|
+
res[1].add(le)
|
604
|
+
xgb_matrix = xgboost.DMatrix(df)
|
605
|
+
return (xgb_matrix, res[1]) if include_transformer_pipeline else xgb_matrix
|
606
|
+
|
607
|
+
def sample(self, frac=None, random_state=utils.random_state):
|
608
|
+
"""
|
609
|
+
Returns random sample of dataset.
|
610
|
+
|
611
|
+
Parameters
|
612
|
+
----------
|
613
|
+
frac : float, optional
|
614
|
+
Fraction of axis items to return.
|
615
|
+
random_state : int or ``np.random.RandomState``
|
616
|
+
If int we create a new RandomState with this as the seed
|
617
|
+
Otherwise we draw from the passed RandomState
|
618
|
+
|
619
|
+
Returns
|
620
|
+
-------
|
621
|
+
sampled_dataset: ADSDataset
|
622
|
+
An ADSDataset which was randomly sampled.
|
623
|
+
|
624
|
+
Examples
|
625
|
+
--------
|
626
|
+
>>> import pandas as pd
|
627
|
+
>>> ds = ADSDataset.from_dataframe(pd.read_csv("data.csv"))
|
628
|
+
>>> ds_sample = ds.sample()
|
629
|
+
"""
|
630
|
+
df = self.df.sample(frac=frac, random_state=random_state)
|
631
|
+
return self._build_new_dataset(df)
|
632
|
+
|
633
|
+
def drop_columns(self, columns):
|
634
|
+
"""
|
635
|
+
Return new dataset with specified columns removed.
|
636
|
+
|
637
|
+
Parameters
|
638
|
+
----------
|
639
|
+
columns : str or list
|
640
|
+
columns to drop.
|
641
|
+
|
642
|
+
Returns
|
643
|
+
-------
|
644
|
+
dataset: same type as the caller
|
645
|
+
a dataset with specified columns dropped.
|
646
|
+
|
647
|
+
Raises
|
648
|
+
------
|
649
|
+
ValidationError
|
650
|
+
If any of the feature names is not found in the dataset.
|
651
|
+
|
652
|
+
Examples
|
653
|
+
--------
|
654
|
+
>>> import pandas as pd
|
655
|
+
>>> ds = ADSDataset.from_dataframe(pd.read_csv("data.csv"))
|
656
|
+
>>> ds_smaller = ds.drop_columns(['col1', 'col2'])
|
657
|
+
"""
|
658
|
+
self._validate_feature(columns)
|
659
|
+
return self.drop(columns, axis=1)
|
660
|
+
|
661
|
+
    def assign_column(self, column, arg):
        """
        Return new dataset with new column or values of the existing column mapped according to input correspondence.

        Used for adding a new column or substituting each value in a column with another value, that may be derived from
        a function, a :class:`pandas.Series` or a :class:`pandas.DataFrame`.

        Parameters
        ----------
        column : str
            Name of the feature to update.
        arg : function, dict, Series or DataFrame
            Mapping correspondence.

        Returns
        -------
        dataset: same type as the caller
            a dataset with the specified column assigned.

        Examples
        --------
        >>> import pandas as pd
        >>> ds = ADSDataset.from_dataframe(pd.read_csv("data.csv"))
        >>> ds_same_size = ds.assign_column('target', lambda x: x > 15 if x is not None else None)
        >>> ds_bigger = ds.assign_column('new_col', np.arange(ds.shape[0]))
        """
        # Only target-aware subclasses of ADSDataset have a target to preserve.
        target_name = (
            self.target.name if not utils.is_same_class(self, ADSDataset) else None
        )
        if isinstance(arg, Iterable) or isinstance(arg, ADSDataset):
            df = self.df.copy()
            # Normalize `arg` into a dataframe whose payload column is "new_col",
            # so one concat+rename path handles every supported input type.
            if type(arg) == pd.DataFrame:
                col_to_add = arg
            elif type(arg) == ADSDataset:
                col_to_add = arg.df
            elif type(arg) == dict:
                col_to_add = pd.DataFrame.from_dict(arg)
            elif type(arg) in [list, np.ndarray]:
                col_to_add = pd.DataFrame(arg, columns=["new_col"])
            elif type(arg) == pd.Series:
                col_to_add = arg.rename("new_col").to_frame()
            elif utils._is_dask_dataframe(arg):
                col_to_add = arg.compute()
            elif utils._is_dask_series(arg):
                col_to_add = arg.compute().rename("new_col").to_frame()
            else:
                raise ValueError(
                    f"assign_column currently does not support arg of type {type(arg)}. Reformat "
                    f"as types: Pandas, numpy, list, or dict"
                )
            # Replace the column outright if it already exists.
            if column in df.columns:
                df = df.drop(columns=column)
            new_df = pd.concat([df, col_to_add], axis=1).rename(
                columns={"new_col": column}
            )
            return self._build_new_dataset(new_df)

        else:
            # `arg` is a mapping/callable: apply element-wise to both the full
            # and the sampled dataframe so the two stay consistent.
            sampled_df = self.sampled_df.copy()
            df = self.df.copy()
            sampled_df[column] = sampled_df[column].apply(arg)
            df[column] = df[column].apply(arg)
            if column == target_name:
                # Target values changed; re-discover its feature type.
                target_type = get_feature_type(target_name, sampled_df[target_name])
                return self._build_new_dataset(
                    df, sampled_df, target=target_name, target_type=target_type
                )
            else:
                return self._build_new_dataset(
                    df,
                    sampled_df,
                    target=target_name,
                    target_type=self.target.type
                    if target_name != column and target_name is not None
                    else None,
                )
|
737
|
+
|
738
|
+
def rename_columns(self, columns):
|
739
|
+
"""
|
740
|
+
Returns a new dataset with altered column names.
|
741
|
+
|
742
|
+
dict values must be unique (1-to-1). Labels not contained in a dict will be left as-is.
|
743
|
+
Extra labels listed don't throw an error.
|
744
|
+
|
745
|
+
Parameters
|
746
|
+
----------
|
747
|
+
columns: dict-like or function or list of str
|
748
|
+
dict to rename columns selectively, or list of names to rename all columns, or a function like
|
749
|
+
str.upper
|
750
|
+
|
751
|
+
Returns
|
752
|
+
-------
|
753
|
+
dataset: same type as the caller
|
754
|
+
A dataset with specified columns renamed.
|
755
|
+
|
756
|
+
Examples
|
757
|
+
--------
|
758
|
+
>>> import pandas as pd
|
759
|
+
>>> ds = ADSDataset.from_dataframe(pd.read_csv("data.csv"))
|
760
|
+
>>> ds_renamed = ds.rename_columns({'col1': 'target'})
|
761
|
+
"""
|
762
|
+
if isinstance(columns, list):
|
763
|
+
assert len(columns) == len(
|
764
|
+
self.columns.values
|
765
|
+
), "columns length do not match the dataset"
|
766
|
+
columns = dict(zip(self.columns.values, columns))
|
767
|
+
return self.rename(columns=columns)
|
768
|
+
|
769
|
+
def set_name(self, name):
|
770
|
+
"""
|
771
|
+
Sets name for the dataset.
|
772
|
+
|
773
|
+
This name will be used to filter the datasets returned by ds.list() API.
|
774
|
+
Calling this API is optional. By default name of the dataset is set to empty.
|
775
|
+
|
776
|
+
Parameters
|
777
|
+
----------
|
778
|
+
name: str
|
779
|
+
Name of the dataset.
|
780
|
+
|
781
|
+
Examples
|
782
|
+
--------
|
783
|
+
>>> import pandas as pd
|
784
|
+
>>> ds = ADSDataset.from_dataframe(pd.read_csv("data1.csv"))
|
785
|
+
>>> ds_renamed = ds.set_name("dataset1")
|
786
|
+
"""
|
787
|
+
self.name = name
|
788
|
+
|
789
|
+
    def set_description(self, description):
        """
        Sets description for the dataset.

        Give your dataset a description.

        Parameters
        ----------
        description: str
            Description of the dataset.

        Examples
        --------
        >>> import pandas as pd
        >>> ds = ADSDataset.from_dataframe(pd.read_csv("data1.csv"))
        >>> ds_renamed = ds.set_description('dataset1 is from "data1.csv"')
        """
        self.description = description
|
807
|
+
|
808
|
+
    def snapshot(self, snapshot_dir=None, name="", storage_options=None):
        """
        Snapshot the dataset with modifications made so far.

        Optionally caller can invoke ds.set_name() before saving to identify the dataset uniquely at the time of
        using ds.list().

        The snapshot can be reloaded by providing the URI returned by this API to DatasetFactory.open()

        Parameters
        ----------
        snapshot_dir: str, optional
            Directory path under which dataset snapshot will be created.
            Defaults to snapshots_dir set using DatasetFactory.set_default_storage().
        name: str, optional, default: ""
            Name to uniquely identify the snapshot using DatasetFactory.list_snapshots().
            If not provided, an auto-generated name is used.
        storage_options: dict, optional
            Parameters passed on to the backend filesystem class.
            Defaults to storage_options set using DatasetFactory.set_default_storage().

        Returns
        -------
        p_str: str
            the URI to access the snapshotted dataset.

        Examples
        --------
        >>> import pandas as pd
        >>> ds = ADSDataset.from_dataframe(pd.read_csv("data.csv"))
        >>> ds_uri = ds.snapshot()
        """
        if snapshot_dir is None:
            # Imported lazily — presumably to avoid a circular import with the
            # factory module; TODO confirm.
            import ads.dataset.factory as factory

            snapshot_dir = factory.default_snapshots_dir
            if snapshot_dir is None:
                raise ValueError(
                    "Specify snapshot_dir or use DatasetFactory.set_default_storage() to set default \
                storage options"
                )
            else:
                logger.info("Using default snapshots dir %s" % snapshot_dir)
        # Always append a uuid/timestamp suffix so snapshots never collide.
        name = self._get_unique_name(name)
        if not snapshot_dir.endswith("/"):
            snapshot_dir = snapshot_dir + "/"
        parquet_file = "%s%s.parquet" % (snapshot_dir, name)
        # NOTE(review): makedirs is attempted even for object-storage ("oci://")
        # destinations — verify this is harmless for non-local paths.
        os.makedirs(snapshot_dir, exist_ok=True)
        if storage_options is None and parquet_file[:3] == "oci":
            import ads.dataset.factory as factory

            storage_options = factory.default_storage_options
            logger.info("Using default storage options.")

        # Persist both the data and enough metadata (feature types + transformer
        # pipeline) to reconstruct the dataset on reload.
        return helper.write_parquet(
            path=parquet_file,
            data=self.df,
            metadata_dict={
                "metadata": self.feature_types,
                "transformer": self.transformer_pipeline,
            },
            storage_options=storage_options,
        )
|
871
|
+
|
872
|
+
def to_csv(self, path, storage_options=None, **kwargs):
|
873
|
+
"""
|
874
|
+
Save the materialized dataframe to csv file.
|
875
|
+
|
876
|
+
Parameters
|
877
|
+
----------
|
878
|
+
path: str
|
879
|
+
Location to write to. If there are more than one partitions in df, should include a glob character to
|
880
|
+
expand into a set of file names, or provide a `name_function=parameter`.
|
881
|
+
Supports protocol specifications such as `"oci://"`, `"s3://"`.
|
882
|
+
storage_options: dict, optional
|
883
|
+
Parameters passed on to the backend filesystem class.
|
884
|
+
Defaults to storage_options set using DatasetFactory.set_default_storage().
|
885
|
+
kwargs: dict, optional
|
886
|
+
|
887
|
+
Examples
|
888
|
+
--------
|
889
|
+
>>> import pandas as pd
|
890
|
+
>>> ds = ADSDataset.from_dataframe(pd.read_csv("data.csv"))
|
891
|
+
>>> [ds_link] = ds.to_csv("my/path.csv")
|
892
|
+
"""
|
893
|
+
if storage_options is None:
|
894
|
+
import ads.dataset.factory as factory
|
895
|
+
|
896
|
+
storage_options = factory.default_storage_options
|
897
|
+
logger.info("Using default storage options")
|
898
|
+
return self.df.to_csv(path, storage_options=storage_options, **kwargs)
|
899
|
+
|
900
|
+
def to_parquet(self, path, storage_options=None, **kwargs):
|
901
|
+
"""
|
902
|
+
Save data to parquet file.
|
903
|
+
|
904
|
+
Parameters
|
905
|
+
----------
|
906
|
+
path: str
|
907
|
+
Location to write to. If there are more than one partitions in df, should include a glob character to
|
908
|
+
expand into a set of file names, or provide a `name_function=parameter`.
|
909
|
+
Supports protocol specifications such as `"oci://"`, `"s3://"`.
|
910
|
+
storage_options: dict, optional
|
911
|
+
Parameters passed on to the backend filesystem class.
|
912
|
+
Defaults to storage_options set using DatasetFactory.set_default_storage().
|
913
|
+
kwargs: dict, optional
|
914
|
+
|
915
|
+
Examples
|
916
|
+
--------
|
917
|
+
>>> import pandas as pd
|
918
|
+
>>> ds = ADSDataset.from_dataframe(pd.read_csv("data.csv"))
|
919
|
+
>>> ds.to_parquet("my/path")
|
920
|
+
"""
|
921
|
+
if storage_options is None:
|
922
|
+
import ads.dataset.factory as factory
|
923
|
+
|
924
|
+
storage_options = factory.default_storage_options
|
925
|
+
logger.info("Using default storage options")
|
926
|
+
return self.df.to_parquet(path, storage_options=storage_options, **kwargs)
|
927
|
+
|
928
|
+
def to_json(self, path, storage_options=None, **kwargs):
|
929
|
+
"""
|
930
|
+
Save data to JSON files.
|
931
|
+
|
932
|
+
Parameters
|
933
|
+
----------
|
934
|
+
path: str
|
935
|
+
Location to write to. If there are more than one partitions in df, should include a glob character to
|
936
|
+
expand into a set of file names, or provide a `name_function=parameter`.
|
937
|
+
Supports protocol specifications such as `"oci://"`, `"s3://"`.
|
938
|
+
storage_options: dict, optional
|
939
|
+
Parameters passed on to the backend filesystem class.
|
940
|
+
Defaults to storage_options set using DatasetFactory.set_default_storage().
|
941
|
+
kwargs: dict, optional
|
942
|
+
|
943
|
+
Examples
|
944
|
+
--------
|
945
|
+
>>> import pandas as pd
|
946
|
+
>>> ds = ADSDataset.from_dataframe(pd.read_csv("data.csv"))
|
947
|
+
>>> ds.to_json("my/path.json")
|
948
|
+
"""
|
949
|
+
if storage_options is None:
|
950
|
+
import ads.dataset.factory as factory
|
951
|
+
|
952
|
+
storage_options = factory.default_storage_options
|
953
|
+
logger.info("Using default storage options")
|
954
|
+
|
955
|
+
return self.df.to_json(path, storage_options=storage_options, **kwargs)
|
956
|
+
|
957
|
+
    def to_hdf(
        self, path: str, key: str, storage_options: dict = None, **kwargs
    ) -> str:
        """
        Save data to Hierarchical Data Format (HDF) files.

        Parameters
        ----------
        path : string
            Path to a target filename.
        key : string
            Datapath within the files.
        storage_options: dict, optional
            Parameters passed to the backend filesystem class.
            Defaults to storage_options set using DatasetFactory.set_default_storage().
        kwargs: dict, optional

        Returns
        -------
        str
            The filename of the HDF5 file created.

        Examples
        --------
        >>> import pandas as pd
        >>> ds = ADSDataset.from_dataframe(pd.read_csv("data.csv"))
        >>> ds.to_hdf(path="my/path.h5", key="df")
        """
        if storage_options is None:
            import ads.dataset.factory as factory

            storage_options = factory.default_storage_options
            logger.info("Using default storage options")

        # Build the HDF5 file entirely in memory (H5FD_CORE driver, backing
        # store disabled) so the raw bytes can then be written through fsspec
        # to any supported filesystem, including object storage.
        with pd.HDFStore(
            "memory",
            mode="w",
            driver="H5FD_CORE",
            driver_core_backing_store=0,
        ) as hdf_store:
            hdf_store.put(key, self.df, format=kwargs.get("hdf5_format", "fixed"))
            # NOTE(review): relies on the private PyTables handle to grab the
            # in-memory file image — confirm this stays stable across pandas versions.
            data = hdf_store._handle.get_file_image()

        # Resolve any "*" placeholder to a concrete name and force a ".h5" suffix.
        new_path = (
            path.replace("*", "0")
            if path[-3:] == ".h5"
            else path.replace("*", "0") + ".h5"
        )

        with fsspec.open(
            urlpath=new_path, mode="wb", storage_options=storage_options, **kwargs
        ) as fo:
            fo.write(data)

        return new_path
|
1012
|
+
|
1013
|
+
@runtime_dependency(module="fastavro", install_from=OptionalDependency.DATA)
|
1014
|
+
def to_avro(self, path, schema=None, storage_options=None, **kwargs):
|
1015
|
+
"""
|
1016
|
+
Save data to Avro files.
|
1017
|
+
Avro is a remote procedure call and data serialization framework developed within Apache's Hadoop project. It
|
1018
|
+
uses JSON for defining data types and protocols, and serializes data in a compact binary format.
|
1019
|
+
|
1020
|
+
Parameters
|
1021
|
+
----------
|
1022
|
+
path : string
|
1023
|
+
Path to a target filename. May contain a ``*`` to denote many filenames.
|
1024
|
+
schema : dict
|
1025
|
+
Avro schema dictionary, see below.
|
1026
|
+
storage_options: dict, optional
|
1027
|
+
Parameters passed to the backend filesystem class.
|
1028
|
+
Defaults to storage_options set using DatasetFactory.set_default_storage().
|
1029
|
+
kwargs: dict, optional
|
1030
|
+
See https://fastavro.readthedocs.io/en/latest/writer.html
|
1031
|
+
|
1032
|
+
Notes
|
1033
|
+
-----
|
1034
|
+
Avro schema is a complex dictionary describing the data,
|
1035
|
+
see https://avro.apache.org/docs/1.8.2/gettingstartedpython.html#Defining+a+schema
|
1036
|
+
and https://fastavro.readthedocs.io/en/latest/writer.html.
|
1037
|
+
Its structure is as follows::
|
1038
|
+
|
1039
|
+
{'name': 'Test',
|
1040
|
+
'namespace': 'Test',
|
1041
|
+
'doc': 'Descriptive text',
|
1042
|
+
'type': 'record',
|
1043
|
+
'fields': [
|
1044
|
+
{'name': 'a', 'type': 'int'},
|
1045
|
+
]}
|
1046
|
+
|
1047
|
+
where the "name" field is required, but "namespace" and "doc" are optional
|
1048
|
+
descriptors; "type" must always be "record". The list of fields should
|
1049
|
+
have an entry for every key of the input records, and the types are
|
1050
|
+
like the primitive, complex or logical types of the Avro spec
|
1051
|
+
(https://avro.apache.org/docs/1.8.2/spec.html).
|
1052
|
+
|
1053
|
+
Examples
|
1054
|
+
--------
|
1055
|
+
>>> import pandas
|
1056
|
+
>>> import fastavro
|
1057
|
+
>>> with open("data.avro", "rb") as fp:
|
1058
|
+
>>> reader = fastavro.reader(fp)
|
1059
|
+
>>> records = [r for r in reader]
|
1060
|
+
>>> df = pandas.DataFrame.from_records(records)
|
1061
|
+
>>> ds = ADSDataset.from_dataframe(df)
|
1062
|
+
>>> ds.to_avro("my/path.avro")
|
1063
|
+
"""
|
1064
|
+
# Get the row by row formatting
|
1065
|
+
data_row_by_row = []
|
1066
|
+
for i, row in self.df.iterrows():
|
1067
|
+
data_row_by_row.append(row.to_dict())
|
1068
|
+
# Try to auto-generate schema
|
1069
|
+
if schema is None:
|
1070
|
+
avro_types = self._convert_dtypes_to_avro_types()
|
1071
|
+
schema = {"name": self.name, "doc": self.description, "type": "record"}
|
1072
|
+
fields = []
|
1073
|
+
## Add vars
|
1074
|
+
for col, dtype in avro_types:
|
1075
|
+
fields.append({"name": col, "type": ["null", dtype]})
|
1076
|
+
schema["fields"] = fields
|
1077
|
+
|
1078
|
+
parsed_schema = fastavro.parse_schema(schema=schema)
|
1079
|
+
new_path = (
|
1080
|
+
path.replace("*", "0")
|
1081
|
+
if path[-5:] == ".avro"
|
1082
|
+
else path.replace("*", "0") + ".avro"
|
1083
|
+
)
|
1084
|
+
with fsspec.open(
|
1085
|
+
new_path, "wb", storage_options=storage_options, **kwargs
|
1086
|
+
) as fo:
|
1087
|
+
fastavro.writer(fo, parsed_schema, data_row_by_row)
|
1088
|
+
return new_path
|
1089
|
+
|
1090
|
+
def _convert_dtypes_to_avro_types(self):
|
1091
|
+
avro_types = []
|
1092
|
+
for name, dtype in zip(self.dtypes.index, self.dtypes.values):
|
1093
|
+
if dtype == np.int64:
|
1094
|
+
avro_dtype = "long"
|
1095
|
+
elif "int" in str(dtype):
|
1096
|
+
avro_dtype = "int"
|
1097
|
+
elif dtype == np.float64:
|
1098
|
+
avro_dtype = "double"
|
1099
|
+
elif "float" in str(dtype):
|
1100
|
+
avro_dtype = "float"
|
1101
|
+
elif dtype == np.bool_:
|
1102
|
+
avro_dtype = "boolean"
|
1103
|
+
else:
|
1104
|
+
avro_dtype = "string"
|
1105
|
+
avro_types.append((name, avro_dtype))
|
1106
|
+
return avro_types
|
1107
|
+
|
1108
|
+
    def astype(self, types):
        """
        Convert data type of features.

        Parameters
        ----------
        types: dict
            key is the existing feature name
            value is the data type to which the values of the feature should be converted.
            Valid data types: All numpy datatypes (Example: np.float64, np.int64, ...)
            or one of categorical, continuous, ordinal or datetime.

        Returns
        -------
        updated_dataset: `ADSDataset`
            an ADSDataset with new data types

        Examples
        --------
        >>> import pandas as pd
        >>> ds = ADSDataset.from_dataframe(pd.read_csv("data.csv"))
        >>> ds_reformatted = ds.astype({"target": "categorical"})
        """
        # helper.map_types translates the friendly names ("categorical", ...)
        # into concrete dtypes; the call is routed through __getattr__ so it is
        # wrapped by self._apply(), which rebuilds the dataset (and records the
        # operation in the transformer pipeline) from the resulting dataframe.
        return self.__getattr__("astype")(helper.map_types(types))
|
1132
|
+
|
1133
|
+
def merge(self, data, **kwargs):
|
1134
|
+
"""
|
1135
|
+
Merges this dataset with another ADSDataset or pandas dataframe.
|
1136
|
+
|
1137
|
+
Parameters
|
1138
|
+
----------
|
1139
|
+
data : Union[ADSDataset, pandas.DataFrame]
|
1140
|
+
Data to merge.
|
1141
|
+
kwargs : dict, optional
|
1142
|
+
additional keyword arguments that would be passed to underlying dataframe's merge API.
|
1143
|
+
|
1144
|
+
Examples
|
1145
|
+
--------
|
1146
|
+
>>> import pandas as pd
|
1147
|
+
>>> df1 = pd.read_csv("data1.csv")
|
1148
|
+
>>> df2 = pd.read_csv("data2.csv")
|
1149
|
+
>>> ds = ADSDataset.from_dataframe(df1.merge(df2))
|
1150
|
+
>>> ds_12 = ds1.merge(ds2)
|
1151
|
+
"""
|
1152
|
+
assert isinstance(data, pd.DataFrame) or isinstance(
|
1153
|
+
data, ADSDataset
|
1154
|
+
), "Can only merge datasets if they are of the types pandas or ads"
|
1155
|
+
df = self.df.merge(data.df if isinstance(data, ADSDataset) else data, **kwargs)
|
1156
|
+
return self._build_new_dataset(df, progress=utils.get_progress_bar(3))
|
1157
|
+
|
1158
|
+
"""
|
1159
|
+
Internal methods
|
1160
|
+
"""
|
1161
|
+
|
1162
|
+
def __getattr__(self, item):
|
1163
|
+
attr = getattr(self.df, item)
|
1164
|
+
if callable(attr):
|
1165
|
+
return self._apply(attr)
|
1166
|
+
else:
|
1167
|
+
return attr
|
1168
|
+
|
1169
|
+
def __getitem__(self, key):
|
1170
|
+
if isinstance(key, str) or isinstance(key, (tuple, str)):
|
1171
|
+
return self.df[key]
|
1172
|
+
else:
|
1173
|
+
return self._build_new_dataset(self.df[key])
|
1174
|
+
|
1175
|
+
    def _apply(self, func):
        """Wrap a delegated dataframe method so its result is re-wrapped.

        Returns a closure that calls `func` (a bound method of self.df) and, when
        the result is a new dataframe produced by a transforming operation,
        rebuilds a dataset around it and records the operation in the
        transformer pipeline so it can be replayed at scoring time.
        """

        def df_func(*args, _new_target=None, **kwargs):
            has_dataframe_arg = False
            args = list(args)
            for i, arg in enumerate(args):
                if isinstance(arg, ADSDataset) or isinstance(arg, pd.DataFrame):
                    has_dataframe_arg = True
                    # Convert any argument that is of type ADSDataset to a dataframe.
                    # This is useful in delegate calls like dataset1.concat(dataset2).
                    args[i] = arg.df if isinstance(arg, ADSDataset) else arg

            result = func(*args, **kwargs)

            # Return the response as-is if the result is not a dataframe, or the
            # call was a read-only function such as head.
            if (
                isinstance(result, pd.DataFrame)
                and func.__name__ not in self.df_read_functions
            ):
                target_name = None
                target_sample_val = None
                if not utils.is_same_class(self, ADSDataset):
                    # Target-aware dataset: carry the target (or its override) forward.
                    target_name = (
                        self.target.name if _new_target is None else _new_target
                    )
                    target_sample_val = (
                        self.sampled_df[self.target.name].dropna().values[0]
                    )

                df = result
                n = len(df)
                trans_df = None
                transformed = False
                transformers = []

                # The sampled dataframe needs to be re-generated when this operation
                # involves another dataframe. Also, this kind of transformation cannot
                # be reproduced at the time of scoring.
                if not has_dataframe_arg:
                    ft = DataFrameTransformer(
                        func_name=func.__name__,
                        target_name=target_name,
                        target_sample_val=target_sample_val,
                        args=args,
                        kw_args=kwargs,
                    ).fit(result)
                    # `transformed` is set to False if the method fails to run on the
                    # pandas dataframe; in that case a freshly sampled dataframe is used.
                    trans_df, transformed = ft._transform(self.sampled_df.copy())
                    # If the dataset length changes as a result of transformation, the
                    # operation need not be added to the pipeline, as it does not need
                    # to be reproduced at the time of scoring.
                    transformers = (func.__name__, ft) if n == self.shape[0] else []

                init_kwargs = self.init_kwargs.copy()
                if func.__name__ == "astype":
                    # Accumulate type conversions so repeated astype calls compose.
                    if "types" in init_kwargs:
                        init_kwargs["types"] = init_kwargs["types"] + args[0]
                    else:
                        init_kwargs["types"] = args[0]

                # If the transforming function is not supported by the pandas
                # dataframe, sample again to get a new representation.
                # NOTE(review): `trans_df` is computed above but `sampled_df=df` is
                # passed here — confirm whether `trans_df` was meant to be used.
                return self._build_new_dataset(
                    df,
                    sampled_df=df,
                    target=target_name,
                    target_type=TypeDiscoveryDriver().discover(
                        target_name, df[target_name]
                    )
                    if target_name is not None and target_name in df
                    else None,
                    sample=not transformed,
                    transformers=transformers,
                    **init_kwargs,
                )
            return result

        return df_func
|
1251
|
+
|
1252
|
+
def _handle_key_error(self, args):
|
1253
|
+
raise ValidationError("Column %s does not exist in data" % str(args))
|
1254
|
+
|
1255
|
+
def _build_new_dataset(
|
1256
|
+
self,
|
1257
|
+
df,
|
1258
|
+
sampled_df=None,
|
1259
|
+
target=None,
|
1260
|
+
target_type=None,
|
1261
|
+
transformers=[],
|
1262
|
+
sample=False,
|
1263
|
+
progress=DummyProgressBar(),
|
1264
|
+
n=None,
|
1265
|
+
**init_kwargs,
|
1266
|
+
):
|
1267
|
+
prev_doc_mode = utils.is_documentation_mode()
|
1268
|
+
|
1269
|
+
set_documentation_mode(False)
|
1270
|
+
|
1271
|
+
init_kwargs = (
|
1272
|
+
self.init_kwargs
|
1273
|
+
if init_kwargs is None or len(init_kwargs) == 0
|
1274
|
+
else init_kwargs.copy()
|
1275
|
+
)
|
1276
|
+
n = len(df) if n is None else n
|
1277
|
+
|
1278
|
+
# re-calculate sample df if not provided
|
1279
|
+
if sampled_df is None or sample:
|
1280
|
+
if progress:
|
1281
|
+
progress.update("Sampling data")
|
1282
|
+
sampled_df = generate_sample(
|
1283
|
+
df,
|
1284
|
+
n,
|
1285
|
+
DatasetDefaults.sampling_confidence_level,
|
1286
|
+
DatasetDefaults.sampling_confidence_interval,
|
1287
|
+
**init_kwargs,
|
1288
|
+
)
|
1289
|
+
else:
|
1290
|
+
if progress:
|
1291
|
+
progress.update()
|
1292
|
+
shape = (n, len(df.columns))
|
1293
|
+
if not utils.is_same_class(self, ADSDataset) and target is None:
|
1294
|
+
target = self.target.name
|
1295
|
+
|
1296
|
+
set_documentation_mode(prev_doc_mode)
|
1297
|
+
|
1298
|
+
# return a ADSDataset object if the target has been removed from the dataframe
|
1299
|
+
if target in sampled_df.columns:
|
1300
|
+
if progress:
|
1301
|
+
progress.update("Building new dataset")
|
1302
|
+
target_type = self.target.type if target_type is None else target_type
|
1303
|
+
|
1304
|
+
new_ds = get_dataset(
|
1305
|
+
df,
|
1306
|
+
sampled_df,
|
1307
|
+
target,
|
1308
|
+
target_type,
|
1309
|
+
shape,
|
1310
|
+
progress=progress,
|
1311
|
+
**init_kwargs,
|
1312
|
+
)
|
1313
|
+
|
1314
|
+
new_ds.transformer_pipeline = self._update_transformer_pipeline(
|
1315
|
+
transformers
|
1316
|
+
)
|
1317
|
+
return new_ds
|
1318
|
+
else:
|
1319
|
+
if target is not None and not isinstance(progress, DummyProgressBar):
|
1320
|
+
logger.info(
|
1321
|
+
"The target variable does not exist. Use `set_target()` to specify the target."
|
1322
|
+
)
|
1323
|
+
if progress:
|
1324
|
+
progress.update("Building the dataset with no target.")
|
1325
|
+
dsp = ADSDataset(
|
1326
|
+
df,
|
1327
|
+
sampled_df,
|
1328
|
+
shape,
|
1329
|
+
progress=progress,
|
1330
|
+
interactive=False,
|
1331
|
+
**init_kwargs,
|
1332
|
+
)
|
1333
|
+
dsp.transformer_pipeline = self._update_transformer_pipeline(transformers)
|
1334
|
+
return dsp
|
1335
|
+
|
1336
|
+
def _validate_feature(self, feature_names):
|
1337
|
+
if np.isscalar(feature_names):
|
1338
|
+
feature_names = [feature_names]
|
1339
|
+
for feature in feature_names:
|
1340
|
+
if feature not in self.df.columns:
|
1341
|
+
self._handle_key_error(feature)
|
1342
|
+
|
1343
|
+
def _update_transformer_pipeline(self, steps=[]):
|
1344
|
+
if isinstance(steps, tuple):
|
1345
|
+
steps = [steps]
|
1346
|
+
if steps is None or len(steps) == 0:
|
1347
|
+
return copy.deepcopy(self.transformer_pipeline)
|
1348
|
+
if self.transformer_pipeline is not None:
|
1349
|
+
transformer_pipeline = TransformerPipeline(
|
1350
|
+
steps=self.transformer_pipeline.steps + steps
|
1351
|
+
)
|
1352
|
+
else:
|
1353
|
+
transformer_pipeline = TransformerPipeline(steps=steps)
|
1354
|
+
return transformer_pipeline
|
1355
|
+
|
1356
|
+
def _get_unique_name(self, name):
|
1357
|
+
id = (
|
1358
|
+
uuid.uuid4().hex + "_" + datetime.datetime.utcnow().strftime("%Y%m%d%H%M%S")
|
1359
|
+
)
|
1360
|
+
if name == "":
|
1361
|
+
return id
|
1362
|
+
return name + "_" + id
|
1363
|
+
|
1364
|
+
def corr(
    self,
    correlation_methods: Union[list, str] = "pearson",
    frac: float = 1.0,
    sample_size: float = 1.0,
    nan_threshold: float = 0.8,
    overwrite: bool = None,
    force_recompute: bool = False,
):
    """
    Compute pairwise correlation of numeric and categorical columns, output a matrix or a list of matrices computed
    using the correlation methods passed in.

    Parameters
    ----------
    correlation_methods: Union[list, str], default to 'pearson'

        - 'pearson': Use Pearson's Correlation between continuous features,
        - 'cramers v': Use Cramer's V correlations between categorical features,
        - 'correlation ratio': Use Correlation Ratio Correlation between categorical and continuous features,
        - 'all': Is equivalent to ['pearson', 'cramers v', 'correlation ratio'].

        Or a list containing any combination of these methods, for example, ['pearson', 'cramers v'].
    frac:
        Is deprecated and replaced by sample_size.
    sample_size: float, defaults to 1.0. Float, Range -> (0, 1]
        What fraction of the data should be used in the calculation?
    nan_threshold: float, default to 0.8, Range -> [0, 1]
        Only compute a correlation when the proportion of the values, in a column, is less than or equal to nan_threshold.
    overwrite:
        Is deprecated and replaced by force_recompute.
    force_recompute: bool, default to be False

        - If False, it calculates the correlation matrix if there is no cached correlation matrix. Otherwise,
          it returns the cached correlation matrix.
        - If True, it calculates the correlation matrix regardless whether there is cached result or not.

    Returns
    -------
    correlation: Union[list, pandas.DataFrame]
        The pairwise correlations as a matrix (DataFrame) or list of matrices
    """
    # `frac` is a deprecated alias of `sample_size`; the deprecation helpers
    # warn and resolve which of the two values wins.
    frac = deprecate_default_value(
        frac,
        None,
        1,
        "<code>frac=None</code> is superseded by <code>sample_size=1.0</code>.",
        FutureWarning,
    )

    if frac != 1.0:
        deprecate_frac = deprecate_variable(
            frac,
            sample_size,
            "<code>frac</code> is superseded by <code>sample_size</code>.",
            DeprecationWarning,
        )
        # Only honor the deprecated `frac` when `sample_size` was left at its default.
        if sample_size == 1.0:
            sample_size = deprecate_frac

    # `overwrite` is a deprecated alias of `force_recompute`.
    force_recompute = deprecate_variable(
        overwrite,
        force_recompute,
        "<code>overwrite=None</code> is deprecated. Use <code>force_recompute</code> instead.",
        DeprecationWarning,
    )
    # Validation failures log and return None rather than raising, preserving
    # the original best-effort contract.
    if sample_size > 1 or sample_size <= 0:
        logger.error("`sample_size` must be in the range of (0, 1].")
        return
    if nan_threshold > 1 or nan_threshold < 0:
        # Message fixed: the check accepts 0 and 1, i.e. the range is inclusive.
        logger.error("`nan_threshold` must be between 0 and 1 (inclusive).")
        return
    return self._compute_correlation(
        frac=sample_size,
        threshold=nan_threshold,
        force_recompute=force_recompute,
        correlation_methods=correlation_methods,
    )
|
1442
|
+
|
1443
|
+
def _compute_correlation(
    self,
    frac=1.0,
    threshold=0.8,
    include_n_features=16,
    correlation_methods="pearson",
    force_recompute=False,
):
    """Compute correlation matrices for the requested methods.

    Returns a single DataFrame when exactly one method is requested,
    otherwise a list of DataFrames in request order.
    """
    # Normalize/validate the requested method names first.
    methods = _validate_correlation_methods(correlation_methods)

    # Optionally work on a sampled subset; a falsy frac means "use all rows".
    working_df = self.df.sample(frac=frac) if frac else self.df

    # Downstream column filtering expects the threshold as a percentage.
    pct_threshold = threshold * 100
    types_df = pd.DataFrame.from_dict(self.feature_types).T

    _, n_columns = self.shape
    if n_columns >= N_Features_Wide_Dataset and include_n_features:
        # Wide dataset: reduce to a manageable number of features before
        # computing any pairwise correlations.
        working_df, types_df = self._reduce_dim_for_wide_dataset(
            working_df, types_df, include_n_features
        )

    categorical_columns, continuous_columns, _ = _get_columns_by_type(
        types_df, threshold=pct_threshold
    )

    matrices = [
        self._return_correlation(
            working_df,
            method,
            categorical_columns,
            continuous_columns,
            force_recompute,
        )
        for method in methods
    ]
    return matrices[0] if len(matrices) == 1 else matrices
|
1492
|
+
|
1493
|
+
def _calc_pearson(self, df: pd.DataFrame, continuous_columns: list) -> pd.DataFrame:
|
1494
|
+
self._pearson = (
|
1495
|
+
df[continuous_columns].corr()
|
1496
|
+
if len(continuous_columns) > 1
|
1497
|
+
else pd.DataFrame()
|
1498
|
+
)
|
1499
|
+
return self._pearson
|
1500
|
+
|
1501
|
+
def _calc_cramers_v(
    self, df: pd.DataFrame, categorical_columns: list
) -> pd.DataFrame:
    """Compute, cache on ``self._cramers_v``, and return the Cramer's V
    correlation matrix of the given categorical columns."""
    result = _cat_vs_cat(df, categorical_columns)
    self._cramers_v = result
    return result
|
1506
|
+
|
1507
|
+
def _calc_correlation_ratio(
    self,
    df: pd.core.frame.DataFrame,
    categorical_columns: list,
    continuous_columns: list,
) -> pd.DataFrame:
    """Compute, cache on ``self._correlation_ratio``, and return the
    correlation-ratio matrix between categorical and continuous columns."""
    result = _cat_vs_cts(df, categorical_columns, continuous_columns)
    self._correlation_ratio = result
    return result
|
1517
|
+
|
1518
|
+
def _return_correlation(
|
1519
|
+
self,
|
1520
|
+
corr_df,
|
1521
|
+
method,
|
1522
|
+
categorical_columns,
|
1523
|
+
continuous_columns,
|
1524
|
+
force_recompute,
|
1525
|
+
):
|
1526
|
+
if not force_recompute and hasattr(self, "_" + "_".join(method.split())):
|
1527
|
+
logger.info(
|
1528
|
+
f"Using cached results for {method} correlation. Use"
|
1529
|
+
" `force_recompute=True` to override."
|
1530
|
+
)
|
1531
|
+
return getattr(self, "_" + "_".join(method.split()))
|
1532
|
+
else:
|
1533
|
+
if method == "pearson":
|
1534
|
+
self._calc_pearson(corr_df, continuous_columns)
|
1535
|
+
return self._pearson
|
1536
|
+
elif method == "cramers v":
|
1537
|
+
self._calc_cramers_v(corr_df, categorical_columns)
|
1538
|
+
return self._cramers_v
|
1539
|
+
elif method == "correlation ratio":
|
1540
|
+
self._calc_correlation_ratio(
|
1541
|
+
corr_df, categorical_columns, continuous_columns
|
1542
|
+
)
|
1543
|
+
return self._correlation_ratio
|
1544
|
+
else:
|
1545
|
+
raise ValueError(f"The {method} method is not supported.")
|
1546
|
+
|
1547
|
+
@runtime_dependency(module="IPython", install_from=OptionalDependency.NOTEBOOK)
def _reduce_dim_for_wide_dataset(
    self, corr_df: pd.DataFrame, feature_types_df: pd.DataFrame, include_n_features
):
    """Reduce a wide dataset to at most ``include_n_features`` columns
    before correlation computation.

    With a target set, features are chosen by mutual information via
    ``self._find_feature_subset``; otherwise the first N eligible
    (categorical/ordinal/continuous/zipcode) columns are kept.

    Returns the reduced ``(corr_df, feature_types_df)`` pair, or
    ``(None, None)`` when too few CPU cores are available.
    """
    min_cores_for_correlation = 2
    _, n_columns = self.shape

    # `display`/`HTML` moved out of the deprecated IPython.core.display location.
    from IPython.display import HTML, display

    if utils.get_cpu_count() <= min_cores_for_correlation:
        msg = (
            f"Not attempting to calculate correlations, too few cores ({utils.get_cpu_count()}) "
            f"for wide dataset ({n_columns} columns)"
        )
        display(HTML(f"<li>{msg}</li>"))
        return None, None

    display(HTML(f"<li>detected wide dataset ({n_columns} columns)</li>"))

    if "target" in self.__dict__:
        display(
            HTML(
                f"<li>feature reduction using mutual information (max {include_n_features} columns)</li>"
            )
        )
        logger.info("Set `include_n_features=None` to include all features.")
        corr_sampled_df = self._find_feature_subset(
            self.sampled_df, self.target.name, include_n_features=include_n_features
        )
        corr_df, feature_types_df = self._update_dataframes(
            corr_sampled_df, corr_df, feature_types_df
        )
    else:
        #
        # in the absence of a target we simply use the first_n
        #
        # Fixed: the two adjacent literals previously concatenated to
        # "featureimportance" (missing space).
        logger.info(
            f"To include the first {include_n_features} features based on the feature "
            "importance, use `.set_target`()."
        )
        feature_types_df = feature_types_df[
            (feature_types_df.index.isin(corr_df.columns.values))
            & feature_types_df.type.isin(
                ["categorical", "ordinal", "continuous", "zipcode"]
            )
        ]
        corr_df = corr_df[feature_types_df.index[:include_n_features]]
        feature_types_df = feature_types_df.iloc[:include_n_features, :]
    return corr_df, feature_types_df
|
1596
|
+
|
1597
|
+
def _update_dataframes(self, corr_sampled_df, corr_df, feature_types_df):
|
1598
|
+
"""
|
1599
|
+
update the dataframe and feature types based on the reduced dataframe
|
1600
|
+
"""
|
1601
|
+
cols = corr_sampled_df.columns.tolist()
|
1602
|
+
cols.insert(0, cols.pop(cols.index(self.target.name)))
|
1603
|
+
corr_df_reduced = corr_df[[*cols]]
|
1604
|
+
feature_types_df_reduced = feature_types_df[feature_types_df.index.isin(cols)]
|
1605
|
+
return corr_df_reduced, feature_types_df_reduced
|
1606
|
+
|
1607
|
+
def show_corr(
    self,
    frac: float = 1.0,
    sample_size: float = 1.0,
    nan_threshold: float = 0.8,
    overwrite: bool = None,
    force_recompute: bool = False,
    correlation_target: str = None,
    plot_type: str = "heatmap",
    correlation_threshold: float = -1,
    correlation_methods="pearson",
    **kwargs,
):
    """
    Show heatmap or barplot of pairwise correlation of numeric and categorical columns, output three tabs
    which are heatmap or barplot of correlation matrix of numeric columns vs numeric columns using pearson
    correlation method, categorical columns vs categorical columns using Cramer's V method,
    and numeric vs categorical columns, excluding NA/null values and columns which have more than
    80% of NA/null values. By default, only 'pearson' correlation is calculated and shown in the first tab.
    Set correlation_methods='all' to show all correlation charts.

    Parameters
    ----------
    frac: Is superseded by sample_size
    sample_size: float, defaults to 1.0. Float, Range -> (0, 1]
        What fraction of the data should be used in the calculation?
    nan_threshold: float, defaults to 0.8, Range -> [0, 1]
        In the default case, it will only calculate the correlation of the columns which has less than or equal to
        80% of missing values.
    overwrite:
        Is deprecated and replaced by force_recompute.
    force_recompute: bool, default to be False.

        - If False, it calculates the correlation matrix if there is no cached correlation matrix. Otherwise,
          it returns the cached correlation matrix.
        - If True, it calculates the correlation matrix regardless whether there is cached result or not.

    plot_type: str, default to "heatmap"
        It can only be "heatmap" or "bar". Note that if "bar" is chosen, correlation_target also has to be set and
        the bar chart will only show the correlation values of the pairs which have the target in them.
    correlation_target: str, default to None
        It can be any columns of type continuous, ordinal, categorical or zipcode. When correlation_target is set,
        only pairs that contains correlation_target will show.
    correlation_threshold: float, default to -1
        It can be any number between -1 and 1.
    correlation_methods: Union[list, str], defaults to 'pearson'

        - 'pearson': Use Pearson's Correlation between continuous features,
        - 'cramers v': Use Cramer's V correlations between categorical features,
        - 'correlation ratio': Use Correlation Ratio Correlation between categorical and continuous features,
        - 'all': Is equivalent to ['pearson', 'cramers v', 'correlation ratio'].

        Or a list containing any combination of these methods, for example, ['pearson', 'cramers v'].

    Returns
    -------
    None
    """
    # `frac` is a deprecated alias of `sample_size`.
    frac = deprecate_default_value(
        frac,
        None,
        1,
        "<code>frac=None</code> is superseded by <code>sample_size=1.0</code>.",
        FutureWarning,
    )
    if frac != 1.0:
        deprecate_frac = deprecate_variable(
            frac,
            sample_size,
            "<code>frac</code> is deprecated. Use <code>sample_size</code> instead.",
            DeprecationWarning,
        )
        # Only honor the deprecated `frac` when `sample_size` was left at its default.
        if sample_size == 1.0:
            sample_size = deprecate_frac

    # Features eligible to be a correlation target.
    feature_types_df = pd.DataFrame.from_dict(self.feature_types).loc["type", :]
    features_list = list(
        feature_types_df[
            feature_types_df.isin(
                ["categorical", "zipcode", "continuous", "ordinal"]
            )
        ].index
    )
    if plot_type not in ["heatmap", "bar"]:
        raise ValueError('plot_type has to be "heatmap" or "bar"')

    if plot_type == "bar" and correlation_target is None:
        raise ValueError('correlation_target has to be set when plot_type="bar".')

    if correlation_target:
        if correlation_target not in features_list:
            raise ValueError(
                "correlation_target has to be in {}.".format(features_list)
            )

    # `overwrite` is a deprecated alias of `force_recompute`.
    force_recompute = deprecate_variable(
        overwrite,
        force_recompute,
        "<code>overwrite=None</code> is deprecated. Use <code>force_recompute</code> instead.",
        DeprecationWarning,
    )

    plot_correlation_heatmap(
        ds=self,
        frac=sample_size,
        force_recompute=force_recompute,
        correlation_target=correlation_target,
        plot_type=plot_type,
        correlation_threshold=correlation_threshold,
        nan_threshold=nan_threshold,
        correlation_methods=correlation_methods,
        **kwargs,
    )
|
1720
|
+
|
1721
|
+
@runtime_dependency(module="IPython", install_from=OptionalDependency.NOTEBOOK)
@runtime_dependency(module="ipywidgets", install_from=OptionalDependency.NOTEBOOK)
def show_in_notebook(
    self,
    correlation_threshold=-1,
    selected_index=0,
    sample_size=0,
    visualize_features=True,
    correlation_methods="pearson",
    **kwargs,
):
    """
    Provide visualization of dataset.

    - Display feature distribution. The data table display will show a maximum of 8 digits,
    - Plot the correlation between the dataset features (as a heatmap) only when all the features are
      continuous or ordinal,
    - Display data head.

    Parameters
    ----------
    correlation_threshold : int, default -1
        The correlation threshold to select, which only show features that have larger or equal
        correlation values than the threshold.
    selected_index: int, str, default 0
        The displayed output is stacked into an accordion widget, use selected_index to force the display to open
        a specific element, use the (zero offset) index or any prefix string of the name (eg, 'corr' for
        correlations)
    sample_size: int, default 0
        The size (in rows) to sample for visualizations. 0 means use the
        pre-computed ``self.sampled_df`` as-is.
    visualize_features: bool, default True
        For the "Features" section control if feature visualizations are shown or not. If not only
        a summary of the numeric statistics is shown. The numeric statistics are also always shows
        for wide (>64 features) datasets
    correlation_methods: Union[list, str], default to 'pearson'

        - 'pearson': Use Pearson's Correlation between continuous features,
        - 'cramers v': Use Cramer's V correlations between categorical features,
        - 'correlation ratio': Use Correlation Ratio Correlation between categorical and continuous features,
        - 'all': Is equivalent to ['pearson', 'cramers v', 'correlation ratio'].

        Or a list containing any combination of these methods, for example, ['pearson', 'cramers v'].
    """

    # Widgets only render in a notebook — bail out early elsewhere.
    if not utils.is_notebook():
        print("show_in_notebook called but not in notebook environment")
        return

    n_rows, n_columns = self.shape

    # Pick the subset of rows used for the visualizations.
    # NOTE(review): when sample_size is non-zero but below min_sample_size,
    # sub_samp_size is forced up to min_sample_size, which can exceed
    # len(self.sampled_df) and make .sample(n=...) raise — confirm intent.
    min_sample_size = 10000
    if sample_size == 0:
        sub_samp_size = len(self.sampled_df)
        sub_samp_df = self.sampled_df
    else:
        sub_samp_size = max(min(sample_size, len(self.sampled_df)), min_sample_size)
        sub_samp_df = self.sampled_df.sample(n=sub_samp_size)

    # Build the "Summary" tab HTML.
    html_summary = ""
    if self.name:
        html_summary += "<h1>Name: %s</h1>" % (self.name)

    # dataset type (problem type)
    html_summary += "<h3>Type: %s</h3>" % self.__class__.__name__

    if self.description:
        html_summary += "<pre>%s</pre>" % self.description
    html_summary += "<hr>"

    html_summary += "<h3>{:,} Rows, {:,} Columns</h3>".format(n_rows, n_columns)
    html_summary += "<h4>Column Types:</h4><UL>"

    # Count features per meta-type, most common first.
    for group in Counter(
        [self.feature_types[k].meta_data["type"] for k in self.feature_types]
    ).most_common():
        html_summary += "<LI><b>%s:</b> %d features" % (group[0], group[1])

    html_summary += "</UL>"

    html_summary += """
    <p><b>
    Note: Visualizations use a sampled subset of the dataset, this is to
    improve plotting performance. The sample size is calculated to be statistically
    significant within the confidence level: {} and confidence interval: {}.

    The sampled data has {:,} rows
    </b>
    </p>

    <ul>
      <li>The confidence <i>level</i> refers to the long-term success rate of the
      method, that is, how often this type of interval will capture the parameter
      of interest.
      </li>

      <li>A specific confidence <i>interval</i> gives a range of plausible values for
      the parameter of interest
      </li>
    </ul>

    """.format(
        DatasetDefaults.sampling_confidence_level,
        DatasetDefaults.sampling_confidence_interval,
        sub_samp_df.shape[0],
    )

    html_summary += "</UL>"

    from ipywidgets import widgets

    summary = widgets.HTML(html_summary)

    # Placeholder widgets for the other accordion tabs; filled in below.
    features = widgets.HTML()
    correlations = widgets.Output()
    warningz = widgets.HTML()

    warningz.value = "Analyzing for warnings..."
    features.value = "Calculating full statistical info..."

    # with correlations:
    #     display(HTML("<li>calculating...</li>"))

    accordion = widgets.Accordion(
        children=[summary, features, correlations, warningz]
    )
    accordion.set_title(0, "Summary")
    accordion.set_title(1, "Features")
    accordion.set_title(2, "Correlations")
    accordion.set_title(3, "Warnings")

    if isinstance(selected_index, str):
        # lookup by title
        possible_titles = [
            accordion.get_title(i) for i in range(len(accordion.children))
        ]
        for i, title in enumerate(possible_titles):
            if title.lower().startswith(selected_index.lower()):
                selected_index = i
                break

        if isinstance(selected_index, str):
            # failed to match a title
            logger.info(
                "`selected_index` should be one of: {}.".format(
                    ", ".join(possible_titles)
                )
            )
            selected_index = 0

    accordion.selected_index = selected_index

    is_wide_dataset = n_columns >= N_Features_Wide_Dataset

    #
    # set up dataframe to use for correlation calculations
    #

    self.df_stats = self._calculate_dataset_statistics(
        is_wide_dataset, [features, warningz]
    )

    # Render the correlation charts into the "Correlations" output widget.
    with correlations:
        feature_types_df = pd.DataFrame.from_dict(self.feature_types).loc["type", :]
        if not is_wide_dataset:
            # Drop columns that are entirely missing.
            feature_types_df = feature_types_df[
                self.df_stats["missing"] < len(self.df)
            ]

        # Pop show_corr's options out of **kwargs so the remainder can be
        # forwarded untouched; order of pops matters for kwargs consumption.
        frac = kwargs.pop("frac", 1.0)
        overwrite = kwargs.pop("overwrite", None)
        force_recompute = kwargs.pop("force_recompute", False)
        force_recompute = deprecate_variable(
            overwrite,
            force_recompute,
            f"<code>overwrite=None</code> is deprecated. Use <code>force_recompute</code> instead.",
            DeprecationWarning,
        )
        plot_type = kwargs.pop("plot_type", "heatmap")
        correlation_target = kwargs.pop("correlation_target", None)
        nan_threshold = kwargs.pop("nan_threshold", 0.8)
        self.show_corr(
            correlation_threshold=correlation_threshold,
            sample_size=frac,
            force_recompute=force_recompute,
            plot_type=plot_type,
            correlation_target=correlation_target,
            nan_threshold=nan_threshold,
            correlation_methods=correlation_methods,
            **kwargs,
        )

    from IPython.core.display import display

    display(accordion)

    # generate html for feature_distribution & warnings

    accordion.set_title(
        1, f"Features ({n_columns})"
    )  # adjust for datasets with target

    #
    # compute missing value statistics
    # not done for wide datasets
    #

    features.value = self._generate_features_html(
        is_wide_dataset,
        n_columns,
        self.df_stats,
        visualizations_follow=bool(visualize_features),
    )

    warningz.value = self._generate_warnings_html(
        is_wide_dataset, n_rows, n_columns, self.df_stats, warningz, accordion
    )

    if visualize_features and not is_wide_dataset:
        self._visualize_feature_distribution(features)
|
1940
|
+
|
1941
|
+
def get_recommendations(self, *args, **kwargs):  # real signature may change
    """
    Raise a user-friendly error: a target variable must be set (via
    ``set_target()``) before this API can be used.

    Parameters
    ----------
    kwargs

    Returns
    -------
    NotImplementedError
        raises NotImplementedError, if target parameter value not provided

    """
    message = (
        "Please set the target using set_target() before invoking this API. See "
        "https://accelerated-data-science.readthedocs.io/en/latest/ads.dataset.html#ads.dataset.dataset.ADSDataset.set_target "
        "for the API usage."
    )
    raise NotImplementedError(message)
|
1960
|
+
|
1961
|
+
def suggest_recommendations(self, *args, **kwargs):  # real signature may change
    """
    Raise a user-friendly error: a target variable must be set (via
    ``set_target()``) before this API can be used.

    Parameters
    ----------
    kwargs

    Returns
    -------
    NotImplementedError
        raises NotImplementedError, if target parameter value not provided

    """
    message = (
        "Please set the target using set_target() before invoking this API. See "
        "https://accelerated-data-science.readthedocs.io/en/latest/ads.dataset.html#ads.dataset.dataset.ADSDataset.set_target "
        "for the API usage."
    )
    raise NotImplementedError(message)
|