oracle-ads 2.13.9rc0__py3-none-any.whl → 2.13.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ads/aqua/__init__.py +40 -0
- ads/aqua/app.py +507 -0
- ads/aqua/cli.py +96 -0
- ads/aqua/client/__init__.py +3 -0
- ads/aqua/client/client.py +836 -0
- ads/aqua/client/openai_client.py +305 -0
- ads/aqua/common/__init__.py +5 -0
- ads/aqua/common/decorator.py +125 -0
- ads/aqua/common/entities.py +274 -0
- ads/aqua/common/enums.py +134 -0
- ads/aqua/common/errors.py +109 -0
- ads/aqua/common/utils.py +1295 -0
- ads/aqua/config/__init__.py +4 -0
- ads/aqua/config/container_config.py +246 -0
- ads/aqua/config/evaluation/__init__.py +4 -0
- ads/aqua/config/evaluation/evaluation_service_config.py +147 -0
- ads/aqua/config/utils/__init__.py +4 -0
- ads/aqua/config/utils/serializer.py +339 -0
- ads/aqua/constants.py +116 -0
- ads/aqua/data.py +14 -0
- ads/aqua/dummy_data/icon.txt +1 -0
- ads/aqua/dummy_data/oci_model_deployments.json +56 -0
- ads/aqua/dummy_data/oci_models.json +1 -0
- ads/aqua/dummy_data/readme.md +26 -0
- ads/aqua/evaluation/__init__.py +8 -0
- ads/aqua/evaluation/constants.py +53 -0
- ads/aqua/evaluation/entities.py +186 -0
- ads/aqua/evaluation/errors.py +70 -0
- ads/aqua/evaluation/evaluation.py +1814 -0
- ads/aqua/extension/__init__.py +42 -0
- ads/aqua/extension/aqua_ws_msg_handler.py +76 -0
- ads/aqua/extension/base_handler.py +90 -0
- ads/aqua/extension/common_handler.py +121 -0
- ads/aqua/extension/common_ws_msg_handler.py +36 -0
- ads/aqua/extension/deployment_handler.py +381 -0
- ads/aqua/extension/deployment_ws_msg_handler.py +54 -0
- ads/aqua/extension/errors.py +30 -0
- ads/aqua/extension/evaluation_handler.py +129 -0
- ads/aqua/extension/evaluation_ws_msg_handler.py +61 -0
- ads/aqua/extension/finetune_handler.py +96 -0
- ads/aqua/extension/model_handler.py +390 -0
- ads/aqua/extension/models/__init__.py +0 -0
- ads/aqua/extension/models/ws_models.py +145 -0
- ads/aqua/extension/models_ws_msg_handler.py +50 -0
- ads/aqua/extension/ui_handler.py +300 -0
- ads/aqua/extension/ui_websocket_handler.py +130 -0
- ads/aqua/extension/utils.py +133 -0
- ads/aqua/finetuning/__init__.py +7 -0
- ads/aqua/finetuning/constants.py +23 -0
- ads/aqua/finetuning/entities.py +181 -0
- ads/aqua/finetuning/finetuning.py +749 -0
- ads/aqua/model/__init__.py +8 -0
- ads/aqua/model/constants.py +60 -0
- ads/aqua/model/entities.py +385 -0
- ads/aqua/model/enums.py +32 -0
- ads/aqua/model/model.py +2134 -0
- ads/aqua/model/utils.py +52 -0
- ads/aqua/modeldeployment/__init__.py +6 -0
- ads/aqua/modeldeployment/constants.py +10 -0
- ads/aqua/modeldeployment/deployment.py +1315 -0
- ads/aqua/modeldeployment/entities.py +653 -0
- ads/aqua/modeldeployment/utils.py +543 -0
- ads/aqua/resources/gpu_shapes_index.json +94 -0
- ads/aqua/server/__init__.py +4 -0
- ads/aqua/server/__main__.py +24 -0
- ads/aqua/server/app.py +47 -0
- ads/aqua/server/aqua_spec.yml +1291 -0
- ads/aqua/training/__init__.py +4 -0
- ads/aqua/training/exceptions.py +476 -0
- ads/aqua/ui.py +519 -0
- ads/automl/__init__.py +9 -0
- ads/automl/driver.py +330 -0
- ads/automl/provider.py +975 -0
- ads/bds/__init__.py +5 -0
- ads/bds/auth.py +127 -0
- ads/bds/big_data_service.py +255 -0
- ads/catalog/__init__.py +19 -0
- ads/catalog/model.py +1576 -0
- ads/catalog/notebook.py +461 -0
- ads/catalog/project.py +468 -0
- ads/catalog/summary.py +178 -0
- ads/common/__init__.py +11 -0
- ads/common/analyzer.py +65 -0
- ads/common/artifact/.model-ignore +63 -0
- ads/common/artifact/__init__.py +10 -0
- ads/common/auth.py +1122 -0
- ads/common/card_identifier.py +83 -0
- ads/common/config.py +647 -0
- ads/common/data.py +165 -0
- ads/common/decorator/__init__.py +9 -0
- ads/common/decorator/argument_to_case.py +88 -0
- ads/common/decorator/deprecate.py +69 -0
- ads/common/decorator/require_nonempty_arg.py +65 -0
- ads/common/decorator/runtime_dependency.py +178 -0
- ads/common/decorator/threaded.py +97 -0
- ads/common/decorator/utils.py +35 -0
- ads/common/dsc_file_system.py +303 -0
- ads/common/error.py +14 -0
- ads/common/extended_enum.py +81 -0
- ads/common/function/__init__.py +5 -0
- ads/common/function/fn_util.py +142 -0
- ads/common/function/func_conf.yaml +25 -0
- ads/common/ipython.py +76 -0
- ads/common/model.py +679 -0
- ads/common/model_artifact.py +1759 -0
- ads/common/model_artifact_schema.json +107 -0
- ads/common/model_export_util.py +664 -0
- ads/common/model_metadata.py +24 -0
- ads/common/object_storage_details.py +296 -0
- ads/common/oci_client.py +179 -0
- ads/common/oci_datascience.py +46 -0
- ads/common/oci_logging.py +1144 -0
- ads/common/oci_mixin.py +957 -0
- ads/common/oci_resource.py +136 -0
- ads/common/serializer.py +559 -0
- ads/common/utils.py +1852 -0
- ads/common/word_lists.py +1491 -0
- ads/common/work_request.py +189 -0
- ads/config.py +1 -0
- ads/data_labeling/__init__.py +13 -0
- ads/data_labeling/boundingbox.py +253 -0
- ads/data_labeling/constants.py +47 -0
- ads/data_labeling/data_labeling_service.py +244 -0
- ads/data_labeling/interface/__init__.py +5 -0
- ads/data_labeling/interface/loader.py +16 -0
- ads/data_labeling/interface/parser.py +16 -0
- ads/data_labeling/interface/reader.py +23 -0
- ads/data_labeling/loader/__init__.py +5 -0
- ads/data_labeling/loader/file_loader.py +241 -0
- ads/data_labeling/metadata.py +110 -0
- ads/data_labeling/mixin/__init__.py +5 -0
- ads/data_labeling/mixin/data_labeling.py +232 -0
- ads/data_labeling/ner.py +129 -0
- ads/data_labeling/parser/__init__.py +5 -0
- ads/data_labeling/parser/dls_record_parser.py +388 -0
- ads/data_labeling/parser/export_metadata_parser.py +94 -0
- ads/data_labeling/parser/export_record_parser.py +473 -0
- ads/data_labeling/reader/__init__.py +5 -0
- ads/data_labeling/reader/dataset_reader.py +574 -0
- ads/data_labeling/reader/dls_record_reader.py +121 -0
- ads/data_labeling/reader/export_record_reader.py +62 -0
- ads/data_labeling/reader/jsonl_reader.py +75 -0
- ads/data_labeling/reader/metadata_reader.py +203 -0
- ads/data_labeling/reader/record_reader.py +263 -0
- ads/data_labeling/record.py +52 -0
- ads/data_labeling/visualizer/__init__.py +5 -0
- ads/data_labeling/visualizer/image_visualizer.py +525 -0
- ads/data_labeling/visualizer/text_visualizer.py +357 -0
- ads/database/__init__.py +5 -0
- ads/database/connection.py +338 -0
- ads/dataset/__init__.py +10 -0
- ads/dataset/capabilities.md +51 -0
- ads/dataset/classification_dataset.py +339 -0
- ads/dataset/correlation.py +226 -0
- ads/dataset/correlation_plot.py +563 -0
- ads/dataset/dask_series.py +173 -0
- ads/dataset/dataframe_transformer.py +110 -0
- ads/dataset/dataset.py +1979 -0
- ads/dataset/dataset_browser.py +360 -0
- ads/dataset/dataset_with_target.py +995 -0
- ads/dataset/exception.py +25 -0
- ads/dataset/factory.py +987 -0
- ads/dataset/feature_engineering_transformer.py +35 -0
- ads/dataset/feature_selection.py +107 -0
- ads/dataset/forecasting_dataset.py +26 -0
- ads/dataset/helper.py +1450 -0
- ads/dataset/label_encoder.py +99 -0
- ads/dataset/mixin/__init__.py +5 -0
- ads/dataset/mixin/dataset_accessor.py +134 -0
- ads/dataset/pipeline.py +58 -0
- ads/dataset/plot.py +710 -0
- ads/dataset/progress.py +86 -0
- ads/dataset/recommendation.py +297 -0
- ads/dataset/recommendation_transformer.py +502 -0
- ads/dataset/regression_dataset.py +14 -0
- ads/dataset/sampled_dataset.py +1050 -0
- ads/dataset/target.py +98 -0
- ads/dataset/timeseries.py +18 -0
- ads/dbmixin/__init__.py +5 -0
- ads/dbmixin/db_pandas_accessor.py +153 -0
- ads/environment/__init__.py +9 -0
- ads/environment/ml_runtime.py +66 -0
- ads/evaluations/README.md +14 -0
- ads/evaluations/__init__.py +109 -0
- ads/evaluations/evaluation_plot.py +983 -0
- ads/evaluations/evaluator.py +1334 -0
- ads/evaluations/statistical_metrics.py +543 -0
- ads/experiments/__init__.py +9 -0
- ads/experiments/capabilities.md +0 -0
- ads/explanations/__init__.py +21 -0
- ads/explanations/base_explainer.py +142 -0
- ads/explanations/capabilities.md +83 -0
- ads/explanations/explainer.py +190 -0
- ads/explanations/mlx_global_explainer.py +1050 -0
- ads/explanations/mlx_interface.py +386 -0
- ads/explanations/mlx_local_explainer.py +287 -0
- ads/explanations/mlx_whatif_explainer.py +201 -0
- ads/feature_engineering/__init__.py +20 -0
- ads/feature_engineering/accessor/__init__.py +5 -0
- ads/feature_engineering/accessor/dataframe_accessor.py +535 -0
- ads/feature_engineering/accessor/mixin/__init__.py +5 -0
- ads/feature_engineering/accessor/mixin/correlation.py +166 -0
- ads/feature_engineering/accessor/mixin/eda_mixin.py +266 -0
- ads/feature_engineering/accessor/mixin/eda_mixin_series.py +85 -0
- ads/feature_engineering/accessor/mixin/feature_types_mixin.py +211 -0
- ads/feature_engineering/accessor/mixin/utils.py +65 -0
- ads/feature_engineering/accessor/series_accessor.py +431 -0
- ads/feature_engineering/adsimage/__init__.py +5 -0
- ads/feature_engineering/adsimage/image.py +192 -0
- ads/feature_engineering/adsimage/image_reader.py +170 -0
- ads/feature_engineering/adsimage/interface/__init__.py +5 -0
- ads/feature_engineering/adsimage/interface/reader.py +19 -0
- ads/feature_engineering/adsstring/__init__.py +7 -0
- ads/feature_engineering/adsstring/oci_language/__init__.py +8 -0
- ads/feature_engineering/adsstring/string/__init__.py +8 -0
- ads/feature_engineering/data_schema.json +57 -0
- ads/feature_engineering/dataset/__init__.py +5 -0
- ads/feature_engineering/dataset/zip_code_data.py +42062 -0
- ads/feature_engineering/exceptions.py +40 -0
- ads/feature_engineering/feature_type/__init__.py +133 -0
- ads/feature_engineering/feature_type/address.py +184 -0
- ads/feature_engineering/feature_type/adsstring/__init__.py +5 -0
- ads/feature_engineering/feature_type/adsstring/common_regex_mixin.py +164 -0
- ads/feature_engineering/feature_type/adsstring/oci_language.py +93 -0
- ads/feature_engineering/feature_type/adsstring/parsers/__init__.py +5 -0
- ads/feature_engineering/feature_type/adsstring/parsers/base.py +47 -0
- ads/feature_engineering/feature_type/adsstring/parsers/nltk_parser.py +96 -0
- ads/feature_engineering/feature_type/adsstring/parsers/spacy_parser.py +221 -0
- ads/feature_engineering/feature_type/adsstring/string.py +258 -0
- ads/feature_engineering/feature_type/base.py +58 -0
- ads/feature_engineering/feature_type/boolean.py +183 -0
- ads/feature_engineering/feature_type/category.py +146 -0
- ads/feature_engineering/feature_type/constant.py +137 -0
- ads/feature_engineering/feature_type/continuous.py +151 -0
- ads/feature_engineering/feature_type/creditcard.py +314 -0
- ads/feature_engineering/feature_type/datetime.py +190 -0
- ads/feature_engineering/feature_type/discrete.py +134 -0
- ads/feature_engineering/feature_type/document.py +43 -0
- ads/feature_engineering/feature_type/gis.py +251 -0
- ads/feature_engineering/feature_type/handler/__init__.py +5 -0
- ads/feature_engineering/feature_type/handler/feature_validator.py +524 -0
- ads/feature_engineering/feature_type/handler/feature_warning.py +319 -0
- ads/feature_engineering/feature_type/handler/warnings.py +128 -0
- ads/feature_engineering/feature_type/integer.py +142 -0
- ads/feature_engineering/feature_type/ip_address.py +144 -0
- ads/feature_engineering/feature_type/ip_address_v4.py +138 -0
- ads/feature_engineering/feature_type/ip_address_v6.py +138 -0
- ads/feature_engineering/feature_type/lat_long.py +256 -0
- ads/feature_engineering/feature_type/object.py +43 -0
- ads/feature_engineering/feature_type/ordinal.py +132 -0
- ads/feature_engineering/feature_type/phone_number.py +135 -0
- ads/feature_engineering/feature_type/string.py +171 -0
- ads/feature_engineering/feature_type/text.py +93 -0
- ads/feature_engineering/feature_type/unknown.py +43 -0
- ads/feature_engineering/feature_type/zip_code.py +164 -0
- ads/feature_engineering/feature_type_manager.py +406 -0
- ads/feature_engineering/schema.py +795 -0
- ads/feature_engineering/utils.py +245 -0
- ads/feature_store/.readthedocs.yaml +19 -0
- ads/feature_store/README.md +65 -0
- ads/feature_store/__init__.py +9 -0
- ads/feature_store/common/__init__.py +0 -0
- ads/feature_store/common/enums.py +339 -0
- ads/feature_store/common/exceptions.py +18 -0
- ads/feature_store/common/spark_session_singleton.py +125 -0
- ads/feature_store/common/utils/__init__.py +0 -0
- ads/feature_store/common/utils/base64_encoder_decoder.py +72 -0
- ads/feature_store/common/utils/feature_schema_mapper.py +283 -0
- ads/feature_store/common/utils/transformation_utils.py +82 -0
- ads/feature_store/common/utils/utility.py +403 -0
- ads/feature_store/data_validation/__init__.py +0 -0
- ads/feature_store/data_validation/great_expectation.py +129 -0
- ads/feature_store/dataset.py +1230 -0
- ads/feature_store/dataset_job.py +530 -0
- ads/feature_store/docs/Dockerfile +7 -0
- ads/feature_store/docs/Makefile +44 -0
- ads/feature_store/docs/conf.py +28 -0
- ads/feature_store/docs/requirements.txt +14 -0
- ads/feature_store/docs/source/ads.feature_store.query.rst +20 -0
- ads/feature_store/docs/source/cicd.rst +137 -0
- ads/feature_store/docs/source/conf.py +86 -0
- ads/feature_store/docs/source/data_versioning.rst +33 -0
- ads/feature_store/docs/source/dataset.rst +388 -0
- ads/feature_store/docs/source/dataset_job.rst +27 -0
- ads/feature_store/docs/source/demo.rst +70 -0
- ads/feature_store/docs/source/entity.rst +78 -0
- ads/feature_store/docs/source/feature_group.rst +624 -0
- ads/feature_store/docs/source/feature_group_job.rst +29 -0
- ads/feature_store/docs/source/feature_store.rst +122 -0
- ads/feature_store/docs/source/feature_store_class.rst +123 -0
- ads/feature_store/docs/source/feature_validation.rst +66 -0
- ads/feature_store/docs/source/figures/cicd.png +0 -0
- ads/feature_store/docs/source/figures/data_validation.png +0 -0
- ads/feature_store/docs/source/figures/data_versioning.png +0 -0
- ads/feature_store/docs/source/figures/dataset.gif +0 -0
- ads/feature_store/docs/source/figures/dataset.png +0 -0
- ads/feature_store/docs/source/figures/dataset_lineage.png +0 -0
- ads/feature_store/docs/source/figures/dataset_statistics.png +0 -0
- ads/feature_store/docs/source/figures/dataset_statistics_viz.png +0 -0
- ads/feature_store/docs/source/figures/dataset_validation_results.png +0 -0
- ads/feature_store/docs/source/figures/dataset_validation_summary.png +0 -0
- ads/feature_store/docs/source/figures/drift_monitoring.png +0 -0
- ads/feature_store/docs/source/figures/entity.png +0 -0
- ads/feature_store/docs/source/figures/feature_group.png +0 -0
- ads/feature_store/docs/source/figures/feature_group_lineage.png +0 -0
- ads/feature_store/docs/source/figures/feature_group_statistics_viz.png +0 -0
- ads/feature_store/docs/source/figures/feature_store_deployment.png +0 -0
- ads/feature_store/docs/source/figures/feature_store_overview.png +0 -0
- ads/feature_store/docs/source/figures/featuregroup.gif +0 -0
- ads/feature_store/docs/source/figures/lineage_d1.png +0 -0
- ads/feature_store/docs/source/figures/lineage_d2.png +0 -0
- ads/feature_store/docs/source/figures/lineage_fg.png +0 -0
- ads/feature_store/docs/source/figures/logo-dark-mode.png +0 -0
- ads/feature_store/docs/source/figures/logo-light-mode.png +0 -0
- ads/feature_store/docs/source/figures/overview.png +0 -0
- ads/feature_store/docs/source/figures/resource_manager.png +0 -0
- ads/feature_store/docs/source/figures/resource_manager_feature_store_stack.png +0 -0
- ads/feature_store/docs/source/figures/resource_manager_home.png +0 -0
- ads/feature_store/docs/source/figures/stats_1.png +0 -0
- ads/feature_store/docs/source/figures/stats_2.png +0 -0
- ads/feature_store/docs/source/figures/stats_d.png +0 -0
- ads/feature_store/docs/source/figures/stats_fg.png +0 -0
- ads/feature_store/docs/source/figures/transformation.png +0 -0
- ads/feature_store/docs/source/figures/transformations.gif +0 -0
- ads/feature_store/docs/source/figures/validation.png +0 -0
- ads/feature_store/docs/source/figures/validation_fg.png +0 -0
- ads/feature_store/docs/source/figures/validation_results.png +0 -0
- ads/feature_store/docs/source/figures/validation_summary.png +0 -0
- ads/feature_store/docs/source/index.rst +81 -0
- ads/feature_store/docs/source/module.rst +8 -0
- ads/feature_store/docs/source/notebook.rst +94 -0
- ads/feature_store/docs/source/overview.rst +47 -0
- ads/feature_store/docs/source/quickstart.rst +176 -0
- ads/feature_store/docs/source/release_notes.rst +194 -0
- ads/feature_store/docs/source/setup_feature_store.rst +81 -0
- ads/feature_store/docs/source/statistics.rst +58 -0
- ads/feature_store/docs/source/transformation.rst +199 -0
- ads/feature_store/docs/source/ui.rst +65 -0
- ads/feature_store/docs/source/user_guides.setup.feature_store_operator.rst +66 -0
- ads/feature_store/docs/source/user_guides.setup.helm_chart.rst +192 -0
- ads/feature_store/docs/source/user_guides.setup.terraform.rst +338 -0
- ads/feature_store/entity.py +718 -0
- ads/feature_store/execution_strategy/__init__.py +0 -0
- ads/feature_store/execution_strategy/delta_lake/__init__.py +0 -0
- ads/feature_store/execution_strategy/delta_lake/delta_lake_service.py +375 -0
- ads/feature_store/execution_strategy/engine/__init__.py +0 -0
- ads/feature_store/execution_strategy/engine/spark_engine.py +316 -0
- ads/feature_store/execution_strategy/execution_strategy.py +113 -0
- ads/feature_store/execution_strategy/execution_strategy_provider.py +47 -0
- ads/feature_store/execution_strategy/spark/__init__.py +0 -0
- ads/feature_store/execution_strategy/spark/spark_execution.py +618 -0
- ads/feature_store/feature.py +192 -0
- ads/feature_store/feature_group.py +1494 -0
- ads/feature_store/feature_group_expectation.py +346 -0
- ads/feature_store/feature_group_job.py +602 -0
- ads/feature_store/feature_lineage/__init__.py +0 -0
- ads/feature_store/feature_lineage/graphviz_service.py +180 -0
- ads/feature_store/feature_option_details.py +50 -0
- ads/feature_store/feature_statistics/__init__.py +0 -0
- ads/feature_store/feature_statistics/statistics_service.py +99 -0
- ads/feature_store/feature_store.py +699 -0
- ads/feature_store/feature_store_registrar.py +518 -0
- ads/feature_store/input_feature_detail.py +149 -0
- ads/feature_store/mixin/__init__.py +4 -0
- ads/feature_store/mixin/oci_feature_store.py +145 -0
- ads/feature_store/model_details.py +73 -0
- ads/feature_store/query/__init__.py +0 -0
- ads/feature_store/query/filter.py +266 -0
- ads/feature_store/query/generator/__init__.py +0 -0
- ads/feature_store/query/generator/query_generator.py +298 -0
- ads/feature_store/query/join.py +161 -0
- ads/feature_store/query/query.py +403 -0
- ads/feature_store/query/validator/__init__.py +0 -0
- ads/feature_store/query/validator/query_validator.py +57 -0
- ads/feature_store/response/__init__.py +0 -0
- ads/feature_store/response/response_builder.py +68 -0
- ads/feature_store/service/__init__.py +0 -0
- ads/feature_store/service/oci_dataset.py +139 -0
- ads/feature_store/service/oci_dataset_job.py +199 -0
- ads/feature_store/service/oci_entity.py +125 -0
- ads/feature_store/service/oci_feature_group.py +164 -0
- ads/feature_store/service/oci_feature_group_job.py +214 -0
- ads/feature_store/service/oci_feature_store.py +182 -0
- ads/feature_store/service/oci_lineage.py +87 -0
- ads/feature_store/service/oci_transformation.py +104 -0
- ads/feature_store/statistics/__init__.py +0 -0
- ads/feature_store/statistics/abs_feature_value.py +49 -0
- ads/feature_store/statistics/charts/__init__.py +0 -0
- ads/feature_store/statistics/charts/abstract_feature_plot.py +37 -0
- ads/feature_store/statistics/charts/box_plot.py +148 -0
- ads/feature_store/statistics/charts/frequency_distribution.py +65 -0
- ads/feature_store/statistics/charts/probability_distribution.py +68 -0
- ads/feature_store/statistics/charts/top_k_frequent_elements.py +98 -0
- ads/feature_store/statistics/feature_stat.py +126 -0
- ads/feature_store/statistics/generic_feature_value.py +33 -0
- ads/feature_store/statistics/statistics.py +41 -0
- ads/feature_store/statistics_config.py +101 -0
- ads/feature_store/templates/feature_store_template.yaml +45 -0
- ads/feature_store/transformation.py +499 -0
- ads/feature_store/validation_output.py +57 -0
- ads/hpo/__init__.py +9 -0
- ads/hpo/_imports.py +91 -0
- ads/hpo/ads_search_space.py +439 -0
- ads/hpo/distributions.py +325 -0
- ads/hpo/objective.py +280 -0
- ads/hpo/search_cv.py +1657 -0
- ads/hpo/stopping_criterion.py +75 -0
- ads/hpo/tuner_artifact.py +413 -0
- ads/hpo/utils.py +91 -0
- ads/hpo/validation.py +140 -0
- ads/hpo/visualization/__init__.py +5 -0
- ads/hpo/visualization/_contour.py +23 -0
- ads/hpo/visualization/_edf.py +20 -0
- ads/hpo/visualization/_intermediate_values.py +21 -0
- ads/hpo/visualization/_optimization_history.py +25 -0
- ads/hpo/visualization/_parallel_coordinate.py +169 -0
- ads/hpo/visualization/_param_importances.py +26 -0
- ads/jobs/__init__.py +53 -0
- ads/jobs/ads_job.py +663 -0
- ads/jobs/builders/__init__.py +5 -0
- ads/jobs/builders/base.py +156 -0
- ads/jobs/builders/infrastructure/__init__.py +6 -0
- ads/jobs/builders/infrastructure/base.py +165 -0
- ads/jobs/builders/infrastructure/dataflow.py +1252 -0
- ads/jobs/builders/infrastructure/dsc_job.py +1894 -0
- ads/jobs/builders/infrastructure/dsc_job_runtime.py +1233 -0
- ads/jobs/builders/infrastructure/utils.py +65 -0
- ads/jobs/builders/runtimes/__init__.py +5 -0
- ads/jobs/builders/runtimes/artifact.py +338 -0
- ads/jobs/builders/runtimes/base.py +325 -0
- ads/jobs/builders/runtimes/container_runtime.py +242 -0
- ads/jobs/builders/runtimes/python_runtime.py +1016 -0
- ads/jobs/builders/runtimes/pytorch_runtime.py +204 -0
- ads/jobs/cli.py +104 -0
- ads/jobs/env_var_parser.py +131 -0
- ads/jobs/extension.py +160 -0
- ads/jobs/schema/__init__.py +5 -0
- ads/jobs/schema/infrastructure_schema.json +116 -0
- ads/jobs/schema/job_schema.json +42 -0
- ads/jobs/schema/runtime_schema.json +183 -0
- ads/jobs/schema/validator.py +141 -0
- ads/jobs/serializer.py +296 -0
- ads/jobs/templates/__init__.py +5 -0
- ads/jobs/templates/container.py +6 -0
- ads/jobs/templates/driver_notebook.py +177 -0
- ads/jobs/templates/driver_oci.py +500 -0
- ads/jobs/templates/driver_python.py +48 -0
- ads/jobs/templates/driver_pytorch.py +852 -0
- ads/jobs/templates/driver_utils.py +615 -0
- ads/jobs/templates/hostname_from_env.c +55 -0
- ads/jobs/templates/oci_metrics.py +181 -0
- ads/jobs/utils.py +104 -0
- ads/llm/__init__.py +28 -0
- ads/llm/autogen/__init__.py +2 -0
- ads/llm/autogen/constants.py +15 -0
- ads/llm/autogen/reports/__init__.py +2 -0
- ads/llm/autogen/reports/base.py +67 -0
- ads/llm/autogen/reports/data.py +103 -0
- ads/llm/autogen/reports/session.py +526 -0
- ads/llm/autogen/reports/templates/chat_box.html +13 -0
- ads/llm/autogen/reports/templates/chat_box_lt.html +5 -0
- ads/llm/autogen/reports/templates/chat_box_rt.html +6 -0
- ads/llm/autogen/reports/utils.py +56 -0
- ads/llm/autogen/v02/__init__.py +4 -0
- ads/llm/autogen/v02/client.py +295 -0
- ads/llm/autogen/v02/log_handlers/__init__.py +2 -0
- ads/llm/autogen/v02/log_handlers/oci_file_handler.py +83 -0
- ads/llm/autogen/v02/loggers/__init__.py +6 -0
- ads/llm/autogen/v02/loggers/metric_logger.py +320 -0
- ads/llm/autogen/v02/loggers/session_logger.py +580 -0
- ads/llm/autogen/v02/loggers/utils.py +86 -0
- ads/llm/autogen/v02/runtime_logging.py +163 -0
- ads/llm/chain.py +268 -0
- ads/llm/chat_template.py +31 -0
- ads/llm/deploy.py +63 -0
- ads/llm/guardrails/__init__.py +5 -0
- ads/llm/guardrails/base.py +442 -0
- ads/llm/guardrails/huggingface.py +44 -0
- ads/llm/langchain/__init__.py +5 -0
- ads/llm/langchain/plugins/__init__.py +5 -0
- ads/llm/langchain/plugins/chat_models/__init__.py +5 -0
- ads/llm/langchain/plugins/chat_models/oci_data_science.py +1027 -0
- ads/llm/langchain/plugins/embeddings/__init__.py +4 -0
- ads/llm/langchain/plugins/embeddings/oci_data_science_model_deployment_endpoint.py +184 -0
- ads/llm/langchain/plugins/llms/__init__.py +5 -0
- ads/llm/langchain/plugins/llms/oci_data_science_model_deployment_endpoint.py +979 -0
- ads/llm/requirements.txt +3 -0
- ads/llm/serialize.py +219 -0
- ads/llm/serializers/__init__.py +0 -0
- ads/llm/serializers/retrieval_qa.py +153 -0
- ads/llm/serializers/runnable_parallel.py +27 -0
- ads/llm/templates/score_chain.jinja2 +155 -0
- ads/llm/templates/tool_chat_template_hermes.jinja +130 -0
- ads/llm/templates/tool_chat_template_mistral_parallel.jinja +94 -0
- ads/model/__init__.py +52 -0
- ads/model/artifact.py +573 -0
- ads/model/artifact_downloader.py +254 -0
- ads/model/artifact_uploader.py +267 -0
- ads/model/base_properties.py +238 -0
- ads/model/common/.model-ignore +66 -0
- ads/model/common/__init__.py +5 -0
- ads/model/common/utils.py +142 -0
- ads/model/datascience_model.py +2635 -0
- ads/model/deployment/__init__.py +20 -0
- ads/model/deployment/common/__init__.py +5 -0
- ads/model/deployment/common/utils.py +308 -0
- ads/model/deployment/model_deployer.py +466 -0
- ads/model/deployment/model_deployment.py +1846 -0
- ads/model/deployment/model_deployment_infrastructure.py +671 -0
- ads/model/deployment/model_deployment_properties.py +493 -0
- ads/model/deployment/model_deployment_runtime.py +838 -0
- ads/model/extractor/__init__.py +5 -0
- ads/model/extractor/automl_extractor.py +74 -0
- ads/model/extractor/embedding_onnx_extractor.py +80 -0
- ads/model/extractor/huggingface_extractor.py +88 -0
- ads/model/extractor/keras_extractor.py +84 -0
- ads/model/extractor/lightgbm_extractor.py +93 -0
- ads/model/extractor/model_info_extractor.py +114 -0
- ads/model/extractor/model_info_extractor_factory.py +105 -0
- ads/model/extractor/pytorch_extractor.py +87 -0
- ads/model/extractor/sklearn_extractor.py +112 -0
- ads/model/extractor/spark_extractor.py +89 -0
- ads/model/extractor/tensorflow_extractor.py +85 -0
- ads/model/extractor/xgboost_extractor.py +94 -0
- ads/model/framework/__init__.py +5 -0
- ads/model/framework/automl_model.py +178 -0
- ads/model/framework/embedding_onnx_model.py +438 -0
- ads/model/framework/huggingface_model.py +399 -0
- ads/model/framework/lightgbm_model.py +266 -0
- ads/model/framework/pytorch_model.py +266 -0
- ads/model/framework/sklearn_model.py +250 -0
- ads/model/framework/spark_model.py +326 -0
- ads/model/framework/tensorflow_model.py +254 -0
- ads/model/framework/xgboost_model.py +258 -0
- ads/model/generic_model.py +3518 -0
- ads/model/model_artifact_boilerplate/README.md +381 -0
- ads/model/model_artifact_boilerplate/__init__.py +5 -0
- ads/model/model_artifact_boilerplate/artifact_introspection_test/__init__.py +5 -0
- ads/model/model_artifact_boilerplate/artifact_introspection_test/model_artifact_validate.py +427 -0
- ads/model/model_artifact_boilerplate/artifact_introspection_test/requirements.txt +2 -0
- ads/model/model_artifact_boilerplate/runtime.yaml +7 -0
- ads/model/model_artifact_boilerplate/score.py +61 -0
- ads/model/model_file_description_schema.json +68 -0
- ads/model/model_introspect.py +331 -0
- ads/model/model_metadata.py +1810 -0
- ads/model/model_metadata_mixin.py +460 -0
- ads/model/model_properties.py +63 -0
- ads/model/model_version_set.py +739 -0
- ads/model/runtime/__init__.py +5 -0
- ads/model/runtime/env_info.py +306 -0
- ads/model/runtime/model_deployment_details.py +37 -0
- ads/model/runtime/model_provenance_details.py +58 -0
- ads/model/runtime/runtime_info.py +81 -0
- ads/model/runtime/schemas/inference_env_info_schema.yaml +16 -0
- ads/model/runtime/schemas/model_provenance_schema.yaml +36 -0
- ads/model/runtime/schemas/training_env_info_schema.yaml +16 -0
- ads/model/runtime/utils.py +201 -0
- ads/model/serde/__init__.py +5 -0
- ads/model/serde/common.py +40 -0
- ads/model/serde/model_input.py +547 -0
- ads/model/serde/model_serializer.py +1184 -0
- ads/model/service/__init__.py +5 -0
- ads/model/service/oci_datascience_model.py +1076 -0
- ads/model/service/oci_datascience_model_deployment.py +500 -0
- ads/model/service/oci_datascience_model_version_set.py +176 -0
- ads/model/transformer/__init__.py +5 -0
- ads/model/transformer/onnx_transformer.py +324 -0
- ads/mysqldb/__init__.py +5 -0
- ads/mysqldb/mysql_db.py +227 -0
- ads/opctl/__init__.py +18 -0
- ads/opctl/anomaly_detection.py +11 -0
- ads/opctl/backend/__init__.py +5 -0
- ads/opctl/backend/ads_dataflow.py +353 -0
- ads/opctl/backend/ads_ml_job.py +710 -0
- ads/opctl/backend/ads_ml_pipeline.py +164 -0
- ads/opctl/backend/ads_model_deployment.py +209 -0
- ads/opctl/backend/base.py +146 -0
- ads/opctl/backend/local.py +1053 -0
- ads/opctl/backend/marketplace/__init__.py +9 -0
- ads/opctl/backend/marketplace/helm_helper.py +173 -0
- ads/opctl/backend/marketplace/local_marketplace.py +271 -0
- ads/opctl/backend/marketplace/marketplace_backend_runner.py +71 -0
- ads/opctl/backend/marketplace/marketplace_operator_interface.py +44 -0
- ads/opctl/backend/marketplace/marketplace_operator_runner.py +24 -0
- ads/opctl/backend/marketplace/marketplace_utils.py +212 -0
- ads/opctl/backend/marketplace/models/__init__.py +5 -0
- ads/opctl/backend/marketplace/models/bearer_token.py +94 -0
- ads/opctl/backend/marketplace/models/marketplace_type.py +70 -0
- ads/opctl/backend/marketplace/models/ocir_details.py +56 -0
- ads/opctl/backend/marketplace/prerequisite_checker.py +238 -0
- ads/opctl/cli.py +707 -0
- ads/opctl/cmds.py +869 -0
- ads/opctl/conda/__init__.py +5 -0
- ads/opctl/conda/cli.py +193 -0
- ads/opctl/conda/cmds.py +749 -0
- ads/opctl/conda/config.yaml +34 -0
- ads/opctl/conda/manifest_template.yaml +13 -0
- ads/opctl/conda/multipart_uploader.py +188 -0
- ads/opctl/conda/pack.py +89 -0
- ads/opctl/config/__init__.py +5 -0
- ads/opctl/config/base.py +57 -0
- ads/opctl/config/diagnostics/__init__.py +5 -0
- ads/opctl/config/diagnostics/distributed/default_requirements_config.yaml +62 -0
- ads/opctl/config/merger.py +255 -0
- ads/opctl/config/resolver.py +297 -0
- ads/opctl/config/utils.py +79 -0
- ads/opctl/config/validator.py +17 -0
- ads/opctl/config/versioner.py +68 -0
- ads/opctl/config/yaml_parsers/__init__.py +7 -0
- ads/opctl/config/yaml_parsers/base.py +58 -0
- ads/opctl/config/yaml_parsers/distributed/__init__.py +7 -0
- ads/opctl/config/yaml_parsers/distributed/yaml_parser.py +201 -0
- ads/opctl/constants.py +66 -0
- ads/opctl/decorator/__init__.py +5 -0
- ads/opctl/decorator/common.py +129 -0
- ads/opctl/diagnostics/__init__.py +5 -0
- ads/opctl/diagnostics/__main__.py +25 -0
- ads/opctl/diagnostics/check_distributed_job_requirements.py +212 -0
- ads/opctl/diagnostics/check_requirements.py +144 -0
- ads/opctl/diagnostics/requirement_exception.py +9 -0
- ads/opctl/distributed/README.md +109 -0
- ads/opctl/distributed/__init__.py +5 -0
- ads/opctl/distributed/certificates.py +32 -0
- ads/opctl/distributed/cli.py +207 -0
- ads/opctl/distributed/cmds.py +731 -0
- ads/opctl/distributed/common/__init__.py +5 -0
- ads/opctl/distributed/common/abstract_cluster_provider.py +449 -0
- ads/opctl/distributed/common/abstract_framework_spec_builder.py +88 -0
- ads/opctl/distributed/common/cluster_config_helper.py +103 -0
- ads/opctl/distributed/common/cluster_provider_factory.py +21 -0
- ads/opctl/distributed/common/cluster_runner.py +54 -0
- ads/opctl/distributed/common/framework_factory.py +29 -0
- ads/opctl/docker/Dockerfile.job +103 -0
- ads/opctl/docker/Dockerfile.job.arm +107 -0
- ads/opctl/docker/Dockerfile.job.gpu +175 -0
- ads/opctl/docker/base-env.yaml +13 -0
- ads/opctl/docker/cuda.repo +6 -0
- ads/opctl/docker/operator/.dockerignore +0 -0
- ads/opctl/docker/operator/Dockerfile +41 -0
- ads/opctl/docker/operator/Dockerfile.gpu +85 -0
- ads/opctl/docker/operator/cuda.repo +6 -0
- ads/opctl/docker/operator/environment.yaml +8 -0
- ads/opctl/forecast.py +11 -0
- ads/opctl/index.yaml +3 -0
- ads/opctl/model/__init__.py +5 -0
- ads/opctl/model/cli.py +65 -0
- ads/opctl/model/cmds.py +73 -0
- ads/opctl/operator/README.md +4 -0
- ads/opctl/operator/__init__.py +31 -0
- ads/opctl/operator/cli.py +344 -0
- ads/opctl/operator/cmd.py +596 -0
- ads/opctl/operator/common/__init__.py +5 -0
- ads/opctl/operator/common/backend_factory.py +460 -0
- ads/opctl/operator/common/const.py +27 -0
- ads/opctl/operator/common/data/synthetic.csv +16001 -0
- ads/opctl/operator/common/dictionary_merger.py +148 -0
- ads/opctl/operator/common/errors.py +42 -0
- ads/opctl/operator/common/operator_config.py +99 -0
- ads/opctl/operator/common/operator_loader.py +811 -0
- ads/opctl/operator/common/operator_schema.yaml +130 -0
- ads/opctl/operator/common/operator_yaml_generator.py +152 -0
- ads/opctl/operator/common/utils.py +208 -0
- ads/opctl/operator/lowcode/__init__.py +5 -0
- ads/opctl/operator/lowcode/anomaly/MLoperator +16 -0
- ads/opctl/operator/lowcode/anomaly/README.md +207 -0
- ads/opctl/operator/lowcode/anomaly/__init__.py +5 -0
- ads/opctl/operator/lowcode/anomaly/__main__.py +103 -0
- ads/opctl/operator/lowcode/anomaly/cmd.py +35 -0
- ads/opctl/operator/lowcode/anomaly/const.py +167 -0
- ads/opctl/operator/lowcode/anomaly/environment.yaml +10 -0
- ads/opctl/operator/lowcode/anomaly/model/__init__.py +5 -0
- ads/opctl/operator/lowcode/anomaly/model/anomaly_dataset.py +146 -0
- ads/opctl/operator/lowcode/anomaly/model/anomaly_merlion.py +162 -0
- ads/opctl/operator/lowcode/anomaly/model/automlx.py +99 -0
- ads/opctl/operator/lowcode/anomaly/model/autots.py +115 -0
- ads/opctl/operator/lowcode/anomaly/model/base_model.py +404 -0
- ads/opctl/operator/lowcode/anomaly/model/factory.py +110 -0
- ads/opctl/operator/lowcode/anomaly/model/isolationforest.py +78 -0
- ads/opctl/operator/lowcode/anomaly/model/oneclasssvm.py +78 -0
- ads/opctl/operator/lowcode/anomaly/model/randomcutforest.py +120 -0
- ads/opctl/operator/lowcode/anomaly/model/tods.py +119 -0
- ads/opctl/operator/lowcode/anomaly/operator_config.py +127 -0
- ads/opctl/operator/lowcode/anomaly/schema.yaml +401 -0
- ads/opctl/operator/lowcode/anomaly/utils.py +88 -0
- ads/opctl/operator/lowcode/common/__init__.py +5 -0
- ads/opctl/operator/lowcode/common/const.py +10 -0
- ads/opctl/operator/lowcode/common/data.py +116 -0
- ads/opctl/operator/lowcode/common/errors.py +47 -0
- ads/opctl/operator/lowcode/common/transformations.py +296 -0
- ads/opctl/operator/lowcode/common/utils.py +384 -0
- ads/opctl/operator/lowcode/feature_store_marketplace/MLoperator +13 -0
- ads/opctl/operator/lowcode/feature_store_marketplace/README.md +30 -0
- ads/opctl/operator/lowcode/feature_store_marketplace/__init__.py +5 -0
- ads/opctl/operator/lowcode/feature_store_marketplace/__main__.py +116 -0
- ads/opctl/operator/lowcode/feature_store_marketplace/cmd.py +85 -0
- ads/opctl/operator/lowcode/feature_store_marketplace/const.py +15 -0
- ads/opctl/operator/lowcode/feature_store_marketplace/environment.yaml +0 -0
- ads/opctl/operator/lowcode/feature_store_marketplace/models/__init__.py +4 -0
- ads/opctl/operator/lowcode/feature_store_marketplace/models/apigw_config.py +32 -0
- ads/opctl/operator/lowcode/feature_store_marketplace/models/db_config.py +43 -0
- ads/opctl/operator/lowcode/feature_store_marketplace/models/mysql_config.py +120 -0
- ads/opctl/operator/lowcode/feature_store_marketplace/models/serializable_yaml_model.py +34 -0
- ads/opctl/operator/lowcode/feature_store_marketplace/operator_utils.py +386 -0
- ads/opctl/operator/lowcode/feature_store_marketplace/schema.yaml +160 -0
- ads/opctl/operator/lowcode/forecast/MLoperator +25 -0
- ads/opctl/operator/lowcode/forecast/README.md +209 -0
- ads/opctl/operator/lowcode/forecast/__init__.py +5 -0
- ads/opctl/operator/lowcode/forecast/__main__.py +89 -0
- ads/opctl/operator/lowcode/forecast/cmd.py +40 -0
- ads/opctl/operator/lowcode/forecast/const.py +92 -0
- ads/opctl/operator/lowcode/forecast/environment.yaml +20 -0
- ads/opctl/operator/lowcode/forecast/errors.py +26 -0
- ads/opctl/operator/lowcode/forecast/model/__init__.py +5 -0
- ads/opctl/operator/lowcode/forecast/model/arima.py +279 -0
- ads/opctl/operator/lowcode/forecast/model/automlx.py +553 -0
- ads/opctl/operator/lowcode/forecast/model/autots.py +312 -0
- ads/opctl/operator/lowcode/forecast/model/base_model.py +875 -0
- ads/opctl/operator/lowcode/forecast/model/factory.py +106 -0
- ads/opctl/operator/lowcode/forecast/model/forecast_datasets.py +492 -0
- ads/opctl/operator/lowcode/forecast/model/ml_forecast.py +243 -0
- ads/opctl/operator/lowcode/forecast/model/neuralprophet.py +482 -0
- ads/opctl/operator/lowcode/forecast/model/prophet.py +450 -0
- ads/opctl/operator/lowcode/forecast/model_evaluator.py +244 -0
- ads/opctl/operator/lowcode/forecast/operator_config.py +234 -0
- ads/opctl/operator/lowcode/forecast/schema.yaml +506 -0
- ads/opctl/operator/lowcode/forecast/utils.py +397 -0
- ads/opctl/operator/lowcode/forecast/whatifserve/__init__.py +7 -0
- ads/opctl/operator/lowcode/forecast/whatifserve/deployment_manager.py +285 -0
- ads/opctl/operator/lowcode/forecast/whatifserve/score.py +246 -0
- ads/opctl/operator/lowcode/pii/MLoperator +17 -0
- ads/opctl/operator/lowcode/pii/README.md +208 -0
- ads/opctl/operator/lowcode/pii/__init__.py +5 -0
- ads/opctl/operator/lowcode/pii/__main__.py +78 -0
- ads/opctl/operator/lowcode/pii/cmd.py +39 -0
- ads/opctl/operator/lowcode/pii/constant.py +84 -0
- ads/opctl/operator/lowcode/pii/environment.yaml +17 -0
- ads/opctl/operator/lowcode/pii/errors.py +27 -0
- ads/opctl/operator/lowcode/pii/model/__init__.py +5 -0
- ads/opctl/operator/lowcode/pii/model/factory.py +82 -0
- ads/opctl/operator/lowcode/pii/model/guardrails.py +167 -0
- ads/opctl/operator/lowcode/pii/model/pii.py +145 -0
- ads/opctl/operator/lowcode/pii/model/processor/__init__.py +34 -0
- ads/opctl/operator/lowcode/pii/model/processor/email_replacer.py +34 -0
- ads/opctl/operator/lowcode/pii/model/processor/mbi_replacer.py +35 -0
- ads/opctl/operator/lowcode/pii/model/processor/name_replacer.py +225 -0
- ads/opctl/operator/lowcode/pii/model/processor/number_replacer.py +73 -0
- ads/opctl/operator/lowcode/pii/model/processor/remover.py +26 -0
- ads/opctl/operator/lowcode/pii/model/report.py +487 -0
- ads/opctl/operator/lowcode/pii/operator_config.py +95 -0
- ads/opctl/operator/lowcode/pii/schema.yaml +108 -0
- ads/opctl/operator/lowcode/pii/utils.py +43 -0
- ads/opctl/operator/lowcode/recommender/MLoperator +16 -0
- ads/opctl/operator/lowcode/recommender/README.md +206 -0
- ads/opctl/operator/lowcode/recommender/__init__.py +5 -0
- ads/opctl/operator/lowcode/recommender/__main__.py +82 -0
- ads/opctl/operator/lowcode/recommender/cmd.py +33 -0
- ads/opctl/operator/lowcode/recommender/constant.py +30 -0
- ads/opctl/operator/lowcode/recommender/environment.yaml +11 -0
- ads/opctl/operator/lowcode/recommender/model/base_model.py +212 -0
- ads/opctl/operator/lowcode/recommender/model/factory.py +56 -0
- ads/opctl/operator/lowcode/recommender/model/recommender_dataset.py +25 -0
- ads/opctl/operator/lowcode/recommender/model/svd.py +106 -0
- ads/opctl/operator/lowcode/recommender/operator_config.py +81 -0
- ads/opctl/operator/lowcode/recommender/schema.yaml +265 -0
- ads/opctl/operator/lowcode/recommender/utils.py +13 -0
- ads/opctl/operator/runtime/__init__.py +5 -0
- ads/opctl/operator/runtime/const.py +17 -0
- ads/opctl/operator/runtime/container_runtime_schema.yaml +50 -0
- ads/opctl/operator/runtime/marketplace_runtime.py +50 -0
- ads/opctl/operator/runtime/python_marketplace_runtime_schema.yaml +21 -0
- ads/opctl/operator/runtime/python_runtime_schema.yaml +21 -0
- ads/opctl/operator/runtime/runtime.py +115 -0
- ads/opctl/schema.yaml.yml +36 -0
- ads/opctl/script.py +40 -0
- ads/opctl/spark/__init__.py +5 -0
- ads/opctl/spark/cli.py +43 -0
- ads/opctl/spark/cmds.py +147 -0
- ads/opctl/templates/diagnostic_report_template.jinja2 +102 -0
- ads/opctl/utils.py +344 -0
- ads/oracledb/__init__.py +5 -0
- ads/oracledb/oracle_db.py +346 -0
- ads/pipeline/__init__.py +39 -0
- ads/pipeline/ads_pipeline.py +2279 -0
- ads/pipeline/ads_pipeline_run.py +772 -0
- ads/pipeline/ads_pipeline_step.py +605 -0
- ads/pipeline/builders/__init__.py +5 -0
- ads/pipeline/builders/infrastructure/__init__.py +5 -0
- ads/pipeline/builders/infrastructure/custom_script.py +32 -0
- ads/pipeline/cli.py +119 -0
- ads/pipeline/extension.py +291 -0
- ads/pipeline/schema/__init__.py +5 -0
- ads/pipeline/schema/cs_step_schema.json +35 -0
- ads/pipeline/schema/ml_step_schema.json +31 -0
- ads/pipeline/schema/pipeline_schema.json +71 -0
- ads/pipeline/visualizer/__init__.py +5 -0
- ads/pipeline/visualizer/base.py +570 -0
- ads/pipeline/visualizer/graph_renderer.py +272 -0
- ads/pipeline/visualizer/text_renderer.py +84 -0
- ads/secrets/__init__.py +11 -0
- ads/secrets/adb.py +386 -0
- ads/secrets/auth_token.py +86 -0
- ads/secrets/big_data_service.py +365 -0
- ads/secrets/mysqldb.py +149 -0
- ads/secrets/oracledb.py +160 -0
- ads/secrets/secrets.py +407 -0
- ads/telemetry/__init__.py +7 -0
- ads/telemetry/base.py +69 -0
- ads/telemetry/client.py +122 -0
- ads/telemetry/telemetry.py +257 -0
- ads/templates/dataflow_pyspark.jinja2 +13 -0
- ads/templates/dataflow_sparksql.jinja2 +22 -0
- ads/templates/func.jinja2 +20 -0
- ads/templates/schemas/openapi.json +1740 -0
- ads/templates/score-pkl.jinja2 +173 -0
- ads/templates/score.jinja2 +322 -0
- ads/templates/score_embedding_onnx.jinja2 +202 -0
- ads/templates/score_generic.jinja2 +165 -0
- ads/templates/score_huggingface_pipeline.jinja2 +217 -0
- ads/templates/score_lightgbm.jinja2 +185 -0
- ads/templates/score_onnx.jinja2 +407 -0
- ads/templates/score_onnx_new.jinja2 +473 -0
- ads/templates/score_oracle_automl.jinja2 +185 -0
- ads/templates/score_pyspark.jinja2 +154 -0
- ads/templates/score_pytorch.jinja2 +219 -0
- ads/templates/score_scikit-learn.jinja2 +184 -0
- ads/templates/score_tensorflow.jinja2 +184 -0
- ads/templates/score_xgboost.jinja2 +178 -0
- ads/text_dataset/__init__.py +5 -0
- ads/text_dataset/backends.py +211 -0
- ads/text_dataset/dataset.py +445 -0
- ads/text_dataset/extractor.py +207 -0
- ads/text_dataset/options.py +53 -0
- ads/text_dataset/udfs.py +22 -0
- ads/text_dataset/utils.py +49 -0
- ads/type_discovery/__init__.py +9 -0
- ads/type_discovery/abstract_detector.py +21 -0
- ads/type_discovery/constant_detector.py +41 -0
- ads/type_discovery/continuous_detector.py +54 -0
- ads/type_discovery/credit_card_detector.py +99 -0
- ads/type_discovery/datetime_detector.py +92 -0
- ads/type_discovery/discrete_detector.py +118 -0
- ads/type_discovery/document_detector.py +146 -0
- ads/type_discovery/ip_detector.py +68 -0
- ads/type_discovery/latlon_detector.py +90 -0
- ads/type_discovery/phone_number_detector.py +63 -0
- ads/type_discovery/type_discovery_driver.py +87 -0
- ads/type_discovery/typed_feature.py +594 -0
- ads/type_discovery/unknown_detector.py +41 -0
- ads/type_discovery/zipcode_detector.py +48 -0
- ads/vault/__init__.py +7 -0
- ads/vault/vault.py +237 -0
- {oracle_ads-2.13.9rc0.dist-info → oracle_ads-2.13.10.dist-info}/METADATA +150 -149
- oracle_ads-2.13.10.dist-info/RECORD +858 -0
- {oracle_ads-2.13.9rc0.dist-info → oracle_ads-2.13.10.dist-info}/WHEEL +1 -2
- {oracle_ads-2.13.9rc0.dist-info → oracle_ads-2.13.10.dist-info}/entry_points.txt +2 -1
- oracle_ads-2.13.9rc0.dist-info/RECORD +0 -9
- oracle_ads-2.13.9rc0.dist-info/top_level.txt +0 -1
- {oracle_ads-2.13.9rc0.dist-info → oracle_ads-2.13.10.dist-info}/licenses/LICENSE.txt +0 -0
@@ -0,0 +1,51 @@
|
|
1
|
+
### Dataset Capabilities
|
2
|
+
|
3
|
+
The Dataset is an abstraction atop a variety of sources
|
4
|
+
- local file system via Pandas dataframe
|
5
|
+
- remote file systems like s3, Oracle Storage
|
6
|
+
- advanced possibilities include using pyarrow for Apache Arrow datasets
|
7
|
+
- interoperate with Dask.dataframes
|
8
|
+
|
9
|
+
The sdk's dataset provides the following capabilities:
|
10
|
+
- Support loading dataset from pandas, remote sources, clipboard etc
|
11
|
+
```
|
12
|
+
df = pd.read_csv(...)
|
13
|
+
ds = sdk.Dataset.fromPandas(df)
|
14
|
+
```
|
15
|
+
- Visualizing of dataset
|
16
|
+
- Show all features, their data types and counts, meta data for a dataset, for example if the dataset is time-series or not.
|
17
|
+
- Show (heatmap) of correlated features
|
18
|
+
```
|
19
|
+
show_in_notebook(ds)
|
20
|
+
```
|
21
|
+
- Show distribution of values for features, using a datatype-dependent
|
22
|
+
visualizer (text vs categorical vs numerical will all render differently)
|
23
|
+
```
|
24
|
+
show_in_notebook(ds, features=["col1", "col2", "col3"])
|
25
|
+
```
|
26
|
+
|
27
|
+
look at http://pandas.pydata.org/pandas-docs/stable/style.html for some pandas
|
28
|
+
examples, also seaborn https://stackoverflow.com/questions/39409866/correlation-heatmap
|
29
|
+
|
30
|
+
- Data cleaning
|
31
|
+
- Support redaction of features
|
32
|
+
- Handle missing values
|
33
|
+
- Identify outliers and inconsistencies
|
34
|
+
- Imputation
|
35
|
+
- Identify data type of features
|
36
|
+
- Define a data type hierarchy that includes special patterns like credit cards, zip codes, phone number, etc.
|
37
|
+
- Feature selection
|
38
|
+
- features used in a model is part of AutoML, however, for modeling that's not using AutoML the dataset
|
39
|
+
should have feature selection capabilities using a plugin (initial should be information theoretic) - see
|
40
|
+
http://scikit-learn.org/stable/modules/classes.html#module-sklearn.feature_selection for examples
|
41
|
+
- Sampling
|
42
|
+
- can be used to limit data used for modeling, can also be used to balance an unbalanced target variable.
|
43
|
+
- "auto mode" determines if up/down is required
|
44
|
+
- Features can hold feature encoding hints
|
45
|
+
- these hints are part of the dataset, not the underlying pandas dataframe
|
46
|
+
- Discretization
|
47
|
+
- of target variable for regression to classification problem
|
48
|
+
|
49
|
+
- Persistence & interchange
|
50
|
+
- support save as snapshot (possibly parquet format), a snapshot generates a shareable URI
|
51
|
+
- load from snapshot URI
|
@@ -0,0 +1,339 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
# -*- coding: utf-8; -*-
|
3
|
+
|
4
|
+
# Copyright (c) 2020, 2023 Oracle and/or its affiliates.
|
5
|
+
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
|
6
|
+
|
7
|
+
import pandas as pd
|
8
|
+
import warnings
|
9
|
+
|
10
|
+
from ads.common import utils, logger
|
11
|
+
from ads.dataset import helper
|
12
|
+
from ads.dataset.exception import ValidationError
|
13
|
+
from ads.dataset.dataset_with_target import ADSDatasetWithTarget
|
14
|
+
from sklearn.preprocessing import FunctionTransformer
|
15
|
+
from ads.dataset.helper import deprecate_variable, deprecate_default_value
|
16
|
+
|
17
|
+
|
18
|
+
class ClassificationDataset(ADSDatasetWithTarget):
    """
    Dataset for classification task
    """

    def __init__(self, df, sampled_df, target, target_type, shape, **kwargs):
        ADSDatasetWithTarget.__init__(
            self,
            df=df,
            sampled_df=sampled_df,
            target=target,
            target_type=target_type,
            shape=shape,
            **kwargs,
        )

    def auto_transform(
        self,
        fix_imbalance: bool = True,
        correlation_threshold: float = 0.7,
        frac: float = 1.0,
        correlation_methods: str = "pearson",
    ):
        """
        Return transformed dataset with several optimizations applied automatically.
        The optimizations include:

        - Dropping constant and primary key columns, which have no predictive quality,
        - Imputation, to fill in missing values in noisy data:

            - For continuous variables, fill with mean if less than 40% is missing, else drop,
            - For categorical variables, fill with most frequent if less than 40% is missing, else drop,

        - Dropping strongly co-correlated columns that tend to produce less generalizable models,
        - Balancing dataset using up or down sampling.

        Parameters
        ----------
        fix_imbalance : bool, defaults to True.
            Fix imbalance between classes in dataset. Used only for classification datasets.
        correlation_threshold: float, defaults to 0.7. It must be between 0 and 1, inclusive.
            The correlation threshold where columns with correlation higher than the threshold will
            be considered as strongly co-correlated and recommended to be taken care of.
        frac: float, defaults to 1.0. Range -> (0, 1].
            What fraction of the data should be used in the calculation?
            Passing ``None`` is deprecated and is replaced by 1 with a ``FutureWarning``.
        correlation_methods: Union[list, str], defaults to 'pearson'.

            - 'pearson': Use Pearson's Correlation between continuous features,
            - 'cramers v': Use Cramer's V correlations between categorical features,
            - 'correlation ratio': Use Correlation Ratio Correlation between categorical and continuous features,
            - 'all': Is equivalent to ['pearson', 'cramers v', 'correlation ratio'].

            Or a list containing any combination of these methods, for example, ['pearson', 'cramers v'].

        Returns
        -------
        transformed_dataset : ADSDatasetWithTarget
            The dataset after transformation

        Examples
        --------
        >>> ds_clean = ds.auto_transform(correlation_threshold=0.6)
        """
        frac = deprecate_default_value(
            frac,
            None,
            1,
            f"<code>frac=None</code> is deprecated. Use <code>frac=1.0</code> instead.",
            FutureWarning,
        )
        # The same progress bar is shared by the transform and the rebuild step.
        with utils.get_progress_bar(7) as progress:
            df, sampled_df, transformer_pipeline = self._transform(
                progress=progress,
                fix_imbalance=fix_imbalance,
                correlation_threshold=correlation_threshold,
                frac=frac,
                correlation_methods=correlation_methods,
            )
            return self._build_new_dataset(
                df,
                sampled_df=sampled_df,
                transformers=transformer_pipeline.steps,
                progress=progress,
            )

    def convert_to_text_classification(self, text_column: str):
        """
        Builds a new dataset with the given text column as the only feature besides target.

        Parameters
        ----------
        text_column: str
            Feature name to use for text classification task

        Returns
        -------
        ds: TextClassificationDataset
            Dataset with one text feature and a classification target

        Examples
        --------
        >>> review_ds = DatasetFactory.open("review_data.csv")
        >>> ds_text_class = review_ds.convert_to_text_classification('reviews')
        """

        def _select_features(df, feature_names, target):
            # The target is appended only when the incoming frame carries it,
            # so the same transformer works on frames with and without the target.
            if target in df.columns:
                feature_names = feature_names + [target]
            return df[feature_names]

        target_name = self.target.name
        transformer = (
            f"convert_to_text_classification using feature {text_column}",
            FunctionTransformer(
                func=_select_features,
                validate=False,
                kw_args={
                    # Only the text feature goes here: _select_features adds the
                    # target itself. Listing the target as well selected it twice
                    # and raised KeyError on frames without the target column.
                    "feature_names": [text_column],
                    "target": target_name,
                },
            ).fit(self.sampled_df),
        )

        # Binary datasets map to the binary text dataset, everything else to multi-class.
        if utils.is_same_class(self, BinaryClassificationDataset):
            dataset_cls = BinaryTextClassificationDataset
        else:
            dataset_cls = MultiClassTextClassificationDataset

        selected_columns = [target_name, text_column]
        new_ds = dataset_cls(
            self.df[selected_columns],
            self.sampled_df[selected_columns],
            target_name,
            self.target.type,
            (len(self.df), 2),
            **self.init_kwargs,
        )
        new_ds.transformer_pipeline = self._update_transformer_pipeline(transformer)
        return new_ds

    def down_sample(self, sampler=None):
        """
        Fixes an imbalanced dataset by down-sampling.

        Parameters
        ----------
        sampler: An instance of SamplerMixin
            Should implement fit_resample(X,y) method. If None, does random down sampling.

        Returns
        -------
        down_sampled_ds: ClassificationDataset
            A down-sampled dataset.

        Examples
        --------
        >>> ds = DatasetFactory.open("some_data.csv")
        >>> ds_balanced_small = ds.down_sample()
        """
        target_name = self.target.name
        if sampler is None:
            resampled = helper.down_sample(self.df, target_name)
        else:
            # A custom sampler receives the features and the target separately.
            resampled = helper.sample(
                sampler,
                self.df.drop(target_name, axis=1),
                self.df[target_name],
            )
        return self._build_new_dataset(resampled)

    def up_sample(self, sampler="default"):
        """
        Fixes imbalanced dataset by up-sampling

        Parameters
        ----------
        sampler: An instance of SamplerMixin
            Should implement fit_resample(X,y) method.
            If 'default', either SMOTE or random sampler will be used

        Returns
        -------
        up_sampled_ds: ClassificationDataset
            an up-sampled dataset

        Examples
        --------
        >>> ds = DatasetFactory.open("some_data.csv")
        >>> ds_balanced_large = ds.up_sample()
        """
        return self._build_new_dataset(
            helper.up_sample(
                self.df,
                self.target.name,
                sampler=sampler,
                feature_types=self.feature_types,
            )
        )
|
214
|
+
|
215
|
+
|
216
|
+
class BinaryClassificationDataset(ClassificationDataset):
    """
    Dataset for binary classification
    """

    def __init__(
        self, df, sampled_df, target, target_type, shape, positive_class=None, **kwargs
    ):
        if positive_class is not None:

            def is_positive(value):
                # map positive_class to True, everything else to False
                return value == positive_class

            def mapper(frame, column_name, arg):
                # Rewrites the column in place and hands the same frame back.
                frame[column_name] = frame[column_name].map(arg)
                return frame

            # When both arguments are the very same object the in-place mapping
            # must run only once: a second pass would compare the already
            # boolean labels against positive_class and corrupt them.
            shares_frame = sampled_df is df
            df = mapper(df, target, is_positive)
            sampled_df = df if shares_frame else mapper(sampled_df, target, is_positive)
        ClassificationDataset.__init__(
            self, df, sampled_df, target, target_type, shape, **kwargs
        )

    def set_positive_class(self, positive_class, missing_value=False):
        """
        Return new dataset with values in target column mapped to True or False
        in accordance with the specified positive label.

        Parameters
        ----------
        positive_class : same dtype as target
            The target label which should be identified as positive outcome from model.
        missing_value : bool
            missing values will be converted to this

        Returns
        -------
        dataset: same type as the caller

        Raises
        ------
        ValidationError
            if the positive_class is not present in target

        Examples
        --------
        >>> ds = DatasetFactory.open("iris.csv")
        >>> ds_with_target = ds.set_target('class')
        >>> ds_with_pos_class = ds_with_target.set_positive_class('setosa')
        """
        if positive_class not in self.target.target_vals:
            raise ValidationError(
                "Positive label '%s' not in target values '%s'"
                % (positive_class, self.target.target_vals)
            )

        # A conditional expression is used instead of `cond and a or b`: with
        # missing_value=False the old form fell through to comparing the missing
        # value itself against positive_class rather than returning missing_value.
        return self.assign_column(
            self.target.name,
            lambda x: missing_value if pd.isnull(x) else x == positive_class,
        )
|
275
|
+
|
276
|
+
|
277
|
+
class MultiClassClassificationDataset(ClassificationDataset):
    """
    Classification dataset whose target has more than two classes.
    """

    def __init__(self, df, sampled_df, target, target_type, shape, **kwargs):
        # Nothing multi-class specific to set up; defer to the parent initializer.
        ClassificationDataset.__init__(
            self,
            df=df,
            sampled_df=sampled_df,
            target=target,
            target_type=target_type,
            shape=shape,
            **kwargs,
        )
|
286
|
+
|
287
|
+
|
288
|
+
class BinaryTextClassificationDataset(BinaryClassificationDataset):
    """
    Binary classification dataset built around a text feature.
    """

    def __init__(self, df, sampled_df, target, target_type, shape, **kwargs):
        BinaryClassificationDataset.__init__(
            self,
            df=df,
            sampled_df=sampled_df,
            target=target,
            target_type=target_type,
            shape=shape,
            **kwargs,
        )

    def auto_transform(self):
        """
        No-op override: logs that nothing is optimized and returns this dataset.
        """
        message = "No optimizations."
        logger.info(message)
        return self

    def select_best_features(self, score_func=None, k=12):
        """
        No-op override: logs that feature selection is skipped and returns this dataset.

        ``score_func`` and ``k`` are accepted but not used.
        """
        message = "There are an insufficient number of features to do feature selection."
        logger.info(message)
        return self
|
313
|
+
|
314
|
+
|
315
|
+
class MultiClassTextClassificationDataset(MultiClassClassificationDataset):
    """
    Multi-class classification dataset built around a text feature.
    """

    def __init__(self, df, sampled_df, target, target_type, shape, **kwargs):
        MultiClassClassificationDataset.__init__(
            self,
            df=df,
            sampled_df=sampled_df,
            target=target,
            target_type=target_type,
            shape=shape,
            **kwargs,
        )

    def auto_transform(self):
        """
        No-op override: logs that nothing is optimized and returns this dataset.
        """
        message = "No optimizations."
        logger.info(message)
        return self

    def select_best_features(self, score_func=None, k=12):
        """
        No-op override: logs that feature selection is skipped and returns this dataset.

        ``score_func`` and ``k`` are accepted but not used.
        """
        message = "There are an insufficient number of features to do feature selection."
        logger.info(message)
        return self
|
@@ -0,0 +1,226 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
# -*- coding: utf-8; -*-
|
3
|
+
|
4
|
+
# Copyright (c) 2020, 2022 Oracle and/or its affiliates.
|
5
|
+
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
|
6
|
+
|
7
|
+
from __future__ import print_function, absolute_import
|
8
|
+
|
9
|
+
import numpy as np
|
10
|
+
import pandas as pd
|
11
|
+
|
12
|
+
from ads.common import logger
|
13
|
+
from ads.dataset.exception import ValidationError
|
14
|
+
from collections import defaultdict, Counter
|
15
|
+
from itertools import product, combinations
|
16
|
+
from typing import Tuple
|
17
|
+
|
18
|
+
|
19
|
+
def _cat_vs_cat(df: pd.core.frame.DataFrame, categorical_columns: list) -> pd.DataFrame:
|
20
|
+
"""
|
21
|
+
calc the correlation of all pairs of categorical features and categorical features
|
22
|
+
"""
|
23
|
+
if not categorical_columns:
|
24
|
+
return pd.DataFrame()
|
25
|
+
categorical_pairs = list(combinations(categorical_columns, 2))
|
26
|
+
corr_list = []
|
27
|
+
for col in categorical_pairs:
|
28
|
+
cat1_name = col[0]
|
29
|
+
cat2_name = col[1]
|
30
|
+
_check_if_same_type(df[col[0]], cat1_name)
|
31
|
+
_check_if_same_type(df[col[1]], cat2_name)
|
32
|
+
corr_list.append(
|
33
|
+
_cramers_v(np.array(df[col[0]].values), np.array(df[col[1]].values))
|
34
|
+
)
|
35
|
+
correlation_matrix = _list_to_dataframe(categorical_pairs, corr_list)
|
36
|
+
return correlation_matrix
|
37
|
+
|
38
|
+
|
39
|
+
def _cat_vs_cts(
    df: pd.core.frame.DataFrame, categorical_columns: list, continuous_columns: list
) -> pd.DataFrame:
    """
    Build the correlation-ratio matrix over every (categorical, continuous) column pair.
    """
    mixed_pairs = list(product(categorical_columns, continuous_columns))
    ratios = [
        _correlation_ratio(np.array(df[cat_name].values), np.array(df[cts_name].values))
        for cat_name, cts_name in mixed_pairs
    ]
    return _list_to_dataframe(mixed_pairs, ratios)
|
53
|
+
|
54
|
+
|
55
|
+
def _list_to_dataframe(name_list: list, corr_list: list) -> pd.DataFrame:
|
56
|
+
corr_dict = defaultdict(dict)
|
57
|
+
for idx, corr in zip(name_list, corr_list):
|
58
|
+
row_name = idx[0]
|
59
|
+
col_name = idx[1]
|
60
|
+
corr_dict[row_name][col_name] = corr_dict[col_name][row_name] = round(corr, 4)
|
61
|
+
corr_dict[row_name][row_name] = corr_dict[col_name][col_name] = 1.0000
|
62
|
+
correlation_matrix = pd.DataFrame.from_dict(corr_dict).sort_index()
|
63
|
+
correlation_matrix = correlation_matrix.loc[:, correlation_matrix.index]
|
64
|
+
return correlation_matrix
|
65
|
+
|
66
|
+
|
67
|
+
def _correlation_ratio(cat: np.ndarray, cts: np.ndarray):
|
68
|
+
"""
|
69
|
+
calc the correlation of a pair of a categorical feature and a continuous feature
|
70
|
+
using correlation ratio when input are two numpy arrays
|
71
|
+
"""
|
72
|
+
keep_cts = ~pd.isnull(cts)
|
73
|
+
cat_no_nan = cat[keep_cts]
|
74
|
+
cts_no_nan = cts[keep_cts]
|
75
|
+
|
76
|
+
keep_cat = ~pd.isnull(cat_no_nan)
|
77
|
+
cat_no_none = cat_no_nan[keep_cat]
|
78
|
+
cts_no_none = cts_no_nan[keep_cat]
|
79
|
+
|
80
|
+
unq_cat, tags, group_count = np.unique(
|
81
|
+
list(cat_no_none), return_inverse=1, return_counts=1
|
82
|
+
)
|
83
|
+
group_mean = np.bincount(tags, cts_no_none) / group_count
|
84
|
+
overall_mean = np.nanmean(cts_no_none)
|
85
|
+
n = len(cts_no_none)
|
86
|
+
|
87
|
+
dispersion_within = np.dot(group_count, np.square(group_mean - overall_mean))
|
88
|
+
dispersion_population = cts_no_none.var() * n
|
89
|
+
ratio = dispersion_within / dispersion_population
|
90
|
+
|
91
|
+
return np.sqrt(ratio)
|
92
|
+
|
93
|
+
|
94
|
+
def _count_occurrence(
|
95
|
+
cat1: np.ndarray, cat2: np.ndarray
|
96
|
+
) -> Tuple[np.ndarray, int, int]:
|
97
|
+
"""
|
98
|
+
calc the contingency table of two arrays
|
99
|
+
"""
|
100
|
+
occurance_cnt = Counter([(x, y) for x, y in zip(cat1, cat2)])
|
101
|
+
nunique_cat1 = np.unique(cat1[~pd.isnull(cat1)])
|
102
|
+
nunique_cat2 = np.unique(cat2[~pd.isnull(cat2)])
|
103
|
+
r = len(nunique_cat1)
|
104
|
+
k = len(nunique_cat2)
|
105
|
+
contigency_table = np.zeros((r, k))
|
106
|
+
for row, num1 in enumerate(nunique_cat1):
|
107
|
+
for col, num2 in enumerate(nunique_cat2):
|
108
|
+
contigency_table[row, col] = occurance_cnt[(num1, num2)]
|
109
|
+
|
110
|
+
return contigency_table, r, k
|
111
|
+
|
112
|
+
|
113
|
+
def _chi_squared(count_matrix: np.ndarray, n_obs: int) -> float:
|
114
|
+
"""
|
115
|
+
Compute Chi-squared when given a contingency table
|
116
|
+
"""
|
117
|
+
row_sums = np.tile(np.sum(count_matrix, axis=1), (count_matrix.shape[1], 1)).T
|
118
|
+
col_sums = np.tile(np.sum(count_matrix, axis=0), (count_matrix.shape[0], 1))
|
119
|
+
return np.sum(
|
120
|
+
np.square(count_matrix - row_sums * col_sums / n_obs)
|
121
|
+
/ (row_sums * col_sums / n_obs)
|
122
|
+
)
|
123
|
+
|
124
|
+
|
125
|
+
def _cramers_v(cat1: np.ndarray, cat2: np.ndarray) -> float:
|
126
|
+
"""
|
127
|
+
calc the cramers v of two numpy arrays
|
128
|
+
"""
|
129
|
+
n = len(cat1)
|
130
|
+
if n == 1:
|
131
|
+
return 0
|
132
|
+
contigency_table, r, k = _count_occurrence(cat1, cat2)
|
133
|
+
|
134
|
+
if r == 0:
|
135
|
+
return 0.0000
|
136
|
+
|
137
|
+
chi2 = _chi_squared(contigency_table, n)
|
138
|
+
phi2 = chi2 / n
|
139
|
+
|
140
|
+
phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
|
141
|
+
rcorr = r - (np.square(r - 1)) / (n - 1)
|
142
|
+
kcorr = k - (np.square(k - 1)) / (n - 1)
|
143
|
+
denominator = min((kcorr - 1), (rcorr - 1))
|
144
|
+
if denominator == 0:
|
145
|
+
return np.nan
|
146
|
+
return np.sqrt(phi2corr / denominator)
|
147
|
+
|
148
|
+
|
149
|
+
def _get_columns_by_type(
    feature_types_df: pd.DataFrame, threshold: float = 0.8
) -> Tuple[list, list, list]:
    """
    Return the categorical columns, continuous columns and columns of other types.

    Parameters
    ----------
    feature_types_df : pd.DataFrame
        Frame with the columns ``feature_name``, ``type`` and
        ``missing_percentage``. Its index values are compared against the
        ``feature_name`` values, so the index is presumably the feature
        names as well (verify against the caller).
    threshold : float
        Features whose ``missing_percentage`` is greater than this value are
        excluded from the categorical and continuous lists.

    Returns
    -------
    Tuple[list, list, list]
        ``(categorical_columns, continuous_columns, other_columns)``.
        ``categorical_columns`` holds types ``categorical``/``zipcode`` and
        ``continuous_columns`` holds ``continuous``/``ordinal``.
        ``other_columns`` holds every remaining index value that is neither
        missing nor constant, in index order.
    """
    missing_share = feature_types_df.loc[:, "missing_percentage"]
    feature_type = feature_types_df.loc[:, "type"]

    # Both comparisons are evaluated explicitly rather than negating one of
    # them, so a NaN missing share matches neither mask.
    missing = missing_share > threshold
    not_missing = missing_share <= threshold

    def _names(mask) -> list:
        # Feature names of the rows selected by the boolean mask.
        return list(feature_types_df.loc[mask, "feature_name"].values)

    missing_columns = _names(missing)
    constant_columns = _names(feature_type.isin(["constant"]))
    categorical_columns = _names(
        feature_type.isin(["categorical", "zipcode"]) & not_missing
    )
    continuous_columns = _names(
        feature_type.isin(["continuous", "ordinal"]) & not_missing
    )

    # Keep the index order (deduplicated) instead of relying on set
    # arithmetic, whose iteration order is arbitrary from run to run.
    classified = (
        set(categorical_columns)
        | set(continuous_columns)
        | set(missing_columns)
        | set(constant_columns)
    )
    other_columns = [
        name
        for name in dict.fromkeys(feature_types_df.index.values)
        if name not in classified
    ]

    # NOTE(review): the threshold is interpolated verbatim before the "%"
    # sign in the messages below; confirm the unit of `missing_percentage`.
    if missing_columns:
        # Logged once; the same text used to be emitted twice in a row.
        logger.info(
            f"The columns {missing_columns} are not included because more than {threshold}% of the values are missing. "
            f"Adjust this threshold using the `nan_threshold` parameter."
        )
    if constant_columns:
        logger.info(f" The constant columns {constant_columns} are not included.")
    if other_columns:
        logger.info(
            f" The columns {other_columns} are not included because more than {threshold}% of the values are missing, or "
            f"they are not one of the following types: "
            f"`categorical`, `zipcode`, `continuous`"
            f", or `ordinal`."
        )

    return categorical_columns, continuous_columns, other_columns
|
206
|
+
|
207
|
+
|
208
|
+
def _validate_correlation_methods(correlation_methods):
    """
    Validate the requested correlation methods and expand ``"all"``.

    A single string is treated as a one-element list. Every entry must be
    ``"all"``, ``"pearson"``, ``"cramers v"`` or ``"correlation ratio"``,
    otherwise a ``ValidationError`` is raised. When ``"all"`` is present the
    three concrete methods are returned; otherwise the given methods are
    returned as they are.
    """
    concrete_methods = ("pearson", "cramers v", "correlation ratio")
    accepted = ("all",) + concrete_methods

    methods = correlation_methods
    if isinstance(methods, str):
        methods = [methods]

    for name in methods:
        if name not in accepted:
            raise ValidationError(f"{name} is not supported.")

    if "all" in methods:
        return list(concrete_methods)
    return methods
|
217
|
+
|
218
|
+
|
219
|
+
def _check_if_same_type(series, col_name):
    """
    Raise ``TypeError`` when the non-null values of ``series`` mix data types.

    The type of the first non-null value is the reference; every other
    non-null value must be an instance of it. A series with no non-null
    value passes. ``col_name`` is only used in the error message.
    """
    values = series.dropna().values
    if len(values) > 0:
        reference_type = type(values[0])
        has_other_type = any(
            not isinstance(item, reference_type) for item in values
        )
        if has_other_type:
            raise TypeError(
                f"More than one data type in the column `{col_name}`. Keep all the values in that column the same type."
            )
|