oracle-ads 2.13.9rc0__py3-none-any.whl → 2.13.10rc0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ads/aqua/__init__.py +40 -0
- ads/aqua/app.py +507 -0
- ads/aqua/cli.py +96 -0
- ads/aqua/client/__init__.py +3 -0
- ads/aqua/client/client.py +836 -0
- ads/aqua/client/openai_client.py +305 -0
- ads/aqua/common/__init__.py +5 -0
- ads/aqua/common/decorator.py +125 -0
- ads/aqua/common/entities.py +274 -0
- ads/aqua/common/enums.py +134 -0
- ads/aqua/common/errors.py +109 -0
- ads/aqua/common/utils.py +1295 -0
- ads/aqua/config/__init__.py +4 -0
- ads/aqua/config/container_config.py +247 -0
- ads/aqua/config/evaluation/__init__.py +4 -0
- ads/aqua/config/evaluation/evaluation_service_config.py +147 -0
- ads/aqua/config/utils/__init__.py +4 -0
- ads/aqua/config/utils/serializer.py +339 -0
- ads/aqua/constants.py +116 -0
- ads/aqua/data.py +14 -0
- ads/aqua/dummy_data/icon.txt +1 -0
- ads/aqua/dummy_data/oci_model_deployments.json +56 -0
- ads/aqua/dummy_data/oci_models.json +1 -0
- ads/aqua/dummy_data/readme.md +26 -0
- ads/aqua/evaluation/__init__.py +8 -0
- ads/aqua/evaluation/constants.py +53 -0
- ads/aqua/evaluation/entities.py +186 -0
- ads/aqua/evaluation/errors.py +70 -0
- ads/aqua/evaluation/evaluation.py +1814 -0
- ads/aqua/extension/__init__.py +42 -0
- ads/aqua/extension/aqua_ws_msg_handler.py +76 -0
- ads/aqua/extension/base_handler.py +90 -0
- ads/aqua/extension/common_handler.py +121 -0
- ads/aqua/extension/common_ws_msg_handler.py +36 -0
- ads/aqua/extension/deployment_handler.py +381 -0
- ads/aqua/extension/deployment_ws_msg_handler.py +54 -0
- ads/aqua/extension/errors.py +30 -0
- ads/aqua/extension/evaluation_handler.py +129 -0
- ads/aqua/extension/evaluation_ws_msg_handler.py +61 -0
- ads/aqua/extension/finetune_handler.py +96 -0
- ads/aqua/extension/model_handler.py +390 -0
- ads/aqua/extension/models/__init__.py +0 -0
- ads/aqua/extension/models/ws_models.py +145 -0
- ads/aqua/extension/models_ws_msg_handler.py +50 -0
- ads/aqua/extension/ui_handler.py +300 -0
- ads/aqua/extension/ui_websocket_handler.py +130 -0
- ads/aqua/extension/utils.py +133 -0
- ads/aqua/finetuning/__init__.py +7 -0
- ads/aqua/finetuning/constants.py +23 -0
- ads/aqua/finetuning/entities.py +181 -0
- ads/aqua/finetuning/finetuning.py +749 -0
- ads/aqua/model/__init__.py +8 -0
- ads/aqua/model/constants.py +60 -0
- ads/aqua/model/entities.py +385 -0
- ads/aqua/model/enums.py +32 -0
- ads/aqua/model/model.py +2134 -0
- ads/aqua/model/utils.py +52 -0
- ads/aqua/modeldeployment/__init__.py +6 -0
- ads/aqua/modeldeployment/constants.py +10 -0
- ads/aqua/modeldeployment/deployment.py +1315 -0
- ads/aqua/modeldeployment/entities.py +653 -0
- ads/aqua/modeldeployment/utils.py +543 -0
- ads/aqua/resources/gpu_shapes_index.json +94 -0
- ads/aqua/server/__init__.py +4 -0
- ads/aqua/server/__main__.py +24 -0
- ads/aqua/server/app.py +47 -0
- ads/aqua/server/aqua_spec.yml +1291 -0
- ads/aqua/training/__init__.py +4 -0
- ads/aqua/training/exceptions.py +476 -0
- ads/aqua/ui.py +519 -0
- ads/automl/__init__.py +9 -0
- ads/automl/driver.py +330 -0
- ads/automl/provider.py +975 -0
- ads/bds/__init__.py +5 -0
- ads/bds/auth.py +127 -0
- ads/bds/big_data_service.py +255 -0
- ads/catalog/__init__.py +19 -0
- ads/catalog/model.py +1576 -0
- ads/catalog/notebook.py +461 -0
- ads/catalog/project.py +468 -0
- ads/catalog/summary.py +178 -0
- ads/common/__init__.py +11 -0
- ads/common/analyzer.py +65 -0
- ads/common/artifact/.model-ignore +63 -0
- ads/common/artifact/__init__.py +10 -0
- ads/common/auth.py +1122 -0
- ads/common/card_identifier.py +83 -0
- ads/common/config.py +647 -0
- ads/common/data.py +165 -0
- ads/common/decorator/__init__.py +9 -0
- ads/common/decorator/argument_to_case.py +88 -0
- ads/common/decorator/deprecate.py +69 -0
- ads/common/decorator/require_nonempty_arg.py +65 -0
- ads/common/decorator/runtime_dependency.py +178 -0
- ads/common/decorator/threaded.py +97 -0
- ads/common/decorator/utils.py +35 -0
- ads/common/dsc_file_system.py +303 -0
- ads/common/error.py +14 -0
- ads/common/extended_enum.py +81 -0
- ads/common/function/__init__.py +5 -0
- ads/common/function/fn_util.py +142 -0
- ads/common/function/func_conf.yaml +25 -0
- ads/common/ipython.py +76 -0
- ads/common/model.py +679 -0
- ads/common/model_artifact.py +1759 -0
- ads/common/model_artifact_schema.json +107 -0
- ads/common/model_export_util.py +664 -0
- ads/common/model_metadata.py +24 -0
- ads/common/object_storage_details.py +296 -0
- ads/common/oci_client.py +179 -0
- ads/common/oci_datascience.py +46 -0
- ads/common/oci_logging.py +1144 -0
- ads/common/oci_mixin.py +957 -0
- ads/common/oci_resource.py +136 -0
- ads/common/serializer.py +559 -0
- ads/common/utils.py +1852 -0
- ads/common/word_lists.py +1491 -0
- ads/common/work_request.py +189 -0
- ads/config.py +1 -0
- ads/data_labeling/__init__.py +13 -0
- ads/data_labeling/boundingbox.py +253 -0
- ads/data_labeling/constants.py +47 -0
- ads/data_labeling/data_labeling_service.py +244 -0
- ads/data_labeling/interface/__init__.py +5 -0
- ads/data_labeling/interface/loader.py +16 -0
- ads/data_labeling/interface/parser.py +16 -0
- ads/data_labeling/interface/reader.py +23 -0
- ads/data_labeling/loader/__init__.py +5 -0
- ads/data_labeling/loader/file_loader.py +241 -0
- ads/data_labeling/metadata.py +110 -0
- ads/data_labeling/mixin/__init__.py +5 -0
- ads/data_labeling/mixin/data_labeling.py +232 -0
- ads/data_labeling/ner.py +129 -0
- ads/data_labeling/parser/__init__.py +5 -0
- ads/data_labeling/parser/dls_record_parser.py +388 -0
- ads/data_labeling/parser/export_metadata_parser.py +94 -0
- ads/data_labeling/parser/export_record_parser.py +473 -0
- ads/data_labeling/reader/__init__.py +5 -0
- ads/data_labeling/reader/dataset_reader.py +574 -0
- ads/data_labeling/reader/dls_record_reader.py +121 -0
- ads/data_labeling/reader/export_record_reader.py +62 -0
- ads/data_labeling/reader/jsonl_reader.py +75 -0
- ads/data_labeling/reader/metadata_reader.py +203 -0
- ads/data_labeling/reader/record_reader.py +263 -0
- ads/data_labeling/record.py +52 -0
- ads/data_labeling/visualizer/__init__.py +5 -0
- ads/data_labeling/visualizer/image_visualizer.py +525 -0
- ads/data_labeling/visualizer/text_visualizer.py +357 -0
- ads/database/__init__.py +5 -0
- ads/database/connection.py +338 -0
- ads/dataset/__init__.py +10 -0
- ads/dataset/capabilities.md +51 -0
- ads/dataset/classification_dataset.py +339 -0
- ads/dataset/correlation.py +226 -0
- ads/dataset/correlation_plot.py +563 -0
- ads/dataset/dask_series.py +173 -0
- ads/dataset/dataframe_transformer.py +110 -0
- ads/dataset/dataset.py +1979 -0
- ads/dataset/dataset_browser.py +360 -0
- ads/dataset/dataset_with_target.py +995 -0
- ads/dataset/exception.py +25 -0
- ads/dataset/factory.py +987 -0
- ads/dataset/feature_engineering_transformer.py +35 -0
- ads/dataset/feature_selection.py +107 -0
- ads/dataset/forecasting_dataset.py +26 -0
- ads/dataset/helper.py +1450 -0
- ads/dataset/label_encoder.py +99 -0
- ads/dataset/mixin/__init__.py +5 -0
- ads/dataset/mixin/dataset_accessor.py +134 -0
- ads/dataset/pipeline.py +58 -0
- ads/dataset/plot.py +710 -0
- ads/dataset/progress.py +86 -0
- ads/dataset/recommendation.py +297 -0
- ads/dataset/recommendation_transformer.py +502 -0
- ads/dataset/regression_dataset.py +14 -0
- ads/dataset/sampled_dataset.py +1050 -0
- ads/dataset/target.py +98 -0
- ads/dataset/timeseries.py +18 -0
- ads/dbmixin/__init__.py +5 -0
- ads/dbmixin/db_pandas_accessor.py +153 -0
- ads/environment/__init__.py +9 -0
- ads/environment/ml_runtime.py +66 -0
- ads/evaluations/README.md +14 -0
- ads/evaluations/__init__.py +109 -0
- ads/evaluations/evaluation_plot.py +983 -0
- ads/evaluations/evaluator.py +1334 -0
- ads/evaluations/statistical_metrics.py +543 -0
- ads/experiments/__init__.py +9 -0
- ads/experiments/capabilities.md +0 -0
- ads/explanations/__init__.py +21 -0
- ads/explanations/base_explainer.py +142 -0
- ads/explanations/capabilities.md +83 -0
- ads/explanations/explainer.py +190 -0
- ads/explanations/mlx_global_explainer.py +1050 -0
- ads/explanations/mlx_interface.py +386 -0
- ads/explanations/mlx_local_explainer.py +287 -0
- ads/explanations/mlx_whatif_explainer.py +201 -0
- ads/feature_engineering/__init__.py +20 -0
- ads/feature_engineering/accessor/__init__.py +5 -0
- ads/feature_engineering/accessor/dataframe_accessor.py +535 -0
- ads/feature_engineering/accessor/mixin/__init__.py +5 -0
- ads/feature_engineering/accessor/mixin/correlation.py +166 -0
- ads/feature_engineering/accessor/mixin/eda_mixin.py +266 -0
- ads/feature_engineering/accessor/mixin/eda_mixin_series.py +85 -0
- ads/feature_engineering/accessor/mixin/feature_types_mixin.py +211 -0
- ads/feature_engineering/accessor/mixin/utils.py +65 -0
- ads/feature_engineering/accessor/series_accessor.py +431 -0
- ads/feature_engineering/adsimage/__init__.py +5 -0
- ads/feature_engineering/adsimage/image.py +192 -0
- ads/feature_engineering/adsimage/image_reader.py +170 -0
- ads/feature_engineering/adsimage/interface/__init__.py +5 -0
- ads/feature_engineering/adsimage/interface/reader.py +19 -0
- ads/feature_engineering/adsstring/__init__.py +7 -0
- ads/feature_engineering/adsstring/oci_language/__init__.py +8 -0
- ads/feature_engineering/adsstring/string/__init__.py +8 -0
- ads/feature_engineering/data_schema.json +57 -0
- ads/feature_engineering/dataset/__init__.py +5 -0
- ads/feature_engineering/dataset/zip_code_data.py +42062 -0
- ads/feature_engineering/exceptions.py +40 -0
- ads/feature_engineering/feature_type/__init__.py +133 -0
- ads/feature_engineering/feature_type/address.py +184 -0
- ads/feature_engineering/feature_type/adsstring/__init__.py +5 -0
- ads/feature_engineering/feature_type/adsstring/common_regex_mixin.py +164 -0
- ads/feature_engineering/feature_type/adsstring/oci_language.py +93 -0
- ads/feature_engineering/feature_type/adsstring/parsers/__init__.py +5 -0
- ads/feature_engineering/feature_type/adsstring/parsers/base.py +47 -0
- ads/feature_engineering/feature_type/adsstring/parsers/nltk_parser.py +96 -0
- ads/feature_engineering/feature_type/adsstring/parsers/spacy_parser.py +221 -0
- ads/feature_engineering/feature_type/adsstring/string.py +258 -0
- ads/feature_engineering/feature_type/base.py +58 -0
- ads/feature_engineering/feature_type/boolean.py +183 -0
- ads/feature_engineering/feature_type/category.py +146 -0
- ads/feature_engineering/feature_type/constant.py +137 -0
- ads/feature_engineering/feature_type/continuous.py +151 -0
- ads/feature_engineering/feature_type/creditcard.py +314 -0
- ads/feature_engineering/feature_type/datetime.py +190 -0
- ads/feature_engineering/feature_type/discrete.py +134 -0
- ads/feature_engineering/feature_type/document.py +43 -0
- ads/feature_engineering/feature_type/gis.py +251 -0
- ads/feature_engineering/feature_type/handler/__init__.py +5 -0
- ads/feature_engineering/feature_type/handler/feature_validator.py +524 -0
- ads/feature_engineering/feature_type/handler/feature_warning.py +319 -0
- ads/feature_engineering/feature_type/handler/warnings.py +128 -0
- ads/feature_engineering/feature_type/integer.py +142 -0
- ads/feature_engineering/feature_type/ip_address.py +144 -0
- ads/feature_engineering/feature_type/ip_address_v4.py +138 -0
- ads/feature_engineering/feature_type/ip_address_v6.py +138 -0
- ads/feature_engineering/feature_type/lat_long.py +256 -0
- ads/feature_engineering/feature_type/object.py +43 -0
- ads/feature_engineering/feature_type/ordinal.py +132 -0
- ads/feature_engineering/feature_type/phone_number.py +135 -0
- ads/feature_engineering/feature_type/string.py +171 -0
- ads/feature_engineering/feature_type/text.py +93 -0
- ads/feature_engineering/feature_type/unknown.py +43 -0
- ads/feature_engineering/feature_type/zip_code.py +164 -0
- ads/feature_engineering/feature_type_manager.py +406 -0
- ads/feature_engineering/schema.py +795 -0
- ads/feature_engineering/utils.py +245 -0
- ads/feature_store/.readthedocs.yaml +19 -0
- ads/feature_store/README.md +65 -0
- ads/feature_store/__init__.py +9 -0
- ads/feature_store/common/__init__.py +0 -0
- ads/feature_store/common/enums.py +339 -0
- ads/feature_store/common/exceptions.py +18 -0
- ads/feature_store/common/spark_session_singleton.py +125 -0
- ads/feature_store/common/utils/__init__.py +0 -0
- ads/feature_store/common/utils/base64_encoder_decoder.py +72 -0
- ads/feature_store/common/utils/feature_schema_mapper.py +283 -0
- ads/feature_store/common/utils/transformation_utils.py +82 -0
- ads/feature_store/common/utils/utility.py +403 -0
- ads/feature_store/data_validation/__init__.py +0 -0
- ads/feature_store/data_validation/great_expectation.py +129 -0
- ads/feature_store/dataset.py +1230 -0
- ads/feature_store/dataset_job.py +530 -0
- ads/feature_store/docs/Dockerfile +7 -0
- ads/feature_store/docs/Makefile +44 -0
- ads/feature_store/docs/conf.py +28 -0
- ads/feature_store/docs/requirements.txt +14 -0
- ads/feature_store/docs/source/ads.feature_store.query.rst +20 -0
- ads/feature_store/docs/source/cicd.rst +137 -0
- ads/feature_store/docs/source/conf.py +86 -0
- ads/feature_store/docs/source/data_versioning.rst +33 -0
- ads/feature_store/docs/source/dataset.rst +388 -0
- ads/feature_store/docs/source/dataset_job.rst +27 -0
- ads/feature_store/docs/source/demo.rst +70 -0
- ads/feature_store/docs/source/entity.rst +78 -0
- ads/feature_store/docs/source/feature_group.rst +624 -0
- ads/feature_store/docs/source/feature_group_job.rst +29 -0
- ads/feature_store/docs/source/feature_store.rst +122 -0
- ads/feature_store/docs/source/feature_store_class.rst +123 -0
- ads/feature_store/docs/source/feature_validation.rst +66 -0
- ads/feature_store/docs/source/figures/cicd.png +0 -0
- ads/feature_store/docs/source/figures/data_validation.png +0 -0
- ads/feature_store/docs/source/figures/data_versioning.png +0 -0
- ads/feature_store/docs/source/figures/dataset.gif +0 -0
- ads/feature_store/docs/source/figures/dataset.png +0 -0
- ads/feature_store/docs/source/figures/dataset_lineage.png +0 -0
- ads/feature_store/docs/source/figures/dataset_statistics.png +0 -0
- ads/feature_store/docs/source/figures/dataset_statistics_viz.png +0 -0
- ads/feature_store/docs/source/figures/dataset_validation_results.png +0 -0
- ads/feature_store/docs/source/figures/dataset_validation_summary.png +0 -0
- ads/feature_store/docs/source/figures/drift_monitoring.png +0 -0
- ads/feature_store/docs/source/figures/entity.png +0 -0
- ads/feature_store/docs/source/figures/feature_group.png +0 -0
- ads/feature_store/docs/source/figures/feature_group_lineage.png +0 -0
- ads/feature_store/docs/source/figures/feature_group_statistics_viz.png +0 -0
- ads/feature_store/docs/source/figures/feature_store_deployment.png +0 -0
- ads/feature_store/docs/source/figures/feature_store_overview.png +0 -0
- ads/feature_store/docs/source/figures/featuregroup.gif +0 -0
- ads/feature_store/docs/source/figures/lineage_d1.png +0 -0
- ads/feature_store/docs/source/figures/lineage_d2.png +0 -0
- ads/feature_store/docs/source/figures/lineage_fg.png +0 -0
- ads/feature_store/docs/source/figures/logo-dark-mode.png +0 -0
- ads/feature_store/docs/source/figures/logo-light-mode.png +0 -0
- ads/feature_store/docs/source/figures/overview.png +0 -0
- ads/feature_store/docs/source/figures/resource_manager.png +0 -0
- ads/feature_store/docs/source/figures/resource_manager_feature_store_stack.png +0 -0
- ads/feature_store/docs/source/figures/resource_manager_home.png +0 -0
- ads/feature_store/docs/source/figures/stats_1.png +0 -0
- ads/feature_store/docs/source/figures/stats_2.png +0 -0
- ads/feature_store/docs/source/figures/stats_d.png +0 -0
- ads/feature_store/docs/source/figures/stats_fg.png +0 -0
- ads/feature_store/docs/source/figures/transformation.png +0 -0
- ads/feature_store/docs/source/figures/transformations.gif +0 -0
- ads/feature_store/docs/source/figures/validation.png +0 -0
- ads/feature_store/docs/source/figures/validation_fg.png +0 -0
- ads/feature_store/docs/source/figures/validation_results.png +0 -0
- ads/feature_store/docs/source/figures/validation_summary.png +0 -0
- ads/feature_store/docs/source/index.rst +81 -0
- ads/feature_store/docs/source/module.rst +8 -0
- ads/feature_store/docs/source/notebook.rst +94 -0
- ads/feature_store/docs/source/overview.rst +47 -0
- ads/feature_store/docs/source/quickstart.rst +176 -0
- ads/feature_store/docs/source/release_notes.rst +194 -0
- ads/feature_store/docs/source/setup_feature_store.rst +81 -0
- ads/feature_store/docs/source/statistics.rst +58 -0
- ads/feature_store/docs/source/transformation.rst +199 -0
- ads/feature_store/docs/source/ui.rst +65 -0
- ads/feature_store/docs/source/user_guides.setup.feature_store_operator.rst +66 -0
- ads/feature_store/docs/source/user_guides.setup.helm_chart.rst +192 -0
- ads/feature_store/docs/source/user_guides.setup.terraform.rst +338 -0
- ads/feature_store/entity.py +718 -0
- ads/feature_store/execution_strategy/__init__.py +0 -0
- ads/feature_store/execution_strategy/delta_lake/__init__.py +0 -0
- ads/feature_store/execution_strategy/delta_lake/delta_lake_service.py +375 -0
- ads/feature_store/execution_strategy/engine/__init__.py +0 -0
- ads/feature_store/execution_strategy/engine/spark_engine.py +316 -0
- ads/feature_store/execution_strategy/execution_strategy.py +113 -0
- ads/feature_store/execution_strategy/execution_strategy_provider.py +47 -0
- ads/feature_store/execution_strategy/spark/__init__.py +0 -0
- ads/feature_store/execution_strategy/spark/spark_execution.py +618 -0
- ads/feature_store/feature.py +192 -0
- ads/feature_store/feature_group.py +1494 -0
- ads/feature_store/feature_group_expectation.py +346 -0
- ads/feature_store/feature_group_job.py +602 -0
- ads/feature_store/feature_lineage/__init__.py +0 -0
- ads/feature_store/feature_lineage/graphviz_service.py +180 -0
- ads/feature_store/feature_option_details.py +50 -0
- ads/feature_store/feature_statistics/__init__.py +0 -0
- ads/feature_store/feature_statistics/statistics_service.py +99 -0
- ads/feature_store/feature_store.py +699 -0
- ads/feature_store/feature_store_registrar.py +518 -0
- ads/feature_store/input_feature_detail.py +149 -0
- ads/feature_store/mixin/__init__.py +4 -0
- ads/feature_store/mixin/oci_feature_store.py +145 -0
- ads/feature_store/model_details.py +73 -0
- ads/feature_store/query/__init__.py +0 -0
- ads/feature_store/query/filter.py +266 -0
- ads/feature_store/query/generator/__init__.py +0 -0
- ads/feature_store/query/generator/query_generator.py +298 -0
- ads/feature_store/query/join.py +161 -0
- ads/feature_store/query/query.py +403 -0
- ads/feature_store/query/validator/__init__.py +0 -0
- ads/feature_store/query/validator/query_validator.py +57 -0
- ads/feature_store/response/__init__.py +0 -0
- ads/feature_store/response/response_builder.py +68 -0
- ads/feature_store/service/__init__.py +0 -0
- ads/feature_store/service/oci_dataset.py +139 -0
- ads/feature_store/service/oci_dataset_job.py +199 -0
- ads/feature_store/service/oci_entity.py +125 -0
- ads/feature_store/service/oci_feature_group.py +164 -0
- ads/feature_store/service/oci_feature_group_job.py +214 -0
- ads/feature_store/service/oci_feature_store.py +182 -0
- ads/feature_store/service/oci_lineage.py +87 -0
- ads/feature_store/service/oci_transformation.py +104 -0
- ads/feature_store/statistics/__init__.py +0 -0
- ads/feature_store/statistics/abs_feature_value.py +49 -0
- ads/feature_store/statistics/charts/__init__.py +0 -0
- ads/feature_store/statistics/charts/abstract_feature_plot.py +37 -0
- ads/feature_store/statistics/charts/box_plot.py +148 -0
- ads/feature_store/statistics/charts/frequency_distribution.py +65 -0
- ads/feature_store/statistics/charts/probability_distribution.py +68 -0
- ads/feature_store/statistics/charts/top_k_frequent_elements.py +98 -0
- ads/feature_store/statistics/feature_stat.py +126 -0
- ads/feature_store/statistics/generic_feature_value.py +33 -0
- ads/feature_store/statistics/statistics.py +41 -0
- ads/feature_store/statistics_config.py +101 -0
- ads/feature_store/templates/feature_store_template.yaml +45 -0
- ads/feature_store/transformation.py +499 -0
- ads/feature_store/validation_output.py +57 -0
- ads/hpo/__init__.py +9 -0
- ads/hpo/_imports.py +91 -0
- ads/hpo/ads_search_space.py +439 -0
- ads/hpo/distributions.py +325 -0
- ads/hpo/objective.py +280 -0
- ads/hpo/search_cv.py +1657 -0
- ads/hpo/stopping_criterion.py +75 -0
- ads/hpo/tuner_artifact.py +413 -0
- ads/hpo/utils.py +91 -0
- ads/hpo/validation.py +140 -0
- ads/hpo/visualization/__init__.py +5 -0
- ads/hpo/visualization/_contour.py +23 -0
- ads/hpo/visualization/_edf.py +20 -0
- ads/hpo/visualization/_intermediate_values.py +21 -0
- ads/hpo/visualization/_optimization_history.py +25 -0
- ads/hpo/visualization/_parallel_coordinate.py +169 -0
- ads/hpo/visualization/_param_importances.py +26 -0
- ads/jobs/__init__.py +53 -0
- ads/jobs/ads_job.py +663 -0
- ads/jobs/builders/__init__.py +5 -0
- ads/jobs/builders/base.py +156 -0
- ads/jobs/builders/infrastructure/__init__.py +6 -0
- ads/jobs/builders/infrastructure/base.py +165 -0
- ads/jobs/builders/infrastructure/dataflow.py +1252 -0
- ads/jobs/builders/infrastructure/dsc_job.py +1894 -0
- ads/jobs/builders/infrastructure/dsc_job_runtime.py +1233 -0
- ads/jobs/builders/infrastructure/utils.py +65 -0
- ads/jobs/builders/runtimes/__init__.py +5 -0
- ads/jobs/builders/runtimes/artifact.py +338 -0
- ads/jobs/builders/runtimes/base.py +325 -0
- ads/jobs/builders/runtimes/container_runtime.py +242 -0
- ads/jobs/builders/runtimes/python_runtime.py +1016 -0
- ads/jobs/builders/runtimes/pytorch_runtime.py +204 -0
- ads/jobs/cli.py +104 -0
- ads/jobs/env_var_parser.py +131 -0
- ads/jobs/extension.py +160 -0
- ads/jobs/schema/__init__.py +5 -0
- ads/jobs/schema/infrastructure_schema.json +116 -0
- ads/jobs/schema/job_schema.json +42 -0
- ads/jobs/schema/runtime_schema.json +183 -0
- ads/jobs/schema/validator.py +141 -0
- ads/jobs/serializer.py +296 -0
- ads/jobs/templates/__init__.py +5 -0
- ads/jobs/templates/container.py +6 -0
- ads/jobs/templates/driver_notebook.py +177 -0
- ads/jobs/templates/driver_oci.py +500 -0
- ads/jobs/templates/driver_python.py +48 -0
- ads/jobs/templates/driver_pytorch.py +852 -0
- ads/jobs/templates/driver_utils.py +615 -0
- ads/jobs/templates/hostname_from_env.c +55 -0
- ads/jobs/templates/oci_metrics.py +181 -0
- ads/jobs/utils.py +104 -0
- ads/llm/__init__.py +28 -0
- ads/llm/autogen/__init__.py +2 -0
- ads/llm/autogen/constants.py +15 -0
- ads/llm/autogen/reports/__init__.py +2 -0
- ads/llm/autogen/reports/base.py +67 -0
- ads/llm/autogen/reports/data.py +103 -0
- ads/llm/autogen/reports/session.py +526 -0
- ads/llm/autogen/reports/templates/chat_box.html +13 -0
- ads/llm/autogen/reports/templates/chat_box_lt.html +5 -0
- ads/llm/autogen/reports/templates/chat_box_rt.html +6 -0
- ads/llm/autogen/reports/utils.py +56 -0
- ads/llm/autogen/v02/__init__.py +4 -0
- ads/llm/autogen/v02/client.py +295 -0
- ads/llm/autogen/v02/log_handlers/__init__.py +2 -0
- ads/llm/autogen/v02/log_handlers/oci_file_handler.py +83 -0
- ads/llm/autogen/v02/loggers/__init__.py +6 -0
- ads/llm/autogen/v02/loggers/metric_logger.py +320 -0
- ads/llm/autogen/v02/loggers/session_logger.py +580 -0
- ads/llm/autogen/v02/loggers/utils.py +86 -0
- ads/llm/autogen/v02/runtime_logging.py +163 -0
- ads/llm/chain.py +268 -0
- ads/llm/chat_template.py +31 -0
- ads/llm/deploy.py +63 -0
- ads/llm/guardrails/__init__.py +5 -0
- ads/llm/guardrails/base.py +442 -0
- ads/llm/guardrails/huggingface.py +44 -0
- ads/llm/langchain/__init__.py +5 -0
- ads/llm/langchain/plugins/__init__.py +5 -0
- ads/llm/langchain/plugins/chat_models/__init__.py +5 -0
- ads/llm/langchain/plugins/chat_models/oci_data_science.py +1027 -0
- ads/llm/langchain/plugins/embeddings/__init__.py +4 -0
- ads/llm/langchain/plugins/embeddings/oci_data_science_model_deployment_endpoint.py +184 -0
- ads/llm/langchain/plugins/llms/__init__.py +5 -0
- ads/llm/langchain/plugins/llms/oci_data_science_model_deployment_endpoint.py +979 -0
- ads/llm/requirements.txt +3 -0
- ads/llm/serialize.py +219 -0
- ads/llm/serializers/__init__.py +0 -0
- ads/llm/serializers/retrieval_qa.py +153 -0
- ads/llm/serializers/runnable_parallel.py +27 -0
- ads/llm/templates/score_chain.jinja2 +155 -0
- ads/llm/templates/tool_chat_template_hermes.jinja +130 -0
- ads/llm/templates/tool_chat_template_mistral_parallel.jinja +94 -0
- ads/model/__init__.py +52 -0
- ads/model/artifact.py +573 -0
- ads/model/artifact_downloader.py +254 -0
- ads/model/artifact_uploader.py +267 -0
- ads/model/base_properties.py +238 -0
- ads/model/common/.model-ignore +66 -0
- ads/model/common/__init__.py +5 -0
- ads/model/common/utils.py +142 -0
- ads/model/datascience_model.py +2635 -0
- ads/model/deployment/__init__.py +20 -0
- ads/model/deployment/common/__init__.py +5 -0
- ads/model/deployment/common/utils.py +308 -0
- ads/model/deployment/model_deployer.py +466 -0
- ads/model/deployment/model_deployment.py +1846 -0
- ads/model/deployment/model_deployment_infrastructure.py +671 -0
- ads/model/deployment/model_deployment_properties.py +493 -0
- ads/model/deployment/model_deployment_runtime.py +838 -0
- ads/model/extractor/__init__.py +5 -0
- ads/model/extractor/automl_extractor.py +74 -0
- ads/model/extractor/embedding_onnx_extractor.py +80 -0
- ads/model/extractor/huggingface_extractor.py +88 -0
- ads/model/extractor/keras_extractor.py +84 -0
- ads/model/extractor/lightgbm_extractor.py +93 -0
- ads/model/extractor/model_info_extractor.py +114 -0
- ads/model/extractor/model_info_extractor_factory.py +105 -0
- ads/model/extractor/pytorch_extractor.py +87 -0
- ads/model/extractor/sklearn_extractor.py +112 -0
- ads/model/extractor/spark_extractor.py +89 -0
- ads/model/extractor/tensorflow_extractor.py +85 -0
- ads/model/extractor/xgboost_extractor.py +94 -0
- ads/model/framework/__init__.py +5 -0
- ads/model/framework/automl_model.py +178 -0
- ads/model/framework/embedding_onnx_model.py +438 -0
- ads/model/framework/huggingface_model.py +399 -0
- ads/model/framework/lightgbm_model.py +266 -0
- ads/model/framework/pytorch_model.py +266 -0
- ads/model/framework/sklearn_model.py +250 -0
- ads/model/framework/spark_model.py +326 -0
- ads/model/framework/tensorflow_model.py +254 -0
- ads/model/framework/xgboost_model.py +258 -0
- ads/model/generic_model.py +3518 -0
- ads/model/model_artifact_boilerplate/README.md +381 -0
- ads/model/model_artifact_boilerplate/__init__.py +5 -0
- ads/model/model_artifact_boilerplate/artifact_introspection_test/__init__.py +5 -0
- ads/model/model_artifact_boilerplate/artifact_introspection_test/model_artifact_validate.py +427 -0
- ads/model/model_artifact_boilerplate/artifact_introspection_test/requirements.txt +2 -0
- ads/model/model_artifact_boilerplate/runtime.yaml +7 -0
- ads/model/model_artifact_boilerplate/score.py +61 -0
- ads/model/model_file_description_schema.json +68 -0
- ads/model/model_introspect.py +331 -0
- ads/model/model_metadata.py +1810 -0
- ads/model/model_metadata_mixin.py +460 -0
- ads/model/model_properties.py +63 -0
- ads/model/model_version_set.py +739 -0
- ads/model/runtime/__init__.py +5 -0
- ads/model/runtime/env_info.py +306 -0
- ads/model/runtime/model_deployment_details.py +37 -0
- ads/model/runtime/model_provenance_details.py +58 -0
- ads/model/runtime/runtime_info.py +81 -0
- ads/model/runtime/schemas/inference_env_info_schema.yaml +16 -0
- ads/model/runtime/schemas/model_provenance_schema.yaml +36 -0
- ads/model/runtime/schemas/training_env_info_schema.yaml +16 -0
- ads/model/runtime/utils.py +201 -0
- ads/model/serde/__init__.py +5 -0
- ads/model/serde/common.py +40 -0
- ads/model/serde/model_input.py +547 -0
- ads/model/serde/model_serializer.py +1184 -0
- ads/model/service/__init__.py +5 -0
- ads/model/service/oci_datascience_model.py +1076 -0
- ads/model/service/oci_datascience_model_deployment.py +500 -0
- ads/model/service/oci_datascience_model_version_set.py +176 -0
- ads/model/transformer/__init__.py +5 -0
- ads/model/transformer/onnx_transformer.py +324 -0
- ads/mysqldb/__init__.py +5 -0
- ads/mysqldb/mysql_db.py +227 -0
- ads/opctl/__init__.py +18 -0
- ads/opctl/anomaly_detection.py +11 -0
- ads/opctl/backend/__init__.py +5 -0
- ads/opctl/backend/ads_dataflow.py +353 -0
- ads/opctl/backend/ads_ml_job.py +710 -0
- ads/opctl/backend/ads_ml_pipeline.py +164 -0
- ads/opctl/backend/ads_model_deployment.py +209 -0
- ads/opctl/backend/base.py +146 -0
- ads/opctl/backend/local.py +1053 -0
- ads/opctl/backend/marketplace/__init__.py +9 -0
- ads/opctl/backend/marketplace/helm_helper.py +173 -0
- ads/opctl/backend/marketplace/local_marketplace.py +271 -0
- ads/opctl/backend/marketplace/marketplace_backend_runner.py +71 -0
- ads/opctl/backend/marketplace/marketplace_operator_interface.py +44 -0
- ads/opctl/backend/marketplace/marketplace_operator_runner.py +24 -0
- ads/opctl/backend/marketplace/marketplace_utils.py +212 -0
- ads/opctl/backend/marketplace/models/__init__.py +5 -0
- ads/opctl/backend/marketplace/models/bearer_token.py +94 -0
- ads/opctl/backend/marketplace/models/marketplace_type.py +70 -0
- ads/opctl/backend/marketplace/models/ocir_details.py +56 -0
- ads/opctl/backend/marketplace/prerequisite_checker.py +238 -0
- ads/opctl/cli.py +707 -0
- ads/opctl/cmds.py +869 -0
- ads/opctl/conda/__init__.py +5 -0
- ads/opctl/conda/cli.py +193 -0
- ads/opctl/conda/cmds.py +749 -0
- ads/opctl/conda/config.yaml +34 -0
- ads/opctl/conda/manifest_template.yaml +13 -0
- ads/opctl/conda/multipart_uploader.py +188 -0
- ads/opctl/conda/pack.py +89 -0
- ads/opctl/config/__init__.py +5 -0
- ads/opctl/config/base.py +57 -0
- ads/opctl/config/diagnostics/__init__.py +5 -0
- ads/opctl/config/diagnostics/distributed/default_requirements_config.yaml +62 -0
- ads/opctl/config/merger.py +255 -0
- ads/opctl/config/resolver.py +297 -0
- ads/opctl/config/utils.py +79 -0
- ads/opctl/config/validator.py +17 -0
- ads/opctl/config/versioner.py +68 -0
- ads/opctl/config/yaml_parsers/__init__.py +7 -0
- ads/opctl/config/yaml_parsers/base.py +58 -0
- ads/opctl/config/yaml_parsers/distributed/__init__.py +7 -0
- ads/opctl/config/yaml_parsers/distributed/yaml_parser.py +201 -0
- ads/opctl/constants.py +66 -0
- ads/opctl/decorator/__init__.py +5 -0
- ads/opctl/decorator/common.py +129 -0
- ads/opctl/diagnostics/__init__.py +5 -0
- ads/opctl/diagnostics/__main__.py +25 -0
- ads/opctl/diagnostics/check_distributed_job_requirements.py +212 -0
- ads/opctl/diagnostics/check_requirements.py +144 -0
- ads/opctl/diagnostics/requirement_exception.py +9 -0
- ads/opctl/distributed/README.md +109 -0
- ads/opctl/distributed/__init__.py +5 -0
- ads/opctl/distributed/certificates.py +32 -0
- ads/opctl/distributed/cli.py +207 -0
- ads/opctl/distributed/cmds.py +731 -0
- ads/opctl/distributed/common/__init__.py +5 -0
- ads/opctl/distributed/common/abstract_cluster_provider.py +449 -0
- ads/opctl/distributed/common/abstract_framework_spec_builder.py +88 -0
- ads/opctl/distributed/common/cluster_config_helper.py +103 -0
- ads/opctl/distributed/common/cluster_provider_factory.py +21 -0
- ads/opctl/distributed/common/cluster_runner.py +54 -0
- ads/opctl/distributed/common/framework_factory.py +29 -0
- ads/opctl/docker/Dockerfile.job +103 -0
- ads/opctl/docker/Dockerfile.job.arm +107 -0
- ads/opctl/docker/Dockerfile.job.gpu +175 -0
- ads/opctl/docker/base-env.yaml +13 -0
- ads/opctl/docker/cuda.repo +6 -0
- ads/opctl/docker/operator/.dockerignore +0 -0
- ads/opctl/docker/operator/Dockerfile +41 -0
- ads/opctl/docker/operator/Dockerfile.gpu +85 -0
- ads/opctl/docker/operator/cuda.repo +6 -0
- ads/opctl/docker/operator/environment.yaml +8 -0
- ads/opctl/forecast.py +11 -0
- ads/opctl/index.yaml +3 -0
- ads/opctl/model/__init__.py +5 -0
- ads/opctl/model/cli.py +65 -0
- ads/opctl/model/cmds.py +73 -0
- ads/opctl/operator/README.md +4 -0
- ads/opctl/operator/__init__.py +31 -0
- ads/opctl/operator/cli.py +344 -0
- ads/opctl/operator/cmd.py +596 -0
- ads/opctl/operator/common/__init__.py +5 -0
- ads/opctl/operator/common/backend_factory.py +460 -0
- ads/opctl/operator/common/const.py +27 -0
- ads/opctl/operator/common/data/synthetic.csv +16001 -0
- ads/opctl/operator/common/dictionary_merger.py +148 -0
- ads/opctl/operator/common/errors.py +42 -0
- ads/opctl/operator/common/operator_config.py +99 -0
- ads/opctl/operator/common/operator_loader.py +811 -0
- ads/opctl/operator/common/operator_schema.yaml +130 -0
- ads/opctl/operator/common/operator_yaml_generator.py +152 -0
- ads/opctl/operator/common/utils.py +208 -0
- ads/opctl/operator/lowcode/__init__.py +5 -0
- ads/opctl/operator/lowcode/anomaly/MLoperator +16 -0
- ads/opctl/operator/lowcode/anomaly/README.md +207 -0
- ads/opctl/operator/lowcode/anomaly/__init__.py +5 -0
- ads/opctl/operator/lowcode/anomaly/__main__.py +103 -0
- ads/opctl/operator/lowcode/anomaly/cmd.py +35 -0
- ads/opctl/operator/lowcode/anomaly/const.py +167 -0
- ads/opctl/operator/lowcode/anomaly/environment.yaml +10 -0
- ads/opctl/operator/lowcode/anomaly/model/__init__.py +5 -0
- ads/opctl/operator/lowcode/anomaly/model/anomaly_dataset.py +146 -0
- ads/opctl/operator/lowcode/anomaly/model/anomaly_merlion.py +162 -0
- ads/opctl/operator/lowcode/anomaly/model/automlx.py +99 -0
- ads/opctl/operator/lowcode/anomaly/model/autots.py +115 -0
- ads/opctl/operator/lowcode/anomaly/model/base_model.py +404 -0
- ads/opctl/operator/lowcode/anomaly/model/factory.py +110 -0
- ads/opctl/operator/lowcode/anomaly/model/isolationforest.py +78 -0
- ads/opctl/operator/lowcode/anomaly/model/oneclasssvm.py +78 -0
- ads/opctl/operator/lowcode/anomaly/model/randomcutforest.py +120 -0
- ads/opctl/operator/lowcode/anomaly/model/tods.py +119 -0
- ads/opctl/operator/lowcode/anomaly/operator_config.py +127 -0
- ads/opctl/operator/lowcode/anomaly/schema.yaml +401 -0
- ads/opctl/operator/lowcode/anomaly/utils.py +88 -0
- ads/opctl/operator/lowcode/common/__init__.py +5 -0
- ads/opctl/operator/lowcode/common/const.py +10 -0
- ads/opctl/operator/lowcode/common/data.py +116 -0
- ads/opctl/operator/lowcode/common/errors.py +47 -0
- ads/opctl/operator/lowcode/common/transformations.py +296 -0
- ads/opctl/operator/lowcode/common/utils.py +384 -0
- ads/opctl/operator/lowcode/feature_store_marketplace/MLoperator +13 -0
- ads/opctl/operator/lowcode/feature_store_marketplace/README.md +30 -0
- ads/opctl/operator/lowcode/feature_store_marketplace/__init__.py +5 -0
- ads/opctl/operator/lowcode/feature_store_marketplace/__main__.py +116 -0
- ads/opctl/operator/lowcode/feature_store_marketplace/cmd.py +85 -0
- ads/opctl/operator/lowcode/feature_store_marketplace/const.py +15 -0
- ads/opctl/operator/lowcode/feature_store_marketplace/environment.yaml +0 -0
- ads/opctl/operator/lowcode/feature_store_marketplace/models/__init__.py +4 -0
- ads/opctl/operator/lowcode/feature_store_marketplace/models/apigw_config.py +32 -0
- ads/opctl/operator/lowcode/feature_store_marketplace/models/db_config.py +43 -0
- ads/opctl/operator/lowcode/feature_store_marketplace/models/mysql_config.py +120 -0
- ads/opctl/operator/lowcode/feature_store_marketplace/models/serializable_yaml_model.py +34 -0
- ads/opctl/operator/lowcode/feature_store_marketplace/operator_utils.py +386 -0
- ads/opctl/operator/lowcode/feature_store_marketplace/schema.yaml +160 -0
- ads/opctl/operator/lowcode/forecast/MLoperator +25 -0
- ads/opctl/operator/lowcode/forecast/README.md +209 -0
- ads/opctl/operator/lowcode/forecast/__init__.py +5 -0
- ads/opctl/operator/lowcode/forecast/__main__.py +89 -0
- ads/opctl/operator/lowcode/forecast/cmd.py +40 -0
- ads/opctl/operator/lowcode/forecast/const.py +92 -0
- ads/opctl/operator/lowcode/forecast/environment.yaml +20 -0
- ads/opctl/operator/lowcode/forecast/errors.py +26 -0
- ads/opctl/operator/lowcode/forecast/model/__init__.py +5 -0
- ads/opctl/operator/lowcode/forecast/model/arima.py +279 -0
- ads/opctl/operator/lowcode/forecast/model/automlx.py +553 -0
- ads/opctl/operator/lowcode/forecast/model/autots.py +312 -0
- ads/opctl/operator/lowcode/forecast/model/base_model.py +875 -0
- ads/opctl/operator/lowcode/forecast/model/factory.py +106 -0
- ads/opctl/operator/lowcode/forecast/model/forecast_datasets.py +492 -0
- ads/opctl/operator/lowcode/forecast/model/ml_forecast.py +243 -0
- ads/opctl/operator/lowcode/forecast/model/neuralprophet.py +482 -0
- ads/opctl/operator/lowcode/forecast/model/prophet.py +450 -0
- ads/opctl/operator/lowcode/forecast/model_evaluator.py +244 -0
- ads/opctl/operator/lowcode/forecast/operator_config.py +234 -0
- ads/opctl/operator/lowcode/forecast/schema.yaml +506 -0
- ads/opctl/operator/lowcode/forecast/utils.py +397 -0
- ads/opctl/operator/lowcode/forecast/whatifserve/__init__.py +7 -0
- ads/opctl/operator/lowcode/forecast/whatifserve/deployment_manager.py +285 -0
- ads/opctl/operator/lowcode/forecast/whatifserve/score.py +246 -0
- ads/opctl/operator/lowcode/pii/MLoperator +17 -0
- ads/opctl/operator/lowcode/pii/README.md +208 -0
- ads/opctl/operator/lowcode/pii/__init__.py +5 -0
- ads/opctl/operator/lowcode/pii/__main__.py +78 -0
- ads/opctl/operator/lowcode/pii/cmd.py +39 -0
- ads/opctl/operator/lowcode/pii/constant.py +84 -0
- ads/opctl/operator/lowcode/pii/environment.yaml +17 -0
- ads/opctl/operator/lowcode/pii/errors.py +27 -0
- ads/opctl/operator/lowcode/pii/model/__init__.py +5 -0
- ads/opctl/operator/lowcode/pii/model/factory.py +82 -0
- ads/opctl/operator/lowcode/pii/model/guardrails.py +167 -0
- ads/opctl/operator/lowcode/pii/model/pii.py +145 -0
- ads/opctl/operator/lowcode/pii/model/processor/__init__.py +34 -0
- ads/opctl/operator/lowcode/pii/model/processor/email_replacer.py +34 -0
- ads/opctl/operator/lowcode/pii/model/processor/mbi_replacer.py +35 -0
- ads/opctl/operator/lowcode/pii/model/processor/name_replacer.py +225 -0
- ads/opctl/operator/lowcode/pii/model/processor/number_replacer.py +73 -0
- ads/opctl/operator/lowcode/pii/model/processor/remover.py +26 -0
- ads/opctl/operator/lowcode/pii/model/report.py +487 -0
- ads/opctl/operator/lowcode/pii/operator_config.py +95 -0
- ads/opctl/operator/lowcode/pii/schema.yaml +108 -0
- ads/opctl/operator/lowcode/pii/utils.py +43 -0
- ads/opctl/operator/lowcode/recommender/MLoperator +16 -0
- ads/opctl/operator/lowcode/recommender/README.md +206 -0
- ads/opctl/operator/lowcode/recommender/__init__.py +5 -0
- ads/opctl/operator/lowcode/recommender/__main__.py +82 -0
- ads/opctl/operator/lowcode/recommender/cmd.py +33 -0
- ads/opctl/operator/lowcode/recommender/constant.py +30 -0
- ads/opctl/operator/lowcode/recommender/environment.yaml +11 -0
- ads/opctl/operator/lowcode/recommender/model/base_model.py +212 -0
- ads/opctl/operator/lowcode/recommender/model/factory.py +56 -0
- ads/opctl/operator/lowcode/recommender/model/recommender_dataset.py +25 -0
- ads/opctl/operator/lowcode/recommender/model/svd.py +106 -0
- ads/opctl/operator/lowcode/recommender/operator_config.py +81 -0
- ads/opctl/operator/lowcode/recommender/schema.yaml +265 -0
- ads/opctl/operator/lowcode/recommender/utils.py +13 -0
- ads/opctl/operator/runtime/__init__.py +5 -0
- ads/opctl/operator/runtime/const.py +17 -0
- ads/opctl/operator/runtime/container_runtime_schema.yaml +50 -0
- ads/opctl/operator/runtime/marketplace_runtime.py +50 -0
- ads/opctl/operator/runtime/python_marketplace_runtime_schema.yaml +21 -0
- ads/opctl/operator/runtime/python_runtime_schema.yaml +21 -0
- ads/opctl/operator/runtime/runtime.py +115 -0
- ads/opctl/schema.yaml.yml +36 -0
- ads/opctl/script.py +40 -0
- ads/opctl/spark/__init__.py +5 -0
- ads/opctl/spark/cli.py +43 -0
- ads/opctl/spark/cmds.py +147 -0
- ads/opctl/templates/diagnostic_report_template.jinja2 +102 -0
- ads/opctl/utils.py +344 -0
- ads/oracledb/__init__.py +5 -0
- ads/oracledb/oracle_db.py +346 -0
- ads/pipeline/__init__.py +39 -0
- ads/pipeline/ads_pipeline.py +2279 -0
- ads/pipeline/ads_pipeline_run.py +772 -0
- ads/pipeline/ads_pipeline_step.py +605 -0
- ads/pipeline/builders/__init__.py +5 -0
- ads/pipeline/builders/infrastructure/__init__.py +5 -0
- ads/pipeline/builders/infrastructure/custom_script.py +32 -0
- ads/pipeline/cli.py +119 -0
- ads/pipeline/extension.py +291 -0
- ads/pipeline/schema/__init__.py +5 -0
- ads/pipeline/schema/cs_step_schema.json +35 -0
- ads/pipeline/schema/ml_step_schema.json +31 -0
- ads/pipeline/schema/pipeline_schema.json +71 -0
- ads/pipeline/visualizer/__init__.py +5 -0
- ads/pipeline/visualizer/base.py +570 -0
- ads/pipeline/visualizer/graph_renderer.py +272 -0
- ads/pipeline/visualizer/text_renderer.py +84 -0
- ads/secrets/__init__.py +11 -0
- ads/secrets/adb.py +386 -0
- ads/secrets/auth_token.py +86 -0
- ads/secrets/big_data_service.py +365 -0
- ads/secrets/mysqldb.py +149 -0
- ads/secrets/oracledb.py +160 -0
- ads/secrets/secrets.py +407 -0
- ads/telemetry/__init__.py +7 -0
- ads/telemetry/base.py +69 -0
- ads/telemetry/client.py +122 -0
- ads/telemetry/telemetry.py +257 -0
- ads/templates/dataflow_pyspark.jinja2 +13 -0
- ads/templates/dataflow_sparksql.jinja2 +22 -0
- ads/templates/func.jinja2 +20 -0
- ads/templates/schemas/openapi.json +1740 -0
- ads/templates/score-pkl.jinja2 +173 -0
- ads/templates/score.jinja2 +322 -0
- ads/templates/score_embedding_onnx.jinja2 +202 -0
- ads/templates/score_generic.jinja2 +165 -0
- ads/templates/score_huggingface_pipeline.jinja2 +217 -0
- ads/templates/score_lightgbm.jinja2 +185 -0
- ads/templates/score_onnx.jinja2 +407 -0
- ads/templates/score_onnx_new.jinja2 +473 -0
- ads/templates/score_oracle_automl.jinja2 +185 -0
- ads/templates/score_pyspark.jinja2 +154 -0
- ads/templates/score_pytorch.jinja2 +219 -0
- ads/templates/score_scikit-learn.jinja2 +184 -0
- ads/templates/score_tensorflow.jinja2 +184 -0
- ads/templates/score_xgboost.jinja2 +178 -0
- ads/text_dataset/__init__.py +5 -0
- ads/text_dataset/backends.py +211 -0
- ads/text_dataset/dataset.py +445 -0
- ads/text_dataset/extractor.py +207 -0
- ads/text_dataset/options.py +53 -0
- ads/text_dataset/udfs.py +22 -0
- ads/text_dataset/utils.py +49 -0
- ads/type_discovery/__init__.py +9 -0
- ads/type_discovery/abstract_detector.py +21 -0
- ads/type_discovery/constant_detector.py +41 -0
- ads/type_discovery/continuous_detector.py +54 -0
- ads/type_discovery/credit_card_detector.py +99 -0
- ads/type_discovery/datetime_detector.py +92 -0
- ads/type_discovery/discrete_detector.py +118 -0
- ads/type_discovery/document_detector.py +146 -0
- ads/type_discovery/ip_detector.py +68 -0
- ads/type_discovery/latlon_detector.py +90 -0
- ads/type_discovery/phone_number_detector.py +63 -0
- ads/type_discovery/type_discovery_driver.py +87 -0
- ads/type_discovery/typed_feature.py +594 -0
- ads/type_discovery/unknown_detector.py +41 -0
- ads/type_discovery/zipcode_detector.py +48 -0
- ads/vault/__init__.py +7 -0
- ads/vault/vault.py +237 -0
- {oracle_ads-2.13.9rc0.dist-info → oracle_ads-2.13.10rc0.dist-info}/METADATA +150 -149
- oracle_ads-2.13.10rc0.dist-info/RECORD +858 -0
- {oracle_ads-2.13.9rc0.dist-info → oracle_ads-2.13.10rc0.dist-info}/WHEEL +1 -2
- {oracle_ads-2.13.9rc0.dist-info → oracle_ads-2.13.10rc0.dist-info}/entry_points.txt +2 -1
- oracle_ads-2.13.9rc0.dist-info/RECORD +0 -9
- oracle_ads-2.13.9rc0.dist-info/top_level.txt +0 -1
- {oracle_ads-2.13.9rc0.dist-info → oracle_ads-2.13.10rc0.dist-info}/licenses/LICENSE.txt +0 -0
ads/dataset/helper.py
ADDED
@@ -0,0 +1,1450 @@
#!/usr/bin/env python
# -*- coding: utf-8; -*-

# Copyright (c) 2020, 2023 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

import ast
import base64
import html
import io
import math
import os
import warnings
import re
from collections import defaultdict
import inspect
import importlib
from typing import Callable, List, Tuple, Union
import fsspec

# from pandas.io.common import _compression_to_extension

from numbers import Number
from urllib.parse import urlparse

import numpy as np
import pandas as pd

from pandas.core.dtypes.common import (
    is_numeric_dtype,
    is_bool_dtype,
    is_categorical_dtype,
    is_datetime64_any_dtype,
    is_float_dtype,
)

from ads.common.decorator.runtime_dependency import (
    runtime_dependency,
    OptionalDependency,
)
from ads.common import utils
from ads.dataset import logger
from ads.type_discovery.type_discovery_driver import TypeDiscoveryDriver
from ads.type_discovery.typed_feature import (
    ContinuousTypedFeature,
    DateTimeTypedFeature,
    CategoricalTypedFeature,
    GISTypedFeature,
    TypedFeature,
    UnknownTypedFeature,
    OrdinalTypedFeature,
    DocumentTypedFeature,
)

class DatasetDefaults:
    sampling_confidence_level = 95
    sampling_confidence_interval = 1.0


_known_db_protocols = {"sqlite", "ADB", "oracle+cx_oracle"}


def concatenate(X, y):
    if isinstance(X, pd.DataFrame) or isinstance(X, pd.Series):
        return pd.concat([X, y], axis=1)
    else:
        return X.assign(**{y.name: y})


def fix_column_names(X):
    X.columns = X.columns.astype("str").str.strip().str.replace(" ", "_")
    return X


def convert_columns(df, feature_metadata=None, dtypes=None):
    if feature_metadata is not None:
        dtypes = {}
        for feature in feature_metadata:
            dtype = get_dtype(feature_metadata[feature], df[feature].dtype)
            if dtype is not None:
                dtypes[feature] = dtype
    return df.astype(dtypes)


def get_dtype(feature_type, dtype):
    if isinstance(feature_type, ContinuousTypedFeature) or isinstance(
        feature_type, OrdinalTypedFeature
    ):
        return dtype.name if is_numeric_dtype(dtype) else "float"
    elif isinstance(feature_type, DateTimeTypedFeature):
        return "datetime64[ns]" if not dtype.name.startswith("datetime") else dtype
    elif isinstance(feature_type, CategoricalTypedFeature):
        return "bool" if is_bool_dtype(dtype) else "category"


def get_feature_type(name, series):
    if is_bool_dtype(series) or is_categorical_dtype(series):
        return CategoricalTypedFeature.build(name, series)
    elif is_numeric_dtype(series):
        if is_float_dtype(series):
            return ContinuousTypedFeature.build(name, series)
        else:
            return OrdinalTypedFeature.build(name, series)
    elif is_datetime64_any_dtype(series):
        return DateTimeTypedFeature.build(name, series)
    else:
        return UnknownTypedFeature.build(name, series)


def convert_to_html(plot):
    img = io.BytesIO()
    plot.savefig(img, format="png", bbox_inches="tight")
    img.seek(0)
    encoded = base64.b64encode(img.getvalue())
    return '<img width=95%" src="data:image/png;base64, {}"><hr><br>'.format(
        encoded.decode("utf-8")
    )


def _num_partitions_for_dataframe(df):
    # takes pandas dataframe, guesses good number of partitions
    return utils.get_cpu_count() if df.shape[0] > 1000 * utils.get_cpu_count() else 1
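
The helpers above map pandas dtypes onto ADS typed features: floats become ContinuousTypedFeature, other numerics OrdinalTypedFeature, bools and categoricals CategoricalTypedFeature, datetimes DateTimeTypedFeature, and everything else UnknownTypedFeature. A minimal sketch of that mapping, using made-up series:

import pandas as pd

from ads.dataset.helper import get_feature_type

# Made-up series, purely to illustrate the dtype -> typed-feature mapping.
prices = pd.Series([1.5, 2.25, 3.0], name="price")   # float -> ContinuousTypedFeature
counts = pd.Series([1, 2, 3], name="count")          # int   -> OrdinalTypedFeature
flags = pd.Series([True, False, True], name="flag")  # bool  -> CategoricalTypedFeature

for name, series in [("price", prices), ("count", counts), ("flag", flags)]:
    print(name, type(get_feature_type(name, series)).__name__)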

class ElaboratedPath:
    """
    The ElaboratedPath class unifies all of the operations and information related to a path or path list.
    An ElaboratedPath can accept any of the following as a valid source:
    * A single path
    * A glob pattern path
    * A directory
    * A list of paths (Note: all of these paths must be from the same filesystem AND have the same format)
    * A sqlalchemy connection url
    """

    def __init__(
        self,
        source: Union[str, List[str]],
        format: str = None,
        name: str = None,
        **kwargs,
    ):
        """
        :param source:
        :param format:
        :param kwargs:

        By the end of this method, this class needs to have paths, format, and name ready.
        """
        self._kwargs = kwargs
        self._format = format
        self._name = name
        if isinstance(source, str):
            self._original_source = source
            self._determine_protocol_type()
            if self._type == "db":
                self._paths = [self._original_source]
            else:
                self._elaborate_path()
        elif isinstance(source, list) and all(isinstance(file, str) for file in source):
            assert len(source) > 0, "Error, the source you passed in was an empty list."
            self._original_source = source[0]
            self._paths = source
            self._type = "list"
        else:
            raise ValueError(f"Source argument not understood: {source}")
        if self.num_paths == 0:
            raise FileNotFoundError(
                f"Error: We could not find any files associated with the source: "
                f"{source}. Double check that this source is a valid glob pattern,"
                f" directory, or path."
            )
        self._determine_format()
        self._determine_name()

    @property
    def paths(self) -> List[str]:
        """
        :return: a list of str
            Each element will be a valid path.
        """
        return self._paths

    @property
    def num_paths(self) -> int:
        """
        Returns the number of paths found for the original glob, folder, or path.
        If this returns 0, no files were found.
        :return: int
        """
        return len(self._paths)

    @property
    def name(self) -> str:
        return self._name

    @property
    def format(self) -> str:
        return self._format

    def _determine_name(self):
        if self._name is None:
            if self._type == "list":
                self._name = (
                    f"DataFrame from [{os.path.basename(self._original_source)}, ...]"
                )
            elif self._type == "glob":
                self._name = f"DataFrame from {os.path.basename(self._original_source)}"
            else:
                self._name = f"DataFrame from {urlparse(self._original_source).scheme}"

    def _determine_format(self):
        """
        Infer the format from the path(s).

        If it is a compressed file, returns the extension before the compression extension.
        If the extension cannot be inferred, returns None.

        Returns
        -------
        format : str
        """
        if self._format in [None, "infer"]:
            format_keys = []
            for i in range(min(self.num_paths, 5)):
                format_keys.append(self._remove_compressions(self.paths[i]))
            if len(format_keys) == 0:
                raise ValueError(
                    f"Could not determine the format key for source: {self._original_source}"
                )
            if format_keys.count(format_keys[0]) != len(format_keys):
                raise ValueError(
                    f"Got multiple formats from the source: {self._original_source}. Run again "
                    f'using the format parameter. Ex: format=<your format key, like: "csv", "hdf", etc.>'
                )
            self._format = format_keys[0]
        else:
            self._format = self._format.lower()

    def _elaborate_path(self):
        self._paths = self._fs.glob(self._original_source)
        if self._protocol != "":
            self._paths = [f"{self._protocol}://{p}" for p in self._paths]

    def _determine_protocol_type(self):
        self._protocol = urlparse(self._original_source).scheme

        if self._kwargs.get("fs") is not None:
            self._fs = self._kwargs.pop("fs")
            self._type = "glob"
        elif self._original_source.startswith("oracle+cx_oracle://"):
            self._protocol = "oracle+cx_oracle"
            self._type = "db"
        else:
            try:
                self._fs = fsspec.filesystem(
                    self._protocol, **self._kwargs.get("storage_options", dict())
                )
                self._type = "glob"
            except ValueError:
                try:
                    self.engine = utils.get_sqlalchemy_engine(
                        self._original_source, **self._kwargs
                    )
                    self._type = "db"
                except Exception:
                    if self._protocol in _known_db_protocols:
                        self._type = "db"
                    else:
                        raise ValueError(
                            f"Error in trying to understand the protocol for source: "
                            f"{self._original_source}. The protocol found: {self._protocol} is not "
                            f"registered with fsspec or sqlalchemy"
                        )

    @staticmethod
    def _remove_compressions(filename: str):
        _compression_to_extension = [
            ".gz",
            ".bz2",
            ".zip",
            ".xz",
            ".zst",
            ".tar",
            ".tar.gz",
            ".tar.xz",
            ".tar.bz2",
        ]
        for compression in _compression_to_extension:
            if filename.strip().endswith(compression):
                return ElaboratedPath._remove_compressions(
                    os.path.splitext(filename.rstrip("/*"))[0]
                )
        format = os.path.splitext(filename.rstrip("/*"))[1][1:].lower()
        return format.lower() if format != "" else None
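
A minimal usage sketch for ElaboratedPath; the directory and files below are hypothetical:

from ads.dataset.helper import ElaboratedPath

# Hypothetical local glob: both matched files share the "csv" format, so
# inference succeeds, and ".gz" is stripped first by _remove_compressions.
ep = ElaboratedPath("/tmp/data/*.csv.gz")
print(ep.paths)   # e.g. ['/tmp/data/part-0.csv.gz', '/tmp/data/part-1.csv.gz']
print(ep.format)  # 'csv'
print(ep.name)    # 'DataFrame from *.csv.gz'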

class DatasetLoadException(BaseException):
    def __init__(self, exc_msg):
        self.exc_msg = exc_msg

    def __str__(self):
        return self.exc_msg


def _get_dtype_from_error(e):
    error_string = str(e)

    if "mismatched dtypes" in error_string.lower():
        # For the mismatched dtypes error, dask either returns an error message containing the dtype
        # argument to specify, or the found and expected dtypes in a table format, depending on what
        # stage the type inferencing fails. The logic below builds the dtype dictionary for both cases.
        found_dtype_dict_str_list = re.findall(
            r"dtype=({[^{}]+})", error_string, re.MULTILINE
        )
        if found_dtype_dict_str_list:
            found_dtype_dict = ast.literal_eval(found_dtype_dict_str_list[0])
        else:
            found_dtype_dict = _find_dtypes_from_table(error_string)
        if found_dtype_dict:
            logger.warning(
                "Dask type-inference/coercion failed. Retrying with "
                f"dtype={found_dtype_dict}.",
                exc_info=True,
            )
            return found_dtype_dict
    return None


def _find_dtypes_from_table(error_string):
    error_lines = error_string.splitlines()
    dtypes = {}
    # matches '| Column | Found | Expected |'
    pattern = re.compile(
        "\\s*\\|\\s*Column\\s*\\|\\s*Found\\s*\\|\\s*Expected\\s*\\|\\s*"
    )
    for i, line in enumerate(error_lines):
        if re.match(pattern, line):
            for j in range(i + 2, len(error_lines)):
                # extracts column_name and found_dtype from '| <column_name> | <found_dtype> | <expected_dtype> |'
                dtype_suggestion = re.compile("\\s*\\|([^\\|]+)\\|([^\\|]+)\\|.*")
                match_groups = re.match(dtype_suggestion, error_lines[j])
                if match_groups is None:
                    break
                dtypes[match_groups.group(1).strip()] = match_groups.group(2).strip()
    return dtypes
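
To see what _find_dtypes_from_table recovers, here is a made-up error string in the dask table style the parser expects (header row, separator row, then data rows):

from ads.dataset.helper import _find_dtypes_from_table

msg = (
    "Mismatched dtypes found in `pd.read_csv`/`pd.read_table`.\n"
    "| Column | Found   | Expected |\n"
    "|--------|---------|----------|\n"
    "| age    | float64 | int64    |\n"
    "| name   | object  | int64    |\n"
)
print(_find_dtypes_from_table(msg))  # {'age': 'float64', 'name': 'object'}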
+def rename_duplicate_cols(original_cols):
+    seen_col_names = defaultdict(int)
+    new_cols = []
+    for col in original_cols:
+        # replace any spaces in column names with underscores
+        if isinstance(col, str):
+            col = col.replace(" ", "_")  # str.replace returns a new string
+        if col not in seen_col_names:
+            new_cols.append(col)
+        else:
+            dup_count = seen_col_names[col]
+            new_cols.append(f"{col}.{dup_count}")
+        seen_col_names[col] += 1
+    assert len(new_cols) == len(
+        original_cols
+    ), "There has been an error in re-naming duplicate columns"
+    return new_cols
+
+
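A quick sketch of the renaming behavior; the column names here are hypothetical:

cols = ["id", "value", "value", "value"]
print(rename_duplicate_cols(cols))
# -> ['id', 'value', 'value.1', 'value.2']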
+def write_parquet(
+    path,
+    data,
+    engine="fastparquet",
+    metadata_dict=None,
+    compression=None,
+    storage_options=None,
+):
+    """
+    Uses fastparquet to write a pandas DataFrame and custom metadata in parquet format
+
+    Parameters
+    ----------
+    path : str
+        Path to write to
+    data : pandas.DataFrame
+    engine : string
+        "fastparquet" by default
+    metadata_dict : Deprecated, will not pass through
+    compression : {'snappy', 'gzip', 'brotli', None}, default None
+        Name of the compression to use
+    storage_options : dict, optional
+        storage arguments required to read the path
+
+    Returns
+    -------
+    str : the file path the parquet was written to
+    """
+    assert isinstance(data, pd.DataFrame)
+    if metadata_dict is not None:
+        warnings.warn(
+            "The `metadata_dict` argument is deprecated and has no effect on this method.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+    data.to_parquet(
+        path,
+        engine=engine,
+        compression=compression,
+        storage_options=storage_options,
+    )
+    return path
+
+
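A minimal sketch of a call, assuming the `fastparquet` engine is installed; the path and frame are made up:

import pandas as pd

frame = pd.DataFrame({"x": [1, 2, 3], "y": ["a", "b", "c"]})
written_to = write_parquet("/tmp/example.parquet", frame)  # returns the path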
+def is_text_data(df, target=None):
+    if len(df.columns.values) == 2:
+        feature_name = (
+            list(set(df.columns.values) ^ set([target]))[0]
+            if target
+            else list(set(df.columns.values))[0]
+        )
+    elif len(df.columns.values) == 1:
+        feature_name = df.columns.values[0]
+    else:
+        return False
+    return isinstance(
+        TypeDiscoveryDriver().discover(feature_name, df[feature_name]),
+        DocumentTypedFeature,
+    )
+
+
+def generate_sample(
+    df: pd.DataFrame,
+    n: int,
+    confidence_level: int = DatasetDefaults.sampling_confidence_level,
+    confidence_interval: float = DatasetDefaults.sampling_confidence_interval,
+    **kwargs,
+):
+    min_size_to_sample = min(n, 10000)
+
+    sample_size = None
+
+    if "sample_max_rows" in kwargs:
+        requested_sample_size = int(kwargs["sample_max_rows"])
+
+        if requested_sample_size < 0:
+            sample_size = calculate_sample_size(
+                n, min_size_to_sample, confidence_level, confidence_interval
+            )
+        else:
+            if min_size_to_sample < requested_sample_size < n:
+                logger.info(
+                    f"Downsampling from {n} rows, to the user specified {requested_sample_size} rows for graphing."
+                )
+                sample_size = requested_sample_size
+            elif requested_sample_size >= n:
+                logger.info(f"Using the entire dataset of {n} rows for graphing.")
+                sample_size = n
+            else:
+                sample_size = min_size_to_sample
+                logger.info(
+                    f"Downsampling from {n} rows, to {sample_size} rows for graphing."
+                )
+
+    if sample_size and len(df) > sample_size:
+        frac = min(1.0, sample_size * 1.05 / n)
+        df = df.sample(frac=frac, random_state=42)
+        return df.head(sample_size) if len(df) > sample_size else df
+    else:
+        return df
+
+
+def calculate_sample_size(
+    population_size, min_size_to_sample, confidence_level=95, confidence_interval=1.0
+):
+    """Find sample size for a population using Cochran's Sample Size Formula,
+    with default values for confidence_level (percentage, default: 95%)
+    and confidence_interval (margin of error, percentage, default: 1%).
+
+    SUPPORTED CONFIDENCE LEVELS: 50%, 68%, 90%, 95%, 99%, 99.5%, and 99.9%
+    *ONLY* - this is because the Z-score is table based, and Z is only
+    provided for common confidence levels.
+    """
+
+    if population_size < min_size_to_sample:
+        return None
+
+    confidence_level_constant = {
+        50: 0.67,
+        68: 0.99,
+        90: 1.64,
+        95: 1.96,
+        99: 2.57,
+        99.5: 2.807,
+        99.9: 3.291,
+    }
+
+    p = 0.5
+    e = confidence_interval / 100.0
+    N = population_size
+
+    # fall back to the 99% Z-score for unsupported levels (a raw fallback of 99
+    # would be a Z value, not a table lookup)
+    Z = confidence_level_constant.get(confidence_level, confidence_level_constant[99])
+
+    n_0 = ((Z**2) * p * (1 - p)) / (e**2)
+    n = n_0 / (1 + ((n_0 - 1) / float(N)))
+
+    sample_size = max(int(math.ceil(n)), min_size_to_sample)
+
+    logger.info(f"Downsampling from {population_size} rows to {sample_size} rows.")
+
+    return sample_size
+
+
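Worked numbers for the formula above, assuming N = 1,000,000 rows at the 95% level (Z = 1.96) with a 1% margin of error:

Z, p, e, N = 1.96, 0.5, 0.01, 1_000_000
n_0 = (Z**2) * p * (1 - p) / e**2  # 9604.0
n = n_0 / (1 + (n_0 - 1) / N)      # ~9512.7, ceil -> 9513
# calculate_sample_size(1_000_000, 10_000) therefore returns max(9513, 10_000) = 10_000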
+def map_types(types):
+    for column in types:
+        if types[column] == "continuous":
+            types[column] = "float64"
+        elif types[column] == "ordinal":
+            types[column] = "int64"
+        elif types[column] == "categorical":
+            types[column] = "category"
+        elif types[column] == "datetime":
+            types[column] = "datetime64[ns]"
+    return types
+
+
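For example (the input dict is hypothetical):

print(map_types({"age": "continuous", "rank": "ordinal", "color": "categorical", "ts": "datetime"}))
# -> {'age': 'float64', 'rank': 'int64', 'color': 'category', 'ts': 'datetime64[ns]'}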
+@runtime_dependency(module="IPython", install_from=OptionalDependency.NOTEBOOK)
+@runtime_dependency(module="graphviz", install_from=OptionalDependency.VIZ)
+def visualize_transformation(transformer_pipeline, text=None):
+    dot = graphviz.Digraph()
+
+    # show a single node for partitions
+    dot.attr(
+        "node",
+        shape="tab",
+        style="filled",
+        fontname="courier",
+        fontsize="12",
+        fontcolor="white",
+        resolution="144",
+    )
+    if text:
+        dot.node("partitions", text, margin="0.25", fillcolor="dimgray")
+
+    dot.attr(
+        "node",
+        shape="component",
+        style="filled",
+        fontname="courier",
+        fontsize="10",
+        fontcolor="black",
+        resolution="144",
+    )
+    for step in transformer_pipeline.steps:
+        name, clazz, clazzname, is_ads = (
+            step[0],
+            step[1],
+            step[1].__class__.__name__,
+            "ads" in str(step[1].__class__),
+        )
+        ads_node = str(step[1].__class__.__name__) in [
+            "AutoMLPreprocessingTransformer",
+            "DataFrameTransformer",
+            "RecommendationTransformer",
+            "AutoMLFeatureSelection",
+            "FeatureEngineeringTransformer",
+        ]
+        if ads_node:
+            text2html = "< {} >".format(
+                html.escape(step[1].__repr__()).replace("\n", "<br/>")
+            )
+            dot.node(name, text2html, margin="0.25", fillcolor="gold2")
+        else:
+            dot.node(name, name.rsplit("/")[0], fillcolor="azure")
+
+    def format_label(stage):
+        if "FunctionTransformer" in str(transformer_pipeline.steps[stage][1].__class__):
+            return "<<font face='courier' point-size='10'> <b>{}</b> </font>>".format(
+                html.escape(transformer_pipeline.steps[stage][1].func.__name__)
+            )
+        else:
+            return "<<font face='courier' point-size='10'> <b>{}</b> </font>>".format(
+                transformer_pipeline.steps[stage][1].__class__.__name__
+            )
+
+    edges = [x[0] for x in transformer_pipeline.steps]
+    for i, edge in enumerate(list(zip(edges[:-1], edges[1:]))):
+        dot.edge(*edge, len="1.00", label=format_label(i))
+
+    # terminus node
+    dot.node("terminus", "", shape="terminator", fillcolor="white")
+    dot.edge(edges[-1], "terminus", len="1.00", label=format_label(len(edges) - 1))
+
+    graph = graphviz.Source(dot)
+
+    from IPython.core.display import display, SVG
+
+    display(SVG(graph.pipe(format="svg")))
+
+
+def up_sample(df, target, sampler="default", feature_types=None):
+    """
+    Fixes an imbalanced dataset by up-sampling.
+
+    Parameters
+    ----------
+    df : Union[pandas.DataFrame, dask.dataframe.core.DataFrame]
+    target : name of the target column in df
+    sampler : should implement a fit_resample(X, y) method
+    feature_types : a dictionary of per-column statistics used to fill missing
+        values before resampling; only needed when a column has missing values
+
+    Returns
+    -------
+    upsampled_df : Union[pandas.DataFrame, dask.dataframe.core.DataFrame]
+    """
+    if sampler != "default":
+        if inspect.getattr_static(sampler, "fit_resample", None) is None:
+            raise AttributeError("`sampler` object must have a `fit_resample` method.")
+        else:
+            # exactly two input args X, y will be passed to fit_resample()
+            # check the signature of fit_resample
+            num_no_default_params = 0
+            sig = inspect.signature(sampler.fit_resample)
+            for param in sig.parameters.values():
+                if param.default is param.empty:
+                    num_no_default_params += 1
+            if len(sig.parameters) < 2 or num_no_default_params > 2:
+                raise RuntimeError(
+                    "The signature for `sampler.fit_resample` has to be `fit_resample(X, y)`."
+                )
+
+    X = df.drop(target, axis=1)
+    y = df[target]
+
+    feature_types = feature_types if feature_types is not None else {}
+
+    columns_with_nans = X.columns.values[X.isna().any()]
+    if len(columns_with_nans) > 0:
+        fill_nan_dict = {}
+        for column in columns_with_nans:
+            if column in feature_types and "mode" in feature_types[column]["stats"]:
+                fill_nan_dict[column] = feature_types[column]["stats"]["mode"]
+            elif column in feature_types and "mean" in feature_types[column]["stats"]:
+                fill_nan_dict[column] = feature_types[column]["stats"]["mean"]
+            elif column in feature_types and "median" in feature_types[column]["stats"]:
+                fill_nan_dict[column] = feature_types[column]["stats"]["median"]
+            else:
+                logger.warning(
+                    "Sampling from a column that has missing values may cause an error."
+                )
+        X = X.fillna(fill_nan_dict)
+
+    if sampler == "default":
+        imblearn_found = importlib.util.find_spec("imblearn") is not None
+        if not imblearn_found:
+            raise ModuleNotFoundError(
+                "Required package for up-sampling `imblearn` not found. Install "
+                "`imblearn` with `pip install imbalanced-learn` and rerun to "
+                "enable up-sampling."
+            )
+        else:
+            sampler = _get_imblearn_sampler(X, y)
+    return _sample(sampler, X, y)
+
+
+def _get_imblearn_sampler(X, y):
+    from imblearn.over_sampling import SMOTE, RandomOverSampler
+
+    categorical_feature_indices = [
+        X.columns.get_loc(c)
+        for c in X.select_dtypes(
+            include=["category", "object", "datetime64"]
+        ).columns.values
+    ]
+
+    if len(categorical_feature_indices) > 0:
+        logger.info(
+            "Using the default `RandomOverSampler` sampler. Use `sample` to specify a "
+            "sampler. Classes will be equalized. You can also pass in other samplers "
+            "such as `imblearn.SMOTENC` instead, e.g. "
+            "sampler = SMOTENC(categorical_features=categorical_feature_indices); "
+            "ds.up_sample(sampler=sampler)"
+        )
+        return RandomOverSampler(random_state=42)
+
+    min_sample_size = y.value_counts().min()
+
+    k_neighbors = min(min_sample_size - 1, 5)
+    if k_neighbors == 0:
+        logger.warning(
+            "k_neighbors is 0 because the target has a class label that appears "
+            "only once. SMOTE would fail, so defaulting to RandomOverSampler."
+        )
+        return RandomOverSampler(random_state=42)
+    else:
+        if 5 > k_neighbors > 0:
+            logger.info(
+                f"`k_neighbors` of SMOTE has been changed to {k_neighbors}"
+                " as the target has at least one class which appeared "
+                f"only {min_sample_size} times in the data."
+            )
+        logger.info("Using SMOTE for over sampling. Classes will be equalized.")
+        return SMOTE(random_state=42, k_neighbors=k_neighbors)
+
+
+def down_sample(df, target):
+    """
+    Fixes an imbalanced dataset by down-sampling.
+
+    Parameters
+    ----------
+    df : pandas.DataFrame
+    target : name of the target column in df
+
+    Returns
+    -------
+    downsampled_df : pandas.DataFrame
+    """
+    dfs = []
+    target_value_counts = df[target].value_counts()
+    min_key = min(target_value_counts.items(), key=lambda k: k[1])
+    for key, value in target_value_counts.items():
+        if key != min_key[0]:
+            dfs.append(
+                df[df[target] == key].sample(frac=1 - ((value - min_key[1]) / value))
+            )
+    dfs.append(df[df[target] == min_key[0]])
+    return pd.concat(dfs)
+
+
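A sketch of both balancing paths on a toy frame; `up_sample` assumes `imbalanced-learn` is installed, and the column names are made up:

import pandas as pd

toy = pd.DataFrame({"x": range(10), "label": [0] * 8 + [1] * 2})
balanced_up = up_sample(toy, target="label")      # minority class grown to 8 rows
balanced_down = down_sample(toy, target="label")  # majority class shrunk toward 2 rows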
+def _sample(sampler, X, y):
+    if isinstance(y, pd.Series) and (
+        isinstance(y.iloc[0], bool) or isinstance(y.iloc[0], np.bool_)
+    ):
+        # convert bools to ints so that SMOTE can sample properly; use positional
+        # access (iloc) so a non-zero-based index does not raise
+        y_trans = y.astype(int)
+        X_resampled, y_resampled = sampler.fit_resample(X=X, y=y_trans)
+    else:
+        X_resampled, y_resampled = sampler.fit_resample(X=X, y=y)
+
+    if not isinstance(X_resampled, pd.DataFrame):
+        X_resampled = pd.DataFrame(X_resampled, columns=X.columns.values)
+    if not isinstance(y_resampled, pd.Series):
+        y_resampled = pd.DataFrame(y_resampled, columns=[y.name])[y.name]
+
+    for k in X.dtypes.keys():
+        X_resampled[k] = X_resampled[k].astype(X.dtypes[k].name)
+    balanced_df = concatenate(X_resampled, y_resampled)
+    return balanced_df
+
+
+def get_fill_val(feature_types, column, action, constant="constant"):
+    # `action` can be one of the following:
+    # "Fill missing values with mean", "Fill missing values with median",
+    # "Fill missing values with frequent", "Fill missing values with constant"
+    action_ = action.split(" ")[-1]
+    fill_type = "mode" if action_ == "frequent" else action_
+    try:
+        fill_val = (
+            feature_types[column].meta_data["stats"][fill_type]
+            if action_ != "constant"
+            else constant
+        )
+        fill_val = round(fill_val, 4) if isinstance(fill_val, Number) else fill_val
+    except Exception:
+        fill_val = None
+    return fill_val
+
+
+def parse_apache_log_str(x):
+    """
+    Returns the string delimited by two characters.
+
+    Source: https://mmas.github.io/read-apache-access-log-pandas
+    Example:
+        `>>> parse_apache_log_str('[my string]')`
+        `'my string'`
+    """
+    if x is not None:
+        return x[1:-1]
+    return np.nan
+
+
+def parse_apache_log_datetime(x):
+    """
+    Parses a datetime with timezone formatted as:
+    `[day/month/year:hour:minute:second zone]`
+
+    Source: https://mmas.github.io/read-apache-access-log-pandas
+    Example:
+        `>>> parse_apache_log_datetime('[13/Nov/2015:11:45:42 +0000]')`
+        `datetime.datetime(2015, 11, 13, 11, 45, 42, tzinfo=pytz.FixedOffset(0))`
+
+    Due to problems parsing the timezone (`%z`) with `datetime.strptime`, the
+    timezone will be obtained using the `pytz` library.
+    """
+    import pytz
+    from datetime import datetime
+
+    dt = datetime.strptime(x[1:-7], "%d/%b/%Y:%H:%M:%S")
+    # minutes east of UTC; handle the sign explicitly so negative offsets such
+    # as "-0530" do not mix signs between the hour and minute components
+    sign = -1 if x[-6] == "-" else 1
+    dt_tz = sign * (int(x[-5:-3]) * 60 + int(x[-3:-1]))
+    return dt.replace(tzinfo=pytz.FixedOffset(dt_tz))
+
+
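The slicing and offset arithmetic, traced for a hypothetical log entry:

x = "[13/Nov/2015:11:45:42 +0530]"
# x[1:-7] -> "13/Nov/2015:11:45:42", parsed with "%d/%b/%Y:%H:%M:%S"
# offset  -> +(5 * 60 + 30) = 330 minutes east of UTC, i.e. pytz.FixedOffset(330)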
+def deprecate_variable(old_var, new_var, warning_msg, warning_type):
+    if old_var is not None:
+        warnings.warn(warning_msg, warning_type)
+        return old_var
+    return new_var
+
+
+def deprecate_default_value(var, old_value, new_value, warning_msg, warning_type):
+    if var == old_value:
+        warnings.warn(warning_msg, warning_type)
+        return new_value
+    else:
+        return var
+
+
+def _log_yscale_not_set():
+    logger.info(
+        "`yscale` parameter is not set. Valid values are `'linear'`, `'log'`, `'symlog'`."
+    )
+
+
+def infer_target_type(target, target_series, discover_target_type=True):
+    # with type discovery on, discover the type; otherwise infer it from the
+    # pandas dtype
+    if discover_target_type:
+        target_type = TypeDiscoveryDriver().discover(
+            target, target_series, is_target=True
+        )
+    else:
+        target_type = get_feature_type(target, target_series)
+    return target_type
+
+
+def get_target_type(target, sampled_df, **init_kwargs):
+    discover_target_type = init_kwargs.get("type_discovery", True)
+    if target in init_kwargs.get("types", {}):
+        sampled_df[target] = sampled_df[target].astype(init_kwargs.get("types")[target])
+        discover_target_type = False
+    return infer_target_type(target, sampled_df[target], discover_target_type)
+
+
+def get_dataset(
+    df: pd.DataFrame,
+    sampled_df: pd.DataFrame,
+    target: str,
+    target_type: TypedFeature,
+    shape: Tuple[int, int],
+    positive_class=None,
+    **init_kwargs,
+):
+    from ads.dataset.classification_dataset import (
+        BinaryClassificationDataset,
+        BinaryTextClassificationDataset,
+        MultiClassClassificationDataset,
+        MultiClassTextClassificationDataset,
+    )
+    from ads.dataset.forecasting_dataset import ForecastingDataset
+    from ads.dataset.regression_dataset import RegressionDataset
+
+    if len(df[target].dropna()) == 0:
+        logger.warning(
+            "It is not recommended to use an empty column as the target variable."
+        )
+        raise ValueError("We do not support using empty columns as the chosen target")
+    if utils.is_same_class(target_type, ContinuousTypedFeature):
+        return RegressionDataset(
+            df=df,
+            sampled_df=sampled_df,
+            target=target,
+            target_type=target_type,
+            shape=shape,
+            **init_kwargs,
+        )
+    elif utils.is_same_class(
+        target_type, DateTimeTypedFeature
+    ) or df.index.dtype.name.startswith("datetime"):
+        return ForecastingDataset(
+            df=df,
+            sampled_df=sampled_df,
+            target=target,
+            target_type=target_type,
+            shape=shape,
+            **init_kwargs,
+        )
+
+    # Adding ordinal typed feature, but ultimately we should rethink how we want to model this type
+    elif utils.is_same_class(
+        target_type, CategoricalTypedFeature
+    ) or utils.is_same_class(target_type, OrdinalTypedFeature):
+        if target_type.meta_data["internal"]["unique"] == 2:
+            if is_text_data(sampled_df, target):
+                return BinaryTextClassificationDataset(
+                    df=df,
+                    sampled_df=sampled_df,
+                    target=target,
+                    shape=shape,
+                    target_type=target_type,
+                    positive_class=positive_class,
+                    **init_kwargs,
+                )
+
+            return BinaryClassificationDataset(
+                df=df,
+                sampled_df=sampled_df,
+                target=target,
+                shape=shape,
+                target_type=target_type,
+                positive_class=positive_class,
+                **init_kwargs,
+            )
+        else:
+            if is_text_data(sampled_df, target):
+                return MultiClassTextClassificationDataset(
+                    df=df,
+                    sampled_df=sampled_df,
+                    target=target,
+                    target_type=target_type,
+                    shape=shape,
+                    **init_kwargs,
+                )
+            return MultiClassClassificationDataset(
+                df=df,
+                sampled_df=sampled_df,
+                target=target,
+                target_type=target_type,
+                shape=shape,
+                **init_kwargs,
+            )
+    elif (
+        utils.is_same_class(target_type, DocumentTypedFeature)
+        or "text" in target_type["type"]
+        or "text" in target
+    ):
+        raise ValueError(f"The column {target} cannot be used as the target column.")
+    elif (
+        utils.is_same_class(target_type, GISTypedFeature)
+        or "coord" in target_type["type"]
+        or "coord" in target
+    ):
+        raise ValueError(f"The column {target} cannot be used as the target column.")
+    # This is to catch constant columns that are boolean. Added as a fix for pd.isnull(), and datasets with a
+    # binary target, but only data on one instance
+    elif target_type["low_level_type"] == "bool":
+        return BinaryClassificationDataset(
+            df=df,
+            sampled_df=sampled_df,
+            target=target,
+            shape=shape,
+            target_type=target_type,
+            positive_class=positive_class,
+            **init_kwargs,
+        )
+    raise ValueError(
+        f"Unable to identify problem type. Specify the data type of {target} using 'types'. "
+        f"For example, types = {{{target}: 'category'}}"
+    )
+
+
+def open(
+    source,
+    target=None,
+    format="infer",
+    reader_fn: Callable = None,
+    name: str = None,
+    description="",
+    npartitions: int = None,
+    type_discovery=True,
+    html_table_index=None,
+    column_names="infer",
+    sample_max_rows=10000,
+    positive_class=None,
+    transformer_pipeline=None,
+    types={},
+    **kwargs,
+):
+    """
+    Returns an object of ADSDataset or ADSDatasetWithTarget read from the given path
+
+    .. deprecated:: 2.6.6
+        Deprecated in favor of using Pandas. Pandas supports reading from object storage directly.
+        Check https://accelerated-data-science.readthedocs.io/en/latest/user_guide/loading_data/connect.html
+
+    Parameters
+    ----------
+    source: Union[str, pandas.DataFrame, h2o.DataFrame, pyspark.sql.dataframe.DataFrame]
+        If str, URI for the dataset. The dataset could be read from local or network file system, hdfs, s3
+        and gcs; in a pyspark conda env it can optionally be read through pyspark.
+    target: str, optional
+        Name of the target in dataset.
+        If set an ADSDatasetWithTarget object is returned, otherwise an ADSDataset object is returned which can be
+        used to understand the dataset through visualizations
+    format: str, default: infer
+        Format of the dataset.
+        Supported formats: CSV, TSV, Parquet, libsvm, JSON, XLS/XLSX (Excel), HDF5, SQL, XML,
+        Apache server log files (clf, log), ARFF.
+        By default, the format would be inferred from the ending of the dataset file path.
+    reader_fn: Callable, default: None
+        The user may pass in their own custom reader function.
+        It must accept `(path, **kwarg)` and return a pandas DataFrame
+    name: str, optional default: ""
+    description: str, optional default: ""
+        Text describing the dataset
+    npartitions: int, deprecated
+        Number of partitions to split the data
+        By default this is set to the max number of cores supported by the backend compute accelerator
+    type_discovery: bool, default: True
+        If false, the data types of the dataframe are used as such.
+        By default, the dataframe columns are associated with the best suited data types. Associating the features
+        with the discovered datatypes would impact visualizations and model prediction.
+    html_table_index: int, optional
+        The index of the dataframe table in html content. This is used when the format of dataset is html
+    column_names: 'infer', list of str or None, default: 'infer'
+        Supported only for CSV and TSV.
+        List of column names to use.
+        By default, column names are inferred from the first line of the file.
+        If set to None, column names would be auto-generated instead of inferring from file.
+        If the file already contains a column header, specify header=0 to ignore the existing column names.
+    sample_max_rows: int, default: 10000; use -1 to auto-calculate the sample size, use 0 (zero) for no sampling
+        Sample size of the dataframe to use for visualization and optimization.
+    positive_class: Any, optional
+        Label in target for binary classification problems which should be identified as positive for modeling.
+        By default, the first unique value is considered as the positive label.
+    types: dict, optional
+        Dictionary of <feature_name> : <data_type> to override the data type of features.
+    transformer_pipeline: datasets.pipeline.TransformerPipeline, optional
+        A pipeline of transformations done outside the sdk that need to be applied at the time of scoring
+    storage_options: dict, default: varies by source type
+        Parameters passed on to the backend filesystem class.
+    sep: str
+        Delimiting character for parsing the input file.
+    kwargs: additional keyword arguments that would be passed to underlying dataframe read API
+        based on the format of the dataset
+
+    Returns
+    -------
+    dataset : An instance of ADSDataset
+    (or)
+    dataset_with_target : An instance of ADSDatasetWithTarget
+    """
+    if npartitions:
+        warnings.warn(
+            "Variable `npartitions` is deprecated and will not be used",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+    if (
+        "storage_options" not in kwargs
+        and type(source) is str
+        and len(source) > 6
+        and source[:6] == "oci://"
+    ):
+        kwargs["storage_options"] = {"config": {}}
+
+    if isinstance(source, str) or isinstance(source, list):
+        progress = utils.get_progress_bar(4)
+        progress.update("Opening data")
+        path = ElaboratedPath(source, format=format, **kwargs)
+        reader_fn = (
+            get_format_reader(path=path, **kwargs) if reader_fn is None else reader_fn
+        )
+        df = load_dataset(path=path, reader_fn=reader_fn, **kwargs)
+        name = path.name
+    elif isinstance(source, pd.DataFrame):
+        progress = utils.get_progress_bar(4)
+        progress.update("Partitioning data")
+        df = source
+        name = "User Provided DataFrame" if name is None else name
+    else:
+        raise TypeError(
+            f"The Source type: {type(source)} is not supported for DatasetFactory."
+        )
+    shape = df.shape
+    return build_dataset(
+        df=df,
+        shape=shape,
+        target=target,
+        sample_max_rows=sample_max_rows,
+        type_discovery=type_discovery,
+        types=types,
+        positive_class=positive_class,
+        name=name,
+        transformer_pipeline=transformer_pipeline,
+        description=description,
+        progress=progress,
+        **utils.inject_and_copy_kwargs(
+            kwargs,
+            **{"html_table_index": html_table_index, "column_names": column_names},
+        ),
+    )
+
+
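A minimal sketch of the deprecated entry point in use; the bucket, namespace and target column are hypothetical:

ds = open(
    "oci://my-bucket@my-namespace/train.csv",  # hypothetical object storage URI
    target="label",
    sample_max_rows=5000,
)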
+def build_dataset(
+    df: pd.DataFrame,
+    shape: Tuple[int, int],
+    target: str = None,
+    progress=None,
+    **kwargs,
+):
+    from ads.dataset.dataset import ADSDataset
+
+    n = shape[0]
+    if progress:
+        progress.update("Generating data sample")
+
+    sampled_df = generate_sample(
+        df,
+        n,
+        DatasetDefaults.sampling_confidence_level,
+        DatasetDefaults.sampling_confidence_interval,
+        **kwargs,
+    )
+
+    if target is None:
+        if progress:
+            progress.update("Building the dataset with no target.")
+        result = ADSDataset(df=df, sampled_df=sampled_df, shape=shape, **kwargs)
+        if progress:
+            progress.update("Done")
+        logger.info(
+            "Use `set_target()` to type the dataset for a particular learning task."
+        )
+        return result
+
+    if progress:
+        progress.update("Building dataset")
+
+    discover_target_type = kwargs.get("type_discovery", True)
+    if target in kwargs.get("types", {}):
+        sampled_df[target] = sampled_df[target].astype(kwargs["types"][target])
+        discover_target_type = False
+
+    # if type discovery is turned off, the type is inferred from the pandas dtype
+    target_type = infer_target_type(target, sampled_df[target], discover_target_type)
+
+    result = get_dataset(
+        df=df,
+        sampled_df=sampled_df,
+        target=target,
+        target_type=target_type,
+        shape=shape,
+        **kwargs,
+    )
+    if progress:
+        progress.update("Done")
+    logger.info(
+        "Use `suggest_recommendations()` to view and apply recommendations for dataset optimization."
+    )
+    return result
+
+
+class CustomFormatReaders:
+    @staticmethod
+    def read_tsv(path: str, **kwargs) -> pd.DataFrame:
+        return pd.read_csv(
+            path, **utils.inject_and_copy_kwargs(kwargs, **{"sep": "\t"})
+        )
+
+    @staticmethod
+    def read_json(path: str, **kwargs) -> pd.DataFrame:
+        try:
+            return pd.read_json(path, **kwargs)
+        except ValueError:
+            return pd.read_json(
+                path, **utils.inject_and_copy_kwargs(kwargs, **{"lines": True})
+            )
+
+    @staticmethod
+    def read_libsvm(path: str, **kwargs) -> pd.DataFrame:
+        from sklearn.datasets import load_svmlight_file
+        from joblib import Memory
+
+        mem = Memory("./mycache")
+
+        @mem.cache
+        def get_data(path):
+            X, y = load_svmlight_file(path)
+            df = pd.DataFrame(X.todense())
+            df["target"] = y
+            return df
+
+        return get_data(path)
+
+    @staticmethod
+    @runtime_dependency(
+        module="pandavro", object="read_avro", install_from=OptionalDependency.DATA
+    )
+    def read_avro(path: str, **kwargs) -> pd.DataFrame:
+        return read_avro(path, **kwargs)
+
+    DEFAULT_SQL_CHUNKSIZE = 12007
+    DEFAULT_SQL_ARRAYSIZE = 50000
+    DEFAULT_SQL_MIL = 128
+    DEFAULT_SQL_CTU = False
+
+    @classmethod
+    def read_sql(cls, path: str, table: str = None, **kwargs) -> pd.DataFrame:
+        """
+        :param path: str
+            This is the connection URL that gets passed to sqlalchemy's create_engine method
+        :param table: str
+            This is either the name of a table to select * from or a sql query to be run
+        :param kwargs:
+        :return: pd.DataFrame
+        """
+        if table is None:
+            raise ValueError(
+                "In order to read from a database you need to specify the table using the `table` "
+                "argument."
+            )
+        # check if it's an oracle dialect
+        if str(path).lower().startswith("oracle"):
+            kwargs = utils.inject_and_copy_kwargs(
+                kwargs,
+                **{
+                    "arraysize": cls.DEFAULT_SQL_ARRAYSIZE,
+                    "max_identifier_length": cls.DEFAULT_SQL_MIL,
+                    "coerce_to_unicode": cls.DEFAULT_SQL_CTU,
+                },
+            )
+        engine = utils.get_sqlalchemy_engine(path, **kwargs)
+
+        table_name = table.strip()
+        with engine.connect() as connection:
+            # if it's a query expression:
+            if table_name.lower().startswith("select"):
+                sql_query = table_name
+            else:
+                sql_query = f"select * from {table_name}"
+
+            chunks = pd.read_sql_query(
+                sql_query,
+                con=connection,
+                **validate_kwargs(
+                    pd.read_sql_query,
+                    utils.inject_and_copy_kwargs(
+                        kwargs, **{"chunksize": cls.DEFAULT_SQL_CHUNKSIZE}
+                    ),
+                ),
+            )
+            df = pd.DataFrame()
+            from tqdm import tqdm
+
+            # manual progress updates; the total is unknown until the last chunk
+            with tqdm(unit=" rows") as t:
+                for chunk in chunks:
+                    df = pd.concat([df, chunk])
+                    t.update(len(chunk))
+
+            df = df.reset_index(drop=True)
+            if df.shape[0] == 0:
+                logger.warning(
+                    "The SQL expression returned zero rows. Therefore, no `ADSdataset` object was created."
+                )
+                raise Exception("The SQL expression returned no rows")
+        return df
+
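Both accepted `table` forms, sketched against a hypothetical SQLite URL; the table and column names are made up:

df_all = CustomFormatReaders.read_sql("sqlite:///example.db", table="events")
df_subset = CustomFormatReaders.read_sql(
    "sqlite:///example.db",
    table="select id, ts from events where id < 100",
)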
+    @staticmethod
+    def read_log(path, **kwargs):
+        from ads.dataset.helper import parse_apache_log_str, parse_apache_log_datetime
+
+        df = pd.read_csv(
+            path,
+            sep=r'\s(?=(?:[^"]*"[^"]*")*[^"]*$)(?![^\[]*\])',
+            engine="python",
+            na_values="-",
+            header=None,
+            names=[
+                "host",
+                "identity",
+                "user",
+                "time",
+                "request",
+                "http_code",
+                "response_bytes",
+                "referer",
+                "user_agent",
+                "unknown",
+            ],
+            converters={
+                "time": parse_apache_log_datetime,
+                "request": parse_apache_log_str,
+                "referer": parse_apache_log_str,
+                "user_agent": parse_apache_log_str,
+            },
+            **kwargs,
+        )
+        return df
+
1307
|
+
@staticmethod
|
1308
|
+
def read_html(path, html_table_index: int = None, **kwargs):
|
1309
|
+
if html_table_index is None:
|
1310
|
+
return pd.concat(df for df in pd.read_html(path, **kwargs))
|
1311
|
+
else:
|
1312
|
+
return pd.read_html(path, **kwargs)[html_table_index]
|
1313
|
+
+    @staticmethod
+    @runtime_dependency(module="scipy", install_from=OptionalDependency.VIZ)
+    def read_arff(path, **kwargs):
+        from scipy.io import arff
+        import requests
+        from io import BytesIO, TextIOWrapper
+
+        data = None
+        if os.path.isfile(path):
+            data, _ = arff.loadarff(path)
+        else:
+            with requests.get(path) as r:
+                if r.status_code == requests.codes.ok:
+                    f = TextIOWrapper(BytesIO(r.content))
+                    data, _ = arff.loadarff(f)
+        return pd.DataFrame(data)
+
+    @staticmethod
+    def read_xml(path: str, **kwargs) -> pd.DataFrame:
+        """
+        Load data from xml file.
+
+        Parameters
+        ----------
+        path: str
+            Path to XML file
+        storage_options: dict, optional
+            Storage options passed to Pandas to read the file.
+
+        Returns
+        -------
+        dataframe : pandas.DataFrame
+        """
+        import xml.etree.ElementTree as et
+
+        def get_children(df, node, parent, i):
+            for name in node.attrib.keys():
+                df.at[i, parent + name] = node.attrib[name]
+            for child in list(node):
+                if len(list(child)) > 0:
+                    get_children(df, child, parent + child.tag + "/", i)
+                else:
+                    df.at[i, parent + child.tag] = child.text
+
+        storage_options = kwargs.get("storage_options", {})
+
+        file_handles = fsspec.open_files(path, mode="rb", **storage_options)
+        ret_df = pd.DataFrame()
+        last_i = 0
+        for file_handle in file_handles:
+            with file_handle:
+                # parse the opened handle (not the raw path) so remote files work
+                parsed_xml = et.parse(file_handle)
+                for i, node in enumerate(parsed_xml.getroot()):
+                    get_children(ret_df, node, node.tag + "/", last_i + i)
+                # advance past the rows already written for the next file
+                last_i += i + 1
+        return ret_df
+
+
+reader_fns = {
+    "csv": pd.read_csv,
+    "tsv": CustomFormatReaders.read_tsv,
+    "json": CustomFormatReaders.read_json,
+    "jsonl": CustomFormatReaders.read_json,
+    "excel": pd.read_excel,
+    "xls": pd.read_excel,
+    "xlsx": pd.read_excel,
+    "parquet": pd.read_parquet,
+    "libsvm": CustomFormatReaders.read_libsvm,
+    "hdf": pd.read_hdf,  # Todo: re.match(format, "hdf\d*") or format == "h5"
+    "hdf3": pd.read_hdf,
+    "hdf4": pd.read_hdf,
+    "h5": pd.read_hdf,
+    "avro": CustomFormatReaders.read_avro,
+    "avsc": CustomFormatReaders.read_avro,
+    "sql": CustomFormatReaders.read_sql,
+    "db": CustomFormatReaders.read_sql,
+    "log": CustomFormatReaders.read_log,
+    "clf": CustomFormatReaders.read_log,
+    "html": CustomFormatReaders.read_html,
+    "arff": CustomFormatReaders.read_arff,
+    "xml": CustomFormatReaders.read_xml,
+}
+
+
+def validate_kwargs(func: Callable, kwargs):
+    valid_params = inspect.signature(func).parameters
+    if "kwargs" in valid_params:
+        return kwargs
+    else:
+        return {k: v for k, v in kwargs.items() if k in valid_params}
+
+
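A sketch of the filtering, with a hypothetical reader that has a fixed signature:

def fixed_reader(path, sep=","):  # hypothetical: no **kwargs in the signature
    ...

print(validate_kwargs(fixed_reader, {"sep": "|", "bogus": 1}))
# -> {'sep': '|'}  (unknown keys are dropped; readers with **kwargs keep everything)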
+def get_format_reader(path: ElaboratedPath, **kwargs) -> Callable:
+    format_key = path.format
+    try:
+        reader_fn = reader_fns[format_key]
+    except (KeyError, NameError):
+        raise ValueError(
+            f"We were unable to load the specified dataset. We have interpreted the format "
+            f"as {format_key}; if this is not correct, call again and set the `format` parameter "
+            f"to the desired format. Read more here: https://docs.cloud.oracle.com/en-us/iaas/tools/ads"
+            f"-sdk/latest/user_guide/loading_data/loading_data.html#specify-data-types-in-load-dataset"
+        )
+
+    return reader_fn
+
+
+def load_dataset(path: ElaboratedPath, reader_fn: Callable, **kwargs) -> pd.DataFrame:
+    dfs = []
+    for filename in path.paths:
+        data = reader_fn(filename, **validate_kwargs(reader_fn, kwargs))
+        if not isinstance(data, pd.DataFrame):
+            fn_name = f"{reader_fn.__module__}.{reader_fn.__qualname__}"
+            raise ValueError(
+                f"{fn_name} is used to load the data. "
+                f"However, {fn_name} returned {type(data)} instead of a pandas DataFrame. "
+                f"Refer to the usage of {fn_name} to set the correct arguments."
+            )
+        dfs.append(data)
+    if len(dfs) == 0:
+        raise ValueError(
+            "We were unable to load the specified dataset. Read more here: "
+            "https://docs.cloud.oracle.com/en-us/iaas/tools/ads"
+            "-sdk/latest/user_guide/loading_data/loading_data.html#specify-data-types-in-load-dataset"
+        )
+
+    df = pd.concat(dfs)
+
+    if df is None:
+        raise ValueError(
+            "We were unable to load the specified dataset. Read more here: "
+            "https://docs.cloud.oracle.com/en-us/iaas/tools/ads"
+            "-sdk/latest/user_guide/loading_data/loading_data.html#specify-data-types-in-load-dataset"
+        )
+    if df.empty:
+        raise DatasetLoadException("Empty DataFrame, not producing an ADSDataset")
+    return df