snowpark-connect 0.20.2 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of snowpark-connect might be problematic.
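As context for the file listing below: a wheel is an ordinary ZIP archive, so a manifest like this one can be reproduced locally with nothing but the standard library. The sketch below is only illustrative; the wheel file name is assumed from the release shown on this page.

```python
from zipfile import ZipFile

# A .whl file is a ZIP archive; listing its members yields a file
# manifest similar to the one shown below.
# The file name here is assumed, not taken from this page verbatim.
with ZipFile("snowpark_connect-0.20.2-py3-none-any.whl") as whl:
    for name in sorted(whl.namelist()):
        print(name)
```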
- snowflake/snowpark_connect/__init__.py +23 -0
- snowflake/snowpark_connect/analyze_plan/__init__.py +3 -0
- snowflake/snowpark_connect/analyze_plan/map_tree_string.py +38 -0
- snowflake/snowpark_connect/column_name_handler.py +735 -0
- snowflake/snowpark_connect/config.py +576 -0
- snowflake/snowpark_connect/constants.py +47 -0
- snowflake/snowpark_connect/control_server.py +52 -0
- snowflake/snowpark_connect/dataframe_name_handler.py +54 -0
- snowflake/snowpark_connect/date_time_format_mapping.py +399 -0
- snowflake/snowpark_connect/empty_dataframe.py +18 -0
- snowflake/snowpark_connect/error/__init__.py +11 -0
- snowflake/snowpark_connect/error/error_mapping.py +6174 -0
- snowflake/snowpark_connect/error/error_utils.py +321 -0
- snowflake/snowpark_connect/error/exceptions.py +24 -0
- snowflake/snowpark_connect/execute_plan/__init__.py +3 -0
- snowflake/snowpark_connect/execute_plan/map_execution_command.py +204 -0
- snowflake/snowpark_connect/execute_plan/map_execution_root.py +173 -0
- snowflake/snowpark_connect/execute_plan/utils.py +183 -0
- snowflake/snowpark_connect/expression/__init__.py +3 -0
- snowflake/snowpark_connect/expression/literal.py +90 -0
- snowflake/snowpark_connect/expression/map_cast.py +343 -0
- snowflake/snowpark_connect/expression/map_expression.py +293 -0
- snowflake/snowpark_connect/expression/map_extension.py +104 -0
- snowflake/snowpark_connect/expression/map_sql_expression.py +633 -0
- snowflake/snowpark_connect/expression/map_udf.py +142 -0
- snowflake/snowpark_connect/expression/map_unresolved_attribute.py +241 -0
- snowflake/snowpark_connect/expression/map_unresolved_extract_value.py +85 -0
- snowflake/snowpark_connect/expression/map_unresolved_function.py +9450 -0
- snowflake/snowpark_connect/expression/map_unresolved_star.py +218 -0
- snowflake/snowpark_connect/expression/map_update_fields.py +164 -0
- snowflake/snowpark_connect/expression/map_window_function.py +258 -0
- snowflake/snowpark_connect/expression/typer.py +125 -0
- snowflake/snowpark_connect/includes/__init__.py +0 -0
- snowflake/snowpark_connect/includes/jars/antlr4-runtime-4.9.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-cli-1.5.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-codec-1.16.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-collections-3.2.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-collections4-4.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-compiler-3.1.9.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-compress-1.26.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-crypto-1.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-dbcp-1.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-io-2.16.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-lang-2.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-lang3-3.12.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-logging-1.1.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-math3-3.6.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-pool-1.5.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-text-1.10.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/hadoop-client-api-3.3.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-annotations-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-core-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-core-asl-1.9.13.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-databind-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-dataformat-yaml-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-datatype-jsr310-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-mapper-asl-1.9.13.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-module-scala_2.12-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-ast_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-core_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-jackson_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-scalap_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/kryo-shaded-4.0.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-1.2-api-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-api-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-core-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-slf4j2-impl-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/paranamer-2.8.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-collection-compat_2.12-2.7.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-compiler-2.12.18.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-library-2.12.18.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-parser-combinators_2.12-2.3.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-reflect-2.12.18.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-xml_2.12-2.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/slf4j-api-2.0.7.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-catalyst_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-common-utils_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-core_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-graphx_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-hive-thriftserver_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-hive_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-kubernetes_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-kvstore_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-launcher_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-mesos_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-mllib-local_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-mllib_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-network-common_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-network-shuffle_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-repl_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sketch_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sql-api_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sql_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-streaming_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-tags_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-unsafe_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-yarn_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/python/__init__.py +21 -0
- snowflake/snowpark_connect/includes/python/pyspark/__init__.py +173 -0
- snowflake/snowpark_connect/includes/python/pyspark/_globals.py +71 -0
- snowflake/snowpark_connect/includes/python/pyspark/_typing.pyi +43 -0
- snowflake/snowpark_connect/includes/python/pyspark/accumulators.py +341 -0
- snowflake/snowpark_connect/includes/python/pyspark/broadcast.py +383 -0
- snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/__init__.py +8 -0
- snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/cloudpickle.py +948 -0
- snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/cloudpickle_fast.py +844 -0
- snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/compat.py +18 -0
- snowflake/snowpark_connect/includes/python/pyspark/conf.py +276 -0
- snowflake/snowpark_connect/includes/python/pyspark/context.py +2601 -0
- snowflake/snowpark_connect/includes/python/pyspark/daemon.py +218 -0
- snowflake/snowpark_connect/includes/python/pyspark/errors/__init__.py +70 -0
- snowflake/snowpark_connect/includes/python/pyspark/errors/error_classes.py +889 -0
- snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/__init__.py +16 -0
- snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/base.py +228 -0
- snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/captured.py +307 -0
- snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/connect.py +190 -0
- snowflake/snowpark_connect/includes/python/pyspark/errors/tests/__init__.py +16 -0
- snowflake/snowpark_connect/includes/python/pyspark/errors/tests/test_errors.py +60 -0
- snowflake/snowpark_connect/includes/python/pyspark/errors/utils.py +116 -0
- snowflake/snowpark_connect/includes/python/pyspark/files.py +165 -0
- snowflake/snowpark_connect/includes/python/pyspark/find_spark_home.py +95 -0
- snowflake/snowpark_connect/includes/python/pyspark/install.py +203 -0
- snowflake/snowpark_connect/includes/python/pyspark/instrumentation_utils.py +190 -0
- snowflake/snowpark_connect/includes/python/pyspark/java_gateway.py +248 -0
- snowflake/snowpark_connect/includes/python/pyspark/join.py +118 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/__init__.py +71 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/_typing.pyi +84 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/base.py +414 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/classification.py +4332 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/clustering.py +2188 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/common.py +146 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/connect/__init__.py +44 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/connect/base.py +346 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/connect/classification.py +382 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/connect/evaluation.py +291 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/connect/feature.py +258 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/connect/functions.py +77 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/connect/io_utils.py +335 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/connect/pipeline.py +262 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/connect/summarizer.py +120 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/connect/tuning.py +579 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/connect/util.py +173 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/deepspeed/__init__.py +16 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/deepspeed/deepspeed_distributor.py +165 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/deepspeed/tests/test_deepspeed_distributor.py +306 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/dl_util.py +150 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/evaluation.py +1166 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/feature.py +7474 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/fpm.py +543 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/functions.py +842 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/image.py +271 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/linalg/__init__.py +1382 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/model_cache.py +55 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/param/__init__.py +602 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/param/_shared_params_code_gen.py +368 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/param/shared.py +878 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/pipeline.py +451 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/recommendation.py +748 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/regression.py +3335 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/stat.py +523 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/tests/__init__.py +16 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_classification.py +53 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_evaluation.py +50 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_feature.py +43 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_function.py +114 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_pipeline.py +47 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_summarizer.py +43 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_tuning.py +46 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_classification.py +238 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_evaluation.py +194 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_feature.py +156 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_pipeline.py +184 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_summarizer.py +78 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_tuning.py +292 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_parity_torch_data_loader.py +50 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_parity_torch_distributor.py +152 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_algorithms.py +456 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_base.py +96 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_dl_util.py +186 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_evaluation.py +77 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_feature.py +401 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_functions.py +528 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_image.py +82 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_linalg.py +409 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_model_cache.py +55 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_param.py +441 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_persistence.py +546 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_pipeline.py +71 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_stat.py +52 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_training_summary.py +494 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_util.py +85 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_wrapper.py +138 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/__init__.py +16 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_basic.py +151 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_nested.py +97 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_pipeline.py +143 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tuning.py +551 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_basic.py +137 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_nested.py +96 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_pipeline.py +142 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/torch/__init__.py +16 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/torch/data.py +100 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/torch/distributor.py +1133 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/torch/log_communication.py +198 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/__init__.py +16 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_data_loader.py +137 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_distributor.py +561 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_log_communication.py +172 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/torch/torch_run_process_wrapper.py +83 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/tree.py +434 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/tuning.py +1741 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/util.py +749 -0
- snowflake/snowpark_connect/includes/python/pyspark/ml/wrapper.py +465 -0
- snowflake/snowpark_connect/includes/python/pyspark/mllib/__init__.py +44 -0
- snowflake/snowpark_connect/includes/python/pyspark/mllib/_typing.pyi +33 -0
- snowflake/snowpark_connect/includes/python/pyspark/mllib/classification.py +989 -0
- snowflake/snowpark_connect/includes/python/pyspark/mllib/clustering.py +1318 -0
- snowflake/snowpark_connect/includes/python/pyspark/mllib/common.py +174 -0
- snowflake/snowpark_connect/includes/python/pyspark/mllib/evaluation.py +691 -0
- snowflake/snowpark_connect/includes/python/pyspark/mllib/feature.py +1085 -0
- snowflake/snowpark_connect/includes/python/pyspark/mllib/fpm.py +233 -0
- snowflake/snowpark_connect/includes/python/pyspark/mllib/linalg/__init__.py +1653 -0
- snowflake/snowpark_connect/includes/python/pyspark/mllib/linalg/distributed.py +1662 -0
- snowflake/snowpark_connect/includes/python/pyspark/mllib/random.py +698 -0
- snowflake/snowpark_connect/includes/python/pyspark/mllib/recommendation.py +389 -0
- snowflake/snowpark_connect/includes/python/pyspark/mllib/regression.py +1067 -0
- snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/KernelDensity.py +59 -0
- snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/__init__.py +34 -0
- snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/_statistics.py +409 -0
- snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/distribution.py +39 -0
- snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/test.py +86 -0
- snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/__init__.py +16 -0
- snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_algorithms.py +353 -0
- snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_feature.py +192 -0
- snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_linalg.py +680 -0
- snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_stat.py +206 -0
- snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_streaming_algorithms.py +471 -0
- snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_util.py +108 -0
- snowflake/snowpark_connect/includes/python/pyspark/mllib/tree.py +888 -0
- snowflake/snowpark_connect/includes/python/pyspark/mllib/util.py +659 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/__init__.py +165 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/_typing.py +52 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/accessors.py +989 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/base.py +1804 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/categorical.py +822 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/config.py +539 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/correlation.py +262 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/__init__.py +16 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/base.py +519 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/binary_ops.py +98 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/boolean_ops.py +426 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/categorical_ops.py +141 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/complex_ops.py +145 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/date_ops.py +127 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/datetime_ops.py +171 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/null_ops.py +83 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/num_ops.py +588 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/string_ops.py +154 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/timedelta_ops.py +101 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/udt_ops.py +29 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/datetimes.py +891 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/exceptions.py +150 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/extensions.py +388 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/frame.py +13738 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/generic.py +3560 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/groupby.py +4448 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/__init__.py +21 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/base.py +2783 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/category.py +773 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/datetimes.py +843 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/multi.py +1323 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/numeric.py +210 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/timedelta.py +197 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/indexing.py +1862 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/internal.py +1680 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/__init__.py +48 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/common.py +76 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/frame.py +63 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/general_functions.py +43 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/groupby.py +93 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/indexes.py +184 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/resample.py +101 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/scalars.py +29 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/series.py +69 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/window.py +168 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/mlflow.py +238 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/namespace.py +3807 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/numpy_compat.py +260 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/__init__.py +17 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/core.py +1213 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/matplotlib.py +928 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/plotly.py +261 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/resample.py +816 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/series.py +7440 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/sql_formatter.py +308 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/sql_processor.py +394 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/strings.py +2371 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/supported_api_gen.py +378 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/__init__.py +16 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/__init__.py +16 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_any_all.py +177 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_apply_func.py +575 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_binary_ops.py +235 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_combine.py +653 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_compute.py +463 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_corrwith.py +86 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_cov.py +151 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_cumulative.py +139 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_describe.py +458 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_eval.py +86 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_melt.py +202 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_missing_data.py +520 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_pivot.py +361 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/__init__.py +16 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/__init__.py +16 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_any_all.py +40 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_apply_func.py +42 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_binary_ops.py +40 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_combine.py +37 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_compute.py +60 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_corrwith.py +40 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_cov.py +40 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_cumulative.py +90 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_describe.py +40 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_eval.py +40 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_melt.py +40 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_missing_data.py +42 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_pivot.py +37 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/__init__.py +16 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_base.py +36 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_binary_ops.py +42 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_boolean_ops.py +47 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_categorical_ops.py +55 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_complex_ops.py +40 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_date_ops.py +47 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_datetime_ops.py +47 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_null_ops.py +42 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_arithmetic.py +43 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_ops.py +47 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_reverse.py +43 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_string_ops.py +47 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_timedelta_ops.py +47 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_udt_ops.py +40 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/testing_utils.py +226 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/__init__.py +16 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_align.py +39 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_basic_slow.py +55 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_cov_corrwith.py +39 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_dot_frame.py +39 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_dot_series.py +39 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_index.py +39 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_series.py +39 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_setitem_frame.py +43 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_setitem_series.py +43 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/__init__.py +16 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_attrs.py +40 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_constructor.py +39 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_conversion.py +42 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_reindexing.py +42 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_reshaping.py +37 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_spark.py +40 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_take.py +42 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_time_series.py +48 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_truncate.py +40 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/__init__.py +16 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_aggregate.py +40 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_apply_func.py +41 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_cumulative.py +67 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_describe.py +40 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_groupby.py +55 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_head_tail.py +40 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_index.py +38 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_missing_data.py +55 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py +39 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_stat.py +38 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/__init__.py +16 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_align.py +40 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_base.py +50 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_category.py +73 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_datetime.py +39 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_indexing.py +40 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_reindex.py +40 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_rename.py +40 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_reset_index.py +48 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_timedelta.py +39 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/io/__init__.py +16 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/io/test_parity_io.py +40 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/__init__.py +16 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot.py +45 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot_matplotlib.py +45 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot_plotly.py +49 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot.py +37 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot_matplotlib.py +53 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot_plotly.py +45 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/__init__.py +16 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_all_any.py +38 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_arg_ops.py +37 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_as_of.py +37 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_as_type.py +38 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_compute.py +37 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_conversion.py +40 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_cumulative.py +40 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_index.py +38 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_missing_data.py +40 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_series.py +37 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_sort.py +38 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_stat.py +38 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_categorical.py +66 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_config.py +37 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_csv.py +37 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_dataframe_conversion.py +42 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_dataframe_spark_io.py +39 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_default_index.py +49 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ewm.py +37 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_expanding.py +39 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_extension.py +49 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_frame_spark.py +53 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_generic_functions.py +43 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_indexing.py +49 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_indexops_spark.py +39 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_internal.py +41 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_namespace.py +39 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_numpy_compat.py +60 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames.py +48 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby.py +39 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_expanding.py +44 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_rolling.py +84 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_repr.py +37 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_resample.py +45 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_reshape.py +39 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_rolling.py +39 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_scalars.py +37 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_conversion.py +39 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_datetime.py +39 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_string.py +39 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_spark_functions.py +39 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_sql.py +43 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_stats.py +37 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_typedef.py +36 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_utils.py +37 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_window.py +39 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/__init__.py +16 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_base.py +107 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_binary_ops.py +224 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py +825 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_categorical_ops.py +562 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_complex_ops.py +368 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_date_ops.py +257 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_datetime_ops.py +260 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_null_ops.py +178 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_arithmetic.py +184 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py +497 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_reverse.py +140 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_string_ops.py +354 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_timedelta_ops.py +219 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_udt_ops.py +192 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/testing_utils.py +228 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/__init__.py +16 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_align.py +118 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_basic_slow.py +198 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_cov_corrwith.py +181 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_dot_frame.py +103 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_dot_series.py +141 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_index.py +109 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_series.py +136 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_setitem_frame.py +125 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_setitem_series.py +217 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/__init__.py +16 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_attrs.py +384 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_constructor.py +598 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_conversion.py +73 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_reindexing.py +869 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_reshaping.py +487 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_spark.py +309 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_take.py +156 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_time_series.py +149 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_truncate.py +163 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/__init__.py +16 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_aggregate.py +311 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_apply_func.py +524 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_cumulative.py +419 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_describe.py +144 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_groupby.py +979 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_head_tail.py +234 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_index.py +206 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_missing_data.py +421 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_split_apply.py +187 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_stat.py +397 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/__init__.py +16 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_align.py +100 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_base.py +2743 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_category.py +484 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_datetime.py +276 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_indexing.py +432 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_reindex.py +310 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_rename.py +257 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_reset_index.py +160 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_timedelta.py +128 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/io/__init__.py +16 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/io/test_io.py +137 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/__init__.py +16 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot.py +170 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot_matplotlib.py +547 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot_plotly.py +285 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot.py +106 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot_matplotlib.py +409 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot_plotly.py +247 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/__init__.py +16 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_all_any.py +105 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_arg_ops.py +197 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_as_of.py +137 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_as_type.py +227 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_compute.py +634 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_conversion.py +88 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_cumulative.py +139 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_index.py +475 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_missing_data.py +265 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_series.py +818 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_sort.py +162 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_stat.py +780 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_categorical.py +741 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_config.py +160 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_csv.py +453 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_dataframe_conversion.py +281 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_dataframe_spark_io.py +487 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_default_index.py +109 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ewm.py +434 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_expanding.py +253 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_extension.py +152 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_frame_spark.py +162 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_generic_functions.py +234 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_indexing.py +1339 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_indexops_spark.py +82 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_internal.py +124 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_namespace.py +638 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_numpy_compat.py +200 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames.py +1355 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py +655 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_expanding.py +113 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_rolling.py +118 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_repr.py +192 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_resample.py +346 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_reshape.py +495 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_rolling.py +263 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_scalars.py +59 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_conversion.py +85 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_datetime.py +364 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_string.py +362 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_spark_functions.py +46 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_sql.py +123 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_stats.py +581 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_typedef.py +447 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_utils.py +301 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_window.py +465 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/typedef/__init__.py +18 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/typedef/typehints.py +874 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/usage_logging/__init__.py +143 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/usage_logging/usage_logger.py +132 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/utils.py +1063 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/window.py +2702 -0
- snowflake/snowpark_connect/includes/python/pyspark/profiler.py +489 -0
- snowflake/snowpark_connect/includes/python/pyspark/py.typed +1 -0
- snowflake/snowpark_connect/includes/python/pyspark/python/pyspark/shell.py +123 -0
- snowflake/snowpark_connect/includes/python/pyspark/rdd.py +5518 -0
- snowflake/snowpark_connect/includes/python/pyspark/rddsampler.py +115 -0
- snowflake/snowpark_connect/includes/python/pyspark/resource/__init__.py +38 -0
- snowflake/snowpark_connect/includes/python/pyspark/resource/information.py +69 -0
- snowflake/snowpark_connect/includes/python/pyspark/resource/profile.py +317 -0
- snowflake/snowpark_connect/includes/python/pyspark/resource/requests.py +539 -0
- snowflake/snowpark_connect/includes/python/pyspark/resource/tests/__init__.py +16 -0
- snowflake/snowpark_connect/includes/python/pyspark/resource/tests/test_resources.py +83 -0
- snowflake/snowpark_connect/includes/python/pyspark/resultiterable.py +45 -0
- snowflake/snowpark_connect/includes/python/pyspark/serializers.py +681 -0
- snowflake/snowpark_connect/includes/python/pyspark/shell.py +123 -0
- snowflake/snowpark_connect/includes/python/pyspark/shuffle.py +854 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/__init__.py +75 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/_typing.pyi +80 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/avro/__init__.py +18 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/avro/functions.py +188 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/catalog.py +1270 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/column.py +1431 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/conf.py +99 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/__init__.py +18 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/_typing.py +90 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/avro/__init__.py +18 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/avro/functions.py +107 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/catalog.py +356 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/__init__.py +22 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/artifact.py +412 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/core.py +1689 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/reattach.py +340 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/column.py +514 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/conf.py +128 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/conversion.py +490 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/dataframe.py +2172 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/expressions.py +1056 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/functions.py +3937 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/group.py +418 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/plan.py +2289 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/__init__.py +25 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/base_pb2.py +203 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/base_pb2.pyi +2718 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/base_pb2_grpc.py +423 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/catalog_pb2.py +109 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/catalog_pb2.pyi +1130 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/commands_pb2.py +141 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/commands_pb2.pyi +1766 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/common_pb2.py +47 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/common_pb2.pyi +123 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/example_plugins_pb2.py +53 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/example_plugins_pb2.pyi +112 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/expressions_pb2.py +107 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/expressions_pb2.pyi +1507 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/relations_pb2.py +195 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/relations_pb2.pyi +3613 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/types_pb2.py +95 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/types_pb2.pyi +980 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/protobuf/__init__.py +18 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/protobuf/functions.py +166 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/readwriter.py +861 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/session.py +952 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/__init__.py +22 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/query.py +295 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/readwriter.py +618 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/__init__.py +18 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/foreach_batch_worker.py +87 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/listener_worker.py +100 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/types.py +301 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/udf.py +296 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/udtf.py +200 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/utils.py +58 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/window.py +266 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/context.py +818 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/dataframe.py +5973 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/functions.py +15889 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/group.py +547 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/observation.py +152 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/__init__.py +21 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/__init__.pyi +344 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/protocols/__init__.pyi +17 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/protocols/frame.pyi +20 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/protocols/series.pyi +20 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/conversion.py +671 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/functions.py +480 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/functions.pyi +132 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/group_ops.py +523 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/map_ops.py +216 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/serializers.py +1019 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/typehints.py +172 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/types.py +972 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/utils.py +86 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/protobuf/__init__.py +18 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/protobuf/functions.py +334 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/readwriter.py +2159 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/session.py +2088 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/sql_formatter.py +84 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/__init__.py +21 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/listener.py +1050 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/query.py +746 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/readwriter.py +1652 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/state.py +288 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/__init__.py +16 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/__init__.py +16 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/__init__.py +16 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/test_artifact.py +420 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/test_client.py +358 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/__init__.py +16 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_foreach.py +36 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_foreach_batch.py +44 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_listener.py +116 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_streaming.py +35 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_basic.py +3612 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_column.py +1042 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_function.py +2381 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_plan.py +1060 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow.py +163 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow_map.py +38 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow_python_udf.py +48 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_catalog.py +36 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_column.py +55 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_conf.py +36 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_dataframe.py +96 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_datasources.py +44 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_errors.py +36 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_functions.py +59 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_group.py +36 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_cogrouped_map.py +59 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_grouped_map.py +74 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_grouped_map_with_state.py +62 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_map.py +58 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf.py +70 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_grouped_agg.py +50 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_scalar.py +68 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_window.py +40 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_readwriter.py +46 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_serde.py +44 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_types.py +100 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_udf.py +100 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_udtf.py +163 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_session.py +181 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_utils.py +42 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/__init__.py +16 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_cogrouped_map.py +623 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_grouped_map.py +869 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_grouped_map_with_state.py +342 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_map.py +436 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf.py +363 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_grouped_agg.py +592 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_scalar.py +1503 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_typehints.py +392 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_typehints_with_future_annotations.py +375 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_window.py +411 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/__init__.py +16 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming.py +401 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_foreach.py +295 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_foreach_batch.py +106 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_listener.py +558 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow.py +1346 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow_map.py +182 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow_python_udf.py +202 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_catalog.py +503 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_column.py +225 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_conf.py +83 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_context.py +201 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_dataframe.py +1931 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_datasources.py +256 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_errors.py +69 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_functions.py +1349 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_group.py +53 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_pandas_sqlmetrics.py +68 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_readwriter.py +283 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_serde.py +155 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_session.py +412 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_types.py +1581 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udf.py +961 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udf_profiler.py +165 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udtf.py +1456 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_utils.py +1686 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/types.py +2558 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/udf.py +714 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/udtf.py +325 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/utils.py +339 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/window.py +492 -0
- snowflake/snowpark_connect/includes/python/pyspark/statcounter.py +165 -0
- snowflake/snowpark_connect/includes/python/pyspark/status.py +112 -0
- snowflake/snowpark_connect/includes/python/pyspark/storagelevel.py +97 -0
- snowflake/snowpark_connect/includes/python/pyspark/streaming/__init__.py +22 -0
- snowflake/snowpark_connect/includes/python/pyspark/streaming/context.py +471 -0
- snowflake/snowpark_connect/includes/python/pyspark/streaming/dstream.py +933 -0
- snowflake/snowpark_connect/includes/python/pyspark/streaming/kinesis.py +205 -0
- snowflake/snowpark_connect/includes/python/pyspark/streaming/listener.py +83 -0
- snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/__init__.py +16 -0
- snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_context.py +184 -0
- snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_dstream.py +706 -0
- snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_kinesis.py +118 -0
- snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_listener.py +160 -0
- snowflake/snowpark_connect/includes/python/pyspark/streaming/util.py +168 -0
- snowflake/snowpark_connect/includes/python/pyspark/taskcontext.py +502 -0
- snowflake/snowpark_connect/includes/python/pyspark/testing/__init__.py +21 -0
- snowflake/snowpark_connect/includes/python/pyspark/testing/connectutils.py +199 -0
- snowflake/snowpark_connect/includes/python/pyspark/testing/mllibutils.py +30 -0
- snowflake/snowpark_connect/includes/python/pyspark/testing/mlutils.py +275 -0
- snowflake/snowpark_connect/includes/python/pyspark/testing/objects.py +121 -0
- snowflake/snowpark_connect/includes/python/pyspark/testing/pandasutils.py +714 -0
- snowflake/snowpark_connect/includes/python/pyspark/testing/sqlutils.py +168 -0
- snowflake/snowpark_connect/includes/python/pyspark/testing/streamingutils.py +178 -0
- snowflake/snowpark_connect/includes/python/pyspark/testing/utils.py +636 -0
- snowflake/snowpark_connect/includes/python/pyspark/tests/__init__.py +16 -0
- snowflake/snowpark_connect/includes/python/pyspark/tests/test_appsubmit.py +306 -0
- snowflake/snowpark_connect/includes/python/pyspark/tests/test_broadcast.py +196 -0
- snowflake/snowpark_connect/includes/python/pyspark/tests/test_conf.py +44 -0
- snowflake/snowpark_connect/includes/python/pyspark/tests/test_context.py +346 -0
- snowflake/snowpark_connect/includes/python/pyspark/tests/test_daemon.py +89 -0
- snowflake/snowpark_connect/includes/python/pyspark/tests/test_install_spark.py +124 -0
- snowflake/snowpark_connect/includes/python/pyspark/tests/test_join.py +69 -0
- snowflake/snowpark_connect/includes/python/pyspark/tests/test_memory_profiler.py +167 -0
- snowflake/snowpark_connect/includes/python/pyspark/tests/test_pin_thread.py +194 -0
- snowflake/snowpark_connect/includes/python/pyspark/tests/test_profiler.py +168 -0
- snowflake/snowpark_connect/includes/python/pyspark/tests/test_rdd.py +939 -0
- snowflake/snowpark_connect/includes/python/pyspark/tests/test_rddbarrier.py +52 -0
- snowflake/snowpark_connect/includes/python/pyspark/tests/test_rddsampler.py +66 -0
- snowflake/snowpark_connect/includes/python/pyspark/tests/test_readwrite.py +368 -0
- snowflake/snowpark_connect/includes/python/pyspark/tests/test_serializers.py +257 -0
- snowflake/snowpark_connect/includes/python/pyspark/tests/test_shuffle.py +267 -0
- snowflake/snowpark_connect/includes/python/pyspark/tests/test_stage_sched.py +153 -0
- snowflake/snowpark_connect/includes/python/pyspark/tests/test_statcounter.py +130 -0
- snowflake/snowpark_connect/includes/python/pyspark/tests/test_taskcontext.py +350 -0
- snowflake/snowpark_connect/includes/python/pyspark/tests/test_util.py +97 -0
- snowflake/snowpark_connect/includes/python/pyspark/tests/test_worker.py +271 -0
- snowflake/snowpark_connect/includes/python/pyspark/traceback_utils.py +81 -0
- snowflake/snowpark_connect/includes/python/pyspark/util.py +416 -0
- snowflake/snowpark_connect/includes/python/pyspark/version.py +19 -0
- snowflake/snowpark_connect/includes/python/pyspark/worker.py +1307 -0
- snowflake/snowpark_connect/includes/python/pyspark/worker_util.py +46 -0
- snowflake/snowpark_connect/proto/__init__.py +10 -0
- snowflake/snowpark_connect/proto/control_pb2.py +35 -0
- snowflake/snowpark_connect/proto/control_pb2.pyi +38 -0
- snowflake/snowpark_connect/proto/control_pb2_grpc.py +183 -0
- snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.py +35 -0
- snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.pyi +53 -0
- snowflake/snowpark_connect/proto/snowflake_rdd_pb2.pyi +39 -0
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.py +47 -0
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.pyi +111 -0
- snowflake/snowpark_connect/relation/__init__.py +3 -0
- snowflake/snowpark_connect/relation/catalogs/__init__.py +12 -0
- snowflake/snowpark_connect/relation/catalogs/abstract_spark_catalog.py +287 -0
- snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py +467 -0
- snowflake/snowpark_connect/relation/catalogs/utils.py +51 -0
- snowflake/snowpark_connect/relation/io_utils.py +76 -0
- snowflake/snowpark_connect/relation/map_aggregate.py +322 -0
- snowflake/snowpark_connect/relation/map_catalog.py +151 -0
- snowflake/snowpark_connect/relation/map_column_ops.py +1068 -0
- snowflake/snowpark_connect/relation/map_crosstab.py +48 -0
- snowflake/snowpark_connect/relation/map_extension.py +412 -0
- snowflake/snowpark_connect/relation/map_join.py +341 -0
- snowflake/snowpark_connect/relation/map_local_relation.py +326 -0
- snowflake/snowpark_connect/relation/map_map_partitions.py +146 -0
- snowflake/snowpark_connect/relation/map_relation.py +253 -0
- snowflake/snowpark_connect/relation/map_row_ops.py +716 -0
- snowflake/snowpark_connect/relation/map_sample_by.py +35 -0
- snowflake/snowpark_connect/relation/map_show_string.py +50 -0
- snowflake/snowpark_connect/relation/map_sql.py +1874 -0
- snowflake/snowpark_connect/relation/map_stats.py +324 -0
- snowflake/snowpark_connect/relation/map_subquery_alias.py +32 -0
- snowflake/snowpark_connect/relation/map_udtf.py +288 -0
- snowflake/snowpark_connect/relation/read/__init__.py +7 -0
- snowflake/snowpark_connect/relation/read/jdbc_read_dbapi.py +668 -0
- snowflake/snowpark_connect/relation/read/map_read.py +367 -0
- snowflake/snowpark_connect/relation/read/map_read_csv.py +142 -0
- snowflake/snowpark_connect/relation/read/map_read_jdbc.py +108 -0
- snowflake/snowpark_connect/relation/read/map_read_json.py +344 -0
- snowflake/snowpark_connect/relation/read/map_read_parquet.py +194 -0
- snowflake/snowpark_connect/relation/read/map_read_socket.py +59 -0
- snowflake/snowpark_connect/relation/read/map_read_table.py +109 -0
- snowflake/snowpark_connect/relation/read/map_read_text.py +106 -0
- snowflake/snowpark_connect/relation/read/reader_config.py +399 -0
- snowflake/snowpark_connect/relation/read/utils.py +155 -0
- snowflake/snowpark_connect/relation/stage_locator.py +161 -0
- snowflake/snowpark_connect/relation/utils.py +219 -0
- snowflake/snowpark_connect/relation/write/__init__.py +3 -0
- snowflake/snowpark_connect/relation/write/jdbc_write_dbapi.py +339 -0
- snowflake/snowpark_connect/relation/write/map_write.py +436 -0
- snowflake/snowpark_connect/relation/write/map_write_jdbc.py +48 -0
- snowflake/snowpark_connect/resources/java_udfs-1.0-SNAPSHOT.jar +0 -0
- snowflake/snowpark_connect/resources_initializer.py +75 -0
- snowflake/snowpark_connect/server.py +1136 -0
- snowflake/snowpark_connect/start_server.py +32 -0
- snowflake/snowpark_connect/tcm.py +8 -0
- snowflake/snowpark_connect/type_mapping.py +1003 -0
- snowflake/snowpark_connect/typed_column.py +94 -0
- snowflake/snowpark_connect/utils/__init__.py +3 -0
- snowflake/snowpark_connect/utils/artifacts.py +48 -0
- snowflake/snowpark_connect/utils/attribute_handling.py +72 -0
- snowflake/snowpark_connect/utils/cache.py +84 -0
- snowflake/snowpark_connect/utils/concurrent.py +124 -0
- snowflake/snowpark_connect/utils/context.py +390 -0
- snowflake/snowpark_connect/utils/describe_query_cache.py +231 -0
- snowflake/snowpark_connect/utils/interrupt.py +85 -0
- snowflake/snowpark_connect/utils/io_utils.py +35 -0
- snowflake/snowpark_connect/utils/pandas_udtf_utils.py +117 -0
- snowflake/snowpark_connect/utils/profiling.py +47 -0
- snowflake/snowpark_connect/utils/session.py +180 -0
- snowflake/snowpark_connect/utils/snowpark_connect_logging.py +38 -0
- snowflake/snowpark_connect/utils/telemetry.py +513 -0
- snowflake/snowpark_connect/utils/udf_cache.py +392 -0
- snowflake/snowpark_connect/utils/udf_helper.py +328 -0
- snowflake/snowpark_connect/utils/udf_utils.py +310 -0
- snowflake/snowpark_connect/utils/udtf_helper.py +420 -0
- snowflake/snowpark_connect/utils/udtf_utils.py +799 -0
- snowflake/snowpark_connect/utils/xxhash64.py +247 -0
- snowflake/snowpark_connect/version.py +6 -0
- snowpark_connect-0.20.2.data/scripts/snowpark-connect +71 -0
- snowpark_connect-0.20.2.data/scripts/snowpark-session +11 -0
- snowpark_connect-0.20.2.data/scripts/snowpark-submit +354 -0
- snowpark_connect-0.20.2.dist-info/METADATA +37 -0
- snowpark_connect-0.20.2.dist-info/RECORD +879 -0
- snowpark_connect-0.20.2.dist-info/WHEEL +5 -0
- snowpark_connect-0.20.2.dist-info/licenses/LICENSE.txt +202 -0
- snowpark_connect-0.20.2.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,3560 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Licensed to the Apache Software Foundation (ASF) under one or more
|
|
3
|
+
# contributor license agreements. See the NOTICE file distributed with
|
|
4
|
+
# this work for additional information regarding copyright ownership.
|
|
5
|
+
# The ASF licenses this file to You under the Apache License, Version 2.0
|
|
6
|
+
# (the "License"); you may not use this file except in compliance with
|
|
7
|
+
# the License. You may obtain a copy of the License at
|
|
8
|
+
#
|
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
10
|
+
#
|
|
11
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
12
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
13
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
14
|
+
# See the License for the specific language governing permissions and
|
|
15
|
+
# limitations under the License.
|
|
16
|
+
#
|
|
17
|
+
|
|
18
|
+
"""
|
|
19
|
+
A base class of DataFrame/Column to behave like pandas DataFrame/Series.
|
|
20
|
+
"""
|
|
21
|
+
from abc import ABCMeta, abstractmethod
|
|
22
|
+
from collections import Counter
|
|
23
|
+
from functools import reduce
|
|
24
|
+
from typing import (
|
|
25
|
+
Any,
|
|
26
|
+
Callable,
|
|
27
|
+
Dict,
|
|
28
|
+
Iterable,
|
|
29
|
+
IO,
|
|
30
|
+
List,
|
|
31
|
+
Optional,
|
|
32
|
+
NoReturn,
|
|
33
|
+
Tuple,
|
|
34
|
+
Union,
|
|
35
|
+
TYPE_CHECKING,
|
|
36
|
+
cast,
|
|
37
|
+
)
|
|
38
|
+
import warnings
|
|
39
|
+
|
|
40
|
+
import numpy as np
|
|
41
|
+
import pandas as pd
|
|
42
|
+
from pandas.api.types import is_list_like # type: ignore[attr-defined]
|
|
43
|
+
|
|
44
|
+
from pyspark.sql import Column, functions as F
|
|
45
|
+
from pyspark.sql.types import (
|
|
46
|
+
BooleanType,
|
|
47
|
+
DoubleType,
|
|
48
|
+
LongType,
|
|
49
|
+
NumericType,
|
|
50
|
+
)
|
|
51
|
+
|
|
52
|
+
from pyspark import pandas as ps # For running doctests and reference resolution in PyCharm.
|
|
53
|
+
from pyspark.pandas._typing import (
|
|
54
|
+
Axis,
|
|
55
|
+
DataFrameOrSeries,
|
|
56
|
+
Dtype,
|
|
57
|
+
FrameLike,
|
|
58
|
+
Label,
|
|
59
|
+
Name,
|
|
60
|
+
Scalar,
|
|
61
|
+
)
|
|
62
|
+
from pyspark.pandas.indexing import AtIndexer, iAtIndexer, iLocIndexer, LocIndexer
|
|
63
|
+
from pyspark.pandas.internal import InternalFrame
|
|
64
|
+
from pyspark.pandas.spark import functions as SF
|
|
65
|
+
from pyspark.pandas.typedef import spark_type_to_pandas_dtype
|
|
66
|
+
from pyspark.pandas.utils import (
|
|
67
|
+
is_name_like_tuple,
|
|
68
|
+
is_name_like_value,
|
|
69
|
+
name_like_string,
|
|
70
|
+
scol_for,
|
|
71
|
+
sql_conf,
|
|
72
|
+
validate_arguments_and_invoke_function,
|
|
73
|
+
validate_axis,
|
|
74
|
+
validate_mode,
|
|
75
|
+
SPARK_CONF_ARROW_ENABLED,
|
|
76
|
+
log_advice,
|
|
77
|
+
)
|
|
78
|
+
|
|
79
|
+
if TYPE_CHECKING:
|
|
80
|
+
from pyspark.pandas.frame import DataFrame
|
|
81
|
+
from pyspark.pandas.indexes.base import Index
|
|
82
|
+
from pyspark.pandas.groupby import GroupBy
|
|
83
|
+
from pyspark.pandas.series import Series
|
|
84
|
+
from pyspark.pandas.window import Rolling, Expanding, ExponentialMoving
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
bool_type = bool
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
class Frame(object, metaclass=ABCMeta):
|
|
91
|
+
"""
|
|
92
|
+
The base class for both DataFrame and Series.
|
|
93
|
+
"""
|
|
94
|
+
|
|
95
|
+
@abstractmethod
|
|
96
|
+
def __getitem__(self, key: Any) -> Any:
|
|
97
|
+
pass
|
|
98
|
+
|
|
99
|
+
@property
|
|
100
|
+
@abstractmethod
|
|
101
|
+
def _internal(self) -> InternalFrame:
|
|
102
|
+
pass
|
|
103
|
+
|
|
104
|
+
@abstractmethod
|
|
105
|
+
def _apply_series_op(
|
|
106
|
+
self: FrameLike,
|
|
107
|
+
op: Callable[["Series"], Union["Series", Column]],
|
|
108
|
+
should_resolve: bool = False,
|
|
109
|
+
) -> FrameLike:
|
|
110
|
+
pass
|
|
111
|
+
|
|
112
|
+
@abstractmethod
|
|
113
|
+
def _reduce_for_stat_function(
|
|
114
|
+
self,
|
|
115
|
+
sfun: Callable[["Series"], Column],
|
|
116
|
+
name: str,
|
|
117
|
+
axis: Optional[Axis] = None,
|
|
118
|
+
numeric_only: bool = True,
|
|
119
|
+
skipna: bool = True,
|
|
120
|
+
**kwargs: Any,
|
|
121
|
+
) -> Union["Series", Scalar]:
|
|
122
|
+
pass
|
|
123
|
+
|
|
124
|
+
@property
|
|
125
|
+
@abstractmethod
|
|
126
|
+
def dtypes(self) -> Union[pd.Series, Dtype]:
|
|
127
|
+
pass
|
|
128
|
+
|
|
129
|
+
@abstractmethod
|
|
130
|
+
def to_pandas(self) -> Union[pd.DataFrame, pd.Series]:
|
|
131
|
+
pass
|
|
132
|
+
|
|
133
|
+
@abstractmethod
|
|
134
|
+
def _to_pandas(self) -> Union[pd.DataFrame, pd.Series]:
|
|
135
|
+
pass
|
|
136
|
+
|
|
137
|
+
@property
|
|
138
|
+
@abstractmethod
|
|
139
|
+
def index(self) -> "Index":
|
|
140
|
+
pass
|
|
141
|
+
|
|
142
|
+
@abstractmethod
|
|
143
|
+
def copy(self: FrameLike) -> FrameLike:
|
|
144
|
+
pass
|
|
145
|
+
|
|
146
|
+
@abstractmethod
|
|
147
|
+
def _to_internal_pandas(self) -> Union[pd.DataFrame, pd.Series]:
|
|
148
|
+
pass
|
|
149
|
+
|
|
150
|
+
@abstractmethod
|
|
151
|
+
def head(self: FrameLike, n: int = 5) -> FrameLike:
|
|
152
|
+
pass
|
|
153
|
+
|
|
154
|
+
# TODO: add 'axis' parameter
|
|
155
|
+
def cummin(self: FrameLike, skipna: bool = True) -> FrameLike:
|
|
156
|
+
"""
|
|
157
|
+
Return cumulative minimum over a DataFrame or Series axis.
|
|
158
|
+
|
|
159
|
+
Returns a DataFrame or Series of the same size containing the cumulative minimum.
|
|
160
|
+
|
|
161
|
+
.. note:: the current implementation of cummin uses Spark's Window without
|
|
162
|
+
specifying partition specification. This leads to moveing all data into a
|
|
163
|
+
single partition in a single machine and could cause serious
|
|
164
|
+
performance degradation. Avoid this method with very large datasets.
|
|
165
|
+
|
|
166
|
+
Parameters
|
|
167
|
+
----------
|
|
168
|
+
skipna: boolean, default True
|
|
169
|
+
Exclude NA/null values. If an entire row/column is NA, the result will be NA.
|
|
170
|
+
|
|
171
|
+
Returns
|
|
172
|
+
-------
|
|
173
|
+
DataFrame or Series
|
|
174
|
+
|
|
175
|
+
See Also
|
|
176
|
+
--------
|
|
177
|
+
DataFrame.min: Return the minimum over DataFrame axis.
|
|
178
|
+
DataFrame.cummax: Return cumulative maximum over DataFrame axis.
|
|
179
|
+
DataFrame.cummin: Return cumulative minimum over DataFrame axis.
|
|
180
|
+
DataFrame.cumsum: Return cumulative sum over DataFrame axis.
|
|
181
|
+
Series.min: Return the minimum over Series axis.
|
|
182
|
+
Series.cummax: Return cumulative maximum over Series axis.
|
|
183
|
+
Series.cummin: Return cumulative minimum over Series axis.
|
|
184
|
+
Series.cumsum: Return cumulative sum over Series axis.
|
|
185
|
+
Series.cumprod: Return cumulative product over Series axis.
|
|
186
|
+
|
|
187
|
+
Examples
|
|
188
|
+
--------
|
|
189
|
+
>>> df = ps.DataFrame([[2.0, 1.0], [3.0, None], [1.0, 0.0]], columns=list('AB'))
|
|
190
|
+
>>> df
|
|
191
|
+
A B
|
|
192
|
+
0 2.0 1.0
|
|
193
|
+
1 3.0 NaN
|
|
194
|
+
2 1.0 0.0
|
|
195
|
+
|
|
196
|
+
By default, iterates over rows and finds the minimum in each column.
|
|
197
|
+
|
|
198
|
+
>>> df.cummin()
|
|
199
|
+
A B
|
|
200
|
+
0 2.0 1.0
|
|
201
|
+
1 2.0 NaN
|
|
202
|
+
2 1.0 0.0
|
|
203
|
+
|
|
204
|
+
It works identically in Series.
|
|
205
|
+
|
|
206
|
+
>>> df.A.cummin()
|
|
207
|
+
0 2.0
|
|
208
|
+
1 2.0
|
|
209
|
+
2 1.0
|
|
210
|
+
Name: A, dtype: float64
|
|
211
|
+
"""
|
|
212
|
+
return self._apply_series_op(lambda psser: psser._cum(F.min, skipna), should_resolve=True)
|
|
213
|
+
|
|
214
|
+
# TODO: add 'axis' parameter
|
|
215
|
+
def cummax(self: FrameLike, skipna: bool = True) -> FrameLike:
|
|
216
|
+
"""
|
|
217
|
+
Return cumulative maximum over a DataFrame or Series axis.
|
|
218
|
+
|
|
219
|
+
Returns a DataFrame or Series of the same size containing the cumulative maximum.
|
|
220
|
+
|
|
221
|
+
.. note:: the current implementation of cummax uses Spark's Window without
|
|
222
|
+
specifying partition specification. This leads to moveing all data into a
|
|
223
|
+
single partition in a single machine and could cause serious
|
|
224
|
+
performance degradation. Avoid this method with very large datasets.
|
|
225
|
+
|
|
226
|
+
Parameters
|
|
227
|
+
----------
|
|
228
|
+
skipna: boolean, default True
|
|
229
|
+
Exclude NA/null values. If an entire row/column is NA, the result will be NA.
|
|
230
|
+
|
|
231
|
+
Returns
|
|
232
|
+
-------
|
|
233
|
+
DataFrame or Series
|
|
234
|
+
|
|
235
|
+
See Also
|
|
236
|
+
--------
|
|
237
|
+
DataFrame.max: Return the maximum over DataFrame axis.
|
|
238
|
+
DataFrame.cummax: Return cumulative maximum over DataFrame axis.
|
|
239
|
+
DataFrame.cummin: Return cumulative minimum over DataFrame axis.
|
|
240
|
+
DataFrame.cumsum: Return cumulative sum over DataFrame axis.
|
|
241
|
+
DataFrame.cumprod: Return cumulative product over DataFrame axis.
|
|
242
|
+
Series.max: Return the maximum over Series axis.
|
|
243
|
+
Series.cummax: Return cumulative maximum over Series axis.
|
|
244
|
+
Series.cummin: Return cumulative minimum over Series axis.
|
|
245
|
+
Series.cumsum: Return cumulative sum over Series axis.
|
|
246
|
+
Series.cumprod: Return cumulative product over Series axis.
|
|
247
|
+
|
|
248
|
+
Examples
|
|
249
|
+
--------
|
|
250
|
+
>>> df = ps.DataFrame([[2.0, 1.0], [3.0, None], [1.0, 0.0]], columns=list('AB'))
|
|
251
|
+
>>> df
|
|
252
|
+
A B
|
|
253
|
+
0 2.0 1.0
|
|
254
|
+
1 3.0 NaN
|
|
255
|
+
2 1.0 0.0
|
|
256
|
+
|
|
257
|
+
By default, iterates over rows and finds the maximum in each column.
|
|
258
|
+
|
|
259
|
+
>>> df.cummax()
|
|
260
|
+
A B
|
|
261
|
+
0 2.0 1.0
|
|
262
|
+
1 3.0 NaN
|
|
263
|
+
2 3.0 1.0
|
|
264
|
+
|
|
265
|
+
It works identically in Series.
|
|
266
|
+
|
|
267
|
+
>>> df.B.cummax()
|
|
268
|
+
0 1.0
|
|
269
|
+
1 NaN
|
|
270
|
+
2 1.0
|
|
271
|
+
Name: B, dtype: float64
|
|
272
|
+
"""
|
|
273
|
+
return self._apply_series_op(lambda psser: psser._cum(F.max, skipna), should_resolve=True)
|
|
274
|
+
|
|
275
|
+
# TODO: add 'axis' parameter
|
|
276
|
+
def cumsum(self: FrameLike, skipna: bool = True) -> FrameLike:
|
|
277
|
+
"""
|
|
278
|
+
Return cumulative sum over a DataFrame or Series axis.
|
|
279
|
+
|
|
280
|
+
Returns a DataFrame or Series of the same size containing the cumulative sum.
|
|
281
|
+
|
|
282
|
+
.. note:: the current implementation of cumsum uses Spark's Window without
|
|
283
|
+
specifying partition specification. This leads to moveing all data into a
|
|
284
|
+
single partition in a single machine and could cause serious
|
|
285
|
+
performance degradation. Avoid this method with very large datasets.
|
|
286
|
+
|
|
287
|
+
Parameters
|
|
288
|
+
----------
|
|
289
|
+
skipna: boolean, default True
|
|
290
|
+
Exclude NA/null values. If an entire row/column is NA, the result will be NA.
|
|
291
|
+
|
|
292
|
+
Returns
|
|
293
|
+
-------
|
|
294
|
+
DataFrame or Series
|
|
295
|
+
|
|
296
|
+
See Also
|
|
297
|
+
--------
|
|
298
|
+
DataFrame.sum: Return the sum over DataFrame axis.
|
|
299
|
+
DataFrame.cummax: Return cumulative maximum over DataFrame axis.
|
|
300
|
+
DataFrame.cummin: Return cumulative minimum over DataFrame axis.
|
|
301
|
+
DataFrame.cumsum: Return cumulative sum over DataFrame axis.
|
|
302
|
+
DataFrame.cumprod: Return cumulative product over DataFrame axis.
|
|
303
|
+
Series.sum: Return the sum over Series axis.
|
|
304
|
+
Series.cummax: Return cumulative maximum over Series axis.
|
|
305
|
+
Series.cummin: Return cumulative minimum over Series axis.
|
|
306
|
+
Series.cumsum: Return cumulative sum over Series axis.
|
|
307
|
+
Series.cumprod: Return cumulative product over Series axis.
|
|
308
|
+
|
|
309
|
+
Examples
|
|
310
|
+
--------
|
|
311
|
+
>>> df = ps.DataFrame([[2.0, 1.0], [3.0, None], [1.0, 0.0]], columns=list('AB'))
|
|
312
|
+
>>> df
|
|
313
|
+
A B
|
|
314
|
+
0 2.0 1.0
|
|
315
|
+
1 3.0 NaN
|
|
316
|
+
2 1.0 0.0
|
|
317
|
+
|
|
318
|
+
By default, iterates over rows and finds the sum in each column.
|
|
319
|
+
|
|
320
|
+
>>> df.cumsum()
|
|
321
|
+
A B
|
|
322
|
+
0 2.0 1.0
|
|
323
|
+
1 5.0 NaN
|
|
324
|
+
2 6.0 1.0
|
|
325
|
+
|
|
326
|
+
It works identically in Series.
|
|
327
|
+
|
|
328
|
+
>>> df.A.cumsum()
|
|
329
|
+
0 2.0
|
|
330
|
+
1 5.0
|
|
331
|
+
2 6.0
|
|
332
|
+
Name: A, dtype: float64
|
|
333
|
+
"""
|
|
334
|
+
return self._apply_series_op(lambda psser: psser._cumsum(skipna), should_resolve=True)
|
|
335
|
+
|
|
336
|
+
# TODO: add 'axis' parameter
|
|
337
|
+
# TODO: use pandas_udf to support negative values and other options later
|
|
338
|
+
# other window except unbounded ones is supported as of Spark 3.0.
|
|
339
|
+
def cumprod(self: FrameLike, skipna: bool = True) -> FrameLike:
|
|
340
|
+
"""
|
|
341
|
+
Return cumulative product over a DataFrame or Series axis.
|
|
342
|
+
|
|
343
|
+
Returns a DataFrame or Series of the same size containing the cumulative product.
|
|
344
|
+
|
|
345
|
+
.. note:: the current implementation of cumprod uses Spark's Window without
|
|
346
|
+
specifying partition specification. This leads to moveing all data into a
|
|
347
|
+
single partition in a single machine and could cause serious
|
|
348
|
+
performance degradation. Avoid this method with very large datasets.
|
|
349
|
+
|
|
350
|
+
.. note:: unlike pandas', pandas-on-Spark's emulates cumulative product by
|
|
351
|
+
``exp(sum(log(...)))`` trick. Therefore, it only works for positive numbers.
|
|
352
|
+
|
|
353
|
+
Parameters
|
|
354
|
+
----------
|
|
355
|
+
skipna: boolean, default True
|
|
356
|
+
Exclude NA/null values. If an entire row/column is NA, the result will be NA.
|
|
357
|
+
|
|
358
|
+
Returns
|
|
359
|
+
-------
|
|
360
|
+
DataFrame or Series
|
|
361
|
+
|
|
362
|
+
See Also
|
|
363
|
+
--------
|
|
364
|
+
DataFrame.cummax: Return cumulative maximum over DataFrame axis.
|
|
365
|
+
DataFrame.cummin: Return cumulative minimum over DataFrame axis.
|
|
366
|
+
DataFrame.cumsum: Return cumulative sum over DataFrame axis.
|
|
367
|
+
DataFrame.cumprod: Return cumulative product over DataFrame axis.
|
|
368
|
+
Series.cummax: Return cumulative maximum over Series axis.
|
|
369
|
+
Series.cummin: Return cumulative minimum over Series axis.
|
|
370
|
+
Series.cumsum: Return cumulative sum over Series axis.
|
|
371
|
+
Series.cumprod: Return cumulative product over Series axis.
|
|
372
|
+
|
|
373
|
+
Raises
|
|
374
|
+
------
|
|
375
|
+
Exception: If the values is equal to or lower than 0.
|
|
376
|
+
|
|
377
|
+
Examples
|
|
378
|
+
--------
|
|
379
|
+
>>> df = ps.DataFrame([[2.0, 1.0], [3.0, None], [4.0, 10.0]], columns=list('AB'))
|
|
380
|
+
>>> df
|
|
381
|
+
A B
|
|
382
|
+
0 2.0 1.0
|
|
383
|
+
1 3.0 NaN
|
|
384
|
+
2 4.0 10.0
|
|
385
|
+
|
|
386
|
+
By default, iterates over rows and finds the sum in each column.
|
|
387
|
+
|
|
388
|
+
>>> df.cumprod()
|
|
389
|
+
A B
|
|
390
|
+
0 2.0 1.0
|
|
391
|
+
1 6.0 NaN
|
|
392
|
+
2 24.0 10.0
|
|
393
|
+
|
|
394
|
+
It works identically in Series.
|
|
395
|
+
|
|
396
|
+
>>> df.A.cumprod()
|
|
397
|
+
0 2.0
|
|
398
|
+
1 6.0
|
|
399
|
+
2 24.0
|
|
400
|
+
Name: A, dtype: float64
|
|
401
|
+
"""
|
|
402
|
+
return self._apply_series_op(lambda psser: psser._cumprod(skipna), should_resolve=True)
|
|
403
|
+
|
|
404
|
+
# TODO: Although this has removed pandas >= 1.0.0, but we're keeping this as deprecated
|
|
405
|
+
# since we're using this for `DataFrame.info` internally.
|
|
406
|
+
# We can drop it once our minimal pandas version becomes 1.0.0.
|
|
407
|
+
def get_dtype_counts(self) -> pd.Series:
|
|
408
|
+
"""
|
|
409
|
+
Return counts of unique dtypes in this object.
|
|
410
|
+
|
|
411
|
+
.. deprecated:: 0.14.0
|
|
412
|
+
|
|
413
|
+
Returns
|
|
414
|
+
-------
|
|
415
|
+
dtype: pd.Series
|
|
416
|
+
Series with the count of columns with each dtype.
|
|
417
|
+
|
|
418
|
+
See Also
|
|
419
|
+
--------
|
|
420
|
+
dtypes: Return the dtypes in this object.
|
|
421
|
+
|
|
422
|
+
Examples
|
|
423
|
+
--------
|
|
424
|
+
>>> a = [['a', 1, 1], ['b', 2, 2], ['c', 3, 3]]
|
|
425
|
+
>>> df = ps.DataFrame(a, columns=['str', 'int1', 'int2'])
|
|
426
|
+
>>> df
|
|
427
|
+
str int1 int2
|
|
428
|
+
0 a 1 1
|
|
429
|
+
1 b 2 2
|
|
430
|
+
2 c 3 3
|
|
431
|
+
|
|
432
|
+
>>> df.get_dtype_counts().sort_values()
|
|
433
|
+
object 1
|
|
434
|
+
int64 2
|
|
435
|
+
dtype: int64
|
|
436
|
+
|
|
437
|
+
>>> df.str.get_dtype_counts().sort_values()
|
|
438
|
+
object 1
|
|
439
|
+
dtype: int64
|
|
440
|
+
"""
|
|
441
|
+
warnings.warn(
|
|
442
|
+
"`get_dtype_counts` has been deprecated and will be "
|
|
443
|
+
"removed in a future version. For DataFrames use "
|
|
444
|
+
"`.dtypes.value_counts()",
|
|
445
|
+
FutureWarning,
|
|
446
|
+
)
|
|
447
|
+
if not isinstance(self.dtypes, Iterable):
|
|
448
|
+
dtypes = [self.dtypes]
|
|
449
|
+
else:
|
|
450
|
+
dtypes = list(self.dtypes)
|
|
451
|
+
return pd.Series(dict(Counter([d.name for d in dtypes])))
|
|
452
|
+
|
|
453
|
+
def pipe(self, func: Callable[..., Any], *args: Any, **kwargs: Any) -> Any:
|
|
454
|
+
r"""
|
|
455
|
+
Apply func(self, \*args, \*\*kwargs).
|
|
456
|
+
|
|
457
|
+
Parameters
|
|
458
|
+
----------
|
|
459
|
+
func: function
|
|
460
|
+
function to apply to the DataFrame.
|
|
461
|
+
``args``, and ``kwargs`` are passed into ``func``.
|
|
462
|
+
Alternatively a ``(callable, data_keyword)`` tuple where
|
|
463
|
+
``data_keyword`` is a string indicating the keyword of
|
|
464
|
+
``callable`` that expects the DataFrames.
|
|
465
|
+
args: iterable, optional
|
|
466
|
+
positional arguments passed into ``func``.
|
|
467
|
+
kwargs: mapping, optional
|
|
468
|
+
a dictionary of keyword arguments passed into ``func``.
|
|
469
|
+
|
|
470
|
+
Returns
|
|
471
|
+
-------
|
|
472
|
+
object: the return type of ``func``.
|
|
473
|
+
|
|
474
|
+
Notes
|
|
475
|
+
-----
|
|
476
|
+
Use ``.pipe`` when chaining together functions that expect
|
|
477
|
+
Series, DataFrames or GroupBy objects. For example, given
|
|
478
|
+
|
|
479
|
+
>>> df = ps.DataFrame({'category': ['A', 'A', 'B'],
|
|
480
|
+
... 'col1': [1, 2, 3],
|
|
481
|
+
... 'col2': [4, 5, 6]},
|
|
482
|
+
... columns=['category', 'col1', 'col2'])
|
|
483
|
+
>>> def keep_category_a(df):
|
|
484
|
+
... return df[df['category'] == 'A']
|
|
485
|
+
>>> def add_one(df, column):
|
|
486
|
+
... return df.assign(col3=df[column] + 1)
|
|
487
|
+
>>> def multiply(df, column1, column2):
|
|
488
|
+
... return df.assign(col4=df[column1] * df[column2])
|
|
489
|
+
|
|
490
|
+
|
|
491
|
+
instead of writing
|
|
492
|
+
|
|
493
|
+
>>> multiply(add_one(keep_category_a(df), column="col1"), column1="col2", column2="col3")
|
|
494
|
+
category col1 col2 col3 col4
|
|
495
|
+
0 A 1 4 2 8
|
|
496
|
+
1 A 2 5 3 15
|
|
497
|
+
|
|
498
|
+
|
|
499
|
+
You can write
|
|
500
|
+
|
|
501
|
+
>>> (df.pipe(keep_category_a)
|
|
502
|
+
... .pipe(add_one, column="col1")
|
|
503
|
+
... .pipe(multiply, column1="col2", column2="col3")
|
|
504
|
+
... )
|
|
505
|
+
category col1 col2 col3 col4
|
|
506
|
+
0 A 1 4 2 8
|
|
507
|
+
1 A 2 5 3 15
|
|
508
|
+
|
|
509
|
+
|
|
510
|
+
If you have a function that takes the data as the second
|
|
511
|
+
argument, pass a tuple indicating which keyword expects the
|
|
512
|
+
data. For example, suppose ``f`` takes its data as ``df``:
|
|
513
|
+
|
|
514
|
+
>>> def multiply_2(column1, df, column2):
|
|
515
|
+
... return df.assign(col4=df[column1] * df[column2])
|
|
516
|
+
|
|
517
|
+
|
|
518
|
+
Then you can write
|
|
519
|
+
|
|
520
|
+
>>> (df.pipe(keep_category_a)
|
|
521
|
+
... .pipe(add_one, column="col1")
|
|
522
|
+
... .pipe((multiply_2, 'df'), column1="col2", column2="col3")
|
|
523
|
+
... )
|
|
524
|
+
category col1 col2 col3 col4
|
|
525
|
+
0 A 1 4 2 8
|
|
526
|
+
1 A 2 5 3 15
|
|
527
|
+
|
|
528
|
+
You can use lambda as well
|
|
529
|
+
|
|
530
|
+
>>> ps.Series([1, 2, 3]).pipe(lambda x: (x + 1).rename("value"))
|
|
531
|
+
0 2
|
|
532
|
+
1 3
|
|
533
|
+
2 4
|
|
534
|
+
Name: value, dtype: int64
|
|
535
|
+
"""
|
|
536
|
+
|
|
537
|
+
if isinstance(func, tuple):
|
|
538
|
+
func, target = func
|
|
539
|
+
if target in kwargs:
|
|
540
|
+
raise ValueError("%s is both the pipe target and a keyword " "argument" % target)
|
|
541
|
+
kwargs[target] = self
|
|
542
|
+
return func(*args, **kwargs)
|
|
543
|
+
else:
|
|
544
|
+
return func(self, *args, **kwargs)
|
|
545
|
+
|
|
546
|
+
def to_numpy(self) -> np.ndarray:
|
|
547
|
+
"""
|
|
548
|
+
A NumPy ndarray representing the values in this DataFrame or Series.
|
|
549
|
+
|
|
550
|
+
.. note:: This method should only be used if the resulting NumPy ndarray is expected
|
|
551
|
+
to be small, as all the data is loaded into the driver's memory.
|
|
552
|
+
|
|
553
|
+
Returns
|
|
554
|
+
-------
|
|
555
|
+
numpy.ndarray
|
|
556
|
+
|
|
557
|
+
Examples
|
|
558
|
+
--------
|
|
559
|
+
>>> ps.DataFrame({"A": [1, 2], "B": [3, 4]}).to_numpy()
|
|
560
|
+
array([[1, 3],
|
|
561
|
+
[2, 4]])
|
|
562
|
+
|
|
563
|
+
With heterogeneous data, the lowest common type will have to be used.
|
|
564
|
+
|
|
565
|
+
>>> ps.DataFrame({"A": [1, 2], "B": [3.0, 4.5]}).to_numpy()
|
|
566
|
+
array([[1. , 3. ],
|
|
567
|
+
[2. , 4.5]])
|
|
568
|
+
|
|
569
|
+
For a mix of numeric and non-numeric types, the output array will have object dtype.
|
|
570
|
+
|
|
571
|
+
>>> df = ps.DataFrame({"A": [1, 2], "B": [3.0, 4.5], "C": pd.date_range('2000', periods=2)})
|
|
572
|
+
>>> df.to_numpy()
|
|
573
|
+
array([[1, 3.0, Timestamp('2000-01-01 00:00:00')],
|
|
574
|
+
[2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object)
|
|
575
|
+
|
|
576
|
+
For Series,
|
|
577
|
+
|
|
578
|
+
>>> ps.Series(['a', 'b', 'a']).to_numpy()
|
|
579
|
+
array(['a', 'b', 'a'], dtype=object)
|
|
580
|
+
"""
|
|
581
|
+
log_advice(
|
|
582
|
+
"`to_numpy` loads all data into the driver's memory. "
|
|
583
|
+
"It should only be used if the resulting NumPy ndarray is expected to be small."
|
|
584
|
+
)
|
|
585
|
+
return cast(np.ndarray, self._to_pandas().values)
|
|
586
|
+
|
|
587
|
+
@property
|
|
588
|
+
def values(self) -> np.ndarray:
|
|
589
|
+
"""
|
|
590
|
+
Return a Numpy representation of the DataFrame or the Series.
|
|
591
|
+
|
|
592
|
+
.. warning:: We recommend using `DataFrame.to_numpy()` or `Series.to_numpy()` instead.
|
|
593
|
+
|
|
594
|
+
.. note:: This method should only be used if the resulting NumPy ndarray is expected
|
|
595
|
+
to be small, as all the data is loaded into the driver's memory.
|
|
596
|
+
|
|
597
|
+
Returns
|
|
598
|
+
-------
|
|
599
|
+
numpy.ndarray
|
|
600
|
+
|
|
601
|
+
Examples
|
|
602
|
+
--------
|
|
603
|
+
A DataFrame where all columns are the same type (e.g., int64) results in an array of
|
|
604
|
+
the same type.
|
|
605
|
+
|
|
606
|
+
>>> df = ps.DataFrame({'age': [ 3, 29],
|
|
607
|
+
... 'height': [94, 170],
|
|
608
|
+
... 'weight': [31, 115]})
|
|
609
|
+
>>> df
|
|
610
|
+
age height weight
|
|
611
|
+
0 3 94 31
|
|
612
|
+
1 29 170 115
|
|
613
|
+
>>> df.dtypes
|
|
614
|
+
age int64
|
|
615
|
+
height int64
|
|
616
|
+
weight int64
|
|
617
|
+
dtype: object
|
|
618
|
+
>>> df.values
|
|
619
|
+
array([[ 3, 94, 31],
|
|
620
|
+
[ 29, 170, 115]])
|
|
621
|
+
|
|
622
|
+
A DataFrame with mixed type columns(e.g., str/object, int64, float32) results in an ndarray
|
|
623
|
+
of the broadest type that accommodates these mixed types (e.g., object).
|
|
624
|
+
|
|
625
|
+
>>> df2 = ps.DataFrame([('parrot', 24.0, 'second'),
|
|
626
|
+
... ('lion', 80.5, 'first'),
|
|
627
|
+
... ('monkey', np.nan, None)],
|
|
628
|
+
... columns=('name', 'max_speed', 'rank'))
|
|
629
|
+
>>> df2.dtypes
|
|
630
|
+
name object
|
|
631
|
+
max_speed float64
|
|
632
|
+
rank object
|
|
633
|
+
dtype: object
|
|
634
|
+
>>> df2.values
|
|
635
|
+
array([['parrot', 24.0, 'second'],
|
|
636
|
+
['lion', 80.5, 'first'],
|
|
637
|
+
['monkey', nan, None]], dtype=object)
|
|
638
|
+
|
|
639
|
+
For Series,
|
|
640
|
+
|
|
641
|
+
>>> ps.Series([1, 2, 3]).values
|
|
642
|
+
array([1, 2, 3])
|
|
643
|
+
|
|
644
|
+
>>> ps.Series(list('aabc')).values
|
|
645
|
+
array(['a', 'a', 'b', 'c'], dtype=object)
|
|
646
|
+
"""
|
|
647
|
+
warnings.warn("We recommend using `{}.to_numpy()` instead.".format(type(self).__name__))
|
|
648
|
+
return self.to_numpy()
|
|
649
|
+
|
|
650
|
+
def to_csv(
|
|
651
|
+
self,
|
|
652
|
+
path: Optional[str] = None,
|
|
653
|
+
sep: str = ",",
|
|
654
|
+
na_rep: str = "",
|
|
655
|
+
columns: Optional[List[Name]] = None,
|
|
656
|
+
header: bool = True,
|
|
657
|
+
quotechar: str = '"',
|
|
658
|
+
date_format: Optional[str] = None,
|
|
659
|
+
escapechar: Optional[str] = None,
|
|
660
|
+
num_files: Optional[int] = None,
|
|
661
|
+
mode: str = "w",
|
|
662
|
+
partition_cols: Optional[Union[str, List[str]]] = None,
|
|
663
|
+
index_col: Optional[Union[str, List[str]]] = None,
|
|
664
|
+
**options: Any,
|
|
665
|
+
) -> Optional[str]:
|
|
666
|
+
r"""
|
|
667
|
+
Write object to a comma-separated values (csv) file.
|
|
668
|
+
|
|
669
|
+
.. note:: pandas-on-Spark `to_csv` writes files to a path or URI. Unlike pandas',
|
|
670
|
+
pandas-on-Spark respects HDFS's property such as 'fs.default.name'.
|
|
671
|
+
|
|
672
|
+
.. note:: pandas-on-Spark writes CSV files into the directory, `path`, and writes
|
|
673
|
+
multiple `part-...` files in the directory when `path` is specified.
|
|
674
|
+
This behavior was inherited from Apache Spark. The number of partitions can
|
|
675
|
+
be controlled by `num_files`. This is deprecated.
|
|
676
|
+
Use `DataFrame.spark.repartition` instead.
|
|
677
|
+
|
|
678
|
+
Parameters
|
|
679
|
+
----------
|
|
680
|
+
path: str, default None
|
|
681
|
+
File path. If None is provided the result is returned as a string.
|
|
682
|
+
sep: str, default ','
|
|
683
|
+
String of length 1. Field delimiter for the output file.
|
|
684
|
+
na_rep: str, default ''
|
|
685
|
+
Missing data representation.
|
|
686
|
+
columns: sequence, optional
|
|
687
|
+
Columns to write.
|
|
688
|
+
header: bool or list of str, default True
|
|
689
|
+
Write out the column names. If a list of strings is given it is
|
|
690
|
+
assumed to be aliases for the column names.
|
|
691
|
+
quotechar: str, default '\"'
|
|
692
|
+
String of length 1. Character used to quote fields.
|
|
693
|
+
date_format: str, default None
|
|
694
|
+
Format string for datetime objects.
|
|
695
|
+
escapechar: str, default None
|
|
696
|
+
String of length 1. Character used to escape `sep` and `quotechar`
|
|
697
|
+
when appropriate.
|
|
698
|
+
num_files: the number of partitions to be written in `path` directory when
|
|
699
|
+
this is a path. This is deprecated. Use `DataFrame.spark.repartition` instead.
|
|
700
|
+
mode: str
|
|
701
|
+
Python write mode, default 'w'.
|
|
702
|
+
|
|
703
|
+
.. note:: mode can accept the strings for Spark writing mode.
|
|
704
|
+
Such as 'append', 'overwrite', 'ignore', 'error', 'errorifexists'.
|
|
705
|
+
|
|
706
|
+
- 'append' (equivalent to 'a'): Append the new data to existing data.
|
|
707
|
+
- 'overwrite' (equivalent to 'w'): Overwrite existing data.
|
|
708
|
+
- 'ignore': Silently ignore this operation if data already exists.
|
|
709
|
+
- 'error' or 'errorifexists': Throw an exception if data already exists.
|
|
710
|
+
|
|
711
|
+
partition_cols: str or list of str, optional, default None
|
|
712
|
+
Names of partitioning columns
|
|
713
|
+
index_col: str or list of str, optional, default: None
|
|
714
|
+
Column names to be used in Spark to represent pandas-on-Spark's index. The index name
|
|
715
|
+
in pandas-on-Spark is ignored. By default, the index is always lost.
|
|
716
|
+
options: keyword arguments for additional options specific to PySpark.
|
|
717
|
+
These kwargs are specific to PySpark's CSV options to pass. Check
|
|
718
|
+
the options in PySpark's API documentation for spark.write.csv(...).
|
|
719
|
+
It has higher priority and overwrites all other options.
|
|
720
|
+
This parameter only works when `path` is specified.
|
|
721
|
+
|
|
722
|
+
Returns
|
|
723
|
+
-------
|
|
724
|
+
str or None
|
|
725
|
+
|
|
726
|
+
See Also
|
|
727
|
+
--------
|
|
728
|
+
read_csv
|
|
729
|
+
DataFrame.to_delta
|
|
730
|
+
DataFrame.to_table
|
|
731
|
+
DataFrame.to_parquet
|
|
732
|
+
DataFrame.to_spark_io
|
|
733
|
+
|
|
734
|
+
Examples
|
|
735
|
+
--------
|
|
736
|
+
>>> df = ps.DataFrame(dict(
|
|
737
|
+
... date=list(pd.date_range('2012-1-1 12:00:00', periods=3, freq='M')),
|
|
738
|
+
... country=['KR', 'US', 'JP'],
|
|
739
|
+
... code=[1, 2 ,3]), columns=['date', 'country', 'code'])
|
|
740
|
+
>>> df.sort_values(by="date") # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
|
|
741
|
+
date country code
|
|
742
|
+
... 2012-01-31 12:00:00 KR 1
|
|
743
|
+
... 2012-02-29 12:00:00 US 2
|
|
744
|
+
... 2012-03-31 12:00:00 JP 3
|
|
745
|
+
|
|
746
|
+
>>> print(df.to_csv()) # doctest: +NORMALIZE_WHITESPACE
|
|
747
|
+
date,country,code
|
|
748
|
+
2012-01-31 12:00:00,KR,1
|
|
749
|
+
2012-02-29 12:00:00,US,2
|
|
750
|
+
2012-03-31 12:00:00,JP,3
|
|
751
|
+
|
|
752
|
+
>>> df.cummax().to_csv(path=r'%s/to_csv/foo.csv' % path, num_files=1)
|
|
753
|
+
>>> ps.read_csv(
|
|
754
|
+
... path=r'%s/to_csv/foo.csv' % path
|
|
755
|
+
... ).sort_values(by="date") # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
|
|
756
|
+
date country code
|
|
757
|
+
... 2012-01-31 12:00:00 KR 1
|
|
758
|
+
... 2012-02-29 12:00:00 US 2
|
|
759
|
+
... 2012-03-31 12:00:00 US 3
|
|
760
|
+
|
|
761
|
+
In case of Series,
|
|
762
|
+
|
|
763
|
+
>>> print(df.date.to_csv()) # doctest: +NORMALIZE_WHITESPACE
|
|
764
|
+
date
|
|
765
|
+
2012-01-31 12:00:00
|
|
766
|
+
2012-02-29 12:00:00
|
|
767
|
+
2012-03-31 12:00:00
|
|
768
|
+
|
|
769
|
+
>>> df.date.to_csv(path=r'%s/to_csv/foo.csv' % path, num_files=1)
|
|
770
|
+
>>> ps.read_csv(
|
|
771
|
+
... path=r'%s/to_csv/foo.csv' % path
|
|
772
|
+
... ).sort_values(by="date") # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
|
|
773
|
+
date
|
|
774
|
+
... 2012-01-31 12:00:00
|
|
775
|
+
... 2012-02-29 12:00:00
|
|
776
|
+
... 2012-03-31 12:00:00
|
|
777
|
+
|
|
778
|
+
You can preserve the index in the roundtrip as below.
|
|
779
|
+
|
|
780
|
+
>>> df.set_index("country", append=True, inplace=True)
|
|
781
|
+
>>> df.date.to_csv(
|
|
782
|
+
... path=r'%s/to_csv/bar.csv' % path,
|
|
783
|
+
... num_files=1,
|
|
784
|
+
... index_col=["index1", "index2"])
|
|
785
|
+
>>> ps.read_csv(
|
|
786
|
+
... path=r'%s/to_csv/bar.csv' % path, index_col=["index1", "index2"]
|
|
787
|
+
... ).sort_values(by="date") # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
|
|
788
|
+
date
|
|
789
|
+
index1 index2
|
|
790
|
+
... ... 2012-01-31 12:00:00
|
|
791
|
+
... ... 2012-02-29 12:00:00
|
|
792
|
+
... ... 2012-03-31 12:00:00
|
|
793
|
+
"""
|
|
794
|
+
if "options" in options and isinstance(options.get("options"), dict) and len(options) == 1:
|
|
795
|
+
options = options.get("options")
|
|
796
|
+
|
|
797
|
+
if path is None:
|
|
798
|
+
# If path is none, just collect and use pandas's to_csv.
|
|
799
|
+
return self._to_pandas().to_csv(
|
|
800
|
+
None,
|
|
801
|
+
sep=sep,
|
|
802
|
+
na_rep=na_rep,
|
|
803
|
+
columns=columns,
|
|
804
|
+
header=header,
|
|
805
|
+
quotechar=quotechar,
|
|
806
|
+
date_format=date_format,
|
|
807
|
+
escapechar=escapechar,
|
|
808
|
+
index=False,
|
|
809
|
+
)
|
|
810
|
+
|
|
811
|
+
if isinstance(self, ps.DataFrame):
|
|
812
|
+
psdf = self
|
|
813
|
+
else:
|
|
814
|
+
assert isinstance(self, ps.Series)
|
|
815
|
+
psdf = self.to_frame()
|
|
816
|
+
|
|
817
|
+
if columns is None:
|
|
818
|
+
column_labels = psdf._internal.column_labels
|
|
819
|
+
else:
|
|
820
|
+
column_labels = []
|
|
821
|
+
for col in columns:
|
|
822
|
+
if is_name_like_tuple(col):
|
|
823
|
+
label = cast(Label, col)
|
|
824
|
+
else:
|
|
825
|
+
label = cast(Label, (col,))
|
|
826
|
+
if label not in psdf._internal.column_labels:
|
|
827
|
+
raise KeyError(name_like_string(label))
|
|
828
|
+
column_labels.append(label)
|
|
829
|
+
|
|
830
|
+
if isinstance(index_col, str):
|
|
831
|
+
index_cols = [index_col]
|
|
832
|
+
elif index_col is None:
|
|
833
|
+
index_cols = []
|
|
834
|
+
else:
|
|
835
|
+
index_cols = index_col
|
|
836
|
+
|
|
837
|
+
if header is True and psdf._internal.column_labels_level > 1:
|
|
838
|
+
raise ValueError("to_csv only support one-level index column now")
|
|
839
|
+
elif isinstance(header, list):
|
|
840
|
+
sdf = psdf.to_spark(index_col)
|
|
841
|
+
sdf = sdf.select(
|
|
842
|
+
[scol_for(sdf, name_like_string(label)) for label in index_cols]
|
|
843
|
+
+ [
|
|
844
|
+
scol_for(sdf, str(i) if label is None else name_like_string(label)).alias(
|
|
845
|
+
new_name
|
|
846
|
+
)
|
|
847
|
+
for i, (label, new_name) in enumerate(zip(column_labels, header))
|
|
848
|
+
]
|
|
849
|
+
)
|
|
850
|
+
header = True
|
|
851
|
+
else:
|
|
852
|
+
sdf = psdf.to_spark(index_col)
|
|
853
|
+
sdf = sdf.select(
|
|
854
|
+
[scol_for(sdf, name_like_string(label)) for label in index_cols]
|
|
855
|
+
+ [
|
|
856
|
+
scol_for(sdf, str(i) if label is None else name_like_string(label))
|
|
857
|
+
for i, label in enumerate(column_labels)
|
|
858
|
+
]
|
|
859
|
+
)
|
|
860
|
+
|
|
861
|
+
if num_files is not None:
|
|
862
|
+
warnings.warn(
|
|
863
|
+
"`num_files` has been deprecated and might be removed in a future version. "
|
|
864
|
+
"Use `DataFrame.spark.repartition` instead.",
|
|
865
|
+
FutureWarning,
|
|
866
|
+
)
|
|
867
|
+
sdf = sdf.repartition(num_files)
|
|
868
|
+
|
|
869
|
+
mode = validate_mode(mode)
|
|
870
|
+
builder = sdf.write.mode(mode)
|
|
871
|
+
if partition_cols is not None:
|
|
872
|
+
builder.partitionBy(partition_cols)
|
|
873
|
+
builder._set_opts(
|
|
874
|
+
sep=sep,
|
|
875
|
+
nullValue=na_rep,
|
|
876
|
+
header=header,
|
|
877
|
+
quote=quotechar,
|
|
878
|
+
dateFormat=date_format,
|
|
879
|
+
charToEscapeQuoteEscaping=escapechar,
|
|
880
|
+
)
|
|
881
|
+
builder.options(**options).format("csv").save(path)
|
|
882
|
+
return None
|
|
883
|
+
|
|
884
|
+
    def to_json(
        self,
        path: Optional[str] = None,
        compression: str = "uncompressed",
        num_files: Optional[int] = None,
        mode: str = "w",
        orient: str = "records",
        lines: bool = True,
        partition_cols: Optional[Union[str, List[str]]] = None,
        index_col: Optional[Union[str, List[str]]] = None,
        **options: Any,
    ) -> Optional[str]:
        """
        Convert the object to a JSON string.

        .. note:: pandas-on-Spark `to_json` writes files to a path or URI. Unlike pandas',
            pandas-on-Spark respects HDFS's property such as 'fs.default.name'.

        .. note:: pandas-on-Spark writes JSON files into the directory, `path`, and writes
            multiple `part-...` files in the directory when `path` is specified.
            This behavior was inherited from Apache Spark. The number of partitions can
            be controlled by `num_files`. This is deprecated.
            Use `DataFrame.spark.repartition` instead.

        .. note:: output JSON format is different from pandas'. It always uses `orient='records'`
            for its output. This behavior might have to change soon.

        .. note:: Set `ignoreNullFields` keyword argument to `True` to omit `None` or `NaN` values
            when writing JSON objects. It works only when `path` is provided.

        Note NaN's and None will be converted to null and datetime objects
        will be converted to UNIX timestamps.

        Parameters
        ----------
        path: string, optional
            File path. If not specified, the result is returned as
            a string.
        lines: bool, default True
            If ‘orient’ is ‘records’ write out line delimited JSON format.
            Will throw ValueError if incorrect ‘orient’ since others are not
            list like. It should be always True for now.
        orient: str, default 'records'
            It should be always 'records' for now.
        compression: {'gzip', 'bz2', 'xz', None}
            A string representing the compression to use in the output file,
            only used when the first argument is a filename. By default, the
            compression is inferred from the filename.
        num_files: the number of partitions to be written in `path` directory when
            this is a path. This is deprecated. Use `DataFrame.spark.repartition` instead.
        mode: str
            Python write mode, default 'w'.

            .. note:: mode can accept the strings for Spark writing mode.
                Such as 'append', 'overwrite', 'ignore', 'error', 'errorifexists'.

                - 'append' (equivalent to 'a'): Append the new data to existing data.
                - 'overwrite' (equivalent to 'w'): Overwrite existing data.
                - 'ignore': Silently ignore this operation if data already exists.
                - 'error' or 'errorifexists': Throw an exception if data already exists.

        partition_cols: str or list of str, optional, default None
            Names of partitioning columns
        index_col: str or list of str, optional, default: None
            Column names to be used in Spark to represent pandas-on-Spark's index. The index name
            in pandas-on-Spark is ignored. By default, the index is always lost.
        options: keyword arguments for additional options specific to PySpark.
            It is specific to PySpark's JSON options to pass. Check
            the options in PySpark's API documentation for `spark.write.json(...)`.
            It has a higher priority and overwrites all other options.
            This parameter only works when `path` is specified.

        Returns
        -------
        str or None

        Examples
        --------
        >>> df = ps.DataFrame([['a', 'b'], ['c', 'd']],
        ...                   columns=['col 1', 'col 2'])
        >>> df.to_json()
        '[{"col 1":"a","col 2":"b"},{"col 1":"c","col 2":"d"}]'

        >>> df['col 1'].to_json()
        '[{"col 1":"a"},{"col 1":"c"}]'

        >>> df.to_json(path=r'%s/to_json/foo.json' % path, num_files=1)
        >>> ps.read_json(
        ...     path=r'%s/to_json/foo.json' % path
        ... ).sort_values(by="col 1")
          col 1 col 2
        0     a     b
        1     c     d

        >>> df['col 1'].to_json(path=r'%s/to_json/foo.json' % path, num_files=1, index_col="index")
        >>> ps.read_json(
        ...     path=r'%s/to_json/foo.json' % path, index_col="index"
        ... ).sort_values(by="col 1")  # doctest: +NORMALIZE_WHITESPACE
              col 1
        index
        0         a
        1         c
        """
        if "options" in options and isinstance(options.get("options"), dict) and len(options) == 1:
            options = options.get("options")

        default_options: Dict[str, Any] = {"ignoreNullFields": False}
        options = {**default_options, **options}

        if not lines:
            raise NotImplementedError("lines=False is not implemented yet.")

        if orient != "records":
            raise NotImplementedError("orient='records' is supported only for now.")

        if path is None:
            # If path is none, just collect and use pandas's to_json.
            psdf_or_ser = self
            pdf = psdf_or_ser._to_pandas()
            if isinstance(self, ps.Series):
                pdf = pdf.to_frame()
            # To make the format consistent and readable by `read_json`, convert it to pandas' and
            # use 'records' orient for now.
            return pdf.to_json(orient="records")

        if isinstance(self, ps.DataFrame):
            psdf = self
        else:
            assert isinstance(self, ps.Series)
            psdf = self.to_frame()
        sdf = psdf.to_spark(index_col=index_col)

        if num_files is not None:
            warnings.warn(
                "`num_files` has been deprecated and might be removed in a future version. "
                "Use `DataFrame.spark.repartition` instead.",
                FutureWarning,
            )
            sdf = sdf.repartition(num_files)

        mode = validate_mode(mode)
        builder = sdf.write.mode(mode)
        if partition_cols is not None:
            builder.partitionBy(partition_cols)
        builder._set_opts(compression=compression)
        builder.options(**options).format("json").save(path)
        return None
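As the body above shows, `to_json` has two distinct paths: with no `path` it collects to pandas and returns a 'records'-orient string, and with a `path` it delegates to Spark's JSON writer (null fields kept by default because `ignoreNullFields` defaults to False here). A minimal sketch, assuming the vendored module mirrors `pyspark.pandas`:

    import pyspark.pandas as ps  # assumption: vendored module mirrors pyspark.pandas

    psdf = ps.DataFrame({"col 1": ["a", None]})
    s = psdf.to_json()                    # no path: collected to pandas, returns a JSON string
    psdf.to_json("/tmp/out_json")         # path given: Spark writes part-... files into a directory
    psdf.to_json("/tmp/out_json2", mode="overwrite", ignoreNullFields=True)  # drop null fields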
    def to_excel(
        self,
        excel_writer: Union[str, pd.ExcelWriter],
        sheet_name: str = "Sheet1",
        na_rep: str = "",
        float_format: Optional[str] = None,
        columns: Optional[Union[str, List[str]]] = None,
        header: bool = True,
        index: bool = True,
        index_label: Optional[Union[str, List[str]]] = None,
        startrow: int = 0,
        startcol: int = 0,
        engine: Optional[str] = None,
        merge_cells: bool = True,
        encoding: Optional[str] = None,
        inf_rep: str = "inf",
        verbose: bool = True,
        freeze_panes: Optional[Tuple[int, int]] = None,
    ) -> None:
        """
        Write object to an Excel sheet.

        .. note:: This method should only be used if the resulting DataFrame is expected
                  to be small, as all the data is loaded into the driver's memory.

        To write a single object to an Excel .xlsx file it is only necessary to
        specify a target file name. To write to multiple sheets it is necessary to
        create an `ExcelWriter` object with a target file name, and specify a sheet
        in the file to write to.

        Multiple sheets may be written to by specifying unique `sheet_name`.
        With all data written to the file it is necessary to save the changes.
        Note that creating an `ExcelWriter` object with a file name that already
        exists will result in the contents of the existing file being erased.

        Parameters
        ----------
        excel_writer: str or ExcelWriter object
            File path or existing ExcelWriter.
        sheet_name: str, default 'Sheet1'
            Name of sheet which will contain DataFrame.
        na_rep: str, default ''
            Missing data representation.
        float_format: str, optional
            Format string for floating point numbers. For example
            ``float_format="%%.2f"`` will format 0.1234 to 0.12.
        columns: sequence or list of str, optional
            Columns to write.
        header: bool or list of str, default True
            Write out the column names. If a list of string is given it is
            assumed to be aliases for the column names.
        index: bool, default True
            Write row names (index).
        index_label: str or sequence, optional
            Column label for index column(s) if desired. If not specified, and
            `header` and `index` are True, then the index names are used. A
            sequence should be given if the DataFrame uses MultiIndex.
        startrow: int, default 0
            Upper left cell row to dump data frame.
        startcol: int, default 0
            Upper left cell column to dump data frame.
        engine: str, optional
            Write engine to use, 'openpyxl' or 'xlsxwriter'. You can also set this
            via the options ``io.excel.xlsx.writer``, ``io.excel.xls.writer``, and
            ``io.excel.xlsm.writer``.
        merge_cells: bool, default True
            Write MultiIndex and Hierarchical Rows as merged cells.
        encoding: str, optional
            Encoding of the resulting excel file. Only necessary for xlwt,
            other writers support unicode natively.

            .. deprecated:: 3.4.0

        inf_rep: str, default 'inf'
            Representation for infinity (there is no native representation for
            infinity in Excel).
        verbose: bool, default True
            Display more information in the error logs.

            .. deprecated:: 3.4.0

        freeze_panes: tuple of int (length 2), optional
            Specifies the one-based bottommost row and rightmost column that
            is to be frozen.

        Notes
        -----
        Once a workbook has been saved it is not possible write further data
        without rewriting the whole workbook.

        See Also
        --------
        read_excel: Read Excel file.

        Examples
        --------
        Create, write to, and save a workbook:

        >>> df1 = ps.DataFrame([['a', 'b'], ['c', 'd']],
        ...                    index=['row 1', 'row 2'],
        ...                    columns=['col 1', 'col 2'])
        >>> df1.to_excel("output.xlsx")  # doctest: +SKIP

        To specify the sheet name:

        >>> df1.to_excel("output.xlsx")  # doctest: +SKIP
        >>> df1.to_excel("output.xlsx",
        ...              sheet_name='Sheet_name_1')  # doctest: +SKIP

        If you wish to write to more than one sheet in the workbook, it is
        necessary to specify an ExcelWriter object:

        >>> with pd.ExcelWriter('output.xlsx') as writer:  # doctest: +SKIP
        ...     df1.to_excel(writer, sheet_name='Sheet_name_1')
        ...     df2.to_excel(writer, sheet_name='Sheet_name_2')

        To set the library that is used to write the Excel file,
        you can pass the `engine` keyword (the default engine is
        automatically chosen depending on the file extension):

        >>> df1.to_excel('output1.xlsx', engine='xlsxwriter')  # doctest: +SKIP
        """
        log_advice(
            "`to_excel` loads all data into the driver's memory. "
            "It should only be used if the resulting DataFrame is expected to be small."
        )
        # Make sure locals() call is at the top of the function so we don't capture local variables.
        args = locals()
        psdf = self

        if isinstance(self, ps.DataFrame):
            f = pd.DataFrame.to_excel
        elif isinstance(self, ps.Series):
            f = pd.Series.to_excel
        else:
            raise TypeError(
                "Constructor expects DataFrame or Series; however, " "got [%s]" % (self,)
            )
        return validate_arguments_and_invoke_function(
            psdf._to_internal_pandas(), self.to_excel, f, args
        )
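The implementation above collects the entire frame to the driver (`_to_internal_pandas()`) and then forwards the captured arguments to pandas' own `to_excel`, which is why the docstring warns that it is only suitable for small results. A minimal sketch, assuming the vendored module mirrors `pyspark.pandas` and an Excel engine such as openpyxl is installed:

    import pyspark.pandas as ps  # assumption: vendored module mirrors pyspark.pandas

    small = ps.DataFrame({"a": [1, 2], "b": [3, 4]})
    # Collects the whole frame to the driver, then calls pandas' to_excel under the hood.
    small.to_excel("/tmp/report.xlsx", sheet_name="summary")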
    def mean(
        self, axis: Optional[Axis] = None, skipna: bool = True, numeric_only: bool = None
    ) -> Union[Scalar, "Series"]:
        """
        Return the mean of the values.

        Parameters
        ----------
        axis: {index (0), columns (1)}
            Axis for the function to be applied on.
        skipna: bool, default True
            Exclude NA/null values when computing the result.

            .. versionchanged:: 3.4.0
               Supported including NA/null values.
        numeric_only: bool, default None
            Include only float, int, boolean columns. False is not supported. This parameter
            is mainly for pandas compatibility.

        Returns
        -------
        mean: scalar for a Series, and a Series for a DataFrame.

        Examples
        --------

        >>> df = ps.DataFrame({'a': [1, 2, 3, np.nan], 'b': [0.1, 0.2, 0.3, np.nan]},
        ...                   columns=['a', 'b'])

        On a DataFrame:

        >>> df.mean()
        a    2.0
        b    0.2
        dtype: float64

        >>> df.mean(axis=1)
        0    0.55
        1    1.10
        2    1.65
        3     NaN
        dtype: float64

        On a Series:

        >>> df['a'].mean()
        2.0
        """
        axis = validate_axis(axis)

        if numeric_only is None and axis == 0:
            numeric_only = True

        def mean(psser: "Series") -> Column:
            spark_type = psser.spark.data_type
            spark_column = psser.spark.column
            if isinstance(spark_type, BooleanType):
                spark_column = spark_column.cast(LongType())
            elif not isinstance(spark_type, NumericType):
                raise TypeError(
                    "Could not convert {} ({}) to numeric".format(
                        spark_type_to_pandas_dtype(spark_type), spark_type.simpleString()
                    )
                )
            return F.mean(spark_column)

        return self._reduce_for_stat_function(
            mean,
            name="mean",
            axis=axis,
            numeric_only=numeric_only,
            skipna=skipna,
        )
    def sum(
        self,
        axis: Optional[Axis] = None,
        skipna: bool = True,
        numeric_only: bool = None,
        min_count: int = 0,
    ) -> Union[Scalar, "Series"]:
        """
        Return the sum of the values.

        Parameters
        ----------
        axis: {index (0), columns (1)}
            Axis for the function to be applied on.
        skipna: bool, default True
            Exclude NA/null values when computing the result.

            .. versionchanged:: 3.4.0
               Added *skipna* to exclude.
        numeric_only: bool, default None
            Include only float, int, boolean columns. False is not supported. This parameter
            is mainly for pandas compatibility.
        min_count: int, default 0
            The required number of valid values to perform the operation. If fewer than
            ``min_count`` non-NA values are present the result will be NA.

        Returns
        -------
        sum: scalar for a Series, and a Series for a DataFrame.

        Examples
        --------

        >>> df = ps.DataFrame({'a': [1, 2, 3, np.nan], 'b': [0.1, np.nan, 0.3, np.nan]},
        ...                   columns=['a', 'b'])

        On a DataFrame:

        >>> df.sum()
        a    6.0
        b    0.4
        dtype: float64

        >>> df.sum(axis=1)
        0    1.1
        1    2.0
        2    3.3
        3    0.0
        dtype: float64

        >>> df.sum(min_count=3)
        a    6.0
        b    NaN
        dtype: float64

        >>> df.sum(axis=1, min_count=1)
        0    1.1
        1    2.0
        2    3.3
        3    NaN
        dtype: float64

        On a Series:

        >>> df['a'].sum()
        6.0

        >>> df['a'].sum(min_count=3)
        6.0
        >>> df['b'].sum(min_count=3)
        nan
        """
        axis = validate_axis(axis)

        if numeric_only is None and axis == 0:
            numeric_only = True
        elif numeric_only is True and axis == 1:
            numeric_only = None

        def sum(psser: "Series") -> Column:
            spark_type = psser.spark.data_type
            spark_column = psser.spark.column

            if isinstance(spark_type, BooleanType):
                spark_column = spark_column.cast(LongType())
            elif not isinstance(spark_type, NumericType):
                raise TypeError(
                    "Could not convert {} ({}) to numeric".format(
                        spark_type_to_pandas_dtype(spark_type), spark_type.simpleString()
                    )
                )
            return F.coalesce(F.sum(spark_column), F.lit(0))

        return self._reduce_for_stat_function(
            sum,
            name="sum",
            axis=axis,
            numeric_only=numeric_only,
            min_count=min_count,
            skipna=skipna,
        )
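Note how the per-column reduction wraps Spark's sum in `F.coalesce(..., F.lit(0))`, so an all-NA column sums to 0 unless `min_count` forces an NA result. A small behavioral sketch, assuming the vendored module mirrors `pyspark.pandas`:

    import numpy as np
    import pyspark.pandas as ps  # assumption: vendored module mirrors pyspark.pandas

    psdf = ps.DataFrame({"a": [1.0, 2.0, np.nan], "b": [np.nan, np.nan, np.nan]})
    psdf.sum()              # 'b' sums to 0.0 because the Spark sum is coalesced with 0
    psdf.sum(min_count=1)   # 'b' becomes NaN: fewer than min_count non-NA values present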
    def product(
        self,
        axis: Optional[Axis] = None,
        skipna: bool = True,
        numeric_only: bool = None,
        min_count: int = 0,
    ) -> Union[Scalar, "Series"]:
        """
        Return the product of the values.

        .. note:: unlike pandas', pandas-on-Spark's emulates product by ``exp(sum(log(...)))``
            trick. Therefore, it only works for positive numbers.

        Parameters
        ----------
        axis: {index (0), columns (1)}
            Axis for the function to be applied on.
        skipna: bool, default True
            Exclude NA/null values when computing the result.

            .. versionchanged:: 3.4.0
               Supported including NA/null values.
        numeric_only: bool, default None
            Include only float, int, boolean columns. False is not supported. This parameter
            is mainly for pandas compatibility.
        min_count: int, default 0
            The required number of valid values to perform the operation. If fewer than
            ``min_count`` non-NA values are present the result will be NA.

        Examples
        --------
        On a DataFrame:

        Non-numeric type column is not included to the result.

        >>> psdf = ps.DataFrame({'A': [1, 2, 3, 4, 5],
        ...                      'B': [10, 20, 30, 40, 50],
        ...                      'C': ['a', 'b', 'c', 'd', 'e']})
        >>> psdf
           A   B  C
        0  1  10  a
        1  2  20  b
        2  3  30  c
        3  4  40  d
        4  5  50  e

        >>> psdf.prod()
        A         120
        B    12000000
        dtype: int64

        If there is no numeric type columns, returns empty Series.

        >>> ps.DataFrame({"key": ['a', 'b', 'c'], "val": ['x', 'y', 'z']}).prod()  # doctest: +SKIP
        Series([], dtype: float64)

        On a Series:

        >>> ps.Series([1, 2, 3, 4, 5]).prod()
        120

        By default, the product of an empty or all-NA Series is ``1``

        >>> ps.Series([]).prod()  # doctest: +SKIP
        1.0

        This can be controlled with the ``min_count`` parameter

        >>> ps.Series([]).prod(min_count=1)  # doctest: +SKIP
        nan
        """
        axis = validate_axis(axis)
        warnings.warn(
            "Default value of `numeric_only` will be changed to `False` "
            "instead of `None` in 4.0.0.",
            FutureWarning,
        )

        if numeric_only is None and axis == 0:
            numeric_only = True
        elif numeric_only is True and axis == 1:
            numeric_only = None

        def prod(psser: "Series") -> Column:
            spark_type = psser.spark.data_type
            spark_column = psser.spark.column
            if isinstance(spark_type, BooleanType):
                spark_column = spark_column.cast(LongType())
            elif not isinstance(spark_type, NumericType):
                raise TypeError(
                    "Could not convert {} ({}) to numeric".format(
                        spark_type_to_pandas_dtype(spark_type), spark_type.simpleString()
                    )
                )

            return SF.product(spark_column, skipna)

        return self._reduce_for_stat_function(
            prod,
            name="prod",
            axis=axis,
            numeric_only=numeric_only,
            min_count=min_count,
            skipna=skipna,
        )

    prod = product
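The docstring note above says the product is emulated with the ``exp(sum(log(...)))`` trick. A hedged illustration of that identity in plain Spark SQL terms (not the exact `SF.product` helper used above), which also shows why the note restricts it to positive numbers, since `log` is undefined at or below zero:

    from pyspark.sql import SparkSession, functions as F

    spark = SparkSession.builder.getOrCreate()
    sdf = spark.createDataFrame([(1.0,), (2.0,), (5.0,)], ["x"])
    # exp(sum(log(x))) == 1 * 2 * 5 == 10 for strictly positive x.
    sdf.select(F.exp(F.sum(F.log("x"))).alias("product")).show()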
    def skew(
        self, axis: Optional[Axis] = None, skipna: bool = True, numeric_only: bool = None
    ) -> Union[Scalar, "Series"]:
        """
        Return unbiased skew normalized by N-1.

        Parameters
        ----------
        axis: {index (0), columns (1)}
            Axis for the function to be applied on.
        skipna: bool, default True
            Exclude NA/null values when computing the result.

            .. versionchanged:: 3.4.0
               Supported including NA/null values.
        numeric_only: bool, default None
            Include only float, int, boolean columns. False is not supported. This parameter
            is mainly for pandas compatibility.

        Returns
        -------
        skew: scalar for a Series, and a Series for a DataFrame.

        Examples
        --------

        >>> df = ps.DataFrame({'a': [1, 2, 3, np.nan], 'b': [0.1, 0.2, 0.3, np.nan]},
        ...                   columns=['a', 'b'])

        On a DataFrame:

        >>> df.skew()
        a    0.0
        b    0.0
        dtype: float64

        On a Series:

        >>> df['a'].skew()
        0.0
        """
        axis = validate_axis(axis)

        if numeric_only is None and axis == 0:
            numeric_only = True

        def skew(psser: "Series") -> Column:
            spark_type = psser.spark.data_type
            spark_column = psser.spark.column
            if isinstance(spark_type, BooleanType):
                spark_column = spark_column.cast(LongType())
            elif not isinstance(spark_type, NumericType):
                raise TypeError(
                    "Could not convert {} ({}) to numeric".format(
                        spark_type_to_pandas_dtype(spark_type), spark_type.simpleString()
                    )
                )

            return SF.skew(spark_column)

        return self._reduce_for_stat_function(
            skew,
            name="skew",
            axis=axis,
            numeric_only=numeric_only,
            skipna=skipna,
        )
    def kurtosis(
        self, axis: Optional[Axis] = None, skipna: bool = True, numeric_only: bool = None
    ) -> Union[Scalar, "Series"]:
        """
        Return unbiased kurtosis using Fisher’s definition of kurtosis (kurtosis of normal == 0.0).
        Normalized by N-1.

        Parameters
        ----------
        axis: {index (0), columns (1)}
            Axis for the function to be applied on.
        skipna: bool, default True
            Exclude NA/null values when computing the result.

            .. versionchanged:: 3.4.0
               Supported including NA/null values.
        numeric_only: bool, default None
            Include only float, int, boolean columns. False is not supported. This parameter
            is mainly for pandas compatibility.

        Returns
        -------
        kurt: scalar for a Series, and a Series for a DataFrame.

        Examples
        --------

        >>> df = ps.DataFrame({'a': [1, 2, 3, np.nan, 6], 'b': [0.1, 0.2, 0.3, np.nan, 0.8]},
        ...                   columns=['a', 'b'])

        On a DataFrame:

        >>> df.kurtosis()
        a    1.500000
        b    2.703924
        dtype: float64

        On a Series:

        >>> df['a'].kurtosis()
        1.5
        """
        axis = validate_axis(axis)

        if numeric_only is None and axis == 0:
            numeric_only = True

        def kurtosis(psser: "Series") -> Column:
            spark_type = psser.spark.data_type
            spark_column = psser.spark.column
            if isinstance(spark_type, BooleanType):
                spark_column = spark_column.cast(LongType())
            elif not isinstance(spark_type, NumericType):
                raise TypeError(
                    "Could not convert {} ({}) to numeric".format(
                        spark_type_to_pandas_dtype(spark_type), spark_type.simpleString()
                    )
                )

            return SF.kurt(spark_column)

        return self._reduce_for_stat_function(
            kurtosis,
            name="kurtosis",
            axis=axis,
            numeric_only=numeric_only,
            skipna=skipna,
        )

    kurt = kurtosis
    def min(
        self, axis: Optional[Axis] = None, skipna: bool = True, numeric_only: bool = None
    ) -> Union[Scalar, "Series"]:
        """
        Return the minimum of the values.

        Parameters
        ----------
        axis: {index (0), columns (1)}
            Axis for the function to be applied on.
        skipna: bool, default True
            Exclude NA/null values when computing the result.

            .. versionchanged:: 3.4.0
               Supported including NA/null values.
        numeric_only: bool, default None
            If True, include only float, int, boolean columns. This parameter is mainly for
            pandas compatibility. False is supported; however, the columns should
            be all numeric or all non-numeric.

        Returns
        -------
        min: scalar for a Series, and a Series for a DataFrame.

        Examples
        --------

        >>> df = ps.DataFrame({'a': [1, 2, 3, np.nan], 'b': [0.1, 0.2, 0.3, np.nan]},
        ...                   columns=['a', 'b'])

        On a DataFrame:

        >>> df.min()
        a    1.0
        b    0.1
        dtype: float64

        >>> df.min(axis=1)
        0    0.1
        1    0.2
        2    0.3
        3    NaN
        dtype: float64

        On a Series:

        >>> df['a'].min()
        1.0
        """
        axis = validate_axis(axis)

        if numeric_only is None and axis == 0:
            numeric_only = True
        elif numeric_only is True and axis == 1:
            numeric_only = None

        return self._reduce_for_stat_function(
            lambda psser: F.min(psser.spark.column),
            name="min",
            axis=axis,
            numeric_only=numeric_only,
            skipna=skipna,
        )
    def max(
        self, axis: Optional[Axis] = None, skipna: bool = True, numeric_only: bool = None
    ) -> Union[Scalar, "Series"]:
        """
        Return the maximum of the values.

        Parameters
        ----------
        axis: {index (0), columns (1)}
            Axis for the function to be applied on.
        skipna: bool, default True
            Exclude NA/null values when computing the result.

            .. versionchanged:: 3.4.0
               Supported including NA/null values.
        numeric_only: bool, default None
            If True, include only float, int, boolean columns. This parameter is mainly for
            pandas compatibility. False is supported; however, the columns should
            be all numeric or all non-numeric.

        Returns
        -------
        max: scalar for a Series, and a Series for a DataFrame.

        Examples
        --------

        >>> df = ps.DataFrame({'a': [1, 2, 3, np.nan], 'b': [0.1, 0.2, 0.3, np.nan]},
        ...                   columns=['a', 'b'])

        On a DataFrame:

        >>> df.max()
        a    3.0
        b    0.3
        dtype: float64

        >>> df.max(axis=1)
        0    1.0
        1    2.0
        2    3.0
        3    NaN
        dtype: float64

        On a Series:

        >>> df['a'].max()
        3.0
        """
        axis = validate_axis(axis)

        if numeric_only is None and axis == 0:
            numeric_only = True
        elif numeric_only is True and axis == 1:
            numeric_only = None

        return self._reduce_for_stat_function(
            lambda psser: F.max(psser.spark.column),
            name="max",
            axis=axis,
            numeric_only=numeric_only,
            skipna=skipna,
        )
    def count(
        self, axis: Optional[Axis] = None, numeric_only: bool = False
    ) -> Union[Scalar, "Series"]:
        """
        Count non-NA cells for each column.

        The values `None`, `NaN` are considered NA.

        Parameters
        ----------
        axis: {0 or ‘index’, 1 or ‘columns’}, default 0
            If 0 or ‘index’ counts are generated for each column. If 1 or ‘columns’ counts are
            generated for each row.
        numeric_only: bool, default False
            If True, include only float, int, boolean columns. This parameter is mainly for
            pandas compatibility.

        Returns
        -------
        max: scalar for a Series, and a Series for a DataFrame.

        See Also
        --------
        DataFrame.shape: Number of DataFrame rows and columns (including NA
            elements).
        DataFrame.isna: Boolean same-sized DataFrame showing places of NA
            elements.

        Examples
        --------
        Constructing DataFrame from a dictionary:

        >>> df = ps.DataFrame({"Person":
        ...                    ["John", "Myla", "Lewis", "John", "Myla"],
        ...                    "Age": [24., np.nan, 21., 33, 26],
        ...                    "Single": [False, True, True, True, False]},
        ...                   columns=["Person", "Age", "Single"])
        >>> df
          Person   Age  Single
        0   John  24.0   False
        1   Myla   NaN    True
        2  Lewis  21.0    True
        3   John  33.0    True
        4   Myla  26.0   False

        Notice the uncounted NA values:

        >>> df.count()
        Person    5
        Age       4
        Single    5
        dtype: int64

        >>> df.count(axis=1)
        0    3
        1    2
        2    3
        3    3
        4    3
        dtype: int64

        On a Series:

        >>> df['Person'].count()
        5

        >>> df['Age'].count()
        4
        """

        return self._reduce_for_stat_function(
            Frame._count_expr, name="count", axis=axis, numeric_only=numeric_only
        )
    def std(
        self,
        axis: Optional[Axis] = None,
        skipna: bool = True,
        ddof: int = 1,
        numeric_only: bool = None,
    ) -> Union[Scalar, "Series"]:
        """
        Return sample standard deviation.

        .. versionadded:: 3.3.0

        Parameters
        ----------
        axis: {index (0), columns (1)}
            Axis for the function to be applied on.
        skipna: bool, default True
            Exclude NA/null values when computing the result.

            .. versionchanged:: 3.4.0
               Supported including NA/null values.
        ddof: int, default 1
            Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
            where N represents the number of elements.

            .. versionchanged:: 3.4.0
               Supported including arbitary integers.
        numeric_only: bool, default None
            Include only float, int, boolean columns. False is not supported. This parameter
            is mainly for pandas compatibility.

        Returns
        -------
        std: scalar for a Series, and a Series for a DataFrame.

        Examples
        --------

        >>> df = ps.DataFrame({'a': [1, 2, 3, np.nan], 'b': [0.1, 0.2, 0.3, np.nan]},
        ...                   columns=['a', 'b'])

        On a DataFrame:

        >>> df.std()
        a    1.0
        b    0.1
        dtype: float64

        >>> df.std(ddof=2)
        a    1.414214
        b    0.141421
        dtype: float64

        >>> df.std(axis=1)
        0    0.636396
        1    1.272792
        2    1.909188
        3         NaN
        dtype: float64

        >>> df.std(ddof=0)
        a    0.816497
        b    0.081650
        dtype: float64

        On a Series:

        >>> df['a'].std()
        1.0

        >>> df['a'].std(ddof=0)
        0.816496580927726

        >>> df['a'].std(ddof=-1)
        0.707106...
        """
        if not isinstance(ddof, int):
            raise TypeError("ddof must be integer")

        axis = validate_axis(axis)

        if numeric_only is None and axis == 0:
            numeric_only = True

        def std(psser: "Series") -> Column:
            spark_type = psser.spark.data_type
            spark_column = psser.spark.column
            if isinstance(spark_type, BooleanType):
                spark_column = spark_column.cast(LongType())
            elif not isinstance(spark_type, NumericType):
                raise TypeError(
                    "Could not convert {} ({}) to numeric".format(
                        spark_type_to_pandas_dtype(spark_type), spark_type.simpleString()
                    )
                )
            return SF.stddev(spark_column, ddof)

        return self._reduce_for_stat_function(
            std, name="std", axis=axis, numeric_only=numeric_only, ddof=ddof, skipna=skipna
        )
    def var(
        self, axis: Optional[Axis] = None, ddof: int = 1, numeric_only: bool = None
    ) -> Union[Scalar, "Series"]:
        """
        Return unbiased variance.

        .. versionadded:: 3.3.0

        Parameters
        ----------
        axis: {index (0), columns (1)}
            Axis for the function to be applied on.
        ddof: int, default 1
            Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
            where N represents the number of elements.

            .. versionchanged:: 3.4.0
               Supported including arbitary integers.
        numeric_only: bool, default None
            Include only float, int, boolean columns. False is not supported. This parameter
            is mainly for pandas compatibility.

        Returns
        -------
        var: scalar for a Series, and a Series for a DataFrame.

        Examples
        --------

        >>> df = ps.DataFrame({'a': [1, 2, 3, np.nan], 'b': [0.1, 0.2, 0.3, np.nan]},
        ...                   columns=['a', 'b'])

        On a DataFrame:

        >>> df.var()
        a    1.00
        b    0.01
        dtype: float64

        >>> df.var(ddof=2)
        a    2.00
        b    0.02
        dtype: float64

        >>> df.var(axis=1)
        0    0.405
        1    1.620
        2    3.645
        3      NaN
        dtype: float64

        >>> df.var(ddof=0)
        a    0.666667
        b    0.006667
        dtype: float64

        On a Series:

        >>> df['a'].var()
        1.0

        >>> df['a'].var(ddof=0)
        0.6666666666666666

        >>> df['a'].var(ddof=-2)
        0.4
        """
        if not isinstance(ddof, int):
            raise TypeError("ddof must be integer")

        axis = validate_axis(axis)

        if numeric_only is None and axis == 0:
            numeric_only = True

        def var(psser: "Series") -> Column:
            spark_type = psser.spark.data_type
            spark_column = psser.spark.column
            if isinstance(spark_type, BooleanType):
                spark_column = spark_column.cast(LongType())
            elif not isinstance(spark_type, NumericType):
                raise TypeError(
                    "Could not convert {} ({}) to numeric".format(
                        spark_type_to_pandas_dtype(spark_type), spark_type.simpleString()
                    )
                )
            return SF.var(spark_column, ddof)

        return self._reduce_for_stat_function(
            var, name="var", axis=axis, numeric_only=numeric_only, ddof=ddof
        )
    def median(
        self,
        axis: Optional[Axis] = None,
        skipna: bool = True,
        numeric_only: bool = None,
        accuracy: int = 10000,
    ) -> Union[Scalar, "Series"]:
        """
        Return the median of the values for the requested axis.

        .. note:: Unlike pandas', the median in pandas-on-Spark is an approximated median based upon
            approximate percentile computation because computing median across a large dataset
            is extremely expensive.

        Parameters
        ----------
        axis: {index (0), columns (1)}
            Axis for the function to be applied on.
        skipna: bool, default True
            Exclude NA/null values when computing the result.

            .. versionchanged:: 3.4.0
               Supported including NA/null values.
        numeric_only: bool, default None
            Include only float, int, boolean columns. False is not supported. This parameter
            is mainly for pandas compatibility.
        accuracy: int, optional
            Default accuracy of approximation. Larger value means better accuracy.
            The relative error can be deduced by 1.0 / accuracy.

        Returns
        -------
        median: scalar or Series

        Examples
        --------
        >>> df = ps.DataFrame({
        ...     'a': [24., 21., 25., 33., 26.], 'b': [1, 2, 3, 4, 5]}, columns=['a', 'b'])
        >>> df
              a  b
        0  24.0  1
        1  21.0  2
        2  25.0  3
        3  33.0  4
        4  26.0  5

        On a DataFrame:

        >>> df.median()
        a    25.0
        b     3.0
        dtype: float64

        On a Series:

        >>> df['a'].median()
        25.0
        >>> (df['b'] + 100).median()
        103.0

        For multi-index columns,

        >>> df.columns = pd.MultiIndex.from_tuples([('x', 'a'), ('y', 'b')])
        >>> df
              x  y
              a  b
        0  24.0  1
        1  21.0  2
        2  25.0  3
        3  33.0  4
        4  26.0  5

        On a DataFrame:

        >>> df.median()
        x  a    25.0
        y  b     3.0
        dtype: float64

        >>> df.median(axis=1)
        0    12.5
        1    11.5
        2    14.0
        3    18.5
        4    15.5
        dtype: float64

        On a Series:

        >>> df[('x', 'a')].median()
        25.0
        >>> (df[('y', 'b')] + 100).median()
        103.0
        """
        axis = validate_axis(axis)

        if numeric_only is None and axis == 0:
            numeric_only = True

        if not isinstance(accuracy, int):
            raise TypeError(
                "accuracy must be an integer; however, got [%s]" % type(accuracy).__name__
            )

        def median(psser: "Series") -> Column:
            spark_type = psser.spark.data_type
            spark_column = psser.spark.column
            if isinstance(spark_type, (BooleanType, NumericType)):
                return F.percentile_approx(spark_column.cast(DoubleType()), 0.5, accuracy)
            else:
                raise TypeError(
                    "Could not convert {} ({}) to numeric".format(
                        spark_type_to_pandas_dtype(spark_type), spark_type.simpleString()
                    )
                )

        return self._reduce_for_stat_function(
            median,
            name="median",
            numeric_only=numeric_only,
            axis=axis,
            skipna=skipna,
        )
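The per-column reduction above is just Spark's `percentile_approx` at the 0.5 quantile, which is where the "approximated median" note comes from. A hedged sketch of the same expression in plain PySpark, assuming a running Spark session:

    from pyspark.sql import SparkSession, functions as F

    spark = SparkSession.builder.getOrCreate()
    sdf = spark.createDataFrame([(24.0,), (21.0,), (25.0,), (33.0,), (26.0,)], ["a"])
    # Approximate median: larger accuracy lowers the relative error (roughly 1.0 / accuracy)
    # at the cost of more memory during aggregation.
    sdf.select(F.percentile_approx("a", 0.5, 10000).alias("median")).show()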
    def sem(
        self,
        axis: Optional[Axis] = None,
        skipna: bool = True,
        ddof: int = 1,
        numeric_only: bool = None,
    ) -> Union[Scalar, "Series"]:
        """
        Return unbiased standard error of the mean over requested axis.

        .. versionadded:: 3.3.0

        Parameters
        ----------
        axis: {index (0), columns (1)}
            Axis for the function to be applied on.
        skipna: bool, default True
            Exclude NA/null values when computing the result.

            .. versionchanged:: 3.4.0
               Supported including NA/null values.
        ddof: int, default 1
            Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
            where N represents the number of elements.

            .. versionchanged:: 3.4.0
               Supported including arbitary integers.
        numeric_only: bool, default None
            Include only float, int, boolean columns. False is not supported. This parameter
            is mainly for pandas compatibility.

        Returns
        -------
        scalar(for Series) or Series(for DataFrame)

        Examples
        --------
        >>> psdf = ps.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
        >>> psdf
           a  b
        0  1  4
        1  2  5
        2  3  6

        >>> psdf.sem()
        a    0.57735
        b    0.57735
        dtype: float64

        >>> psdf.sem(ddof=0)
        a    0.471405
        b    0.471405
        dtype: float64

        >>> psdf.sem(ddof=2)
        a    0.816497
        b    0.816497
        dtype: float64

        >>> psdf.sem(axis=1)
        0    1.5
        1    1.5
        2    1.5
        dtype: float64

        Support for Series

        >>> psser = psdf.a
        >>> psser
        0    1
        1    2
        2    3
        Name: a, dtype: int64

        >>> psser.sem()
        0.5773502691896258

        >>> psser.sem(ddof=0)
        0.47140452079103173
        """
        if not isinstance(ddof, int):
            raise TypeError("ddof must be integer")

        axis = validate_axis(axis)

        if numeric_only is None and axis == 0:
            numeric_only = True

        def std(psser: "Series") -> Column:
            spark_type = psser.spark.data_type
            spark_column = psser.spark.column
            if isinstance(spark_type, BooleanType):
                spark_column = spark_column.cast(LongType())
            elif not isinstance(spark_type, NumericType):
                raise TypeError(
                    "Could not convert {} ({}) to numeric".format(
                        spark_type_to_pandas_dtype(spark_type), spark_type.simpleString()
                    )
                )
            return SF.stddev(spark_column, ddof)

        def sem(psser: "Series") -> Column:
            return std(psser) / F.sqrt(Frame._count_expr(psser))

        return self._reduce_for_stat_function(
            sem,
            name="sem",
            numeric_only=numeric_only,
            axis=axis,
            ddof=ddof,
            skipna=skipna,
        )
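The inner `sem` helper above is simply the column's standard deviation (with the given `ddof`) divided by the square root of the non-null count. A quick pure-Python check of that formula against the doctest value, no Spark required:

    import math

    values = [1, 2, 3]
    n = len(values)
    mean = sum(values) / n
    std = math.sqrt(sum((v - mean) ** 2 for v in values) / (n - 1))  # ddof=1
    print(std / math.sqrt(n))  # 0.5773502691896258, matching the psdf.sem() example above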
    @property
    def size(self) -> int:
        """
        Return an int representing the number of elements in this object.

        Return the number of rows if Series. Otherwise return the number of
        rows times number of columns if DataFrame.

        Examples
        --------
        >>> s = ps.Series({'a': 1, 'b': 2, 'c': None})
        >>> s.size
        3

        >>> df = ps.DataFrame({'col1': [1, 2, None], 'col2': [3, 4, None]})
        >>> df.size
        6

        >>> df = ps.DataFrame(index=[1, 2, None])
        >>> df.size
        0
        """
        num_columns = len(self._internal.data_spark_columns)
        if num_columns == 0:
            return 0
        else:
            return len(self) * num_columns  # type: ignore[arg-type]
    def abs(self: FrameLike) -> FrameLike:
        """
        Return a Series/DataFrame with absolute numeric value of each element.

        Returns
        -------
        abs: Series/DataFrame containing the absolute value of each element.

        Examples
        --------

        Absolute numeric values in a Series.

        >>> s = ps.Series([-1.10, 2, -3.33, 4])
        >>> s.abs()
        0    1.10
        1    2.00
        2    3.33
        3    4.00
        dtype: float64

        Absolute numeric values in a DataFrame.

        >>> df = ps.DataFrame({
        ...     'a': [4, 5, 6, 7],
        ...     'b': [10, 20, 30, 40],
        ...     'c': [100, 50, -30, -50]
        ...   },
        ...   columns=['a', 'b', 'c'])
        >>> df.abs()
           a   b    c
        0  4  10  100
        1  5  20   50
        2  6  30   30
        3  7  40   50
        """

        def abs(psser: "Series") -> Union["Series", Column]:
            if isinstance(psser.spark.data_type, BooleanType):
                return psser
            elif isinstance(psser.spark.data_type, NumericType):
                return psser._with_new_scol(
                    F.abs(psser.spark.column), field=psser._internal.data_fields[0]
                )
            else:
                raise TypeError(
                    "bad operand type for abs(): {} ({})".format(
                        spark_type_to_pandas_dtype(psser.spark.data_type),
                        psser.spark.data_type.simpleString(),
                    )
                )

        return self._apply_series_op(abs)
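Note the per-column behavior above: boolean columns are passed through unchanged, numeric columns get Spark's `F.abs`, and anything else raises a TypeError. A tiny sketch, assuming the vendored module mirrors `pyspark.pandas`:

    import pyspark.pandas as ps  # assumption: vendored module mirrors pyspark.pandas

    psdf = ps.DataFrame({"n": [-1, 2, -3], "flag": [True, False, True]})
    psdf.abs()  # 'n' becomes [1, 2, 3]; the boolean 'flag' column is returned as-is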
    # TODO: by argument only support the grouping name and as_index only for now. Documentation
    # should be updated when it's supported.
    def groupby(
        self: FrameLike,
        by: Union[Name, "Series", List[Union[Name, "Series"]]],
        axis: Axis = 0,
        as_index: bool = True,
        dropna: bool = True,
    ) -> "GroupBy[FrameLike]":
        """
        Group DataFrame or Series using one or more columns.

        A groupby operation involves some combination of splitting the
        object, applying a function, and combining the results. This can be
        used to group large amounts of data and compute operations on these
        groups.

        Parameters
        ----------
        by: Series, label, or list of labels
            Used to determine the groups for the groupby.
            If Series is passed, the Series or dict VALUES
            will be used to determine the groups. A label or list of
            labels may be passed to group by the columns in ``self``.
        axis: int, default 0 or 'index'
            Can only be set to 0 now.
        as_index: bool, default True
            For aggregated output, return object with group labels as the
            index. Only relevant for DataFrame input. as_index=False is
            effectively "SQL-style" grouped output.
        dropna: bool, default True
            If True, and if group keys contain NA values,
            NA values together with row/column will be dropped.
            If False, NA values will also be treated as the key in groups.

        Returns
        -------
        DataFrameGroupBy or SeriesGroupBy
            Depends on the calling object and returns groupby object that
            contains information about the groups.

        See Also
        --------
        pyspark.pandas.groupby.GroupBy

        Examples
        --------
        >>> df = ps.DataFrame({'Animal': ['Falcon', 'Falcon',
        ...                               'Parrot', 'Parrot'],
        ...                    'Max Speed': [380., 370., 24., 26.]},
        ...                   columns=['Animal', 'Max Speed'])
        >>> df
           Animal  Max Speed
        0  Falcon      380.0
        1  Falcon      370.0
        2  Parrot       24.0
        3  Parrot       26.0

        >>> df.groupby(['Animal']).mean().sort_index()  # doctest: +NORMALIZE_WHITESPACE
                Max Speed
        Animal
        Falcon      375.0
        Parrot       25.0

        >>> df.groupby(['Animal'], as_index=False).mean().sort_values('Animal')
        ... # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
           Animal  Max Speed
        ...Falcon      375.0
        ...Parrot       25.0

        We can also choose to include NA in group keys or not by setting dropna parameter,
        the default setting is True:

        >>> l = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]]
        >>> df = ps.DataFrame(l, columns=["a", "b", "c"])
        >>> df.groupby(by=["b"]).sum().sort_index()  # doctest: +NORMALIZE_WHITESPACE
             a  c
        b
        1.0  2  3
        2.0  2  5

        >>> df.groupby(by=["b"], dropna=False).sum().sort_index()  # doctest: +NORMALIZE_WHITESPACE
             a  c
        b
        1.0  2  3
        2.0  2  5
        NaN  1  4
        """
        new_by: List[Union[Label, ps.Series]]
        if isinstance(by, ps.DataFrame):
            raise ValueError("Grouper for '{}' not 1-dimensional".format(type(by).__name__))
        elif isinstance(by, ps.Series):
            new_by = [by]
        elif is_name_like_tuple(by):
            if isinstance(self, ps.Series):
                raise KeyError(by)
            new_by = [cast(Label, by)]
        elif is_name_like_value(by):
            if isinstance(self, ps.Series):
                raise KeyError(by)
            new_by = [cast(Label, (by,))]
        elif is_list_like(by):
            new_by = []
            for key in by:
                if isinstance(key, ps.DataFrame):
                    raise ValueError(
                        "Grouper for '{}' not 1-dimensional".format(type(key).__name__)
                    )
                elif isinstance(key, ps.Series):
                    new_by.append(key)
                elif is_name_like_tuple(key):
                    if isinstance(self, ps.Series):
                        raise KeyError(key)
                    new_by.append(cast(Label, key))
                elif is_name_like_value(key):
                    if isinstance(self, ps.Series):
                        raise KeyError(key)
                    new_by.append(cast(Label, (key,)))
                else:
                    raise ValueError(
                        "Grouper for '{}' not 1-dimensional".format(type(key).__name__)
                    )
        else:
            raise ValueError("Grouper for '{}' not 1-dimensional".format(type(by).__name__))
        if not len(new_by):
            raise ValueError("No group keys passed!")
        axis = validate_axis(axis)
        if axis != 0:
            raise NotImplementedError('axis should be either 0 or "index" currently.')

        return self._build_groupby(by=new_by, as_index=as_index, dropna=dropna)
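After normalizing `by` into labels or Series, the method delegates to the abstract `_build_groupby`; the `dropna` flag decides whether rows with NA keys form their own group, as in the docstring. A small sketch, assuming the vendored module mirrors `pyspark.pandas`:

    import pyspark.pandas as ps  # assumption: vendored module mirrors pyspark.pandas

    psdf = ps.DataFrame({"b": [1.0, None, 2.0, 1.0], "c": [3, 4, 3, 2]})
    psdf.groupby("b").sum().sort_index()                # rows with NA in 'b' are dropped
    psdf.groupby("b", dropna=False).sum().sort_index()  # NA becomes its own group key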
    @abstractmethod
    def _build_groupby(
        self: FrameLike, by: List[Union["Series", Label]], as_index: bool, dropna: bool
    ) -> "GroupBy[FrameLike]":
        pass
    def bool(self) -> bool:
        """
        Return the bool of a single element in the current object.

        This must be a boolean scalar value, either True or False. Raise a ValueError if
        the object does not have exactly 1 element, or that element is not boolean

        Returns
        -------
        bool

        Examples
        --------
        >>> ps.DataFrame({'a': [True]}).bool()
        True

        >>> ps.Series([False]).bool()
        False

        If there are non-boolean or multiple values exist, it raises an exception in all
        cases as below.

        >>> ps.DataFrame({'a': ['a']}).bool()
        Traceback (most recent call last):
          ...
        ValueError: bool cannot act on a non-boolean single element DataFrame

        >>> ps.DataFrame({'a': [True], 'b': [False]}).bool()  # doctest: +NORMALIZE_WHITESPACE
        Traceback (most recent call last):
          ...
        ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(),
        a.item(), a.any() or a.all().

        >>> ps.Series([1]).bool()
        Traceback (most recent call last):
          ...
        ValueError: bool cannot act on a non-boolean single element DataFrame
        """
        if isinstance(self, ps.DataFrame):
            df = self
        elif isinstance(self, ps.Series):
            df = self.to_dataframe()
        return df.head(2)._to_internal_pandas().bool()
def first_valid_index(self) -> Optional[Union[Scalar, Tuple[Scalar, ...]]]:
|
|
2494
|
+
"""
|
|
2495
|
+
Retrieves the index of the first valid value.
|
|
2496
|
+
|
|
2497
|
+
Returns
|
|
2498
|
+
-------
|
|
2499
|
+
scalar, tuple, or None
|
|
2500
|
+
|
|
2501
|
+
Examples
|
|
2502
|
+
--------
|
|
2503
|
+
|
|
2504
|
+
Support for DataFrame
|
|
2505
|
+
|
|
2506
|
+
>>> psdf = ps.DataFrame({'a': [None, 2, 3, 2],
|
|
2507
|
+
... 'b': [None, 2.0, 3.0, 1.0],
|
|
2508
|
+
... 'c': [None, 200, 400, 200]},
|
|
2509
|
+
... index=['Q', 'W', 'E', 'R'])
|
|
2510
|
+
>>> psdf
|
|
2511
|
+
a b c
|
|
2512
|
+
Q NaN NaN NaN
|
|
2513
|
+
W 2.0 2.0 200.0
|
|
2514
|
+
E 3.0 3.0 400.0
|
|
2515
|
+
R 2.0 1.0 200.0
|
|
2516
|
+
|
|
2517
|
+
>>> psdf.first_valid_index()
|
|
2518
|
+
'W'
|
|
2519
|
+
|
|
2520
|
+
Support for MultiIndex columns
|
|
2521
|
+
|
|
2522
|
+
>>> psdf.columns = pd.MultiIndex.from_tuples([('a', 'x'), ('b', 'y'), ('c', 'z')])
|
|
2523
|
+
>>> psdf
|
|
2524
|
+
a b c
|
|
2525
|
+
x y z
|
|
2526
|
+
Q NaN NaN NaN
|
|
2527
|
+
W 2.0 2.0 200.0
|
|
2528
|
+
E 3.0 3.0 400.0
|
|
2529
|
+
R 2.0 1.0 200.0
|
|
2530
|
+
|
|
2531
|
+
>>> psdf.first_valid_index()
|
|
2532
|
+
'W'
|
|
2533
|
+
|
|
2534
|
+
Support for Series.
|
|
2535
|
+
|
|
2536
|
+
>>> s = ps.Series([None, None, 3, 4, 5], index=[100, 200, 300, 400, 500])
|
|
2537
|
+
>>> s
|
|
2538
|
+
100 NaN
|
|
2539
|
+
200 NaN
|
|
2540
|
+
300 3.0
|
|
2541
|
+
400 4.0
|
|
2542
|
+
500 5.0
|
|
2543
|
+
dtype: float64
|
|
2544
|
+
|
|
2545
|
+
>>> s.first_valid_index()
|
|
2546
|
+
300
|
|
2547
|
+
|
|
2548
|
+
Support for MultiIndex
|
|
2549
|
+
|
|
2550
|
+
>>> midx = pd.MultiIndex([['lama', 'cow', 'falcon'],
|
|
2551
|
+
... ['speed', 'weight', 'length']],
|
|
2552
|
+
... [[0, 0, 0, 1, 1, 1, 2, 2, 2],
|
|
2553
|
+
... [0, 1, 2, 0, 1, 2, 0, 1, 2]])
|
|
2554
|
+
>>> s = ps.Series([None, None, None, None, 250, 1.5, 320, 1, 0.3], index=midx)
|
|
2555
|
+
>>> s
|
|
2556
|
+
lama speed NaN
|
|
2557
|
+
weight NaN
|
|
2558
|
+
length NaN
|
|
2559
|
+
cow speed NaN
|
|
2560
|
+
weight 250.0
|
|
2561
|
+
length 1.5
|
|
2562
|
+
falcon speed 320.0
|
|
2563
|
+
weight 1.0
|
|
2564
|
+
length 0.3
|
|
2565
|
+
dtype: float64
|
|
2566
|
+
|
|
2567
|
+
>>> s.first_valid_index()
|
|
2568
|
+
('cow', 'weight')
|
|
2569
|
+
"""
|
|
2570
|
+
data_spark_columns = self._internal.data_spark_columns
|
|
2571
|
+
|
|
2572
|
+
if len(data_spark_columns) == 0:
|
|
2573
|
+
return None
|
|
2574
|
+
|
|
2575
|
+
cond = reduce(lambda x, y: x & y, map(lambda x: x.isNotNull(), data_spark_columns))
|
|
2576
|
+
|
|
2577
|
+
with sql_conf({SPARK_CONF_ARROW_ENABLED: False}):
|
|
2578
|
+
# Disable Arrow to keep row ordering.
|
|
2579
|
+
first_valid_row = (
|
|
2580
|
+
self._internal.spark_frame.filter(cond)
|
|
2581
|
+
.select(self._internal.index_spark_columns)
|
|
2582
|
+
.limit(1)
|
|
2583
|
+
.toPandas()
|
|
2584
|
+
)
|
|
2585
|
+
|
|
2586
|
+
# For Empty Series or DataFrame, returns None.
|
|
2587
|
+
if len(first_valid_row) == 0:
|
|
2588
|
+
return None
|
|
2589
|
+
|
|
2590
|
+
first_valid_row = first_valid_row.iloc[0]
|
|
2591
|
+
if len(first_valid_row) == 1:
|
|
2592
|
+
return first_valid_row.iloc[0]
|
|
2593
|
+
else:
|
|
2594
|
+
return tuple(first_valid_row)
|
|
2595
|
+
|
|
2596
|
+
def last_valid_index(self) -> Optional[Union[Scalar, Tuple[Scalar, ...]]]:
|
|
2597
|
+
"""
|
|
2598
|
+
Return index for last non-NA/null value.
|
|
2599
|
+
|
|
2600
|
+
Returns
|
|
2601
|
+
-------
|
|
2602
|
+
scalar, tuple, or None
|
|
2603
|
+
|
|
2604
|
+
Notes
|
|
2605
|
+
-----
|
|
2606
|
+
This API only works with PySpark >= 3.0.
|
|
2607
|
+
|
|
2608
|
+
Examples
|
|
2609
|
+
--------
|
|
2610
|
+
|
|
2611
|
+
Support for DataFrame
|
|
2612
|
+
|
|
2613
|
+
>>> psdf = ps.DataFrame({'a': [1, 2, 3, None],
|
|
2614
|
+
... 'b': [1.0, 2.0, 3.0, None],
|
|
2615
|
+
... 'c': [100, 200, 400, None]},
|
|
2616
|
+
... index=['Q', 'W', 'E', 'R'])
|
|
2617
|
+
>>> psdf
|
|
2618
|
+
a b c
|
|
2619
|
+
Q 1.0 1.0 100.0
|
|
2620
|
+
W 2.0 2.0 200.0
|
|
2621
|
+
E 3.0 3.0 400.0
|
|
2622
|
+
R NaN NaN NaN
|
|
2623
|
+
|
|
2624
|
+
>>> psdf.last_valid_index() # doctest: +SKIP
|
|
2625
|
+
'E'
|
|
2626
|
+
|
|
2627
|
+
Support for MultiIndex columns
|
|
2628
|
+
|
|
2629
|
+
>>> psdf.columns = pd.MultiIndex.from_tuples([('a', 'x'), ('b', 'y'), ('c', 'z')])
|
|
2630
|
+
>>> psdf
|
|
2631
|
+
a b c
|
|
2632
|
+
x y z
|
|
2633
|
+
Q 1.0 1.0 100.0
|
|
2634
|
+
W 2.0 2.0 200.0
|
|
2635
|
+
E 3.0 3.0 400.0
|
|
2636
|
+
R NaN NaN NaN
|
|
2637
|
+
|
|
2638
|
+
>>> psdf.last_valid_index() # doctest: +SKIP
|
|
2639
|
+
'E'
|
|
2640
|
+
|
|
2641
|
+
Support for Series.
|
|
2642
|
+
|
|
2643
|
+
>>> s = ps.Series([1, 2, 3, None, None], index=[100, 200, 300, 400, 500])
|
|
2644
|
+
>>> s
|
|
2645
|
+
100 1.0
|
|
2646
|
+
200 2.0
|
|
2647
|
+
300 3.0
|
|
2648
|
+
400 NaN
|
|
2649
|
+
500 NaN
|
|
2650
|
+
dtype: float64
|
|
2651
|
+
|
|
2652
|
+
>>> s.last_valid_index() # doctest: +SKIP
|
|
2653
|
+
300
|
|
2654
|
+
|
|
2655
|
+
Support for MultiIndex
|
|
2656
|
+
|
|
2657
|
+
>>> midx = pd.MultiIndex([['lama', 'cow', 'falcon'],
|
|
2658
|
+
... ['speed', 'weight', 'length']],
|
|
2659
|
+
... [[0, 0, 0, 1, 1, 1, 2, 2, 2],
|
|
2660
|
+
... [0, 1, 2, 0, 1, 2, 0, 1, 2]])
|
|
2661
|
+
>>> s = ps.Series([250, 1.5, 320, 1, 0.3, None, None, None, None], index=midx)
|
|
2662
|
+
>>> s
|
|
2663
|
+
lama speed 250.0
|
|
2664
|
+
weight 1.5
|
|
2665
|
+
length 320.0
|
|
2666
|
+
cow speed 1.0
|
|
2667
|
+
weight 0.3
|
|
2668
|
+
length NaN
|
|
2669
|
+
falcon speed NaN
|
|
2670
|
+
weight NaN
|
|
2671
|
+
length NaN
|
|
2672
|
+
dtype: float64
|
|
2673
|
+
|
|
2674
|
+
>>> s.last_valid_index() # doctest: +SKIP
|
|
2675
|
+
('cow', 'weight')
|
|
2676
|
+
"""
|
|
2677
|
+
data_spark_columns = self._internal.data_spark_columns
|
|
2678
|
+
|
|
2679
|
+
if len(data_spark_columns) == 0:
|
|
2680
|
+
return None
|
|
2681
|
+
|
|
2682
|
+
cond = reduce(lambda x, y: x & y, map(lambda x: x.isNotNull(), data_spark_columns))
|
|
2683
|
+
|
|
2684
|
+
last_valid_rows = (
|
|
2685
|
+
self._internal.spark_frame.filter(cond)
|
|
2686
|
+
.select(self._internal.index_spark_columns)
|
|
2687
|
+
.tail(1)
|
|
2688
|
+
)
|
|
2689
|
+
|
|
2690
|
+
# For Empty Series or DataFrame, returns None.
|
|
2691
|
+
if len(last_valid_rows) == 0:
|
|
2692
|
+
return None
|
|
2693
|
+
|
|
2694
|
+
last_valid_row = last_valid_rows[0]
|
|
2695
|
+
|
|
2696
|
+
if len(last_valid_row) == 1:
|
|
2697
|
+
return last_valid_row[0]
|
|
2698
|
+
else:
|
|
2699
|
+
return tuple(last_valid_row)
|
|
2700
|
+
|
|
2701
|
+
# TODO: 'center', 'win_type', 'on', 'axis' parameter should be implemented.
|
|
2702
|
+
def rolling(
|
|
2703
|
+
self: FrameLike, window: int, min_periods: Optional[int] = None
|
|
2704
|
+
) -> "Rolling[FrameLike]":
|
|
2705
|
+
"""
|
|
2706
|
+
Provide rolling transformations.
|
|
2707
|
+
|
|
2708
|
+
.. note:: 'min_periods' in pandas-on-Spark works as a fixed window size unlike pandas.
|
|
2709
|
+
Unlike pandas, NA is also counted as the period. This might be changed
|
|
2710
|
+
soon.
|
|
2711
|
+
|
|
2712
|
+
Parameters
|
|
2713
|
+
----------
|
|
2714
|
+
window: int, or offset
|
|
2715
|
+
Size of the moving window.
|
|
2716
|
+
This is the number of observations used for calculating the statistic.
|
|
2717
|
+
Each window will be a fixed size.
|
|
2718
|
+
|
|
2719
|
+
min_periods: int, default None
|
|
2720
|
+
Minimum number of observations in window required to have a value
|
|
2721
|
+
(otherwise result is NA).
|
|
2722
|
+
For a window that is specified by an offset, min_periods will default to 1.
|
|
2723
|
+
Otherwise, min_periods will default to the size of the window.
|
|
2724
|
+
|
|
2725
|
+
Returns
|
|
2726
|
+
-------
|
|
2727
|
+
a Window sub-classed for the operation
|
|
2728
|
+
"""
|
|
2729
|
+
from pyspark.pandas.window import Rolling
|
|
2730
|
+
|
|
2731
|
+
return Rolling(self, window=window, min_periods=min_periods)
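A short usage sketch of the rolling window (hedged: it assumes a working PySpark session; per the note above, min_periods behaves as a fixed window size in pandas-on-Spark):

    import pyspark.pandas as ps

    psser = ps.Series([1, 2, 3, 4, 5])

    # Rolling sum over a window of 2 observations; positions without a full
    # window are NA under the default min_periods (the window size).
    psser.rolling(window=2).sum()

    # min_periods=1 lets partial windows produce a value instead of NA.
    psser.rolling(window=2, min_periods=1).sum()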
|
|
2732
|
+
|
|
2733
|
+
# TODO: 'center' and 'axis' parameter should be implemented.
|
|
2734
|
+
# 'axis' implementation, refer https://github.com/databricks/koalas/pull/607
|
|
2735
|
+
def expanding(self: FrameLike, min_periods: int = 1) -> "Expanding[FrameLike]":
|
|
2736
|
+
"""
|
|
2737
|
+
Provide expanding transformations.
|
|
2738
|
+
|
|
2739
|
+
.. note:: 'min_periods' in pandas-on-Spark works as a fixed window size unlike pandas.
|
|
2740
|
+
Unlike pandas, NA is also counted as the period. This might be changed
|
|
2741
|
+
soon.
|
|
2742
|
+
|
|
2743
|
+
Parameters
|
|
2744
|
+
----------
|
|
2745
|
+
min_periods: int, default 1
|
|
2746
|
+
Minimum number of observations in window required to have a value
|
|
2747
|
+
(otherwise result is NA).
|
|
2748
|
+
|
|
2749
|
+
Returns
|
|
2750
|
+
-------
|
|
2751
|
+
a Window sub-classed for the operation
|
|
2752
|
+
"""
|
|
2753
|
+
from pyspark.pandas.window import Expanding
|
|
2754
|
+
|
|
2755
|
+
return Expanding(self, min_periods=min_periods)
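And an equally small sketch of the expanding window, under the same assumptions as the rolling sketch above:

    import pyspark.pandas as ps

    # Cumulative mean over all observations seen so far; with min_periods=3
    # the first two positions have no value and remain NA.
    ps.Series([1, 2, 3, 4, 5]).expanding(min_periods=3).mean()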
|
|
2756
|
+
|
|
2757
|
+
# TODO: 'adjust', 'axis', 'method' parameter should be implemented.
|
|
2758
|
+
def ewm(
|
|
2759
|
+
self: FrameLike,
|
|
2760
|
+
com: Optional[float] = None,
|
|
2761
|
+
span: Optional[float] = None,
|
|
2762
|
+
halflife: Optional[float] = None,
|
|
2763
|
+
alpha: Optional[float] = None,
|
|
2764
|
+
min_periods: Optional[int] = None,
|
|
2765
|
+
ignore_na: bool_type = False,
|
|
2766
|
+
) -> "ExponentialMoving[FrameLike]":
|
|
2767
|
+
"""
|
|
2768
|
+
Provide exponentially weighted window transformations.
|
|
2769
|
+
|
|
2770
|
+
.. note:: 'min_periods' in pandas-on-Spark works as a fixed window size unlike pandas.
|
|
2771
|
+
Unlike pandas, NA is also counted as the period. This might be changed
|
|
2772
|
+
soon.
|
|
2773
|
+
|
|
2774
|
+
.. versionadded:: 3.4.0
|
|
2775
|
+
|
|
2776
|
+
Parameters
|
|
2777
|
+
----------
|
|
2778
|
+
com: float, optional
|
|
2779
|
+
Specify decay in terms of center of mass.
|
|
2780
|
+
alpha = 1 / (1 + com), for com >= 0.
|
|
2781
|
+
|
|
2782
|
+
span: float, optional
|
|
2783
|
+
Specify decay in terms of span.
|
|
2784
|
+
alpha = 2 / (span + 1), for span >= 1.
|
|
2785
|
+
|
|
2786
|
+
halflife: float, optional
|
|
2787
|
+
Specify decay in terms of half-life.
|
|
2788
|
+
alpha = 1 - exp(-ln(2) / halflife), for halflife > 0.
|
|
2789
|
+
|
|
2790
|
+
alpha: float, optional
|
|
2791
|
+
Specify smoothing factor alpha directly.
|
|
2792
|
+
0 < alpha <= 1.
|
|
2793
|
+
|
|
2794
|
+
min_periods: int, default None
|
|
2795
|
+
Minimum number of observations in window required to have a value
|
|
2796
|
+
(otherwise result is NA).
|
|
2797
|
+
|
|
2798
|
+
ignore_na: bool, default False
|
|
2799
|
+
Ignore missing values when calculating weights.
|
|
2800
|
+
|
|
2801
|
+
- When ``ignore_na=False`` (default), weights are based on absolute positions.
|
|
2802
|
+
For example, the weights of :math:`x_0` and :math:`x_2` used in calculating
|
|
2803
|
+
the final weighted average of [:math:`x_0`, None, :math:`x_2`] are
|
|
2804
|
+
:math:`(1-\alpha)^2` and :math:`1` if ``adjust=True``, and
|
|
2805
|
+
:math:`(1-\alpha)^2` and :math:`\alpha` if ``adjust=False``.
|
|
2806
|
+
|
|
2807
|
+
- When ``ignore_na=True``, weights are based
|
|
2808
|
+
on relative positions. For example, the weights of :math:`x_0` and :math:`x_2`
|
|
2809
|
+
used in calculating the final weighted average of
|
|
2810
|
+
[:math:`x_0`, None, :math:`x_2`] are :math:`1-\alpha` and :math:`1` if
|
|
2811
|
+
``adjust=True``, and :math:`1-\alpha` and :math:`\alpha` if ``adjust=False``.
|
|
2812
|
+
|
|
2813
|
+
Returns
|
|
2814
|
+
-------
|
|
2815
|
+
a Window sub-classed for the operation
|
|
2816
|
+
"""
|
|
2817
|
+
from pyspark.pandas.window import ExponentialMoving
|
|
2818
|
+
|
|
2819
|
+
return ExponentialMoving(
|
|
2820
|
+
self,
|
|
2821
|
+
com=com,
|
|
2822
|
+
span=span,
|
|
2823
|
+
halflife=halflife,
|
|
2824
|
+
alpha=alpha,
|
|
2825
|
+
min_periods=min_periods,
|
|
2826
|
+
ignore_na=ignore_na,
|
|
2827
|
+
)
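The decay parameters of ewm are alternative spellings of one smoothing factor; a plain-Python sketch of the conversions stated in the docstring above (illustrative helper, not part of this module):

    import math

    def alpha_from(com=None, span=None, halflife=None, alpha=None):
        # Exactly one of the four decay parameters is expected, as in ewm().
        if com is not None:
            return 1.0 / (1.0 + com)                  # alpha = 1 / (1 + com), com >= 0
        if span is not None:
            return 2.0 / (span + 1.0)                 # alpha = 2 / (span + 1), span >= 1
        if halflife is not None:
            return 1.0 - math.exp(-math.log(2.0) / halflife)  # halflife > 0
        return alpha                                  # 0 < alpha <= 1, given directly

    # com=1 and span=3 both describe a smoothing factor of 0.5.
    assert math.isclose(alpha_from(com=1.0), alpha_from(span=3.0))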
|
|
2828
|
+
|
|
2829
|
+
def get(self, key: Any, default: Optional[Any] = None) -> Any:
|
|
2830
|
+
"""
|
|
2831
|
+
Get item from object for given key (DataFrame column, Panel slice,
|
|
2832
|
+
etc.). Returns default value if not found.
|
|
2833
|
+
|
|
2834
|
+
Parameters
|
|
2835
|
+
----------
|
|
2836
|
+
key: object
|
|
2837
|
+
|
|
2838
|
+
Returns
|
|
2839
|
+
-------
|
|
2840
|
+
value: same type as items contained in object
|
|
2841
|
+
|
|
2842
|
+
Examples
|
|
2843
|
+
--------
|
|
2844
|
+
>>> df = ps.DataFrame({'x':range(3), 'y':['a','b','b'], 'z':['a','b','b']},
|
|
2845
|
+
... columns=['x', 'y', 'z'], index=[10, 20, 20])
|
|
2846
|
+
>>> df
|
|
2847
|
+
x y z
|
|
2848
|
+
10 0 a a
|
|
2849
|
+
20 1 b b
|
|
2850
|
+
20 2 b b
|
|
2851
|
+
|
|
2852
|
+
>>> df.get('x')
|
|
2853
|
+
10 0
|
|
2854
|
+
20 1
|
|
2855
|
+
20 2
|
|
2856
|
+
Name: x, dtype: int64
|
|
2857
|
+
|
|
2858
|
+
>>> df.get(['x', 'y'])
|
|
2859
|
+
x y
|
|
2860
|
+
10 0 a
|
|
2861
|
+
20 1 b
|
|
2862
|
+
20 2 b
|
|
2863
|
+
|
|
2864
|
+
>>> df.x.get(10)
|
|
2865
|
+
0
|
|
2866
|
+
|
|
2867
|
+
>>> df.x.get(20)
|
|
2868
|
+
20 1
|
|
2869
|
+
20 2
|
|
2870
|
+
Name: x, dtype: int64
|
|
2871
|
+
|
|
2872
|
+
>>> df.x.get(15, -1)
|
|
2873
|
+
-1
|
|
2874
|
+
"""
|
|
2875
|
+
try:
|
|
2876
|
+
return self[key]
|
|
2877
|
+
except (KeyError, ValueError, IndexError):
|
|
2878
|
+
return default
|
|
2879
|
+
|
|
2880
|
+
def squeeze(self, axis: Optional[Axis] = None) -> Union[Scalar, "DataFrame", "Series"]:
|
|
2881
|
+
"""
|
|
2882
|
+
Squeeze 1 dimensional axis objects into scalars.
|
|
2883
|
+
|
|
2884
|
+
Series or DataFrames with a single element are squeezed to a scalar.
|
|
2885
|
+
DataFrames with a single column or a single row are squeezed to a
|
|
2886
|
+
Series. Otherwise the object is unchanged.
|
|
2887
|
+
|
|
2888
|
+
This method is most useful when you don't know if your
|
|
2889
|
+
object is a Series or DataFrame, but you do know it has just a single
|
|
2890
|
+
column. In that case you can safely call `squeeze` to ensure you have a
|
|
2891
|
+
Series.
|
|
2892
|
+
|
|
2893
|
+
Parameters
|
|
2894
|
+
----------
|
|
2895
|
+
axis: {0 or 'index', 1 or 'columns', None}, default None
|
|
2896
|
+
A specific axis to squeeze. By default, all length-1 axes are
|
|
2897
|
+
squeezed.
|
|
2898
|
+
|
|
2899
|
+
Returns
|
|
2900
|
+
-------
|
|
2901
|
+
DataFrame, Series, or scalar
|
|
2902
|
+
The projection after squeezing `axis` or all the axes.
|
|
2903
|
+
|
|
2904
|
+
See Also
|
|
2905
|
+
--------
|
|
2906
|
+
Series.iloc: Integer-location based indexing for selecting scalars.
|
|
2907
|
+
DataFrame.iloc: Integer-location based indexing for selecting Series.
|
|
2908
|
+
Series.to_frame: Inverse of DataFrame.squeeze for a
|
|
2909
|
+
single-column DataFrame.
|
|
2910
|
+
|
|
2911
|
+
Examples
|
|
2912
|
+
--------
|
|
2913
|
+
>>> primes = ps.Series([2, 3, 5, 7])
|
|
2914
|
+
|
|
2915
|
+
Slicing might produce a Series with a single value:
|
|
2916
|
+
|
|
2917
|
+
>>> even_primes = primes[primes % 2 == 0]
|
|
2918
|
+
>>> even_primes
|
|
2919
|
+
0 2
|
|
2920
|
+
dtype: int64
|
|
2921
|
+
|
|
2922
|
+
>>> even_primes.squeeze()
|
|
2923
|
+
2
|
|
2924
|
+
|
|
2925
|
+
Squeezing objects with more than one value in every axis does nothing:
|
|
2926
|
+
|
|
2927
|
+
>>> odd_primes = primes[primes % 2 == 1]
|
|
2928
|
+
>>> odd_primes
|
|
2929
|
+
1 3
|
|
2930
|
+
2 5
|
|
2931
|
+
3 7
|
|
2932
|
+
dtype: int64
|
|
2933
|
+
|
|
2934
|
+
>>> odd_primes.squeeze()
|
|
2935
|
+
1 3
|
|
2936
|
+
2 5
|
|
2937
|
+
3 7
|
|
2938
|
+
dtype: int64
|
|
2939
|
+
|
|
2940
|
+
Squeezing is even more effective when used with DataFrames.
|
|
2941
|
+
|
|
2942
|
+
>>> df = ps.DataFrame([[1, 2], [3, 4]], columns=['a', 'b'])
|
|
2943
|
+
>>> df
|
|
2944
|
+
a b
|
|
2945
|
+
0 1 2
|
|
2946
|
+
1 3 4
|
|
2947
|
+
|
|
2948
|
+
Slicing a single column will produce a DataFrame with the columns
|
|
2949
|
+
having only one value:
|
|
2950
|
+
|
|
2951
|
+
>>> df_a = df[['a']]
|
|
2952
|
+
>>> df_a
|
|
2953
|
+
a
|
|
2954
|
+
0 1
|
|
2955
|
+
1 3
|
|
2956
|
+
|
|
2957
|
+
The columns can be squeezed down, resulting in a Series:
|
|
2958
|
+
|
|
2959
|
+
>>> df_a.squeeze('columns')
|
|
2960
|
+
0 1
|
|
2961
|
+
1 3
|
|
2962
|
+
Name: a, dtype: int64
|
|
2963
|
+
|
|
2964
|
+
Slicing a single row from a single column will produce a single
|
|
2965
|
+
scalar DataFrame:
|
|
2966
|
+
|
|
2967
|
+
>>> df_1a = df.loc[[1], ['a']]
|
|
2968
|
+
>>> df_1a
|
|
2969
|
+
a
|
|
2970
|
+
1 3
|
|
2971
|
+
|
|
2972
|
+
Squeezing the rows produces a single scalar Series:
|
|
2973
|
+
|
|
2974
|
+
>>> df_1a.squeeze('rows')
|
|
2975
|
+
a 3
|
|
2976
|
+
Name: 1, dtype: int64
|
|
2977
|
+
|
|
2978
|
+
Squeezing all axes will project directly into a scalar:
|
|
2979
|
+
|
|
2980
|
+
>>> df_1a.squeeze()
|
|
2981
|
+
3
|
|
2982
|
+
"""
|
|
2983
|
+
if axis is not None:
|
|
2984
|
+
axis = "index" if axis == "rows" else axis
|
|
2985
|
+
axis = validate_axis(axis)
|
|
2986
|
+
|
|
2987
|
+
if isinstance(self, ps.DataFrame):
|
|
2988
|
+
from pyspark.pandas.series import first_series
|
|
2989
|
+
|
|
2990
|
+
is_squeezable = len(self.columns[:2]) == 1
|
|
2991
|
+
# If DataFrame has multiple columns, there is no change.
|
|
2992
|
+
if not is_squeezable:
|
|
2993
|
+
return self
|
|
2994
|
+
series_from_column = first_series(self)
|
|
2995
|
+
has_single_value = len(series_from_column.head(2)) == 1
|
|
2996
|
+
# If DataFrame has only a single value, use pandas API directly.
|
|
2997
|
+
if has_single_value:
|
|
2998
|
+
result = self._to_internal_pandas().squeeze(axis)
|
|
2999
|
+
return ps.Series(result) if isinstance(result, pd.Series) else result
|
|
3000
|
+
elif axis == 0:
|
|
3001
|
+
return self
|
|
3002
|
+
else:
|
|
3003
|
+
return series_from_column
|
|
3004
|
+
else:
|
|
3005
|
+
# The case of Series is simple.
|
|
3006
|
+
# If Series has only a single value, just return it as a scalar.
|
|
3007
|
+
# Otherwise, there is no change.
|
|
3008
|
+
self_top_two = cast("Series", self).head(2)
|
|
3009
|
+
has_single_value = len(self_top_two) == 1
|
|
3010
|
+
return cast(Union[Scalar, ps.Series], self_top_two[0] if has_single_value else self)
|
|
3011
|
+
|
|
3012
|
+
def truncate(
|
|
3013
|
+
self,
|
|
3014
|
+
before: Optional[Any] = None,
|
|
3015
|
+
after: Optional[Any] = None,
|
|
3016
|
+
axis: Optional[Axis] = None,
|
|
3017
|
+
copy: bool_type = True,
|
|
3018
|
+
) -> DataFrameOrSeries:
|
|
3019
|
+
"""
|
|
3020
|
+
Truncate a Series or DataFrame before and after some index value.
|
|
3021
|
+
|
|
3022
|
+
This is a useful shorthand for boolean indexing based on index
|
|
3023
|
+
values above or below certain thresholds.
|
|
3024
|
+
|
|
3025
|
+
.. note:: This API is dependent on :meth:`Index.is_monotonic_increasing`
|
|
3026
|
+
which can be expensive.
|
|
3027
|
+
|
|
3028
|
+
Parameters
|
|
3029
|
+
----------
|
|
3030
|
+
before: date, str, int
|
|
3031
|
+
Truncate all rows before this index value.
|
|
3032
|
+
after: date, str, int
|
|
3033
|
+
Truncate all rows after this index value.
|
|
3034
|
+
axis: {0 or 'index', 1 or 'columns'}, optional
|
|
3035
|
+
Axis to truncate. Truncates the index (rows) by default.
|
|
3036
|
+
copy: bool, default True
|
|
3037
|
+
Return a copy of the truncated section.
|
|
3038
|
+
|
|
3039
|
+
Returns
|
|
3040
|
+
-------
|
|
3041
|
+
type of caller
|
|
3042
|
+
The truncated Series or DataFrame.
|
|
3043
|
+
|
|
3044
|
+
See Also
|
|
3045
|
+
--------
|
|
3046
|
+
DataFrame.loc: Select a subset of a DataFrame by label.
|
|
3047
|
+
DataFrame.iloc: Select a subset of a DataFrame by position.
|
|
3048
|
+
|
|
3049
|
+
Examples
|
|
3050
|
+
--------
|
|
3051
|
+
>>> df = ps.DataFrame({'A': ['a', 'b', 'c', 'd', 'e'],
|
|
3052
|
+
... 'B': ['f', 'g', 'h', 'i', 'j'],
|
|
3053
|
+
... 'C': ['k', 'l', 'm', 'n', 'o']},
|
|
3054
|
+
... index=[1, 2, 3, 4, 5])
|
|
3055
|
+
>>> df
|
|
3056
|
+
A B C
|
|
3057
|
+
1 a f k
|
|
3058
|
+
2 b g l
|
|
3059
|
+
3 c h m
|
|
3060
|
+
4 d i n
|
|
3061
|
+
5 e j o
|
|
3062
|
+
|
|
3063
|
+
>>> df.truncate(before=2, after=4)
|
|
3064
|
+
A B C
|
|
3065
|
+
2 b g l
|
|
3066
|
+
3 c h m
|
|
3067
|
+
4 d i n
|
|
3068
|
+
|
|
3069
|
+
The columns of a DataFrame can be truncated.
|
|
3070
|
+
|
|
3071
|
+
>>> df.truncate(before="A", after="B", axis="columns")
|
|
3072
|
+
A B
|
|
3073
|
+
1 a f
|
|
3074
|
+
2 b g
|
|
3075
|
+
3 c h
|
|
3076
|
+
4 d i
|
|
3077
|
+
5 e j
|
|
3078
|
+
|
|
3079
|
+
For Series, only rows can be truncated.
|
|
3080
|
+
|
|
3081
|
+
>>> df['A'].truncate(before=2, after=4)
|
|
3082
|
+
2 b
|
|
3083
|
+
3 c
|
|
3084
|
+
4 d
|
|
3085
|
+
Name: A, dtype: object
|
|
3086
|
+
|
|
3087
|
+
A Series with a sorted integer index.
|
|
3088
|
+
|
|
3089
|
+
>>> s = ps.Series([10, 20, 30, 40, 50, 60, 70],
|
|
3090
|
+
... index=[1, 2, 3, 4, 5, 6, 7])
|
|
3091
|
+
>>> s
|
|
3092
|
+
1 10
|
|
3093
|
+
2 20
|
|
3094
|
+
3 30
|
|
3095
|
+
4 40
|
|
3096
|
+
5 50
|
|
3097
|
+
6 60
|
|
3098
|
+
7 70
|
|
3099
|
+
dtype: int64
|
|
3100
|
+
|
|
3101
|
+
>>> s.truncate(2, 5)
|
|
3102
|
+
2 20
|
|
3103
|
+
3 30
|
|
3104
|
+
4 40
|
|
3105
|
+
5 50
|
|
3106
|
+
dtype: int64
|
|
3107
|
+
|
|
3108
|
+
A Series with a sorted string index.
|
|
3109
|
+
|
|
3110
|
+
>>> s = ps.Series([10, 20, 30, 40, 50, 60, 70],
|
|
3111
|
+
... index=['a', 'b', 'c', 'd', 'e', 'f', 'g'])
|
|
3112
|
+
>>> s
|
|
3113
|
+
a 10
|
|
3114
|
+
b 20
|
|
3115
|
+
c 30
|
|
3116
|
+
d 40
|
|
3117
|
+
e 50
|
|
3118
|
+
f 60
|
|
3119
|
+
g 70
|
|
3120
|
+
dtype: int64
|
|
3121
|
+
|
|
3122
|
+
>>> s.truncate('b', 'e')
|
|
3123
|
+
b 20
|
|
3124
|
+
c 30
|
|
3125
|
+
d 40
|
|
3126
|
+
e 50
|
|
3127
|
+
dtype: int64
|
|
3128
|
+
"""
|
|
3129
|
+
from pyspark.pandas.series import first_series
|
|
3130
|
+
|
|
3131
|
+
axis = validate_axis(axis)
|
|
3132
|
+
indexes = self.index
|
|
3133
|
+
indexes_increasing = indexes.is_monotonic_increasing
|
|
3134
|
+
if not indexes_increasing and not indexes.is_monotonic_decreasing:
|
|
3135
|
+
raise ValueError("truncate requires a sorted index")
|
|
3136
|
+
if (before is None) and (after is None):
|
|
3137
|
+
return cast(Union[ps.DataFrame, ps.Series], self.copy() if copy else self)
|
|
3138
|
+
if (before is not None and after is not None) and before > after:
|
|
3139
|
+
raise ValueError("Truncate: %s must be after %s" % (after, before))
|
|
3140
|
+
|
|
3141
|
+
if isinstance(self, ps.Series):
|
|
3142
|
+
if indexes_increasing:
|
|
3143
|
+
result = first_series(
|
|
3144
|
+
self.to_frame().loc[before:after] # type: ignore[arg-type]
|
|
3145
|
+
).rename(self.name)
|
|
3146
|
+
else:
|
|
3147
|
+
result = first_series(
|
|
3148
|
+
self.to_frame().loc[after:before] # type: ignore[arg-type]
|
|
3149
|
+
).rename(self.name)
|
|
3150
|
+
elif isinstance(self, ps.DataFrame):
|
|
3151
|
+
if axis == 0:
|
|
3152
|
+
if indexes_increasing:
|
|
3153
|
+
result = self.loc[before:after] # type: ignore[assignment]
|
|
3154
|
+
else:
|
|
3155
|
+
result = self.loc[after:before] # type: ignore[assignment]
|
|
3156
|
+
elif axis == 1:
|
|
3157
|
+
result = self.loc[:, before:after] # type: ignore[assignment]
|
|
3158
|
+
|
|
3159
|
+
return cast(DataFrameOrSeries, result.copy() if copy else result)
|
|
3160
|
+
|
|
3161
|
+
def to_markdown(
|
|
3162
|
+
self, buf: Optional[Union[IO[str], str]] = None, mode: Optional[str] = None
|
|
3163
|
+
) -> str:
|
|
3164
|
+
"""
|
|
3165
|
+
Print Series or DataFrame in Markdown-friendly format.
|
|
3166
|
+
|
|
3167
|
+
.. note:: This method should only be used if the resulting pandas object is expected
|
|
3168
|
+
to be small, as all the data is loaded into the driver's memory.
|
|
3169
|
+
|
|
3170
|
+
Parameters
|
|
3171
|
+
----------
|
|
3172
|
+
buf: writable buffer, defaults to sys.stdout
|
|
3173
|
+
Where to send the output. By default, the output is printed to
|
|
3174
|
+
sys.stdout. Pass a writable buffer if you need to further process
|
|
3175
|
+
the output.
|
|
3176
|
+
mode: str, optional
|
|
3177
|
+
Mode in which file is opened.
|
|
3178
|
+
**kwargs
|
|
3179
|
+
These parameters will be passed to `tabulate`.
|
|
3180
|
+
|
|
3181
|
+
Returns
|
|
3182
|
+
-------
|
|
3183
|
+
str
|
|
3184
|
+
Series or DataFrame in Markdown-friendly format.
|
|
3185
|
+
|
|
3186
|
+
Notes
|
|
3187
|
+
-----
|
|
3188
|
+
Requires the `tabulate <https://pypi.org/project/tabulate>`_ package.
|
|
3189
|
+
|
|
3190
|
+
Examples
|
|
3191
|
+
--------
|
|
3192
|
+
>>> psser = ps.Series(["elk", "pig", "dog", "quetzal"], name="animal")
|
|
3193
|
+
>>> print(psser.to_markdown()) # doctest: +SKIP
|
|
3194
|
+
| | animal |
|
|
3195
|
+
|---:|:---------|
|
|
3196
|
+
| 0 | elk |
|
|
3197
|
+
| 1 | pig |
|
|
3198
|
+
| 2 | dog |
|
|
3199
|
+
| 3 | quetzal |
|
|
3200
|
+
|
|
3201
|
+
>>> psdf = ps.DataFrame(
|
|
3202
|
+
... data={"animal_1": ["elk", "pig"], "animal_2": ["dog", "quetzal"]}
|
|
3203
|
+
... )
|
|
3204
|
+
>>> print(psdf.to_markdown()) # doctest: +SKIP
|
|
3205
|
+
| | animal_1 | animal_2 |
|
|
3206
|
+
|---:|:-----------|:-----------|
|
|
3207
|
+
| 0 | elk | dog |
|
|
3208
|
+
| 1 | pig | quetzal |
|
|
3209
|
+
"""
|
|
3210
|
+
log_advice(
|
|
3211
|
+
"`to_markdown` loads all data into the driver's memory. "
|
|
3212
|
+
"It should only be used if the resulting pandas object is expected to be small."
|
|
3213
|
+
)
|
|
3214
|
+
# Make sure locals() call is at the top of the function so we don't capture local variables.
|
|
3215
|
+
args = locals()
|
|
3216
|
+
psser_or_psdf = self
|
|
3217
|
+
internal_pandas = psser_or_psdf._to_internal_pandas()
|
|
3218
|
+
return validate_arguments_and_invoke_function(
|
|
3219
|
+
internal_pandas, self.to_markdown, type(internal_pandas).to_markdown, args
|
|
3220
|
+
)
|
|
3221
|
+
|
|
3222
|
+
@abstractmethod
|
|
3223
|
+
def fillna(
|
|
3224
|
+
self: FrameLike,
|
|
3225
|
+
value: Optional[Any] = None,
|
|
3226
|
+
method: Optional[str] = None,
|
|
3227
|
+
axis: Optional[Axis] = None,
|
|
3228
|
+
inplace: bool_type = False,
|
|
3229
|
+
limit: Optional[int] = None,
|
|
3230
|
+
) -> FrameLike:
|
|
3231
|
+
pass
|
|
3232
|
+
|
|
3233
|
+
# TODO: add 'downcast' when value parameter exists
|
|
3234
|
+
def bfill(
|
|
3235
|
+
self: FrameLike,
|
|
3236
|
+
axis: Optional[Axis] = None,
|
|
3237
|
+
inplace: bool_type = False,
|
|
3238
|
+
limit: Optional[int] = None,
|
|
3239
|
+
) -> FrameLike:
|
|
3240
|
+
"""
|
|
3241
|
+
Synonym for `DataFrame.fillna()` or `Series.fillna()` with ``method='bfill'``.
|
|
3242
|
+
|
|
3243
|
+
.. note:: the current implementation of 'bfill' uses Spark's Window
|
|
3244
|
+
without specifying a partition specification. This leads to moving all data into a
|
|
3245
|
+
single partition in a single machine and could cause serious
|
|
3246
|
+
performance degradation. Avoid this method with very large datasets.
|
|
3247
|
+
|
|
3248
|
+
Parameters
|
|
3249
|
+
----------
|
|
3250
|
+
axis: {0 or `index`}
|
|
3251
|
+
1 and `columns` are not supported.
|
|
3252
|
+
inplace: boolean, default False
|
|
3253
|
+
Fill in place (do not create a new object)
|
|
3254
|
+
limit: int, default None
|
|
3255
|
+
If method is specified, this is the maximum number of consecutive NaN values to
|
|
3256
|
+
forward/backward fill. In other words, if there is a gap with more than this number of
|
|
3257
|
+
consecutive NaNs, it will only be partially filled. If method is not specified,
|
|
3258
|
+
this is the maximum number of entries along the entire axis where NaNs will be filled.
|
|
3259
|
+
Must be greater than 0 if not None
|
|
3260
|
+
|
|
3261
|
+
Returns
|
|
3262
|
+
-------
|
|
3263
|
+
DataFrame or Series
|
|
3264
|
+
DataFrame or Series with NA entries filled.
|
|
3265
|
+
|
|
3266
|
+
Examples
|
|
3267
|
+
--------
|
|
3268
|
+
>>> psdf = ps.DataFrame({
|
|
3269
|
+
... 'A': [None, 3, None, None],
|
|
3270
|
+
... 'B': [2, 4, None, 3],
|
|
3271
|
+
... 'C': [None, None, None, 1],
|
|
3272
|
+
... 'D': [0, 1, 5, 4]
|
|
3273
|
+
... },
|
|
3274
|
+
... columns=['A', 'B', 'C', 'D'])
|
|
3275
|
+
>>> psdf
|
|
3276
|
+
A B C D
|
|
3277
|
+
0 NaN 2.0 NaN 0
|
|
3278
|
+
1 3.0 4.0 NaN 1
|
|
3279
|
+
2 NaN NaN NaN 5
|
|
3280
|
+
3 NaN 3.0 1.0 4
|
|
3281
|
+
|
|
3282
|
+
Propagate non-null values backward.
|
|
3283
|
+
|
|
3284
|
+
>>> psdf.bfill()
|
|
3285
|
+
A B C D
|
|
3286
|
+
0 3.0 2.0 1.0 0
|
|
3287
|
+
1 3.0 4.0 1.0 1
|
|
3288
|
+
2 NaN 3.0 1.0 5
|
|
3289
|
+
3 NaN 3.0 1.0 4
|
|
3290
|
+
|
|
3291
|
+
For Series
|
|
3292
|
+
|
|
3293
|
+
>>> psser = ps.Series([None, None, None, 1])
|
|
3294
|
+
>>> psser
|
|
3295
|
+
0 NaN
|
|
3296
|
+
1 NaN
|
|
3297
|
+
2 NaN
|
|
3298
|
+
3 1.0
|
|
3299
|
+
dtype: float64
|
|
3300
|
+
|
|
3301
|
+
>>> psser.bfill()
|
|
3302
|
+
0 1.0
|
|
3303
|
+
1 1.0
|
|
3304
|
+
2 1.0
|
|
3305
|
+
3 1.0
|
|
3306
|
+
dtype: float64
|
|
3307
|
+
"""
|
|
3308
|
+
return self.fillna(method="bfill", axis=axis, inplace=inplace, limit=limit)
|
|
3309
|
+
|
|
3310
|
+
backfill = bfill
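A brief sketch of the `limit` behaviour described above for bfill (hedged: assumes a PySpark session; the values are illustrative):

    import pyspark.pandas as ps

    psser = ps.Series([None, None, None, 1.0])

    # With limit=1, only one consecutive NaN per gap is backfilled; the
    # remaining NaNs in that gap are left as NA.
    psser.bfill(limit=1)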
|
|
3311
|
+
|
|
3312
|
+
# TODO: add 'downcast' when value parameter exists
|
|
3313
|
+
def ffill(
|
|
3314
|
+
self: FrameLike,
|
|
3315
|
+
axis: Optional[Axis] = None,
|
|
3316
|
+
inplace: bool_type = False,
|
|
3317
|
+
limit: Optional[int] = None,
|
|
3318
|
+
) -> FrameLike:
|
|
3319
|
+
"""
|
|
3320
|
+
Synonym for `DataFrame.fillna()` or `Series.fillna()` with ``method='ffill'``.
|
|
3321
|
+
|
|
3322
|
+
.. note:: the current implementation of 'ffill' uses Spark's Window
|
|
3323
|
+
without specifying a partition specification. This leads to moving all data into a
|
|
3324
|
+
a single partition in a single machine and could cause serious
|
|
3325
|
+
performance degradation. Avoid this method with very large datasets.
|
|
3326
|
+
|
|
3327
|
+
Parameters
|
|
3328
|
+
----------
|
|
3329
|
+
axis: {0 or `index`}
|
|
3330
|
+
1 and `columns` are not supported.
|
|
3331
|
+
inplace: boolean, default False
|
|
3332
|
+
Fill in place (do not create a new object)
|
|
3333
|
+
limit: int, default None
|
|
3334
|
+
If method is specified, this is the maximum number of consecutive NaN values to
|
|
3335
|
+
forward/backward fill. In other words, if there is a gap with more than this number of
|
|
3336
|
+
consecutive NaNs, it will only be partially filled. If method is not specified,
|
|
3337
|
+
this is the maximum number of entries along the entire axis where NaNs will be filled.
|
|
3338
|
+
Must be greater than 0 if not None
|
|
3339
|
+
|
|
3340
|
+
Returns
|
|
3341
|
+
-------
|
|
3342
|
+
DataFrame or Series
|
|
3343
|
+
DataFrame or Series with NA entries filled.
|
|
3344
|
+
|
|
3345
|
+
Examples
|
|
3346
|
+
--------
|
|
3347
|
+
>>> psdf = ps.DataFrame({
|
|
3348
|
+
... 'A': [None, 3, None, None],
|
|
3349
|
+
... 'B': [2, 4, None, 3],
|
|
3350
|
+
... 'C': [None, None, None, 1],
|
|
3351
|
+
... 'D': [0, 1, 5, 4]
|
|
3352
|
+
... },
|
|
3353
|
+
... columns=['A', 'B', 'C', 'D'])
|
|
3354
|
+
>>> psdf
|
|
3355
|
+
A B C D
|
|
3356
|
+
0 NaN 2.0 NaN 0
|
|
3357
|
+
1 3.0 4.0 NaN 1
|
|
3358
|
+
2 NaN NaN NaN 5
|
|
3359
|
+
3 NaN 3.0 1.0 4
|
|
3360
|
+
|
|
3361
|
+
Propagate non-null values forward.
|
|
3362
|
+
|
|
3363
|
+
>>> psdf.ffill()
|
|
3364
|
+
A B C D
|
|
3365
|
+
0 NaN 2.0 NaN 0
|
|
3366
|
+
1 3.0 4.0 NaN 1
|
|
3367
|
+
2 3.0 4.0 NaN 5
|
|
3368
|
+
3 3.0 3.0 1.0 4
|
|
3369
|
+
|
|
3370
|
+
For Series
|
|
3371
|
+
|
|
3372
|
+
>>> psser = ps.Series([2, 4, None, 3])
|
|
3373
|
+
>>> psser
|
|
3374
|
+
0 2.0
|
|
3375
|
+
1 4.0
|
|
3376
|
+
2 NaN
|
|
3377
|
+
3 3.0
|
|
3378
|
+
dtype: float64
|
|
3379
|
+
|
|
3380
|
+
>>> psser.ffill()
|
|
3381
|
+
0 2.0
|
|
3382
|
+
1 4.0
|
|
3383
|
+
2 4.0
|
|
3384
|
+
3 3.0
|
|
3385
|
+
dtype: float64
|
|
3386
|
+
"""
|
|
3387
|
+
return self.fillna(method="ffill", axis=axis, inplace=inplace, limit=limit)
|
|
3388
|
+
|
|
3389
|
+
pad = ffill
|
|
3390
|
+
|
|
3391
|
+
# TODO: add 'axis', 'inplace', 'downcast'
|
|
3392
|
+
def interpolate(
|
|
3393
|
+
self: FrameLike,
|
|
3394
|
+
method: str = "linear",
|
|
3395
|
+
limit: Optional[int] = None,
|
|
3396
|
+
limit_direction: Optional[str] = None,
|
|
3397
|
+
limit_area: Optional[str] = None,
|
|
3398
|
+
) -> FrameLike:
|
|
3399
|
+
"""
|
|
3400
|
+
Fill NaN values using an interpolation method.
|
|
3401
|
+
|
|
3402
|
+
.. note:: the current implementation of interpolate uses Spark's Window without
|
|
3403
|
+
specifying a partition specification. This leads to moving all data into a
|
|
3404
|
+
single partition in a single machine and could cause serious
|
|
3405
|
+
performance degradation. Avoid this method with very large datasets.
|
|
3406
|
+
|
|
3407
|
+
.. versionadded:: 3.4.0
|
|
3408
|
+
|
|
3409
|
+
Parameters
|
|
3410
|
+
----------
|
|
3411
|
+
method: str, default 'linear'
|
|
3412
|
+
Interpolation technique to use. One of:
|
|
3413
|
+
|
|
3414
|
+
* 'linear': Ignore the index and treat the values as equally
|
|
3415
|
+
spaced.
|
|
3416
|
+
|
|
3417
|
+
limit: int, optional
|
|
3418
|
+
Maximum number of consecutive NaNs to fill. Must be greater than
|
|
3419
|
+
0.
|
|
3420
|
+
|
|
3421
|
+
limit_direction: str, default None
|
|
3422
|
+
Consecutive NaNs will be filled in this direction.
|
|
3423
|
+
One of {'forward', 'backward', 'both'}.
|
|
3424
|
+
|
|
3425
|
+
limit_area: str, default None
|
|
3426
|
+
If limit is specified, consecutive NaNs will be filled with this restriction. One of:
|
|
3427
|
+
|
|
3428
|
+
* None: No fill restriction.
|
|
3429
|
+
* 'inside': Only fill NaNs surrounded by valid values (interpolate).
|
|
3430
|
+
* 'outside': Only fill NaNs outside valid values (extrapolate).
|
|
3431
|
+
|
|
3432
|
+
Returns
|
|
3433
|
+
-------
|
|
3434
|
+
Series or DataFrame or None
|
|
3435
|
+
Returns the same object type as the caller, interpolated at
|
|
3436
|
+
some or all NA values.
|
|
3437
|
+
|
|
3438
|
+
See Also
|
|
3439
|
+
--------
|
|
3440
|
+
fillna: Fill missing values using different methods.
|
|
3441
|
+
|
|
3442
|
+
Examples
|
|
3443
|
+
--------
|
|
3444
|
+
Filling in NA via linear interpolation.
|
|
3445
|
+
|
|
3446
|
+
>>> s = ps.Series([0, 1, np.nan, 3])
|
|
3447
|
+
>>> s
|
|
3448
|
+
0 0.0
|
|
3449
|
+
1 1.0
|
|
3450
|
+
2 NaN
|
|
3451
|
+
3 3.0
|
|
3452
|
+
dtype: float64
|
|
3453
|
+
>>> s.interpolate()
|
|
3454
|
+
0 0.0
|
|
3455
|
+
1 1.0
|
|
3456
|
+
2 2.0
|
|
3457
|
+
3 3.0
|
|
3458
|
+
dtype: float64
|
|
3459
|
+
|
|
3460
|
+
Fill the DataFrame forward (that is, going down) along each column
|
|
3461
|
+
using linear interpolation.
|
|
3462
|
+
|
|
3463
|
+
Note how the last entry in column 'a' is interpolated differently,
|
|
3464
|
+
because there is no entry after it to use for interpolation.
|
|
3465
|
+
Note how the first entry in column 'b' remains NA, because there
|
|
3466
|
+
is no entry before it to use for interpolation.
|
|
3467
|
+
|
|
3468
|
+
>>> df = ps.DataFrame([(0.0, np.nan, -1.0, 1.0),
|
|
3469
|
+
... (np.nan, 2.0, np.nan, np.nan),
|
|
3470
|
+
... (2.0, 3.0, np.nan, 9.0),
|
|
3471
|
+
... (np.nan, 4.0, -4.0, 16.0)],
|
|
3472
|
+
... columns=list('abcd'))
|
|
3473
|
+
>>> df
|
|
3474
|
+
a b c d
|
|
3475
|
+
0 0.0 NaN -1.0 1.0
|
|
3476
|
+
1 NaN 2.0 NaN NaN
|
|
3477
|
+
2 2.0 3.0 NaN 9.0
|
|
3478
|
+
3 NaN 4.0 -4.0 16.0
|
|
3479
|
+
>>> df.interpolate(method='linear')
|
|
3480
|
+
a b c d
|
|
3481
|
+
0 0.0 NaN -1.0 1.0
|
|
3482
|
+
1 1.0 2.0 -2.0 5.0
|
|
3483
|
+
2 2.0 3.0 -3.0 9.0
|
|
3484
|
+
3 2.0 4.0 -4.0 16.0
|
|
3485
|
+
"""
|
|
3486
|
+
return self.interpolate(
|
|
3487
|
+
method=method, limit=limit, limit_direction=limit_direction, limit_area=limit_area
|
|
3488
|
+
)
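A short sketch of the `limit_direction` option documented above (hedged: assumes a PySpark session and the pandas-on-Spark interpolate added in 3.4.0):

    import numpy as np
    import pyspark.pandas as ps

    psser = ps.Series([np.nan, 0.0, np.nan, np.nan, 3.0, np.nan])

    # Fill in both directions; limit_area='inside' would additionally restrict
    # filling to NaNs surrounded by valid values.
    psser.interpolate(method="linear", limit_direction="both")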
|
|
3489
|
+
|
|
3490
|
+
@property
|
|
3491
|
+
def at(self) -> AtIndexer:
|
|
3492
|
+
return AtIndexer(self)
|
|
3493
|
+
|
|
3494
|
+
at.__doc__ = AtIndexer.__doc__
|
|
3495
|
+
|
|
3496
|
+
@property
|
|
3497
|
+
def iat(self) -> iAtIndexer:
|
|
3498
|
+
return iAtIndexer(self)
|
|
3499
|
+
|
|
3500
|
+
iat.__doc__ = iAtIndexer.__doc__
|
|
3501
|
+
|
|
3502
|
+
@property
|
|
3503
|
+
def iloc(self) -> iLocIndexer:
|
|
3504
|
+
return iLocIndexer(self)
|
|
3505
|
+
|
|
3506
|
+
iloc.__doc__ = iLocIndexer.__doc__
|
|
3507
|
+
|
|
3508
|
+
@property
|
|
3509
|
+
def loc(self) -> LocIndexer:
|
|
3510
|
+
return LocIndexer(self)
|
|
3511
|
+
|
|
3512
|
+
loc.__doc__ = LocIndexer.__doc__
|
|
3513
|
+
|
|
3514
|
+
def __bool__(self) -> NoReturn:
|
|
3515
|
+
raise ValueError(
|
|
3516
|
+
"The truth value of a {0} is ambiguous. "
|
|
3517
|
+
"Use a.empty, a.bool(), a.item(), a.any() or a.all().".format(self.__class__.__name__)
|
|
3518
|
+
)
|
|
3519
|
+
|
|
3520
|
+
@staticmethod
|
|
3521
|
+
def _count_expr(psser: "Series") -> Column:
|
|
3522
|
+
return F.count(psser._dtype_op.nan_to_null(psser).spark.column)
|
|
3523
|
+
|
|
3524
|
+
|
|
3525
|
+
def _test() -> None:
|
|
3526
|
+
import os
|
|
3527
|
+
import doctest
|
|
3528
|
+
import shutil
|
|
3529
|
+
import sys
|
|
3530
|
+
import tempfile
|
|
3531
|
+
from pyspark.sql import SparkSession
|
|
3532
|
+
import pyspark.pandas.generic
|
|
3533
|
+
|
|
3534
|
+
os.chdir(os.environ["SPARK_HOME"])
|
|
3535
|
+
|
|
3536
|
+
globs = pyspark.pandas.generic.__dict__.copy()
|
|
3537
|
+
globs["ps"] = pyspark.pandas
|
|
3538
|
+
spark = (
|
|
3539
|
+
SparkSession.builder.master("local[4]")
|
|
3540
|
+
.appName("pyspark.pandas.generic tests")
|
|
3541
|
+
.getOrCreate()
|
|
3542
|
+
)
|
|
3543
|
+
|
|
3544
|
+
path = tempfile.mkdtemp()
|
|
3545
|
+
globs["path"] = path
|
|
3546
|
+
|
|
3547
|
+
(failure_count, test_count) = doctest.testmod(
|
|
3548
|
+
pyspark.pandas.generic,
|
|
3549
|
+
globs=globs,
|
|
3550
|
+
optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
|
|
3551
|
+
)
|
|
3552
|
+
|
|
3553
|
+
shutil.rmtree(path, ignore_errors=True)
|
|
3554
|
+
spark.stop()
|
|
3555
|
+
if failure_count:
|
|
3556
|
+
sys.exit(-1)
|
|
3557
|
+
|
|
3558
|
+
|
|
3559
|
+
if __name__ == "__main__":
|
|
3560
|
+
_test()
|