snowpark_connect-0.20.2-py3-none-any.whl

This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of snowpark-connect might be problematic.

Files changed (879)
  1. snowflake/snowpark_connect/__init__.py +23 -0
  2. snowflake/snowpark_connect/analyze_plan/__init__.py +3 -0
  3. snowflake/snowpark_connect/analyze_plan/map_tree_string.py +38 -0
  4. snowflake/snowpark_connect/column_name_handler.py +735 -0
  5. snowflake/snowpark_connect/config.py +576 -0
  6. snowflake/snowpark_connect/constants.py +47 -0
  7. snowflake/snowpark_connect/control_server.py +52 -0
  8. snowflake/snowpark_connect/dataframe_name_handler.py +54 -0
  9. snowflake/snowpark_connect/date_time_format_mapping.py +399 -0
  10. snowflake/snowpark_connect/empty_dataframe.py +18 -0
  11. snowflake/snowpark_connect/error/__init__.py +11 -0
  12. snowflake/snowpark_connect/error/error_mapping.py +6174 -0
  13. snowflake/snowpark_connect/error/error_utils.py +321 -0
  14. snowflake/snowpark_connect/error/exceptions.py +24 -0
  15. snowflake/snowpark_connect/execute_plan/__init__.py +3 -0
  16. snowflake/snowpark_connect/execute_plan/map_execution_command.py +204 -0
  17. snowflake/snowpark_connect/execute_plan/map_execution_root.py +173 -0
  18. snowflake/snowpark_connect/execute_plan/utils.py +183 -0
  19. snowflake/snowpark_connect/expression/__init__.py +3 -0
  20. snowflake/snowpark_connect/expression/literal.py +90 -0
  21. snowflake/snowpark_connect/expression/map_cast.py +343 -0
  22. snowflake/snowpark_connect/expression/map_expression.py +293 -0
  23. snowflake/snowpark_connect/expression/map_extension.py +104 -0
  24. snowflake/snowpark_connect/expression/map_sql_expression.py +633 -0
  25. snowflake/snowpark_connect/expression/map_udf.py +142 -0
  26. snowflake/snowpark_connect/expression/map_unresolved_attribute.py +241 -0
  27. snowflake/snowpark_connect/expression/map_unresolved_extract_value.py +85 -0
  28. snowflake/snowpark_connect/expression/map_unresolved_function.py +9450 -0
  29. snowflake/snowpark_connect/expression/map_unresolved_star.py +218 -0
  30. snowflake/snowpark_connect/expression/map_update_fields.py +164 -0
  31. snowflake/snowpark_connect/expression/map_window_function.py +258 -0
  32. snowflake/snowpark_connect/expression/typer.py +125 -0
  33. snowflake/snowpark_connect/includes/__init__.py +0 -0
  34. snowflake/snowpark_connect/includes/jars/antlr4-runtime-4.9.3.jar +0 -0
  35. snowflake/snowpark_connect/includes/jars/commons-cli-1.5.0.jar +0 -0
  36. snowflake/snowpark_connect/includes/jars/commons-codec-1.16.1.jar +0 -0
  37. snowflake/snowpark_connect/includes/jars/commons-collections-3.2.2.jar +0 -0
  38. snowflake/snowpark_connect/includes/jars/commons-collections4-4.4.jar +0 -0
  39. snowflake/snowpark_connect/includes/jars/commons-compiler-3.1.9.jar +0 -0
  40. snowflake/snowpark_connect/includes/jars/commons-compress-1.26.0.jar +0 -0
  41. snowflake/snowpark_connect/includes/jars/commons-crypto-1.1.0.jar +0 -0
  42. snowflake/snowpark_connect/includes/jars/commons-dbcp-1.4.jar +0 -0
  43. snowflake/snowpark_connect/includes/jars/commons-io-2.16.1.jar +0 -0
  44. snowflake/snowpark_connect/includes/jars/commons-lang-2.6.jar +0 -0
  45. snowflake/snowpark_connect/includes/jars/commons-lang3-3.12.0.jar +0 -0
  46. snowflake/snowpark_connect/includes/jars/commons-logging-1.1.3.jar +0 -0
  47. snowflake/snowpark_connect/includes/jars/commons-math3-3.6.1.jar +0 -0
  48. snowflake/snowpark_connect/includes/jars/commons-pool-1.5.4.jar +0 -0
  49. snowflake/snowpark_connect/includes/jars/commons-text-1.10.0.jar +0 -0
  50. snowflake/snowpark_connect/includes/jars/hadoop-client-api-3.3.4.jar +0 -0
  51. snowflake/snowpark_connect/includes/jars/jackson-annotations-2.15.2.jar +0 -0
  52. snowflake/snowpark_connect/includes/jars/jackson-core-2.15.2.jar +0 -0
  53. snowflake/snowpark_connect/includes/jars/jackson-core-asl-1.9.13.jar +0 -0
  54. snowflake/snowpark_connect/includes/jars/jackson-databind-2.15.2.jar +0 -0
  55. snowflake/snowpark_connect/includes/jars/jackson-dataformat-yaml-2.15.2.jar +0 -0
  56. snowflake/snowpark_connect/includes/jars/jackson-datatype-jsr310-2.15.2.jar +0 -0
  57. snowflake/snowpark_connect/includes/jars/jackson-mapper-asl-1.9.13.jar +0 -0
  58. snowflake/snowpark_connect/includes/jars/jackson-module-scala_2.12-2.15.2.jar +0 -0
  59. snowflake/snowpark_connect/includes/jars/json4s-ast_2.12-3.7.0-M11.jar +0 -0
  60. snowflake/snowpark_connect/includes/jars/json4s-core_2.12-3.7.0-M11.jar +0 -0
  61. snowflake/snowpark_connect/includes/jars/json4s-jackson_2.12-3.7.0-M11.jar +0 -0
  62. snowflake/snowpark_connect/includes/jars/json4s-scalap_2.12-3.7.0-M11.jar +0 -0
  63. snowflake/snowpark_connect/includes/jars/kryo-shaded-4.0.2.jar +0 -0
  64. snowflake/snowpark_connect/includes/jars/log4j-1.2-api-2.20.0.jar +0 -0
  65. snowflake/snowpark_connect/includes/jars/log4j-api-2.20.0.jar +0 -0
  66. snowflake/snowpark_connect/includes/jars/log4j-core-2.20.0.jar +0 -0
  67. snowflake/snowpark_connect/includes/jars/log4j-slf4j2-impl-2.20.0.jar +0 -0
  68. snowflake/snowpark_connect/includes/jars/paranamer-2.8.jar +0 -0
  69. snowflake/snowpark_connect/includes/jars/scala-collection-compat_2.12-2.7.0.jar +0 -0
  70. snowflake/snowpark_connect/includes/jars/scala-compiler-2.12.18.jar +0 -0
  71. snowflake/snowpark_connect/includes/jars/scala-library-2.12.18.jar +0 -0
  72. snowflake/snowpark_connect/includes/jars/scala-parser-combinators_2.12-2.3.0.jar +0 -0
  73. snowflake/snowpark_connect/includes/jars/scala-reflect-2.12.18.jar +0 -0
  74. snowflake/snowpark_connect/includes/jars/scala-xml_2.12-2.1.0.jar +0 -0
  75. snowflake/snowpark_connect/includes/jars/slf4j-api-2.0.7.jar +0 -0
  76. snowflake/snowpark_connect/includes/jars/spark-catalyst_2.12-3.5.6.jar +0 -0
  77. snowflake/snowpark_connect/includes/jars/spark-common-utils_2.12-3.5.6.jar +0 -0
  78. snowflake/snowpark_connect/includes/jars/spark-core_2.12-3.5.6.jar +0 -0
  79. snowflake/snowpark_connect/includes/jars/spark-graphx_2.12-3.5.6.jar +0 -0
  80. snowflake/snowpark_connect/includes/jars/spark-hive-thriftserver_2.12-3.5.6.jar +0 -0
  81. snowflake/snowpark_connect/includes/jars/spark-hive_2.12-3.5.6.jar +0 -0
  82. snowflake/snowpark_connect/includes/jars/spark-kubernetes_2.12-3.5.6.jar +0 -0
  83. snowflake/snowpark_connect/includes/jars/spark-kvstore_2.12-3.5.6.jar +0 -0
  84. snowflake/snowpark_connect/includes/jars/spark-launcher_2.12-3.5.6.jar +0 -0
  85. snowflake/snowpark_connect/includes/jars/spark-mesos_2.12-3.5.6.jar +0 -0
  86. snowflake/snowpark_connect/includes/jars/spark-mllib-local_2.12-3.5.6.jar +0 -0
  87. snowflake/snowpark_connect/includes/jars/spark-mllib_2.12-3.5.6.jar +0 -0
  88. snowflake/snowpark_connect/includes/jars/spark-network-common_2.12-3.5.6.jar +0 -0
  89. snowflake/snowpark_connect/includes/jars/spark-network-shuffle_2.12-3.5.6.jar +0 -0
  90. snowflake/snowpark_connect/includes/jars/spark-repl_2.12-3.5.6.jar +0 -0
  91. snowflake/snowpark_connect/includes/jars/spark-sketch_2.12-3.5.6.jar +0 -0
  92. snowflake/snowpark_connect/includes/jars/spark-sql-api_2.12-3.5.6.jar +0 -0
  93. snowflake/snowpark_connect/includes/jars/spark-sql_2.12-3.5.6.jar +0 -0
  94. snowflake/snowpark_connect/includes/jars/spark-streaming_2.12-3.5.6.jar +0 -0
  95. snowflake/snowpark_connect/includes/jars/spark-tags_2.12-3.5.6.jar +0 -0
  96. snowflake/snowpark_connect/includes/jars/spark-unsafe_2.12-3.5.6.jar +0 -0
  97. snowflake/snowpark_connect/includes/jars/spark-yarn_2.12-3.5.6.jar +0 -0
  98. snowflake/snowpark_connect/includes/python/__init__.py +21 -0
  99. snowflake/snowpark_connect/includes/python/pyspark/__init__.py +173 -0
  100. snowflake/snowpark_connect/includes/python/pyspark/_globals.py +71 -0
  101. snowflake/snowpark_connect/includes/python/pyspark/_typing.pyi +43 -0
  102. snowflake/snowpark_connect/includes/python/pyspark/accumulators.py +341 -0
  103. snowflake/snowpark_connect/includes/python/pyspark/broadcast.py +383 -0
  104. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/__init__.py +8 -0
  105. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/cloudpickle.py +948 -0
  106. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/cloudpickle_fast.py +844 -0
  107. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/compat.py +18 -0
  108. snowflake/snowpark_connect/includes/python/pyspark/conf.py +276 -0
  109. snowflake/snowpark_connect/includes/python/pyspark/context.py +2601 -0
  110. snowflake/snowpark_connect/includes/python/pyspark/daemon.py +218 -0
  111. snowflake/snowpark_connect/includes/python/pyspark/errors/__init__.py +70 -0
  112. snowflake/snowpark_connect/includes/python/pyspark/errors/error_classes.py +889 -0
  113. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/__init__.py +16 -0
  114. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/base.py +228 -0
  115. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/captured.py +307 -0
  116. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/connect.py +190 -0
  117. snowflake/snowpark_connect/includes/python/pyspark/errors/tests/__init__.py +16 -0
  118. snowflake/snowpark_connect/includes/python/pyspark/errors/tests/test_errors.py +60 -0
  119. snowflake/snowpark_connect/includes/python/pyspark/errors/utils.py +116 -0
  120. snowflake/snowpark_connect/includes/python/pyspark/files.py +165 -0
  121. snowflake/snowpark_connect/includes/python/pyspark/find_spark_home.py +95 -0
  122. snowflake/snowpark_connect/includes/python/pyspark/install.py +203 -0
  123. snowflake/snowpark_connect/includes/python/pyspark/instrumentation_utils.py +190 -0
  124. snowflake/snowpark_connect/includes/python/pyspark/java_gateway.py +248 -0
  125. snowflake/snowpark_connect/includes/python/pyspark/join.py +118 -0
  126. snowflake/snowpark_connect/includes/python/pyspark/ml/__init__.py +71 -0
  127. snowflake/snowpark_connect/includes/python/pyspark/ml/_typing.pyi +84 -0
  128. snowflake/snowpark_connect/includes/python/pyspark/ml/base.py +414 -0
  129. snowflake/snowpark_connect/includes/python/pyspark/ml/classification.py +4332 -0
  130. snowflake/snowpark_connect/includes/python/pyspark/ml/clustering.py +2188 -0
  131. snowflake/snowpark_connect/includes/python/pyspark/ml/common.py +146 -0
  132. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/__init__.py +44 -0
  133. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/base.py +346 -0
  134. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/classification.py +382 -0
  135. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/evaluation.py +291 -0
  136. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/feature.py +258 -0
  137. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/functions.py +77 -0
  138. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/io_utils.py +335 -0
  139. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/pipeline.py +262 -0
  140. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/summarizer.py +120 -0
  141. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/tuning.py +579 -0
  142. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/util.py +173 -0
  143. snowflake/snowpark_connect/includes/python/pyspark/ml/deepspeed/__init__.py +16 -0
  144. snowflake/snowpark_connect/includes/python/pyspark/ml/deepspeed/deepspeed_distributor.py +165 -0
  145. snowflake/snowpark_connect/includes/python/pyspark/ml/deepspeed/tests/test_deepspeed_distributor.py +306 -0
  146. snowflake/snowpark_connect/includes/python/pyspark/ml/dl_util.py +150 -0
  147. snowflake/snowpark_connect/includes/python/pyspark/ml/evaluation.py +1166 -0
  148. snowflake/snowpark_connect/includes/python/pyspark/ml/feature.py +7474 -0
  149. snowflake/snowpark_connect/includes/python/pyspark/ml/fpm.py +543 -0
  150. snowflake/snowpark_connect/includes/python/pyspark/ml/functions.py +842 -0
  151. snowflake/snowpark_connect/includes/python/pyspark/ml/image.py +271 -0
  152. snowflake/snowpark_connect/includes/python/pyspark/ml/linalg/__init__.py +1382 -0
  153. snowflake/snowpark_connect/includes/python/pyspark/ml/model_cache.py +55 -0
  154. snowflake/snowpark_connect/includes/python/pyspark/ml/param/__init__.py +602 -0
  155. snowflake/snowpark_connect/includes/python/pyspark/ml/param/_shared_params_code_gen.py +368 -0
  156. snowflake/snowpark_connect/includes/python/pyspark/ml/param/shared.py +878 -0
  157. snowflake/snowpark_connect/includes/python/pyspark/ml/pipeline.py +451 -0
  158. snowflake/snowpark_connect/includes/python/pyspark/ml/recommendation.py +748 -0
  159. snowflake/snowpark_connect/includes/python/pyspark/ml/regression.py +3335 -0
  160. snowflake/snowpark_connect/includes/python/pyspark/ml/stat.py +523 -0
  161. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/__init__.py +16 -0
  162. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_classification.py +53 -0
  163. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_evaluation.py +50 -0
  164. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_feature.py +43 -0
  165. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_function.py +114 -0
  166. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_pipeline.py +47 -0
  167. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_summarizer.py +43 -0
  168. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_tuning.py +46 -0
  169. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_classification.py +238 -0
  170. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_evaluation.py +194 -0
  171. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_feature.py +156 -0
  172. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_pipeline.py +184 -0
  173. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_summarizer.py +78 -0
  174. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_tuning.py +292 -0
  175. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_parity_torch_data_loader.py +50 -0
  176. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_parity_torch_distributor.py +152 -0
  177. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_algorithms.py +456 -0
  178. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_base.py +96 -0
  179. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_dl_util.py +186 -0
  180. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_evaluation.py +77 -0
  181. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_feature.py +401 -0
  182. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_functions.py +528 -0
  183. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_image.py +82 -0
  184. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_linalg.py +409 -0
  185. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_model_cache.py +55 -0
  186. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_param.py +441 -0
  187. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_persistence.py +546 -0
  188. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_pipeline.py +71 -0
  189. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_stat.py +52 -0
  190. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_training_summary.py +494 -0
  191. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_util.py +85 -0
  192. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_wrapper.py +138 -0
  193. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/__init__.py +16 -0
  194. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_basic.py +151 -0
  195. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_nested.py +97 -0
  196. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_pipeline.py +143 -0
  197. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tuning.py +551 -0
  198. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_basic.py +137 -0
  199. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_nested.py +96 -0
  200. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_pipeline.py +142 -0
  201. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/__init__.py +16 -0
  202. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/data.py +100 -0
  203. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/distributor.py +1133 -0
  204. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/log_communication.py +198 -0
  205. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/__init__.py +16 -0
  206. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_data_loader.py +137 -0
  207. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_distributor.py +561 -0
  208. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_log_communication.py +172 -0
  209. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/torch_run_process_wrapper.py +83 -0
  210. snowflake/snowpark_connect/includes/python/pyspark/ml/tree.py +434 -0
  211. snowflake/snowpark_connect/includes/python/pyspark/ml/tuning.py +1741 -0
  212. snowflake/snowpark_connect/includes/python/pyspark/ml/util.py +749 -0
  213. snowflake/snowpark_connect/includes/python/pyspark/ml/wrapper.py +465 -0
  214. snowflake/snowpark_connect/includes/python/pyspark/mllib/__init__.py +44 -0
  215. snowflake/snowpark_connect/includes/python/pyspark/mllib/_typing.pyi +33 -0
  216. snowflake/snowpark_connect/includes/python/pyspark/mllib/classification.py +989 -0
  217. snowflake/snowpark_connect/includes/python/pyspark/mllib/clustering.py +1318 -0
  218. snowflake/snowpark_connect/includes/python/pyspark/mllib/common.py +174 -0
  219. snowflake/snowpark_connect/includes/python/pyspark/mllib/evaluation.py +691 -0
  220. snowflake/snowpark_connect/includes/python/pyspark/mllib/feature.py +1085 -0
  221. snowflake/snowpark_connect/includes/python/pyspark/mllib/fpm.py +233 -0
  222. snowflake/snowpark_connect/includes/python/pyspark/mllib/linalg/__init__.py +1653 -0
  223. snowflake/snowpark_connect/includes/python/pyspark/mllib/linalg/distributed.py +1662 -0
  224. snowflake/snowpark_connect/includes/python/pyspark/mllib/random.py +698 -0
  225. snowflake/snowpark_connect/includes/python/pyspark/mllib/recommendation.py +389 -0
  226. snowflake/snowpark_connect/includes/python/pyspark/mllib/regression.py +1067 -0
  227. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/KernelDensity.py +59 -0
  228. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/__init__.py +34 -0
  229. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/_statistics.py +409 -0
  230. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/distribution.py +39 -0
  231. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/test.py +86 -0
  232. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/__init__.py +16 -0
  233. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_algorithms.py +353 -0
  234. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_feature.py +192 -0
  235. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_linalg.py +680 -0
  236. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_stat.py +206 -0
  237. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_streaming_algorithms.py +471 -0
  238. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_util.py +108 -0
  239. snowflake/snowpark_connect/includes/python/pyspark/mllib/tree.py +888 -0
  240. snowflake/snowpark_connect/includes/python/pyspark/mllib/util.py +659 -0
  241. snowflake/snowpark_connect/includes/python/pyspark/pandas/__init__.py +165 -0
  242. snowflake/snowpark_connect/includes/python/pyspark/pandas/_typing.py +52 -0
  243. snowflake/snowpark_connect/includes/python/pyspark/pandas/accessors.py +989 -0
  244. snowflake/snowpark_connect/includes/python/pyspark/pandas/base.py +1804 -0
  245. snowflake/snowpark_connect/includes/python/pyspark/pandas/categorical.py +822 -0
  246. snowflake/snowpark_connect/includes/python/pyspark/pandas/config.py +539 -0
  247. snowflake/snowpark_connect/includes/python/pyspark/pandas/correlation.py +262 -0
  248. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/__init__.py +16 -0
  249. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/base.py +519 -0
  250. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/binary_ops.py +98 -0
  251. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/boolean_ops.py +426 -0
  252. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/categorical_ops.py +141 -0
  253. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/complex_ops.py +145 -0
  254. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/date_ops.py +127 -0
  255. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/datetime_ops.py +171 -0
  256. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/null_ops.py +83 -0
  257. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/num_ops.py +588 -0
  258. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/string_ops.py +154 -0
  259. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/timedelta_ops.py +101 -0
  260. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/udt_ops.py +29 -0
  261. snowflake/snowpark_connect/includes/python/pyspark/pandas/datetimes.py +891 -0
  262. snowflake/snowpark_connect/includes/python/pyspark/pandas/exceptions.py +150 -0
  263. snowflake/snowpark_connect/includes/python/pyspark/pandas/extensions.py +388 -0
  264. snowflake/snowpark_connect/includes/python/pyspark/pandas/frame.py +13738 -0
  265. snowflake/snowpark_connect/includes/python/pyspark/pandas/generic.py +3560 -0
  266. snowflake/snowpark_connect/includes/python/pyspark/pandas/groupby.py +4448 -0
  267. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/__init__.py +21 -0
  268. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/base.py +2783 -0
  269. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/category.py +773 -0
  270. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/datetimes.py +843 -0
  271. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/multi.py +1323 -0
  272. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/numeric.py +210 -0
  273. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/timedelta.py +197 -0
  274. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexing.py +1862 -0
  275. snowflake/snowpark_connect/includes/python/pyspark/pandas/internal.py +1680 -0
  276. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/__init__.py +48 -0
  277. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/common.py +76 -0
  278. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/frame.py +63 -0
  279. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/general_functions.py +43 -0
  280. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/groupby.py +93 -0
  281. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/indexes.py +184 -0
  282. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/resample.py +101 -0
  283. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/scalars.py +29 -0
  284. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/series.py +69 -0
  285. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/window.py +168 -0
  286. snowflake/snowpark_connect/includes/python/pyspark/pandas/mlflow.py +238 -0
  287. snowflake/snowpark_connect/includes/python/pyspark/pandas/namespace.py +3807 -0
  288. snowflake/snowpark_connect/includes/python/pyspark/pandas/numpy_compat.py +260 -0
  289. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/__init__.py +17 -0
  290. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/core.py +1213 -0
  291. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/matplotlib.py +928 -0
  292. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/plotly.py +261 -0
  293. snowflake/snowpark_connect/includes/python/pyspark/pandas/resample.py +816 -0
  294. snowflake/snowpark_connect/includes/python/pyspark/pandas/series.py +7440 -0
  295. snowflake/snowpark_connect/includes/python/pyspark/pandas/sql_formatter.py +308 -0
  296. snowflake/snowpark_connect/includes/python/pyspark/pandas/sql_processor.py +394 -0
  297. snowflake/snowpark_connect/includes/python/pyspark/pandas/strings.py +2371 -0
  298. snowflake/snowpark_connect/includes/python/pyspark/pandas/supported_api_gen.py +378 -0
  299. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/__init__.py +16 -0
  300. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/__init__.py +16 -0
  301. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_any_all.py +177 -0
  302. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_apply_func.py +575 -0
  303. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_binary_ops.py +235 -0
  304. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_combine.py +653 -0
  305. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_compute.py +463 -0
  306. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_corrwith.py +86 -0
  307. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_cov.py +151 -0
  308. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_cumulative.py +139 -0
  309. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_describe.py +458 -0
  310. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_eval.py +86 -0
  311. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_melt.py +202 -0
  312. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_missing_data.py +520 -0
  313. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_pivot.py +361 -0
  314. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/__init__.py +16 -0
  315. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/__init__.py +16 -0
  316. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_any_all.py +40 -0
  317. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_apply_func.py +42 -0
  318. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_binary_ops.py +40 -0
  319. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_combine.py +37 -0
  320. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_compute.py +60 -0
  321. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_corrwith.py +40 -0
  322. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_cov.py +40 -0
  323. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_cumulative.py +90 -0
  324. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_describe.py +40 -0
  325. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_eval.py +40 -0
  326. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_melt.py +40 -0
  327. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_missing_data.py +42 -0
  328. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_pivot.py +37 -0
  329. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/__init__.py +16 -0
  330. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_base.py +36 -0
  331. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_binary_ops.py +42 -0
  332. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_boolean_ops.py +47 -0
  333. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_categorical_ops.py +55 -0
  334. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_complex_ops.py +40 -0
  335. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_date_ops.py +47 -0
  336. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_datetime_ops.py +47 -0
  337. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_null_ops.py +42 -0
  338. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_arithmetic.py +43 -0
  339. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_ops.py +47 -0
  340. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_reverse.py +43 -0
  341. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_string_ops.py +47 -0
  342. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_timedelta_ops.py +47 -0
  343. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_udt_ops.py +40 -0
  344. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/testing_utils.py +226 -0
  345. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/__init__.py +16 -0
  346. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_align.py +39 -0
  347. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_basic_slow.py +55 -0
  348. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_cov_corrwith.py +39 -0
  349. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_dot_frame.py +39 -0
  350. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_dot_series.py +39 -0
  351. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_index.py +39 -0
  352. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_series.py +39 -0
  353. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_setitem_frame.py +43 -0
  354. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_setitem_series.py +43 -0
  355. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/__init__.py +16 -0
  356. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_attrs.py +40 -0
  357. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_constructor.py +39 -0
  358. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_conversion.py +42 -0
  359. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_reindexing.py +42 -0
  360. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_reshaping.py +37 -0
  361. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_spark.py +40 -0
  362. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_take.py +42 -0
  363. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_time_series.py +48 -0
  364. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_truncate.py +40 -0
  365. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/__init__.py +16 -0
  366. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_aggregate.py +40 -0
  367. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_apply_func.py +41 -0
  368. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_cumulative.py +67 -0
  369. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_describe.py +40 -0
  370. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_groupby.py +55 -0
  371. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_head_tail.py +40 -0
  372. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_index.py +38 -0
  373. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_missing_data.py +55 -0
  374. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py +39 -0
  375. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_stat.py +38 -0
  376. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/__init__.py +16 -0
  377. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_align.py +40 -0
  378. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_base.py +50 -0
  379. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_category.py +73 -0
  380. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_datetime.py +39 -0
  381. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_indexing.py +40 -0
  382. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_reindex.py +40 -0
  383. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_rename.py +40 -0
  384. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_reset_index.py +48 -0
  385. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_timedelta.py +39 -0
  386. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/io/__init__.py +16 -0
  387. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/io/test_parity_io.py +40 -0
  388. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/__init__.py +16 -0
  389. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot.py +45 -0
  390. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot_matplotlib.py +45 -0
  391. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot_plotly.py +49 -0
  392. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot.py +37 -0
  393. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot_matplotlib.py +53 -0
  394. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot_plotly.py +45 -0
  395. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/__init__.py +16 -0
  396. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_all_any.py +38 -0
  397. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_arg_ops.py +37 -0
  398. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_as_of.py +37 -0
  399. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_as_type.py +38 -0
  400. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_compute.py +37 -0
  401. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_conversion.py +40 -0
  402. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_cumulative.py +40 -0
  403. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_index.py +38 -0
  404. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_missing_data.py +40 -0
  405. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_series.py +37 -0
  406. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_sort.py +38 -0
  407. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_stat.py +38 -0
  408. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_categorical.py +66 -0
  409. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_config.py +37 -0
  410. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_csv.py +37 -0
  411. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_dataframe_conversion.py +42 -0
  412. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_dataframe_spark_io.py +39 -0
  413. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_default_index.py +49 -0
  414. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ewm.py +37 -0
  415. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_expanding.py +39 -0
  416. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_extension.py +49 -0
  417. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_frame_spark.py +53 -0
  418. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_generic_functions.py +43 -0
  419. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_indexing.py +49 -0
  420. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_indexops_spark.py +39 -0
  421. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_internal.py +41 -0
  422. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_namespace.py +39 -0
  423. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_numpy_compat.py +60 -0
  424. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames.py +48 -0
  425. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby.py +39 -0
  426. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_expanding.py +44 -0
  427. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_rolling.py +84 -0
  428. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_repr.py +37 -0
  429. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_resample.py +45 -0
  430. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_reshape.py +39 -0
  431. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_rolling.py +39 -0
  432. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_scalars.py +37 -0
  433. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_conversion.py +39 -0
  434. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_datetime.py +39 -0
  435. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_string.py +39 -0
  436. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_spark_functions.py +39 -0
  437. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_sql.py +43 -0
  438. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_stats.py +37 -0
  439. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_typedef.py +36 -0
  440. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_utils.py +37 -0
  441. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_window.py +39 -0
  442. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/__init__.py +16 -0
  443. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_base.py +107 -0
  444. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_binary_ops.py +224 -0
  445. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py +825 -0
  446. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_categorical_ops.py +562 -0
  447. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_complex_ops.py +368 -0
  448. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_date_ops.py +257 -0
  449. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_datetime_ops.py +260 -0
  450. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_null_ops.py +178 -0
  451. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_arithmetic.py +184 -0
  452. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py +497 -0
  453. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_reverse.py +140 -0
  454. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_string_ops.py +354 -0
  455. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_timedelta_ops.py +219 -0
  456. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_udt_ops.py +192 -0
  457. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/testing_utils.py +228 -0
  458. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/__init__.py +16 -0
  459. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_align.py +118 -0
  460. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_basic_slow.py +198 -0
  461. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_cov_corrwith.py +181 -0
  462. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_dot_frame.py +103 -0
  463. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_dot_series.py +141 -0
  464. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_index.py +109 -0
  465. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_series.py +136 -0
  466. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_setitem_frame.py +125 -0
  467. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_setitem_series.py +217 -0
  468. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/__init__.py +16 -0
  469. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_attrs.py +384 -0
  470. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_constructor.py +598 -0
  471. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_conversion.py +73 -0
  472. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_reindexing.py +869 -0
  473. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_reshaping.py +487 -0
  474. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_spark.py +309 -0
  475. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_take.py +156 -0
  476. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_time_series.py +149 -0
  477. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_truncate.py +163 -0
  478. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/__init__.py +16 -0
  479. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_aggregate.py +311 -0
  480. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_apply_func.py +524 -0
  481. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_cumulative.py +419 -0
  482. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_describe.py +144 -0
  483. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_groupby.py +979 -0
  484. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_head_tail.py +234 -0
  485. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_index.py +206 -0
  486. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_missing_data.py +421 -0
  487. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_split_apply.py +187 -0
  488. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_stat.py +397 -0
  489. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/__init__.py +16 -0
  490. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_align.py +100 -0
  491. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_base.py +2743 -0
  492. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_category.py +484 -0
  493. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_datetime.py +276 -0
  494. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_indexing.py +432 -0
  495. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_reindex.py +310 -0
  496. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_rename.py +257 -0
  497. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_reset_index.py +160 -0
  498. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_timedelta.py +128 -0
  499. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/io/__init__.py +16 -0
  500. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/io/test_io.py +137 -0
  501. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/__init__.py +16 -0
  502. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot.py +170 -0
  503. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot_matplotlib.py +547 -0
  504. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot_plotly.py +285 -0
  505. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot.py +106 -0
  506. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot_matplotlib.py +409 -0
  507. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot_plotly.py +247 -0
  508. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/__init__.py +16 -0
  509. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_all_any.py +105 -0
  510. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_arg_ops.py +197 -0
  511. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_as_of.py +137 -0
  512. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_as_type.py +227 -0
  513. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_compute.py +634 -0
  514. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_conversion.py +88 -0
  515. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_cumulative.py +139 -0
  516. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_index.py +475 -0
  517. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_missing_data.py +265 -0
  518. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_series.py +818 -0
  519. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_sort.py +162 -0
  520. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_stat.py +780 -0
  521. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_categorical.py +741 -0
  522. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_config.py +160 -0
  523. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_csv.py +453 -0
  524. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_dataframe_conversion.py +281 -0
  525. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_dataframe_spark_io.py +487 -0
  526. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_default_index.py +109 -0
  527. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ewm.py +434 -0
  528. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_expanding.py +253 -0
  529. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_extension.py +152 -0
  530. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_frame_spark.py +162 -0
  531. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_generic_functions.py +234 -0
  532. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_indexing.py +1339 -0
  533. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_indexops_spark.py +82 -0
  534. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_internal.py +124 -0
  535. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_namespace.py +638 -0
  536. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_numpy_compat.py +200 -0
  537. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames.py +1355 -0
  538. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py +655 -0
  539. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_expanding.py +113 -0
  540. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_rolling.py +118 -0
  541. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_repr.py +192 -0
  542. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_resample.py +346 -0
  543. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_reshape.py +495 -0
  544. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_rolling.py +263 -0
  545. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_scalars.py +59 -0
  546. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_conversion.py +85 -0
  547. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_datetime.py +364 -0
  548. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_string.py +362 -0
  549. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_spark_functions.py +46 -0
  550. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_sql.py +123 -0
  551. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_stats.py +581 -0
  552. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_typedef.py +447 -0
  553. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_utils.py +301 -0
  554. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_window.py +465 -0
  555. snowflake/snowpark_connect/includes/python/pyspark/pandas/typedef/__init__.py +18 -0
  556. snowflake/snowpark_connect/includes/python/pyspark/pandas/typedef/typehints.py +874 -0
  557. snowflake/snowpark_connect/includes/python/pyspark/pandas/usage_logging/__init__.py +143 -0
  558. snowflake/snowpark_connect/includes/python/pyspark/pandas/usage_logging/usage_logger.py +132 -0
  559. snowflake/snowpark_connect/includes/python/pyspark/pandas/utils.py +1063 -0
  560. snowflake/snowpark_connect/includes/python/pyspark/pandas/window.py +2702 -0
  561. snowflake/snowpark_connect/includes/python/pyspark/profiler.py +489 -0
  562. snowflake/snowpark_connect/includes/python/pyspark/py.typed +1 -0
  563. snowflake/snowpark_connect/includes/python/pyspark/python/pyspark/shell.py +123 -0
  564. snowflake/snowpark_connect/includes/python/pyspark/rdd.py +5518 -0
  565. snowflake/snowpark_connect/includes/python/pyspark/rddsampler.py +115 -0
  566. snowflake/snowpark_connect/includes/python/pyspark/resource/__init__.py +38 -0
  567. snowflake/snowpark_connect/includes/python/pyspark/resource/information.py +69 -0
  568. snowflake/snowpark_connect/includes/python/pyspark/resource/profile.py +317 -0
  569. snowflake/snowpark_connect/includes/python/pyspark/resource/requests.py +539 -0
  570. snowflake/snowpark_connect/includes/python/pyspark/resource/tests/__init__.py +16 -0
  571. snowflake/snowpark_connect/includes/python/pyspark/resource/tests/test_resources.py +83 -0
  572. snowflake/snowpark_connect/includes/python/pyspark/resultiterable.py +45 -0
  573. snowflake/snowpark_connect/includes/python/pyspark/serializers.py +681 -0
  574. snowflake/snowpark_connect/includes/python/pyspark/shell.py +123 -0
  575. snowflake/snowpark_connect/includes/python/pyspark/shuffle.py +854 -0
  576. snowflake/snowpark_connect/includes/python/pyspark/sql/__init__.py +75 -0
  577. snowflake/snowpark_connect/includes/python/pyspark/sql/_typing.pyi +80 -0
  578. snowflake/snowpark_connect/includes/python/pyspark/sql/avro/__init__.py +18 -0
  579. snowflake/snowpark_connect/includes/python/pyspark/sql/avro/functions.py +188 -0
  580. snowflake/snowpark_connect/includes/python/pyspark/sql/catalog.py +1270 -0
  581. snowflake/snowpark_connect/includes/python/pyspark/sql/column.py +1431 -0
  582. snowflake/snowpark_connect/includes/python/pyspark/sql/conf.py +99 -0
  583. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/__init__.py +18 -0
  584. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/_typing.py +90 -0
  585. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/avro/__init__.py +18 -0
  586. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/avro/functions.py +107 -0
  587. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/catalog.py +356 -0
  588. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/__init__.py +22 -0
  589. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/artifact.py +412 -0
  590. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/core.py +1689 -0
  591. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/reattach.py +340 -0
  592. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/column.py +514 -0
  593. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/conf.py +128 -0
  594. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/conversion.py +490 -0
  595. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/dataframe.py +2172 -0
  596. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/expressions.py +1056 -0
  597. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/functions.py +3937 -0
  598. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/group.py +418 -0
  599. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/plan.py +2289 -0
  600. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/__init__.py +25 -0
  601. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/base_pb2.py +203 -0
  602. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/base_pb2.pyi +2718 -0
  603. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/base_pb2_grpc.py +423 -0
  604. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/catalog_pb2.py +109 -0
  605. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/catalog_pb2.pyi +1130 -0
  606. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/commands_pb2.py +141 -0
  607. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/commands_pb2.pyi +1766 -0
  608. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/common_pb2.py +47 -0
  609. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/common_pb2.pyi +123 -0
  610. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/example_plugins_pb2.py +53 -0
  611. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/example_plugins_pb2.pyi +112 -0
  612. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/expressions_pb2.py +107 -0
  613. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/expressions_pb2.pyi +1507 -0
  614. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/relations_pb2.py +195 -0
  615. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/relations_pb2.pyi +3613 -0
  616. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/types_pb2.py +95 -0
  617. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/types_pb2.pyi +980 -0
  618. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/protobuf/__init__.py +18 -0
  619. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/protobuf/functions.py +166 -0
  620. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/readwriter.py +861 -0
  621. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/session.py +952 -0
  622. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/__init__.py +22 -0
  623. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/query.py +295 -0
  624. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/readwriter.py +618 -0
  625. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/__init__.py +18 -0
  626. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/foreach_batch_worker.py +87 -0
  627. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/listener_worker.py +100 -0
  628. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/types.py +301 -0
  629. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/udf.py +296 -0
  630. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/udtf.py +200 -0
  631. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/utils.py +58 -0
  632. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/window.py +266 -0
  633. snowflake/snowpark_connect/includes/python/pyspark/sql/context.py +818 -0
  634. snowflake/snowpark_connect/includes/python/pyspark/sql/dataframe.py +5973 -0
  635. snowflake/snowpark_connect/includes/python/pyspark/sql/functions.py +15889 -0
  636. snowflake/snowpark_connect/includes/python/pyspark/sql/group.py +547 -0
  637. snowflake/snowpark_connect/includes/python/pyspark/sql/observation.py +152 -0
  638. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/__init__.py +21 -0
  639. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/__init__.pyi +344 -0
  640. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/protocols/__init__.pyi +17 -0
  641. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/protocols/frame.pyi +20 -0
  642. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/protocols/series.pyi +20 -0
  643. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/conversion.py +671 -0
  644. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/functions.py +480 -0
  645. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/functions.pyi +132 -0
  646. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/group_ops.py +523 -0
  647. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/map_ops.py +216 -0
  648. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/serializers.py +1019 -0
  649. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/typehints.py +172 -0
  650. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/types.py +972 -0
  651. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/utils.py +86 -0
  652. snowflake/snowpark_connect/includes/python/pyspark/sql/protobuf/__init__.py +18 -0
  653. snowflake/snowpark_connect/includes/python/pyspark/sql/protobuf/functions.py +334 -0
  654. snowflake/snowpark_connect/includes/python/pyspark/sql/readwriter.py +2159 -0
  655. snowflake/snowpark_connect/includes/python/pyspark/sql/session.py +2088 -0
  656. snowflake/snowpark_connect/includes/python/pyspark/sql/sql_formatter.py +84 -0
  657. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/__init__.py +21 -0
  658. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/listener.py +1050 -0
  659. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/query.py +746 -0
  660. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/readwriter.py +1652 -0
  661. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/state.py +288 -0
  662. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/__init__.py +16 -0
  663. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/__init__.py +16 -0
  664. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/__init__.py +16 -0
  665. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/test_artifact.py +420 -0
  666. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/test_client.py +358 -0
  667. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/__init__.py +16 -0
  668. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_foreach.py +36 -0
  669. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_foreach_batch.py +44 -0
  670. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_listener.py +116 -0
  671. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_streaming.py +35 -0
  672. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_basic.py +3612 -0
  673. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_column.py +1042 -0
  674. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_function.py +2381 -0
  675. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_plan.py +1060 -0
  676. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow.py +163 -0
  677. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow_map.py +38 -0
  678. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow_python_udf.py +48 -0
  679. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_catalog.py +36 -0
  680. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_column.py +55 -0
  681. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_conf.py +36 -0
  682. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_dataframe.py +96 -0
  683. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_datasources.py +44 -0
  684. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_errors.py +36 -0
  685. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_functions.py +59 -0
  686. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_group.py +36 -0
  687. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_cogrouped_map.py +59 -0
  688. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_grouped_map.py +74 -0
  689. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_grouped_map_with_state.py +62 -0
  690. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_map.py +58 -0
  691. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf.py +70 -0
  692. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_grouped_agg.py +50 -0
  693. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_scalar.py +68 -0
  694. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_window.py +40 -0
  695. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_readwriter.py +46 -0
  696. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_serde.py +44 -0
  697. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_types.py +100 -0
  698. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_udf.py +100 -0
  699. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_udtf.py +163 -0
  700. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_session.py +181 -0
  701. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_utils.py +42 -0
  702. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/__init__.py +16 -0
  703. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_cogrouped_map.py +623 -0
  704. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_grouped_map.py +869 -0
  705. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_grouped_map_with_state.py +342 -0
  706. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_map.py +436 -0
  707. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf.py +363 -0
  708. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_grouped_agg.py +592 -0
  709. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_scalar.py +1503 -0
  710. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_typehints.py +392 -0
  711. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_typehints_with_future_annotations.py +375 -0
  712. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_window.py +411 -0
  713. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/__init__.py +16 -0
  714. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming.py +401 -0
  715. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_foreach.py +295 -0
  716. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_foreach_batch.py +106 -0
  717. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_listener.py +558 -0
  718. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow.py +1346 -0
  719. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow_map.py +182 -0
  720. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow_python_udf.py +202 -0
  721. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_catalog.py +503 -0
  722. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_column.py +225 -0
  723. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_conf.py +83 -0
  724. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_context.py +201 -0
  725. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_dataframe.py +1931 -0
  726. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_datasources.py +256 -0
  727. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_errors.py +69 -0
  728. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_functions.py +1349 -0
  729. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_group.py +53 -0
  730. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_pandas_sqlmetrics.py +68 -0
  731. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_readwriter.py +283 -0
  732. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_serde.py +155 -0
  733. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_session.py +412 -0
  734. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_types.py +1581 -0
  735. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udf.py +961 -0
  736. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udf_profiler.py +165 -0
  737. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udtf.py +1456 -0
  738. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_utils.py +1686 -0
  739. snowflake/snowpark_connect/includes/python/pyspark/sql/types.py +2558 -0
  740. snowflake/snowpark_connect/includes/python/pyspark/sql/udf.py +714 -0
  741. snowflake/snowpark_connect/includes/python/pyspark/sql/udtf.py +325 -0
  742. snowflake/snowpark_connect/includes/python/pyspark/sql/utils.py +339 -0
  743. snowflake/snowpark_connect/includes/python/pyspark/sql/window.py +492 -0
  744. snowflake/snowpark_connect/includes/python/pyspark/statcounter.py +165 -0
  745. snowflake/snowpark_connect/includes/python/pyspark/status.py +112 -0
  746. snowflake/snowpark_connect/includes/python/pyspark/storagelevel.py +97 -0
  747. snowflake/snowpark_connect/includes/python/pyspark/streaming/__init__.py +22 -0
  748. snowflake/snowpark_connect/includes/python/pyspark/streaming/context.py +471 -0
  749. snowflake/snowpark_connect/includes/python/pyspark/streaming/dstream.py +933 -0
  750. snowflake/snowpark_connect/includes/python/pyspark/streaming/kinesis.py +205 -0
  751. snowflake/snowpark_connect/includes/python/pyspark/streaming/listener.py +83 -0
  752. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/__init__.py +16 -0
  753. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_context.py +184 -0
  754. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_dstream.py +706 -0
  755. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_kinesis.py +118 -0
  756. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_listener.py +160 -0
  757. snowflake/snowpark_connect/includes/python/pyspark/streaming/util.py +168 -0
  758. snowflake/snowpark_connect/includes/python/pyspark/taskcontext.py +502 -0
  759. snowflake/snowpark_connect/includes/python/pyspark/testing/__init__.py +21 -0
  760. snowflake/snowpark_connect/includes/python/pyspark/testing/connectutils.py +199 -0
  761. snowflake/snowpark_connect/includes/python/pyspark/testing/mllibutils.py +30 -0
  762. snowflake/snowpark_connect/includes/python/pyspark/testing/mlutils.py +275 -0
  763. snowflake/snowpark_connect/includes/python/pyspark/testing/objects.py +121 -0
  764. snowflake/snowpark_connect/includes/python/pyspark/testing/pandasutils.py +714 -0
  765. snowflake/snowpark_connect/includes/python/pyspark/testing/sqlutils.py +168 -0
  766. snowflake/snowpark_connect/includes/python/pyspark/testing/streamingutils.py +178 -0
  767. snowflake/snowpark_connect/includes/python/pyspark/testing/utils.py +636 -0
  768. snowflake/snowpark_connect/includes/python/pyspark/tests/__init__.py +16 -0
  769. snowflake/snowpark_connect/includes/python/pyspark/tests/test_appsubmit.py +306 -0
  770. snowflake/snowpark_connect/includes/python/pyspark/tests/test_broadcast.py +196 -0
  771. snowflake/snowpark_connect/includes/python/pyspark/tests/test_conf.py +44 -0
  772. snowflake/snowpark_connect/includes/python/pyspark/tests/test_context.py +346 -0
  773. snowflake/snowpark_connect/includes/python/pyspark/tests/test_daemon.py +89 -0
  774. snowflake/snowpark_connect/includes/python/pyspark/tests/test_install_spark.py +124 -0
  775. snowflake/snowpark_connect/includes/python/pyspark/tests/test_join.py +69 -0
  776. snowflake/snowpark_connect/includes/python/pyspark/tests/test_memory_profiler.py +167 -0
  777. snowflake/snowpark_connect/includes/python/pyspark/tests/test_pin_thread.py +194 -0
  778. snowflake/snowpark_connect/includes/python/pyspark/tests/test_profiler.py +168 -0
  779. snowflake/snowpark_connect/includes/python/pyspark/tests/test_rdd.py +939 -0
  780. snowflake/snowpark_connect/includes/python/pyspark/tests/test_rddbarrier.py +52 -0
  781. snowflake/snowpark_connect/includes/python/pyspark/tests/test_rddsampler.py +66 -0
  782. snowflake/snowpark_connect/includes/python/pyspark/tests/test_readwrite.py +368 -0
  783. snowflake/snowpark_connect/includes/python/pyspark/tests/test_serializers.py +257 -0
  784. snowflake/snowpark_connect/includes/python/pyspark/tests/test_shuffle.py +267 -0
  785. snowflake/snowpark_connect/includes/python/pyspark/tests/test_stage_sched.py +153 -0
  786. snowflake/snowpark_connect/includes/python/pyspark/tests/test_statcounter.py +130 -0
  787. snowflake/snowpark_connect/includes/python/pyspark/tests/test_taskcontext.py +350 -0
  788. snowflake/snowpark_connect/includes/python/pyspark/tests/test_util.py +97 -0
  789. snowflake/snowpark_connect/includes/python/pyspark/tests/test_worker.py +271 -0
  790. snowflake/snowpark_connect/includes/python/pyspark/traceback_utils.py +81 -0
  791. snowflake/snowpark_connect/includes/python/pyspark/util.py +416 -0
  792. snowflake/snowpark_connect/includes/python/pyspark/version.py +19 -0
  793. snowflake/snowpark_connect/includes/python/pyspark/worker.py +1307 -0
  794. snowflake/snowpark_connect/includes/python/pyspark/worker_util.py +46 -0
  795. snowflake/snowpark_connect/proto/__init__.py +10 -0
  796. snowflake/snowpark_connect/proto/control_pb2.py +35 -0
  797. snowflake/snowpark_connect/proto/control_pb2.pyi +38 -0
  798. snowflake/snowpark_connect/proto/control_pb2_grpc.py +183 -0
  799. snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.py +35 -0
  800. snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.pyi +53 -0
  801. snowflake/snowpark_connect/proto/snowflake_rdd_pb2.pyi +39 -0
  802. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.py +47 -0
  803. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.pyi +111 -0
  804. snowflake/snowpark_connect/relation/__init__.py +3 -0
  805. snowflake/snowpark_connect/relation/catalogs/__init__.py +12 -0
  806. snowflake/snowpark_connect/relation/catalogs/abstract_spark_catalog.py +287 -0
  807. snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py +467 -0
  808. snowflake/snowpark_connect/relation/catalogs/utils.py +51 -0
  809. snowflake/snowpark_connect/relation/io_utils.py +76 -0
  810. snowflake/snowpark_connect/relation/map_aggregate.py +322 -0
  811. snowflake/snowpark_connect/relation/map_catalog.py +151 -0
  812. snowflake/snowpark_connect/relation/map_column_ops.py +1068 -0
  813. snowflake/snowpark_connect/relation/map_crosstab.py +48 -0
  814. snowflake/snowpark_connect/relation/map_extension.py +412 -0
  815. snowflake/snowpark_connect/relation/map_join.py +341 -0
  816. snowflake/snowpark_connect/relation/map_local_relation.py +326 -0
  817. snowflake/snowpark_connect/relation/map_map_partitions.py +146 -0
  818. snowflake/snowpark_connect/relation/map_relation.py +253 -0
  819. snowflake/snowpark_connect/relation/map_row_ops.py +716 -0
  820. snowflake/snowpark_connect/relation/map_sample_by.py +35 -0
  821. snowflake/snowpark_connect/relation/map_show_string.py +50 -0
  822. snowflake/snowpark_connect/relation/map_sql.py +1874 -0
  823. snowflake/snowpark_connect/relation/map_stats.py +324 -0
  824. snowflake/snowpark_connect/relation/map_subquery_alias.py +32 -0
  825. snowflake/snowpark_connect/relation/map_udtf.py +288 -0
  826. snowflake/snowpark_connect/relation/read/__init__.py +7 -0
  827. snowflake/snowpark_connect/relation/read/jdbc_read_dbapi.py +668 -0
  828. snowflake/snowpark_connect/relation/read/map_read.py +367 -0
  829. snowflake/snowpark_connect/relation/read/map_read_csv.py +142 -0
  830. snowflake/snowpark_connect/relation/read/map_read_jdbc.py +108 -0
  831. snowflake/snowpark_connect/relation/read/map_read_json.py +344 -0
  832. snowflake/snowpark_connect/relation/read/map_read_parquet.py +194 -0
  833. snowflake/snowpark_connect/relation/read/map_read_socket.py +59 -0
  834. snowflake/snowpark_connect/relation/read/map_read_table.py +109 -0
  835. snowflake/snowpark_connect/relation/read/map_read_text.py +106 -0
  836. snowflake/snowpark_connect/relation/read/reader_config.py +399 -0
  837. snowflake/snowpark_connect/relation/read/utils.py +155 -0
  838. snowflake/snowpark_connect/relation/stage_locator.py +161 -0
  839. snowflake/snowpark_connect/relation/utils.py +219 -0
  840. snowflake/snowpark_connect/relation/write/__init__.py +3 -0
  841. snowflake/snowpark_connect/relation/write/jdbc_write_dbapi.py +339 -0
  842. snowflake/snowpark_connect/relation/write/map_write.py +436 -0
  843. snowflake/snowpark_connect/relation/write/map_write_jdbc.py +48 -0
  844. snowflake/snowpark_connect/resources/java_udfs-1.0-SNAPSHOT.jar +0 -0
  845. snowflake/snowpark_connect/resources_initializer.py +75 -0
  846. snowflake/snowpark_connect/server.py +1136 -0
  847. snowflake/snowpark_connect/start_server.py +32 -0
  848. snowflake/snowpark_connect/tcm.py +8 -0
  849. snowflake/snowpark_connect/type_mapping.py +1003 -0
  850. snowflake/snowpark_connect/typed_column.py +94 -0
  851. snowflake/snowpark_connect/utils/__init__.py +3 -0
  852. snowflake/snowpark_connect/utils/artifacts.py +48 -0
  853. snowflake/snowpark_connect/utils/attribute_handling.py +72 -0
  854. snowflake/snowpark_connect/utils/cache.py +84 -0
  855. snowflake/snowpark_connect/utils/concurrent.py +124 -0
  856. snowflake/snowpark_connect/utils/context.py +390 -0
  857. snowflake/snowpark_connect/utils/describe_query_cache.py +231 -0
  858. snowflake/snowpark_connect/utils/interrupt.py +85 -0
  859. snowflake/snowpark_connect/utils/io_utils.py +35 -0
  860. snowflake/snowpark_connect/utils/pandas_udtf_utils.py +117 -0
  861. snowflake/snowpark_connect/utils/profiling.py +47 -0
  862. snowflake/snowpark_connect/utils/session.py +180 -0
  863. snowflake/snowpark_connect/utils/snowpark_connect_logging.py +38 -0
  864. snowflake/snowpark_connect/utils/telemetry.py +513 -0
  865. snowflake/snowpark_connect/utils/udf_cache.py +392 -0
  866. snowflake/snowpark_connect/utils/udf_helper.py +328 -0
  867. snowflake/snowpark_connect/utils/udf_utils.py +310 -0
  868. snowflake/snowpark_connect/utils/udtf_helper.py +420 -0
  869. snowflake/snowpark_connect/utils/udtf_utils.py +799 -0
  870. snowflake/snowpark_connect/utils/xxhash64.py +247 -0
  871. snowflake/snowpark_connect/version.py +6 -0
  872. snowpark_connect-0.20.2.data/scripts/snowpark-connect +71 -0
  873. snowpark_connect-0.20.2.data/scripts/snowpark-session +11 -0
  874. snowpark_connect-0.20.2.data/scripts/snowpark-submit +354 -0
  875. snowpark_connect-0.20.2.dist-info/METADATA +37 -0
  876. snowpark_connect-0.20.2.dist-info/RECORD +879 -0
  877. snowpark_connect-0.20.2.dist-info/WHEEL +5 -0
  878. snowpark_connect-0.20.2.dist-info/licenses/LICENSE.txt +202 -0
  879. snowpark_connect-0.20.2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,3560 @@
1
+ #
2
+ # Licensed to the Apache Software Foundation (ASF) under one or more
3
+ # contributor license agreements. See the NOTICE file distributed with
4
+ # this work for additional information regarding copyright ownership.
5
+ # The ASF licenses this file to You under the Apache License, Version 2.0
6
+ # (the "License"); you may not use this file except in compliance with
7
+ # the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ #
17
+
18
+ """
19
+ A base class of DataFrame/Column to behave like pandas DataFrame/Series.
20
+ """
21
+ from abc import ABCMeta, abstractmethod
22
+ from collections import Counter
23
+ from functools import reduce
24
+ from typing import (
25
+ Any,
26
+ Callable,
27
+ Dict,
28
+ Iterable,
29
+ IO,
30
+ List,
31
+ Optional,
32
+ NoReturn,
33
+ Tuple,
34
+ Union,
35
+ TYPE_CHECKING,
36
+ cast,
37
+ )
38
+ import warnings
39
+
40
+ import numpy as np
41
+ import pandas as pd
42
+ from pandas.api.types import is_list_like # type: ignore[attr-defined]
43
+
44
+ from pyspark.sql import Column, functions as F
45
+ from pyspark.sql.types import (
46
+ BooleanType,
47
+ DoubleType,
48
+ LongType,
49
+ NumericType,
50
+ )
51
+
52
+ from pyspark import pandas as ps # For running doctests and reference resolution in PyCharm.
53
+ from pyspark.pandas._typing import (
54
+ Axis,
55
+ DataFrameOrSeries,
56
+ Dtype,
57
+ FrameLike,
58
+ Label,
59
+ Name,
60
+ Scalar,
61
+ )
62
+ from pyspark.pandas.indexing import AtIndexer, iAtIndexer, iLocIndexer, LocIndexer
63
+ from pyspark.pandas.internal import InternalFrame
64
+ from pyspark.pandas.spark import functions as SF
65
+ from pyspark.pandas.typedef import spark_type_to_pandas_dtype
66
+ from pyspark.pandas.utils import (
67
+ is_name_like_tuple,
68
+ is_name_like_value,
69
+ name_like_string,
70
+ scol_for,
71
+ sql_conf,
72
+ validate_arguments_and_invoke_function,
73
+ validate_axis,
74
+ validate_mode,
75
+ SPARK_CONF_ARROW_ENABLED,
76
+ log_advice,
77
+ )
78
+
79
+ if TYPE_CHECKING:
80
+ from pyspark.pandas.frame import DataFrame
81
+ from pyspark.pandas.indexes.base import Index
82
+ from pyspark.pandas.groupby import GroupBy
83
+ from pyspark.pandas.series import Series
84
+ from pyspark.pandas.window import Rolling, Expanding, ExponentialMoving
85
+
86
+
87
+ bool_type = bool
88
+
89
+
90
+ class Frame(object, metaclass=ABCMeta):
91
+ """
92
+ The base class for both DataFrame and Series.
93
+ """
94
+
95
+ @abstractmethod
96
+ def __getitem__(self, key: Any) -> Any:
97
+ pass
98
+
99
+ @property
100
+ @abstractmethod
101
+ def _internal(self) -> InternalFrame:
102
+ pass
103
+
104
+ @abstractmethod
105
+ def _apply_series_op(
106
+ self: FrameLike,
107
+ op: Callable[["Series"], Union["Series", Column]],
108
+ should_resolve: bool = False,
109
+ ) -> FrameLike:
110
+ pass
111
+
112
+ @abstractmethod
113
+ def _reduce_for_stat_function(
114
+ self,
115
+ sfun: Callable[["Series"], Column],
116
+ name: str,
117
+ axis: Optional[Axis] = None,
118
+ numeric_only: bool = True,
119
+ skipna: bool = True,
120
+ **kwargs: Any,
121
+ ) -> Union["Series", Scalar]:
122
+ pass
123
+
124
+ @property
125
+ @abstractmethod
126
+ def dtypes(self) -> Union[pd.Series, Dtype]:
127
+ pass
128
+
129
+ @abstractmethod
130
+ def to_pandas(self) -> Union[pd.DataFrame, pd.Series]:
131
+ pass
132
+
133
+ @abstractmethod
134
+ def _to_pandas(self) -> Union[pd.DataFrame, pd.Series]:
135
+ pass
136
+
137
+ @property
138
+ @abstractmethod
139
+ def index(self) -> "Index":
140
+ pass
141
+
142
+ @abstractmethod
143
+ def copy(self: FrameLike) -> FrameLike:
144
+ pass
145
+
146
+ @abstractmethod
147
+ def _to_internal_pandas(self) -> Union[pd.DataFrame, pd.Series]:
148
+ pass
149
+
150
+ @abstractmethod
151
+ def head(self: FrameLike, n: int = 5) -> FrameLike:
152
+ pass
153
+
154
+ # TODO: add 'axis' parameter
155
+ def cummin(self: FrameLike, skipna: bool = True) -> FrameLike:
156
+ """
157
+ Return cumulative minimum over a DataFrame or Series axis.
158
+
159
+ Returns a DataFrame or Series of the same size containing the cumulative minimum.
160
+
161
+ .. note:: the current implementation of cummin uses Spark's Window without
162
+ specifying partition specification. This leads to moving all data into a
163
+ single partition in a single machine and could cause serious
164
+ performance degradation. Avoid this method with very large datasets.
165
+
166
+ Parameters
167
+ ----------
168
+ skipna: boolean, default True
169
+ Exclude NA/null values. If an entire row/column is NA, the result will be NA.
170
+
171
+ Returns
172
+ -------
173
+ DataFrame or Series
174
+
175
+ See Also
176
+ --------
177
+ DataFrame.min: Return the minimum over DataFrame axis.
178
+ DataFrame.cummax: Return cumulative maximum over DataFrame axis.
179
+ DataFrame.cummin: Return cumulative minimum over DataFrame axis.
180
+ DataFrame.cumsum: Return cumulative sum over DataFrame axis.
181
+ Series.min: Return the minimum over Series axis.
182
+ Series.cummax: Return cumulative maximum over Series axis.
183
+ Series.cummin: Return cumulative minimum over Series axis.
184
+ Series.cumsum: Return cumulative sum over Series axis.
185
+ Series.cumprod: Return cumulative product over Series axis.
186
+
187
+ Examples
188
+ --------
189
+ >>> df = ps.DataFrame([[2.0, 1.0], [3.0, None], [1.0, 0.0]], columns=list('AB'))
190
+ >>> df
191
+ A B
192
+ 0 2.0 1.0
193
+ 1 3.0 NaN
194
+ 2 1.0 0.0
195
+
196
+ By default, iterates over rows and finds the minimum in each column.
197
+
198
+ >>> df.cummin()
199
+ A B
200
+ 0 2.0 1.0
201
+ 1 2.0 NaN
202
+ 2 1.0 0.0
203
+
204
+ It works identically in Series.
205
+
206
+ >>> df.A.cummin()
207
+ 0 2.0
208
+ 1 2.0
209
+ 2 1.0
210
+ Name: A, dtype: float64
211
+ """
212
+ return self._apply_series_op(lambda psser: psser._cum(F.min, skipna), should_resolve=True)
213
+
214
+ # TODO: add 'axis' parameter
215
+ def cummax(self: FrameLike, skipna: bool = True) -> FrameLike:
216
+ """
217
+ Return cumulative maximum over a DataFrame or Series axis.
218
+
219
+ Returns a DataFrame or Series of the same size containing the cumulative maximum.
220
+
221
+ .. note:: the current implementation of cummax uses Spark's Window without
222
+ specifying partition specification. This leads to moving all data into a
223
+ single partition in a single machine and could cause serious
224
+ performance degradation. Avoid this method with very large datasets.
225
+
226
+ Parameters
227
+ ----------
228
+ skipna: boolean, default True
229
+ Exclude NA/null values. If an entire row/column is NA, the result will be NA.
230
+
231
+ Returns
232
+ -------
233
+ DataFrame or Series
234
+
235
+ See Also
236
+ --------
237
+ DataFrame.max: Return the maximum over DataFrame axis.
238
+ DataFrame.cummax: Return cumulative maximum over DataFrame axis.
239
+ DataFrame.cummin: Return cumulative minimum over DataFrame axis.
240
+ DataFrame.cumsum: Return cumulative sum over DataFrame axis.
241
+ DataFrame.cumprod: Return cumulative product over DataFrame axis.
242
+ Series.max: Return the maximum over Series axis.
243
+ Series.cummax: Return cumulative maximum over Series axis.
244
+ Series.cummin: Return cumulative minimum over Series axis.
245
+ Series.cumsum: Return cumulative sum over Series axis.
246
+ Series.cumprod: Return cumulative product over Series axis.
247
+
248
+ Examples
249
+ --------
250
+ >>> df = ps.DataFrame([[2.0, 1.0], [3.0, None], [1.0, 0.0]], columns=list('AB'))
251
+ >>> df
252
+ A B
253
+ 0 2.0 1.0
254
+ 1 3.0 NaN
255
+ 2 1.0 0.0
256
+
257
+ By default, iterates over rows and finds the maximum in each column.
258
+
259
+ >>> df.cummax()
260
+ A B
261
+ 0 2.0 1.0
262
+ 1 3.0 NaN
263
+ 2 3.0 1.0
264
+
265
+ It works identically in Series.
266
+
267
+ >>> df.B.cummax()
268
+ 0 1.0
269
+ 1 NaN
270
+ 2 1.0
271
+ Name: B, dtype: float64
272
+ """
273
+ return self._apply_series_op(lambda psser: psser._cum(F.max, skipna), should_resolve=True)
274
+
275
+ # TODO: add 'axis' parameter
276
+ def cumsum(self: FrameLike, skipna: bool = True) -> FrameLike:
277
+ """
278
+ Return cumulative sum over a DataFrame or Series axis.
279
+
280
+ Returns a DataFrame or Series of the same size containing the cumulative sum.
281
+
282
+ .. note:: the current implementation of cumsum uses Spark's Window without
283
+ specifying partition specification. This leads to moving all data into a
284
+ single partition in a single machine and could cause serious
285
+ performance degradation. Avoid this method with very large datasets.
286
+
287
+ Parameters
288
+ ----------
289
+ skipna: boolean, default True
290
+ Exclude NA/null values. If an entire row/column is NA, the result will be NA.
291
+
292
+ Returns
293
+ -------
294
+ DataFrame or Series
295
+
296
+ See Also
297
+ --------
298
+ DataFrame.sum: Return the sum over DataFrame axis.
299
+ DataFrame.cummax: Return cumulative maximum over DataFrame axis.
300
+ DataFrame.cummin: Return cumulative minimum over DataFrame axis.
301
+ DataFrame.cumsum: Return cumulative sum over DataFrame axis.
302
+ DataFrame.cumprod: Return cumulative product over DataFrame axis.
303
+ Series.sum: Return the sum over Series axis.
304
+ Series.cummax: Return cumulative maximum over Series axis.
305
+ Series.cummin: Return cumulative minimum over Series axis.
306
+ Series.cumsum: Return cumulative sum over Series axis.
307
+ Series.cumprod: Return cumulative product over Series axis.
308
+
309
+ Examples
310
+ --------
311
+ >>> df = ps.DataFrame([[2.0, 1.0], [3.0, None], [1.0, 0.0]], columns=list('AB'))
312
+ >>> df
313
+ A B
314
+ 0 2.0 1.0
315
+ 1 3.0 NaN
316
+ 2 1.0 0.0
317
+
318
+ By default, iterates over rows and finds the sum in each column.
319
+
320
+ >>> df.cumsum()
321
+ A B
322
+ 0 2.0 1.0
323
+ 1 5.0 NaN
324
+ 2 6.0 1.0
325
+
326
+ It works identically in Series.
327
+
328
+ >>> df.A.cumsum()
329
+ 0 2.0
330
+ 1 5.0
331
+ 2 6.0
332
+ Name: A, dtype: float64
333
+ """
334
+ return self._apply_series_op(lambda psser: psser._cumsum(skipna), should_resolve=True)
335
+
336
+ # TODO: add 'axis' parameter
337
+ # TODO: use pandas_udf to support negative values and other options later
338
+ # other window except unbounded ones is supported as of Spark 3.0.
339
+ def cumprod(self: FrameLike, skipna: bool = True) -> FrameLike:
340
+ """
341
+ Return cumulative product over a DataFrame or Series axis.
342
+
343
+ Returns a DataFrame or Series of the same size containing the cumulative product.
344
+
345
+ .. note:: the current implementation of cumprod uses Spark's Window without
346
+ specifying partition specification. This leads to moving all data into a
347
+ single partition in a single machine and could cause serious
348
+ performance degradation. Avoid this method with very large datasets.
349
+
350
+ .. note:: unlike pandas', pandas-on-Spark emulates cumulative product by the
351
+ ``exp(sum(log(...)))`` trick. Therefore, it only works for positive numbers.
352
+
353
+ Parameters
354
+ ----------
355
+ skipna: boolean, default True
356
+ Exclude NA/null values. If an entire row/column is NA, the result will be NA.
357
+
358
+ Returns
359
+ -------
360
+ DataFrame or Series
361
+
362
+ See Also
363
+ --------
364
+ DataFrame.cummax: Return cumulative maximum over DataFrame axis.
365
+ DataFrame.cummin: Return cumulative minimum over DataFrame axis.
366
+ DataFrame.cumsum: Return cumulative sum over DataFrame axis.
367
+ DataFrame.cumprod: Return cumulative product over DataFrame axis.
368
+ Series.cummax: Return cumulative maximum over Series axis.
369
+ Series.cummin: Return cumulative minimum over Series axis.
370
+ Series.cumsum: Return cumulative sum over Series axis.
371
+ Series.cumprod: Return cumulative product over Series axis.
372
+
373
+ Raises
374
+ ------
375
+ Exception: If any value is equal to or lower than 0.
376
+
377
+ Examples
378
+ --------
379
+ >>> df = ps.DataFrame([[2.0, 1.0], [3.0, None], [4.0, 10.0]], columns=list('AB'))
380
+ >>> df
381
+ A B
382
+ 0 2.0 1.0
383
+ 1 3.0 NaN
384
+ 2 4.0 10.0
385
+
386
+ By default, iterates over rows and finds the product in each column.
387
+
388
+ >>> df.cumprod()
389
+ A B
390
+ 0 2.0 1.0
391
+ 1 6.0 NaN
392
+ 2 24.0 10.0
393
+
394
+ It works identically in Series.
395
+
396
+ >>> df.A.cumprod()
397
+ 0 2.0
398
+ 1 6.0
399
+ 2 24.0
400
+ Name: A, dtype: float64
401
+ """
402
+ return self._apply_series_op(lambda psser: psser._cumprod(skipna), should_resolve=True)
403
+
404
+ # TODO: Although this was removed in pandas >= 1.0.0, we're keeping it as deprecated
405
+ # since we're using this for `DataFrame.info` internally.
406
+ # We can drop it once our minimal pandas version becomes 1.0.0.
407
+ def get_dtype_counts(self) -> pd.Series:
408
+ """
409
+ Return counts of unique dtypes in this object.
410
+
411
+ .. deprecated:: 0.14.0
412
+
413
+ Returns
414
+ -------
415
+ dtype: pd.Series
416
+ Series with the count of columns with each dtype.
417
+
418
+ See Also
419
+ --------
420
+ dtypes: Return the dtypes in this object.
421
+
422
+ Examples
423
+ --------
424
+ >>> a = [['a', 1, 1], ['b', 2, 2], ['c', 3, 3]]
425
+ >>> df = ps.DataFrame(a, columns=['str', 'int1', 'int2'])
426
+ >>> df
427
+ str int1 int2
428
+ 0 a 1 1
429
+ 1 b 2 2
430
+ 2 c 3 3
431
+
432
+ >>> df.get_dtype_counts().sort_values()
433
+ object 1
434
+ int64 2
435
+ dtype: int64
436
+
437
+ >>> df.str.get_dtype_counts().sort_values()
438
+ object 1
439
+ dtype: int64
440
+ """
441
+ warnings.warn(
442
+ "`get_dtype_counts` has been deprecated and will be "
443
+ "removed in a future version. For DataFrames use "
444
+ "`.dtypes.value_counts()`",
445
+ FutureWarning,
446
+ )
447
+ if not isinstance(self.dtypes, Iterable):
448
+ dtypes = [self.dtypes]
449
+ else:
450
+ dtypes = list(self.dtypes)
451
+ return pd.Series(dict(Counter([d.name for d in dtypes])))
452
+
453
+ def pipe(self, func: Callable[..., Any], *args: Any, **kwargs: Any) -> Any:
454
+ r"""
455
+ Apply func(self, \*args, \*\*kwargs).
456
+
457
+ Parameters
458
+ ----------
459
+ func: function
460
+ function to apply to the DataFrame.
461
+ ``args``, and ``kwargs`` are passed into ``func``.
462
+ Alternatively a ``(callable, data_keyword)`` tuple where
463
+ ``data_keyword`` is a string indicating the keyword of
464
+ ``callable`` that expects the DataFrames.
465
+ args: iterable, optional
466
+ positional arguments passed into ``func``.
467
+ kwargs: mapping, optional
468
+ a dictionary of keyword arguments passed into ``func``.
469
+
470
+ Returns
471
+ -------
472
+ object: the return type of ``func``.
473
+
474
+ Notes
475
+ -----
476
+ Use ``.pipe`` when chaining together functions that expect
477
+ Series, DataFrames or GroupBy objects. For example, given
478
+
479
+ >>> df = ps.DataFrame({'category': ['A', 'A', 'B'],
480
+ ... 'col1': [1, 2, 3],
481
+ ... 'col2': [4, 5, 6]},
482
+ ... columns=['category', 'col1', 'col2'])
483
+ >>> def keep_category_a(df):
484
+ ... return df[df['category'] == 'A']
485
+ >>> def add_one(df, column):
486
+ ... return df.assign(col3=df[column] + 1)
487
+ >>> def multiply(df, column1, column2):
488
+ ... return df.assign(col4=df[column1] * df[column2])
489
+
490
+
491
+ instead of writing
492
+
493
+ >>> multiply(add_one(keep_category_a(df), column="col1"), column1="col2", column2="col3")
494
+ category col1 col2 col3 col4
495
+ 0 A 1 4 2 8
496
+ 1 A 2 5 3 15
497
+
498
+
499
+ You can write
500
+
501
+ >>> (df.pipe(keep_category_a)
502
+ ... .pipe(add_one, column="col1")
503
+ ... .pipe(multiply, column1="col2", column2="col3")
504
+ ... )
505
+ category col1 col2 col3 col4
506
+ 0 A 1 4 2 8
507
+ 1 A 2 5 3 15
508
+
509
+
510
+ If you have a function that takes the data as the second
511
+ argument, pass a tuple indicating which keyword expects the
512
+ data. For example, suppose ``f`` takes its data as ``df``:
513
+
514
+ >>> def multiply_2(column1, df, column2):
515
+ ... return df.assign(col4=df[column1] * df[column2])
516
+
517
+
518
+ Then you can write
519
+
520
+ >>> (df.pipe(keep_category_a)
521
+ ... .pipe(add_one, column="col1")
522
+ ... .pipe((multiply_2, 'df'), column1="col2", column2="col3")
523
+ ... )
524
+ category col1 col2 col3 col4
525
+ 0 A 1 4 2 8
526
+ 1 A 2 5 3 15
527
+
528
+ You can use lambda as well
529
+
530
+ >>> ps.Series([1, 2, 3]).pipe(lambda x: (x + 1).rename("value"))
531
+ 0 2
532
+ 1 3
533
+ 2 4
534
+ Name: value, dtype: int64
535
+ """
536
+
537
+ if isinstance(func, tuple):
538
+ func, target = func
539
+ if target in kwargs:
540
+ raise ValueError("%s is both the pipe target and a keyword " "argument" % target)
541
+ kwargs[target] = self
542
+ return func(*args, **kwargs)
543
+ else:
544
+ return func(self, *args, **kwargs)
545
+
546
+ def to_numpy(self) -> np.ndarray:
547
+ """
548
+ A NumPy ndarray representing the values in this DataFrame or Series.
549
+
550
+ .. note:: This method should only be used if the resulting NumPy ndarray is expected
551
+ to be small, as all the data is loaded into the driver's memory.
552
+
553
+ Returns
554
+ -------
555
+ numpy.ndarray
556
+
557
+ Examples
558
+ --------
559
+ >>> ps.DataFrame({"A": [1, 2], "B": [3, 4]}).to_numpy()
560
+ array([[1, 3],
561
+ [2, 4]])
562
+
563
+ With heterogeneous data, the lowest common type will have to be used.
564
+
565
+ >>> ps.DataFrame({"A": [1, 2], "B": [3.0, 4.5]}).to_numpy()
566
+ array([[1. , 3. ],
567
+ [2. , 4.5]])
568
+
569
+ For a mix of numeric and non-numeric types, the output array will have object dtype.
570
+
571
+ >>> df = ps.DataFrame({"A": [1, 2], "B": [3.0, 4.5], "C": pd.date_range('2000', periods=2)})
572
+ >>> df.to_numpy()
573
+ array([[1, 3.0, Timestamp('2000-01-01 00:00:00')],
574
+ [2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object)
575
+
576
+ For Series,
577
+
578
+ >>> ps.Series(['a', 'b', 'a']).to_numpy()
579
+ array(['a', 'b', 'a'], dtype=object)
580
+ """
581
+ log_advice(
582
+ "`to_numpy` loads all data into the driver's memory. "
583
+ "It should only be used if the resulting NumPy ndarray is expected to be small."
584
+ )
585
+ return cast(np.ndarray, self._to_pandas().values)
586
+
587
+ @property
588
+ def values(self) -> np.ndarray:
589
+ """
590
+ Return a Numpy representation of the DataFrame or the Series.
591
+
592
+ .. warning:: We recommend using `DataFrame.to_numpy()` or `Series.to_numpy()` instead.
593
+
594
+ .. note:: This method should only be used if the resulting NumPy ndarray is expected
595
+ to be small, as all the data is loaded into the driver's memory.
596
+
597
+ Returns
598
+ -------
599
+ numpy.ndarray
600
+
601
+ Examples
602
+ --------
603
+ A DataFrame where all columns are the same type (e.g., int64) results in an array of
604
+ the same type.
605
+
606
+ >>> df = ps.DataFrame({'age': [ 3, 29],
607
+ ... 'height': [94, 170],
608
+ ... 'weight': [31, 115]})
609
+ >>> df
610
+ age height weight
611
+ 0 3 94 31
612
+ 1 29 170 115
613
+ >>> df.dtypes
614
+ age int64
615
+ height int64
616
+ weight int64
617
+ dtype: object
618
+ >>> df.values
619
+ array([[ 3, 94, 31],
620
+ [ 29, 170, 115]])
621
+
622
+ A DataFrame with mixed type columns (e.g., str/object, int64, float32) results in an ndarray
623
+ of the broadest type that accommodates these mixed types (e.g., object).
624
+
625
+ >>> df2 = ps.DataFrame([('parrot', 24.0, 'second'),
626
+ ... ('lion', 80.5, 'first'),
627
+ ... ('monkey', np.nan, None)],
628
+ ... columns=('name', 'max_speed', 'rank'))
629
+ >>> df2.dtypes
630
+ name object
631
+ max_speed float64
632
+ rank object
633
+ dtype: object
634
+ >>> df2.values
635
+ array([['parrot', 24.0, 'second'],
636
+ ['lion', 80.5, 'first'],
637
+ ['monkey', nan, None]], dtype=object)
638
+
639
+ For Series,
640
+
641
+ >>> ps.Series([1, 2, 3]).values
642
+ array([1, 2, 3])
643
+
644
+ >>> ps.Series(list('aabc')).values
645
+ array(['a', 'a', 'b', 'c'], dtype=object)
646
+ """
647
+ warnings.warn("We recommend using `{}.to_numpy()` instead.".format(type(self).__name__))
648
+ return self.to_numpy()
649
+
650
+ def to_csv(
651
+ self,
652
+ path: Optional[str] = None,
653
+ sep: str = ",",
654
+ na_rep: str = "",
655
+ columns: Optional[List[Name]] = None,
656
+ header: bool = True,
657
+ quotechar: str = '"',
658
+ date_format: Optional[str] = None,
659
+ escapechar: Optional[str] = None,
660
+ num_files: Optional[int] = None,
661
+ mode: str = "w",
662
+ partition_cols: Optional[Union[str, List[str]]] = None,
663
+ index_col: Optional[Union[str, List[str]]] = None,
664
+ **options: Any,
665
+ ) -> Optional[str]:
666
+ r"""
667
+ Write object to a comma-separated values (csv) file.
668
+
669
+ .. note:: pandas-on-Spark `to_csv` writes files to a path or URI. Unlike pandas',
670
+ pandas-on-Spark respects HDFS properties such as 'fs.default.name'.
671
+
672
+ .. note:: pandas-on-Spark writes CSV files into the directory, `path`, and writes
673
+ multiple `part-...` files in the directory when `path` is specified.
674
+ This behavior was inherited from Apache Spark. The number of partitions can
675
+ be controlled by `num_files`. This is deprecated.
676
+ Use `DataFrame.spark.repartition` instead.
677
+
678
+ Parameters
679
+ ----------
680
+ path: str, default None
681
+ File path. If None is provided the result is returned as a string.
682
+ sep: str, default ','
683
+ String of length 1. Field delimiter for the output file.
684
+ na_rep: str, default ''
685
+ Missing data representation.
686
+ columns: sequence, optional
687
+ Columns to write.
688
+ header: bool or list of str, default True
689
+ Write out the column names. If a list of strings is given it is
690
+ assumed to be aliases for the column names.
691
+ quotechar: str, default '\"'
692
+ String of length 1. Character used to quote fields.
693
+ date_format: str, default None
694
+ Format string for datetime objects.
695
+ escapechar: str, default None
696
+ String of length 1. Character used to escape `sep` and `quotechar`
697
+ when appropriate.
698
+ num_files: the number of partitions to be written in `path` directory when
699
+ this is a path. This is deprecated. Use `DataFrame.spark.repartition` instead.
700
+ mode: str
701
+ Python write mode, default 'w'.
702
+
703
+ .. note:: mode can accept the strings for Spark writing mode.
704
+ Such as 'append', 'overwrite', 'ignore', 'error', 'errorifexists'.
705
+
706
+ - 'append' (equivalent to 'a'): Append the new data to existing data.
707
+ - 'overwrite' (equivalent to 'w'): Overwrite existing data.
708
+ - 'ignore': Silently ignore this operation if data already exists.
709
+ - 'error' or 'errorifexists': Throw an exception if data already exists.
710
+
711
+ partition_cols: str or list of str, optional, default None
712
+ Names of partitioning columns
713
+ index_col: str or list of str, optional, default: None
714
+ Column names to be used in Spark to represent pandas-on-Spark's index. The index name
715
+ in pandas-on-Spark is ignored. By default, the index is always lost.
716
+ options: keyword arguments for additional options specific to PySpark.
717
+ These kwargs are specific to PySpark's CSV options to pass. Check
718
+ the options in PySpark's API documentation for spark.write.csv(...).
719
+ It has higher priority and overwrites all other options.
720
+ This parameter only works when `path` is specified.
721
+
722
+ Returns
723
+ -------
724
+ str or None
725
+
726
+ See Also
727
+ --------
728
+ read_csv
729
+ DataFrame.to_delta
730
+ DataFrame.to_table
731
+ DataFrame.to_parquet
732
+ DataFrame.to_spark_io
733
+
734
+ Examples
735
+ --------
736
+ >>> df = ps.DataFrame(dict(
737
+ ... date=list(pd.date_range('2012-1-1 12:00:00', periods=3, freq='M')),
738
+ ... country=['KR', 'US', 'JP'],
739
+ ... code=[1, 2 ,3]), columns=['date', 'country', 'code'])
740
+ >>> df.sort_values(by="date") # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
741
+ date country code
742
+ ... 2012-01-31 12:00:00 KR 1
743
+ ... 2012-02-29 12:00:00 US 2
744
+ ... 2012-03-31 12:00:00 JP 3
745
+
746
+ >>> print(df.to_csv()) # doctest: +NORMALIZE_WHITESPACE
747
+ date,country,code
748
+ 2012-01-31 12:00:00,KR,1
749
+ 2012-02-29 12:00:00,US,2
750
+ 2012-03-31 12:00:00,JP,3
751
+
752
+ >>> df.cummax().to_csv(path=r'%s/to_csv/foo.csv' % path, num_files=1)
753
+ >>> ps.read_csv(
754
+ ... path=r'%s/to_csv/foo.csv' % path
755
+ ... ).sort_values(by="date") # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
756
+ date country code
757
+ ... 2012-01-31 12:00:00 KR 1
758
+ ... 2012-02-29 12:00:00 US 2
759
+ ... 2012-03-31 12:00:00 US 3
760
+
761
+ In case of Series,
762
+
763
+ >>> print(df.date.to_csv()) # doctest: +NORMALIZE_WHITESPACE
764
+ date
765
+ 2012-01-31 12:00:00
766
+ 2012-02-29 12:00:00
767
+ 2012-03-31 12:00:00
768
+
769
+ >>> df.date.to_csv(path=r'%s/to_csv/foo.csv' % path, num_files=1)
770
+ >>> ps.read_csv(
771
+ ... path=r'%s/to_csv/foo.csv' % path
772
+ ... ).sort_values(by="date") # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
773
+ date
774
+ ... 2012-01-31 12:00:00
775
+ ... 2012-02-29 12:00:00
776
+ ... 2012-03-31 12:00:00
777
+
778
+ You can preserve the index in the roundtrip as below.
779
+
780
+ >>> df.set_index("country", append=True, inplace=True)
781
+ >>> df.date.to_csv(
782
+ ... path=r'%s/to_csv/bar.csv' % path,
783
+ ... num_files=1,
784
+ ... index_col=["index1", "index2"])
785
+ >>> ps.read_csv(
786
+ ... path=r'%s/to_csv/bar.csv' % path, index_col=["index1", "index2"]
787
+ ... ).sort_values(by="date") # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
788
+ date
789
+ index1 index2
790
+ ... ... 2012-01-31 12:00:00
791
+ ... ... 2012-02-29 12:00:00
792
+ ... ... 2012-03-31 12:00:00
793
+ """
794
+ if "options" in options and isinstance(options.get("options"), dict) and len(options) == 1:
795
+ options = options.get("options")
796
+
797
+ if path is None:
798
+ # If path is none, just collect and use pandas's to_csv.
799
+ return self._to_pandas().to_csv(
800
+ None,
801
+ sep=sep,
802
+ na_rep=na_rep,
803
+ columns=columns,
804
+ header=header,
805
+ quotechar=quotechar,
806
+ date_format=date_format,
807
+ escapechar=escapechar,
808
+ index=False,
809
+ )
810
+
811
+ if isinstance(self, ps.DataFrame):
812
+ psdf = self
813
+ else:
814
+ assert isinstance(self, ps.Series)
815
+ psdf = self.to_frame()
816
+
817
+ if columns is None:
818
+ column_labels = psdf._internal.column_labels
819
+ else:
820
+ column_labels = []
821
+ for col in columns:
822
+ if is_name_like_tuple(col):
823
+ label = cast(Label, col)
824
+ else:
825
+ label = cast(Label, (col,))
826
+ if label not in psdf._internal.column_labels:
827
+ raise KeyError(name_like_string(label))
828
+ column_labels.append(label)
829
+
830
+ if isinstance(index_col, str):
831
+ index_cols = [index_col]
832
+ elif index_col is None:
833
+ index_cols = []
834
+ else:
835
+ index_cols = index_col
836
+
837
+ if header is True and psdf._internal.column_labels_level > 1:
838
+ raise ValueError("to_csv only support one-level index column now")
839
+ elif isinstance(header, list):
840
+ sdf = psdf.to_spark(index_col)
841
+ sdf = sdf.select(
842
+ [scol_for(sdf, name_like_string(label)) for label in index_cols]
843
+ + [
844
+ scol_for(sdf, str(i) if label is None else name_like_string(label)).alias(
845
+ new_name
846
+ )
847
+ for i, (label, new_name) in enumerate(zip(column_labels, header))
848
+ ]
849
+ )
850
+ header = True
851
+ else:
852
+ sdf = psdf.to_spark(index_col)
853
+ sdf = sdf.select(
854
+ [scol_for(sdf, name_like_string(label)) for label in index_cols]
855
+ + [
856
+ scol_for(sdf, str(i) if label is None else name_like_string(label))
857
+ for i, label in enumerate(column_labels)
858
+ ]
859
+ )
860
+
861
+ if num_files is not None:
862
+ warnings.warn(
863
+ "`num_files` has been deprecated and might be removed in a future version. "
864
+ "Use `DataFrame.spark.repartition` instead.",
865
+ FutureWarning,
866
+ )
867
+ sdf = sdf.repartition(num_files)
868
+
869
+ mode = validate_mode(mode)
870
+ builder = sdf.write.mode(mode)
871
+ if partition_cols is not None:
872
+ builder.partitionBy(partition_cols)
873
+ builder._set_opts(
874
+ sep=sep,
875
+ nullValue=na_rep,
876
+ header=header,
877
+ quote=quotechar,
878
+ dateFormat=date_format,
879
+ charToEscapeQuoteEscaping=escapechar,
880
+ )
881
+ builder.options(**options).format("csv").save(path)
882
+ return None
883
+
884
+ def to_json(
885
+ self,
886
+ path: Optional[str] = None,
887
+ compression: str = "uncompressed",
888
+ num_files: Optional[int] = None,
889
+ mode: str = "w",
890
+ orient: str = "records",
891
+ lines: bool = True,
892
+ partition_cols: Optional[Union[str, List[str]]] = None,
893
+ index_col: Optional[Union[str, List[str]]] = None,
894
+ **options: Any,
895
+ ) -> Optional[str]:
896
+ """
897
+ Convert the object to a JSON string.
898
+
899
+ .. note:: pandas-on-Spark `to_json` writes files to a path or URI. Unlike pandas',
900
+ pandas-on-Spark respects HDFS properties such as 'fs.default.name'.
901
+
902
+ .. note:: pandas-on-Spark writes JSON files into the directory, `path`, and writes
903
+ multiple `part-...` files in the directory when `path` is specified.
904
+ This behavior was inherited from Apache Spark. The number of partitions can
905
+ be controlled by `num_files`. This is deprecated.
906
+ Use `DataFrame.spark.repartition` instead.
907
+
908
+ .. note:: output JSON format is different from pandas'. It always uses `orient='records'`
909
+ for its output. This behavior might have to change soon.
910
+
911
+ .. note:: Set `ignoreNullFields` keyword argument to `True` to omit `None` or `NaN` values
912
+ when writing JSON objects. It works only when `path` is provided.
913
+
914
+ Note NaN's and None will be converted to null and datetime objects
915
+ will be converted to UNIX timestamps.
916
+
917
+ Parameters
918
+ ----------
919
+ path: string, optional
920
+ File path. If not specified, the result is returned as
921
+ a string.
922
+ lines: bool, default True
923
+ If ‘orient’ is ‘records’ write out line delimited JSON format.
924
+ Will throw ValueError if incorrect ‘orient’ since others are not
925
+ list-like. It should always be True for now.
926
+ orient: str, default 'records'
927
+ It should always be 'records' for now.
928
+ compression: {'gzip', 'bz2', 'xz', None}
929
+ A string representing the compression to use in the output file,
930
+ only used when the first argument is a filename. By default, the
931
+ compression is inferred from the filename.
932
+ num_files: the number of partitions to be written in `path` directory when
933
+ this is a path. This is deprecated. Use `DataFrame.spark.repartition` instead.
934
+ mode: str
935
+ Python write mode, default 'w'.
936
+
937
+ .. note:: mode can accept the strings for Spark writing mode.
938
+ Such as 'append', 'overwrite', 'ignore', 'error', 'errorifexists'.
939
+
940
+ - 'append' (equivalent to 'a'): Append the new data to existing data.
941
+ - 'overwrite' (equivalent to 'w'): Overwrite existing data.
942
+ - 'ignore': Silently ignore this operation if data already exists.
943
+ - 'error' or 'errorifexists': Throw an exception if data already exists.
944
+
945
+ partition_cols: str or list of str, optional, default None
946
+ Names of partitioning columns
947
+ index_col: str or list of str, optional, default: None
948
+ Column names to be used in Spark to represent pandas-on-Spark's index. The index name
949
+ in pandas-on-Spark is ignored. By default, the index is always lost.
950
+ options: keyword arguments for additional options specific to PySpark.
951
+ It is specific to PySpark's JSON options to pass. Check
952
+ the options in PySpark's API documentation for `spark.write.json(...)`.
953
+ It has a higher priority and overwrites all other options.
954
+ This parameter only works when `path` is specified.
955
+
956
+ Returns
957
+ -------
958
+ str or None
959
+
960
+ Examples
961
+ --------
962
+ >>> df = ps.DataFrame([['a', 'b'], ['c', 'd']],
963
+ ... columns=['col 1', 'col 2'])
964
+ >>> df.to_json()
965
+ '[{"col 1":"a","col 2":"b"},{"col 1":"c","col 2":"d"}]'
966
+
967
+ >>> df['col 1'].to_json()
968
+ '[{"col 1":"a"},{"col 1":"c"}]'
969
+
970
+ >>> df.to_json(path=r'%s/to_json/foo.json' % path, num_files=1)
971
+ >>> ps.read_json(
972
+ ... path=r'%s/to_json/foo.json' % path
973
+ ... ).sort_values(by="col 1")
974
+ col 1 col 2
975
+ 0 a b
976
+ 1 c d
977
+
978
+ >>> df['col 1'].to_json(path=r'%s/to_json/foo.json' % path, num_files=1, index_col="index")
979
+ >>> ps.read_json(
980
+ ... path=r'%s/to_json/foo.json' % path, index_col="index"
981
+ ... ).sort_values(by="col 1") # doctest: +NORMALIZE_WHITESPACE
982
+ col 1
983
+ index
984
+ 0 a
985
+ 1 c
986
+ """
987
+ if "options" in options and isinstance(options.get("options"), dict) and len(options) == 1:
988
+ options = options.get("options")
989
+
990
+ default_options: Dict[str, Any] = {"ignoreNullFields": False}
991
+ options = {**default_options, **options}
992
+
993
+ if not lines:
994
+ raise NotImplementedError("lines=False is not implemented yet.")
995
+
996
+ if orient != "records":
997
+ raise NotImplementedError("orient='records' is supported only for now.")
998
+
999
+ if path is None:
1000
+ # If path is none, just collect and use pandas's to_json.
1001
+ psdf_or_ser = self
1002
+ pdf = psdf_or_ser._to_pandas()
1003
+ if isinstance(self, ps.Series):
1004
+ pdf = pdf.to_frame()
1005
+ # To make the format consistent and readable by `read_json`, convert it to pandas' and
1006
+ # use 'records' orient for now.
1007
+ return pdf.to_json(orient="records")
1008
+
1009
+ if isinstance(self, ps.DataFrame):
1010
+ psdf = self
1011
+ else:
1012
+ assert isinstance(self, ps.Series)
1013
+ psdf = self.to_frame()
1014
+ sdf = psdf.to_spark(index_col=index_col)
1015
+
1016
+ if num_files is not None:
1017
+ warnings.warn(
1018
+ "`num_files` has been deprecated and might be removed in a future version. "
1019
+ "Use `DataFrame.spark.repartition` instead.",
1020
+ FutureWarning,
1021
+ )
1022
+ sdf = sdf.repartition(num_files)
1023
+
1024
+ mode = validate_mode(mode)
1025
+ builder = sdf.write.mode(mode)
1026
+ if partition_cols is not None:
1027
+ builder.partitionBy(partition_cols)
1028
+ builder._set_opts(compression=compression)
1029
+ builder.options(**options).format("json").save(path)
1030
+ return None
1031
+
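`to_json` has two paths: with `path=None` it collects the data to pandas on the driver and serializes with `orient='records'`; with a `path` it hands off to Spark's JSON writer, injecting `ignoreNullFields=False` as a default so nulls survive a later `read_json`. A rough equivalent of the writer branch in plain PySpark; the session, data, compression choice and path are illustrative assumptions:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame([("a", None), ("c", "d")], ["col 1", "col 2"])

(
    sdf.write.mode("overwrite")
    .option("ignoreNullFields", False)   # default injected above so nulls are kept
    .option("compression", "gzip")       # pandas-style `compression` argument
    .format("json")
    .save("/tmp/example_json")           # hypothetical output directory
)
```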
1032
+ def to_excel(
1033
+ self,
1034
+ excel_writer: Union[str, pd.ExcelWriter],
1035
+ sheet_name: str = "Sheet1",
1036
+ na_rep: str = "",
1037
+ float_format: Optional[str] = None,
1038
+ columns: Optional[Union[str, List[str]]] = None,
1039
+ header: bool = True,
1040
+ index: bool = True,
1041
+ index_label: Optional[Union[str, List[str]]] = None,
1042
+ startrow: int = 0,
1043
+ startcol: int = 0,
1044
+ engine: Optional[str] = None,
1045
+ merge_cells: bool = True,
1046
+ encoding: Optional[str] = None,
1047
+ inf_rep: str = "inf",
1048
+ verbose: bool = True,
1049
+ freeze_panes: Optional[Tuple[int, int]] = None,
1050
+ ) -> None:
1051
+ """
1052
+ Write object to an Excel sheet.
1053
+
1054
+ .. note:: This method should only be used if the resulting DataFrame is expected
1055
+ to be small, as all the data is loaded into the driver's memory.
1056
+
1057
+ To write a single object to an Excel .xlsx file it is only necessary to
1058
+ specify a target file name. To write to multiple sheets it is necessary to
1059
+ create an `ExcelWriter` object with a target file name, and specify a sheet
1060
+ in the file to write to.
1061
+
1062
+ Multiple sheets may be written to by specifying unique `sheet_name`.
1063
+ With all data written to the file it is necessary to save the changes.
1064
+ Note that creating an `ExcelWriter` object with a file name that already
1065
+ exists will result in the contents of the existing file being erased.
1066
+
1067
+ Parameters
1068
+ ----------
1069
+ excel_writer: str or ExcelWriter object
1070
+ File path or existing ExcelWriter.
1071
+ sheet_name: str, default 'Sheet1'
1072
+ Name of sheet which will contain DataFrame.
1073
+ na_rep: str, default ''
1074
+ Missing data representation.
1075
+ float_format: str, optional
1076
+ Format string for floating point numbers. For example
1077
+ ``float_format="%%.2f"`` will format 0.1234 to 0.12.
1078
+ columns: sequence or list of str, optional
1079
+ Columns to write.
1080
+ header: bool or list of str, default True
1081
+ Write out the column names. If a list of strings is given it is
1082
+ assumed to be aliases for the column names.
1083
+ index: bool, default True
1084
+ Write row names (index).
1085
+ index_label: str or sequence, optional
1086
+ Column label for index column(s) if desired. If not specified, and
1087
+ `header` and `index` are True, then the index names are used. A
1088
+ sequence should be given if the DataFrame uses MultiIndex.
1089
+ startrow: int, default 0
1090
+ Upper left cell row to dump data frame.
1091
+ startcol: int, default 0
1092
+ Upper left cell column to dump data frame.
1093
+ engine: str, optional
1094
+ Write engine to use, 'openpyxl' or 'xlsxwriter'. You can also set this
1095
+ via the options ``io.excel.xlsx.writer``, ``io.excel.xls.writer``, and
1096
+ ``io.excel.xlsm.writer``.
1097
+ merge_cells: bool, default True
1098
+ Write MultiIndex and Hierarchical Rows as merged cells.
1099
+ encoding: str, optional
1100
+ Encoding of the resulting excel file. Only necessary for xlwt,
1101
+ other writers support unicode natively.
1102
+
1103
+ .. deprecated:: 3.4.0
1104
+
1105
+ inf_rep: str, default 'inf'
1106
+ Representation for infinity (there is no native representation for
1107
+ infinity in Excel).
1108
+ verbose: bool, default True
1109
+ Display more information in the error logs.
1110
+
1111
+ .. deprecated:: 3.4.0
1112
+
1113
+ freeze_panes: tuple of int (length 2), optional
1114
+ Specifies the one-based bottommost row and rightmost column that
1115
+ is to be frozen.
1116
+
1117
+ Notes
1118
+ -----
1119
+ Once a workbook has been saved it is not possible to write further data
1120
+ without rewriting the whole workbook.
1121
+
1122
+ See Also
1123
+ --------
1124
+ read_excel: Read Excel file.
1125
+
1126
+ Examples
1127
+ --------
1128
+ Create, write to, and save a workbook:
1129
+
1130
+ >>> df1 = ps.DataFrame([['a', 'b'], ['c', 'd']],
1131
+ ... index=['row 1', 'row 2'],
1132
+ ... columns=['col 1', 'col 2'])
1133
+ >>> df1.to_excel("output.xlsx") # doctest: +SKIP
1134
+
1135
+ To specify the sheet name:
1136
+
1137
+ >>> df1.to_excel("output.xlsx") # doctest: +SKIP
1138
+ >>> df1.to_excel("output.xlsx",
1139
+ ... sheet_name='Sheet_name_1') # doctest: +SKIP
1140
+
1141
+ If you wish to write to more than one sheet in the workbook, it is
1142
+ necessary to specify an ExcelWriter object:
1143
+
1144
+ >>> with pd.ExcelWriter('output.xlsx') as writer: # doctest: +SKIP
1145
+ ... df1.to_excel(writer, sheet_name='Sheet_name_1')
1146
+ ... df2.to_excel(writer, sheet_name='Sheet_name_2')
1147
+
1148
+ To set the library that is used to write the Excel file,
1149
+ you can pass the `engine` keyword (the default engine is
1150
+ automatically chosen depending on the file extension):
1151
+
1152
+ >>> df1.to_excel('output1.xlsx', engine='xlsxwriter') # doctest: +SKIP
1153
+ """
1154
+ log_advice(
1155
+ "`to_excel` loads all data into the driver's memory. "
1156
+ "It should only be used if the resulting DataFrame is expected to be small."
1157
+ )
1158
+ # Make sure locals() call is at the top of the function so we don't capture local variables.
1159
+ args = locals()
1160
+ psdf = self
1161
+
1162
+ if isinstance(self, ps.DataFrame):
1163
+ f = pd.DataFrame.to_excel
1164
+ elif isinstance(self, ps.Series):
1165
+ f = pd.Series.to_excel
1166
+ else:
1167
+ raise TypeError(
1168
+ "Constructor expects DataFrame or Series; however, " "got [%s]" % (self,)
1169
+ )
1170
+ return validate_arguments_and_invoke_function(
1171
+ psdf._to_internal_pandas(), self.to_excel, f, args
1172
+ )
1173
+
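Unlike the CSV and JSON writers, `to_excel` does not write distributedly: it logs an advice message, collects the whole object into pandas on the driver, and forwards the captured arguments to the matching pandas `to_excel`. A minimal sketch of the same idea using only public API; the file name is an assumption and an Excel engine such as openpyxl is assumed to be installed:

```python
import pyspark.pandas as ps

psdf = ps.DataFrame({"col 1": ["a", "c"], "col 2": ["b", "d"]})

# Equivalent in spirit: materialize on the driver first, then let pandas do the writing.
psdf.to_pandas().to_excel("output.xlsx", sheet_name="Sheet1", index=True)
```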
1174
+ def mean(
1175
+ self, axis: Optional[Axis] = None, skipna: bool = True, numeric_only: bool = None
1176
+ ) -> Union[Scalar, "Series"]:
1177
+ """
1178
+ Return the mean of the values.
1179
+
1180
+ Parameters
1181
+ ----------
1182
+ axis: {index (0), columns (1)}
1183
+ Axis for the function to be applied on.
1184
+ skipna: bool, default True
1185
+ Exclude NA/null values when computing the result.
1186
+
1187
+ .. versionchanged:: 3.4.0
1188
+ Supported including NA/null values.
1189
+ numeric_only: bool, default None
1190
+ Include only float, int, boolean columns. False is not supported. This parameter
1191
+ is mainly for pandas compatibility.
1192
+
1193
+ Returns
1194
+ -------
1195
+ mean: scalar for a Series, and a Series for a DataFrame.
1196
+
1197
+ Examples
1198
+ --------
1199
+
1200
+ >>> df = ps.DataFrame({'a': [1, 2, 3, np.nan], 'b': [0.1, 0.2, 0.3, np.nan]},
1201
+ ... columns=['a', 'b'])
1202
+
1203
+ On a DataFrame:
1204
+
1205
+ >>> df.mean()
1206
+ a 2.0
1207
+ b 0.2
1208
+ dtype: float64
1209
+
1210
+ >>> df.mean(axis=1)
1211
+ 0 0.55
1212
+ 1 1.10
1213
+ 2 1.65
1214
+ 3 NaN
1215
+ dtype: float64
1216
+
1217
+ On a Series:
1218
+
1219
+ >>> df['a'].mean()
1220
+ 2.0
1221
+ """
1222
+ axis = validate_axis(axis)
1223
+
1224
+ if numeric_only is None and axis == 0:
1225
+ numeric_only = True
1226
+
1227
+ def mean(psser: "Series") -> Column:
1228
+ spark_type = psser.spark.data_type
1229
+ spark_column = psser.spark.column
1230
+ if isinstance(spark_type, BooleanType):
1231
+ spark_column = spark_column.cast(LongType())
1232
+ elif not isinstance(spark_type, NumericType):
1233
+ raise TypeError(
1234
+ "Could not convert {} ({}) to numeric".format(
1235
+ spark_type_to_pandas_dtype(spark_type), spark_type.simpleString()
1236
+ )
1237
+ )
1238
+ return F.mean(spark_column)
1239
+
1240
+ return self._reduce_for_stat_function(
1241
+ mean,
1242
+ name="mean",
1243
+ axis=axis,
1244
+ numeric_only=numeric_only,
1245
+ skipna=skipna,
1246
+ )
1247
+
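The per-column `mean` helper above casts boolean columns to `LongType` before calling `F.mean` and rejects non-numeric types. The same cast-then-aggregate pattern in plain PySpark; the session, data and column names are illustrative:

```python
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame([(True, 1.0), (False, 2.0), (True, None)], ["flag", "x"])

sdf.select(
    F.mean(F.col("flag").cast("long")).alias("flag_mean"),  # booleans become 0/1 first
    F.mean("x").alias("x_mean"),                             # nulls are ignored by F.mean
).show()
```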
1248
+ def sum(
1249
+ self,
1250
+ axis: Optional[Axis] = None,
1251
+ skipna: bool = True,
1252
+ numeric_only: bool = None,
1253
+ min_count: int = 0,
1254
+ ) -> Union[Scalar, "Series"]:
1255
+ """
1256
+ Return the sum of the values.
1257
+
1258
+ Parameters
1259
+ ----------
1260
+ axis: {index (0), columns (1)}
1261
+ Axis for the function to be applied on.
1262
+ skipna: bool, default True
1263
+ Exclude NA/null values when computing the result.
1264
+
1265
+ .. versionchanged:: 3.4.0
1266
+ Added *skipna* to exclude NA/null values.
1267
+ numeric_only: bool, default None
1268
+ Include only float, int, boolean columns. False is not supported. This parameter
1269
+ is mainly for pandas compatibility.
1270
+ min_count: int, default 0
1271
+ The required number of valid values to perform the operation. If fewer than
1272
+ ``min_count`` non-NA values are present the result will be NA.
1273
+
1274
+ Returns
1275
+ -------
1276
+ sum: scalar for a Series, and a Series for a DataFrame.
1277
+
1278
+ Examples
1279
+ --------
1280
+
1281
+ >>> df = ps.DataFrame({'a': [1, 2, 3, np.nan], 'b': [0.1, np.nan, 0.3, np.nan]},
1282
+ ... columns=['a', 'b'])
1283
+
1284
+ On a DataFrame:
1285
+
1286
+ >>> df.sum()
1287
+ a 6.0
1288
+ b 0.4
1289
+ dtype: float64
1290
+
1291
+ >>> df.sum(axis=1)
1292
+ 0 1.1
1293
+ 1 2.0
1294
+ 2 3.3
1295
+ 3 0.0
1296
+ dtype: float64
1297
+
1298
+ >>> df.sum(min_count=3)
1299
+ a 6.0
1300
+ b NaN
1301
+ dtype: float64
1302
+
1303
+ >>> df.sum(axis=1, min_count=1)
1304
+ 0 1.1
1305
+ 1 2.0
1306
+ 2 3.3
1307
+ 3 NaN
1308
+ dtype: float64
1309
+
1310
+ On a Series:
1311
+
1312
+ >>> df['a'].sum()
1313
+ 6.0
1314
+
1315
+ >>> df['a'].sum(min_count=3)
1316
+ 6.0
1317
+ >>> df['b'].sum(min_count=3)
1318
+ nan
1319
+ """
1320
+ axis = validate_axis(axis)
1321
+
1322
+ if numeric_only is None and axis == 0:
1323
+ numeric_only = True
1324
+ elif numeric_only is True and axis == 1:
1325
+ numeric_only = None
1326
+
1327
+ def sum(psser: "Series") -> Column:
1328
+ spark_type = psser.spark.data_type
1329
+ spark_column = psser.spark.column
1330
+
1331
+ if isinstance(spark_type, BooleanType):
1332
+ spark_column = spark_column.cast(LongType())
1333
+ elif not isinstance(spark_type, NumericType):
1334
+ raise TypeError(
1335
+ "Could not convert {} ({}) to numeric".format(
1336
+ spark_type_to_pandas_dtype(spark_type), spark_type.simpleString()
1337
+ )
1338
+ )
1339
+ return F.coalesce(F.sum(spark_column), F.lit(0))
1340
+
1341
+ return self._reduce_for_stat_function(
1342
+ sum,
1343
+ name="sum",
1344
+ axis=axis,
1345
+ numeric_only=numeric_only,
1346
+ min_count=min_count,
1347
+ skipna=skipna,
1348
+ )
1349
+
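`sum` wraps the aggregate in `F.coalesce(F.sum(col), F.lit(0))` because Spark's `SUM` over an all-null (or empty) column returns null, while pandas' `sum` returns 0; the coalesce restores the pandas default, and the `min_count` handling then reintroduces NA where requested. A small sketch of the difference; the data is illustrative:

```python
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame([(None,), (None,)], "b double")

sdf.select(
    F.sum("b").alias("raw_sum"),                           # null: Spark's SQL semantics
    F.coalesce(F.sum("b"), F.lit(0)).alias("pandas_sum"),   # 0: pandas-like default
).show()
```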
1350
+ def product(
1351
+ self,
1352
+ axis: Optional[Axis] = None,
1353
+ skipna: bool = True,
1354
+ numeric_only: bool = None,
1355
+ min_count: int = 0,
1356
+ ) -> Union[Scalar, "Series"]:
1357
+ """
1358
+ Return the product of the values.
1359
+
1360
+ .. note:: unlike pandas', pandas-on-Spark emulates product with the ``exp(sum(log(...)))``
1361
+ trick. Therefore, it only works for positive numbers.
1362
+
1363
+ Parameters
1364
+ ----------
1365
+ axis: {index (0), columns (1)}
1366
+ Axis for the function to be applied on.
1367
+ skipna: bool, default True
1368
+ Exclude NA/null values when computing the result.
1369
+
1370
+ .. versionchanged:: 3.4.0
1371
+ Supported including NA/null values.
1372
+ numeric_only: bool, default None
1373
+ Include only float, int, boolean columns. False is not supported. This parameter
1374
+ is mainly for pandas compatibility.
1375
+ min_count: int, default 0
1376
+ The required number of valid values to perform the operation. If fewer than
1377
+ ``min_count`` non-NA values are present the result will be NA.
1378
+
1379
+ Examples
1380
+ --------
1381
+ On a DataFrame:
1382
+
1383
+ Non-numeric type columns are not included in the result.
1384
+
1385
+ >>> psdf = ps.DataFrame({'A': [1, 2, 3, 4, 5],
1386
+ ... 'B': [10, 20, 30, 40, 50],
1387
+ ... 'C': ['a', 'b', 'c', 'd', 'e']})
1388
+ >>> psdf
1389
+ A B C
1390
+ 0 1 10 a
1391
+ 1 2 20 b
1392
+ 2 3 30 c
1393
+ 3 4 40 d
1394
+ 4 5 50 e
1395
+
1396
+ >>> psdf.prod()
1397
+ A 120
1398
+ B 12000000
1399
+ dtype: int64
1400
+
1401
+ If there is no numeric type columns, returns empty Series.
1402
+
1403
+ >>> ps.DataFrame({"key": ['a', 'b', 'c'], "val": ['x', 'y', 'z']}).prod() # doctest: +SKIP
1404
+ Series([], dtype: float64)
1405
+
1406
+ On a Series:
1407
+
1408
+ >>> ps.Series([1, 2, 3, 4, 5]).prod()
1409
+ 120
1410
+
1411
+ By default, the product of an empty or all-NA Series is ``1``
1412
+
1413
+ >>> ps.Series([]).prod() # doctest: +SKIP
1414
+ 1.0
1415
+
1416
+ This can be controlled with the ``min_count`` parameter
1417
+
1418
+ >>> ps.Series([]).prod(min_count=1) # doctest: +SKIP
1419
+ nan
1420
+ """
1421
+ axis = validate_axis(axis)
1422
+ warnings.warn(
1423
+ "Default value of `numeric_only` will be changed to `False` "
1424
+ "instead of `None` in 4.0.0.",
1425
+ FutureWarning,
1426
+ )
1427
+
1428
+ if numeric_only is None and axis == 0:
1429
+ numeric_only = True
1430
+ elif numeric_only is True and axis == 1:
1431
+ numeric_only = None
1432
+
1433
+ def prod(psser: "Series") -> Column:
1434
+ spark_type = psser.spark.data_type
1435
+ spark_column = psser.spark.column
1436
+ if isinstance(spark_type, BooleanType):
1437
+ spark_column = spark_column.cast(LongType())
1438
+ elif not isinstance(spark_type, NumericType):
1439
+ raise TypeError(
1440
+ "Could not convert {} ({}) to numeric".format(
1441
+ spark_type_to_pandas_dtype(spark_type), spark_type.simpleString()
1442
+ )
1443
+ )
1444
+
1445
+ return SF.product(spark_column, skipna)
1446
+
1447
+ return self._reduce_for_stat_function(
1448
+ prod,
1449
+ name="prod",
1450
+ axis=axis,
1451
+ numeric_only=numeric_only,
1452
+ min_count=min_count,
1453
+ skipna=skipna,
1454
+ )
1455
+
1456
+ prod = product
1457
+
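As the note above says, `product`/`prod` is emulated with the `exp(sum(log(...)))` identity rather than a native product aggregate, which is why it is only reliable for positive values; the bundled `SF.product` helper additionally handles `skipna`. A minimal sketch of the bare identity in plain PySpark, which deliberately ignores zeros, negatives and nulls; the session and data are illustrative:

```python
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame([(1.0,), (2.0,), (3.0,), (4.0,), (5.0,)], ["x"])

# prod(x) == exp(sum(log(x))) for strictly positive x; expect 120.0 here.
sdf.select(F.exp(F.sum(F.log("x"))).alias("prod_x")).show()
```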
1458
+ def skew(
1459
+ self, axis: Optional[Axis] = None, skipna: bool = True, numeric_only: bool = None
1460
+ ) -> Union[Scalar, "Series"]:
1461
+ """
1462
+ Return unbiased skew normalized by N-1.
1463
+
1464
+ Parameters
1465
+ ----------
1466
+ axis: {index (0), columns (1)}
1467
+ Axis for the function to be applied on.
1468
+ skipna: bool, default True
1469
+ Exclude NA/null values when computing the result.
1470
+
1471
+ .. versionchanged:: 3.4.0
1472
+ Supported including NA/null values.
1473
+ numeric_only: bool, default None
1474
+ Include only float, int, boolean columns. False is not supported. This parameter
1475
+ is mainly for pandas compatibility.
1476
+
1477
+ Returns
1478
+ -------
1479
+ skew: scalar for a Series, and a Series for a DataFrame.
1480
+
1481
+ Examples
1482
+ --------
1483
+
1484
+ >>> df = ps.DataFrame({'a': [1, 2, 3, np.nan], 'b': [0.1, 0.2, 0.3, np.nan]},
1485
+ ... columns=['a', 'b'])
1486
+
1487
+ On a DataFrame:
1488
+
1489
+ >>> df.skew()
1490
+ a 0.0
1491
+ b 0.0
1492
+ dtype: float64
1493
+
1494
+ On a Series:
1495
+
1496
+ >>> df['a'].skew()
1497
+ 0.0
1498
+ """
1499
+ axis = validate_axis(axis)
1500
+
1501
+ if numeric_only is None and axis == 0:
1502
+ numeric_only = True
1503
+
1504
+ def skew(psser: "Series") -> Column:
1505
+ spark_type = psser.spark.data_type
1506
+ spark_column = psser.spark.column
1507
+ if isinstance(spark_type, BooleanType):
1508
+ spark_column = spark_column.cast(LongType())
1509
+ elif not isinstance(spark_type, NumericType):
1510
+ raise TypeError(
1511
+ "Could not convert {} ({}) to numeric".format(
1512
+ spark_type_to_pandas_dtype(spark_type), spark_type.simpleString()
1513
+ )
1514
+ )
1515
+
1516
+ return SF.skew(spark_column)
1517
+
1518
+ return self._reduce_for_stat_function(
1519
+ skew,
1520
+ name="skew",
1521
+ axis=axis,
1522
+ numeric_only=numeric_only,
1523
+ skipna=skipna,
1524
+ )
1525
+
1526
+ def kurtosis(
1527
+ self, axis: Optional[Axis] = None, skipna: bool = True, numeric_only: bool = None
1528
+ ) -> Union[Scalar, "Series"]:
1529
+ """
1530
+ Return unbiased kurtosis using Fisher’s definition of kurtosis (kurtosis of normal == 0.0).
1531
+ Normalized by N-1.
1532
+
1533
+ Parameters
1534
+ ----------
1535
+ axis: {index (0), columns (1)}
1536
+ Axis for the function to be applied on.
1537
+ skipna: bool, default True
1538
+ Exclude NA/null values when computing the result.
1539
+
1540
+ .. versionchanged:: 3.4.0
1541
+ Supported including NA/null values.
1542
+ numeric_only: bool, default None
1543
+ Include only float, int, boolean columns. False is not supported. This parameter
1544
+ is mainly for pandas compatibility.
1545
+
1546
+ Returns
1547
+ -------
1548
+ kurt: scalar for a Series, and a Series for a DataFrame.
1549
+
1550
+ Examples
1551
+ --------
1552
+
1553
+ >>> df = ps.DataFrame({'a': [1, 2, 3, np.nan, 6], 'b': [0.1, 0.2, 0.3, np.nan, 0.8]},
1554
+ ... columns=['a', 'b'])
1555
+
1556
+ On a DataFrame:
1557
+
1558
+ >>> df.kurtosis()
1559
+ a 1.500000
1560
+ b 2.703924
1561
+ dtype: float64
1562
+
1563
+ On a Series:
1564
+
1565
+ >>> df['a'].kurtosis()
1566
+ 1.5
1567
+ """
1568
+ axis = validate_axis(axis)
1569
+
1570
+ if numeric_only is None and axis == 0:
1571
+ numeric_only = True
1572
+
1573
+ def kurtosis(psser: "Series") -> Column:
1574
+ spark_type = psser.spark.data_type
1575
+ spark_column = psser.spark.column
1576
+ if isinstance(spark_type, BooleanType):
1577
+ spark_column = spark_column.cast(LongType())
1578
+ elif not isinstance(spark_type, NumericType):
1579
+ raise TypeError(
1580
+ "Could not convert {} ({}) to numeric".format(
1581
+ spark_type_to_pandas_dtype(spark_type), spark_type.simpleString()
1582
+ )
1583
+ )
1584
+
1585
+ return SF.kurt(spark_column)
1586
+
1587
+ return self._reduce_for_stat_function(
1588
+ kurtosis,
1589
+ name="kurtosis",
1590
+ axis=axis,
1591
+ numeric_only=numeric_only,
1592
+ skipna=skipna,
1593
+ )
1594
+
1595
+ kurt = kurtosis
1596
+
1597
+ def min(
1598
+ self, axis: Optional[Axis] = None, skipna: bool = True, numeric_only: bool = None
1599
+ ) -> Union[Scalar, "Series"]:
1600
+ """
1601
+ Return the minimum of the values.
1602
+
1603
+ Parameters
1604
+ ----------
1605
+ axis: {index (0), columns (1)}
1606
+ Axis for the function to be applied on.
1607
+ skipna: bool, default True
1608
+ Exclude NA/null values when computing the result.
1609
+
1610
+ .. versionchanged:: 3.4.0
1611
+ Supported including NA/null values.
1612
+ numeric_only: bool, default None
1613
+ If True, include only float, int, boolean columns. This parameter is mainly for
1614
+ pandas compatibility. False is supported; however, the columns should
1615
+ be all numeric or all non-numeric.
1616
+
1617
+ Returns
1618
+ -------
1619
+ min: scalar for a Series, and a Series for a DataFrame.
1620
+
1621
+ Examples
1622
+ --------
1623
+
1624
+ >>> df = ps.DataFrame({'a': [1, 2, 3, np.nan], 'b': [0.1, 0.2, 0.3, np.nan]},
1625
+ ... columns=['a', 'b'])
1626
+
1627
+ On a DataFrame:
1628
+
1629
+ >>> df.min()
1630
+ a 1.0
1631
+ b 0.1
1632
+ dtype: float64
1633
+
1634
+ >>> df.min(axis=1)
1635
+ 0 0.1
1636
+ 1 0.2
1637
+ 2 0.3
1638
+ 3 NaN
1639
+ dtype: float64
1640
+
1641
+ On a Series:
1642
+
1643
+ >>> df['a'].min()
1644
+ 1.0
1645
+ """
1646
+ axis = validate_axis(axis)
1647
+
1648
+ if numeric_only is None and axis == 0:
1649
+ numeric_only = True
1650
+ elif numeric_only is True and axis == 1:
1651
+ numeric_only = None
1652
+
1653
+ return self._reduce_for_stat_function(
1654
+ lambda psser: F.min(psser.spark.column),
1655
+ name="min",
1656
+ axis=axis,
1657
+ numeric_only=numeric_only,
1658
+ skipna=skipna,
1659
+ )
1660
+
1661
+ def max(
1662
+ self, axis: Optional[Axis] = None, skipna: bool = True, numeric_only: bool = None
1663
+ ) -> Union[Scalar, "Series"]:
1664
+ """
1665
+ Return the maximum of the values.
1666
+
1667
+ Parameters
1668
+ ----------
1669
+ axis: {index (0), columns (1)}
1670
+ Axis for the function to be applied on.
1671
+ skipna: bool, default True
1672
+ Exclude NA/null values when computing the result.
1673
+
1674
+ .. versionchanged:: 3.4.0
1675
+ Supported including NA/null values.
1676
+ numeric_only: bool, default None
1677
+ If True, include only float, int, boolean columns. This parameter is mainly for
1678
+ pandas compatibility. False is supported; however, the columns should
1679
+ be all numeric or all non-numeric.
1680
+
1681
+ Returns
1682
+ -------
1683
+ max: scalar for a Series, and a Series for a DataFrame.
1684
+
1685
+ Examples
1686
+ --------
1687
+
1688
+ >>> df = ps.DataFrame({'a': [1, 2, 3, np.nan], 'b': [0.1, 0.2, 0.3, np.nan]},
1689
+ ... columns=['a', 'b'])
1690
+
1691
+ On a DataFrame:
1692
+
1693
+ >>> df.max()
1694
+ a 3.0
1695
+ b 0.3
1696
+ dtype: float64
1697
+
1698
+ >>> df.max(axis=1)
1699
+ 0 1.0
1700
+ 1 2.0
1701
+ 2 3.0
1702
+ 3 NaN
1703
+ dtype: float64
1704
+
1705
+ On a Series:
1706
+
1707
+ >>> df['a'].max()
1708
+ 3.0
1709
+ """
1710
+ axis = validate_axis(axis)
1711
+
1712
+ if numeric_only is None and axis == 0:
1713
+ numeric_only = True
1714
+ elif numeric_only is True and axis == 1:
1715
+ numeric_only = None
1716
+
1717
+ return self._reduce_for_stat_function(
1718
+ lambda psser: F.max(psser.spark.column),
1719
+ name="max",
1720
+ axis=axis,
1721
+ numeric_only=numeric_only,
1722
+ skipna=skipna,
1723
+ )
1724
+
1725
+ def count(
1726
+ self, axis: Optional[Axis] = None, numeric_only: bool = False
1727
+ ) -> Union[Scalar, "Series"]:
1728
+ """
1729
+ Count non-NA cells for each column.
1730
+
1731
+ The values `None`, `NaN` are considered NA.
1732
+
1733
+ Parameters
1734
+ ----------
1735
+ axis: {0 or ‘index’, 1 or ‘columns’}, default 0
1736
+ If 0 or ‘index’ counts are generated for each column. If 1 or ‘columns’ counts are
1737
+ generated for each row.
1738
+ numeric_only: bool, default False
1739
+ If True, include only float, int, boolean columns. This parameter is mainly for
1740
+ pandas compatibility.
1741
+
1742
+ Returns
1743
+ -------
1744
+ count: scalar for a Series, and a Series for a DataFrame.
1745
+
1746
+ See Also
1747
+ --------
1748
+ DataFrame.shape: Number of DataFrame rows and columns (including NA
1749
+ elements).
1750
+ DataFrame.isna: Boolean same-sized DataFrame showing places of NA
1751
+ elements.
1752
+
1753
+ Examples
1754
+ --------
1755
+ Constructing DataFrame from a dictionary:
1756
+
1757
+ >>> df = ps.DataFrame({"Person":
1758
+ ... ["John", "Myla", "Lewis", "John", "Myla"],
1759
+ ... "Age": [24., np.nan, 21., 33, 26],
1760
+ ... "Single": [False, True, True, True, False]},
1761
+ ... columns=["Person", "Age", "Single"])
1762
+ >>> df
1763
+ Person Age Single
1764
+ 0 John 24.0 False
1765
+ 1 Myla NaN True
1766
+ 2 Lewis 21.0 True
1767
+ 3 John 33.0 True
1768
+ 4 Myla 26.0 False
1769
+
1770
+ Notice the uncounted NA values:
1771
+
1772
+ >>> df.count()
1773
+ Person 5
1774
+ Age 4
1775
+ Single 5
1776
+ dtype: int64
1777
+
1778
+ >>> df.count(axis=1)
1779
+ 0 3
1780
+ 1 2
1781
+ 2 3
1782
+ 3 3
1783
+ 4 3
1784
+ dtype: int64
1785
+
1786
+ On a Series:
1787
+
1788
+ >>> df['Person'].count()
1789
+ 5
1790
+
1791
+ >>> df['Age'].count()
1792
+ 4
1793
+ """
1794
+
1795
+ return self._reduce_for_stat_function(
1796
+ Frame._count_expr, name="count", axis=axis, numeric_only=numeric_only
1797
+ )
1798
+
1799
+ def std(
1800
+ self,
1801
+ axis: Optional[Axis] = None,
1802
+ skipna: bool = True,
1803
+ ddof: int = 1,
1804
+ numeric_only: bool = None,
1805
+ ) -> Union[Scalar, "Series"]:
1806
+ """
1807
+ Return sample standard deviation.
1808
+
1809
+ .. versionadded:: 3.3.0
1810
+
1811
+ Parameters
1812
+ ----------
1813
+ axis: {index (0), columns (1)}
1814
+ Axis for the function to be applied on.
1815
+ skipna: bool, default True
1816
+ Exclude NA/null values when computing the result.
1817
+
1818
+ .. versionchanged:: 3.4.0
1819
+ Supported including NA/null values.
1820
+ ddof: int, default 1
1821
+ Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
1822
+ where N represents the number of elements.
1823
+
1824
+ .. versionchanged:: 3.4.0
1825
+ Supported including arbitrary integers.
1826
+ numeric_only: bool, default None
1827
+ Include only float, int, boolean columns. False is not supported. This parameter
1828
+ is mainly for pandas compatibility.
1829
+
1830
+ Returns
1831
+ -------
1832
+ std: scalar for a Series, and a Series for a DataFrame.
1833
+
1834
+ Examples
1835
+ --------
1836
+
1837
+ >>> df = ps.DataFrame({'a': [1, 2, 3, np.nan], 'b': [0.1, 0.2, 0.3, np.nan]},
1838
+ ... columns=['a', 'b'])
1839
+
1840
+ On a DataFrame:
1841
+
1842
+ >>> df.std()
1843
+ a 1.0
1844
+ b 0.1
1845
+ dtype: float64
1846
+
1847
+ >>> df.std(ddof=2)
1848
+ a 1.414214
1849
+ b 0.141421
1850
+ dtype: float64
1851
+
1852
+ >>> df.std(axis=1)
1853
+ 0 0.636396
1854
+ 1 1.272792
1855
+ 2 1.909188
1856
+ 3 NaN
1857
+ dtype: float64
1858
+
1859
+ >>> df.std(ddof=0)
1860
+ a 0.816497
1861
+ b 0.081650
1862
+ dtype: float64
1863
+
1864
+ On a Series:
1865
+
1866
+ >>> df['a'].std()
1867
+ 1.0
1868
+
1869
+ >>> df['a'].std(ddof=0)
1870
+ 0.816496580927726
1871
+
1872
+ >>> df['a'].std(ddof=-1)
1873
+ 0.707106...
1874
+ """
1875
+ if not isinstance(ddof, int):
1876
+ raise TypeError("ddof must be integer")
1877
+
1878
+ axis = validate_axis(axis)
1879
+
1880
+ if numeric_only is None and axis == 0:
1881
+ numeric_only = True
1882
+
1883
+ def std(psser: "Series") -> Column:
1884
+ spark_type = psser.spark.data_type
1885
+ spark_column = psser.spark.column
1886
+ if isinstance(spark_type, BooleanType):
1887
+ spark_column = spark_column.cast(LongType())
1888
+ elif not isinstance(spark_type, NumericType):
1889
+ raise TypeError(
1890
+ "Could not convert {} ({}) to numeric".format(
1891
+ spark_type_to_pandas_dtype(spark_type), spark_type.simpleString()
1892
+ )
1893
+ )
1894
+ return SF.stddev(spark_column, ddof)
1895
+
1896
+ return self._reduce_for_stat_function(
1897
+ std, name="std", axis=axis, numeric_only=numeric_only, ddof=ddof, skipna=skipna
1898
+ )
1899
+
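`SF.stddev(col, ddof)` divides the summed squared deviations by `N - ddof`, which is what makes the `ddof=0` and `ddof=1` doctest values above differ. A quick NumPy check of those numbers for column `a = [1, 2, 3]` (NaN dropped), included purely as an illustration:

```python
import numpy as np

x = np.array([1.0, 2.0, 3.0])          # column 'a' from the example, NaN excluded
ss = ((x - x.mean()) ** 2).sum()       # sum of squared deviations = 2.0

print(np.sqrt(ss / (len(x) - 1)))      # ddof=1 -> 1.0 (the df.std() value)
print(np.sqrt(ss / (len(x) - 0)))      # ddof=0 -> ~0.816497
```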
1900
+ def var(
1901
+ self, axis: Optional[Axis] = None, ddof: int = 1, numeric_only: bool = None
1902
+ ) -> Union[Scalar, "Series"]:
1903
+ """
1904
+ Return unbiased variance.
1905
+
1906
+ .. versionadded:: 3.3.0
1907
+
1908
+ Parameters
1909
+ ----------
1910
+ axis: {index (0), columns (1)}
1911
+ Axis for the function to be applied on.
1912
+ ddof: int, default 1
1913
+ Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
1914
+ where N represents the number of elements.
1915
+
1916
+ .. versionchanged:: 3.4.0
1917
+ Supported including arbitrary integers.
1918
+ numeric_only: bool, default None
1919
+ Include only float, int, boolean columns. False is not supported. This parameter
1920
+ is mainly for pandas compatibility.
1921
+
1922
+ Returns
1923
+ -------
1924
+ var: scalar for a Series, and a Series for a DataFrame.
1925
+
1926
+ Examples
1927
+ --------
1928
+
1929
+ >>> df = ps.DataFrame({'a': [1, 2, 3, np.nan], 'b': [0.1, 0.2, 0.3, np.nan]},
1930
+ ... columns=['a', 'b'])
1931
+
1932
+ On a DataFrame:
1933
+
1934
+ >>> df.var()
1935
+ a 1.00
1936
+ b 0.01
1937
+ dtype: float64
1938
+
1939
+ >>> df.var(ddof=2)
1940
+ a 2.00
1941
+ b 0.02
1942
+ dtype: float64
1943
+
1944
+ >>> df.var(axis=1)
1945
+ 0 0.405
1946
+ 1 1.620
1947
+ 2 3.645
1948
+ 3 NaN
1949
+ dtype: float64
1950
+
1951
+ >>> df.var(ddof=0)
1952
+ a 0.666667
1953
+ b 0.006667
1954
+ dtype: float64
1955
+
1956
+ On a Series:
1957
+
1958
+ >>> df['a'].var()
1959
+ 1.0
1960
+
1961
+ >>> df['a'].var(ddof=0)
1962
+ 0.6666666666666666
1963
+
1964
+ >>> df['a'].var(ddof=-2)
1965
+ 0.4
1966
+ """
1967
+ if not isinstance(ddof, int):
1968
+ raise TypeError("ddof must be integer")
1969
+
1970
+ axis = validate_axis(axis)
1971
+
1972
+ if numeric_only is None and axis == 0:
1973
+ numeric_only = True
1974
+
1975
+ def var(psser: "Series") -> Column:
1976
+ spark_type = psser.spark.data_type
1977
+ spark_column = psser.spark.column
1978
+ if isinstance(spark_type, BooleanType):
1979
+ spark_column = spark_column.cast(LongType())
1980
+ elif not isinstance(spark_type, NumericType):
1981
+ raise TypeError(
1982
+ "Could not convert {} ({}) to numeric".format(
1983
+ spark_type_to_pandas_dtype(spark_type), spark_type.simpleString()
1984
+ )
1985
+ )
1986
+ return SF.var(spark_column, ddof)
1987
+
1988
+ return self._reduce_for_stat_function(
1989
+ var, name="var", axis=axis, numeric_only=numeric_only, ddof=ddof
1990
+ )
1991
+
1992
+ def median(
1993
+ self,
1994
+ axis: Optional[Axis] = None,
1995
+ skipna: bool = True,
1996
+ numeric_only: bool = None,
1997
+ accuracy: int = 10000,
1998
+ ) -> Union[Scalar, "Series"]:
1999
+ """
2000
+ Return the median of the values for the requested axis.
2001
+
2002
+ .. note:: Unlike pandas', the median in pandas-on-Spark is an approximated median based upon
2003
+ approximate percentile computation because computing median across a large dataset
2004
+ is extremely expensive.
2005
+
2006
+ Parameters
2007
+ ----------
2008
+ axis: {index (0), columns (1)}
2009
+ Axis for the function to be applied on.
2010
+ skipna: bool, default True
2011
+ Exclude NA/null values when computing the result.
2012
+
2013
+ .. versionchanged:: 3.4.0
2014
+ Supported including NA/null values.
2015
+ numeric_only: bool, default None
2016
+ Include only float, int, boolean columns. False is not supported. This parameter
2017
+ is mainly for pandas compatibility.
2018
+ accuracy: int, optional
2019
+ Default accuracy of approximation. Larger value means better accuracy.
2020
+ The relative error can be deduced by 1.0 / accuracy.
2021
+
2022
+ Returns
2023
+ -------
2024
+ median: scalar or Series
2025
+
2026
+ Examples
2027
+ --------
2028
+ >>> df = ps.DataFrame({
2029
+ ... 'a': [24., 21., 25., 33., 26.], 'b': [1, 2, 3, 4, 5]}, columns=['a', 'b'])
2030
+ >>> df
2031
+ a b
2032
+ 0 24.0 1
2033
+ 1 21.0 2
2034
+ 2 25.0 3
2035
+ 3 33.0 4
2036
+ 4 26.0 5
2037
+
2038
+ On a DataFrame:
2039
+
2040
+ >>> df.median()
2041
+ a 25.0
2042
+ b 3.0
2043
+ dtype: float64
2044
+
2045
+ On a Series:
2046
+
2047
+ >>> df['a'].median()
2048
+ 25.0
2049
+ >>> (df['b'] + 100).median()
2050
+ 103.0
2051
+
2052
+ For multi-index columns,
2053
+
2054
+ >>> df.columns = pd.MultiIndex.from_tuples([('x', 'a'), ('y', 'b')])
2055
+ >>> df
2056
+ x y
2057
+ a b
2058
+ 0 24.0 1
2059
+ 1 21.0 2
2060
+ 2 25.0 3
2061
+ 3 33.0 4
2062
+ 4 26.0 5
2063
+
2064
+ On a DataFrame:
2065
+
2066
+ >>> df.median()
2067
+ x a 25.0
2068
+ y b 3.0
2069
+ dtype: float64
2070
+
2071
+ >>> df.median(axis=1)
2072
+ 0 12.5
2073
+ 1 11.5
2074
+ 2 14.0
2075
+ 3 18.5
2076
+ 4 15.5
2077
+ dtype: float64
2078
+
2079
+ On a Series:
2080
+
2081
+ >>> df[('x', 'a')].median()
2082
+ 25.0
2083
+ >>> (df[('y', 'b')] + 100).median()
2084
+ 103.0
2085
+ """
2086
+ axis = validate_axis(axis)
2087
+
2088
+ if numeric_only is None and axis == 0:
2089
+ numeric_only = True
2090
+
2091
+ if not isinstance(accuracy, int):
2092
+ raise TypeError(
2093
+ "accuracy must be an integer; however, got [%s]" % type(accuracy).__name__
2094
+ )
2095
+
2096
+ def median(psser: "Series") -> Column:
2097
+ spark_type = psser.spark.data_type
2098
+ spark_column = psser.spark.column
2099
+ if isinstance(spark_type, (BooleanType, NumericType)):
2100
+ return F.percentile_approx(spark_column.cast(DoubleType()), 0.5, accuracy)
2101
+ else:
2102
+ raise TypeError(
2103
+ "Could not convert {} ({}) to numeric".format(
2104
+ spark_type_to_pandas_dtype(spark_type), spark_type.simpleString()
2105
+ )
2106
+ )
2107
+
2108
+ return self._reduce_for_stat_function(
2109
+ median,
2110
+ name="median",
2111
+ numeric_only=numeric_only,
2112
+ axis=axis,
2113
+ skipna=skipna,
2114
+ )
2115
+
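The median is computed with Spark's `percentile_approx` at probability 0.5; `accuracy` controls the approximation, with a relative error of roughly `1.0 / accuracy`, which is why the docstring describes the result as an approximated median. The same call in plain PySpark; the session, data and accuracy value are illustrative:

```python
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame([(24.0,), (21.0,), (25.0,), (33.0,), (26.0,)], ["a"])

# 0.5 quantile with accuracy=10000, i.e. a relative error of about 1/10000.
sdf.select(
    F.percentile_approx(F.col("a").cast("double"), 0.5, 10000).alias("median_a")
).show()
```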
2116
+ def sem(
2117
+ self,
2118
+ axis: Optional[Axis] = None,
2119
+ skipna: bool = True,
2120
+ ddof: int = 1,
2121
+ numeric_only: bool = None,
2122
+ ) -> Union[Scalar, "Series"]:
2123
+ """
2124
+ Return unbiased standard error of the mean over requested axis.
2125
+
2126
+ .. versionadded:: 3.3.0
2127
+
2128
+ Parameters
2129
+ ----------
2130
+ axis: {index (0), columns (1)}
2131
+ Axis for the function to be applied on.
2132
+ skipna: bool, default True
2133
+ Exclude NA/null values when computing the result.
2134
+
2135
+ .. versionchanged:: 3.4.0
2136
+ Supported including NA/null values.
2137
+ ddof: int, default 1
2138
+ Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
2139
+ where N represents the number of elements.
2140
+
2141
+ .. versionchanged:: 3.4.0
2142
+ Supported including arbitrary integers.
2143
+ numeric_only: bool, default None
2144
+ Include only float, int, boolean columns. False is not supported. This parameter
2145
+ is mainly for pandas compatibility.
2146
+
2147
+ Returns
2148
+ -------
2149
+ scalar (for Series) or Series (for DataFrame)
2150
+
2151
+ Examples
2152
+ --------
2153
+ >>> psdf = ps.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
2154
+ >>> psdf
2155
+ a b
2156
+ 0 1 4
2157
+ 1 2 5
2158
+ 2 3 6
2159
+
2160
+ >>> psdf.sem()
2161
+ a 0.57735
2162
+ b 0.57735
2163
+ dtype: float64
2164
+
2165
+ >>> psdf.sem(ddof=0)
2166
+ a 0.471405
2167
+ b 0.471405
2168
+ dtype: float64
2169
+
2170
+ >>> psdf.sem(ddof=2)
2171
+ a 0.816497
2172
+ b 0.816497
2173
+ dtype: float64
2174
+
2175
+ >>> psdf.sem(axis=1)
2176
+ 0 1.5
2177
+ 1 1.5
2178
+ 2 1.5
2179
+ dtype: float64
2180
+
2181
+ Support for Series
2182
+
2183
+ >>> psser = psdf.a
2184
+ >>> psser
2185
+ 0 1
2186
+ 1 2
2187
+ 2 3
2188
+ Name: a, dtype: int64
2189
+
2190
+ >>> psser.sem()
2191
+ 0.5773502691896258
2192
+
2193
+ >>> psser.sem(ddof=0)
2194
+ 0.47140452079103173
2195
+ """
2196
+ if not isinstance(ddof, int):
2197
+ raise TypeError("ddof must be integer")
2198
+
2199
+ axis = validate_axis(axis)
2200
+
2201
+ if numeric_only is None and axis == 0:
2202
+ numeric_only = True
2203
+
2204
+ def std(psser: "Series") -> Column:
2205
+ spark_type = psser.spark.data_type
2206
+ spark_column = psser.spark.column
2207
+ if isinstance(spark_type, BooleanType):
2208
+ spark_column = spark_column.cast(LongType())
2209
+ elif not isinstance(spark_type, NumericType):
2210
+ raise TypeError(
2211
+ "Could not convert {} ({}) to numeric".format(
2212
+ spark_type_to_pandas_dtype(spark_type), spark_type.simpleString()
2213
+ )
2214
+ )
2215
+ return SF.stddev(spark_column, ddof)
2216
+
2217
+ def sem(psser: "Series") -> Column:
2218
+ return std(psser) / F.sqrt(Frame._count_expr(psser))
2219
+
2220
+ return self._reduce_for_stat_function(
2221
+ sem,
2222
+ name="sem",
2223
+ numeric_only=numeric_only,
2224
+ axis=axis,
2225
+ ddof=ddof,
2226
+ skipna=skipna,
2227
+ )
2228
+
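The inner `sem` helper is simply the `ddof`-adjusted standard deviation divided by the square root of the non-null count, so the doctest value for column `a = [1, 2, 3]` can be reproduced by hand. A small check, not part of the library:

```python
import math

x = [1.0, 2.0, 3.0]
n = len(x)
mean = sum(x) / n
std_ddof1 = math.sqrt(sum((v - mean) ** 2 for v in x) / (n - 1))  # 1.0

print(std_ddof1 / math.sqrt(n))   # ~0.5773502691896258, matching psdf.a.sem()
```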
2229
+ @property
2230
+ def size(self) -> int:
2231
+ """
2232
+ Return an int representing the number of elements in this object.
2233
+
2234
+ Return the number of rows if Series. Otherwise return the number of
2235
+ rows times number of columns if DataFrame.
2236
+
2237
+ Examples
2238
+ --------
2239
+ >>> s = ps.Series({'a': 1, 'b': 2, 'c': None})
2240
+ >>> s.size
2241
+ 3
2242
+
2243
+ >>> df = ps.DataFrame({'col1': [1, 2, None], 'col2': [3, 4, None]})
2244
+ >>> df.size
2245
+ 6
2246
+
2247
+ >>> df = ps.DataFrame(index=[1, 2, None])
2248
+ >>> df.size
2249
+ 0
2250
+ """
2251
+ num_columns = len(self._internal.data_spark_columns)
2252
+ if num_columns == 0:
2253
+ return 0
2254
+ else:
2255
+ return len(self) * num_columns # type: ignore[arg-type]
2256
+
2257
+ def abs(self: FrameLike) -> FrameLike:
2258
+ """
2259
+ Return a Series/DataFrame with absolute numeric value of each element.
2260
+
2261
+ Returns
2262
+ -------
2263
+ abs: Series/DataFrame containing the absolute value of each element.
2264
+
2265
+ Examples
2266
+ --------
2267
+
2268
+ Absolute numeric values in a Series.
2269
+
2270
+ >>> s = ps.Series([-1.10, 2, -3.33, 4])
2271
+ >>> s.abs()
2272
+ 0 1.10
2273
+ 1 2.00
2274
+ 2 3.33
2275
+ 3 4.00
2276
+ dtype: float64
2277
+
2278
+ Absolute numeric values in a DataFrame.
2279
+
2280
+ >>> df = ps.DataFrame({
2281
+ ... 'a': [4, 5, 6, 7],
2282
+ ... 'b': [10, 20, 30, 40],
2283
+ ... 'c': [100, 50, -30, -50]
2284
+ ... },
2285
+ ... columns=['a', 'b', 'c'])
2286
+ >>> df.abs()
2287
+ a b c
2288
+ 0 4 10 100
2289
+ 1 5 20 50
2290
+ 2 6 30 30
2291
+ 3 7 40 50
2292
+ """
2293
+
2294
+ def abs(psser: "Series") -> Union["Series", Column]:
2295
+ if isinstance(psser.spark.data_type, BooleanType):
2296
+ return psser
2297
+ elif isinstance(psser.spark.data_type, NumericType):
2298
+ return psser._with_new_scol(
2299
+ F.abs(psser.spark.column), field=psser._internal.data_fields[0]
2300
+ )
2301
+ else:
2302
+ raise TypeError(
2303
+ "bad operand type for abs(): {} ({})".format(
2304
+ spark_type_to_pandas_dtype(psser.spark.data_type),
2305
+ psser.spark.data_type.simpleString(),
2306
+ )
2307
+ )
2308
+
2309
+ return self._apply_series_op(abs)
2310
+
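Note that the `abs` helper returns boolean columns unchanged and only applies `F.abs` to numeric columns, so a mixed frame keeps its boolean data as-is. A small hedged usage sketch with made-up data:

```python
import pyspark.pandas as ps

psdf = ps.DataFrame({"flag": [True, False], "x": [-1.10, 2.0]})

# 'flag' is passed through untouched; 'x' becomes [1.10, 2.0].
print(psdf.abs())
```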
2311
+ # TODO: by argument only support the grouping name and as_index only for now. Documentation
2312
+ # should be updated when it's supported.
2313
+ def groupby(
2314
+ self: FrameLike,
2315
+ by: Union[Name, "Series", List[Union[Name, "Series"]]],
2316
+ axis: Axis = 0,
2317
+ as_index: bool = True,
2318
+ dropna: bool = True,
2319
+ ) -> "GroupBy[FrameLike]":
2320
+ """
2321
+ Group DataFrame or Series using one or more columns.
2322
+
2323
+ A groupby operation involves some combination of splitting the
2324
+ object, applying a function, and combining the results. This can be
2325
+ used to group large amounts of data and compute operations on these
2326
+ groups.
2327
+
2328
+ Parameters
2329
+ ----------
2330
+ by: Series, label, or list of labels
2331
+ Used to determine the groups for the groupby.
2332
+ If Series is passed, the Series or dict VALUES
2333
+ will be used to determine the groups. A label or list of
2334
+ labels may be passed to group by the columns in ``self``.
2335
+ axis: int, default 0 or 'index'
2336
+ Can only be set to 0 now.
2337
+ as_index: bool, default True
2338
+ For aggregated output, return object with group labels as the
2339
+ index. Only relevant for DataFrame input. as_index=False is
2340
+ effectively "SQL-style" grouped output.
2341
+ dropna: bool, default True
2342
+ If True, and if group keys contain NA values,
2343
+ NA values together with row/column will be dropped.
2344
+ If False, NA values will also be treated as the key in groups.
2345
+
2346
+ Returns
2347
+ -------
2348
+ DataFrameGroupBy or SeriesGroupBy
2349
+ Depends on the calling object and returns groupby object that
2350
+ contains information about the groups.
2351
+
2352
+ See Also
2353
+ --------
2354
+ pyspark.pandas.groupby.GroupBy
2355
+
2356
+ Examples
2357
+ --------
2358
+ >>> df = ps.DataFrame({'Animal': ['Falcon', 'Falcon',
2359
+ ... 'Parrot', 'Parrot'],
2360
+ ... 'Max Speed': [380., 370., 24., 26.]},
2361
+ ... columns=['Animal', 'Max Speed'])
2362
+ >>> df
2363
+ Animal Max Speed
2364
+ 0 Falcon 380.0
2365
+ 1 Falcon 370.0
2366
+ 2 Parrot 24.0
2367
+ 3 Parrot 26.0
2368
+
2369
+ >>> df.groupby(['Animal']).mean().sort_index() # doctest: +NORMALIZE_WHITESPACE
2370
+ Max Speed
2371
+ Animal
2372
+ Falcon 375.0
2373
+ Parrot 25.0
2374
+
2375
+ >>> df.groupby(['Animal'], as_index=False).mean().sort_values('Animal')
2376
+ ... # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
2377
+ Animal Max Speed
2378
+ ...Falcon 375.0
2379
+ ...Parrot 25.0
2380
+
2381
+ We can also choose to include NA in group keys or not by setting dropna parameter,
2382
+ the default setting is True:
2383
+
2384
+ >>> l = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]]
2385
+ >>> df = ps.DataFrame(l, columns=["a", "b", "c"])
2386
+ >>> df.groupby(by=["b"]).sum().sort_index() # doctest: +NORMALIZE_WHITESPACE
2387
+ a c
2388
+ b
2389
+ 1.0 2 3
2390
+ 2.0 2 5
2391
+
2392
+ >>> df.groupby(by=["b"], dropna=False).sum().sort_index() # doctest: +NORMALIZE_WHITESPACE
2393
+ a c
2394
+ b
2395
+ 1.0 2 3
2396
+ 2.0 2 5
2397
+ NaN 1 4
2398
+ """
2399
+ new_by: List[Union[Label, ps.Series]]
2400
+ if isinstance(by, ps.DataFrame):
2401
+ raise ValueError("Grouper for '{}' not 1-dimensional".format(type(by).__name__))
2402
+ elif isinstance(by, ps.Series):
2403
+ new_by = [by]
2404
+ elif is_name_like_tuple(by):
2405
+ if isinstance(self, ps.Series):
2406
+ raise KeyError(by)
2407
+ new_by = [cast(Label, by)]
2408
+ elif is_name_like_value(by):
2409
+ if isinstance(self, ps.Series):
2410
+ raise KeyError(by)
2411
+ new_by = [cast(Label, (by,))]
2412
+ elif is_list_like(by):
2413
+ new_by = []
2414
+ for key in by:
2415
+ if isinstance(key, ps.DataFrame):
2416
+ raise ValueError(
2417
+ "Grouper for '{}' not 1-dimensional".format(type(key).__name__)
2418
+ )
2419
+ elif isinstance(key, ps.Series):
2420
+ new_by.append(key)
2421
+ elif is_name_like_tuple(key):
2422
+ if isinstance(self, ps.Series):
2423
+ raise KeyError(key)
2424
+ new_by.append(cast(Label, key))
2425
+ elif is_name_like_value(key):
2426
+ if isinstance(self, ps.Series):
2427
+ raise KeyError(key)
2428
+ new_by.append(cast(Label, (key,)))
2429
+ else:
2430
+ raise ValueError(
2431
+ "Grouper for '{}' not 1-dimensional".format(type(key).__name__)
2432
+ )
2433
+ else:
2434
+ raise ValueError("Grouper for '{}' not 1-dimensional".format(type(by).__name__))
2435
+ if not len(new_by):
2436
+ raise ValueError("No group keys passed!")
2437
+ axis = validate_axis(axis)
2438
+ if axis != 0:
2439
+ raise NotImplementedError('axis should be either 0 or "index" currently.')
2440
+
2441
+ return self._build_groupby(by=new_by, as_index=as_index, dropna=dropna)
2442
+
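The key normalization above accepts a single label, a tuple label, a Series, or a list mixing them, converting plain labels to tuples internally; anything else (for example a DataFrame) is rejected as not 1-dimensional. A short usage sketch mixing a column label and a Series key; the data is illustrative:

```python
import pyspark.pandas as ps

psdf = ps.DataFrame(
    {"Animal": ["Falcon", "Falcon", "Parrot", "Parrot"],
     "Max Speed": [380.0, 370.0, 24.0, 26.0]}
)

# Group by a column label ...
print(psdf.groupby("Animal")["Max Speed"].mean().sort_index())

# ... or by a Series; both go through the same key normalization.
print(psdf.groupby(psdf["Animal"])["Max Speed"].mean().sort_index())
```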
2443
+ @abstractmethod
2444
+ def _build_groupby(
2445
+ self: FrameLike, by: List[Union["Series", Label]], as_index: bool, dropna: bool
2446
+ ) -> "GroupBy[FrameLike]":
2447
+ pass
2448
+
2449
+ def bool(self) -> bool:
2450
+ """
2451
+ Return the bool of a single element in the current object.
2452
+
2453
+ This must be a boolean scalar value, either True or False. Raise a ValueError if
2454
+ the object does not have exactly 1 element, or that element is not boolean
2455
+
2456
+ Returns
2457
+ -------
2458
+ bool
2459
+
2460
+ Examples
2461
+ --------
2462
+ >>> ps.DataFrame({'a': [True]}).bool()
2463
+ True
2464
+
2465
+ >>> ps.Series([False]).bool()
2466
+ False
2467
+
2468
+ If non-boolean or multiple values exist, it raises an exception in all
2469
+ cases as below.
2470
+
2471
+ >>> ps.DataFrame({'a': ['a']}).bool()
2472
+ Traceback (most recent call last):
2473
+ ...
2474
+ ValueError: bool cannot act on a non-boolean single element DataFrame
2475
+
2476
+ >>> ps.DataFrame({'a': [True], 'b': [False]}).bool() # doctest: +NORMALIZE_WHITESPACE
2477
+ Traceback (most recent call last):
2478
+ ...
2479
+ ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(),
2480
+ a.item(), a.any() or a.all().
2481
+
2482
+ >>> ps.Series([1]).bool()
2483
+ Traceback (most recent call last):
2484
+ ...
2485
+ ValueError: bool cannot act on a non-boolean single element DataFrame
2486
+ """
2487
+ if isinstance(self, ps.DataFrame):
2488
+ df = self
2489
+ elif isinstance(self, ps.Series):
2490
+ df = self.to_dataframe()
2491
+ else:
+ raise TypeError(
+ "bool() expects DataFrame or Series; however, " "got [%s]" % (self,)
+ )
+ return df.head(2)._to_internal_pandas().bool()
2492
+
2493
+ def first_valid_index(self) -> Optional[Union[Scalar, Tuple[Scalar, ...]]]:
2494
+ """
2495
+ Retrieves the index of the first valid value.
2496
+
2497
+ Returns
2498
+ -------
2499
+ scalar, tuple, or None
2500
+
2501
+ Examples
2502
+ --------
2503
+
2504
+ Support for DataFrame
2505
+
2506
+ >>> psdf = ps.DataFrame({'a': [None, 2, 3, 2],
2507
+ ... 'b': [None, 2.0, 3.0, 1.0],
2508
+ ... 'c': [None, 200, 400, 200]},
2509
+ ... index=['Q', 'W', 'E', 'R'])
2510
+ >>> psdf
2511
+ a b c
2512
+ Q NaN NaN NaN
2513
+ W 2.0 2.0 200.0
2514
+ E 3.0 3.0 400.0
2515
+ R 2.0 1.0 200.0
2516
+
2517
+ >>> psdf.first_valid_index()
2518
+ 'W'
2519
+
2520
+ Support for MultiIndex columns
2521
+
2522
+ >>> psdf.columns = pd.MultiIndex.from_tuples([('a', 'x'), ('b', 'y'), ('c', 'z')])
2523
+ >>> psdf
2524
+ a b c
2525
+ x y z
2526
+ Q NaN NaN NaN
2527
+ W 2.0 2.0 200.0
2528
+ E 3.0 3.0 400.0
2529
+ R 2.0 1.0 200.0
2530
+
2531
+ >>> psdf.first_valid_index()
2532
+ 'W'
2533
+
2534
+ Support for Series.
2535
+
2536
+ >>> s = ps.Series([None, None, 3, 4, 5], index=[100, 200, 300, 400, 500])
2537
+ >>> s
2538
+ 100 NaN
2539
+ 200 NaN
2540
+ 300 3.0
2541
+ 400 4.0
2542
+ 500 5.0
2543
+ dtype: float64
2544
+
2545
+ >>> s.first_valid_index()
2546
+ 300
2547
+
2548
+ Support for MultiIndex
2549
+
2550
+ >>> midx = pd.MultiIndex([['lama', 'cow', 'falcon'],
2551
+ ... ['speed', 'weight', 'length']],
2552
+ ... [[0, 0, 0, 1, 1, 1, 2, 2, 2],
2553
+ ... [0, 1, 2, 0, 1, 2, 0, 1, 2]])
2554
+ >>> s = ps.Series([None, None, None, None, 250, 1.5, 320, 1, 0.3], index=midx)
2555
+ >>> s
2556
+ lama speed NaN
2557
+ weight NaN
2558
+ length NaN
2559
+ cow speed NaN
2560
+ weight 250.0
2561
+ length 1.5
2562
+ falcon speed 320.0
2563
+ weight 1.0
2564
+ length 0.3
2565
+ dtype: float64
2566
+
2567
+ >>> s.first_valid_index()
2568
+ ('cow', 'weight')
2569
+ """
2570
+ data_spark_columns = self._internal.data_spark_columns
2571
+
2572
+ if len(data_spark_columns) == 0:
2573
+ return None
2574
+
2575
+ cond = reduce(lambda x, y: x & y, map(lambda x: x.isNotNull(), data_spark_columns))
2576
+
2577
+ with sql_conf({SPARK_CONF_ARROW_ENABLED: False}):
2578
+ # Disable Arrow to keep row ordering.
2579
+ first_valid_row = (
2580
+ self._internal.spark_frame.filter(cond)
2581
+ .select(self._internal.index_spark_columns)
2582
+ .limit(1)
2583
+ .toPandas()
2584
+ )
2585
+
2586
+ # For Empty Series or DataFrame, returns None.
2587
+ if len(first_valid_row) == 0:
2588
+ return None
2589
+
2590
+ first_valid_row = first_valid_row.iloc[0]
2591
+ if len(first_valid_row) == 1:
2592
+ return first_valid_row.iloc[0]
2593
+ else:
2594
+ return tuple(first_valid_row)
2595
+
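The lookup builds a single "all data columns are non-null" predicate by AND-ing `isNotNull()` over every data column with `reduce`, filters on it, and takes the first surviving index row (with Arrow disabled so row order is preserved). The predicate-building step in plain PySpark; the session, data and column names are illustrative:

```python
from functools import reduce
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame(
    [("Q", None, None), ("W", 2.0, 200.0), ("E", 3.0, 400.0)], ["idx", "a", "c"]
)

data_cols = ["a", "c"]
cond = reduce(lambda x, y: x & y, [F.col(c).isNotNull() for c in data_cols])

sdf.filter(cond).select("idx").limit(1).show()   # first row where every data column is set
```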
2596
+ def last_valid_index(self) -> Optional[Union[Scalar, Tuple[Scalar, ...]]]:
2597
+ """
2598
+ Return index for last non-NA/null value.
2599
+
2600
+ Returns
2601
+ -------
2602
+ scalar, tuple, or None
2603
+
2604
+ Notes
2605
+ -----
2606
+ This API only works with PySpark >= 3.0.
2607
+
2608
+ Examples
2609
+ --------
2610
+
2611
+ Support for DataFrame
2612
+
2613
+ >>> psdf = ps.DataFrame({'a': [1, 2, 3, None],
2614
+ ... 'b': [1.0, 2.0, 3.0, None],
2615
+ ... 'c': [100, 200, 400, None]},
2616
+ ... index=['Q', 'W', 'E', 'R'])
2617
+ >>> psdf
2618
+ a b c
2619
+ Q 1.0 1.0 100.0
2620
+ W 2.0 2.0 200.0
2621
+ E 3.0 3.0 400.0
2622
+ R NaN NaN NaN
2623
+
2624
+ >>> psdf.last_valid_index() # doctest: +SKIP
2625
+ 'E'
2626
+
2627
+ Support for MultiIndex columns
2628
+
2629
+ >>> psdf.columns = pd.MultiIndex.from_tuples([('a', 'x'), ('b', 'y'), ('c', 'z')])
2630
+ >>> psdf
2631
+ a b c
2632
+ x y z
2633
+ Q 1.0 1.0 100.0
2634
+ W 2.0 2.0 200.0
2635
+ E 3.0 3.0 400.0
2636
+ R NaN NaN NaN
2637
+
2638
+ >>> psdf.last_valid_index() # doctest: +SKIP
2639
+ 'E'
2640
+
2641
+ Support for Series.
2642
+
2643
+ >>> s = ps.Series([1, 2, 3, None, None], index=[100, 200, 300, 400, 500])
2644
+ >>> s
2645
+ 100 1.0
2646
+ 200 2.0
2647
+ 300 3.0
2648
+ 400 NaN
2649
+ 500 NaN
2650
+ dtype: float64
2651
+
2652
+ >>> s.last_valid_index() # doctest: +SKIP
2653
+ 300
2654
+
2655
+ Support for MultiIndex
2656
+
2657
+ >>> midx = pd.MultiIndex([['lama', 'cow', 'falcon'],
2658
+ ... ['speed', 'weight', 'length']],
2659
+ ... [[0, 0, 0, 1, 1, 1, 2, 2, 2],
2660
+ ... [0, 1, 2, 0, 1, 2, 0, 1, 2]])
2661
+ >>> s = ps.Series([250, 1.5, 320, 1, 0.3, None, None, None, None], index=midx)
2662
+ >>> s
2663
+ lama speed 250.0
2664
+ weight 1.5
2665
+ length 320.0
2666
+ cow speed 1.0
2667
+ weight 0.3
2668
+ length NaN
2669
+ falcon speed NaN
2670
+ weight NaN
2671
+ length NaN
2672
+ dtype: float64
2673
+
2674
+ >>> s.last_valid_index() # doctest: +SKIP
2675
+ ('cow', 'weight')
2676
+ """
2677
+ data_spark_columns = self._internal.data_spark_columns
2678
+
2679
+ if len(data_spark_columns) == 0:
2680
+ return None
2681
+
2682
+ cond = reduce(lambda x, y: x & y, map(lambda x: x.isNotNull(), data_spark_columns))
2683
+
2684
+ last_valid_rows = (
2685
+ self._internal.spark_frame.filter(cond)
2686
+ .select(self._internal.index_spark_columns)
2687
+ .tail(1)
2688
+ )
2689
+
2690
+ # For Empty Series or DataFrame, returns None.
2691
+ if len(last_valid_rows) == 0:
2692
+ return None
2693
+
2694
+ last_valid_row = last_valid_rows[0]
2695
+
2696
+ if len(last_valid_row) == 1:
2697
+ return last_valid_row[0]
2698
+ else:
2699
+ return tuple(last_valid_row)
2700
+
2701
+ # TODO: 'center', 'win_type', 'on', 'axis' parameter should be implemented.
2702
+ def rolling(
2703
+ self: FrameLike, window: int, min_periods: Optional[int] = None
2704
+ ) -> "Rolling[FrameLike]":
2705
+ """
2706
+ Provide rolling transformations.
2707
+
2708
+ .. note:: 'min_periods' in pandas-on-Spark works as a fixed window size unlike pandas.
2709
+ Unlike pandas, NA is also counted as the period. This might be changed
2710
+ soon.
2711
+
2712
+ Parameters
2713
+ ----------
2714
+ window: int, or offset
2715
+ Size of the moving window.
2716
+ This is the number of observations used for calculating the statistic.
2717
+ Each window will be a fixed size.
2718
+
2719
+ min_periods: int, default None
2720
+ Minimum number of observations in window required to have a value
2721
+ (otherwise result is NA).
2722
+ For a window that is specified by an offset, min_periods will default to 1.
2723
+ Otherwise, min_periods will default to the size of the window.
2724
+
2725
+ Returns
2726
+ -------
2727
+ a Window sub-classed for the operation
2728
+ """
2729
+ from pyspark.pandas.window import Rolling
2730
+
2731
+ return Rolling(self, window=window, min_periods=min_periods)
2732
+
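As the note says, `min_periods` here defaults to the window size and NA entries still count toward the window, unlike pandas. A small hedged usage sketch; the expected values follow from a window of 2 with the default `min_periods`:

```python
import pyspark.pandas as ps

psser = ps.Series([1, 2, 3, 4])

# Window of 2, min_periods defaults to the window size:
# the first element has no full window, so it comes back as NaN, then 3.0, 5.0, 7.0.
print(psser.rolling(window=2).sum())
```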
2733
+ # TODO: 'center' and 'axis' parameter should be implemented.
2734
+ # 'axis' implementation, refer https://github.com/databricks/koalas/pull/607
2735
+ def expanding(self: FrameLike, min_periods: int = 1) -> "Expanding[FrameLike]":
2736
+ """
2737
+ Provide expanding transformations.
2738
+
2739
+ .. note:: 'min_periods' in pandas-on-Spark works as a fixed window size unlike pandas.
2740
+ Unlike pandas, NA is also counted as the period. This might be changed
2741
+ soon.
2742
+
2743
+ Parameters
2744
+ ----------
2745
+ min_periods: int, default 1
2746
+ Minimum number of observations in window required to have a value
2747
+ (otherwise result is NA).
2748
+
2749
+ Returns
2750
+ -------
2751
+ a Window sub-classed for the operation
2752
+ """
2753
+ from pyspark.pandas.window import Expanding
2754
+
2755
+ return Expanding(self, min_periods=min_periods)
2756
+
2757
+ # TODO: 'adjust', 'axis', 'method' parameter should be implemented.
2758
+ def ewm(
2759
+ self: FrameLike,
2760
+ com: Optional[float] = None,
2761
+ span: Optional[float] = None,
2762
+ halflife: Optional[float] = None,
2763
+ alpha: Optional[float] = None,
2764
+ min_periods: Optional[int] = None,
2765
+ ignore_na: bool_type = False,
2766
+ ) -> "ExponentialMoving[FrameLike]":
2767
+ """
2768
+ Provide exponentially weighted window transformations.
2769
+
2770
+ .. note:: 'min_periods' in pandas-on-Spark works as a fixed window size unlike pandas.
2771
+ Unlike pandas, NA is also counted as the period. This might be changed
2772
+ soon.
2773
+
2774
+ .. versionadded:: 3.4.0
2775
+
2776
+ Parameters
2777
+ ----------
2778
+ com: float, optional
2779
+ Specify decay in terms of center of mass.
2780
+ alpha = 1 / (1 + com), for com >= 0.
2781
+
2782
+ span: float, optional
2783
+ Specify decay in terms of span.
2784
+ alpha = 2 / (span + 1), for span >= 1.
2785
+
2786
+ halflife: float, optional
2787
+ Specify decay in terms of half-life.
2788
+ alpha = 1 - exp(-ln(2) / halflife), for halflife > 0.
2789
+
2790
+ alpha: float, optional
2791
+ Specify smoothing factor alpha directly.
2792
+ 0 < alpha <= 1.
2793
+
2794
+ min_periods: int, default None
2795
+ Minimum number of observations in window required to have a value
2796
+ (otherwise result is NA).
2797
+
2798
+ ignore_na: bool, default False
2799
+ Ignore missing values when calculating weights.
2800
+
2801
+ - When ``ignore_na=False`` (default), weights are based on absolute positions.
2802
+ For example, the weights of :math:`x_0` and :math:`x_2` used in calculating
2803
+ the final weighted average of [:math:`x_0`, None, :math:`x_2`] are
2804
+ :math:`(1-\alpha)^2` and :math:`1` if ``adjust=True``, and
2805
+ :math:`(1-\alpha)^2` and :math:`\alpha` if ``adjust=False``.
2806
+
2807
+ - When ``ignore_na=True``, weights are based
2808
+ on relative positions. For example, the weights of :math:`x_0` and :math:`x_2`
2809
+ used in calculating the final weighted average of
2810
+ [:math:`x_0`, None, :math:`x_2`] are :math:`1-\alpha` and :math:`1` if
2811
+ ``adjust=True``, and :math:`1-\alpha` and :math:`\alpha` if ``adjust=False``.
2812
+
2813
+ Returns
2814
+ -------
2815
+ a Window sub-classed for the operation
2816
+ """
2817
+ from pyspark.pandas.window import ExponentialMoving
2818
+
2819
+ return ExponentialMoving(
2820
+ self,
2821
+ com=com,
2822
+ span=span,
2823
+ halflife=halflife,
2824
+ alpha=alpha,
2825
+ min_periods=min_periods,
2826
+ ignore_na=ignore_na,
2827
+ )
2828
+
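The `com`, `span`, and `halflife` parameters are alternative parameterizations of the smoothing factor `alpha` given by the formulas in the docstring above. A tiny worked check of those conversions; the numeric inputs are arbitrary:

```python
import math

com, span, halflife = 1.0, 3.0, 2.0

alpha_from_com = 1.0 / (1.0 + com)                               # com >= 0      -> 0.5
alpha_from_span = 2.0 / (span + 1.0)                             # span >= 1     -> 0.5
alpha_from_halflife = 1.0 - math.exp(-math.log(2.0) / halflife)  # halflife > 0  -> ~0.2929

print(alpha_from_com, alpha_from_span, alpha_from_halflife)
```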
2829
+ def get(self, key: Any, default: Optional[Any] = None) -> Any:
2830
+ """
2831
+ Get item from object for given key (DataFrame column, Panel slice,
2832
+ etc.). Returns default value if not found.
2833
+
2834
+ Parameters
2835
+ ----------
2836
+ key: object
2837
+
2838
+ Returns
2839
+ -------
2840
+ value: same type as items contained in object
2841
+
2842
+ Examples
2843
+ --------
2844
+ >>> df = ps.DataFrame({'x':range(3), 'y':['a','b','b'], 'z':['a','b','b']},
2845
+ ... columns=['x', 'y', 'z'], index=[10, 20, 20])
2846
+ >>> df
2847
+ x y z
2848
+ 10 0 a a
2849
+ 20 1 b b
2850
+ 20 2 b b
2851
+
2852
+ >>> df.get('x')
2853
+ 10 0
2854
+ 20 1
2855
+ 20 2
2856
+ Name: x, dtype: int64
2857
+
2858
+ >>> df.get(['x', 'y'])
2859
+ x y
2860
+ 10 0 a
2861
+ 20 1 b
2862
+ 20 2 b
2863
+
2864
+ >>> df.x.get(10)
2865
+ 0
2866
+
2867
+ >>> df.x.get(20)
2868
+ 20 1
2869
+ 20 2
2870
+ Name: x, dtype: int64
2871
+
2872
+ >>> df.x.get(15, -1)
2873
+ -1
2874
+ """
2875
+ try:
2876
+ return self[key]
2877
+ except (KeyError, ValueError, IndexError):
2878
+ return default
2879
+
2880
+ def squeeze(self, axis: Optional[Axis] = None) -> Union[Scalar, "DataFrame", "Series"]:
2881
+ """
2882
+ Squeeze 1 dimensional axis objects into scalars.
2883
+
2884
+ Series or DataFrames with a single element are squeezed to a scalar.
2885
+ DataFrames with a single column or a single row are squeezed to a
2886
+ Series. Otherwise the object is unchanged.
2887
+
2888
+ This method is most useful when you don't know if your
2889
+ object is a Series or DataFrame, but you do know it has just a single
2890
+ column. In that case you can safely call `squeeze` to ensure you have a
2891
+ Series.
2892
+
2893
+ Parameters
2894
+ ----------
2895
+ axis: {0 or 'index', 1 or 'columns', None}, default None
2896
+ A specific axis to squeeze. By default, all length-1 axes are
2897
+ squeezed.
2898
+
2899
+ Returns
2900
+ -------
2901
+ DataFrame, Series, or scalar
2902
+ The projection after squeezing `axis` or all the axes.
2903
+
2904
+ See Also
2905
+ --------
2906
+ Series.iloc: Integer-location based indexing for selecting scalars.
2907
+ DataFrame.iloc: Integer-location based indexing for selecting Series.
2908
+ Series.to_frame: Inverse of DataFrame.squeeze for a
2909
+ single-column DataFrame.
2910
+
2911
+ Examples
2912
+ --------
2913
+ >>> primes = ps.Series([2, 3, 5, 7])
2914
+
2915
+ Slicing might produce a Series with a single value:
2916
+
2917
+ >>> even_primes = primes[primes % 2 == 0]
2918
+ >>> even_primes
2919
+ 0 2
2920
+ dtype: int64
2921
+
2922
+ >>> even_primes.squeeze()
2923
+ 2
2924
+
2925
+ Squeezing objects with more than one value in every axis does nothing:
2926
+
2927
+ >>> odd_primes = primes[primes % 2 == 1]
2928
+ >>> odd_primes
2929
+ 1 3
2930
+ 2 5
2931
+ 3 7
2932
+ dtype: int64
2933
+
2934
+ >>> odd_primes.squeeze()
2935
+ 1 3
2936
+ 2 5
2937
+ 3 7
2938
+ dtype: int64
2939
+
2940
+ Squeezing is even more effective when used with DataFrames.
2941
+
2942
+ >>> df = ps.DataFrame([[1, 2], [3, 4]], columns=['a', 'b'])
2943
+ >>> df
2944
+ a b
2945
+ 0 1 2
2946
+ 1 3 4
2947
+
2948
+ Slicing a single column will produce a DataFrame with the columns
2949
+ having only one value:
2950
+
2951
+ >>> df_a = df[['a']]
2952
+ >>> df_a
2953
+ a
2954
+ 0 1
2955
+ 1 3
2956
+
2957
+ The columns can be squeezed down, resulting in a Series:
2958
+
2959
+ >>> df_a.squeeze('columns')
2960
+ 0 1
2961
+ 1 3
2962
+ Name: a, dtype: int64
2963
+
2964
+ Slicing a single row from a single column will produce a single
2965
+ scalar DataFrame:
2966
+
2967
+ >>> df_1a = df.loc[[1], ['a']]
2968
+ >>> df_1a
2969
+ a
2970
+ 1 3
2971
+
2972
+ Squeezing the rows produces a single scalar Series:
2973
+
2974
+ >>> df_1a.squeeze('rows')
2975
+ a 3
2976
+ Name: 1, dtype: int64
2977
+
2978
+ Squeezing all axes will project directly into a scalar:
2979
+
2980
+ >>> df_1a.squeeze()
2981
+ 3
2982
+ """
2983
+ if axis is not None:
2984
+ axis = "index" if axis == "rows" else axis
2985
+ axis = validate_axis(axis)
2986
+
2987
+ if isinstance(self, ps.DataFrame):
2988
+ from pyspark.pandas.series import first_series
2989
+
2990
+ is_squeezable = len(self.columns[:2]) == 1
2991
+ # If DataFrame has multiple columns, there is no change.
2992
+ if not is_squeezable:
2993
+ return self
2994
+ series_from_column = first_series(self)
2995
+ has_single_value = len(series_from_column.head(2)) == 1
2996
+ # If DataFrame has only a single value, use pandas API directly.
2997
+ if has_single_value:
2998
+ result = self._to_internal_pandas().squeeze(axis)
2999
+ return ps.Series(result) if isinstance(result, pd.Series) else result
3000
+ elif axis == 0:
3001
+ return self
3002
+ else:
3003
+ return series_from_column
3004
+ else:
3005
+ # The case of Series is simple.
3006
+ # If Series has only a single value, just return it as a scalar.
3007
+ # Otherwise, there is no change.
3008
+ self_top_two = cast("Series", self).head(2)
3009
+ has_single_value = len(self_top_two) == 1
3010
+ return cast(Union[Scalar, ps.Series], self_top_two[0] if has_single_value else self)
3011
+
3012
+ def truncate(
3013
+ self,
3014
+ before: Optional[Any] = None,
3015
+ after: Optional[Any] = None,
3016
+ axis: Optional[Axis] = None,
3017
+ copy: bool_type = True,
3018
+ ) -> DataFrameOrSeries:
3019
+ """
3020
+ Truncate a Series or DataFrame before and after some index value.
3021
+
3022
+ This is a useful shorthand for boolean indexing based on index
3023
+ values above or below certain thresholds.
3024
+
3025
+ .. note:: This API is dependent on :meth:`Index.is_monotonic_increasing`
3026
+ which can be expensive.
3027
+
3028
+ Parameters
3029
+ ----------
3030
+ before: date, str, int
3031
+ Truncate all rows before this index value.
3032
+ after: date, str, int
3033
+ Truncate all rows after this index value.
3034
+ axis: {0 or 'index', 1 or 'columns'}, optional
3035
+ Axis to truncate. Truncates the index (rows) by default.
3036
+ copy: bool, default True
3037
+ Return a copy of the truncated section.
3038
+
3039
+ Returns
3040
+ -------
3041
+ type of caller
3042
+ The truncated Series or DataFrame.
3043
+
3044
+ See Also
3045
+ --------
3046
+ DataFrame.loc: Select a subset of a DataFrame by label.
3047
+ DataFrame.iloc: Select a subset of a DataFrame by position.
3048
+
3049
+ Examples
3050
+ --------
3051
+ >>> df = ps.DataFrame({'A': ['a', 'b', 'c', 'd', 'e'],
3052
+ ... 'B': ['f', 'g', 'h', 'i', 'j'],
3053
+ ... 'C': ['k', 'l', 'm', 'n', 'o']},
3054
+ ... index=[1, 2, 3, 4, 5])
3055
+ >>> df
3056
+ A B C
3057
+ 1 a f k
3058
+ 2 b g l
3059
+ 3 c h m
3060
+ 4 d i n
3061
+ 5 e j o
3062
+
3063
+ >>> df.truncate(before=2, after=4)
3064
+ A B C
3065
+ 2 b g l
3066
+ 3 c h m
3067
+ 4 d i n
3068
+
3069
+ The columns of a DataFrame can be truncated.
3070
+
3071
+ >>> df.truncate(before="A", after="B", axis="columns")
3072
+ A B
3073
+ 1 a f
3074
+ 2 b g
3075
+ 3 c h
3076
+ 4 d i
3077
+ 5 e j
3078
+
3079
+ For Series, only rows can be truncated.
3080
+
3081
+ >>> df['A'].truncate(before=2, after=4)
3082
+ 2 b
3083
+ 3 c
3084
+ 4 d
3085
+ Name: A, dtype: object
3086
+
3087
+ A Series with a sorted integer index:
3088
+
3089
+ >>> s = ps.Series([10, 20, 30, 40, 50, 60, 70],
3090
+ ... index=[1, 2, 3, 4, 5, 6, 7])
3091
+ >>> s
3092
+ 1 10
3093
+ 2 20
3094
+ 3 30
3095
+ 4 40
3096
+ 5 50
3097
+ 6 60
3098
+ 7 70
3099
+ dtype: int64
3100
+
3101
+ >>> s.truncate(2, 5)
3102
+ 2 20
3103
+ 3 30
3104
+ 4 40
3105
+ 5 50
3106
+ dtype: int64
3107
+
3108
+ A Series with a sorted string index:
3109
+
3110
+ >>> s = ps.Series([10, 20, 30, 40, 50, 60, 70],
3111
+ ... index=['a', 'b', 'c', 'd', 'e', 'f', 'g'])
3112
+ >>> s
3113
+ a 10
3114
+ b 20
3115
+ c 30
3116
+ d 40
3117
+ e 50
3118
+ f 60
3119
+ g 70
3120
+ dtype: int64
3121
+
3122
+ >>> s.truncate('b', 'e')
3123
+ b 20
3124
+ c 30
3125
+ d 40
3126
+ e 50
3127
+ dtype: int64
3128
+ """
3129
+ from pyspark.pandas.series import first_series
3130
+
3131
+ axis = validate_axis(axis)
3132
+ indexes = self.index
3133
+ indexes_increasing = indexes.is_monotonic_increasing
3134
+ if not indexes_increasing and not indexes.is_monotonic_decreasing:
3135
+ raise ValueError("truncate requires a sorted index")
3136
+ if (before is None) and (after is None):
3137
+ return cast(Union[ps.DataFrame, ps.Series], self.copy() if copy else self)
3138
+ if (before is not None and after is not None) and before > after:
3139
+ raise ValueError("Truncate: %s must be after %s" % (after, before))
3140
+
3141
+ if isinstance(self, ps.Series):
3142
+ if indexes_increasing:
3143
+ result = first_series(
3144
+ self.to_frame().loc[before:after] # type: ignore[arg-type]
3145
+ ).rename(self.name)
3146
+ else:
3147
+ result = first_series(
3148
+ self.to_frame().loc[after:before] # type: ignore[arg-type]
3149
+ ).rename(self.name)
3150
+ elif isinstance(self, ps.DataFrame):
3151
+ if axis == 0:
3152
+ if indexes_increasing:
3153
+ result = self.loc[before:after] # type: ignore[assignment]
3154
+ else:
3155
+ result = self.loc[after:before] # type: ignore[assignment]
3156
+ elif axis == 1:
3157
+ result = self.loc[:, before:after] # type: ignore[assignment]
3158
+
3159
+ return cast(DataFrameOrSeries, result.copy() if copy else result)
3160
+
3161
+ def to_markdown(
3162
+ self, buf: Optional[Union[IO[str], str]] = None, mode: Optional[str] = None
3163
+ ) -> str:
3164
+ """
3165
+ Print Series or DataFrame in Markdown-friendly format.
3166
+
3167
+ .. note:: This method should only be used if the resulting pandas object is expected
3168
+ to be small, as all the data is loaded into the driver's memory.
3169
+
3170
+ Parameters
3171
+ ----------
3172
+ buf: writable buffer, defaults to sys.stdout
3173
+ Where to send the output. By default, the output is printed to
3174
+ sys.stdout. Pass a writable buffer if you need to further process
3175
+ the output.
3176
+ mode: str, optional
3177
+ Mode in which file is opened.
3178
+ **kwargs
3179
+ These parameters will be passed to `tabulate`.
3180
+
3181
+ Returns
3182
+ -------
3183
+ str
3184
+ Series or DataFrame in Markdown-friendly format.
3185
+
3186
+ Notes
3187
+ -----
3188
+ Requires the `tabulate <https://pypi.org/project/tabulate>`_ package.
3189
+
3190
+ Examples
3191
+ --------
3192
+ >>> psser = ps.Series(["elk", "pig", "dog", "quetzal"], name="animal")
3193
+ >>> print(psser.to_markdown()) # doctest: +SKIP
3194
+ | | animal |
3195
+ |---:|:---------|
3196
+ | 0 | elk |
3197
+ | 1 | pig |
3198
+ | 2 | dog |
3199
+ | 3 | quetzal |
3200
+
3201
+ >>> psdf = ps.DataFrame(
3202
+ ... data={"animal_1": ["elk", "pig"], "animal_2": ["dog", "quetzal"]}
3203
+ ... )
3204
+ >>> print(psdf.to_markdown()) # doctest: +SKIP
3205
+ | | animal_1 | animal_2 |
3206
+ |---:|:-----------|:-----------|
3207
+ | 0 | elk | dog |
3208
+ | 1 | pig | quetzal |
3209
+ """
3210
+ log_advice(
3211
+ "`to_markdown` loads all data into the driver's memory. "
3212
+ "It should only be used if the resulting pandas object is expected to be small."
3213
+ )
3214
+ # Make sure locals() call is at the top of the function so we don't capture local variables.
3215
+ args = locals()
3216
+ psser_or_psdf = self
3217
+ internal_pandas = psser_or_psdf._to_internal_pandas()
3218
+ return validate_arguments_and_invoke_function(
3219
+ internal_pandas, self.to_markdown, type(internal_pandas).to_markdown, args
3220
+ )
3221
+
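# Illustrative sketch (not part of the packaged file): since ``to_markdown`` is
# annotated to return the rendered string, the caller can capture it and write
# it out. Requires the ``tabulate`` package; "animals.md" is just an example path.
import pyspark.pandas as ps

psser = ps.Series(["elk", "pig"], name="animal")
markdown = psser.to_markdown()
with open("animals.md", "w") as f:
    f.write(markdown)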
3222
+ @abstractmethod
3223
+ def fillna(
3224
+ self: FrameLike,
3225
+ value: Optional[Any] = None,
3226
+ method: Optional[str] = None,
3227
+ axis: Optional[Axis] = None,
3228
+ inplace: bool_type = False,
3229
+ limit: Optional[int] = None,
3230
+ ) -> FrameLike:
3231
+ pass
3232
+
3233
+ # TODO: add 'downcast' when value parameter exists
3234
+ def bfill(
3235
+ self: FrameLike,
3236
+ axis: Optional[Axis] = None,
3237
+ inplace: bool_type = False,
3238
+ limit: Optional[int] = None,
3239
+ ) -> FrameLike:
3240
+ """
3241
+ Synonym for `DataFrame.fillna()` or `Series.fillna()` with ``method=`bfill```.
3242
+
3243
+ .. note:: the current implementation of 'bfill' uses Spark's Window
3244
+ without specifying a partition specification. This leads to moving all data into a
3245
+ single partition on a single machine and could cause serious
3246
+ performance degradation. Avoid this method with very large datasets.
3247
+
3248
+ Parameters
3249
+ ----------
3250
+ axis: {0 or `index`}
3251
+ 1 and `columns` are not supported.
3252
+ inplace: boolean, default False
3253
+ Fill in place (do not create a new object)
3254
+ limit: int, default None
3255
+ If method is specified, this is the maximum number of consecutive NaN values to
3256
+ forward/backward fill. In other words, if there is a gap with more than this number of
3257
+ consecutive NaNs, it will only be partially filled. If method is not specified,
3258
+ this is the maximum number of entries along the entire axis where NaNs will be filled.
3259
+ Must be greater than 0 if not None
3260
+
3261
+ Returns
3262
+ -------
3263
+ DataFrame or Series
3264
+ DataFrame or Series with NA entries filled.
3265
+
3266
+ Examples
3267
+ --------
3268
+ >>> psdf = ps.DataFrame({
3269
+ ... 'A': [None, 3, None, None],
3270
+ ... 'B': [2, 4, None, 3],
3271
+ ... 'C': [None, None, None, 1],
3272
+ ... 'D': [0, 1, 5, 4]
3273
+ ... },
3274
+ ... columns=['A', 'B', 'C', 'D'])
3275
+ >>> psdf
3276
+ A B C D
3277
+ 0 NaN 2.0 NaN 0
3278
+ 1 3.0 4.0 NaN 1
3279
+ 2 NaN NaN NaN 5
3280
+ 3 NaN 3.0 1.0 4
3281
+
3282
+ Propagate non-null values backward.
3283
+
3284
+ >>> psdf.bfill()
3285
+ A B C D
3286
+ 0 3.0 2.0 1.0 0
3287
+ 1 3.0 4.0 1.0 1
3288
+ 2 NaN 3.0 1.0 5
3289
+ 3 NaN 3.0 1.0 4
3290
+
3291
+ For Series
3292
+
3293
+ >>> psser = ps.Series([None, None, None, 1])
3294
+ >>> psser
3295
+ 0 NaN
3296
+ 1 NaN
3297
+ 2 NaN
3298
+ 3 1.0
3299
+ dtype: float64
3300
+
3301
+ >>> psser.bfill()
3302
+ 0 1.0
3303
+ 1 1.0
3304
+ 2 1.0
3305
+ 3 1.0
3306
+ dtype: float64
3307
+ """
3308
+ return self.fillna(method="bfill", axis=axis, inplace=inplace, limit=limit)
3309
+
3310
+ backfill = bfill
3311
+
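# Illustrative sketch (not part of the packaged file): ``limit`` bounds how many
# consecutive NaNs ``bfill`` fills, per the parameter description above.
import pyspark.pandas as ps

psser = ps.Series([None, None, None, 1.0])
# With limit=1, only the NaN adjacent to the valid value is expected to be
# filled; the earlier NaNs in the gap should remain NaN.
print(psser.bfill(limit=1))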
3312
+ # TODO: add 'downcast' when value parameter exists
3313
+ def ffill(
3314
+ self: FrameLike,
3315
+ axis: Optional[Axis] = None,
3316
+ inplace: bool_type = False,
3317
+ limit: Optional[int] = None,
3318
+ ) -> FrameLike:
3319
+ """
3320
+ Synonym for `DataFrame.fillna()` or `Series.fillna()` with ``method=`ffill```.
3321
+
3322
+ .. note:: the current implementation of 'ffill' uses Spark's Window
3323
+ without specifying a partition specification. This leads to moving all data into a
3324
+ single partition on a single machine and could cause serious
3325
+ performance degradation. Avoid this method with very large datasets.
3326
+
3327
+ Parameters
3328
+ ----------
3329
+ axis: {0 or `index`}
3330
+ 1 and `columns` are not supported.
3331
+ inplace: boolean, default False
3332
+ Fill in place (do not create a new object)
3333
+ limit: int, default None
3334
+ If method is specified, this is the maximum number of consecutive NaN values to
3335
+ forward/backward fill. In other words, if there is a gap with more than this number of
3336
+ consecutive NaNs, it will only be partially filled. If method is not specified,
3337
+ this is the maximum number of entries along the entire axis where NaNs will be filled.
3338
+ Must be greater than 0 if not None
3339
+
3340
+ Returns
3341
+ -------
3342
+ DataFrame or Series
3343
+ DataFrame or Series with NA entries filled.
3344
+
3345
+ Examples
3346
+ --------
3347
+ >>> psdf = ps.DataFrame({
3348
+ ... 'A': [None, 3, None, None],
3349
+ ... 'B': [2, 4, None, 3],
3350
+ ... 'C': [None, None, None, 1],
3351
+ ... 'D': [0, 1, 5, 4]
3352
+ ... },
3353
+ ... columns=['A', 'B', 'C', 'D'])
3354
+ >>> psdf
3355
+ A B C D
3356
+ 0 NaN 2.0 NaN 0
3357
+ 1 3.0 4.0 NaN 1
3358
+ 2 NaN NaN NaN 5
3359
+ 3 NaN 3.0 1.0 4
3360
+
3361
+ Propagate non-null values forward.
3362
+
3363
+ >>> psdf.ffill()
3364
+ A B C D
3365
+ 0 NaN 2.0 NaN 0
3366
+ 1 3.0 4.0 NaN 1
3367
+ 2 3.0 4.0 NaN 5
3368
+ 3 3.0 3.0 1.0 4
3369
+
3370
+ For Series
3371
+
3372
+ >>> psser = ps.Series([2, 4, None, 3])
3373
+ >>> psser
3374
+ 0 2.0
3375
+ 1 4.0
3376
+ 2 NaN
3377
+ 3 3.0
3378
+ dtype: float64
3379
+
3380
+ >>> psser.ffill()
3381
+ 0 2.0
3382
+ 1 4.0
3383
+ 2 4.0
3384
+ 3 3.0
3385
+ dtype: float64
3386
+ """
3387
+ return self.fillna(method="ffill", axis=axis, inplace=inplace, limit=limit)
3388
+
3389
+ pad = ffill
3390
+
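# Illustrative sketch (not part of the packaged file): ``pad`` is bound to the
# same method as ``ffill``, so the two calls below are expected to be
# interchangeable.
import pyspark.pandas as ps

psser = ps.Series([2.0, None, 3.0])
print(psser.ffill())
print(psser.pad())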
3391
+ # TODO: add 'axis', 'inplace', 'downcast'
3392
+ def interpolate(
3393
+ self: FrameLike,
3394
+ method: str = "linear",
3395
+ limit: Optional[int] = None,
3396
+ limit_direction: Optional[str] = None,
3397
+ limit_area: Optional[str] = None,
3398
+ ) -> FrameLike:
3399
+ """
3400
+ Fill NaN values using an interpolation method.
3401
+
3402
+ .. note:: the current implementation of interpolate uses Spark's Window without
3403
+ specifying a partition specification. This leads to moving all data into a
3404
+ single partition on a single machine and could cause serious
3405
+ performance degradation. Avoid this method with very large datasets.
3406
+
3407
+ .. versionadded:: 3.4.0
3408
+
3409
+ Parameters
3410
+ ----------
3411
+ method: str, default 'linear'
3412
+ Interpolation technique to use. One of:
3413
+
3414
+ * 'linear': Ignore the index and treat the values as equally
3415
+ spaced.
3416
+
3417
+ limit: int, optional
3418
+ Maximum number of consecutive NaNs to fill. Must be greater than
3419
+ 0.
3420
+
3421
+ limit_direction: str, default None
3422
+ Consecutive NaNs will be filled in this direction.
3423
+ One of {'forward', 'backward', 'both'}.
3424
+
3425
+ limit_area: str, default None
3426
+ If limit is specified, consecutive NaNs will be filled with this restriction. One of:
3427
+
3428
+ * None: No fill restriction.
3429
+ * 'inside': Only fill NaNs surrounded by valid values (interpolate).
3430
+ * 'outside': Only fill NaNs outside valid values (extrapolate).
3431
+
3432
+ Returns
3433
+ -------
3434
+ Series or DataFrame or None
3435
+ Returns the same object type as the caller, interpolated at
3436
+ some or all NA values.
3437
+
3438
+ See Also
3439
+ --------
3440
+ fillna: Fill missing values using different methods.
3441
+
3442
+ Examples
3443
+ --------
3444
+ Filling in NA via linear interpolation.
3445
+
3446
+ >>> s = ps.Series([0, 1, np.nan, 3])
3447
+ >>> s
3448
+ 0 0.0
3449
+ 1 1.0
3450
+ 2 NaN
3451
+ 3 3.0
3452
+ dtype: float64
3453
+ >>> s.interpolate()
3454
+ 0 0.0
3455
+ 1 1.0
3456
+ 2 2.0
3457
+ 3 3.0
3458
+ dtype: float64
3459
+
3460
+ Fill the DataFrame forward (that is, going down) along each column
3461
+ using linear interpolation.
3462
+
3463
+ Note how the last entry in column 'a' is interpolated differently,
3464
+ because there is no entry after it to use for interpolation.
3465
+ Note how the first entry in column 'b' remains NA, because there
3466
+ is no entry before it to use for interpolation.
3467
+
3468
+ >>> df = ps.DataFrame([(0.0, np.nan, -1.0, 1.0),
3469
+ ... (np.nan, 2.0, np.nan, np.nan),
3470
+ ... (2.0, 3.0, np.nan, 9.0),
3471
+ ... (np.nan, 4.0, -4.0, 16.0)],
3472
+ ... columns=list('abcd'))
3473
+ >>> df
3474
+ a b c d
3475
+ 0 0.0 NaN -1.0 1.0
3476
+ 1 NaN 2.0 NaN NaN
3477
+ 2 2.0 3.0 NaN 9.0
3478
+ 3 NaN 4.0 -4.0 16.0
3479
+ >>> df.interpolate(method='linear')
3480
+ a b c d
3481
+ 0 0.0 NaN -1.0 1.0
3482
+ 1 1.0 2.0 -2.0 5.0
3483
+ 2 2.0 3.0 -3.0 9.0
3484
+ 3 2.0 4.0 -4.0 16.0
3485
+ """
3486
+ return self.interpolate(
3487
+ method=method, limit=limit, limit_direction=limit_direction, limit_area=limit_area
3488
+ )
3489
+
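# Illustrative sketch (not part of the packaged file): directional filling with
# ``limit_direction``, per the parameter description above.
import numpy as np
import pyspark.pandas as ps

s = ps.Series([np.nan, 1.0, np.nan, 3.0, np.nan])
# 'both' is expected to also fill the leading NaN, which the default forward
# direction would leave untouched at the start of the series.
print(s.interpolate(limit_direction="both"))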
3490
+ @property
3491
+ def at(self) -> AtIndexer:
3492
+ return AtIndexer(self)
3493
+
3494
+ at.__doc__ = AtIndexer.__doc__
3495
+
3496
+ @property
3497
+ def iat(self) -> iAtIndexer:
3498
+ return iAtIndexer(self)
3499
+
3500
+ iat.__doc__ = iAtIndexer.__doc__
3501
+
3502
+ @property
3503
+ def iloc(self) -> iLocIndexer:
3504
+ return iLocIndexer(self)
3505
+
3506
+ iloc.__doc__ = iLocIndexer.__doc__
3507
+
3508
+ @property
3509
+ def loc(self) -> LocIndexer:
3510
+ return LocIndexer(self)
3511
+
3512
+ loc.__doc__ = LocIndexer.__doc__
3513
+
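# Illustrative sketch (not part of the packaged file): the four indexers exposed
# above, shown on a small DataFrame. Only access patterns advertised by the
# indexer docstrings are used here.
import pyspark.pandas as ps

df = ps.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=[10, 20, 30])
print(df.loc[20, "a"])     # label-based access
print(df.at[20, "a"])      # label-based scalar access
print(df.iloc[1:2, [0]])   # position-based selection (slice rows, list columns)
print(df.iat[1, 0])        # position-based scalar access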
3514
+ def __bool__(self) -> NoReturn:
3515
+ raise ValueError(
3516
+ "The truth value of a {0} is ambiguous. "
3517
+ "Use a.empty, a.bool(), a.item(), a.any() or a.all().".format(self.__class__.__name__)
3518
+ )
3519
+
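# Illustrative sketch (not part of the packaged file): why __bool__ raises, and
# the explicit reductions the error message points to.
import pyspark.pandas as ps

s = ps.Series([True, False])
try:
    if s:  # ambiguous: "any element is True" or "all elements are True"?
        pass
except ValueError as err:
    print(err)
print(s.any())  # explicit reduction over the elements
print(s.all())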
3520
+ @staticmethod
3521
+ def _count_expr(psser: "Series") -> Column:
3522
+ return F.count(psser._dtype_op.nan_to_null(psser).spark.column)
3523
+
3524
+
3525
+ def _test() -> None:
3526
+ import os
3527
+ import doctest
3528
+ import shutil
3529
+ import sys
3530
+ import tempfile
3531
+ from pyspark.sql import SparkSession
3532
+ import pyspark.pandas.generic
3533
+
3534
+ os.chdir(os.environ["SPARK_HOME"])
3535
+
3536
+ globs = pyspark.pandas.generic.__dict__.copy()
3537
+ globs["ps"] = pyspark.pandas
3538
+ spark = (
3539
+ SparkSession.builder.master("local[4]")
3540
+ .appName("pyspark.pandas.generic tests")
3541
+ .getOrCreate()
3542
+ )
3543
+
3544
+ path = tempfile.mkdtemp()
3545
+ globs["path"] = path
3546
+
3547
+ (failure_count, test_count) = doctest.testmod(
3548
+ pyspark.pandas.generic,
3549
+ globs=globs,
3550
+ optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
3551
+ )
3552
+
3553
+ shutil.rmtree(path, ignore_errors=True)
3554
+ spark.stop()
3555
+ if failure_count:
3556
+ sys.exit(-1)
3557
+
3558
+
3559
+ if __name__ == "__main__":
3560
+ _test()