snowpark-connect 0.20.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of snowpark-connect might be problematic. Click here for more details.

Files changed (879) hide show
  1. snowflake/snowpark_connect/__init__.py +23 -0
  2. snowflake/snowpark_connect/analyze_plan/__init__.py +3 -0
  3. snowflake/snowpark_connect/analyze_plan/map_tree_string.py +38 -0
  4. snowflake/snowpark_connect/column_name_handler.py +735 -0
  5. snowflake/snowpark_connect/config.py +576 -0
  6. snowflake/snowpark_connect/constants.py +47 -0
  7. snowflake/snowpark_connect/control_server.py +52 -0
  8. snowflake/snowpark_connect/dataframe_name_handler.py +54 -0
  9. snowflake/snowpark_connect/date_time_format_mapping.py +399 -0
  10. snowflake/snowpark_connect/empty_dataframe.py +18 -0
  11. snowflake/snowpark_connect/error/__init__.py +11 -0
  12. snowflake/snowpark_connect/error/error_mapping.py +6174 -0
  13. snowflake/snowpark_connect/error/error_utils.py +321 -0
  14. snowflake/snowpark_connect/error/exceptions.py +24 -0
  15. snowflake/snowpark_connect/execute_plan/__init__.py +3 -0
  16. snowflake/snowpark_connect/execute_plan/map_execution_command.py +204 -0
  17. snowflake/snowpark_connect/execute_plan/map_execution_root.py +173 -0
  18. snowflake/snowpark_connect/execute_plan/utils.py +183 -0
  19. snowflake/snowpark_connect/expression/__init__.py +3 -0
  20. snowflake/snowpark_connect/expression/literal.py +90 -0
  21. snowflake/snowpark_connect/expression/map_cast.py +343 -0
  22. snowflake/snowpark_connect/expression/map_expression.py +293 -0
  23. snowflake/snowpark_connect/expression/map_extension.py +104 -0
  24. snowflake/snowpark_connect/expression/map_sql_expression.py +633 -0
  25. snowflake/snowpark_connect/expression/map_udf.py +142 -0
  26. snowflake/snowpark_connect/expression/map_unresolved_attribute.py +241 -0
  27. snowflake/snowpark_connect/expression/map_unresolved_extract_value.py +85 -0
  28. snowflake/snowpark_connect/expression/map_unresolved_function.py +9450 -0
  29. snowflake/snowpark_connect/expression/map_unresolved_star.py +218 -0
  30. snowflake/snowpark_connect/expression/map_update_fields.py +164 -0
  31. snowflake/snowpark_connect/expression/map_window_function.py +258 -0
  32. snowflake/snowpark_connect/expression/typer.py +125 -0
  33. snowflake/snowpark_connect/includes/__init__.py +0 -0
  34. snowflake/snowpark_connect/includes/jars/antlr4-runtime-4.9.3.jar +0 -0
  35. snowflake/snowpark_connect/includes/jars/commons-cli-1.5.0.jar +0 -0
  36. snowflake/snowpark_connect/includes/jars/commons-codec-1.16.1.jar +0 -0
  37. snowflake/snowpark_connect/includes/jars/commons-collections-3.2.2.jar +0 -0
  38. snowflake/snowpark_connect/includes/jars/commons-collections4-4.4.jar +0 -0
  39. snowflake/snowpark_connect/includes/jars/commons-compiler-3.1.9.jar +0 -0
  40. snowflake/snowpark_connect/includes/jars/commons-compress-1.26.0.jar +0 -0
  41. snowflake/snowpark_connect/includes/jars/commons-crypto-1.1.0.jar +0 -0
  42. snowflake/snowpark_connect/includes/jars/commons-dbcp-1.4.jar +0 -0
  43. snowflake/snowpark_connect/includes/jars/commons-io-2.16.1.jar +0 -0
  44. snowflake/snowpark_connect/includes/jars/commons-lang-2.6.jar +0 -0
  45. snowflake/snowpark_connect/includes/jars/commons-lang3-3.12.0.jar +0 -0
  46. snowflake/snowpark_connect/includes/jars/commons-logging-1.1.3.jar +0 -0
  47. snowflake/snowpark_connect/includes/jars/commons-math3-3.6.1.jar +0 -0
  48. snowflake/snowpark_connect/includes/jars/commons-pool-1.5.4.jar +0 -0
  49. snowflake/snowpark_connect/includes/jars/commons-text-1.10.0.jar +0 -0
  50. snowflake/snowpark_connect/includes/jars/hadoop-client-api-3.3.4.jar +0 -0
  51. snowflake/snowpark_connect/includes/jars/jackson-annotations-2.15.2.jar +0 -0
  52. snowflake/snowpark_connect/includes/jars/jackson-core-2.15.2.jar +0 -0
  53. snowflake/snowpark_connect/includes/jars/jackson-core-asl-1.9.13.jar +0 -0
  54. snowflake/snowpark_connect/includes/jars/jackson-databind-2.15.2.jar +0 -0
  55. snowflake/snowpark_connect/includes/jars/jackson-dataformat-yaml-2.15.2.jar +0 -0
  56. snowflake/snowpark_connect/includes/jars/jackson-datatype-jsr310-2.15.2.jar +0 -0
  57. snowflake/snowpark_connect/includes/jars/jackson-mapper-asl-1.9.13.jar +0 -0
  58. snowflake/snowpark_connect/includes/jars/jackson-module-scala_2.12-2.15.2.jar +0 -0
  59. snowflake/snowpark_connect/includes/jars/json4s-ast_2.12-3.7.0-M11.jar +0 -0
  60. snowflake/snowpark_connect/includes/jars/json4s-core_2.12-3.7.0-M11.jar +0 -0
  61. snowflake/snowpark_connect/includes/jars/json4s-jackson_2.12-3.7.0-M11.jar +0 -0
  62. snowflake/snowpark_connect/includes/jars/json4s-scalap_2.12-3.7.0-M11.jar +0 -0
  63. snowflake/snowpark_connect/includes/jars/kryo-shaded-4.0.2.jar +0 -0
  64. snowflake/snowpark_connect/includes/jars/log4j-1.2-api-2.20.0.jar +0 -0
  65. snowflake/snowpark_connect/includes/jars/log4j-api-2.20.0.jar +0 -0
  66. snowflake/snowpark_connect/includes/jars/log4j-core-2.20.0.jar +0 -0
  67. snowflake/snowpark_connect/includes/jars/log4j-slf4j2-impl-2.20.0.jar +0 -0
  68. snowflake/snowpark_connect/includes/jars/paranamer-2.8.jar +0 -0
  69. snowflake/snowpark_connect/includes/jars/scala-collection-compat_2.12-2.7.0.jar +0 -0
  70. snowflake/snowpark_connect/includes/jars/scala-compiler-2.12.18.jar +0 -0
  71. snowflake/snowpark_connect/includes/jars/scala-library-2.12.18.jar +0 -0
  72. snowflake/snowpark_connect/includes/jars/scala-parser-combinators_2.12-2.3.0.jar +0 -0
  73. snowflake/snowpark_connect/includes/jars/scala-reflect-2.12.18.jar +0 -0
  74. snowflake/snowpark_connect/includes/jars/scala-xml_2.12-2.1.0.jar +0 -0
  75. snowflake/snowpark_connect/includes/jars/slf4j-api-2.0.7.jar +0 -0
  76. snowflake/snowpark_connect/includes/jars/spark-catalyst_2.12-3.5.6.jar +0 -0
  77. snowflake/snowpark_connect/includes/jars/spark-common-utils_2.12-3.5.6.jar +0 -0
  78. snowflake/snowpark_connect/includes/jars/spark-core_2.12-3.5.6.jar +0 -0
  79. snowflake/snowpark_connect/includes/jars/spark-graphx_2.12-3.5.6.jar +0 -0
  80. snowflake/snowpark_connect/includes/jars/spark-hive-thriftserver_2.12-3.5.6.jar +0 -0
  81. snowflake/snowpark_connect/includes/jars/spark-hive_2.12-3.5.6.jar +0 -0
  82. snowflake/snowpark_connect/includes/jars/spark-kubernetes_2.12-3.5.6.jar +0 -0
  83. snowflake/snowpark_connect/includes/jars/spark-kvstore_2.12-3.5.6.jar +0 -0
  84. snowflake/snowpark_connect/includes/jars/spark-launcher_2.12-3.5.6.jar +0 -0
  85. snowflake/snowpark_connect/includes/jars/spark-mesos_2.12-3.5.6.jar +0 -0
  86. snowflake/snowpark_connect/includes/jars/spark-mllib-local_2.12-3.5.6.jar +0 -0
  87. snowflake/snowpark_connect/includes/jars/spark-mllib_2.12-3.5.6.jar +0 -0
  88. snowflake/snowpark_connect/includes/jars/spark-network-common_2.12-3.5.6.jar +0 -0
  89. snowflake/snowpark_connect/includes/jars/spark-network-shuffle_2.12-3.5.6.jar +0 -0
  90. snowflake/snowpark_connect/includes/jars/spark-repl_2.12-3.5.6.jar +0 -0
  91. snowflake/snowpark_connect/includes/jars/spark-sketch_2.12-3.5.6.jar +0 -0
  92. snowflake/snowpark_connect/includes/jars/spark-sql-api_2.12-3.5.6.jar +0 -0
  93. snowflake/snowpark_connect/includes/jars/spark-sql_2.12-3.5.6.jar +0 -0
  94. snowflake/snowpark_connect/includes/jars/spark-streaming_2.12-3.5.6.jar +0 -0
  95. snowflake/snowpark_connect/includes/jars/spark-tags_2.12-3.5.6.jar +0 -0
  96. snowflake/snowpark_connect/includes/jars/spark-unsafe_2.12-3.5.6.jar +0 -0
  97. snowflake/snowpark_connect/includes/jars/spark-yarn_2.12-3.5.6.jar +0 -0
  98. snowflake/snowpark_connect/includes/python/__init__.py +21 -0
  99. snowflake/snowpark_connect/includes/python/pyspark/__init__.py +173 -0
  100. snowflake/snowpark_connect/includes/python/pyspark/_globals.py +71 -0
  101. snowflake/snowpark_connect/includes/python/pyspark/_typing.pyi +43 -0
  102. snowflake/snowpark_connect/includes/python/pyspark/accumulators.py +341 -0
  103. snowflake/snowpark_connect/includes/python/pyspark/broadcast.py +383 -0
  104. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/__init__.py +8 -0
  105. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/cloudpickle.py +948 -0
  106. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/cloudpickle_fast.py +844 -0
  107. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/compat.py +18 -0
  108. snowflake/snowpark_connect/includes/python/pyspark/conf.py +276 -0
  109. snowflake/snowpark_connect/includes/python/pyspark/context.py +2601 -0
  110. snowflake/snowpark_connect/includes/python/pyspark/daemon.py +218 -0
  111. snowflake/snowpark_connect/includes/python/pyspark/errors/__init__.py +70 -0
  112. snowflake/snowpark_connect/includes/python/pyspark/errors/error_classes.py +889 -0
  113. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/__init__.py +16 -0
  114. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/base.py +228 -0
  115. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/captured.py +307 -0
  116. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/connect.py +190 -0
  117. snowflake/snowpark_connect/includes/python/pyspark/errors/tests/__init__.py +16 -0
  118. snowflake/snowpark_connect/includes/python/pyspark/errors/tests/test_errors.py +60 -0
  119. snowflake/snowpark_connect/includes/python/pyspark/errors/utils.py +116 -0
  120. snowflake/snowpark_connect/includes/python/pyspark/files.py +165 -0
  121. snowflake/snowpark_connect/includes/python/pyspark/find_spark_home.py +95 -0
  122. snowflake/snowpark_connect/includes/python/pyspark/install.py +203 -0
  123. snowflake/snowpark_connect/includes/python/pyspark/instrumentation_utils.py +190 -0
  124. snowflake/snowpark_connect/includes/python/pyspark/java_gateway.py +248 -0
  125. snowflake/snowpark_connect/includes/python/pyspark/join.py +118 -0
  126. snowflake/snowpark_connect/includes/python/pyspark/ml/__init__.py +71 -0
  127. snowflake/snowpark_connect/includes/python/pyspark/ml/_typing.pyi +84 -0
  128. snowflake/snowpark_connect/includes/python/pyspark/ml/base.py +414 -0
  129. snowflake/snowpark_connect/includes/python/pyspark/ml/classification.py +4332 -0
  130. snowflake/snowpark_connect/includes/python/pyspark/ml/clustering.py +2188 -0
  131. snowflake/snowpark_connect/includes/python/pyspark/ml/common.py +146 -0
  132. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/__init__.py +44 -0
  133. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/base.py +346 -0
  134. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/classification.py +382 -0
  135. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/evaluation.py +291 -0
  136. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/feature.py +258 -0
  137. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/functions.py +77 -0
  138. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/io_utils.py +335 -0
  139. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/pipeline.py +262 -0
  140. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/summarizer.py +120 -0
  141. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/tuning.py +579 -0
  142. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/util.py +173 -0
  143. snowflake/snowpark_connect/includes/python/pyspark/ml/deepspeed/__init__.py +16 -0
  144. snowflake/snowpark_connect/includes/python/pyspark/ml/deepspeed/deepspeed_distributor.py +165 -0
  145. snowflake/snowpark_connect/includes/python/pyspark/ml/deepspeed/tests/test_deepspeed_distributor.py +306 -0
  146. snowflake/snowpark_connect/includes/python/pyspark/ml/dl_util.py +150 -0
  147. snowflake/snowpark_connect/includes/python/pyspark/ml/evaluation.py +1166 -0
  148. snowflake/snowpark_connect/includes/python/pyspark/ml/feature.py +7474 -0
  149. snowflake/snowpark_connect/includes/python/pyspark/ml/fpm.py +543 -0
  150. snowflake/snowpark_connect/includes/python/pyspark/ml/functions.py +842 -0
  151. snowflake/snowpark_connect/includes/python/pyspark/ml/image.py +271 -0
  152. snowflake/snowpark_connect/includes/python/pyspark/ml/linalg/__init__.py +1382 -0
  153. snowflake/snowpark_connect/includes/python/pyspark/ml/model_cache.py +55 -0
  154. snowflake/snowpark_connect/includes/python/pyspark/ml/param/__init__.py +602 -0
  155. snowflake/snowpark_connect/includes/python/pyspark/ml/param/_shared_params_code_gen.py +368 -0
  156. snowflake/snowpark_connect/includes/python/pyspark/ml/param/shared.py +878 -0
  157. snowflake/snowpark_connect/includes/python/pyspark/ml/pipeline.py +451 -0
  158. snowflake/snowpark_connect/includes/python/pyspark/ml/recommendation.py +748 -0
  159. snowflake/snowpark_connect/includes/python/pyspark/ml/regression.py +3335 -0
  160. snowflake/snowpark_connect/includes/python/pyspark/ml/stat.py +523 -0
  161. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/__init__.py +16 -0
  162. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_classification.py +53 -0
  163. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_evaluation.py +50 -0
  164. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_feature.py +43 -0
  165. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_function.py +114 -0
  166. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_pipeline.py +47 -0
  167. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_summarizer.py +43 -0
  168. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_tuning.py +46 -0
  169. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_classification.py +238 -0
  170. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_evaluation.py +194 -0
  171. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_feature.py +156 -0
  172. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_pipeline.py +184 -0
  173. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_summarizer.py +78 -0
  174. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_tuning.py +292 -0
  175. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_parity_torch_data_loader.py +50 -0
  176. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_parity_torch_distributor.py +152 -0
  177. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_algorithms.py +456 -0
  178. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_base.py +96 -0
  179. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_dl_util.py +186 -0
  180. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_evaluation.py +77 -0
  181. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_feature.py +401 -0
  182. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_functions.py +528 -0
  183. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_image.py +82 -0
  184. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_linalg.py +409 -0
  185. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_model_cache.py +55 -0
  186. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_param.py +441 -0
  187. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_persistence.py +546 -0
  188. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_pipeline.py +71 -0
  189. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_stat.py +52 -0
  190. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_training_summary.py +494 -0
  191. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_util.py +85 -0
  192. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_wrapper.py +138 -0
  193. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/__init__.py +16 -0
  194. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_basic.py +151 -0
  195. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_nested.py +97 -0
  196. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_pipeline.py +143 -0
  197. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tuning.py +551 -0
  198. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_basic.py +137 -0
  199. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_nested.py +96 -0
  200. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_pipeline.py +142 -0
  201. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/__init__.py +16 -0
  202. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/data.py +100 -0
  203. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/distributor.py +1133 -0
  204. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/log_communication.py +198 -0
  205. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/__init__.py +16 -0
  206. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_data_loader.py +137 -0
  207. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_distributor.py +561 -0
  208. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_log_communication.py +172 -0
  209. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/torch_run_process_wrapper.py +83 -0
  210. snowflake/snowpark_connect/includes/python/pyspark/ml/tree.py +434 -0
  211. snowflake/snowpark_connect/includes/python/pyspark/ml/tuning.py +1741 -0
  212. snowflake/snowpark_connect/includes/python/pyspark/ml/util.py +749 -0
  213. snowflake/snowpark_connect/includes/python/pyspark/ml/wrapper.py +465 -0
  214. snowflake/snowpark_connect/includes/python/pyspark/mllib/__init__.py +44 -0
  215. snowflake/snowpark_connect/includes/python/pyspark/mllib/_typing.pyi +33 -0
  216. snowflake/snowpark_connect/includes/python/pyspark/mllib/classification.py +989 -0
  217. snowflake/snowpark_connect/includes/python/pyspark/mllib/clustering.py +1318 -0
  218. snowflake/snowpark_connect/includes/python/pyspark/mllib/common.py +174 -0
  219. snowflake/snowpark_connect/includes/python/pyspark/mllib/evaluation.py +691 -0
  220. snowflake/snowpark_connect/includes/python/pyspark/mllib/feature.py +1085 -0
  221. snowflake/snowpark_connect/includes/python/pyspark/mllib/fpm.py +233 -0
  222. snowflake/snowpark_connect/includes/python/pyspark/mllib/linalg/__init__.py +1653 -0
  223. snowflake/snowpark_connect/includes/python/pyspark/mllib/linalg/distributed.py +1662 -0
  224. snowflake/snowpark_connect/includes/python/pyspark/mllib/random.py +698 -0
  225. snowflake/snowpark_connect/includes/python/pyspark/mllib/recommendation.py +389 -0
  226. snowflake/snowpark_connect/includes/python/pyspark/mllib/regression.py +1067 -0
  227. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/KernelDensity.py +59 -0
  228. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/__init__.py +34 -0
  229. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/_statistics.py +409 -0
  230. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/distribution.py +39 -0
  231. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/test.py +86 -0
  232. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/__init__.py +16 -0
  233. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_algorithms.py +353 -0
  234. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_feature.py +192 -0
  235. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_linalg.py +680 -0
  236. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_stat.py +206 -0
  237. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_streaming_algorithms.py +471 -0
  238. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_util.py +108 -0
  239. snowflake/snowpark_connect/includes/python/pyspark/mllib/tree.py +888 -0
  240. snowflake/snowpark_connect/includes/python/pyspark/mllib/util.py +659 -0
  241. snowflake/snowpark_connect/includes/python/pyspark/pandas/__init__.py +165 -0
  242. snowflake/snowpark_connect/includes/python/pyspark/pandas/_typing.py +52 -0
  243. snowflake/snowpark_connect/includes/python/pyspark/pandas/accessors.py +989 -0
  244. snowflake/snowpark_connect/includes/python/pyspark/pandas/base.py +1804 -0
  245. snowflake/snowpark_connect/includes/python/pyspark/pandas/categorical.py +822 -0
  246. snowflake/snowpark_connect/includes/python/pyspark/pandas/config.py +539 -0
  247. snowflake/snowpark_connect/includes/python/pyspark/pandas/correlation.py +262 -0
  248. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/__init__.py +16 -0
  249. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/base.py +519 -0
  250. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/binary_ops.py +98 -0
  251. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/boolean_ops.py +426 -0
  252. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/categorical_ops.py +141 -0
  253. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/complex_ops.py +145 -0
  254. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/date_ops.py +127 -0
  255. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/datetime_ops.py +171 -0
  256. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/null_ops.py +83 -0
  257. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/num_ops.py +588 -0
  258. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/string_ops.py +154 -0
  259. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/timedelta_ops.py +101 -0
  260. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/udt_ops.py +29 -0
  261. snowflake/snowpark_connect/includes/python/pyspark/pandas/datetimes.py +891 -0
  262. snowflake/snowpark_connect/includes/python/pyspark/pandas/exceptions.py +150 -0
  263. snowflake/snowpark_connect/includes/python/pyspark/pandas/extensions.py +388 -0
  264. snowflake/snowpark_connect/includes/python/pyspark/pandas/frame.py +13738 -0
  265. snowflake/snowpark_connect/includes/python/pyspark/pandas/generic.py +3560 -0
  266. snowflake/snowpark_connect/includes/python/pyspark/pandas/groupby.py +4448 -0
  267. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/__init__.py +21 -0
  268. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/base.py +2783 -0
  269. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/category.py +773 -0
  270. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/datetimes.py +843 -0
  271. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/multi.py +1323 -0
  272. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/numeric.py +210 -0
  273. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/timedelta.py +197 -0
  274. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexing.py +1862 -0
  275. snowflake/snowpark_connect/includes/python/pyspark/pandas/internal.py +1680 -0
  276. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/__init__.py +48 -0
  277. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/common.py +76 -0
  278. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/frame.py +63 -0
  279. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/general_functions.py +43 -0
  280. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/groupby.py +93 -0
  281. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/indexes.py +184 -0
  282. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/resample.py +101 -0
  283. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/scalars.py +29 -0
  284. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/series.py +69 -0
  285. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/window.py +168 -0
  286. snowflake/snowpark_connect/includes/python/pyspark/pandas/mlflow.py +238 -0
  287. snowflake/snowpark_connect/includes/python/pyspark/pandas/namespace.py +3807 -0
  288. snowflake/snowpark_connect/includes/python/pyspark/pandas/numpy_compat.py +260 -0
  289. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/__init__.py +17 -0
  290. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/core.py +1213 -0
  291. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/matplotlib.py +928 -0
  292. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/plotly.py +261 -0
  293. snowflake/snowpark_connect/includes/python/pyspark/pandas/resample.py +816 -0
  294. snowflake/snowpark_connect/includes/python/pyspark/pandas/series.py +7440 -0
  295. snowflake/snowpark_connect/includes/python/pyspark/pandas/sql_formatter.py +308 -0
  296. snowflake/snowpark_connect/includes/python/pyspark/pandas/sql_processor.py +394 -0
  297. snowflake/snowpark_connect/includes/python/pyspark/pandas/strings.py +2371 -0
  298. snowflake/snowpark_connect/includes/python/pyspark/pandas/supported_api_gen.py +378 -0
  299. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/__init__.py +16 -0
  300. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/__init__.py +16 -0
  301. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_any_all.py +177 -0
  302. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_apply_func.py +575 -0
  303. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_binary_ops.py +235 -0
  304. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_combine.py +653 -0
  305. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_compute.py +463 -0
  306. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_corrwith.py +86 -0
  307. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_cov.py +151 -0
  308. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_cumulative.py +139 -0
  309. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_describe.py +458 -0
  310. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_eval.py +86 -0
  311. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_melt.py +202 -0
  312. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_missing_data.py +520 -0
  313. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_pivot.py +361 -0
  314. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/__init__.py +16 -0
  315. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/__init__.py +16 -0
  316. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_any_all.py +40 -0
  317. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_apply_func.py +42 -0
  318. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_binary_ops.py +40 -0
  319. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_combine.py +37 -0
  320. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_compute.py +60 -0
  321. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_corrwith.py +40 -0
  322. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_cov.py +40 -0
  323. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_cumulative.py +90 -0
  324. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_describe.py +40 -0
  325. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_eval.py +40 -0
  326. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_melt.py +40 -0
  327. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_missing_data.py +42 -0
  328. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_pivot.py +37 -0
  329. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/__init__.py +16 -0
  330. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_base.py +36 -0
  331. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_binary_ops.py +42 -0
  332. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_boolean_ops.py +47 -0
  333. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_categorical_ops.py +55 -0
  334. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_complex_ops.py +40 -0
  335. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_date_ops.py +47 -0
  336. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_datetime_ops.py +47 -0
  337. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_null_ops.py +42 -0
  338. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_arithmetic.py +43 -0
  339. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_ops.py +47 -0
  340. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_reverse.py +43 -0
  341. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_string_ops.py +47 -0
  342. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_timedelta_ops.py +47 -0
  343. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_udt_ops.py +40 -0
  344. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/testing_utils.py +226 -0
  345. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/__init__.py +16 -0
  346. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_align.py +39 -0
  347. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_basic_slow.py +55 -0
  348. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_cov_corrwith.py +39 -0
  349. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_dot_frame.py +39 -0
  350. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_dot_series.py +39 -0
  351. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_index.py +39 -0
  352. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_series.py +39 -0
  353. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_setitem_frame.py +43 -0
  354. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_setitem_series.py +43 -0
  355. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/__init__.py +16 -0
  356. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_attrs.py +40 -0
  357. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_constructor.py +39 -0
  358. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_conversion.py +42 -0
  359. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_reindexing.py +42 -0
  360. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_reshaping.py +37 -0
  361. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_spark.py +40 -0
  362. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_take.py +42 -0
  363. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_time_series.py +48 -0
  364. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_truncate.py +40 -0
  365. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/__init__.py +16 -0
  366. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_aggregate.py +40 -0
  367. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_apply_func.py +41 -0
  368. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_cumulative.py +67 -0
  369. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_describe.py +40 -0
  370. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_groupby.py +55 -0
  371. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_head_tail.py +40 -0
  372. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_index.py +38 -0
  373. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_missing_data.py +55 -0
  374. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py +39 -0
  375. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_stat.py +38 -0
  376. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/__init__.py +16 -0
  377. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_align.py +40 -0
  378. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_base.py +50 -0
  379. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_category.py +73 -0
  380. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_datetime.py +39 -0
  381. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_indexing.py +40 -0
  382. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_reindex.py +40 -0
  383. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_rename.py +40 -0
  384. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_reset_index.py +48 -0
  385. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_timedelta.py +39 -0
  386. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/io/__init__.py +16 -0
  387. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/io/test_parity_io.py +40 -0
  388. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/__init__.py +16 -0
  389. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot.py +45 -0
  390. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot_matplotlib.py +45 -0
  391. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot_plotly.py +49 -0
  392. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot.py +37 -0
  393. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot_matplotlib.py +53 -0
  394. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot_plotly.py +45 -0
  395. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/__init__.py +16 -0
  396. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_all_any.py +38 -0
  397. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_arg_ops.py +37 -0
  398. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_as_of.py +37 -0
  399. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_as_type.py +38 -0
  400. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_compute.py +37 -0
  401. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_conversion.py +40 -0
  402. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_cumulative.py +40 -0
  403. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_index.py +38 -0
  404. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_missing_data.py +40 -0
  405. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_series.py +37 -0
  406. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_sort.py +38 -0
  407. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_stat.py +38 -0
  408. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_categorical.py +66 -0
  409. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_config.py +37 -0
  410. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_csv.py +37 -0
  411. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_dataframe_conversion.py +42 -0
  412. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_dataframe_spark_io.py +39 -0
  413. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_default_index.py +49 -0
  414. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ewm.py +37 -0
  415. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_expanding.py +39 -0
  416. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_extension.py +49 -0
  417. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_frame_spark.py +53 -0
  418. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_generic_functions.py +43 -0
  419. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_indexing.py +49 -0
  420. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_indexops_spark.py +39 -0
  421. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_internal.py +41 -0
  422. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_namespace.py +39 -0
  423. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_numpy_compat.py +60 -0
  424. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames.py +48 -0
  425. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby.py +39 -0
  426. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_expanding.py +44 -0
  427. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_rolling.py +84 -0
  428. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_repr.py +37 -0
  429. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_resample.py +45 -0
  430. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_reshape.py +39 -0
  431. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_rolling.py +39 -0
  432. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_scalars.py +37 -0
  433. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_conversion.py +39 -0
  434. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_datetime.py +39 -0
  435. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_string.py +39 -0
  436. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_spark_functions.py +39 -0
  437. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_sql.py +43 -0
  438. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_stats.py +37 -0
  439. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_typedef.py +36 -0
  440. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_utils.py +37 -0
  441. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_window.py +39 -0
  442. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/__init__.py +16 -0
  443. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_base.py +107 -0
  444. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_binary_ops.py +224 -0
  445. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py +825 -0
  446. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_categorical_ops.py +562 -0
  447. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_complex_ops.py +368 -0
  448. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_date_ops.py +257 -0
  449. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_datetime_ops.py +260 -0
  450. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_null_ops.py +178 -0
  451. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_arithmetic.py +184 -0
  452. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py +497 -0
  453. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_reverse.py +140 -0
  454. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_string_ops.py +354 -0
  455. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_timedelta_ops.py +219 -0
  456. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_udt_ops.py +192 -0
  457. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/testing_utils.py +228 -0
  458. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/__init__.py +16 -0
  459. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_align.py +118 -0
  460. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_basic_slow.py +198 -0
  461. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_cov_corrwith.py +181 -0
  462. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_dot_frame.py +103 -0
  463. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_dot_series.py +141 -0
  464. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_index.py +109 -0
  465. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_series.py +136 -0
  466. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_setitem_frame.py +125 -0
  467. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_setitem_series.py +217 -0
  468. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/__init__.py +16 -0
  469. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_attrs.py +384 -0
  470. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_constructor.py +598 -0
  471. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_conversion.py +73 -0
  472. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_reindexing.py +869 -0
  473. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_reshaping.py +487 -0
  474. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_spark.py +309 -0
  475. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_take.py +156 -0
  476. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_time_series.py +149 -0
  477. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_truncate.py +163 -0
  478. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/__init__.py +16 -0
  479. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_aggregate.py +311 -0
  480. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_apply_func.py +524 -0
  481. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_cumulative.py +419 -0
  482. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_describe.py +144 -0
  483. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_groupby.py +979 -0
  484. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_head_tail.py +234 -0
  485. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_index.py +206 -0
  486. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_missing_data.py +421 -0
  487. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_split_apply.py +187 -0
  488. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_stat.py +397 -0
  489. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/__init__.py +16 -0
  490. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_align.py +100 -0
  491. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_base.py +2743 -0
  492. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_category.py +484 -0
  493. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_datetime.py +276 -0
  494. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_indexing.py +432 -0
  495. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_reindex.py +310 -0
  496. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_rename.py +257 -0
  497. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_reset_index.py +160 -0
  498. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_timedelta.py +128 -0
  499. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/io/__init__.py +16 -0
  500. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/io/test_io.py +137 -0
  501. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/__init__.py +16 -0
  502. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot.py +170 -0
  503. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot_matplotlib.py +547 -0
  504. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot_plotly.py +285 -0
  505. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot.py +106 -0
  506. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot_matplotlib.py +409 -0
  507. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot_plotly.py +247 -0
  508. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/__init__.py +16 -0
  509. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_all_any.py +105 -0
  510. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_arg_ops.py +197 -0
  511. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_as_of.py +137 -0
  512. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_as_type.py +227 -0
  513. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_compute.py +634 -0
  514. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_conversion.py +88 -0
  515. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_cumulative.py +139 -0
  516. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_index.py +475 -0
  517. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_missing_data.py +265 -0
  518. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_series.py +818 -0
  519. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_sort.py +162 -0
  520. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_stat.py +780 -0
  521. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_categorical.py +741 -0
  522. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_config.py +160 -0
  523. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_csv.py +453 -0
  524. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_dataframe_conversion.py +281 -0
  525. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_dataframe_spark_io.py +487 -0
  526. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_default_index.py +109 -0
  527. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ewm.py +434 -0
  528. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_expanding.py +253 -0
  529. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_extension.py +152 -0
  530. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_frame_spark.py +162 -0
  531. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_generic_functions.py +234 -0
  532. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_indexing.py +1339 -0
  533. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_indexops_spark.py +82 -0
  534. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_internal.py +124 -0
  535. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_namespace.py +638 -0
  536. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_numpy_compat.py +200 -0
  537. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames.py +1355 -0
  538. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py +655 -0
  539. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_expanding.py +113 -0
  540. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_rolling.py +118 -0
  541. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_repr.py +192 -0
  542. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_resample.py +346 -0
  543. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_reshape.py +495 -0
  544. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_rolling.py +263 -0
  545. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_scalars.py +59 -0
  546. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_conversion.py +85 -0
  547. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_datetime.py +364 -0
  548. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_string.py +362 -0
  549. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_spark_functions.py +46 -0
  550. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_sql.py +123 -0
  551. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_stats.py +581 -0
  552. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_typedef.py +447 -0
  553. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_utils.py +301 -0
  554. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_window.py +465 -0
  555. snowflake/snowpark_connect/includes/python/pyspark/pandas/typedef/__init__.py +18 -0
  556. snowflake/snowpark_connect/includes/python/pyspark/pandas/typedef/typehints.py +874 -0
  557. snowflake/snowpark_connect/includes/python/pyspark/pandas/usage_logging/__init__.py +143 -0
  558. snowflake/snowpark_connect/includes/python/pyspark/pandas/usage_logging/usage_logger.py +132 -0
  559. snowflake/snowpark_connect/includes/python/pyspark/pandas/utils.py +1063 -0
  560. snowflake/snowpark_connect/includes/python/pyspark/pandas/window.py +2702 -0
  561. snowflake/snowpark_connect/includes/python/pyspark/profiler.py +489 -0
  562. snowflake/snowpark_connect/includes/python/pyspark/py.typed +1 -0
  563. snowflake/snowpark_connect/includes/python/pyspark/python/pyspark/shell.py +123 -0
  564. snowflake/snowpark_connect/includes/python/pyspark/rdd.py +5518 -0
  565. snowflake/snowpark_connect/includes/python/pyspark/rddsampler.py +115 -0
  566. snowflake/snowpark_connect/includes/python/pyspark/resource/__init__.py +38 -0
  567. snowflake/snowpark_connect/includes/python/pyspark/resource/information.py +69 -0
  568. snowflake/snowpark_connect/includes/python/pyspark/resource/profile.py +317 -0
  569. snowflake/snowpark_connect/includes/python/pyspark/resource/requests.py +539 -0
  570. snowflake/snowpark_connect/includes/python/pyspark/resource/tests/__init__.py +16 -0
  571. snowflake/snowpark_connect/includes/python/pyspark/resource/tests/test_resources.py +83 -0
  572. snowflake/snowpark_connect/includes/python/pyspark/resultiterable.py +45 -0
  573. snowflake/snowpark_connect/includes/python/pyspark/serializers.py +681 -0
  574. snowflake/snowpark_connect/includes/python/pyspark/shell.py +123 -0
  575. snowflake/snowpark_connect/includes/python/pyspark/shuffle.py +854 -0
  576. snowflake/snowpark_connect/includes/python/pyspark/sql/__init__.py +75 -0
  577. snowflake/snowpark_connect/includes/python/pyspark/sql/_typing.pyi +80 -0
  578. snowflake/snowpark_connect/includes/python/pyspark/sql/avro/__init__.py +18 -0
  579. snowflake/snowpark_connect/includes/python/pyspark/sql/avro/functions.py +188 -0
  580. snowflake/snowpark_connect/includes/python/pyspark/sql/catalog.py +1270 -0
  581. snowflake/snowpark_connect/includes/python/pyspark/sql/column.py +1431 -0
  582. snowflake/snowpark_connect/includes/python/pyspark/sql/conf.py +99 -0
  583. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/__init__.py +18 -0
  584. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/_typing.py +90 -0
  585. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/avro/__init__.py +18 -0
  586. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/avro/functions.py +107 -0
  587. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/catalog.py +356 -0
  588. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/__init__.py +22 -0
  589. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/artifact.py +412 -0
  590. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/core.py +1689 -0
  591. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/reattach.py +340 -0
  592. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/column.py +514 -0
  593. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/conf.py +128 -0
  594. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/conversion.py +490 -0
  595. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/dataframe.py +2172 -0
  596. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/expressions.py +1056 -0
  597. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/functions.py +3937 -0
  598. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/group.py +418 -0
  599. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/plan.py +2289 -0
  600. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/__init__.py +25 -0
  601. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/base_pb2.py +203 -0
  602. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/base_pb2.pyi +2718 -0
  603. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/base_pb2_grpc.py +423 -0
  604. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/catalog_pb2.py +109 -0
  605. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/catalog_pb2.pyi +1130 -0
  606. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/commands_pb2.py +141 -0
  607. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/commands_pb2.pyi +1766 -0
  608. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/common_pb2.py +47 -0
  609. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/common_pb2.pyi +123 -0
  610. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/example_plugins_pb2.py +53 -0
  611. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/example_plugins_pb2.pyi +112 -0
  612. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/expressions_pb2.py +107 -0
  613. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/expressions_pb2.pyi +1507 -0
  614. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/relations_pb2.py +195 -0
  615. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/relations_pb2.pyi +3613 -0
  616. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/types_pb2.py +95 -0
  617. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/types_pb2.pyi +980 -0
  618. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/protobuf/__init__.py +18 -0
  619. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/protobuf/functions.py +166 -0
  620. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/readwriter.py +861 -0
  621. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/session.py +952 -0
  622. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/__init__.py +22 -0
  623. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/query.py +295 -0
  624. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/readwriter.py +618 -0
  625. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/__init__.py +18 -0
  626. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/foreach_batch_worker.py +87 -0
  627. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/listener_worker.py +100 -0
  628. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/types.py +301 -0
  629. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/udf.py +296 -0
  630. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/udtf.py +200 -0
  631. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/utils.py +58 -0
  632. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/window.py +266 -0
  633. snowflake/snowpark_connect/includes/python/pyspark/sql/context.py +818 -0
  634. snowflake/snowpark_connect/includes/python/pyspark/sql/dataframe.py +5973 -0
  635. snowflake/snowpark_connect/includes/python/pyspark/sql/functions.py +15889 -0
  636. snowflake/snowpark_connect/includes/python/pyspark/sql/group.py +547 -0
  637. snowflake/snowpark_connect/includes/python/pyspark/sql/observation.py +152 -0
  638. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/__init__.py +21 -0
  639. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/__init__.pyi +344 -0
  640. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/protocols/__init__.pyi +17 -0
  641. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/protocols/frame.pyi +20 -0
  642. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/protocols/series.pyi +20 -0
  643. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/conversion.py +671 -0
  644. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/functions.py +480 -0
  645. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/functions.pyi +132 -0
  646. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/group_ops.py +523 -0
  647. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/map_ops.py +216 -0
  648. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/serializers.py +1019 -0
  649. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/typehints.py +172 -0
  650. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/types.py +972 -0
  651. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/utils.py +86 -0
  652. snowflake/snowpark_connect/includes/python/pyspark/sql/protobuf/__init__.py +18 -0
  653. snowflake/snowpark_connect/includes/python/pyspark/sql/protobuf/functions.py +334 -0
  654. snowflake/snowpark_connect/includes/python/pyspark/sql/readwriter.py +2159 -0
  655. snowflake/snowpark_connect/includes/python/pyspark/sql/session.py +2088 -0
  656. snowflake/snowpark_connect/includes/python/pyspark/sql/sql_formatter.py +84 -0
  657. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/__init__.py +21 -0
  658. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/listener.py +1050 -0
  659. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/query.py +746 -0
  660. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/readwriter.py +1652 -0
  661. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/state.py +288 -0
  662. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/__init__.py +16 -0
  663. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/__init__.py +16 -0
  664. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/__init__.py +16 -0
  665. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/test_artifact.py +420 -0
  666. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/test_client.py +358 -0
  667. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/__init__.py +16 -0
  668. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_foreach.py +36 -0
  669. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_foreach_batch.py +44 -0
  670. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_listener.py +116 -0
  671. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_streaming.py +35 -0
  672. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_basic.py +3612 -0
  673. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_column.py +1042 -0
  674. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_function.py +2381 -0
  675. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_plan.py +1060 -0
  676. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow.py +163 -0
  677. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow_map.py +38 -0
  678. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow_python_udf.py +48 -0
  679. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_catalog.py +36 -0
  680. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_column.py +55 -0
  681. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_conf.py +36 -0
  682. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_dataframe.py +96 -0
  683. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_datasources.py +44 -0
  684. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_errors.py +36 -0
  685. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_functions.py +59 -0
  686. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_group.py +36 -0
  687. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_cogrouped_map.py +59 -0
  688. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_grouped_map.py +74 -0
  689. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_grouped_map_with_state.py +62 -0
  690. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_map.py +58 -0
  691. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf.py +70 -0
  692. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_grouped_agg.py +50 -0
  693. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_scalar.py +68 -0
  694. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_window.py +40 -0
  695. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_readwriter.py +46 -0
  696. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_serde.py +44 -0
  697. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_types.py +100 -0
  698. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_udf.py +100 -0
  699. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_udtf.py +163 -0
  700. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_session.py +181 -0
  701. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_utils.py +42 -0
  702. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/__init__.py +16 -0
  703. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_cogrouped_map.py +623 -0
  704. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_grouped_map.py +869 -0
  705. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_grouped_map_with_state.py +342 -0
  706. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_map.py +436 -0
  707. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf.py +363 -0
  708. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_grouped_agg.py +592 -0
  709. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_scalar.py +1503 -0
  710. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_typehints.py +392 -0
  711. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_typehints_with_future_annotations.py +375 -0
  712. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_window.py +411 -0
  713. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/__init__.py +16 -0
  714. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming.py +401 -0
  715. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_foreach.py +295 -0
  716. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_foreach_batch.py +106 -0
  717. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_listener.py +558 -0
  718. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow.py +1346 -0
  719. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow_map.py +182 -0
  720. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow_python_udf.py +202 -0
  721. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_catalog.py +503 -0
  722. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_column.py +225 -0
  723. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_conf.py +83 -0
  724. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_context.py +201 -0
  725. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_dataframe.py +1931 -0
  726. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_datasources.py +256 -0
  727. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_errors.py +69 -0
  728. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_functions.py +1349 -0
  729. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_group.py +53 -0
  730. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_pandas_sqlmetrics.py +68 -0
  731. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_readwriter.py +283 -0
  732. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_serde.py +155 -0
  733. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_session.py +412 -0
  734. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_types.py +1581 -0
  735. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udf.py +961 -0
  736. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udf_profiler.py +165 -0
  737. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udtf.py +1456 -0
  738. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_utils.py +1686 -0
  739. snowflake/snowpark_connect/includes/python/pyspark/sql/types.py +2558 -0
  740. snowflake/snowpark_connect/includes/python/pyspark/sql/udf.py +714 -0
  741. snowflake/snowpark_connect/includes/python/pyspark/sql/udtf.py +325 -0
  742. snowflake/snowpark_connect/includes/python/pyspark/sql/utils.py +339 -0
  743. snowflake/snowpark_connect/includes/python/pyspark/sql/window.py +492 -0
  744. snowflake/snowpark_connect/includes/python/pyspark/statcounter.py +165 -0
  745. snowflake/snowpark_connect/includes/python/pyspark/status.py +112 -0
  746. snowflake/snowpark_connect/includes/python/pyspark/storagelevel.py +97 -0
  747. snowflake/snowpark_connect/includes/python/pyspark/streaming/__init__.py +22 -0
  748. snowflake/snowpark_connect/includes/python/pyspark/streaming/context.py +471 -0
  749. snowflake/snowpark_connect/includes/python/pyspark/streaming/dstream.py +933 -0
  750. snowflake/snowpark_connect/includes/python/pyspark/streaming/kinesis.py +205 -0
  751. snowflake/snowpark_connect/includes/python/pyspark/streaming/listener.py +83 -0
  752. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/__init__.py +16 -0
  753. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_context.py +184 -0
  754. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_dstream.py +706 -0
  755. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_kinesis.py +118 -0
  756. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_listener.py +160 -0
  757. snowflake/snowpark_connect/includes/python/pyspark/streaming/util.py +168 -0
  758. snowflake/snowpark_connect/includes/python/pyspark/taskcontext.py +502 -0
  759. snowflake/snowpark_connect/includes/python/pyspark/testing/__init__.py +21 -0
  760. snowflake/snowpark_connect/includes/python/pyspark/testing/connectutils.py +199 -0
  761. snowflake/snowpark_connect/includes/python/pyspark/testing/mllibutils.py +30 -0
  762. snowflake/snowpark_connect/includes/python/pyspark/testing/mlutils.py +275 -0
  763. snowflake/snowpark_connect/includes/python/pyspark/testing/objects.py +121 -0
  764. snowflake/snowpark_connect/includes/python/pyspark/testing/pandasutils.py +714 -0
  765. snowflake/snowpark_connect/includes/python/pyspark/testing/sqlutils.py +168 -0
  766. snowflake/snowpark_connect/includes/python/pyspark/testing/streamingutils.py +178 -0
  767. snowflake/snowpark_connect/includes/python/pyspark/testing/utils.py +636 -0
  768. snowflake/snowpark_connect/includes/python/pyspark/tests/__init__.py +16 -0
  769. snowflake/snowpark_connect/includes/python/pyspark/tests/test_appsubmit.py +306 -0
  770. snowflake/snowpark_connect/includes/python/pyspark/tests/test_broadcast.py +196 -0
  771. snowflake/snowpark_connect/includes/python/pyspark/tests/test_conf.py +44 -0
  772. snowflake/snowpark_connect/includes/python/pyspark/tests/test_context.py +346 -0
  773. snowflake/snowpark_connect/includes/python/pyspark/tests/test_daemon.py +89 -0
  774. snowflake/snowpark_connect/includes/python/pyspark/tests/test_install_spark.py +124 -0
  775. snowflake/snowpark_connect/includes/python/pyspark/tests/test_join.py +69 -0
  776. snowflake/snowpark_connect/includes/python/pyspark/tests/test_memory_profiler.py +167 -0
  777. snowflake/snowpark_connect/includes/python/pyspark/tests/test_pin_thread.py +194 -0
  778. snowflake/snowpark_connect/includes/python/pyspark/tests/test_profiler.py +168 -0
  779. snowflake/snowpark_connect/includes/python/pyspark/tests/test_rdd.py +939 -0
  780. snowflake/snowpark_connect/includes/python/pyspark/tests/test_rddbarrier.py +52 -0
  781. snowflake/snowpark_connect/includes/python/pyspark/tests/test_rddsampler.py +66 -0
  782. snowflake/snowpark_connect/includes/python/pyspark/tests/test_readwrite.py +368 -0
  783. snowflake/snowpark_connect/includes/python/pyspark/tests/test_serializers.py +257 -0
  784. snowflake/snowpark_connect/includes/python/pyspark/tests/test_shuffle.py +267 -0
  785. snowflake/snowpark_connect/includes/python/pyspark/tests/test_stage_sched.py +153 -0
  786. snowflake/snowpark_connect/includes/python/pyspark/tests/test_statcounter.py +130 -0
  787. snowflake/snowpark_connect/includes/python/pyspark/tests/test_taskcontext.py +350 -0
  788. snowflake/snowpark_connect/includes/python/pyspark/tests/test_util.py +97 -0
  789. snowflake/snowpark_connect/includes/python/pyspark/tests/test_worker.py +271 -0
  790. snowflake/snowpark_connect/includes/python/pyspark/traceback_utils.py +81 -0
  791. snowflake/snowpark_connect/includes/python/pyspark/util.py +416 -0
  792. snowflake/snowpark_connect/includes/python/pyspark/version.py +19 -0
  793. snowflake/snowpark_connect/includes/python/pyspark/worker.py +1307 -0
  794. snowflake/snowpark_connect/includes/python/pyspark/worker_util.py +46 -0
  795. snowflake/snowpark_connect/proto/__init__.py +10 -0
  796. snowflake/snowpark_connect/proto/control_pb2.py +35 -0
  797. snowflake/snowpark_connect/proto/control_pb2.pyi +38 -0
  798. snowflake/snowpark_connect/proto/control_pb2_grpc.py +183 -0
  799. snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.py +35 -0
  800. snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.pyi +53 -0
  801. snowflake/snowpark_connect/proto/snowflake_rdd_pb2.pyi +39 -0
  802. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.py +47 -0
  803. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.pyi +111 -0
  804. snowflake/snowpark_connect/relation/__init__.py +3 -0
  805. snowflake/snowpark_connect/relation/catalogs/__init__.py +12 -0
  806. snowflake/snowpark_connect/relation/catalogs/abstract_spark_catalog.py +287 -0
  807. snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py +467 -0
  808. snowflake/snowpark_connect/relation/catalogs/utils.py +51 -0
  809. snowflake/snowpark_connect/relation/io_utils.py +76 -0
  810. snowflake/snowpark_connect/relation/map_aggregate.py +322 -0
  811. snowflake/snowpark_connect/relation/map_catalog.py +151 -0
  812. snowflake/snowpark_connect/relation/map_column_ops.py +1068 -0
  813. snowflake/snowpark_connect/relation/map_crosstab.py +48 -0
  814. snowflake/snowpark_connect/relation/map_extension.py +412 -0
  815. snowflake/snowpark_connect/relation/map_join.py +341 -0
  816. snowflake/snowpark_connect/relation/map_local_relation.py +326 -0
  817. snowflake/snowpark_connect/relation/map_map_partitions.py +146 -0
  818. snowflake/snowpark_connect/relation/map_relation.py +253 -0
  819. snowflake/snowpark_connect/relation/map_row_ops.py +716 -0
  820. snowflake/snowpark_connect/relation/map_sample_by.py +35 -0
  821. snowflake/snowpark_connect/relation/map_show_string.py +50 -0
  822. snowflake/snowpark_connect/relation/map_sql.py +1874 -0
  823. snowflake/snowpark_connect/relation/map_stats.py +324 -0
  824. snowflake/snowpark_connect/relation/map_subquery_alias.py +32 -0
  825. snowflake/snowpark_connect/relation/map_udtf.py +288 -0
  826. snowflake/snowpark_connect/relation/read/__init__.py +7 -0
  827. snowflake/snowpark_connect/relation/read/jdbc_read_dbapi.py +668 -0
  828. snowflake/snowpark_connect/relation/read/map_read.py +367 -0
  829. snowflake/snowpark_connect/relation/read/map_read_csv.py +142 -0
  830. snowflake/snowpark_connect/relation/read/map_read_jdbc.py +108 -0
  831. snowflake/snowpark_connect/relation/read/map_read_json.py +344 -0
  832. snowflake/snowpark_connect/relation/read/map_read_parquet.py +194 -0
  833. snowflake/snowpark_connect/relation/read/map_read_socket.py +59 -0
  834. snowflake/snowpark_connect/relation/read/map_read_table.py +109 -0
  835. snowflake/snowpark_connect/relation/read/map_read_text.py +106 -0
  836. snowflake/snowpark_connect/relation/read/reader_config.py +399 -0
  837. snowflake/snowpark_connect/relation/read/utils.py +155 -0
  838. snowflake/snowpark_connect/relation/stage_locator.py +161 -0
  839. snowflake/snowpark_connect/relation/utils.py +219 -0
  840. snowflake/snowpark_connect/relation/write/__init__.py +3 -0
  841. snowflake/snowpark_connect/relation/write/jdbc_write_dbapi.py +339 -0
  842. snowflake/snowpark_connect/relation/write/map_write.py +436 -0
  843. snowflake/snowpark_connect/relation/write/map_write_jdbc.py +48 -0
  844. snowflake/snowpark_connect/resources/java_udfs-1.0-SNAPSHOT.jar +0 -0
  845. snowflake/snowpark_connect/resources_initializer.py +75 -0
  846. snowflake/snowpark_connect/server.py +1136 -0
  847. snowflake/snowpark_connect/start_server.py +32 -0
  848. snowflake/snowpark_connect/tcm.py +8 -0
  849. snowflake/snowpark_connect/type_mapping.py +1003 -0
  850. snowflake/snowpark_connect/typed_column.py +94 -0
  851. snowflake/snowpark_connect/utils/__init__.py +3 -0
  852. snowflake/snowpark_connect/utils/artifacts.py +48 -0
  853. snowflake/snowpark_connect/utils/attribute_handling.py +72 -0
  854. snowflake/snowpark_connect/utils/cache.py +84 -0
  855. snowflake/snowpark_connect/utils/concurrent.py +124 -0
  856. snowflake/snowpark_connect/utils/context.py +390 -0
  857. snowflake/snowpark_connect/utils/describe_query_cache.py +231 -0
  858. snowflake/snowpark_connect/utils/interrupt.py +85 -0
  859. snowflake/snowpark_connect/utils/io_utils.py +35 -0
  860. snowflake/snowpark_connect/utils/pandas_udtf_utils.py +117 -0
  861. snowflake/snowpark_connect/utils/profiling.py +47 -0
  862. snowflake/snowpark_connect/utils/session.py +180 -0
  863. snowflake/snowpark_connect/utils/snowpark_connect_logging.py +38 -0
  864. snowflake/snowpark_connect/utils/telemetry.py +513 -0
  865. snowflake/snowpark_connect/utils/udf_cache.py +392 -0
  866. snowflake/snowpark_connect/utils/udf_helper.py +328 -0
  867. snowflake/snowpark_connect/utils/udf_utils.py +310 -0
  868. snowflake/snowpark_connect/utils/udtf_helper.py +420 -0
  869. snowflake/snowpark_connect/utils/udtf_utils.py +799 -0
  870. snowflake/snowpark_connect/utils/xxhash64.py +247 -0
  871. snowflake/snowpark_connect/version.py +6 -0
  872. snowpark_connect-0.20.2.data/scripts/snowpark-connect +71 -0
  873. snowpark_connect-0.20.2.data/scripts/snowpark-session +11 -0
  874. snowpark_connect-0.20.2.data/scripts/snowpark-submit +354 -0
  875. snowpark_connect-0.20.2.dist-info/METADATA +37 -0
  876. snowpark_connect-0.20.2.dist-info/RECORD +879 -0
  877. snowpark_connect-0.20.2.dist-info/WHEEL +5 -0
  878. snowpark_connect-0.20.2.dist-info/licenses/LICENSE.txt +202 -0
  879. snowpark_connect-0.20.2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1804 @@
1
+ #
2
+ # Licensed to the Apache Software Foundation (ASF) under one or more
3
+ # contributor license agreements. See the NOTICE file distributed with
4
+ # this work for additional information regarding copyright ownership.
5
+ # The ASF licenses this file to You under the Apache License, Version 2.0
6
+ # (the "License"); you may not use this file except in compliance with
7
+ # the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ #
17
+
18
+ """
19
+ Base and utility classes for pandas-on-Spark objects.
20
+ """
21
+ import warnings
22
+ from abc import ABCMeta, abstractmethod
23
+ from functools import wraps, partial
24
+ from itertools import chain
25
+ from typing import Any, Callable, Optional, Sequence, Tuple, Union, cast, TYPE_CHECKING
26
+
27
+ import numpy as np
28
+ import pandas as pd
29
+ from pandas.api.types import is_list_like, CategoricalDtype # type: ignore[attr-defined]
30
+ from pyspark.sql import functions as F, Column, Window
31
+ from pyspark.sql.types import LongType, BooleanType, NumericType
32
+
33
+ from pyspark import pandas as ps # For running doctests and reference resolution in PyCharm.
34
+ from pyspark.pandas._typing import Axis, Dtype, IndexOpsLike, Label, SeriesOrIndex
35
+ from pyspark.pandas.config import get_option, option_context
36
+ from pyspark.pandas.internal import (
37
+ InternalField,
38
+ InternalFrame,
39
+ NATURAL_ORDER_COLUMN_NAME,
40
+ SPARK_DEFAULT_INDEX_NAME,
41
+ )
42
+ from pyspark.pandas.spark.accessors import SparkIndexOpsMethods
43
+ from pyspark.pandas.typedef import extension_dtypes
44
+ from pyspark.pandas.utils import (
45
+ combine_frames,
46
+ same_anchor,
47
+ scol_for,
48
+ validate_axis,
49
+ ERROR_MESSAGE_CANNOT_COMBINE,
50
+ )
51
+ from pyspark.pandas.frame import DataFrame
52
+
53
+ if TYPE_CHECKING:
54
+ from pyspark.sql._typing import ColumnOrName
55
+
56
+ from pyspark.pandas.data_type_ops.base import DataTypeOps
57
+ from pyspark.pandas.series import Series
58
+
59
+
60
def should_alignment_for_column_op(self: SeriesOrIndex, other: SeriesOrIndex) -> bool:
    """
    Tell whether two operands must be aligned before a column-wise operation.

    When both operands are `Series`, alignment is needed if `same_anchor` reports that
    they do not share an anchor. For any other pairing, alignment is needed if the two
    operands are not backed by the very same Spark frame object.
    """
    from pyspark.pandas.series import Series

    both_series = isinstance(self, Series) and isinstance(other, Series)
    if not both_series:
        return self._internal.spark_frame is not other._internal.spark_frame
    return not same_anchor(self, other)
67
+
68
+
69
def align_diff_index_ops(
    func: Callable[..., Column], this_index_ops: SeriesOrIndex, *args: Any
) -> SeriesOrIndex:
    """
    Align `IndexOpsMixin` operands that come from different frames, then apply `func`.

    Parameters
    ----------
    func : The function to apply
    this_index_ops : IndexOpsMixin
        The base operand.
    args : the remaining arguments; any `IndexOpsMixin` among them is aligned with
        `this_index_ops`, everything else is passed to `func` as is.

    Returns
    -------
    `Index` if `this_index_ops` and all `IndexOpsMixin` arguments are `Index`;
    otherwise `Series`

    Raises
    ------
    ValueError
        If an `Index` is involved and an operand's length differs from that of
        `this_index_ops`.
    """
    from pyspark.pandas.indexes import Index
    from pyspark.pandas.series import Series, first_series

    operands = [arg for arg in args if isinstance(arg, IndexOpsMixin)]
    apply_op = column_op(func)

    def as_series(operand: Any) -> Series:
        # `Index` operands are turned into `Series`; `Series` operands pass through.
        return cast(Series, operand.to_series() if isinstance(operand, Index) else operand)

    # Series-only case: a full join on the index is enough.
    if isinstance(this_index_ops, Series) and all(isinstance(op, Series) for op in operands):
        combined = combine_frames(
            this_index_ops.to_frame(),
            *(cast(Series, op).rename(pos) for pos, op in enumerate(operands)),
            how="full",
        )
        this_side = combined["this"]
        that_side = combined["that"]
        result = apply_op(
            this_side._psser_for(this_side._internal.column_labels[0]),
            *(that_side._psser_for(label) for label in that_side._internal.column_labels),
        )
        return result.rename(this_index_ops.name)

    # This could cause as many counts, reset_index calls, joins for combining
    # as the number of `Index`s in `args`. So far it's fine since we can assume the ops
    # only work between at most two `Index`s. We might need to fix it in the future.
    expected_len = len(this_index_ops)
    for op in operands:
        if len(op) != expected_len:
            raise ValueError("operands could not be broadcast together with shapes")

    with option_context("compute.default_index_type", "distributed-sequence"):
        if isinstance(this_index_ops, Index) and all(isinstance(op, Index) for op in operands):
            # Index-only case: combine positionally on a freshly attached default index.
            base = this_index_ops.to_series().reset_index(drop=True)
            converted_args = [
                arg.to_series().reset_index(drop=True) if isinstance(arg, Index) else arg
                for arg in args
            ]
            computed = apply_op(base, *converted_args)
            return Index(computed.sort_index(), name=this_index_ops.name)

        if isinstance(this_index_ops, Series):
            # Series on the left with at least one Index among the operands.
            this = cast(DataFrame, this_index_ops.reset_index())
            others = [
                as_series(op).rename(pos).reset_index(drop=True)
                for pos, op in enumerate(operands)
            ]

            combined = combine_frames(this, *others, how="full").sort_index()
            index_labels = combined._internal.column_labels[
                : this_index_ops._internal.index_level
            ]
            combined = combined.set_index(index_labels)
            combined.index.names = this_index_ops._internal.index_names

            left = first_series(combined["this"])
            that_side = combined["that"]
            result = apply_op(
                left,
                *(that_side._psser_for(label) for label in that_side._internal.column_labels),
            )
            return result.rename(this_index_ops.name)

        # Index on the left with at least one Series among the operands.
        this = this_index_ops.to_frame().reset_index(drop=True)

        that_series = next(op for op in operands if isinstance(op, Series))
        that_frame = that_series._psdf[
            [as_series(op).rename(pos) for pos, op in enumerate(operands)]
        ]

        combined = combine_frames(this, that_frame.reset_index()).sort_index()

        this_side = combined["this"]
        self_index = this_side.set_index(this_side._internal.column_labels).index

        that_side = combined["that"]
        other = that_side.set_index(
            that_side._internal.column_labels[: that_series._internal.index_level]
        )
        other.index.names = that_series._internal.index_names

        result = apply_op(
            self_index,
            *(
                other._psser_for(label)
                for label, _ in zip(other._internal.column_labels, operands)
            ),
        )
        return result.rename(that_series.name)
179
+
180
+
181
def booleanize_null(scol: Column, f: Callable[..., Column]) -> Column:
    """
    Replace nulls in the result of a Spark comparison with a concrete boolean.

    If `f` is one of the `Column` comparison dunders (eq, ne, lt, le, ge, gt), null
    entries of `scol` become True when `f` is `!=` and False for every other comparison.
    For any other `f`, `scol` is returned unchanged.
    """
    comparison_names = ("eq", "ne", "lt", "le", "ge", "gt")
    comparisons = [getattr(Column, "__{}__".format(name)) for name in comparison_names]

    if f not in comparisons:
        return scol

    # Only "!=" maps a null comparison result to True; the rest map it to False.
    null_replacement = f == Column.__ne__
    return F.when(scol.isNull(), null_replacement).otherwise(scol)
196
+
197
+
198
def column_op(f: Callable[..., Column]) -> Callable[..., SeriesOrIndex]:
    """
    Decorate a Spark-Column function so that it also works on pandas-on-Spark objects.

    `f` takes Spark Columns and returns a Spark Column. The decorated function takes a
    pandas-on-Spark Series/Index as `self`, plus further arguments, and returns a
    pandas-on-Spark Series/Index.

    :param f: a function that takes Spark Column and returns Spark Column.
    :param self: pandas-on-Spark Series
    :param args: arguments that the function `f` takes.
    """

    @wraps(f)
    def wrapper(self: SeriesOrIndex, *args: Any) -> SeriesOrIndex:
        from pyspark.pandas.indexes.base import Index
        from pyspark.pandas.series import Series

        # `f` may receive arguments that are not pandas-on-Spark objects. Only the
        # Series/Index ones take part in the anchor check; the rest are used as is.
        operands = [arg for arg in args if isinstance(arg, (Series, Index))]
        needs_alignment = any(should_alignment_for_column_op(self, op) for op in operands)

        if not needs_alignment:
            # Same DataFrame anchors: operate directly on the Spark columns.
            self_scol = self.spark.column
            spark_args = [
                arg.spark.column if isinstance(arg, IndexOpsMixin) else arg for arg in args
            ]
            scol = f(self_scol, *spark_args)

            struct_field = self._internal.spark_frame.select(scol).schema[0]
            any_extension = any(
                isinstance(op.dtype, extension_dtypes) for op in [self] + operands
            )
            field = InternalField.from_struct_field(
                struct_field, use_extension_dtypes=any_extension
            )

            if not field.is_extension_dtype:
                scol = booleanize_null(scol, f).alias(field.name)

            # The result is built from `self`, unless `self` is not a Series and one of
            # the operands is; then the first Series operand is used instead.
            if isinstance(self, Series) or not any(isinstance(op, Series) for op in operands):
                owner = self
            else:
                owner = next(op for op in operands if isinstance(op, Series))
            index_ops = owner._with_new_scol(scol, field=field)
        elif get_option("compute.ops_on_diff_frames"):
            index_ops = align_diff_index_ops(f, self, *args)
        else:
            raise ValueError(ERROR_MESSAGE_CANNOT_COMBINE)

        # The name is kept only when every operand has the same name as `self`.
        names_agree = all(self.name == op.name for op in operands)
        if not names_agree:
            index_ops = index_ops.rename(None)

        return index_ops

    return wrapper
253
+
254
+
255
def numpy_column_op(f: Callable[..., Column]) -> Callable[..., SeriesOrIndex]:
    """
    Like `column_op`, but first converts NumPy arguments PySpark cannot handle.

    When `self` has Spark type `LongType`, every `np.timedelta64` argument is replaced
    by its length in seconds as a float. All other arguments are forwarded unchanged.
    """

    @wraps(f)
    def wrapper(self: SeriesOrIndex, *args: Any) -> SeriesOrIndex:
        # PySpark does not support NumPy type out of the box. For now, we convert NumPy
        # types into some primitive types understandable in PySpark.
        # TODO: This is a quick hack to support NumPy type. We should revisit this.
        converted = [
            float(arg / np.timedelta64(1, "s"))
            if isinstance(self.spark.data_type, LongType) and isinstance(arg, np.timedelta64)
            else arg
            for arg in args
        ]
        return column_op(f)(self, *converted)

    return wrapper
270
+
271
+
272
+ class IndexOpsMixin(object, metaclass=ABCMeta):
273
+ """common ops mixin to support a unified interface / docs for Series / Index
274
+
275
+ Assuming there are following attributes or properties and functions.
276
+ """
277
+
278
@property
@abstractmethod
def _internal(self) -> InternalFrame:
    """Abstract property: concrete subclasses must return their `InternalFrame`."""
282
+
283
@property
@abstractmethod
def _psdf(self) -> DataFrame:
    """Abstract property: concrete subclasses must return a pandas-on-Spark `DataFrame`."""
287
+
288
@abstractmethod
def _with_new_scol(
    self: IndexOpsLike, scol: Column, *, field: Optional[InternalField] = None
) -> IndexOpsLike:
    """
    Abstract: return an object of the same kind as `self` for the Spark column `scol`.

    `field` is keyword-only and optional; when given it is the `InternalField` that
    goes with `scol`.
    """
293
+
294
@property
@abstractmethod
def _column_label(self) -> Optional[Label]:
    """Abstract property: concrete subclasses must return their column `Label`, or None."""
298
+
299
@property
@abstractmethod
def spark(self: IndexOpsLike) -> SparkIndexOpsMethods[IndexOpsLike]:
    """Abstract property: concrete subclasses must return a `SparkIndexOpsMethods`."""
303
+
304
@property
def _dtype_op(self) -> "DataTypeOps":
    """Build a `DataTypeOps` from this object's pandas dtype and Spark data type."""
    from pyspark.pandas.data_type_ops.base import DataTypeOps

    pandas_dtype = self.dtype
    spark_type = self.spark.data_type
    return DataTypeOps(pandas_dtype, spark_type)
309
+
310
@abstractmethod
def copy(self: IndexOpsLike) -> IndexOpsLike:
    """Abstract: concrete subclasses must return an object of the same kind as `self`."""
313
+
314
+ # arithmetic operators
315
def __neg__(self: IndexOpsLike) -> IndexOpsLike:
    """Implement ``-self`` by delegating to the dtype-specific ``neg``."""
    dtype_ops = self._dtype_op
    return dtype_ops.neg(self)
317
+
318
def __add__(self, other: Any) -> SeriesOrIndex:
    """Implement ``self + other`` by delegating to the dtype-specific ``add``."""
    dtype_ops = self._dtype_op
    return dtype_ops.add(self, other)
320
+
321
def __sub__(self, other: Any) -> SeriesOrIndex:
    """Implement ``self - other`` by delegating to the dtype-specific ``sub``."""
    dtype_ops = self._dtype_op
    return dtype_ops.sub(self, other)
323
+
324
def __mul__(self, other: Any) -> SeriesOrIndex:
    """Implement ``self * other`` by delegating to the dtype-specific ``mul``."""
    dtype_ops = self._dtype_op
    return dtype_ops.mul(self, other)
326
+
327
def __truediv__(self, other: Any) -> SeriesOrIndex:
    """
    Implement ``self / other`` by delegating to the dtype-specific ``truediv``.

    Division by zero behaves differently in PySpark and pandas: PySpark returns null
    in every case below, whereas pandas returns a signed infinity.

        +-----------------------+---------+---------+
        | dividend (divisor: 0) | PySpark | pandas  |
        +-----------------------+---------+---------+
        | np.inf                | null    | np.inf  |
        | -np.inf               | null    | -np.inf |
        | 10                    | null    | np.inf  |
        | -10                   | null    | -np.inf |
        +-----------------------+---------+---------+
    """
    dtype_ops = self._dtype_op
    return dtype_ops.truediv(self, other)
346
+
347
def __mod__(self, other: Any) -> SeriesOrIndex:
    """Implement ``self % other`` by delegating to the dtype-specific ``mod``."""
    dtype_ops = self._dtype_op
    return dtype_ops.mod(self, other)
349
+
350
def __radd__(self, other: Any) -> SeriesOrIndex:
    """Implement ``other + self`` by delegating to the dtype-specific ``radd``."""
    dtype_ops = self._dtype_op
    return dtype_ops.radd(self, other)
352
+
353
def __rsub__(self, other: Any) -> SeriesOrIndex:
    """Implement ``other - self`` by delegating to the dtype-specific ``rsub``."""
    dtype_ops = self._dtype_op
    return dtype_ops.rsub(self, other)
355
+
356
def __rmul__(self, other: Any) -> SeriesOrIndex:
    """Implement ``other * self`` by delegating to the dtype-specific ``rmul``."""
    dtype_ops = self._dtype_op
    return dtype_ops.rmul(self, other)
358
+
359
def __rtruediv__(self, other: Any) -> SeriesOrIndex:
    """Implement ``other / self`` by delegating to the dtype-specific ``rtruediv``."""
    dtype_ops = self._dtype_op
    return dtype_ops.rtruediv(self, other)
361
+
362
+ def __floordiv__(self, other: Any) -> SeriesOrIndex:
363
+ """
364
+ __floordiv__ has different behaviour between pandas and PySpark for several cases.
365
+ 1. When dividing np.inf by zero, PySpark returns null whereas pandas returns np.inf
366
+ 2. When dividing a positive number by zero, PySpark returns null
367
+ whereas pandas returns np.inf
368
+ 3. When dividing -np.inf by zero, PySpark returns null whereas pandas returns -np.inf
369
+ 4. When dividing a negative number by zero, PySpark returns null whereas pandas returns -np.inf
370
+
371
+ +-------------------------------------------+
372
+ | dividend (divisor: 0) | PySpark | pandas |
373
+ |-----------------------|---------|---------|
374
+ | np.inf | null | np.inf |
375
+ | -np.inf | null | -np.inf |
376
+ | 10 | null | np.inf |
377
+ | -10 | null | -np.inf |
378
+ +-----------------------|---------|---------+
379
+ """
380
+ return self._dtype_op.floordiv(self, other)
381
+
382
+ def __rfloordiv__(self, other: Any) -> SeriesOrIndex:
383
+ return self._dtype_op.rfloordiv(self, other)
384
+
385
+ def __rmod__(self, other: Any) -> SeriesOrIndex:
386
+ return self._dtype_op.rmod(self, other)
387
+
388
+ def __pow__(self, other: Any) -> SeriesOrIndex:
389
+ return self._dtype_op.pow(self, other)
390
+
391
+ def __rpow__(self, other: Any) -> SeriesOrIndex:
392
+ return self._dtype_op.rpow(self, other)
393
+
394
+ def __abs__(self: IndexOpsLike) -> IndexOpsLike:
395
+ return self._dtype_op.abs(self)
396
+
397
+ # comparison operators
398
def __eq__(self, other: Any) -> SeriesOrIndex:  # type: ignore[override]
    """
    Compare this object with ``other`` element by element.

    When ``other`` is a ``dict`` or a ``set`` the result is built as
    ``self != self``; any other operand is forwarded to ``self._dtype_op.eq``.
    """
    # pandas always returns False for all items when compared with a dict or a set,
    # so those operands never reach the dtype-specific comparison.
    never_matches = isinstance(other, (dict, set))
    return (self != self) if never_matches else self._dtype_op.eq(self, other)
404
+
405
+ def __ne__(self, other: Any) -> SeriesOrIndex: # type: ignore[override]
406
+ return self._dtype_op.ne(self, other)
407
+
408
+ def __lt__(self, other: Any) -> SeriesOrIndex:
409
+ return self._dtype_op.lt(self, other)
410
+
411
+ def __le__(self, other: Any) -> SeriesOrIndex:
412
+ return self._dtype_op.le(self, other)
413
+
414
+ def __ge__(self, other: Any) -> SeriesOrIndex:
415
+ return self._dtype_op.ge(self, other)
416
+
417
+ def __gt__(self, other: Any) -> SeriesOrIndex:
418
+ return self._dtype_op.gt(self, other)
419
+
420
+ def __invert__(self: IndexOpsLike) -> IndexOpsLike:
421
+ return self._dtype_op.invert(self)
422
+
423
+ # `and`, `or`, `not` cannot be overloaded in Python,
424
+ # so use bitwise operators as boolean operators
425
+ def __and__(self, other: Any) -> SeriesOrIndex:
426
+ return self._dtype_op.__and__(self, other)
427
+
428
+ def __or__(self, other: Any) -> SeriesOrIndex:
429
+ return self._dtype_op.__or__(self, other)
430
+
431
+ def __rand__(self, other: Any) -> SeriesOrIndex:
432
+ return self._dtype_op.rand(self, other)
433
+
434
+ def __ror__(self, other: Any) -> SeriesOrIndex:
435
+ return self._dtype_op.ror(self, other)
436
+
437
+ def __xor__(self, other: Any) -> SeriesOrIndex:
438
+ return self._dtype_op.xor(self, other)
439
+
440
+ def __rxor__(self, other: Any) -> SeriesOrIndex:
441
+ return self._dtype_op.rxor(self, other)
442
+
443
+ def __len__(self) -> int:
444
+ return len(self._psdf)
445
+
446
+ # NDArray Compat
447
def __array_ufunc__(
    self, ufunc: Callable, method: str, *inputs: Any, **kwargs: Any
) -> SeriesOrIndex:
    """
    NumPy ufunc hook.

    The call is first offered to ``numpy_compat.maybe_dispatch_ufunc_to_dunder_op``;
    if that returns ``NotImplemented`` it is offered to
    ``numpy_compat.maybe_dispatch_ufunc_to_spark_func``.

    Raises
    ------
    NotImplementedError
        If both dispatchers return ``NotImplemented``.
    """
    from pyspark.pandas import numpy_compat

    # Dunder methods get the first chance to handle the ufunc.
    outcome = numpy_compat.maybe_dispatch_ufunc_to_dunder_op(
        self, ufunc, method, *inputs, **kwargs
    )

    # PySpark APIs are only consulted when the dunder route declined.
    if outcome is NotImplemented:
        outcome = numpy_compat.maybe_dispatch_ufunc_to_spark_func(
            self, ufunc, method, *inputs, **kwargs
        )

    if outcome is NotImplemented:
        # TODO: support more APIs?
        raise NotImplementedError(
            "pandas-on-Spark objects currently do not support %s." % ufunc
        )
    return cast(SeriesOrIndex, outcome)
470
+
471
+ @property
472
+ def dtype(self) -> Dtype:
473
+ """Return the dtype object of the underlying data.
474
+
475
+ Examples
476
+ --------
477
+ >>> s = ps.Series([1, 2, 3])
478
+ >>> s.dtype
479
+ dtype('int64')
480
+
481
+ >>> s = ps.Series(list('abc'))
482
+ >>> s.dtype
483
+ dtype('O')
484
+
485
+ >>> s = ps.Series(pd.date_range('20130101', periods=3))
486
+ >>> s.dtype
487
+ dtype('<M8[ns]')
488
+
489
+ >>> s.rename("a").to_frame().set_index("a").index.dtype
490
+ dtype('<M8[ns]')
491
+ """
492
+ return self._internal.data_fields[0].dtype
493
+
494
+ @property
495
+ def empty(self) -> bool:
496
+ """
497
+ Returns true if the current object is empty. Otherwise, it returns false.
498
+
499
+ >>> ps.range(10).id.empty
500
+ False
501
+
502
+ >>> ps.range(0).id.empty
503
+ True
504
+
505
+ >>> ps.DataFrame({}, index=list('abc')).index.empty
506
+ False
507
+ """
508
+ return self._internal.resolved_copy.spark_frame.rdd.isEmpty()
509
+
510
+ @property
511
+ def hasnans(self) -> bool:
512
+ """
513
+ Return True if it has any missing values. Otherwise, it returns False.
514
+
515
+ >>> ps.DataFrame({}, index=list('abc')).index.hasnans
516
+ False
517
+
518
+ >>> ps.Series(['a', None]).hasnans
519
+ True
520
+
521
+ >>> ps.Series([1.0, 2.0, np.nan]).hasnans
522
+ True
523
+
524
+ >>> ps.Series([1, 2, 3]).hasnans
525
+ False
526
+
527
+ >>> (ps.Series([1.0, 2.0, np.nan]) + 1).hasnans
528
+ True
529
+
530
+ >>> ps.Series([1, 2, 3]).rename("a").to_frame().set_index("a").index.hasnans
531
+ False
532
+ """
533
+ return self.isnull().any()
534
+
535
+ @property
536
+ def is_monotonic(self) -> bool:
537
+ """
538
+ Return boolean if values in the object are monotonically increasing.
539
+
540
+ .. note:: the current implementation of is_monotonic requires to shuffle
541
+ and aggregate multiple times to check the order locally and globally,
542
+ which is potentially expensive. In case of multi-index, all data is
543
+ transferred to a single node which can easily cause out-of-memory errors.
544
+
545
+ .. note:: Disable the Spark config `spark.sql.optimizer.nestedSchemaPruning.enabled`
546
+ for multi-index if you're using pandas-on-Spark < 1.7.0 with PySpark 3.1.1.
547
+
548
+ .. deprecated:: 3.4.0
549
+
550
+ Returns
551
+ -------
552
+ is_monotonic : bool
553
+
554
+ Examples
555
+ --------
556
+ >>> ser = ps.Series(['1/1/2018', '3/1/2018', '4/1/2018'])
557
+ >>> ser.is_monotonic
558
+ True
559
+
560
+ >>> df = ps.DataFrame({'dates': [None, '1/1/2018', '2/1/2018', '3/1/2018']})
561
+ >>> df.dates.is_monotonic
562
+ False
563
+
564
+ >>> df.index.is_monotonic
565
+ True
566
+
567
+ >>> ser = ps.Series([1])
568
+ >>> ser.is_monotonic
569
+ True
570
+
571
+ >>> ser = ps.Series([])
572
+ >>> ser.is_monotonic
573
+ True
574
+
575
+ >>> ser.rename("a").to_frame().set_index("a").index.is_monotonic
576
+ True
577
+
578
+ >>> ser = ps.Series([5, 4, 3, 2, 1], index=[1, 2, 3, 4, 5])
579
+ >>> ser.is_monotonic
580
+ False
581
+
582
+ >>> ser.index.is_monotonic
583
+ True
584
+
585
+ Support for MultiIndex
586
+
587
+ >>> midx = ps.MultiIndex.from_tuples(
588
+ ... [('x', 'a'), ('x', 'b'), ('y', 'c'), ('y', 'd'), ('z', 'e')])
589
+ >>> midx # doctest: +SKIP
590
+ MultiIndex([('x', 'a'),
591
+ ('x', 'b'),
592
+ ('y', 'c'),
593
+ ('y', 'd'),
594
+ ('z', 'e')],
595
+ )
596
+ >>> midx.is_monotonic
597
+ True
598
+
599
+ >>> midx = ps.MultiIndex.from_tuples(
600
+ ... [('z', 'a'), ('z', 'b'), ('y', 'c'), ('y', 'd'), ('x', 'e')])
601
+ >>> midx # doctest: +SKIP
602
+ MultiIndex([('z', 'a'),
603
+ ('z', 'b'),
604
+ ('y', 'c'),
605
+ ('y', 'd'),
606
+ ('x', 'e')],
607
+ )
608
+ >>> midx.is_monotonic
609
+ False
610
+ """
611
+ warnings.warn(
612
+ "is_monotonic is deprecated and will be removed in a future version. "
613
+ "Use is_monotonic_increasing instead.",
614
+ FutureWarning,
615
+ )
616
+ return self._is_monotonic("increasing")
617
+
618
+ @property
619
+ def is_monotonic_increasing(self) -> bool:
620
+ """
621
+ Return boolean if values in the object are monotonically increasing.
622
+
623
+ .. note:: the current implementation of is_monotonic_increasing requires to shuffle
624
+ and aggregate multiple times to check the order locally and globally,
625
+ which is potentially expensive. In case of multi-index, all data is
626
+ transferred to a single node which can easily cause out-of-memory errors.
627
+
628
+ .. note:: Disable the Spark config `spark.sql.optimizer.nestedSchemaPruning.enabled`
629
+ for multi-index if you're using pandas-on-Spark < 1.7.0 with PySpark 3.1.1.
630
+
631
+ Returns
632
+ -------
633
+ is_monotonic : bool
634
+
635
+ Examples
636
+ --------
637
+ >>> ser = ps.Series(['1/1/2018', '3/1/2018', '4/1/2018'])
638
+ >>> ser.is_monotonic_increasing
639
+ True
640
+
641
+ >>> df = ps.DataFrame({'dates': [None, '1/1/2018', '2/1/2018', '3/1/2018']})
642
+ >>> df.dates.is_monotonic_increasing
643
+ False
644
+
645
+ >>> df.index.is_monotonic_increasing
646
+ True
647
+
648
+ >>> ser = ps.Series([1])
649
+ >>> ser.is_monotonic_increasing
650
+ True
651
+
652
+ >>> ser = ps.Series([])
653
+ >>> ser.is_monotonic_increasing
654
+ True
655
+
656
+ >>> ser.rename("a").to_frame().set_index("a").index.is_monotonic_increasing
657
+ True
658
+
659
+ >>> ser = ps.Series([5, 4, 3, 2, 1], index=[1, 2, 3, 4, 5])
660
+ >>> ser.is_monotonic_increasing
661
+ False
662
+
663
+ >>> ser.index.is_monotonic_increasing
664
+ True
665
+
666
+ Support for MultiIndex
667
+
668
+ >>> midx = ps.MultiIndex.from_tuples(
669
+ ... [('x', 'a'), ('x', 'b'), ('y', 'c'), ('y', 'd'), ('z', 'e')])
670
+ >>> midx # doctest: +SKIP
671
+ MultiIndex([('x', 'a'),
672
+ ('x', 'b'),
673
+ ('y', 'c'),
674
+ ('y', 'd'),
675
+ ('z', 'e')],
676
+ )
677
+ >>> midx.is_monotonic_increasing
678
+ True
679
+
680
+ >>> midx = ps.MultiIndex.from_tuples(
681
+ ... [('z', 'a'), ('z', 'b'), ('y', 'c'), ('y', 'd'), ('x', 'e')])
682
+ >>> midx # doctest: +SKIP
683
+ MultiIndex([('z', 'a'),
684
+ ('z', 'b'),
685
+ ('y', 'c'),
686
+ ('y', 'd'),
687
+ ('x', 'e')],
688
+ )
689
+ >>> midx.is_monotonic_increasing
690
+ False
691
+ """
692
+ return self._is_monotonic("increasing")
693
+
694
+ @property
695
+ def is_monotonic_decreasing(self) -> bool:
696
+ """
697
+ Return boolean if values in the object are monotonically decreasing.
698
+
699
+ .. note:: the current implementation of is_monotonic_decreasing requires to shuffle
700
+ and aggregate multiple times to check the order locally and globally,
701
+ which is potentially expensive. In case of multi-index, all data is transferred
702
+ to a single node which can easily cause out-of-memory errors.
703
+
704
+ .. note:: Disable the Spark config `spark.sql.optimizer.nestedSchemaPruning.enabled`
705
+ for multi-index if you're using pandas-on-Spark < 1.7.0 with PySpark 3.1.1.
706
+
707
+ Returns
708
+ -------
709
+ is_monotonic : bool
710
+
711
+ Examples
712
+ --------
713
+ >>> ser = ps.Series(['4/1/2018', '3/1/2018', '1/1/2018'])
714
+ >>> ser.is_monotonic_decreasing
715
+ True
716
+
717
+ >>> df = ps.DataFrame({'dates': [None, '3/1/2018', '2/1/2018', '1/1/2018']})
718
+ >>> df.dates.is_monotonic_decreasing
719
+ False
720
+
721
+ >>> df.index.is_monotonic_decreasing
722
+ False
723
+
724
+ >>> ser = ps.Series([1])
725
+ >>> ser.is_monotonic_decreasing
726
+ True
727
+
728
+ >>> ser = ps.Series([])
729
+ >>> ser.is_monotonic_decreasing
730
+ True
731
+
732
+ >>> ser.rename("a").to_frame().set_index("a").index.is_monotonic_decreasing
733
+ True
734
+
735
+ >>> ser = ps.Series([5, 4, 3, 2, 1], index=[1, 2, 3, 4, 5])
736
+ >>> ser.is_monotonic_decreasing
737
+ True
738
+
739
+ >>> ser.index.is_monotonic_decreasing
740
+ False
741
+
742
+ Support for MultiIndex
743
+
744
+ >>> midx = ps.MultiIndex.from_tuples(
745
+ ... [('x', 'a'), ('x', 'b'), ('y', 'c'), ('y', 'd'), ('z', 'e')])
746
+ >>> midx # doctest: +SKIP
747
+ MultiIndex([('x', 'a'),
748
+ ('x', 'b'),
749
+ ('y', 'c'),
750
+ ('y', 'd'),
751
+ ('z', 'e')],
752
+ )
753
+ >>> midx.is_monotonic_decreasing
754
+ False
755
+
756
+ >>> midx = ps.MultiIndex.from_tuples(
757
+ ... [('z', 'e'), ('z', 'd'), ('y', 'c'), ('y', 'b'), ('x', 'a')])
758
+ >>> midx # doctest: +SKIP
759
+ MultiIndex([('z', 'e'),
760
+ ('z', 'd'),
761
+ ('y', 'c'),
762
+ ('y', 'b'),
763
+ ('x', 'a')],
764
+ )
765
+ >>> midx.is_monotonic_decreasing
766
+ True
767
+ """
768
+ return self._is_monotonic("decreasing")
769
+
770
def _is_locally_monotonic_spark_column(self, order: str) -> Column:
    """
    Build the per-row check used to test ordering inside one partition.

    The returned column compares ``__origin`` with the value of the previous
    row (by natural order) that shares the same ``__partition_id``, and
    additionally requires ``__origin`` to be non-null.

    Parameters
    ----------
    order : str
        ``"increasing"`` selects ``>=``; any other value selects ``<=``.
    """
    current = F.col("__origin")
    within_partition = (
        Window.partitionBy(F.col("__partition_id"))
        .orderBy(NATURAL_ORDER_COLUMN_NAME)
        .rowsBetween(-1, -1)
    )
    previous = F.lag(F.col("__origin"), 1).over(within_partition)

    in_order = (current >= previous) if order == "increasing" else (current <= previous)
    return in_order & current.isNotNull()
785
+
786
def _is_monotonic(self, order: str) -> bool:
    """
    Check whether the values are monotonically ordered across the whole frame.

    The check runs in two stages:

    1. Each row is tagged with its Spark partition id and compared with the
       previous row of the same partition (see
       ``_is_locally_monotonic_spark_column``). Per partition, the minimum
       value, the maximum value and whether every local comparison held are
       aggregated.
    2. The per-partition rows are ordered by partition id and the boundary
       between each partition and the previous one is compared.

    Parameters
    ----------
    order : str
        Either ``"increasing"`` or ``"decreasing"``.

    Returns
    -------
    bool
        True when both the within-partition and the between-partition checks
        hold. Also True when the aggregated result is ``None`` (presumably
        an empty frame).
    """
    assert order in ("increasing", "decreasing")

    sdf = self._internal.spark_frame

    sdf = (
        sdf.select(
            F.spark_partition_id().alias(
                "__partition_id"
            ),  # Make sure we use the same partition id in the whole job.
            F.col(NATURAL_ORDER_COLUMN_NAME),
            self.spark.column.alias("__origin"),
        )
        .select(
            F.col("__partition_id"),
            F.col("__origin"),
            self._is_locally_monotonic_spark_column(order).alias(
                "__comparison_within_partition"
            ),
        )
        .groupby(F.col("__partition_id"))
        .agg(
            F.min(F.col("__origin")).alias("__partition_min"),
            F.max(F.col("__origin")).alias("__partition_max"),
            # The first row of a partition has no predecessor, so its comparison is
            # null; coalescing to True keeps it from failing the partition.
            F.min(F.coalesce(F.col("__comparison_within_partition"), F.lit(True))).alias(
                "__comparison_within_partition"
            ),
        )
    )

    # Now we're windowing the aggregation results without partition specification.
    # The number of rows here will be the same as partitions, which is expected
    # to be small.
    window = Window.orderBy(F.col("__partition_id")).rowsBetween(-1, -1)
    if order == "increasing":
        # Everything in this partition must be >= everything in the previous one:
        # the smallest value here is compared with the largest value before.
        comparison_col = F.col("__partition_min") >= F.lag(F.col("__partition_max"), 1).over(
            window
        )
    else:
        # Everything in this partition must be <= everything in the previous one:
        # the largest value here is compared with the smallest value before.
        # (Comparing min with the previous max would accept e.g. partitions
        # [5, 4] and [6, 3], which are not decreasing overall.)
        comparison_col = F.col("__partition_max") <= F.lag(F.col("__partition_min"), 1).over(
            window
        )

    sdf = sdf.select(
        comparison_col.alias("__comparison_between_partitions"),
        F.col("__comparison_within_partition"),
    )

    ret = sdf.select(
        F.min(F.coalesce(F.col("__comparison_between_partitions"), F.lit(True)))
        & F.min(F.coalesce(F.col("__comparison_within_partition"), F.lit(True)))
    ).collect()[0][0]
    if ret is None:
        return True
    else:
        return ret
842
+
843
+ @property
844
+ def ndim(self) -> int:
845
+ """
846
+ Return an int representing the number of array dimensions.
847
+
848
+ Return 1 for Series / Index / MultiIndex.
849
+
850
+ Examples
851
+ --------
852
+
853
+ For Series
854
+
855
+ >>> s = ps.Series([None, 1, 2, 3, 4], index=[4, 5, 2, 1, 8])
856
+ >>> s.ndim
857
+ 1
858
+
859
+ For Index
860
+
861
+ >>> s.index.ndim
862
+ 1
863
+
864
+ For MultiIndex
865
+
866
+ >>> midx = pd.MultiIndex([['lama', 'cow', 'falcon'],
867
+ ... ['speed', 'weight', 'length']],
868
+ ... [[0, 0, 0, 1, 1, 1, 2, 2, 2],
869
+ ... [1, 1, 1, 1, 1, 2, 1, 2, 2]])
870
+ >>> s = ps.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=midx)
871
+ >>> s.index.ndim
872
+ 1
873
+ """
874
+ return 1
875
+
876
+ def astype(self: IndexOpsLike, dtype: Union[str, type, Dtype]) -> IndexOpsLike:
877
+ """
878
+ Cast a pandas-on-Spark object to a specified dtype ``dtype``.
879
+
880
+ Parameters
881
+ ----------
882
+ dtype : data type
883
+ Use a numpy.dtype or Python type to cast entire pandas object to
884
+ the same type.
885
+
886
+ Returns
887
+ -------
888
+ casted : same type as caller
889
+
890
+ See Also
891
+ --------
892
+ to_datetime : Convert argument to datetime.
893
+
894
+ Examples
895
+ --------
896
+ >>> ser = ps.Series([1, 2], dtype='int32')
897
+ >>> ser
898
+ 0 1
899
+ 1 2
900
+ dtype: int32
901
+
902
+ >>> ser.astype('int64')
903
+ 0 1
904
+ 1 2
905
+ dtype: int64
906
+
907
+ >>> ser.rename("a").to_frame().set_index("a").index.astype('int64') # doctest: +SKIP
908
+ Int64Index([1, 2], dtype='int64', name='a')
909
+ """
910
+ return self._dtype_op.astype(self, dtype)
911
+
912
def isin(self: IndexOpsLike, values: Sequence[Any]) -> IndexOpsLike:
    """
    Test each element for membership in `values`.

    The result is a boolean Series or Index telling, element by element,
    whether the value exactly equals one of the entries of `values`.

    Parameters
    ----------
    values : set or list-like
        The candidate values to test against.

    Returns
    -------
    isin : Series (bool dtype) or Index (bool dtype)

    Raises
    ------
    TypeError
        If `values` is not list-like.

    Examples
    --------
    >>> s = ps.Series(['lama', 'cow', 'lama', 'beetle', 'lama',
    ...                'hippo'], name='animal')
    >>> s.isin(['cow', 'lama'])
    0     True
    1     True
    2     True
    3    False
    4     True
    5    False
    Name: animal, dtype: bool

    Passing a single string as ``s.isin('lama')`` will raise an error. Use
    a list of one element instead:

    >>> s.isin(['lama'])
    0     True
    1    False
    2     True
    3    False
    4     True
    5    False
    Name: animal, dtype: bool

    >>> s.rename("a").to_frame().set_index("a").index.isin(['lama'])  # doctest: +SKIP
    Index([True, False, True, False, True, False], dtype='bool', name='a')
    """
    if not is_list_like(values):
        raise TypeError(
            f"only list-like objects are allowed to be passed"
            f" to isin(), you passed a [{type(values).__name__}]"
        )

    # NumPy arrays are converted through ``tolist``; everything else through ``list``.
    if isinstance(values, np.ndarray):
        candidates = cast(np.ndarray, values).tolist()
    else:
        candidates = list(values)

    literals = [F.lit(candidate) for candidate in candidates]
    membership = self.spark.column.isin(literals)
    bool_field = self._internal.data_fields[0].copy(
        dtype=np.dtype("bool"), spark_type=BooleanType(), nullable=False
    )
    # A null membership result is reported as False so the column is never null.
    return self._with_new_scol(scol=F.coalesce(membership, F.lit(False)), field=bool_field)
972
+
973
+ def isnull(self: IndexOpsLike) -> IndexOpsLike:
974
+ """
975
+ Detect missing values.
976
+
977
+ Return a boolean same-sized object indicating if the values are NA.
978
+ NA values, such as None or numpy.NaN, get mapped to True values.
979
+ Everything else gets mapped to False values. Characters such as empty strings '' or
980
+ numpy.inf are not considered NA values
981
+ (unless you set pandas.options.mode.use_inf_as_na = True).
982
+
983
+ Returns
984
+ -------
985
+ Series or Index : Mask of bool values for each element in Series
986
+ that indicates whether an element is an NA value.
987
+
988
+ Examples
989
+ --------
990
+ >>> ser = ps.Series([5, 6, np.NaN])
991
+ >>> ser.isna() # doctest: +NORMALIZE_WHITESPACE
992
+ 0 False
993
+ 1 False
994
+ 2 True
995
+ dtype: bool
996
+
997
+ >>> ser.rename("a").to_frame().set_index("a").index.isna() # doctest: +SKIP
998
+ Index([False, False, True], dtype='bool', name='a')
999
+ """
1000
+ from pyspark.pandas.indexes import MultiIndex
1001
+
1002
+ if isinstance(self, MultiIndex):
1003
+ raise NotImplementedError("isna is not defined for MultiIndex")
1004
+
1005
+ return self._dtype_op.isnull(self)
1006
+
1007
+ isna = isnull
1008
+
1009
+ def notnull(self: IndexOpsLike) -> IndexOpsLike:
1010
+ """
1011
+ Detect existing (non-missing) values.
1012
+ Return a boolean same-sized object indicating if the values are not NA.
1013
+ Non-missing values get mapped to True.
1014
+ Characters such as empty strings '' or numpy.inf are not considered NA values
1015
+ (unless you set pandas.options.mode.use_inf_as_na = True).
1016
+ NA values, such as None or numpy.NaN, get mapped to False values.
1017
+
1018
+ Returns
1019
+ -------
1020
+ Series or Index : Mask of bool values for each element in Series
1021
+ that indicates whether an element is not an NA value.
1022
+
1023
+ Examples
1024
+ --------
1025
+ Show which entries in a Series are not NA.
1026
+
1027
+ >>> ser = ps.Series([5, 6, np.NaN])
1028
+ >>> ser
1029
+ 0 5.0
1030
+ 1 6.0
1031
+ 2 NaN
1032
+ dtype: float64
1033
+
1034
+ >>> ser.notna()
1035
+ 0 True
1036
+ 1 True
1037
+ 2 False
1038
+ dtype: bool
1039
+
1040
+ >>> ser.rename("a").to_frame().set_index("a").index.notna() # doctest: +SKIP
1041
+ Index([True, True, False], dtype='bool', name='a')
1042
+ """
1043
+ from pyspark.pandas.indexes import MultiIndex
1044
+
1045
+ if isinstance(self, MultiIndex):
1046
+ raise NotImplementedError("notna is not defined for MultiIndex")
1047
+ return (~self.isnull()).rename(self.name) # type: ignore[attr-defined]
1048
+
1049
+ notna = notnull
1050
+
1051
+ # TODO: axis and many arguments should be implemented.
1052
+ def all(self, axis: Axis = 0, skipna: bool = True) -> bool:
1053
+ """
1054
+ Return whether all elements are True.
1055
+
1056
+ Returns True unless there is at least one element within a series that is
1057
+ False or equivalent (e.g. zero or empty)
1058
+
1059
+ Parameters
1060
+ ----------
1061
+ axis : {0 or 'index'}, default 0
1062
+ Indicate which axis or axes should be reduced.
1063
+
1064
+ * 0 / 'index' : reduce the index, return a Series whose index is the
1065
+ original column labels.
1066
+
1067
+ skipna : boolean, default True
1068
+ Exclude NA values, such as None or numpy.NaN.
1069
+ If an entire row/column is NA values and `skipna` is True,
1070
+ then the result will be True, as for an empty row/column.
1071
+ If `skipna` is False, numpy.NaNs are treated as True because these are
1072
+ not equal to zero, Nones are treated as False.
1073
+
1074
+ Examples
1075
+ --------
1076
+ >>> ps.Series([True, True]).all()
1077
+ True
1078
+
1079
+ >>> ps.Series([True, False]).all()
1080
+ False
1081
+
1082
+ >>> ps.Series([0, 1]).all()
1083
+ False
1084
+
1085
+ >>> ps.Series([1, 2, 3]).all()
1086
+ True
1087
+
1088
+ >>> ps.Series([True, True, None]).all()
1089
+ True
1090
+
1091
+ >>> ps.Series([True, True, None]).all(skipna=False)
1092
+ False
1093
+
1094
+ >>> ps.Series([True, False, None]).all()
1095
+ False
1096
+
1097
+ >>> ps.Series([]).all()
1098
+ True
1099
+
1100
+ >>> ps.Series([np.nan]).all()
1101
+ True
1102
+
1103
+ >>> ps.Series([np.nan]).all(skipna=False)
1104
+ True
1105
+
1106
+ >>> ps.Series([None]).all()
1107
+ True
1108
+
1109
+ >>> ps.Series([None]).all(skipna=False)
1110
+ False
1111
+
1112
+ >>> df = ps.Series([True, False, None]).rename("a").to_frame()
1113
+ >>> df.set_index("a").index.all()
1114
+ False
1115
+ """
1116
+ axis = validate_axis(axis)
1117
+ if axis != 0:
1118
+ raise NotImplementedError('axis should be either 0 or "index" currently.')
1119
+
1120
+ sdf = self._internal.spark_frame.select(self.spark.column)
1121
+ col = scol_for(sdf, sdf.columns[0])
1122
+
1123
+ # `any` and `every` was added as of Spark 3.0.
1124
+ # ret = sdf.select(F.expr("every(CAST(`%s` AS BOOLEAN))" % sdf.columns[0])).collect()[0][0]
1125
+ # We use min as its alternative as below.
1126
+ if isinstance(self.spark.data_type, NumericType) or skipna:
1127
+ # np.nan takes no effect to the result; None takes no effect if `skipna`
1128
+ ret = sdf.select(F.min(F.coalesce(col.cast("boolean"), F.lit(True)))).collect()[0][0]
1129
+ else:
1130
+ # Take None as False when not `skipna`
1131
+ ret = sdf.select(
1132
+ F.min(F.when(col.isNull(), F.lit(False)).otherwise(col.cast("boolean")))
1133
+ ).collect()[0][0]
1134
+
1135
+ if ret is None:
1136
+ return True
1137
+ else:
1138
+ return ret
1139
+
1140
+ # TODO: axis, skipna, and many arguments should be implemented.
1141
def any(self, axis: Axis = 0) -> bool:
    """
    Report whether at least one element is True.

    The result is False unless some element of the series is True or
    equivalent (e.g. non-zero or non-empty).

    Parameters
    ----------
    axis : {0 or 'index'}, default 0
        Indicate which axis or axes should be reduced.

        * 0 / 'index' : reduce the index, return a Series whose index is the
          original column labels.

    Raises
    ------
    NotImplementedError
        If `axis` does not resolve to 0.

    Examples
    --------
    >>> ps.Series([False, False]).any()
    False

    >>> ps.Series([True, False]).any()
    True

    >>> ps.Series([0, 0]).any()
    False

    >>> ps.Series([0, 1, 2]).any()
    True

    >>> ps.Series([False, False, None]).any()
    False

    >>> ps.Series([True, False, None]).any()
    True

    >>> ps.Series([]).any()
    False

    >>> ps.Series([np.nan]).any()
    False

    >>> df = ps.Series([True, False, None]).rename("a").to_frame()
    >>> df.set_index("a").index.any()
    True
    """
    if validate_axis(axis) != 0:
        raise NotImplementedError('axis should be either 0 or "index" currently.')

    frame = self._internal.spark_frame.select(self.spark.column)
    as_boolean = scol_for(frame, frame.columns[0]).cast("boolean")

    # Note that we're ignoring `None`s here for now.
    # any and every was added as of Spark 3.0
    # ret = sdf.select(F.expr("any(CAST(`%s` AS BOOLEAN))" % sdf.columns[0])).collect()[0][0]
    # Here we use max as its alternative:
    found = frame.select(F.max(F.coalesce(as_boolean, F.lit(False)))).collect()[0][0]
    return False if found is None else found
1202
+
1203
+ # TODO: add freq and axis parameter
1204
+ def shift(
1205
+ self: IndexOpsLike, periods: int = 1, fill_value: Optional[Any] = None
1206
+ ) -> IndexOpsLike:
1207
+ """
1208
+ Shift Series/Index by desired number of periods.
1209
+
1210
+ .. note:: the current implementation of shift uses Spark's Window without
1211
+ specifying partition specification. This leads to moving all data into
1212
+ a single partition in a single machine and could cause serious
1213
+ performance degradation. Avoid this method with very large datasets.
1214
+
1215
+ Parameters
1216
+ ----------
1217
+ periods : int
1218
+ Number of periods to shift. Can be positive or negative.
1219
+ fill_value : object, optional
1220
+ The scalar value to use for newly introduced missing values.
1221
+ The default depends on the dtype of self. For numeric data, np.nan is used.
1222
+
1223
+ Returns
1224
+ -------
1225
+ Copy of input Series/Index, shifted.
1226
+
1227
+ Examples
1228
+ --------
1229
+ >>> df = ps.DataFrame({'Col1': [10, 20, 15, 30, 45],
1230
+ ... 'Col2': [13, 23, 18, 33, 48],
1231
+ ... 'Col3': [17, 27, 22, 37, 52]},
1232
+ ... columns=['Col1', 'Col2', 'Col3'])
1233
+
1234
+ >>> df.Col1.shift(periods=3)
1235
+ 0 NaN
1236
+ 1 NaN
1237
+ 2 NaN
1238
+ 3 10.0
1239
+ 4 20.0
1240
+ Name: Col1, dtype: float64
1241
+
1242
+ >>> df.Col2.shift(periods=3, fill_value=0)
1243
+ 0 0
1244
+ 1 0
1245
+ 2 0
1246
+ 3 13
1247
+ 4 23
1248
+ Name: Col2, dtype: int64
1249
+
1250
+ >>> df.index.shift(periods=3, fill_value=0) # doctest: +SKIP
1251
+ Int64Index([0, 0, 0, 0, 1], dtype='int64')
1252
+ """
1253
+ return self._shift(periods, fill_value).spark.analyzed
1254
+
1255
def _shift(
    self: IndexOpsLike,
    periods: int,
    fill_value: Any,
    *,
    part_cols: Sequence["ColumnOrName"] = (),
) -> IndexOpsLike:
    """
    Shift the values by `periods` rows, optionally within partitions.

    Parameters
    ----------
    periods : int
        Number of rows to shift by; ``0`` returns a copy of the object.
    fill_value : Any
        Value used wherever the shifted value is null or NaN.
    part_cols : sequence of column or column name, optional
        Columns passed to ``Window.partitionBy``.

    Raises
    ------
    TypeError
        If `periods` is not an ``int``.
    """
    if not isinstance(periods, int):
        raise TypeError("periods should be an int; however, got [%s]" % type(periods).__name__)

    if periods == 0:
        return self.copy()

    source = self.spark.column
    frame = (
        Window.partitionBy(*part_cols)
        .orderBy(NATURAL_ORDER_COLUMN_NAME)
        .rowsBetween(-periods, -periods)
    )
    shifted = F.lag(source, periods).over(frame)
    # Both null and NaN results of the lag are replaced by the fill value.
    needs_fill = shifted.isNull() | F.isnan(shifted)
    filled = F.when(needs_fill, fill_value).otherwise(shifted)
    nullable_field = self._internal.data_fields[0].copy(nullable=True)
    return self._with_new_scol(filled, field=nullable_field)
1277
+
1278
+ # TODO: Update Documentation for Bins Parameter when it's supported
1279
+ def value_counts(
1280
+ self,
1281
+ normalize: bool = False,
1282
+ sort: bool = True,
1283
+ ascending: bool = False,
1284
+ bins: None = None,
1285
+ dropna: bool = True,
1286
+ ) -> "Series":
1287
+ """
1288
+ Return a Series containing counts of unique values.
1289
+ The resulting object will be in descending order so that the
1290
+ first element is the most frequently-occurring element.
1291
+ Excludes NA values by default.
1292
+
1293
+ Parameters
1294
+ ----------
1295
+ normalize : boolean, default False
1296
+ If True then the object returned will contain the relative
1297
+ frequencies of the unique values.
1298
+ sort : boolean, default True
1299
+ Sort by values.
1300
+ ascending : boolean, default False
1301
+ Sort in ascending order.
1302
+ bins : Not Yet Supported
1303
+ dropna : boolean, default True
1304
+ Don't include counts of NaN.
1305
+
1306
+ Returns
1307
+ -------
1308
+ counts : Series
1309
+
1310
+ See Also
1311
+ --------
1312
+ Series.count: Number of non-NA elements in a Series.
1313
+
1314
+ Examples
1315
+ --------
1316
+ For Series
1317
+
1318
+ >>> df = ps.DataFrame({'x':[0, 0, 1, 1, 1, np.nan]})
1319
+ >>> df.x.value_counts() # doctest: +NORMALIZE_WHITESPACE
1320
+ 1.0 3
1321
+ 0.0 2
1322
+ Name: x, dtype: int64
1323
+
1324
+ With `normalize` set to `True`, returns the relative frequency by
1325
+ dividing all values by the sum of values.
1326
+
1327
+ >>> df.x.value_counts(normalize=True) # doctest: +NORMALIZE_WHITESPACE
1328
+ 1.0 0.6
1329
+ 0.0 0.4
1330
+ Name: x, dtype: float64
1331
+
1332
+ **dropna**
1333
+ With `dropna` set to `False` we can also see NaN index values.
1334
+
1335
+ >>> df.x.value_counts(dropna=False) # doctest: +NORMALIZE_WHITESPACE
1336
+ 1.0 3
1337
+ 0.0 2
1338
+ NaN 1
1339
+ Name: x, dtype: int64
1340
+
1341
+ For Index
1342
+
1343
+ >>> idx = ps.Index([3, 1, 2, 3, 4, np.nan])
1344
+ >>> idx # doctest: +SKIP
1345
+ Float64Index([3.0, 1.0, 2.0, 3.0, 4.0, nan], dtype='float64')
1346
+
1347
+ >>> idx.value_counts().sort_index()
1348
+ 1.0 1
1349
+ 2.0 1
1350
+ 3.0 2
1351
+ 4.0 1
1352
+ dtype: int64
1353
+
1354
+ **sort**
1355
+
1356
+ With `sort` set to `False`, the result wouldn't be sorted by count.
1357
+
1358
+ >>> idx.value_counts(sort=False).sort_index()
1359
+ 1.0 1
1360
+ 2.0 1
1361
+ 3.0 2
1362
+ 4.0 1
1363
+ dtype: int64
1364
+
1365
+ **normalize**
1366
+
1367
+ With `normalize` set to `True`, returns the relative frequency by
1368
+ dividing all values by the sum of values.
1369
+
1370
+ >>> idx.value_counts(normalize=True).sort_index()
1371
+ 1.0 0.2
1372
+ 2.0 0.2
1373
+ 3.0 0.4
1374
+ 4.0 0.2
1375
+ dtype: float64
1376
+
1377
+ **dropna**
1378
+
1379
+ With `dropna` set to `False` we can also see NaN index values.
1380
+
1381
+ >>> idx.value_counts(dropna=False).sort_index() # doctest: +SKIP
1382
+ 1.0 1
1383
+ 2.0 1
1384
+ 3.0 2
1385
+ 4.0 1
1386
+ NaN 1
1387
+ dtype: int64
1388
+
1389
+ For MultiIndex.
1390
+
1391
+ >>> midx = pd.MultiIndex([['lama', 'cow', 'falcon'],
1392
+ ... ['speed', 'weight', 'length']],
1393
+ ... [[0, 0, 0, 1, 1, 1, 2, 2, 2],
1394
+ ... [1, 1, 1, 1, 1, 2, 1, 2, 2]])
1395
+ >>> s = ps.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=midx)
1396
+ >>> s.index # doctest: +SKIP
1397
+ MultiIndex([( 'lama', 'weight'),
1398
+ ( 'lama', 'weight'),
1399
+ ( 'lama', 'weight'),
1400
+ ( 'cow', 'weight'),
1401
+ ( 'cow', 'weight'),
1402
+ ( 'cow', 'length'),
1403
+ ('falcon', 'weight'),
1404
+ ('falcon', 'length'),
1405
+ ('falcon', 'length')],
1406
+ )
1407
+
1408
+ >>> s.index.value_counts().sort_index()
1409
+ (cow, length) 1
1410
+ (cow, weight) 2
1411
+ (falcon, length) 2
1412
+ (falcon, weight) 1
1413
+ (lama, weight) 3
1414
+ dtype: int64
1415
+
1416
+ >>> s.index.value_counts(normalize=True).sort_index()
1417
+ (cow, length) 0.111111
1418
+ (cow, weight) 0.222222
1419
+ (falcon, length) 0.222222
1420
+ (falcon, weight) 0.111111
1421
+ (lama, weight) 0.333333
1422
+ dtype: float64
1423
+
1424
+ If Index has name, keep the name up.
1425
+
1426
+ >>> idx = ps.Index([0, 0, 0, 1, 1, 2, 3], name='pandas-on-Spark')
1427
+ >>> idx.value_counts().sort_index()
1428
+ 0 3
1429
+ 1 2
1430
+ 2 1
1431
+ 3 1
1432
+ Name: pandas-on-Spark, dtype: int64
1433
+ """
1434
+ from pyspark.pandas.series import first_series, Series
1435
+
1436
+ if isinstance(self, Series):
1437
+ warnings.warn(
1438
+ "The resulting Series will have a fixed name of 'count' from 4.0.0.",
1439
+ FutureWarning,
1440
+ )
1441
+
1442
+ if bins is not None:
1443
+ raise NotImplementedError("value_counts currently does not support bins")
1444
+
1445
+ if dropna:
1446
+ sdf_dropna = self._internal.spark_frame.select(self.spark.column).dropna()
1447
+ else:
1448
+ sdf_dropna = self._internal.spark_frame.select(self.spark.column)
1449
+ index_name = SPARK_DEFAULT_INDEX_NAME
1450
+ column_name = self._internal.data_spark_column_names[0]
1451
+ sdf = sdf_dropna.groupby(scol_for(sdf_dropna, column_name).alias(index_name)).count()
1452
+ if sort:
1453
+ if ascending:
1454
+ sdf = sdf.orderBy(F.col("count"))
1455
+ else:
1456
+ sdf = sdf.orderBy(F.col("count").desc())
1457
+
1458
+ if normalize:
1459
+ drop_sum = sdf_dropna.count()
1460
+ sdf = sdf.withColumn("count", F.col("count") / F.lit(drop_sum))
1461
+
1462
+ internal = InternalFrame(
1463
+ spark_frame=sdf,
1464
+ index_spark_columns=[scol_for(sdf, index_name)],
1465
+ column_labels=self._internal.column_labels,
1466
+ data_spark_columns=[scol_for(sdf, "count")],
1467
+ column_label_names=self._internal.column_label_names,
1468
+ )
1469
+
1470
+ return first_series(DataFrame(internal))
1471
+
1472
def nunique(self, dropna: bool = True, approx: bool = False, rsd: float = 0.05) -> int:
    """
    Count the distinct elements held by the object.

    Missing values are left out of the count unless ``dropna`` is False.

    Parameters
    ----------
    dropna : bool, default True
        When True, NaN is not counted as a distinct value.
    approx: bool, default False
        When False, the exact algorithm is used and the exact number of unique
        values is returned.
        When True, the HyperLogLog approximate algorithm is used, which is
        significantly faster for large amounts of data.
        Note: This parameter is specific to pandas-on-Spark and is not found in pandas.
    rsd: float, default 0.05
        Maximum estimation error allowed in the HyperLogLog algorithm.
        Note: Just like ``approx`` this parameter is specific to pandas-on-Spark.

    Returns
    -------
    int

    See Also
    --------
    DataFrame.nunique: Method nunique for DataFrame.
    Series.count: Count non-NA/null observations in the Series.

    Examples
    --------
    >>> ps.Series([1, 2, 3, np.nan]).nunique()
    3

    >>> ps.Series([1, 2, 3, np.nan]).nunique(dropna=False)
    4

    On big data, we recommend using the approximate algorithm to speed up this function.
    The result will be very close to the exact unique count.

    >>> ps.Series([1, 2, 3, np.nan]).nunique(approx=True)
    3

    >>> idx = ps.Index([1, 1, 2, None])
    >>> idx  # doctest: +SKIP
    Float64Index([1.0, 1.0, 2.0, nan], dtype='float64')

    >>> idx.nunique()
    2

    >>> idx.nunique(dropna=False)
    3
    """
    # Build the aggregate expression first, then evaluate it as a one-row,
    # one-column query and unwrap the single scalar.
    distinct_count_col = self._nunique(dropna, approx, rsd)
    rows = self._internal.spark_frame.select([distinct_count_col]).collect()
    return rows[0][0]
1525
+
1526
def _nunique(self, dropna: bool = True, approx: bool = False, rsd: float = 0.05) -> Column:
    """
    Build the Spark aggregate column that computes the number of distinct values.

    Parameters
    ----------
    dropna : bool, default True
        When False, one is added to the distinct count if at least one null
        value is present in the column.
    approx : bool, default False
        When True, ``F.approx_count_distinct`` (with ``rsd``) is used instead of
        ``F.countDistinct``.
    rsd : float, default 0.05
        Forwarded to ``F.approx_count_distinct``; unused when ``approx`` is False.

    Returns
    -------
    Column
        The aggregate expression, aliased to the name of the first data column.
    """
    output_name = self._internal.data_spark_column_names[0]

    if approx:
        distinct_fn = cast(
            Callable[[Column], Column], partial(F.approx_count_distinct, rsd=rsd)
        )
    else:
        distinct_fn = cast(Callable[[Column], Column], F.countDistinct)

    distinct_count = distinct_fn(self.spark.column)
    if not dropna:
        # Count the null rows; if there is at least one, nulls contribute one
        # extra distinct value on top of the count computed above.
        null_marker = F.when(self.spark.column.isNull(), 1).otherwise(None)
        has_null = F.count(null_marker) >= 1
        distinct_count = distinct_count + F.when(has_null, 1).otherwise(0)

    return distinct_count.alias(output_name)
1541
+
1542
def take(self: IndexOpsLike, indices: Sequence[int]) -> IndexOpsLike:
    """
    Select elements by their *positional* indices along an axis.

    Selection is made by the actual position of each element in the object,
    not by the values stored in the object's index attribute.

    Parameters
    ----------
    indices : array-like
        An array of ints indicating which positions to take.

    Returns
    -------
    taken : same type as caller
        An array-like containing the elements taken from the object.

    Raises
    ------
    TypeError
        If ``indices`` is not list-like, or is a dict or a set.

    See Also
    --------
    DataFrame.loc : Select a subset of a DataFrame by labels.
    DataFrame.iloc : Select a subset of a DataFrame by positions.
    numpy.take : Take elements from an array along an axis.

    Examples
    --------

    Series

    >>> psser = ps.Series([100, 200, 300, 400, 500])
    >>> psser
    0    100
    1    200
    2    300
    3    400
    4    500
    dtype: int64

    >>> psser.take([0, 2, 4]).sort_index()
    0    100
    2    300
    4    500
    dtype: int64

    Index

    >>> psidx = ps.Index([100, 200, 300, 400, 500])
    >>> psidx  # doctest: +SKIP
    Int64Index([100, 200, 300, 400, 500], dtype='int64')

    >>> psidx.take([0, 2, 4]).sort_values()  # doctest: +SKIP
    Int64Index([100, 300, 500], dtype='int64')

    MultiIndex

    >>> psmidx = ps.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("x", "c")])
    >>> psmidx  # doctest: +SKIP
    MultiIndex([('x', 'a'),
                ('x', 'b'),
                ('x', 'c')],
               )

    >>> psmidx.take([0, 2])  # doctest: +SKIP
    MultiIndex([('x', 'a'),
                ('x', 'c')],
               )
    """
    acceptable = is_list_like(indices) and not isinstance(indices, (dict, set))
    if not acceptable:
        raise TypeError("`indices` must be a list-like except dict or set")

    if not isinstance(self, ps.Series):
        # Non-Series callers go through the backing frame's positional
        # indexer and hand back the index of the selected rows.
        return cast(IndexOpsLike, self._psdf.iloc[indices].index)
    return cast(IndexOpsLike, self.iloc[indices])
1615
+
1616
def factorize(
    self: IndexOpsLike, sort: bool = True, na_sentinel: Optional[int] = -1
) -> Tuple[IndexOpsLike, pd.Index]:
    """
    Encode the object as an enumerated type or categorical variable.

    This method is useful for obtaining a numeric representation of an
    array when all that matters is identifying distinct values.

    Parameters
    ----------
    sort : bool, default True
        Only ``True`` is accepted; any other value fails an assertion.
    na_sentinel : int or None, default -1
        Value to mark "not found". If None, will not drop the NaN
        from the uniques of the values.

        .. deprecated:: 3.4.0

    Returns
    -------
    codes : Series or Index
        A Series or Index that's an indexer into `uniques`.
        ``uniques.take(codes)`` will have the same values as `values`.
    uniques : pd.Index
        The unique valid values.

        .. note ::

           Unless `na_sentinel` is None, `uniques` will *not* contain an
           entry for a missing value even if there's one in `values`.

    Raises
    ------
    ValueError
        If the 'compute.max_rows' option is set and the number of distinct
        values exceeds it.

    Examples
    --------
    >>> psser = ps.Series(['b', None, 'a', 'c', 'b'])
    >>> codes, uniques = psser.factorize()
    >>> codes
    0    1
    1   -1
    2    0
    3    2
    4    1
    dtype: int32
    >>> uniques
    Index(['a', 'b', 'c'], dtype='object')

    >>> codes, uniques = psser.factorize(na_sentinel=None)
    >>> codes
    0    1
    1    3
    2    0
    3    2
    4    1
    dtype: int32
    >>> uniques
    Index(['a', 'b', 'c', None], dtype='object')

    >>> codes, uniques = psser.factorize(na_sentinel=-2)
    >>> codes
    0    1
    1   -2
    2    0
    3    2
    4    1
    dtype: int32
    >>> uniques
    Index(['a', 'b', 'c'], dtype='object')

    For Index:

    >>> psidx = ps.Index(['b', None, 'a', 'c', 'b'])
    >>> codes, uniques = psidx.factorize()
    >>> codes  # doctest: +SKIP
    Int64Index([1, -1, 0, 2, 1], dtype='int64')
    >>> uniques
    Index(['a', 'b', 'c'], dtype='object')
    """
    from pyspark.pandas.series import first_series

    assert (na_sentinel is None) or isinstance(na_sentinel, int)
    assert sort is True

    warnings.warn(
        "Argument `na_sentinel` will be removed in 4.0.0.",
        FutureWarning,
    )

    if isinstance(self.dtype, CategoricalDtype):
        # Map each category code back to its category value, factorize the
        # resulting plain values, then restore the categorical dtype on uniques.
        categories = self.dtype.categories
        if len(categories) == 0:
            scol = F.lit(None)
        else:
            kvs = list(
                chain(
                    *[
                        (F.lit(code), F.lit(category))
                        for code, category in enumerate(categories)
                    ]
                )
            )
            map_scol = F.create_map(*kvs)
            scol = map_scol[self.spark.column]
        codes, uniques = self._with_new_scol(
            scol.alias(self._internal.data_spark_column_names[0])
        ).factorize(na_sentinel=na_sentinel)
        return codes, uniques.astype(self.dtype)

    uniq_sdf = self._internal.spark_frame.select(self.spark.column).distinct()

    # Check number of uniques and constructs sorted `uniques_list`
    max_compute_count = get_option("compute.max_rows")
    if max_compute_count is not None:
        # Fetch one row more than the limit so that exceeding it is detectable.
        uniq_pdf = uniq_sdf.limit(max_compute_count + 1).toPandas()
        if len(uniq_pdf) > max_compute_count:
            raise ValueError(
                "Current Series has more then {0} unique values. "
                "Please set 'compute.max_rows' by using 'pyspark.pandas.config.set_option' "
                "to more than {0} rows. Note that, before changing the "
                "'compute.max_rows', this operation is considerably expensive.".format(
                    max_compute_count
                )
            )
    else:
        uniq_pdf = uniq_sdf.toPandas()
    # pandas takes both NaN and null in Spark to np.nan, so de-duplication is required
    uniq_series = first_series(uniq_pdf).drop_duplicates()
    uniques_list = uniq_series.tolist()
    # Missing values sort after every valid value.
    uniques_list = sorted(uniques_list, key=lambda x: (pd.isna(x), x))

    # Constructs `unique_to_code` mapping non-na unique to code
    unique_to_code = {}
    if na_sentinel is not None:
        na_sentinel_code = na_sentinel
    else:
        # Default for data without any missing value (including an empty
        # object): the code a missing value would have received. Without this
        # the name stays unbound and the lookups below raise UnboundLocalError.
        # It is overwritten in the loop when a missing value is present.
        na_sentinel_code = len(uniques_list)
    code = 0
    for unique in uniques_list:
        if pd.isna(unique):
            if na_sentinel is None:
                na_sentinel_code = code
        else:
            unique_to_code[unique] = code
        code += 1

    kvs = list(
        chain(*([(F.lit(unique), F.lit(code)) for unique, code in unique_to_code.items()]))
    )

    if len(kvs) == 0:  # uniques are all missing values
        new_scol = F.lit(na_sentinel_code)
    else:
        map_scol = F.create_map(*kvs)
        null_scol = F.when(self.isnull().spark.column, F.lit(na_sentinel_code))
        new_scol = null_scol.otherwise(map_scol[self.spark.column])

    codes = self._with_new_scol(new_scol.alias(self._internal.data_spark_column_names[0]))

    if na_sentinel is not None:
        # Drops the NaN from the uniques of the values
        uniques_list = [x for x in uniques_list if not pd.isna(x)]

    uniques = pd.Index(uniques_list)

    return codes, uniques
1777
+
1778
+
1779
def _test() -> None:
    """
    Run this module's doctests against a local Spark session.

    The working directory is switched to ``SPARK_HOME`` first. The process
    exits with status -1 when any doctest fails.
    """
    import os
    import doctest
    import sys
    from pyspark.sql import SparkSession
    import pyspark.pandas.base

    os.chdir(os.environ["SPARK_HOME"])

    # Doctests run with the module namespace plus `ps` bound to pyspark.pandas.
    doctest_globals = pyspark.pandas.base.__dict__.copy()
    doctest_globals["ps"] = pyspark.pandas

    builder = SparkSession.builder.master("local[4]").appName("pyspark.pandas.base tests")
    session = builder.getOrCreate()

    results = doctest.testmod(
        pyspark.pandas.base,
        globs=doctest_globals,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
    )
    session.stop()

    if results.failed:
        sys.exit(-1)


if __name__ == "__main__":
    _test()