snowpark-connect 0.20.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of snowpark-connect might be problematic. Click here for more details.

Files changed (879) hide show
  1. snowflake/snowpark_connect/__init__.py +23 -0
  2. snowflake/snowpark_connect/analyze_plan/__init__.py +3 -0
  3. snowflake/snowpark_connect/analyze_plan/map_tree_string.py +38 -0
  4. snowflake/snowpark_connect/column_name_handler.py +735 -0
  5. snowflake/snowpark_connect/config.py +576 -0
  6. snowflake/snowpark_connect/constants.py +47 -0
  7. snowflake/snowpark_connect/control_server.py +52 -0
  8. snowflake/snowpark_connect/dataframe_name_handler.py +54 -0
  9. snowflake/snowpark_connect/date_time_format_mapping.py +399 -0
  10. snowflake/snowpark_connect/empty_dataframe.py +18 -0
  11. snowflake/snowpark_connect/error/__init__.py +11 -0
  12. snowflake/snowpark_connect/error/error_mapping.py +6174 -0
  13. snowflake/snowpark_connect/error/error_utils.py +321 -0
  14. snowflake/snowpark_connect/error/exceptions.py +24 -0
  15. snowflake/snowpark_connect/execute_plan/__init__.py +3 -0
  16. snowflake/snowpark_connect/execute_plan/map_execution_command.py +204 -0
  17. snowflake/snowpark_connect/execute_plan/map_execution_root.py +173 -0
  18. snowflake/snowpark_connect/execute_plan/utils.py +183 -0
  19. snowflake/snowpark_connect/expression/__init__.py +3 -0
  20. snowflake/snowpark_connect/expression/literal.py +90 -0
  21. snowflake/snowpark_connect/expression/map_cast.py +343 -0
  22. snowflake/snowpark_connect/expression/map_expression.py +293 -0
  23. snowflake/snowpark_connect/expression/map_extension.py +104 -0
  24. snowflake/snowpark_connect/expression/map_sql_expression.py +633 -0
  25. snowflake/snowpark_connect/expression/map_udf.py +142 -0
  26. snowflake/snowpark_connect/expression/map_unresolved_attribute.py +241 -0
  27. snowflake/snowpark_connect/expression/map_unresolved_extract_value.py +85 -0
  28. snowflake/snowpark_connect/expression/map_unresolved_function.py +9450 -0
  29. snowflake/snowpark_connect/expression/map_unresolved_star.py +218 -0
  30. snowflake/snowpark_connect/expression/map_update_fields.py +164 -0
  31. snowflake/snowpark_connect/expression/map_window_function.py +258 -0
  32. snowflake/snowpark_connect/expression/typer.py +125 -0
  33. snowflake/snowpark_connect/includes/__init__.py +0 -0
  34. snowflake/snowpark_connect/includes/jars/antlr4-runtime-4.9.3.jar +0 -0
  35. snowflake/snowpark_connect/includes/jars/commons-cli-1.5.0.jar +0 -0
  36. snowflake/snowpark_connect/includes/jars/commons-codec-1.16.1.jar +0 -0
  37. snowflake/snowpark_connect/includes/jars/commons-collections-3.2.2.jar +0 -0
  38. snowflake/snowpark_connect/includes/jars/commons-collections4-4.4.jar +0 -0
  39. snowflake/snowpark_connect/includes/jars/commons-compiler-3.1.9.jar +0 -0
  40. snowflake/snowpark_connect/includes/jars/commons-compress-1.26.0.jar +0 -0
  41. snowflake/snowpark_connect/includes/jars/commons-crypto-1.1.0.jar +0 -0
  42. snowflake/snowpark_connect/includes/jars/commons-dbcp-1.4.jar +0 -0
  43. snowflake/snowpark_connect/includes/jars/commons-io-2.16.1.jar +0 -0
  44. snowflake/snowpark_connect/includes/jars/commons-lang-2.6.jar +0 -0
  45. snowflake/snowpark_connect/includes/jars/commons-lang3-3.12.0.jar +0 -0
  46. snowflake/snowpark_connect/includes/jars/commons-logging-1.1.3.jar +0 -0
  47. snowflake/snowpark_connect/includes/jars/commons-math3-3.6.1.jar +0 -0
  48. snowflake/snowpark_connect/includes/jars/commons-pool-1.5.4.jar +0 -0
  49. snowflake/snowpark_connect/includes/jars/commons-text-1.10.0.jar +0 -0
  50. snowflake/snowpark_connect/includes/jars/hadoop-client-api-3.3.4.jar +0 -0
  51. snowflake/snowpark_connect/includes/jars/jackson-annotations-2.15.2.jar +0 -0
  52. snowflake/snowpark_connect/includes/jars/jackson-core-2.15.2.jar +0 -0
  53. snowflake/snowpark_connect/includes/jars/jackson-core-asl-1.9.13.jar +0 -0
  54. snowflake/snowpark_connect/includes/jars/jackson-databind-2.15.2.jar +0 -0
  55. snowflake/snowpark_connect/includes/jars/jackson-dataformat-yaml-2.15.2.jar +0 -0
  56. snowflake/snowpark_connect/includes/jars/jackson-datatype-jsr310-2.15.2.jar +0 -0
  57. snowflake/snowpark_connect/includes/jars/jackson-mapper-asl-1.9.13.jar +0 -0
  58. snowflake/snowpark_connect/includes/jars/jackson-module-scala_2.12-2.15.2.jar +0 -0
  59. snowflake/snowpark_connect/includes/jars/json4s-ast_2.12-3.7.0-M11.jar +0 -0
  60. snowflake/snowpark_connect/includes/jars/json4s-core_2.12-3.7.0-M11.jar +0 -0
  61. snowflake/snowpark_connect/includes/jars/json4s-jackson_2.12-3.7.0-M11.jar +0 -0
  62. snowflake/snowpark_connect/includes/jars/json4s-scalap_2.12-3.7.0-M11.jar +0 -0
  63. snowflake/snowpark_connect/includes/jars/kryo-shaded-4.0.2.jar +0 -0
  64. snowflake/snowpark_connect/includes/jars/log4j-1.2-api-2.20.0.jar +0 -0
  65. snowflake/snowpark_connect/includes/jars/log4j-api-2.20.0.jar +0 -0
  66. snowflake/snowpark_connect/includes/jars/log4j-core-2.20.0.jar +0 -0
  67. snowflake/snowpark_connect/includes/jars/log4j-slf4j2-impl-2.20.0.jar +0 -0
  68. snowflake/snowpark_connect/includes/jars/paranamer-2.8.jar +0 -0
  69. snowflake/snowpark_connect/includes/jars/scala-collection-compat_2.12-2.7.0.jar +0 -0
  70. snowflake/snowpark_connect/includes/jars/scala-compiler-2.12.18.jar +0 -0
  71. snowflake/snowpark_connect/includes/jars/scala-library-2.12.18.jar +0 -0
  72. snowflake/snowpark_connect/includes/jars/scala-parser-combinators_2.12-2.3.0.jar +0 -0
  73. snowflake/snowpark_connect/includes/jars/scala-reflect-2.12.18.jar +0 -0
  74. snowflake/snowpark_connect/includes/jars/scala-xml_2.12-2.1.0.jar +0 -0
  75. snowflake/snowpark_connect/includes/jars/slf4j-api-2.0.7.jar +0 -0
  76. snowflake/snowpark_connect/includes/jars/spark-catalyst_2.12-3.5.6.jar +0 -0
  77. snowflake/snowpark_connect/includes/jars/spark-common-utils_2.12-3.5.6.jar +0 -0
  78. snowflake/snowpark_connect/includes/jars/spark-core_2.12-3.5.6.jar +0 -0
  79. snowflake/snowpark_connect/includes/jars/spark-graphx_2.12-3.5.6.jar +0 -0
  80. snowflake/snowpark_connect/includes/jars/spark-hive-thriftserver_2.12-3.5.6.jar +0 -0
  81. snowflake/snowpark_connect/includes/jars/spark-hive_2.12-3.5.6.jar +0 -0
  82. snowflake/snowpark_connect/includes/jars/spark-kubernetes_2.12-3.5.6.jar +0 -0
  83. snowflake/snowpark_connect/includes/jars/spark-kvstore_2.12-3.5.6.jar +0 -0
  84. snowflake/snowpark_connect/includes/jars/spark-launcher_2.12-3.5.6.jar +0 -0
  85. snowflake/snowpark_connect/includes/jars/spark-mesos_2.12-3.5.6.jar +0 -0
  86. snowflake/snowpark_connect/includes/jars/spark-mllib-local_2.12-3.5.6.jar +0 -0
  87. snowflake/snowpark_connect/includes/jars/spark-mllib_2.12-3.5.6.jar +0 -0
  88. snowflake/snowpark_connect/includes/jars/spark-network-common_2.12-3.5.6.jar +0 -0
  89. snowflake/snowpark_connect/includes/jars/spark-network-shuffle_2.12-3.5.6.jar +0 -0
  90. snowflake/snowpark_connect/includes/jars/spark-repl_2.12-3.5.6.jar +0 -0
  91. snowflake/snowpark_connect/includes/jars/spark-sketch_2.12-3.5.6.jar +0 -0
  92. snowflake/snowpark_connect/includes/jars/spark-sql-api_2.12-3.5.6.jar +0 -0
  93. snowflake/snowpark_connect/includes/jars/spark-sql_2.12-3.5.6.jar +0 -0
  94. snowflake/snowpark_connect/includes/jars/spark-streaming_2.12-3.5.6.jar +0 -0
  95. snowflake/snowpark_connect/includes/jars/spark-tags_2.12-3.5.6.jar +0 -0
  96. snowflake/snowpark_connect/includes/jars/spark-unsafe_2.12-3.5.6.jar +0 -0
  97. snowflake/snowpark_connect/includes/jars/spark-yarn_2.12-3.5.6.jar +0 -0
  98. snowflake/snowpark_connect/includes/python/__init__.py +21 -0
  99. snowflake/snowpark_connect/includes/python/pyspark/__init__.py +173 -0
  100. snowflake/snowpark_connect/includes/python/pyspark/_globals.py +71 -0
  101. snowflake/snowpark_connect/includes/python/pyspark/_typing.pyi +43 -0
  102. snowflake/snowpark_connect/includes/python/pyspark/accumulators.py +341 -0
  103. snowflake/snowpark_connect/includes/python/pyspark/broadcast.py +383 -0
  104. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/__init__.py +8 -0
  105. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/cloudpickle.py +948 -0
  106. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/cloudpickle_fast.py +844 -0
  107. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/compat.py +18 -0
  108. snowflake/snowpark_connect/includes/python/pyspark/conf.py +276 -0
  109. snowflake/snowpark_connect/includes/python/pyspark/context.py +2601 -0
  110. snowflake/snowpark_connect/includes/python/pyspark/daemon.py +218 -0
  111. snowflake/snowpark_connect/includes/python/pyspark/errors/__init__.py +70 -0
  112. snowflake/snowpark_connect/includes/python/pyspark/errors/error_classes.py +889 -0
  113. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/__init__.py +16 -0
  114. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/base.py +228 -0
  115. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/captured.py +307 -0
  116. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/connect.py +190 -0
  117. snowflake/snowpark_connect/includes/python/pyspark/errors/tests/__init__.py +16 -0
  118. snowflake/snowpark_connect/includes/python/pyspark/errors/tests/test_errors.py +60 -0
  119. snowflake/snowpark_connect/includes/python/pyspark/errors/utils.py +116 -0
  120. snowflake/snowpark_connect/includes/python/pyspark/files.py +165 -0
  121. snowflake/snowpark_connect/includes/python/pyspark/find_spark_home.py +95 -0
  122. snowflake/snowpark_connect/includes/python/pyspark/install.py +203 -0
  123. snowflake/snowpark_connect/includes/python/pyspark/instrumentation_utils.py +190 -0
  124. snowflake/snowpark_connect/includes/python/pyspark/java_gateway.py +248 -0
  125. snowflake/snowpark_connect/includes/python/pyspark/join.py +118 -0
  126. snowflake/snowpark_connect/includes/python/pyspark/ml/__init__.py +71 -0
  127. snowflake/snowpark_connect/includes/python/pyspark/ml/_typing.pyi +84 -0
  128. snowflake/snowpark_connect/includes/python/pyspark/ml/base.py +414 -0
  129. snowflake/snowpark_connect/includes/python/pyspark/ml/classification.py +4332 -0
  130. snowflake/snowpark_connect/includes/python/pyspark/ml/clustering.py +2188 -0
  131. snowflake/snowpark_connect/includes/python/pyspark/ml/common.py +146 -0
  132. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/__init__.py +44 -0
  133. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/base.py +346 -0
  134. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/classification.py +382 -0
  135. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/evaluation.py +291 -0
  136. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/feature.py +258 -0
  137. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/functions.py +77 -0
  138. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/io_utils.py +335 -0
  139. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/pipeline.py +262 -0
  140. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/summarizer.py +120 -0
  141. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/tuning.py +579 -0
  142. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/util.py +173 -0
  143. snowflake/snowpark_connect/includes/python/pyspark/ml/deepspeed/__init__.py +16 -0
  144. snowflake/snowpark_connect/includes/python/pyspark/ml/deepspeed/deepspeed_distributor.py +165 -0
  145. snowflake/snowpark_connect/includes/python/pyspark/ml/deepspeed/tests/test_deepspeed_distributor.py +306 -0
  146. snowflake/snowpark_connect/includes/python/pyspark/ml/dl_util.py +150 -0
  147. snowflake/snowpark_connect/includes/python/pyspark/ml/evaluation.py +1166 -0
  148. snowflake/snowpark_connect/includes/python/pyspark/ml/feature.py +7474 -0
  149. snowflake/snowpark_connect/includes/python/pyspark/ml/fpm.py +543 -0
  150. snowflake/snowpark_connect/includes/python/pyspark/ml/functions.py +842 -0
  151. snowflake/snowpark_connect/includes/python/pyspark/ml/image.py +271 -0
  152. snowflake/snowpark_connect/includes/python/pyspark/ml/linalg/__init__.py +1382 -0
  153. snowflake/snowpark_connect/includes/python/pyspark/ml/model_cache.py +55 -0
  154. snowflake/snowpark_connect/includes/python/pyspark/ml/param/__init__.py +602 -0
  155. snowflake/snowpark_connect/includes/python/pyspark/ml/param/_shared_params_code_gen.py +368 -0
  156. snowflake/snowpark_connect/includes/python/pyspark/ml/param/shared.py +878 -0
  157. snowflake/snowpark_connect/includes/python/pyspark/ml/pipeline.py +451 -0
  158. snowflake/snowpark_connect/includes/python/pyspark/ml/recommendation.py +748 -0
  159. snowflake/snowpark_connect/includes/python/pyspark/ml/regression.py +3335 -0
  160. snowflake/snowpark_connect/includes/python/pyspark/ml/stat.py +523 -0
  161. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/__init__.py +16 -0
  162. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_classification.py +53 -0
  163. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_evaluation.py +50 -0
  164. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_feature.py +43 -0
  165. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_function.py +114 -0
  166. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_pipeline.py +47 -0
  167. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_summarizer.py +43 -0
  168. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_tuning.py +46 -0
  169. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_classification.py +238 -0
  170. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_evaluation.py +194 -0
  171. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_feature.py +156 -0
  172. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_pipeline.py +184 -0
  173. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_summarizer.py +78 -0
  174. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_tuning.py +292 -0
  175. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_parity_torch_data_loader.py +50 -0
  176. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_parity_torch_distributor.py +152 -0
  177. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_algorithms.py +456 -0
  178. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_base.py +96 -0
  179. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_dl_util.py +186 -0
  180. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_evaluation.py +77 -0
  181. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_feature.py +401 -0
  182. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_functions.py +528 -0
  183. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_image.py +82 -0
  184. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_linalg.py +409 -0
  185. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_model_cache.py +55 -0
  186. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_param.py +441 -0
  187. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_persistence.py +546 -0
  188. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_pipeline.py +71 -0
  189. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_stat.py +52 -0
  190. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_training_summary.py +494 -0
  191. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_util.py +85 -0
  192. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_wrapper.py +138 -0
  193. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/__init__.py +16 -0
  194. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_basic.py +151 -0
  195. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_nested.py +97 -0
  196. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_pipeline.py +143 -0
  197. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tuning.py +551 -0
  198. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_basic.py +137 -0
  199. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_nested.py +96 -0
  200. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_pipeline.py +142 -0
  201. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/__init__.py +16 -0
  202. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/data.py +100 -0
  203. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/distributor.py +1133 -0
  204. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/log_communication.py +198 -0
  205. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/__init__.py +16 -0
  206. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_data_loader.py +137 -0
  207. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_distributor.py +561 -0
  208. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_log_communication.py +172 -0
  209. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/torch_run_process_wrapper.py +83 -0
  210. snowflake/snowpark_connect/includes/python/pyspark/ml/tree.py +434 -0
  211. snowflake/snowpark_connect/includes/python/pyspark/ml/tuning.py +1741 -0
  212. snowflake/snowpark_connect/includes/python/pyspark/ml/util.py +749 -0
  213. snowflake/snowpark_connect/includes/python/pyspark/ml/wrapper.py +465 -0
  214. snowflake/snowpark_connect/includes/python/pyspark/mllib/__init__.py +44 -0
  215. snowflake/snowpark_connect/includes/python/pyspark/mllib/_typing.pyi +33 -0
  216. snowflake/snowpark_connect/includes/python/pyspark/mllib/classification.py +989 -0
  217. snowflake/snowpark_connect/includes/python/pyspark/mllib/clustering.py +1318 -0
  218. snowflake/snowpark_connect/includes/python/pyspark/mllib/common.py +174 -0
  219. snowflake/snowpark_connect/includes/python/pyspark/mllib/evaluation.py +691 -0
  220. snowflake/snowpark_connect/includes/python/pyspark/mllib/feature.py +1085 -0
  221. snowflake/snowpark_connect/includes/python/pyspark/mllib/fpm.py +233 -0
  222. snowflake/snowpark_connect/includes/python/pyspark/mllib/linalg/__init__.py +1653 -0
  223. snowflake/snowpark_connect/includes/python/pyspark/mllib/linalg/distributed.py +1662 -0
  224. snowflake/snowpark_connect/includes/python/pyspark/mllib/random.py +698 -0
  225. snowflake/snowpark_connect/includes/python/pyspark/mllib/recommendation.py +389 -0
  226. snowflake/snowpark_connect/includes/python/pyspark/mllib/regression.py +1067 -0
  227. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/KernelDensity.py +59 -0
  228. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/__init__.py +34 -0
  229. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/_statistics.py +409 -0
  230. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/distribution.py +39 -0
  231. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/test.py +86 -0
  232. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/__init__.py +16 -0
  233. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_algorithms.py +353 -0
  234. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_feature.py +192 -0
  235. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_linalg.py +680 -0
  236. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_stat.py +206 -0
  237. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_streaming_algorithms.py +471 -0
  238. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_util.py +108 -0
  239. snowflake/snowpark_connect/includes/python/pyspark/mllib/tree.py +888 -0
  240. snowflake/snowpark_connect/includes/python/pyspark/mllib/util.py +659 -0
  241. snowflake/snowpark_connect/includes/python/pyspark/pandas/__init__.py +165 -0
  242. snowflake/snowpark_connect/includes/python/pyspark/pandas/_typing.py +52 -0
  243. snowflake/snowpark_connect/includes/python/pyspark/pandas/accessors.py +989 -0
  244. snowflake/snowpark_connect/includes/python/pyspark/pandas/base.py +1804 -0
  245. snowflake/snowpark_connect/includes/python/pyspark/pandas/categorical.py +822 -0
  246. snowflake/snowpark_connect/includes/python/pyspark/pandas/config.py +539 -0
  247. snowflake/snowpark_connect/includes/python/pyspark/pandas/correlation.py +262 -0
  248. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/__init__.py +16 -0
  249. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/base.py +519 -0
  250. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/binary_ops.py +98 -0
  251. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/boolean_ops.py +426 -0
  252. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/categorical_ops.py +141 -0
  253. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/complex_ops.py +145 -0
  254. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/date_ops.py +127 -0
  255. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/datetime_ops.py +171 -0
  256. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/null_ops.py +83 -0
  257. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/num_ops.py +588 -0
  258. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/string_ops.py +154 -0
  259. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/timedelta_ops.py +101 -0
  260. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/udt_ops.py +29 -0
  261. snowflake/snowpark_connect/includes/python/pyspark/pandas/datetimes.py +891 -0
  262. snowflake/snowpark_connect/includes/python/pyspark/pandas/exceptions.py +150 -0
  263. snowflake/snowpark_connect/includes/python/pyspark/pandas/extensions.py +388 -0
  264. snowflake/snowpark_connect/includes/python/pyspark/pandas/frame.py +13738 -0
  265. snowflake/snowpark_connect/includes/python/pyspark/pandas/generic.py +3560 -0
  266. snowflake/snowpark_connect/includes/python/pyspark/pandas/groupby.py +4448 -0
  267. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/__init__.py +21 -0
  268. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/base.py +2783 -0
  269. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/category.py +773 -0
  270. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/datetimes.py +843 -0
  271. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/multi.py +1323 -0
  272. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/numeric.py +210 -0
  273. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/timedelta.py +197 -0
  274. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexing.py +1862 -0
  275. snowflake/snowpark_connect/includes/python/pyspark/pandas/internal.py +1680 -0
  276. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/__init__.py +48 -0
  277. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/common.py +76 -0
  278. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/frame.py +63 -0
  279. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/general_functions.py +43 -0
  280. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/groupby.py +93 -0
  281. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/indexes.py +184 -0
  282. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/resample.py +101 -0
  283. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/scalars.py +29 -0
  284. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/series.py +69 -0
  285. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/window.py +168 -0
  286. snowflake/snowpark_connect/includes/python/pyspark/pandas/mlflow.py +238 -0
  287. snowflake/snowpark_connect/includes/python/pyspark/pandas/namespace.py +3807 -0
  288. snowflake/snowpark_connect/includes/python/pyspark/pandas/numpy_compat.py +260 -0
  289. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/__init__.py +17 -0
  290. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/core.py +1213 -0
  291. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/matplotlib.py +928 -0
  292. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/plotly.py +261 -0
  293. snowflake/snowpark_connect/includes/python/pyspark/pandas/resample.py +816 -0
  294. snowflake/snowpark_connect/includes/python/pyspark/pandas/series.py +7440 -0
  295. snowflake/snowpark_connect/includes/python/pyspark/pandas/sql_formatter.py +308 -0
  296. snowflake/snowpark_connect/includes/python/pyspark/pandas/sql_processor.py +394 -0
  297. snowflake/snowpark_connect/includes/python/pyspark/pandas/strings.py +2371 -0
  298. snowflake/snowpark_connect/includes/python/pyspark/pandas/supported_api_gen.py +378 -0
  299. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/__init__.py +16 -0
  300. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/__init__.py +16 -0
  301. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_any_all.py +177 -0
  302. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_apply_func.py +575 -0
  303. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_binary_ops.py +235 -0
  304. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_combine.py +653 -0
  305. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_compute.py +463 -0
  306. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_corrwith.py +86 -0
  307. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_cov.py +151 -0
  308. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_cumulative.py +139 -0
  309. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_describe.py +458 -0
  310. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_eval.py +86 -0
  311. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_melt.py +202 -0
  312. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_missing_data.py +520 -0
  313. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_pivot.py +361 -0
  314. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/__init__.py +16 -0
  315. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/__init__.py +16 -0
  316. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_any_all.py +40 -0
  317. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_apply_func.py +42 -0
  318. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_binary_ops.py +40 -0
  319. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_combine.py +37 -0
  320. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_compute.py +60 -0
  321. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_corrwith.py +40 -0
  322. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_cov.py +40 -0
  323. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_cumulative.py +90 -0
  324. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_describe.py +40 -0
  325. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_eval.py +40 -0
  326. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_melt.py +40 -0
  327. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_missing_data.py +42 -0
  328. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_pivot.py +37 -0
  329. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/__init__.py +16 -0
  330. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_base.py +36 -0
  331. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_binary_ops.py +42 -0
  332. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_boolean_ops.py +47 -0
  333. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_categorical_ops.py +55 -0
  334. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_complex_ops.py +40 -0
  335. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_date_ops.py +47 -0
  336. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_datetime_ops.py +47 -0
  337. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_null_ops.py +42 -0
  338. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_arithmetic.py +43 -0
  339. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_ops.py +47 -0
  340. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_reverse.py +43 -0
  341. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_string_ops.py +47 -0
  342. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_timedelta_ops.py +47 -0
  343. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_udt_ops.py +40 -0
  344. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/testing_utils.py +226 -0
  345. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/__init__.py +16 -0
  346. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_align.py +39 -0
  347. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_basic_slow.py +55 -0
  348. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_cov_corrwith.py +39 -0
  349. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_dot_frame.py +39 -0
  350. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_dot_series.py +39 -0
  351. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_index.py +39 -0
  352. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_series.py +39 -0
  353. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_setitem_frame.py +43 -0
  354. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_setitem_series.py +43 -0
  355. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/__init__.py +16 -0
  356. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_attrs.py +40 -0
  357. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_constructor.py +39 -0
  358. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_conversion.py +42 -0
  359. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_reindexing.py +42 -0
  360. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_reshaping.py +37 -0
  361. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_spark.py +40 -0
  362. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_take.py +42 -0
  363. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_time_series.py +48 -0
  364. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_truncate.py +40 -0
  365. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/__init__.py +16 -0
  366. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_aggregate.py +40 -0
  367. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_apply_func.py +41 -0
  368. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_cumulative.py +67 -0
  369. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_describe.py +40 -0
  370. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_groupby.py +55 -0
  371. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_head_tail.py +40 -0
  372. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_index.py +38 -0
  373. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_missing_data.py +55 -0
  374. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py +39 -0
  375. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_stat.py +38 -0
  376. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/__init__.py +16 -0
  377. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_align.py +40 -0
  378. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_base.py +50 -0
  379. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_category.py +73 -0
  380. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_datetime.py +39 -0
  381. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_indexing.py +40 -0
  382. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_reindex.py +40 -0
  383. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_rename.py +40 -0
  384. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_reset_index.py +48 -0
  385. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_timedelta.py +39 -0
  386. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/io/__init__.py +16 -0
  387. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/io/test_parity_io.py +40 -0
  388. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/__init__.py +16 -0
  389. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot.py +45 -0
  390. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot_matplotlib.py +45 -0
  391. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot_plotly.py +49 -0
  392. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot.py +37 -0
  393. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot_matplotlib.py +53 -0
  394. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot_plotly.py +45 -0
  395. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/__init__.py +16 -0
  396. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_all_any.py +38 -0
  397. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_arg_ops.py +37 -0
  398. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_as_of.py +37 -0
  399. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_as_type.py +38 -0
  400. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_compute.py +37 -0
  401. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_conversion.py +40 -0
  402. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_cumulative.py +40 -0
  403. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_index.py +38 -0
  404. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_missing_data.py +40 -0
  405. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_series.py +37 -0
  406. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_sort.py +38 -0
  407. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_stat.py +38 -0
  408. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_categorical.py +66 -0
  409. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_config.py +37 -0
  410. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_csv.py +37 -0
  411. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_dataframe_conversion.py +42 -0
  412. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_dataframe_spark_io.py +39 -0
  413. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_default_index.py +49 -0
  414. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ewm.py +37 -0
  415. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_expanding.py +39 -0
  416. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_extension.py +49 -0
  417. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_frame_spark.py +53 -0
  418. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_generic_functions.py +43 -0
  419. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_indexing.py +49 -0
  420. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_indexops_spark.py +39 -0
  421. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_internal.py +41 -0
  422. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_namespace.py +39 -0
  423. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_numpy_compat.py +60 -0
  424. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames.py +48 -0
  425. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby.py +39 -0
  426. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_expanding.py +44 -0
  427. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_rolling.py +84 -0
  428. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_repr.py +37 -0
  429. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_resample.py +45 -0
  430. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_reshape.py +39 -0
  431. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_rolling.py +39 -0
  432. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_scalars.py +37 -0
  433. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_conversion.py +39 -0
  434. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_datetime.py +39 -0
  435. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_string.py +39 -0
  436. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_spark_functions.py +39 -0
  437. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_sql.py +43 -0
  438. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_stats.py +37 -0
  439. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_typedef.py +36 -0
  440. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_utils.py +37 -0
  441. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_window.py +39 -0
  442. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/__init__.py +16 -0
  443. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_base.py +107 -0
  444. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_binary_ops.py +224 -0
  445. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py +825 -0
  446. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_categorical_ops.py +562 -0
  447. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_complex_ops.py +368 -0
  448. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_date_ops.py +257 -0
  449. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_datetime_ops.py +260 -0
  450. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_null_ops.py +178 -0
  451. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_arithmetic.py +184 -0
  452. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py +497 -0
  453. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_reverse.py +140 -0
  454. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_string_ops.py +354 -0
  455. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_timedelta_ops.py +219 -0
  456. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_udt_ops.py +192 -0
  457. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/testing_utils.py +228 -0
  458. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/__init__.py +16 -0
  459. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_align.py +118 -0
  460. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_basic_slow.py +198 -0
  461. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_cov_corrwith.py +181 -0
  462. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_dot_frame.py +103 -0
  463. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_dot_series.py +141 -0
  464. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_index.py +109 -0
  465. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_series.py +136 -0
  466. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_setitem_frame.py +125 -0
  467. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_setitem_series.py +217 -0
  468. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/__init__.py +16 -0
  469. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_attrs.py +384 -0
  470. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_constructor.py +598 -0
  471. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_conversion.py +73 -0
  472. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_reindexing.py +869 -0
  473. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_reshaping.py +487 -0
  474. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_spark.py +309 -0
  475. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_take.py +156 -0
  476. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_time_series.py +149 -0
  477. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_truncate.py +163 -0
  478. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/__init__.py +16 -0
  479. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_aggregate.py +311 -0
  480. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_apply_func.py +524 -0
  481. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_cumulative.py +419 -0
  482. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_describe.py +144 -0
  483. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_groupby.py +979 -0
  484. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_head_tail.py +234 -0
  485. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_index.py +206 -0
  486. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_missing_data.py +421 -0
  487. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_split_apply.py +187 -0
  488. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_stat.py +397 -0
  489. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/__init__.py +16 -0
  490. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_align.py +100 -0
  491. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_base.py +2743 -0
  492. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_category.py +484 -0
  493. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_datetime.py +276 -0
  494. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_indexing.py +432 -0
  495. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_reindex.py +310 -0
  496. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_rename.py +257 -0
  497. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_reset_index.py +160 -0
  498. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_timedelta.py +128 -0
  499. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/io/__init__.py +16 -0
  500. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/io/test_io.py +137 -0
  501. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/__init__.py +16 -0
  502. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot.py +170 -0
  503. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot_matplotlib.py +547 -0
  504. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot_plotly.py +285 -0
  505. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot.py +106 -0
  506. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot_matplotlib.py +409 -0
  507. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot_plotly.py +247 -0
  508. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/__init__.py +16 -0
  509. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_all_any.py +105 -0
  510. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_arg_ops.py +197 -0
  511. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_as_of.py +137 -0
  512. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_as_type.py +227 -0
  513. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_compute.py +634 -0
  514. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_conversion.py +88 -0
  515. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_cumulative.py +139 -0
  516. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_index.py +475 -0
  517. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_missing_data.py +265 -0
  518. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_series.py +818 -0
  519. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_sort.py +162 -0
  520. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_stat.py +780 -0
  521. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_categorical.py +741 -0
  522. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_config.py +160 -0
  523. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_csv.py +453 -0
  524. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_dataframe_conversion.py +281 -0
  525. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_dataframe_spark_io.py +487 -0
  526. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_default_index.py +109 -0
  527. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ewm.py +434 -0
  528. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_expanding.py +253 -0
  529. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_extension.py +152 -0
  530. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_frame_spark.py +162 -0
  531. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_generic_functions.py +234 -0
  532. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_indexing.py +1339 -0
  533. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_indexops_spark.py +82 -0
  534. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_internal.py +124 -0
  535. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_namespace.py +638 -0
  536. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_numpy_compat.py +200 -0
  537. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames.py +1355 -0
  538. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py +655 -0
  539. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_expanding.py +113 -0
  540. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_rolling.py +118 -0
  541. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_repr.py +192 -0
  542. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_resample.py +346 -0
  543. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_reshape.py +495 -0
  544. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_rolling.py +263 -0
  545. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_scalars.py +59 -0
  546. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_conversion.py +85 -0
  547. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_datetime.py +364 -0
  548. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_string.py +362 -0
  549. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_spark_functions.py +46 -0
  550. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_sql.py +123 -0
  551. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_stats.py +581 -0
  552. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_typedef.py +447 -0
  553. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_utils.py +301 -0
  554. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_window.py +465 -0
  555. snowflake/snowpark_connect/includes/python/pyspark/pandas/typedef/__init__.py +18 -0
  556. snowflake/snowpark_connect/includes/python/pyspark/pandas/typedef/typehints.py +874 -0
  557. snowflake/snowpark_connect/includes/python/pyspark/pandas/usage_logging/__init__.py +143 -0
  558. snowflake/snowpark_connect/includes/python/pyspark/pandas/usage_logging/usage_logger.py +132 -0
  559. snowflake/snowpark_connect/includes/python/pyspark/pandas/utils.py +1063 -0
  560. snowflake/snowpark_connect/includes/python/pyspark/pandas/window.py +2702 -0
  561. snowflake/snowpark_connect/includes/python/pyspark/profiler.py +489 -0
  562. snowflake/snowpark_connect/includes/python/pyspark/py.typed +1 -0
  563. snowflake/snowpark_connect/includes/python/pyspark/python/pyspark/shell.py +123 -0
  564. snowflake/snowpark_connect/includes/python/pyspark/rdd.py +5518 -0
  565. snowflake/snowpark_connect/includes/python/pyspark/rddsampler.py +115 -0
  566. snowflake/snowpark_connect/includes/python/pyspark/resource/__init__.py +38 -0
  567. snowflake/snowpark_connect/includes/python/pyspark/resource/information.py +69 -0
  568. snowflake/snowpark_connect/includes/python/pyspark/resource/profile.py +317 -0
  569. snowflake/snowpark_connect/includes/python/pyspark/resource/requests.py +539 -0
  570. snowflake/snowpark_connect/includes/python/pyspark/resource/tests/__init__.py +16 -0
  571. snowflake/snowpark_connect/includes/python/pyspark/resource/tests/test_resources.py +83 -0
  572. snowflake/snowpark_connect/includes/python/pyspark/resultiterable.py +45 -0
  573. snowflake/snowpark_connect/includes/python/pyspark/serializers.py +681 -0
  574. snowflake/snowpark_connect/includes/python/pyspark/shell.py +123 -0
  575. snowflake/snowpark_connect/includes/python/pyspark/shuffle.py +854 -0
  576. snowflake/snowpark_connect/includes/python/pyspark/sql/__init__.py +75 -0
  577. snowflake/snowpark_connect/includes/python/pyspark/sql/_typing.pyi +80 -0
  578. snowflake/snowpark_connect/includes/python/pyspark/sql/avro/__init__.py +18 -0
  579. snowflake/snowpark_connect/includes/python/pyspark/sql/avro/functions.py +188 -0
  580. snowflake/snowpark_connect/includes/python/pyspark/sql/catalog.py +1270 -0
  581. snowflake/snowpark_connect/includes/python/pyspark/sql/column.py +1431 -0
  582. snowflake/snowpark_connect/includes/python/pyspark/sql/conf.py +99 -0
  583. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/__init__.py +18 -0
  584. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/_typing.py +90 -0
  585. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/avro/__init__.py +18 -0
  586. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/avro/functions.py +107 -0
  587. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/catalog.py +356 -0
  588. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/__init__.py +22 -0
  589. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/artifact.py +412 -0
  590. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/core.py +1689 -0
  591. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/reattach.py +340 -0
  592. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/column.py +514 -0
  593. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/conf.py +128 -0
  594. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/conversion.py +490 -0
  595. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/dataframe.py +2172 -0
  596. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/expressions.py +1056 -0
  597. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/functions.py +3937 -0
  598. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/group.py +418 -0
  599. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/plan.py +2289 -0
  600. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/__init__.py +25 -0
  601. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/base_pb2.py +203 -0
  602. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/base_pb2.pyi +2718 -0
  603. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/base_pb2_grpc.py +423 -0
  604. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/catalog_pb2.py +109 -0
  605. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/catalog_pb2.pyi +1130 -0
  606. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/commands_pb2.py +141 -0
  607. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/commands_pb2.pyi +1766 -0
  608. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/common_pb2.py +47 -0
  609. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/common_pb2.pyi +123 -0
  610. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/example_plugins_pb2.py +53 -0
  611. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/example_plugins_pb2.pyi +112 -0
  612. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/expressions_pb2.py +107 -0
  613. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/expressions_pb2.pyi +1507 -0
  614. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/relations_pb2.py +195 -0
  615. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/relations_pb2.pyi +3613 -0
  616. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/types_pb2.py +95 -0
  617. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/types_pb2.pyi +980 -0
  618. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/protobuf/__init__.py +18 -0
  619. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/protobuf/functions.py +166 -0
  620. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/readwriter.py +861 -0
  621. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/session.py +952 -0
  622. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/__init__.py +22 -0
  623. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/query.py +295 -0
  624. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/readwriter.py +618 -0
  625. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/__init__.py +18 -0
  626. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/foreach_batch_worker.py +87 -0
  627. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/listener_worker.py +100 -0
  628. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/types.py +301 -0
  629. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/udf.py +296 -0
  630. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/udtf.py +200 -0
  631. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/utils.py +58 -0
  632. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/window.py +266 -0
  633. snowflake/snowpark_connect/includes/python/pyspark/sql/context.py +818 -0
  634. snowflake/snowpark_connect/includes/python/pyspark/sql/dataframe.py +5973 -0
  635. snowflake/snowpark_connect/includes/python/pyspark/sql/functions.py +15889 -0
  636. snowflake/snowpark_connect/includes/python/pyspark/sql/group.py +547 -0
  637. snowflake/snowpark_connect/includes/python/pyspark/sql/observation.py +152 -0
  638. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/__init__.py +21 -0
  639. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/__init__.pyi +344 -0
  640. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/protocols/__init__.pyi +17 -0
  641. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/protocols/frame.pyi +20 -0
  642. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/protocols/series.pyi +20 -0
  643. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/conversion.py +671 -0
  644. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/functions.py +480 -0
  645. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/functions.pyi +132 -0
  646. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/group_ops.py +523 -0
  647. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/map_ops.py +216 -0
  648. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/serializers.py +1019 -0
  649. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/typehints.py +172 -0
  650. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/types.py +972 -0
  651. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/utils.py +86 -0
  652. snowflake/snowpark_connect/includes/python/pyspark/sql/protobuf/__init__.py +18 -0
  653. snowflake/snowpark_connect/includes/python/pyspark/sql/protobuf/functions.py +334 -0
  654. snowflake/snowpark_connect/includes/python/pyspark/sql/readwriter.py +2159 -0
  655. snowflake/snowpark_connect/includes/python/pyspark/sql/session.py +2088 -0
  656. snowflake/snowpark_connect/includes/python/pyspark/sql/sql_formatter.py +84 -0
  657. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/__init__.py +21 -0
  658. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/listener.py +1050 -0
  659. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/query.py +746 -0
  660. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/readwriter.py +1652 -0
  661. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/state.py +288 -0
  662. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/__init__.py +16 -0
  663. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/__init__.py +16 -0
  664. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/__init__.py +16 -0
  665. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/test_artifact.py +420 -0
  666. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/test_client.py +358 -0
  667. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/__init__.py +16 -0
  668. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_foreach.py +36 -0
  669. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_foreach_batch.py +44 -0
  670. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_listener.py +116 -0
  671. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_streaming.py +35 -0
  672. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_basic.py +3612 -0
  673. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_column.py +1042 -0
  674. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_function.py +2381 -0
  675. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_plan.py +1060 -0
  676. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow.py +163 -0
  677. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow_map.py +38 -0
  678. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow_python_udf.py +48 -0
  679. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_catalog.py +36 -0
  680. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_column.py +55 -0
  681. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_conf.py +36 -0
  682. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_dataframe.py +96 -0
  683. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_datasources.py +44 -0
  684. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_errors.py +36 -0
  685. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_functions.py +59 -0
  686. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_group.py +36 -0
  687. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_cogrouped_map.py +59 -0
  688. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_grouped_map.py +74 -0
  689. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_grouped_map_with_state.py +62 -0
  690. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_map.py +58 -0
  691. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf.py +70 -0
  692. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_grouped_agg.py +50 -0
  693. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_scalar.py +68 -0
  694. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_window.py +40 -0
  695. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_readwriter.py +46 -0
  696. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_serde.py +44 -0
  697. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_types.py +100 -0
  698. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_udf.py +100 -0
  699. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_udtf.py +163 -0
  700. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_session.py +181 -0
  701. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_utils.py +42 -0
  702. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/__init__.py +16 -0
  703. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_cogrouped_map.py +623 -0
  704. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_grouped_map.py +869 -0
  705. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_grouped_map_with_state.py +342 -0
  706. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_map.py +436 -0
  707. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf.py +363 -0
  708. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_grouped_agg.py +592 -0
  709. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_scalar.py +1503 -0
  710. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_typehints.py +392 -0
  711. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_typehints_with_future_annotations.py +375 -0
  712. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_window.py +411 -0
  713. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/__init__.py +16 -0
  714. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming.py +401 -0
  715. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_foreach.py +295 -0
  716. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_foreach_batch.py +106 -0
  717. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_listener.py +558 -0
  718. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow.py +1346 -0
  719. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow_map.py +182 -0
  720. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow_python_udf.py +202 -0
  721. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_catalog.py +503 -0
  722. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_column.py +225 -0
  723. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_conf.py +83 -0
  724. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_context.py +201 -0
  725. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_dataframe.py +1931 -0
  726. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_datasources.py +256 -0
  727. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_errors.py +69 -0
  728. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_functions.py +1349 -0
  729. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_group.py +53 -0
  730. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_pandas_sqlmetrics.py +68 -0
  731. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_readwriter.py +283 -0
  732. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_serde.py +155 -0
  733. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_session.py +412 -0
  734. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_types.py +1581 -0
  735. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udf.py +961 -0
  736. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udf_profiler.py +165 -0
  737. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udtf.py +1456 -0
  738. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_utils.py +1686 -0
  739. snowflake/snowpark_connect/includes/python/pyspark/sql/types.py +2558 -0
  740. snowflake/snowpark_connect/includes/python/pyspark/sql/udf.py +714 -0
  741. snowflake/snowpark_connect/includes/python/pyspark/sql/udtf.py +325 -0
  742. snowflake/snowpark_connect/includes/python/pyspark/sql/utils.py +339 -0
  743. snowflake/snowpark_connect/includes/python/pyspark/sql/window.py +492 -0
  744. snowflake/snowpark_connect/includes/python/pyspark/statcounter.py +165 -0
  745. snowflake/snowpark_connect/includes/python/pyspark/status.py +112 -0
  746. snowflake/snowpark_connect/includes/python/pyspark/storagelevel.py +97 -0
  747. snowflake/snowpark_connect/includes/python/pyspark/streaming/__init__.py +22 -0
  748. snowflake/snowpark_connect/includes/python/pyspark/streaming/context.py +471 -0
  749. snowflake/snowpark_connect/includes/python/pyspark/streaming/dstream.py +933 -0
  750. snowflake/snowpark_connect/includes/python/pyspark/streaming/kinesis.py +205 -0
  751. snowflake/snowpark_connect/includes/python/pyspark/streaming/listener.py +83 -0
  752. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/__init__.py +16 -0
  753. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_context.py +184 -0
  754. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_dstream.py +706 -0
  755. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_kinesis.py +118 -0
  756. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_listener.py +160 -0
  757. snowflake/snowpark_connect/includes/python/pyspark/streaming/util.py +168 -0
  758. snowflake/snowpark_connect/includes/python/pyspark/taskcontext.py +502 -0
  759. snowflake/snowpark_connect/includes/python/pyspark/testing/__init__.py +21 -0
  760. snowflake/snowpark_connect/includes/python/pyspark/testing/connectutils.py +199 -0
  761. snowflake/snowpark_connect/includes/python/pyspark/testing/mllibutils.py +30 -0
  762. snowflake/snowpark_connect/includes/python/pyspark/testing/mlutils.py +275 -0
  763. snowflake/snowpark_connect/includes/python/pyspark/testing/objects.py +121 -0
  764. snowflake/snowpark_connect/includes/python/pyspark/testing/pandasutils.py +714 -0
  765. snowflake/snowpark_connect/includes/python/pyspark/testing/sqlutils.py +168 -0
  766. snowflake/snowpark_connect/includes/python/pyspark/testing/streamingutils.py +178 -0
  767. snowflake/snowpark_connect/includes/python/pyspark/testing/utils.py +636 -0
  768. snowflake/snowpark_connect/includes/python/pyspark/tests/__init__.py +16 -0
  769. snowflake/snowpark_connect/includes/python/pyspark/tests/test_appsubmit.py +306 -0
  770. snowflake/snowpark_connect/includes/python/pyspark/tests/test_broadcast.py +196 -0
  771. snowflake/snowpark_connect/includes/python/pyspark/tests/test_conf.py +44 -0
  772. snowflake/snowpark_connect/includes/python/pyspark/tests/test_context.py +346 -0
  773. snowflake/snowpark_connect/includes/python/pyspark/tests/test_daemon.py +89 -0
  774. snowflake/snowpark_connect/includes/python/pyspark/tests/test_install_spark.py +124 -0
  775. snowflake/snowpark_connect/includes/python/pyspark/tests/test_join.py +69 -0
  776. snowflake/snowpark_connect/includes/python/pyspark/tests/test_memory_profiler.py +167 -0
  777. snowflake/snowpark_connect/includes/python/pyspark/tests/test_pin_thread.py +194 -0
  778. snowflake/snowpark_connect/includes/python/pyspark/tests/test_profiler.py +168 -0
  779. snowflake/snowpark_connect/includes/python/pyspark/tests/test_rdd.py +939 -0
  780. snowflake/snowpark_connect/includes/python/pyspark/tests/test_rddbarrier.py +52 -0
  781. snowflake/snowpark_connect/includes/python/pyspark/tests/test_rddsampler.py +66 -0
  782. snowflake/snowpark_connect/includes/python/pyspark/tests/test_readwrite.py +368 -0
  783. snowflake/snowpark_connect/includes/python/pyspark/tests/test_serializers.py +257 -0
  784. snowflake/snowpark_connect/includes/python/pyspark/tests/test_shuffle.py +267 -0
  785. snowflake/snowpark_connect/includes/python/pyspark/tests/test_stage_sched.py +153 -0
  786. snowflake/snowpark_connect/includes/python/pyspark/tests/test_statcounter.py +130 -0
  787. snowflake/snowpark_connect/includes/python/pyspark/tests/test_taskcontext.py +350 -0
  788. snowflake/snowpark_connect/includes/python/pyspark/tests/test_util.py +97 -0
  789. snowflake/snowpark_connect/includes/python/pyspark/tests/test_worker.py +271 -0
  790. snowflake/snowpark_connect/includes/python/pyspark/traceback_utils.py +81 -0
  791. snowflake/snowpark_connect/includes/python/pyspark/util.py +416 -0
  792. snowflake/snowpark_connect/includes/python/pyspark/version.py +19 -0
  793. snowflake/snowpark_connect/includes/python/pyspark/worker.py +1307 -0
  794. snowflake/snowpark_connect/includes/python/pyspark/worker_util.py +46 -0
  795. snowflake/snowpark_connect/proto/__init__.py +10 -0
  796. snowflake/snowpark_connect/proto/control_pb2.py +35 -0
  797. snowflake/snowpark_connect/proto/control_pb2.pyi +38 -0
  798. snowflake/snowpark_connect/proto/control_pb2_grpc.py +183 -0
  799. snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.py +35 -0
  800. snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.pyi +53 -0
  801. snowflake/snowpark_connect/proto/snowflake_rdd_pb2.pyi +39 -0
  802. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.py +47 -0
  803. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.pyi +111 -0
  804. snowflake/snowpark_connect/relation/__init__.py +3 -0
  805. snowflake/snowpark_connect/relation/catalogs/__init__.py +12 -0
  806. snowflake/snowpark_connect/relation/catalogs/abstract_spark_catalog.py +287 -0
  807. snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py +467 -0
  808. snowflake/snowpark_connect/relation/catalogs/utils.py +51 -0
  809. snowflake/snowpark_connect/relation/io_utils.py +76 -0
  810. snowflake/snowpark_connect/relation/map_aggregate.py +322 -0
  811. snowflake/snowpark_connect/relation/map_catalog.py +151 -0
  812. snowflake/snowpark_connect/relation/map_column_ops.py +1068 -0
  813. snowflake/snowpark_connect/relation/map_crosstab.py +48 -0
  814. snowflake/snowpark_connect/relation/map_extension.py +412 -0
  815. snowflake/snowpark_connect/relation/map_join.py +341 -0
  816. snowflake/snowpark_connect/relation/map_local_relation.py +326 -0
  817. snowflake/snowpark_connect/relation/map_map_partitions.py +146 -0
  818. snowflake/snowpark_connect/relation/map_relation.py +253 -0
  819. snowflake/snowpark_connect/relation/map_row_ops.py +716 -0
  820. snowflake/snowpark_connect/relation/map_sample_by.py +35 -0
  821. snowflake/snowpark_connect/relation/map_show_string.py +50 -0
  822. snowflake/snowpark_connect/relation/map_sql.py +1874 -0
  823. snowflake/snowpark_connect/relation/map_stats.py +324 -0
  824. snowflake/snowpark_connect/relation/map_subquery_alias.py +32 -0
  825. snowflake/snowpark_connect/relation/map_udtf.py +288 -0
  826. snowflake/snowpark_connect/relation/read/__init__.py +7 -0
  827. snowflake/snowpark_connect/relation/read/jdbc_read_dbapi.py +668 -0
  828. snowflake/snowpark_connect/relation/read/map_read.py +367 -0
  829. snowflake/snowpark_connect/relation/read/map_read_csv.py +142 -0
  830. snowflake/snowpark_connect/relation/read/map_read_jdbc.py +108 -0
  831. snowflake/snowpark_connect/relation/read/map_read_json.py +344 -0
  832. snowflake/snowpark_connect/relation/read/map_read_parquet.py +194 -0
  833. snowflake/snowpark_connect/relation/read/map_read_socket.py +59 -0
  834. snowflake/snowpark_connect/relation/read/map_read_table.py +109 -0
  835. snowflake/snowpark_connect/relation/read/map_read_text.py +106 -0
  836. snowflake/snowpark_connect/relation/read/reader_config.py +399 -0
  837. snowflake/snowpark_connect/relation/read/utils.py +155 -0
  838. snowflake/snowpark_connect/relation/stage_locator.py +161 -0
  839. snowflake/snowpark_connect/relation/utils.py +219 -0
  840. snowflake/snowpark_connect/relation/write/__init__.py +3 -0
  841. snowflake/snowpark_connect/relation/write/jdbc_write_dbapi.py +339 -0
  842. snowflake/snowpark_connect/relation/write/map_write.py +436 -0
  843. snowflake/snowpark_connect/relation/write/map_write_jdbc.py +48 -0
  844. snowflake/snowpark_connect/resources/java_udfs-1.0-SNAPSHOT.jar +0 -0
  845. snowflake/snowpark_connect/resources_initializer.py +75 -0
  846. snowflake/snowpark_connect/server.py +1136 -0
  847. snowflake/snowpark_connect/start_server.py +32 -0
  848. snowflake/snowpark_connect/tcm.py +8 -0
  849. snowflake/snowpark_connect/type_mapping.py +1003 -0
  850. snowflake/snowpark_connect/typed_column.py +94 -0
  851. snowflake/snowpark_connect/utils/__init__.py +3 -0
  852. snowflake/snowpark_connect/utils/artifacts.py +48 -0
  853. snowflake/snowpark_connect/utils/attribute_handling.py +72 -0
  854. snowflake/snowpark_connect/utils/cache.py +84 -0
  855. snowflake/snowpark_connect/utils/concurrent.py +124 -0
  856. snowflake/snowpark_connect/utils/context.py +390 -0
  857. snowflake/snowpark_connect/utils/describe_query_cache.py +231 -0
  858. snowflake/snowpark_connect/utils/interrupt.py +85 -0
  859. snowflake/snowpark_connect/utils/io_utils.py +35 -0
  860. snowflake/snowpark_connect/utils/pandas_udtf_utils.py +117 -0
  861. snowflake/snowpark_connect/utils/profiling.py +47 -0
  862. snowflake/snowpark_connect/utils/session.py +180 -0
  863. snowflake/snowpark_connect/utils/snowpark_connect_logging.py +38 -0
  864. snowflake/snowpark_connect/utils/telemetry.py +513 -0
  865. snowflake/snowpark_connect/utils/udf_cache.py +392 -0
  866. snowflake/snowpark_connect/utils/udf_helper.py +328 -0
  867. snowflake/snowpark_connect/utils/udf_utils.py +310 -0
  868. snowflake/snowpark_connect/utils/udtf_helper.py +420 -0
  869. snowflake/snowpark_connect/utils/udtf_utils.py +799 -0
  870. snowflake/snowpark_connect/utils/xxhash64.py +247 -0
  871. snowflake/snowpark_connect/version.py +6 -0
  872. snowpark_connect-0.20.2.data/scripts/snowpark-connect +71 -0
  873. snowpark_connect-0.20.2.data/scripts/snowpark-session +11 -0
  874. snowpark_connect-0.20.2.data/scripts/snowpark-submit +354 -0
  875. snowpark_connect-0.20.2.dist-info/METADATA +37 -0
  876. snowpark_connect-0.20.2.dist-info/RECORD +879 -0
  877. snowpark_connect-0.20.2.dist-info/WHEEL +5 -0
  878. snowpark_connect-0.20.2.dist-info/licenses/LICENSE.txt +202 -0
  879. snowpark_connect-0.20.2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1680 @@
1
+ #
2
+ # Licensed to the Apache Software Foundation (ASF) under one or more
3
+ # contributor license agreements. See the NOTICE file distributed with
4
+ # this work for additional information regarding copyright ownership.
5
+ # The ASF licenses this file to You under the Apache License, Version 2.0
6
+ # (the "License"); you may not use this file except in compliance with
7
+ # the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ #
17
+
18
+ """
19
+ An internal immutable DataFrame with some metadata to manage indexes.
20
+ """
21
+ import re
22
+ from typing import Any, Dict, List, Optional, Sequence, Tuple, Union, TYPE_CHECKING, cast
23
+
24
+ import numpy as np
25
+ import pandas as pd
26
+ from pandas.api.types import CategoricalDtype # noqa: F401
27
+ from pyspark._globals import _NoValue, _NoValueType
28
+ from pyspark.sql import (
29
+ functions as F,
30
+ Column as PySparkColumn,
31
+ DataFrame as PySparkDataFrame,
32
+ Window,
33
+ )
34
+ from pyspark.sql.types import ( # noqa: F401
35
+ BooleanType,
36
+ DataType,
37
+ LongType,
38
+ StructField,
39
+ StructType,
40
+ StringType,
41
+ )
42
+ from pyspark.sql.utils import is_timestamp_ntz_preferred
43
+
44
+ # For supporting Spark Connect
45
+ from pyspark.sql.utils import is_remote, get_column_class, get_dataframe_class
46
+
47
+ # For running doctests and reference resolution in PyCharm.
48
+ from pyspark import pandas as ps
49
+ from pyspark.pandas._typing import Label
50
+
51
+ if TYPE_CHECKING:
52
+ # This is required in old Python 3.5 to prevent circular reference.
53
+ from pyspark.pandas.series import Series
54
+ from pyspark.pandas.spark.utils import as_nullable_spark_type, force_decimal_precision_scale
55
+ from pyspark.pandas.data_type_ops.base import DataTypeOps
56
+ from pyspark.pandas.typedef import (
57
+ Dtype,
58
+ as_spark_type,
59
+ extension_dtypes,
60
+ infer_pd_series_spark_type,
61
+ spark_type_to_pandas_dtype,
62
+ )
63
+ from pyspark.pandas.utils import (
64
+ column_labels_level,
65
+ default_session,
66
+ is_name_like_tuple,
67
+ is_testing,
68
+ lazy_property,
69
+ name_like_string,
70
+ scol_for,
71
+ spark_column_equals,
72
+ )
73
+
74
+
75
+ # A formatter that turns an index level number into the *name* of the Spark column backing that pandas-on-Spark index level, e.g. 0 -> "__index_level_0__".
76
+ SPARK_INDEX_NAME_FORMAT = "__index_level_{}__".format
77
+ SPARK_DEFAULT_INDEX_NAME = SPARK_INDEX_NAME_FORMAT(0)  # evaluates to "__index_level_0__"
78
+ # A compiled pattern to check whether the name of a Spark column is a pandas-on-Spark index name ("__index_level_<digits>__"). NOTE: the pattern itself is not anchored.
79
+ SPARK_INDEX_NAME_PATTERN = re.compile(r"__index_level_[0-9]+__")
80
+
81
+ NATURAL_ORDER_COLUMN_NAME = "__natural_order__"  # extra column visible in `spark_frame` in the InternalFrame doctests
82
+
83
+ HIDDEN_COLUMNS = {NATURAL_ORDER_COLUMN_NAME}  # presumably the columns kept out of user-visible output; confirm against usage
84
+
85
+ DEFAULT_SERIES_NAME = 0  # presumably the label used for an unnamed Series; confirm against callers
86
+ SPARK_DEFAULT_SERIES_NAME = str(DEFAULT_SERIES_NAME)  # evaluates to "0"
87
+
88
+
89
class InternalField:
    """
    Pair a pandas dtype with an optional Spark ``StructField`` for the same field.

    Parameters
    ----------
    dtype : numpy.dtype or pandas' ExtensionDtype
        The dtype for the field.
    struct_field : StructField, optional
        The ``StructField`` for the field. It may be left as None; the properties
        that read from it (``name``, ``spark_type``, ``nullable``, ``metadata``)
        assert that it has been set.
    """

    def __init__(self, dtype: Dtype, struct_field: Optional[StructField] = None):
        self._dtype = dtype
        self._struct_field = struct_field

    @staticmethod
    def from_struct_field(
        struct_field: StructField, *, use_extension_dtypes: bool = False
    ) -> "InternalField":
        """
        Build an InternalField whose dtype is inferred from a StructField.

        Parameters
        ----------
        struct_field : StructField
            The StructField to wrap; its ``dataType`` drives the dtype inference.
        use_extension_dtypes : bool
            Forwarded to ``spark_type_to_pandas_dtype``; if True, try to use the
            extension dtypes.

        Returns
        -------
        InternalField
        """
        inferred_dtype = spark_type_to_pandas_dtype(
            struct_field.dataType, use_extension_dtypes=use_extension_dtypes
        )
        return InternalField(dtype=inferred_dtype, struct_field=struct_field)

    @property
    def dtype(self) -> Dtype:
        """The dtype of the field."""
        return self._dtype

    @property
    def struct_field(self) -> Optional[StructField]:
        """The StructField of the field, or None when it has not been set."""
        return self._struct_field

    @property
    def name(self) -> str:
        """The field name; asserts that the StructField exists."""
        field = self.struct_field
        assert field is not None
        return field.name

    @property
    def spark_type(self) -> DataType:
        """The Spark data type of the field; asserts that the StructField exists."""
        field = self.struct_field
        assert field is not None
        return field.dataType

    @property
    def nullable(self) -> bool:
        """The nullability of the field; asserts that the StructField exists."""
        field = self.struct_field
        assert field is not None
        return field.nullable

    @property
    def metadata(self) -> Dict[str, Any]:
        """The metadata of the field; asserts that the StructField exists."""
        field = self.struct_field
        assert field is not None
        return field.metadata

    @property
    def is_extension_dtype(self) -> bool:
        """Whether the dtype of the field is one of the extension dtypes."""
        return isinstance(self.dtype, extension_dtypes)

    def normalize_spark_type(self) -> "InternalField":
        """Return a new, nullable InternalField with a normalized Spark data type."""
        assert self.struct_field is not None
        nullable_type = as_nullable_spark_type(self.spark_type)
        normalized_type = force_decimal_precision_scale(nullable_type)
        return self.copy(spark_type=normalized_type, nullable=True)

    def copy(
        self,
        *,
        name: Union[str, _NoValueType] = _NoValue,
        dtype: Union[Dtype, _NoValueType] = _NoValue,
        spark_type: Union[DataType, _NoValueType] = _NoValue,
        nullable: Union[bool, _NoValueType] = _NoValue,
        metadata: Union[Optional[Dict[str, Any]], _NoValueType] = _NoValue,
    ) -> "InternalField":
        """
        Return a copy of this InternalField with the given attributes replaced.

        Every argument left at ``_NoValue`` falls back to the current value. The
        fallbacks are only read when needed, so the StructField assertion fires
        only for attributes that were not supplied.
        """
        # Resolve in the same order as the parameters are declared.
        new_name = self.name if name is _NoValue else name
        new_dtype = self.dtype if dtype is _NoValue else dtype
        new_spark_type = self.spark_type if spark_type is _NoValue else spark_type
        new_nullable = self.nullable if nullable is _NoValue else nullable
        new_metadata = self.metadata if metadata is _NoValue else metadata

        rebuilt_struct_field = StructField(
            name=cast(str, new_name),
            dataType=cast(DataType, new_spark_type),
            nullable=cast(bool, new_nullable),
            metadata=cast(Optional[Dict[str, Any]], new_metadata),
        )
        return InternalField(dtype=cast(Dtype, new_dtype), struct_field=rebuilt_struct_field)

    def __eq__(self, other: Any) -> bool:
        # Only another InternalField with equal dtype and StructField compares equal.
        if not isinstance(other, InternalField):
            return False
        return self.dtype == other.dtype and self.struct_field == other.struct_field

    def __repr__(self) -> str:
        template = "InternalField(dtype={dtype}, struct_field={struct_field})"
        return template.format(dtype=self.dtype, struct_field=self.struct_field)
220
+
221
+
222
+ class InternalFrame:
223
+ """
224
+ The internal immutable DataFrame which manages Spark DataFrame and column names and index
225
+ information.
226
+
227
+ .. note:: this is an internal class. It is not supposed to be exposed to users and users
228
+ should not directly access to it.
229
+
230
+ The internal immutable DataFrame represents the index information for a DataFrame it belongs to.
231
+ For instance, if we have a pandas-on-Spark DataFrame as below, pandas DataFrame does not
232
+ store the index as columns.
233
+
234
+ >>> psdf = ps.DataFrame({
235
+ ... 'A': [1, 2, 3, 4],
236
+ ... 'B': [5, 6, 7, 8],
237
+ ... 'C': [9, 10, 11, 12],
238
+ ... 'D': [13, 14, 15, 16],
239
+ ... 'E': [17, 18, 19, 20]}, columns = ['A', 'B', 'C', 'D', 'E'])
240
+ >>> psdf # doctest: +NORMALIZE_WHITESPACE
241
+ A B C D E
242
+ 0 1 5 9 13 17
243
+ 1 2 6 10 14 18
244
+ 2 3 7 11 15 19
245
+ 3 4 8 12 16 20
246
+
247
+ However, all columns including index column are also stored in Spark DataFrame internally
248
+ as below.
249
+
250
+ >>> psdf._internal.to_internal_spark_frame.show() # doctest: +NORMALIZE_WHITESPACE
251
+ +-----------------+---+---+---+---+---+
252
+ |__index_level_0__| A| B| C| D| E|
253
+ +-----------------+---+---+---+---+---+
254
+ | 0| 1| 5| 9| 13| 17|
255
+ | 1| 2| 6| 10| 14| 18|
256
+ | 2| 3| 7| 11| 15| 19|
257
+ | 3| 4| 8| 12| 16| 20|
258
+ +-----------------+---+---+---+---+---+
259
+
260
+ To fill this gap, the current metadata is used by mapping Spark's internal column
261
+ to pandas-on-Spark's index. See the method below:
262
+
263
+ * `spark_frame` represents the internal Spark DataFrame
264
+
265
+ * `data_spark_column_names` represents non-indexing Spark column names
266
+
267
+ * `data_spark_columns` represents non-indexing Spark columns
268
+
269
+ * `data_fields` represents non-indexing InternalFields
270
+
271
+ * `index_spark_column_names` represents internal index Spark column names
272
+
273
+ * `index_spark_columns` represents internal index Spark columns
274
+
275
+ * `index_fields` represents index InternalFields
276
+
277
+ * `spark_column_names` represents all columns
278
+
279
+ * `index_names` represents the external index name as a label
280
+
281
+ * `to_internal_spark_frame` represents Spark DataFrame derived by the metadata. Includes index.
282
+
283
+ * `to_pandas_frame` represents pandas DataFrame derived by the metadata
284
+
285
+ >>> internal = psdf._internal
286
+ >>> internal.spark_frame.show() # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
287
+ +-----------------+---+---+---+---+---+-----------------+
288
+ |__index_level_0__| A| B| C| D| E|__natural_order__|
289
+ +-----------------+---+---+---+---+---+-----------------+
290
+ | 0| 1| 5| 9| 13| 17| ...|
291
+ | 1| 2| 6| 10| 14| 18| ...|
292
+ | 2| 3| 7| 11| 15| 19| ...|
293
+ | 3| 4| 8| 12| 16| 20| ...|
294
+ +-----------------+---+---+---+---+---+-----------------+
295
+ >>> internal.data_spark_column_names
296
+ ['A', 'B', 'C', 'D', 'E']
297
+ >>> internal.index_spark_column_names
298
+ ['__index_level_0__']
299
+ >>> internal.spark_column_names
300
+ ['__index_level_0__', 'A', 'B', 'C', 'D', 'E']
301
+ >>> internal.index_names
302
+ [None]
303
+ >>> internal.data_fields # doctest: +NORMALIZE_WHITESPACE
304
+ [InternalField(dtype=int64, struct_field=StructField('A', LongType(), False)),
305
+ InternalField(dtype=int64, struct_field=StructField('B', LongType(), False)),
306
+ InternalField(dtype=int64, struct_field=StructField('C', LongType(), False)),
307
+ InternalField(dtype=int64, struct_field=StructField('D', LongType(), False)),
308
+ InternalField(dtype=int64, struct_field=StructField('E', LongType(), False))]
309
+ >>> internal.index_fields
310
+ [InternalField(dtype=int64, struct_field=StructField('__index_level_0__', LongType(), False))]
311
+ >>> internal.to_internal_spark_frame.show() # doctest: +NORMALIZE_WHITESPACE
312
+ +-----------------+---+---+---+---+---+
313
+ |__index_level_0__| A| B| C| D| E|
314
+ +-----------------+---+---+---+---+---+
315
+ | 0| 1| 5| 9| 13| 17|
316
+ | 1| 2| 6| 10| 14| 18|
317
+ | 2| 3| 7| 11| 15| 19|
318
+ | 3| 4| 8| 12| 16| 20|
319
+ +-----------------+---+---+---+---+---+
320
+ >>> internal.to_pandas_frame
321
+ A B C D E
322
+ 0 1 5 9 13 17
323
+ 1 2 6 10 14 18
324
+ 2 3 7 11 15 19
325
+ 3 4 8 12 16 20
326
+
327
+ In case that index is set to one of the existing columns as below:
328
+
329
+ >>> psdf1 = psdf.set_index("A")
330
+ >>> psdf1 # doctest: +NORMALIZE_WHITESPACE
331
+ B C D E
332
+ A
333
+ 1 5 9 13 17
334
+ 2 6 10 14 18
335
+ 3 7 11 15 19
336
+ 4 8 12 16 20
337
+
338
+ >>> psdf1._internal.to_internal_spark_frame.show() # doctest: +NORMALIZE_WHITESPACE
339
+ +---+---+---+---+---+
340
+ | A| B| C| D| E|
341
+ +---+---+---+---+---+
342
+ | 1| 5| 9| 13| 17|
343
+ | 2| 6| 10| 14| 18|
344
+ | 3| 7| 11| 15| 19|
345
+ | 4| 8| 12| 16| 20|
346
+ +---+---+---+---+---+
347
+
348
+ >>> internal = psdf1._internal
349
+ >>> internal.spark_frame.show() # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
350
+ +-----------------+---+---+---+---+---+-----------------+
351
+ |__index_level_0__| A| B| C| D| E|__natural_order__|
352
+ +-----------------+---+---+---+---+---+-----------------+
353
+ | 0| 1| 5| 9| 13| 17| ...|
354
+ | 1| 2| 6| 10| 14| 18| ...|
355
+ | 2| 3| 7| 11| 15| 19| ...|
356
+ | 3| 4| 8| 12| 16| 20| ...|
357
+ +-----------------+---+---+---+---+---+-----------------+
358
+ >>> internal.data_spark_column_names
359
+ ['B', 'C', 'D', 'E']
360
+ >>> internal.index_spark_column_names
361
+ ['A']
362
+ >>> internal.spark_column_names
363
+ ['A', 'B', 'C', 'D', 'E']
364
+ >>> internal.index_names
365
+ [('A',)]
366
+ >>> internal.data_fields # doctest: +NORMALIZE_WHITESPACE
367
+ [InternalField(dtype=int64, struct_field=StructField('B', LongType(), False)),
368
+ InternalField(dtype=int64, struct_field=StructField('C', LongType(), False)),
369
+ InternalField(dtype=int64, struct_field=StructField('D', LongType(), False)),
370
+ InternalField(dtype=int64, struct_field=StructField('E', LongType(), False))]
371
+ >>> internal.index_fields
372
+ [InternalField(dtype=int64, struct_field=StructField('A', LongType(), False))]
373
+ >>> internal.to_internal_spark_frame.show() # doctest: +NORMALIZE_WHITESPACE
374
+ +---+---+---+---+---+
375
+ | A| B| C| D| E|
376
+ +---+---+---+---+---+
377
+ | 1| 5| 9| 13| 17|
378
+ | 2| 6| 10| 14| 18|
379
+ | 3| 7| 11| 15| 19|
380
+ | 4| 8| 12| 16| 20|
381
+ +---+---+---+---+---+
382
+ >>> internal.to_pandas_frame # doctest: +NORMALIZE_WHITESPACE
383
+ B C D E
384
+ A
385
+ 1 5 9 13 17
386
+ 2 6 10 14 18
387
+ 3 7 11 15 19
388
+ 4 8 12 16 20
389
+
390
+ In case that index becomes a multi index as below:
391
+
392
+ >>> psdf2 = psdf.set_index("A", append=True)
393
+ >>> psdf2 # doctest: +NORMALIZE_WHITESPACE
394
+ B C D E
395
+ A
396
+ 0 1 5 9 13 17
397
+ 1 2 6 10 14 18
398
+ 2 3 7 11 15 19
399
+ 3 4 8 12 16 20
400
+
401
+ >>> psdf2._internal.to_internal_spark_frame.show() # doctest: +NORMALIZE_WHITESPACE
402
+ +-----------------+---+---+---+---+---+
403
+ |__index_level_0__| A| B| C| D| E|
404
+ +-----------------+---+---+---+---+---+
405
+ | 0| 1| 5| 9| 13| 17|
406
+ | 1| 2| 6| 10| 14| 18|
407
+ | 2| 3| 7| 11| 15| 19|
408
+ | 3| 4| 8| 12| 16| 20|
409
+ +-----------------+---+---+---+---+---+
410
+
411
+ >>> internal = psdf2._internal
412
+ >>> internal.spark_frame.show() # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
413
+ +-----------------+---+---+---+---+---+-----------------+
414
+ |__index_level_0__| A| B| C| D| E|__natural_order__|
415
+ +-----------------+---+---+---+---+---+-----------------+
416
+ | 0| 1| 5| 9| 13| 17| ...|
417
+ | 1| 2| 6| 10| 14| 18| ...|
418
+ | 2| 3| 7| 11| 15| 19| ...|
419
+ | 3| 4| 8| 12| 16| 20| ...|
420
+ +-----------------+---+---+---+---+---+-----------------+
421
+ >>> internal.data_spark_column_names
422
+ ['B', 'C', 'D', 'E']
423
+ >>> internal.index_spark_column_names
424
+ ['__index_level_0__', 'A']
425
+ >>> internal.spark_column_names
426
+ ['__index_level_0__', 'A', 'B', 'C', 'D', 'E']
427
+ >>> internal.index_names
428
+ [None, ('A',)]
429
+ >>> internal.data_fields # doctest: +NORMALIZE_WHITESPACE
430
+ [InternalField(dtype=int64, struct_field=StructField('B', LongType(), False)),
431
+ InternalField(dtype=int64, struct_field=StructField('C', LongType(), False)),
432
+ InternalField(dtype=int64, struct_field=StructField('D', LongType(), False)),
433
+ InternalField(dtype=int64, struct_field=StructField('E', LongType(), False))]
434
+ >>> internal.index_fields # doctest: +NORMALIZE_WHITESPACE
435
+ [InternalField(dtype=int64, struct_field=StructField('__index_level_0__', LongType(), False)),
436
+ InternalField(dtype=int64, struct_field=StructField('A', LongType(), False))]
437
+ >>> internal.to_internal_spark_frame.show() # doctest: +NORMALIZE_WHITESPACE
438
+ +-----------------+---+---+---+---+---+
439
+ |__index_level_0__| A| B| C| D| E|
440
+ +-----------------+---+---+---+---+---+
441
+ | 0| 1| 5| 9| 13| 17|
442
+ | 1| 2| 6| 10| 14| 18|
443
+ | 2| 3| 7| 11| 15| 19|
444
+ | 3| 4| 8| 12| 16| 20|
445
+ +-----------------+---+---+---+---+---+
446
+ >>> internal.to_pandas_frame # doctest: +NORMALIZE_WHITESPACE
447
+ B C D E
448
+ A
449
+ 0 1 5 9 13 17
450
+ 1 2 6 10 14 18
451
+ 2 3 7 11 15 19
452
+ 3 4 8 12 16 20
453
+
454
+ For multi-level columns, it also holds column_labels
455
+
456
+ >>> columns = pd.MultiIndex.from_tuples([('X', 'A'), ('X', 'B'),
457
+ ... ('Y', 'C'), ('Y', 'D')])
458
+ >>> psdf3 = ps.DataFrame([
459
+ ... [1, 2, 3, 4],
460
+ ... [5, 6, 7, 8],
461
+ ... [9, 10, 11, 12],
462
+ ... [13, 14, 15, 16],
463
+ ... [17, 18, 19, 20]], columns = columns)
464
+ >>> psdf3 # doctest: +NORMALIZE_WHITESPACE
465
+ X Y
466
+ A B C D
467
+ 0 1 2 3 4
468
+ 1 5 6 7 8
469
+ 2 9 10 11 12
470
+ 3 13 14 15 16
471
+ 4 17 18 19 20
472
+
473
+ >>> internal = psdf3._internal
474
+ >>> internal.spark_frame.show() # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
475
+ +-----------------+------+------+------+------+-----------------+
476
+ |__index_level_0__|(X, A)|(X, B)|(Y, C)|(Y, D)|__natural_order__|
477
+ +-----------------+------+------+------+------+-----------------+
478
+ | 0| 1| 2| 3| 4| ...|
479
+ | 1| 5| 6| 7| 8| ...|
480
+ | 2| 9| 10| 11| 12| ...|
481
+ | 3| 13| 14| 15| 16| ...|
482
+ | 4| 17| 18| 19| 20| ...|
483
+ +-----------------+------+------+------+------+-----------------+
484
+ >>> internal.data_spark_column_names
485
+ ['(X, A)', '(X, B)', '(Y, C)', '(Y, D)']
486
+ >>> internal.column_labels
487
+ [('X', 'A'), ('X', 'B'), ('Y', 'C'), ('Y', 'D')]
488
+
489
+ For Series, it also holds scol to represent the column.
490
+
491
+ >>> psseries = psdf1.B
492
+ >>> psseries
493
+ A
494
+ 1 5
495
+ 2 6
496
+ 3 7
497
+ 4 8
498
+ Name: B, dtype: int64
499
+
500
+ >>> internal = psseries._internal
501
+ >>> internal.spark_frame.show() # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
502
+ +-----------------+---+---+---+---+---+-----------------+
503
+ |__index_level_0__| A| B| C| D| E|__natural_order__|
504
+ +-----------------+---+---+---+---+---+-----------------+
505
+ | 0| 1| 5| 9| 13| 17| ...|
506
+ | 1| 2| 6| 10| 14| 18| ...|
507
+ | 2| 3| 7| 11| 15| 19| ...|
508
+ | 3| 4| 8| 12| 16| 20| ...|
509
+ +-----------------+---+---+---+---+---+-----------------+
510
+ >>> internal.data_spark_column_names
511
+ ['B']
512
+ >>> internal.index_spark_column_names
513
+ ['A']
514
+ >>> internal.spark_column_names
515
+ ['A', 'B']
516
+ >>> internal.index_names
517
+ [('A',)]
518
+ >>> internal.data_fields
519
+ [InternalField(dtype=int64, struct_field=StructField('B', LongType(), False))]
520
+ >>> internal.index_fields
521
+ [InternalField(dtype=int64, struct_field=StructField('A', LongType(), False))]
522
+ >>> internal.to_internal_spark_frame.show() # doctest: +NORMALIZE_WHITESPACE
523
+ +---+---+
524
+ | A| B|
525
+ +---+---+
526
+ | 1| 5|
527
+ | 2| 6|
528
+ | 3| 7|
529
+ | 4| 8|
530
+ +---+---+
531
+ >>> internal.to_pandas_frame # doctest: +NORMALIZE_WHITESPACE
532
+ B
533
+ A
534
+ 1 5
535
+ 2 6
536
+ 3 7
537
+ 4 8
538
+ """
539
+
540
def __init__(
    self,
    spark_frame: PySparkDataFrame,
    index_spark_columns: Optional[List[PySparkColumn]],
    index_names: Optional[List[Optional[Label]]] = None,
    index_fields: Optional[List[InternalField]] = None,
    column_labels: Optional[List[Label]] = None,
    data_spark_columns: Optional[List[PySparkColumn]] = None,
    data_fields: Optional[List[InternalField]] = None,
    column_label_names: Optional[List[Optional[Label]]] = None,
) -> None:
    """
    Create a new internal immutable DataFrame to manage Spark DataFrame, column fields and
    index fields and names.

    When ``index_spark_columns`` is None or empty, a default index column is attached to
    ``spark_frame`` and used as the only index. A natural-order column is added to the
    managed frame when it is not already present.

    :param spark_frame: Spark DataFrame to be managed.
    :param index_spark_columns: list of Spark Column
                                Spark Columns for the index.
    :param index_names: list of tuples
                        the index names. If this is None or empty, every index level is
                        unnamed (None).
    :param index_fields: list of InternalField
                         the InternalFields for the index columns. Missing entries are
                         derived from the Spark schema.
    :param column_labels: list of tuples with the same length
                          The multi-level values in the tuples. If this is None, one-level
                          labels are derived from the data column names.
    :param data_spark_columns: list of Spark Column
                               Spark Columns to appear as columns. If this is None, calculated
                               from spark_frame.
    :param data_fields: list of InternalField
                        the InternalFields for the data columns. Missing entries are
                        derived from the Spark schema.
    :param column_label_names: Names for each of the column index levels.

    See the examples below for what each parameter means.

    >>> column_labels = pd.MultiIndex.from_tuples(
    ...     [('a', 'x'), ('a', 'y'), ('b', 'z')], names=["column_labels_a", "column_labels_b"])
    >>> row_index = pd.MultiIndex.from_tuples(
    ...     [('foo', 'bar'), ('foo', 'bar'), ('zoo', 'bar')],
    ...     names=["row_index_a", "row_index_b"])
    >>> psdf = ps.DataFrame(
    ...     [[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=row_index, columns=column_labels)
    >>> psdf.set_index(('a', 'x'), append=True, inplace=True)
    >>> psdf  # doctest: +NORMALIZE_WHITESPACE
    column_labels_a                  a  b
    column_labels_b                  y  z
    row_index_a row_index_b (a, x)
    foo         bar         1       2  3
                            4       5  6
    zoo         bar         7       8  9

    >>> internal = psdf._internal

    >>> internal.spark_frame.show()  # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
    +-----------------+-----------------+------+------+------+...
    |__index_level_0__|__index_level_1__|(a, x)|(a, y)|(b, z)|...
    +-----------------+-----------------+------+------+------+...
    |              foo|              bar|     1|     2|     3|...
    |              foo|              bar|     4|     5|     6|...
    |              zoo|              bar|     7|     8|     9|...
    +-----------------+-----------------+------+------+------+...

    >>> internal.index_spark_columns  # doctest: +SKIP
    [Column<'__index_level_0__'>, Column<'__index_level_1__'>, Column<'(a, x)'>]

    >>> internal.index_names
    [('row_index_a',), ('row_index_b',), ('a', 'x')]

    >>> internal.index_fields  # doctest: +NORMALIZE_WHITESPACE
    [InternalField(dtype=object,
        struct_field=StructField('__index_level_0__', StringType(), False)),
     InternalField(dtype=object,
        struct_field=StructField('__index_level_1__', StringType(), False)),
     InternalField(dtype=int64,
        struct_field=StructField('(a, x)', LongType(), False))]

    >>> internal.column_labels
    [('a', 'y'), ('b', 'z')]

    >>> internal.data_spark_columns  # doctest: +SKIP
    [Column<'(a, y)'>, Column<'(b, z)'>]

    >>> internal.data_fields  # doctest: +NORMALIZE_WHITESPACE
    [InternalField(dtype=int64, struct_field=StructField('(a, y)', LongType(), False)),
     InternalField(dtype=int64, struct_field=StructField('(b, z)', LongType(), False))]

    >>> internal.column_label_names
    [('column_labels_a',), ('column_labels_b',)]
    """
    SparkDataFrame = get_dataframe_class()
    assert isinstance(spark_frame, SparkDataFrame)
    assert not spark_frame.isStreaming, "pandas-on-Spark does not support Structured Streaming."

    # No index columns were given (None or empty list): build a default index.
    if not index_spark_columns:
        if data_spark_columns is not None:
            if column_labels is not None:
                # Alias each data column with the string form of its label before selecting.
                data_spark_columns = [
                    scol.alias(name_like_string(label))
                    for scol, label in zip(data_spark_columns, column_labels)
                ]
            spark_frame = spark_frame.select(data_spark_columns)

        # The default index column is about to be added, so no existing column may
        # already use an index-style name.
        assert not any(SPARK_INDEX_NAME_PATTERN.match(name) for name in spark_frame.columns), (
            "Index columns should not appear in columns of the Spark DataFrame. Avoid "
            "index column names [%s]." % SPARK_INDEX_NAME_PATTERN
        )

        # Create default index.
        spark_frame = InternalFrame.attach_default_index(spark_frame)
        index_spark_columns = [scol_for(spark_frame, SPARK_DEFAULT_INDEX_NAME)]

        # Any index_fields passed by the caller are replaced by the default index field.
        index_fields = [
            InternalField.from_struct_field(
                StructField(SPARK_DEFAULT_INDEX_NAME, LongType(), nullable=False)
            )
        ]

        if data_spark_columns is not None:
            # Re-derive the data columns from the new frame (everything except the
            # default index column) so they refer to the frame that is actually stored.
            data_struct_fields = [
                field
                for field in spark_frame.schema.fields
                if field.name != SPARK_DEFAULT_INDEX_NAME
            ]
            data_spark_columns = [
                scol_for(spark_frame, field.name) for field in data_struct_fields
            ]
            if data_fields is not None:
                # Keep the caller's fields but rename them after the selected columns.
                data_fields = [
                    field.copy(
                        name=name_like_string(struct_field.name),
                    )
                    for field, struct_field in zip(data_fields, data_struct_fields)
                ]

    # Make sure the managed frame carries the natural-order column.
    if NATURAL_ORDER_COLUMN_NAME not in spark_frame.columns:
        spark_frame = spark_frame.withColumn(
            NATURAL_ORDER_COLUMN_NAME, F.monotonically_increasing_id()
        )

    self._sdf = spark_frame

    # index_spark_columns
    Column = get_column_class()
    assert all(
        isinstance(index_scol, Column) for index_scol in index_spark_columns
    ), index_spark_columns

    self._index_spark_columns: List[Column] = index_spark_columns  # type: ignore[valid-type]

    # data_spark_columns
    if data_spark_columns is None:
        # Default: every column of the frame that is neither an index column
        # nor one of the hidden bookkeeping columns.
        data_spark_columns = [
            scol_for(spark_frame, col)
            for col in spark_frame.columns
            if all(
                not spark_column_equals(scol_for(spark_frame, col), index_scol)
                for index_scol in index_spark_columns
            )
            and col not in HIDDEN_COLUMNS
        ]
    else:
        assert all(isinstance(scol, Column) for scol in data_spark_columns)

    self._data_spark_columns: List[Column] = data_spark_columns  # type: ignore[valid-type]

    # fields
    if index_fields is None:
        index_fields = [None] * len(index_spark_columns)
    if data_fields is None:
        data_fields = [None] * len(data_spark_columns)

    assert len(index_spark_columns) == len(index_fields), (
        len(index_spark_columns),
        len(index_fields),
    )
    assert len(data_spark_columns) == len(data_fields), (
        len(data_spark_columns),
        len(data_fields),
    )

    # Fill in fields that are missing (None) or that lack a struct field, using the
    # schema of a select over the affected columns. The three branches differ only
    # in which columns are selected: index and data together, index only, or data only.
    if any(field is None or field.struct_field is None for field in index_fields) and any(
        field is None or field.struct_field is None for field in data_fields
    ):
        schema = spark_frame.select(index_spark_columns + data_spark_columns).schema
        fields = [
            InternalField.from_struct_field(struct_field)
            if field is None
            else InternalField(field.dtype, struct_field)
            if field.struct_field is None
            else field
            for field, struct_field in zip(index_fields + data_fields, schema.fields)
        ]
        # Split the combined result back into its index part and data part.
        index_fields = fields[: len(index_spark_columns)]
        data_fields = fields[len(index_spark_columns) :]
    elif any(field is None or field.struct_field is None for field in index_fields):
        schema = spark_frame.select(index_spark_columns).schema
        index_fields = [
            InternalField.from_struct_field(struct_field)
            if field is None
            else InternalField(field.dtype, struct_field)
            if field.struct_field is None
            else field
            for field, struct_field in zip(index_fields, schema.fields)
        ]
    elif any(field is None or field.struct_field is None for field in data_fields):
        schema = spark_frame.select(data_spark_columns).schema
        data_fields = [
            InternalField.from_struct_field(struct_field)
            if field is None
            else InternalField(field.dtype, struct_field)
            if field.struct_field is None
            else field
            for field, struct_field in zip(data_fields, schema.fields)
        ]

    # Every index field must have a supported dtype: either object or one that
    # as_spark_type can map to a Spark type.
    assert all(
        isinstance(ops.dtype, Dtype.__args__)  # type: ignore[attr-defined]
        and (
            ops.dtype == np.dtype("object")
            or as_spark_type(ops.dtype, raise_error=False) is not None
        )
        for ops in index_fields
    ), index_fields

    # Extra consistency check, only performed when is_testing() is true: the index
    # fields must agree with the schema Spark reports for the index columns.
    if is_testing():
        struct_fields = spark_frame.select(index_spark_columns).schema.fields
        if is_remote():
            # TODO(SPARK-42965): For some reason, the metadata of StructField is different
            # in a few tests when using Spark Connect. However, the function works properly.
            # Therefore, we temporarily perform Spark Connect tests by excluding metadata
            # until the issue is resolved.
            def remove_metadata(struct_field: StructField) -> StructField:
                new_struct_field = StructField(
                    struct_field.name, struct_field.dataType, struct_field.nullable
                )
                return new_struct_field

            assert all(
                remove_metadata(index_field.struct_field) == remove_metadata(struct_field)
                for index_field, struct_field in zip(index_fields, struct_fields)
            ), (index_fields, struct_fields)
        else:
            assert all(
                index_field.struct_field == struct_field
                for index_field, struct_field in zip(index_fields, struct_fields)
            ), (index_fields, struct_fields)

    self._index_fields: List[InternalField] = index_fields

    # Same dtype validation as above, for the data fields.
    assert all(
        isinstance(ops.dtype, Dtype.__args__)  # type: ignore[attr-defined]
        and (
            ops.dtype == np.dtype("object")
            or as_spark_type(ops.dtype, raise_error=False) is not None
        )
        for ops in data_fields
    ), data_fields

    # Same testing-only schema consistency check as above, for the data fields.
    if is_testing():
        struct_fields = spark_frame.select(data_spark_columns).schema.fields
        if is_remote():
            # TODO(SPARK-42965): For some reason, the metadata of StructField is different
            # in a few tests when using Spark Connect. However, the function works properly.
            # Therefore, we temporarily perform Spark Connect tests by excluding metadata
            # until the issue is resolved.
            def remove_metadata(struct_field: StructField) -> StructField:
                new_struct_field = StructField(
                    struct_field.name, struct_field.dataType, struct_field.nullable
                )
                return new_struct_field

            assert all(
                remove_metadata(data_field.struct_field) == remove_metadata(struct_field)
                for data_field, struct_field in zip(data_fields, struct_fields)
            ), (data_fields, struct_fields)
        else:
            assert all(
                data_field.struct_field == struct_field
                for data_field, struct_field in zip(data_fields, struct_fields)
            ), (data_fields, struct_fields)

    self._data_fields: List[InternalField] = data_fields

    # index_names
    if not index_names:
        # None or empty: every index level is unnamed.
        index_names = [None] * len(index_spark_columns)

    assert len(index_spark_columns) == len(index_names), (
        len(index_spark_columns),
        len(index_names),
    )
    assert all(
        is_name_like_tuple(index_name, check_type=True) for index_name in index_names
    ), index_names

    self._index_names: List[Optional[Label]] = index_names

    # column_labels
    if column_labels is None:
        # Default: a one-level label per data column, taken from the column name.
        column_labels = [(col,) for col in spark_frame.select(self._data_spark_columns).columns]
    else:
        assert len(column_labels) == len(self._data_spark_columns), (
            len(column_labels),
            len(self._data_spark_columns),
        )
        if len(column_labels) == 1:
            column_label = column_labels[0]
            assert is_name_like_tuple(column_label, check_type=True), column_label
        else:
            assert all(
                is_name_like_tuple(column_label, check_type=True)
                for column_label in column_labels
            ), column_labels
            # All labels must have the same number of levels.
            assert len(set(len(label) for label in column_labels)) <= 1, column_labels

    self._column_labels: List[Label] = column_labels

    # column_label_names
    if column_label_names is None:
        column_label_names = [None] * column_labels_level(self._column_labels)
    else:
        if len(self._column_labels) > 0:
            # One name per column label level.
            assert len(column_label_names) == column_labels_level(self._column_labels), (
                len(column_label_names),
                column_labels_level(self._column_labels),
            )
        else:
            # With no columns there are no labels to compare against; only require
            # that at least one level name is given.
            assert len(column_label_names) > 0, len(column_label_names)
        assert all(
            is_name_like_tuple(column_label_name, check_type=True)
            for column_label_name in column_label_names
        ), column_label_names

    self._column_label_names: List[Optional[Label]] = column_label_names
872
+
873
@staticmethod
def attach_default_index(
    sdf: PySparkDataFrame, default_index_type: Optional[str] = None
) -> PySparkDataFrame:
    """
    Attach the default index column to a Spark DataFrame.

    Spark has no notion of an index, so a column has to be generated for it. The kind
    of index is taken from `default_index_type`, or from the `compute.default_index_type`
    option when that argument is None.

    >>> spark_frame = ps.range(10).to_spark()
    >>> spark_frame
    DataFrame[id: bigint]

    It adds the default index column '__index_level_0__'.

    >>> spark_frame = InternalFrame.attach_default_index(spark_frame)
    >>> spark_frame
    DataFrame[__index_level_0__: bigint, id: bigint]

    It throws an exception if the given column name already exists.

    >>> InternalFrame.attach_default_index(spark_frame)
    ... # doctest: +ELLIPSIS
    Traceback (most recent call last):
      ...
    AssertionError: '__index_level_0__' already exists...
    """
    index_column = SPARK_DEFAULT_INDEX_NAME
    already_present = index_column in sdf.columns
    assert (
        not already_present
    ), "'%s' already exists in the Spark column names '%s'" % (index_column, sdf.columns)

    # Fall back to the configured option only when the caller did not choose a type.
    resolved_type = (
        ps.get_option("compute.default_index_type")
        if default_index_type is None
        else default_index_type
    )

    # Supported index types, each paired with the routine that builds its column.
    attachers = (
        ("sequence", InternalFrame.attach_sequence_column),
        ("distributed-sequence", InternalFrame.attach_distributed_sequence_column),
        ("distributed", InternalFrame.attach_distributed_column),
    )
    for type_name, attach in attachers:
        if resolved_type == type_name:
            return attach(sdf, column_name=index_column)

    raise ValueError(
        "'compute.default_index_type' should be one of 'sequence',"
        " 'distributed-sequence' and 'distributed'"
    )
919
+
920
@staticmethod
def attach_sequence_column(sdf: PySparkDataFrame, column_name: str) -> PySparkDataFrame:
    """
    Return `sdf` with a leading `column_name` column holding a zero-based row number.

    The row number is computed over a window ordered by a monotonically increasing id,
    cast to long and shifted by one so that it starts at 0.
    """
    ordering = Window.orderBy(F.monotonically_increasing_id())
    zero_based_position = F.row_number().over(ordering).cast("long") - 1
    existing_columns = [scol_for(sdf, name) for name in sdf.columns]
    return sdf.select(zero_based_position.alias(column_name), *existing_columns)
927
+
928
@staticmethod
def attach_distributed_column(sdf: PySparkDataFrame, column_name: str) -> PySparkDataFrame:
    """
    Return `sdf` with a leading `column_name` column holding a monotonically increasing id.

    Outside of remote mode the underlying expression is additionally tagged with the
    function alias "distributed_index".
    """
    existing_columns = [scol_for(sdf, name) for name in sdf.columns]

    if not is_remote():
        jvm = sdf.sparkSession._jvm
        alias_tag = jvm.org.apache.spark.sql.catalyst.analysis.FunctionRegistry.FUNC_ALIAS()
        id_expr = F.monotonically_increasing_id()._jc.expr()
        id_expr.setTagValue(alias_tag, "distributed_index")
        tagged_column = PySparkColumn(jvm.Column(id_expr))
        return sdf.select(tagged_column.alias(column_name), *existing_columns)

    # In remote mode the function alias tag is not set, to avoid changes in the protobuf
    # definition for now. The tag only affects the query strings shown by
    # DataFrame.explain, so the difference is cosmetic.
    return sdf.select(F.monotonically_increasing_id().alias(column_name), *existing_columns)
940
+
941
@staticmethod
def attach_distributed_sequence_column(
    sdf: PySparkDataFrame, column_name: str
) -> PySparkDataFrame:
    """
    Attach a Spark column that holds a sequence generated in a distributed manner.

    This is the column assigned when the default index type is 'distributed-sequence'.

    >>> sdf = ps.DataFrame(['a', 'b', 'c']).to_spark()
    >>> sdf = InternalFrame.attach_distributed_sequence_column(sdf, column_name="sequence")
    >>> sdf.show()  # doctest: +NORMALIZE_WHITESPACE
    +--------+---+
    |sequence|  0|
    +--------+---+
    |       0|  a|
    |       1|  b|
    |       2|  c|
    +--------+---+
    """
    if len(sdf.columns) == 0:
        # A frame without columns: the result only contains the sequence column,
        # with as many rows as the input frame.
        row_count = sdf.count()
        if row_count > 0:
            return default_session().range(row_count).toDF(column_name)
        empty_schema = StructType().add(column_name, data_type=LongType(), nullable=False)
        return default_session().createDataFrame([], schema=empty_schema)

    if is_remote():
        from pyspark.sql.connect.column import Column as ConnectColumn
        from pyspark.sql.connect.expressions import DistributedSequenceID

        sequence_column = ConnectColumn(DistributedSequenceID()).alias(column_name)
        return sdf.select(
            sequence_column,
            "*",  # type: ignore[call-overload]
        )

    return PySparkDataFrame(
        sdf._jdf.toDF().withSequenceColumn(column_name),
        sdf.sparkSession,
    )
982
+
983
def spark_column_for(self, label: Label) -> PySparkColumn:
    """
    Look up the Spark Column that belongs to a column label.

    :param label: the column label to look up.
    :return: the data Spark Column paired with `label`.
    :raises KeyError: if `label` is not one of the managed column labels.
    """
    scol_by_label = dict(zip(self.column_labels, self.data_spark_columns))
    if label not in scol_by_label:
        raise KeyError(name_like_string(label))
    return scol_by_label[label]
990
+
991
def spark_column_name_for(self, label_or_scol: Union[Label, PySparkColumn]) -> str:
    """
    Return the actual Spark column name for a column label or a Spark Column.

    A label is resolved through its InternalField; a Spark Column is resolved by
    selecting it from the managed frame and reading the resulting column name.
    """
    column_cls = get_column_class()
    if not isinstance(label_or_scol, column_cls):
        return self.field_for(label_or_scol).name  # type: ignore[arg-type]
    selected = self.spark_frame.select(label_or_scol)
    return selected.columns[0]
998
+
999
def spark_type_for(self, label_or_scol: Union[Label, PySparkColumn]) -> DataType:
    """
    Return the Spark DataType for a column label or a Spark Column.

    A label is resolved through its InternalField; a Spark Column is resolved by
    selecting it from the managed frame and reading the type from the resulting schema.
    """
    column_cls = get_column_class()
    if not isinstance(label_or_scol, column_cls):
        return self.field_for(label_or_scol).spark_type  # type: ignore[arg-type]
    selected_schema = self.spark_frame.select(label_or_scol).schema
    return selected_schema[0].dataType
1006
+
1007
def spark_column_nullable_for(self, label_or_scol: Union[Label, PySparkColumn]) -> bool:
    """
    Return the nullability for a column label or a Spark Column.

    A label is resolved through its InternalField; a Spark Column is resolved by
    selecting it from the managed frame and reading the flag from the resulting schema.
    """
    column_cls = get_column_class()
    if not isinstance(label_or_scol, column_cls):
        return self.field_for(label_or_scol).nullable  # type: ignore[arg-type]
    selected_schema = self.spark_frame.select(label_or_scol).schema
    return selected_schema[0].nullable
1014
+
1015
def field_for(self, label: Label) -> InternalField:
    """
    Look up the InternalField that belongs to a column label.

    :param label: the column label to look up.
    :return: the data InternalField paired with `label`.
    :raises KeyError: if `label` is not one of the managed column labels.
    """
    field_by_label = dict(zip(self.column_labels, self.data_fields))
    if label not in field_by_label:
        raise KeyError(name_like_string(label))
    return field_by_label[label]
1022
+
1023
@property
def spark_frame(self) -> PySparkDataFrame:
    """The Spark DataFrame managed by this InternalFrame."""
    return self._sdf
1027
+
1028
@lazy_property
def data_spark_column_names(self) -> List[str]:
    """The field name of every managed data column, in column order."""
    names = []
    for data_field in self.data_fields:
        names.append(data_field.name)
    return names
1032
+
1033
@property
def data_spark_columns(self) -> List[PySparkColumn]:
    """The Spark Columns of the managed data columns."""
    return self._data_spark_columns
1037
+
1038
@property
def index_spark_column_names(self) -> List[str]:
    """The field name of every managed index column, in index-level order."""
    names = []
    for index_field in self.index_fields:
        names.append(index_field.name)
    return names
1042
+
1043
@property
def index_spark_columns(self) -> List[PySparkColumn]:
    """The Spark Columns of the managed index columns."""
    return self._index_spark_columns
1047
+
1048
@lazy_property
def spark_column_names(self) -> List[str]:
    """All managed field names, index field names included."""
    selected = self.spark_frame.select(self.spark_columns)
    return selected.columns
1052
+
1053
@lazy_property
def spark_columns(self) -> List[PySparkColumn]:
    """
    The Spark Columns of all managed columns: index columns first, then data columns.

    A data column that equals one of the index columns is not repeated.
    """
    index_scols = self.index_spark_columns
    non_index_scols = []
    for data_scol in self.data_spark_columns:
        duplicates_index = any(
            spark_column_equals(data_scol, index_scol) for index_scol in index_scols
        )
        if not duplicates_index:
            non_index_scols.append(data_scol)
    return index_scols + non_index_scols
1062
+
1063
@property
def index_names(self) -> List[Optional[Label]]:
    """The managed index names, one entry per index level."""
    return self._index_names
1067
+
1068
@lazy_property
def index_level(self) -> int:
    """The number of index levels, i.e. the number of index names."""
    level_count = len(self._index_names)
    return level_count
1072
+
1073
@property
def column_labels(self) -> List[Label]:
    """The managed column labels (the column index)."""
    return self._column_labels
1077
+
1078
@lazy_property
def column_labels_level(self) -> int:
    """The number of column index levels, i.e. the number of column label names."""
    level_count = len(self._column_label_names)
    return level_count
1082
+
1083
@property
def column_label_names(self) -> List[Optional[Label]]:
    """The names of the column index levels."""
    return self._column_label_names
1087
+
1088
@property
def index_fields(self) -> List[InternalField]:
    """The InternalFields of the managed index columns."""
    return self._index_fields
1092
+
1093
@property
def data_fields(self) -> List[InternalField]:
    """The InternalFields of the managed data columns."""
    return self._data_fields
1097
+
1098
@lazy_property
def to_internal_spark_frame(self) -> PySparkDataFrame:
    """
    Return the managed columns as a Spark DataFrame.

    The result contains the index columns followed by the data columns that are not
    already index columns, and should be only used for internal purposes.
    """
    index_scols = self.index_spark_columns
    non_index_scols = [
        data_scol
        for data_scol in self.data_spark_columns
        if not any(spark_column_equals(data_scol, index_scol) for index_scol in index_scols)
    ]
    return self.spark_frame.select(index_scols + non_index_scols)
1110
+
1111
@lazy_property
def to_pandas_frame(self) -> pd.DataFrame:
    """
    Collect the internal Spark frame into a pandas DataFrame with its index restored.

    When the collected frame has no rows but the schema has columns, the columns are
    first cast to the pandas dtypes that correspond to their Spark types.
    """
    internal_sdf = self.to_internal_spark_frame
    collected = internal_sdf.toPandas()

    if len(collected) == 0 and len(internal_sdf.schema) > 0:
        dtype_by_column = {}
        for struct_field in internal_sdf.schema:
            dtype_by_column[struct_field.name] = spark_type_to_pandas_dtype(
                struct_field.dataType
            )
        collected = collected.astype(dtype_by_column)

    restore_kwargs = self.arguments_for_restore_index
    return InternalFrame.restore_index(collected, **restore_kwargs)
1122
+
1123
@lazy_property
def arguments_for_restore_index(self) -> Dict:
    """
    Build the keyword arguments for `restore_index`.

    A data column that equals an index column is listed under the index column's name
    and contributes no extra field; every other data column is listed under its own
    name and its field is appended after the index fields.
    """
    no_match = object()
    index_pairs = list(zip(self.index_spark_column_names, self.index_spark_columns))

    restored_names = []
    restored_fields = list(self.index_fields)

    for data_scol, data_name, data_field in zip(
        self.data_spark_columns, self.data_spark_column_names, self.data_fields
    ):
        # Name of the first index column that equals this data column, if any.
        matching_index_name = next(
            (
                index_name
                for index_name, index_scol in index_pairs
                if spark_column_equals(data_scol, index_scol)
            ),
            no_match,
        )
        if matching_index_name is no_match:
            restored_names.append(data_name)
            restored_fields.append(data_field)
        else:
            restored_names.append(matching_index_name)

    return {
        "index_columns": self.index_spark_column_names,
        "index_names": self.index_names,
        "data_columns": restored_names,
        "column_labels": self.column_labels,
        "column_label_names": self.column_label_names,
        "fields": restored_fields,
    }
1150
+
1151
@staticmethod
def restore_index(
    pdf: pd.DataFrame,
    *,
    index_columns: List[str],
    index_names: List[Label],
    data_columns: List[str],
    column_labels: List[Label],
    column_label_names: List[Label],
    fields: Optional[List[InternalField]] = None,
) -> pd.DataFrame:
    """
    Restore pandas DataFrame indices using the metadata.

    When ``fields`` is given, each column of ``pdf`` is overwritten in place with its
    restored values before the index is rebuilt; fields are paired with ``pdf.columns``
    by position. When ``fields`` is None (the default), that step is skipped and only
    the index and the column labels are restored.

    :param pdf: the pandas DataFrame to be processed.
    :param index_columns: the original column names for index columns.
    :param index_names: the index names after restored.
    :param data_columns: the original column names for data columns.
    :param column_labels: the column labels after restored.
    :param column_label_names: the column label names after restored.
    :param fields: the fields after restored, or None to leave the column values as-is.
    :return: the restored pandas DataFrame

    >>> from numpy import dtype
    >>> pdf = pd.DataFrame({"index": [10, 20, 30], "a": ['a', 'b', 'c'], "b": [0, 2, 1]})
    >>> InternalFrame.restore_index(
    ...     pdf,
    ...     index_columns=["index"],
    ...     index_names=[("idx",)],
    ...     data_columns=["a", "b", "index"],
    ...     column_labels=[("x",), ("y",), ("z",)],
    ...     column_label_names=[("lv1",)],
    ...     fields=[
    ...         InternalField(
    ...             dtype=dtype('int64'),
    ...             struct_field=StructField(name='index', dataType=LongType(), nullable=False),
    ...         ),
    ...         InternalField(
    ...             dtype=dtype('object'),
    ...             struct_field=StructField(name='a', dataType=StringType(), nullable=False),
    ...         ),
    ...         InternalField(
    ...             dtype=CategoricalDtype(categories=["i", "j", "k"]),
    ...             struct_field=StructField(name='b', dataType=LongType(), nullable=False),
    ...         ),
    ...     ],
    ... )  # doctest: +NORMALIZE_WHITESPACE
    lv1  x  y   z
    idx
    10   a  i  10
    20   b  k  20
    30   c  j  30
    """
    # ``fields`` is optional: iterating over the None default would raise TypeError,
    # so the per-column restoration only runs when fields were actually supplied.
    if fields is not None:
        for col, field in zip(pdf.columns, fields):
            pdf[col] = DataTypeOps(field.dtype, field.spark_type).restore(pdf[col])

    # The first index column replaces the current index; later ones are appended as
    # additional levels. A column that is also a data column is kept (drop=False).
    append = False
    for index_field in index_columns:
        drop = index_field not in data_columns
        pdf = pdf.set_index(index_field, drop=drop, append=append)
        append = True
    pdf = pdf[data_columns]

    # Single-element label tuples are unwrapped to their only element.
    pdf.index.names = [
        name if name is None or len(name) > 1 else name[0] for name in index_names
    ]

    names = [name if name is None or len(name) > 1 else name[0] for name in column_label_names]
    if len(column_label_names) > 1:
        pdf.columns = pd.MultiIndex.from_tuples(column_labels, names=names)
    else:
        pdf.columns = pd.Index(
            [None if label is None else label[0] for label in column_labels],
            name=names[0],
        )

    return pdf
1228
+
1229
@lazy_property
def resolved_copy(self) -> "InternalFrame":
    """Return a copy of this immutable InternalFrame with pending updates resolved.

    The Spark frame is re-selected down to this frame's Spark columns plus the
    hidden columns, and the index/data Spark columns are rebound by name to the
    newly selected frame.
    """
    current_frame = self.spark_frame
    resolved_sdf = current_frame.select(self.spark_columns + list(HIDDEN_COLUMNS))

    index_scols = [scol_for(resolved_sdf, name) for name in self.index_spark_column_names]
    data_scols = [scol_for(resolved_sdf, name) for name in self.data_spark_column_names]

    return self.copy(
        spark_frame=resolved_sdf,
        index_spark_columns=index_scols,
        data_spark_columns=data_scols,
    )
1238
+
1239
def with_new_sdf(
    self,
    spark_frame: PySparkDataFrame,
    *,
    index_fields: Optional[List[InternalField]] = None,
    data_columns: Optional[List[str]] = None,
    data_fields: Optional[List[InternalField]] = None,
) -> "InternalFrame":
    """Copy the immutable InternalFrame, rebinding it to the given Spark DataFrame.

    The natural-order column is dropped from ``spark_frame`` before the index and
    data Spark columns are looked up in it by name.

    :param spark_frame: the new Spark DataFrame
    :param index_fields: the new InternalFields for the index columns.
        If None, the original fields are used. Otherwise the length must match
        the current number of index fields.
    :param data_columns: the new column names. If None, the original ones are used.
        Otherwise the length must match the number of column labels.
    :param data_fields: the new InternalFields for the data columns.
        If None, the original fields are used. Otherwise the length must match
        the number of column labels.
    :return: the copied InternalFrame.
    """
    if index_fields is not None:
        assert len(index_fields) == len(self.index_fields), (
            len(index_fields),
            len(self.index_fields),
        )
    else:
        index_fields = self.index_fields

    if data_columns is not None:
        assert len(data_columns) == len(self.column_labels), (
            len(data_columns),
            len(self.column_labels),
        )
    else:
        data_columns = self.data_spark_column_names

    if data_fields is not None:
        assert len(data_fields) == len(self.column_labels), (
            len(data_fields),
            len(self.column_labels),
        )
    else:
        data_fields = self.data_fields

    sdf = spark_frame.drop(NATURAL_ORDER_COLUMN_NAME)
    index_scols = [scol_for(sdf, name) for name in self.index_spark_column_names]
    data_scols = [scol_for(sdf, name) for name in data_columns]
    return self.copy(
        spark_frame=sdf,
        index_spark_columns=index_scols,
        index_fields=index_fields,
        data_spark_columns=data_scols,
        data_fields=data_fields,
    )
1289
+
1290
def with_new_columns(
    self,
    scols_or_pssers: Sequence[Union[PySparkColumn, "Series"]],
    *,
    column_labels: Optional[List[Label]] = None,
    data_fields: Optional[List[InternalField]] = None,
    column_label_names: Union[Optional[List[Optional[Label]]], _NoValueType] = _NoValue,
    keep_order: bool = True,
) -> "InternalFrame":
    """
    Copy the immutable InternalFrame, replacing its data columns with the given
    Spark Columns or Series.

    :param scols_or_pssers: the new Spark Columns or Series.
    :param column_labels: the new column index.
        If None, the label of each entry that is a Series is taken from that Series;
        entries that are Spark Columns keep the original label at the same position
        (which requires as many entries as there are original labels).
    :param data_fields: the new InternalFields for the data columns.
        If None, the field of each entry that is a Series is taken from that Series,
        and None is recorded for entries that are plain Spark Columns.
    :param column_label_names: the new names of the column index levels.
        If not specified, the original ones are used.
    :param keep_order: if True, the current Spark frame and index columns are reused
        as they are. If False, the Spark frame is re-selected down to the index
        columns plus the new data columns, and the columns are rebound to it by name.
    :return: the copied InternalFrame.
    """
    from pyspark.pandas.series import Series

    if column_labels is not None:
        assert len(scols_or_pssers) == len(column_labels), (
            len(scols_or_pssers),
            len(column_labels),
        )
    elif all(isinstance(item, Series) for item in scols_or_pssers):
        column_labels = [cast(Series, psser)._column_label for psser in scols_or_pssers]
    else:
        assert len(scols_or_pssers) == len(self.column_labels), (
            len(scols_or_pssers),
            len(self.column_labels),
        )
        column_labels = [
            item._column_label if isinstance(item, Series) else original_label
            for item, original_label in zip(scols_or_pssers, self.column_labels)
        ]

    # Unwrap every Series to its underlying Spark Column.
    data_spark_columns = [
        item.spark.column if isinstance(item, Series) else item for item in scols_or_pssers
    ]

    if data_fields is not None:
        assert len(scols_or_pssers) == len(data_fields), (
            len(scols_or_pssers),
            len(data_fields),
        )
    else:
        data_fields = [
            item._internal.data_fields[0] if isinstance(item, Series) else None
            for item in scols_or_pssers
        ]

    sdf = self.spark_frame
    if keep_order:
        index_spark_columns = self.index_spark_columns
    else:
        sdf = self.spark_frame.select(self.index_spark_columns + data_spark_columns)
        index_spark_columns = [scol_for(sdf, name) for name in self.index_spark_column_names]
        # Resolve the output names of the new data columns against the original
        # frame, then rebind those names to the re-selected frame.
        data_column_names = self.spark_frame.select(data_spark_columns).columns
        data_spark_columns = [scol_for(sdf, name) for name in data_column_names]

    if column_label_names is _NoValue:
        column_label_names = self._column_label_names

    return self.copy(
        spark_frame=sdf,
        index_spark_columns=index_spark_columns,
        column_labels=column_labels,
        data_spark_columns=data_spark_columns,
        data_fields=data_fields,
        column_label_names=column_label_names,
    )
1376
+
1377
def with_filter(self, pred: Union[PySparkColumn, "Series"]) -> "InternalFrame":
    """
    Copy the immutable InternalFrame, keeping only the rows matching the predicate.

    The predicate must be of Spark boolean type. For a Series this is checked on
    its Spark data type; for a Spark Column it is checked on the schema obtained by
    selecting the column from the current Spark frame.

    :param pred: the predicate to filter.
    :return: the copied InternalFrame.
    """
    from pyspark.pandas.series import Series

    if not isinstance(pred, Series):
        condition = pred
        spark_type = self.spark_frame.select(condition).schema[0].dataType
        assert isinstance(spark_type, BooleanType), spark_type
    else:
        assert isinstance(pred.spark.data_type, BooleanType), pred.spark.data_type
        condition = pred.spark.column

    filtered_sdf = self.spark_frame.filter(condition).select(self.spark_columns)
    return self.with_new_sdf(filtered_sdf)
1395
+
1396
def with_new_spark_column(
    self,
    column_label: Label,
    scol: PySparkColumn,
    *,
    field: Optional[InternalField] = None,
    keep_order: bool = True,
) -> "InternalFrame":
    """
    Copy the immutable InternalFrame, replacing the Spark Column of a single label.

    :param column_label: the column label to be updated; must be one of the
        existing column labels.
    :param scol: the new Spark Column
    :param field: the new InternalField for the data column.
        If not specified, the InternalField will be inferred from the spark Column.
    :param keep_order: forwarded unchanged to ``with_new_columns``.
    :return: the copied InternalFrame.
    """
    assert column_label in self.column_labels, column_label

    position = self.column_labels.index(column_label)

    # Work on copies so the lists held by this frame are left untouched.
    updated_scols = self.data_spark_columns.copy()
    updated_scols[position] = scol
    updated_fields = self.data_fields.copy()
    updated_fields[position] = field

    return self.with_new_columns(
        updated_scols, data_fields=updated_fields, keep_order=keep_order
    )
1423
+
1424
def select_column(self, column_label: Label) -> "InternalFrame":
    """
    Copy the immutable InternalFrame, narrowed to a single data column.

    The copy is made with ``column_label_names=None``.

    :param column_label: the column label to use; must be one of the existing
        column labels.
    :return: the copied InternalFrame.
    """
    assert column_label in self.column_labels, column_label

    selected_scol = self.spark_column_for(column_label)
    selected_field = self.field_for(column_label)
    return self.copy(
        column_labels=[column_label],
        data_spark_columns=[selected_scol],
        data_fields=[selected_field],
        column_label_names=None,
    )
1439
+
1440
def copy(
    self,
    *,
    spark_frame: Union[PySparkDataFrame, _NoValueType] = _NoValue,
    index_spark_columns: Union[List[PySparkColumn], _NoValueType] = _NoValue,
    index_names: Union[Optional[List[Optional[Label]]], _NoValueType] = _NoValue,
    index_fields: Union[Optional[List[InternalField]], _NoValueType] = _NoValue,
    column_labels: Union[Optional[List[Label]], _NoValueType] = _NoValue,
    data_spark_columns: Union[Optional[List[PySparkColumn]], _NoValueType] = _NoValue,
    data_fields: Union[Optional[List[InternalField]], _NoValueType] = _NoValue,
    column_label_names: Union[Optional[List[Optional[Label]]], _NoValueType] = _NoValue,
) -> "InternalFrame":
    """
    Copy the immutable InternalFrame.

    Every argument left at the ``_NoValue`` sentinel falls back to the matching
    attribute of this frame; an explicit ``None`` is passed through as-is.

    :param spark_frame: the new Spark DataFrame. If not specified, the original one is used.
    :param index_spark_columns: the list of Spark Column.
        If not specified, the original ones are used.
    :param index_names: the index names. If not specified, the original ones are used.
    :param index_fields: the new InternalFields for the index columns.
        If not specified, the original metadata are used.
    :param column_labels: the new column labels. If not specified, the original ones are used.
    :param data_spark_columns: the new Spark Columns.
        If not specified, the original ones are used.
    :param data_fields: the new InternalFields for the data columns.
        If not specified, the original metadata are used.
    :param column_label_names: the new names of the column index levels.
        If not specified, the original ones are used.
    :return: the copied immutable InternalFrame.
    """

    def _pick(value, attr_name):
        # Read the attribute only when the caller did not supply a value.
        return getattr(self, attr_name) if value is _NoValue else value

    spark_frame = _pick(spark_frame, "spark_frame")
    index_spark_columns = _pick(index_spark_columns, "index_spark_columns")
    index_names = _pick(index_names, "index_names")
    index_fields = _pick(index_fields, "index_fields")
    column_labels = _pick(column_labels, "column_labels")
    data_spark_columns = _pick(data_spark_columns, "data_spark_columns")
    data_fields = _pick(data_fields, "data_fields")
    column_label_names = _pick(column_label_names, "column_label_names")

    return InternalFrame(
        spark_frame=cast(PySparkDataFrame, spark_frame),
        index_spark_columns=cast(List[PySparkColumn], index_spark_columns),
        index_names=cast(Optional[List[Optional[Label]]], index_names),
        index_fields=cast(Optional[List[InternalField]], index_fields),
        column_labels=cast(Optional[List[Label]], column_labels),
        data_spark_columns=cast(Optional[List[PySparkColumn]], data_spark_columns),
        data_fields=cast(Optional[List[InternalField]], data_fields),
        column_label_names=cast(Optional[List[Optional[Label]]], column_label_names),
    )
1496
+
1497
@staticmethod
def from_pandas(pdf: pd.DataFrame) -> "InternalFrame":
    """Create an immutable DataFrame from pandas DataFrame.

    Index names, column labels and column level names are normalized to tuples
    (None names are kept as None), the frame is prepared with
    ``prepare_pandas_frame``, and a Spark DataFrame is created from the prepared
    frame using the schema built from the resulting fields.

    :param pdf: :class:`pd.DataFrame`
    :return: the created immutable DataFrame
    """

    def _as_label(name):
        # None and tuples pass through; any other name becomes a 1-tuple.
        if name is None or isinstance(name, tuple):
            return name
        return (name,)

    index_names: List[Optional[Label]] = [_as_label(name) for name in pdf.index.names]

    columns = pdf.columns
    column_labels: List[Label] = (
        columns.tolist()
        if isinstance(columns, pd.MultiIndex)
        else [(col,) for col in columns]
    )

    column_label_names: List[Optional[Label]] = [_as_label(name) for name in columns.names]

    prefer_timestamp_ntz = is_timestamp_ntz_preferred()

    prepared = InternalFrame.prepare_pandas_frame(pdf, prefer_timestamp_ntz=prefer_timestamp_ntz)
    pdf, index_columns, index_fields, data_columns, data_fields = prepared

    schema = StructType([field.struct_field for field in index_fields + data_fields])

    sdf = default_session().createDataFrame(pdf, schema=schema)
    index_scols = [scol_for(sdf, name) for name in index_columns]
    data_scols = [scol_for(sdf, name) for name in data_columns]
    return InternalFrame(
        spark_frame=sdf,
        index_spark_columns=index_scols,
        index_names=index_names,
        index_fields=index_fields,
        column_labels=column_labels,
        data_spark_columns=data_scols,
        data_fields=data_fields,
        column_label_names=column_label_names,
    )
1543
+
1544
@staticmethod
def prepare_pandas_frame(
    pdf: pd.DataFrame, *, retain_index: bool = True, prefer_timestamp_ntz: bool = False
) -> Tuple[pd.DataFrame, List[str], List[InternalField], List[str], List[InternalField]]:
    """
    Prepare pandas DataFrame for creating Spark DataFrame.

    The input is copied first, so the caller's frame is not modified.

    :param pdf: the pandas DataFrame to be prepared.
    :param retain_index: whether the indices should be retained.
    :param prefer_timestamp_ntz: forwarded to the Spark type inference for each column.
    :return: the tuple of
        - the prepared pandas dataFrame
        - index column names for Spark DataFrame
        - the InternalFields for the index columns of the given pandas DataFrame
        - data column names for Spark DataFrame
        - the InternalFields for the data columns of the given pandas DataFrame

    >>> pdf = pd.DataFrame(
    ...    {("x", "a"): ['a', 'b', 'c'],
    ...     ("y", "b"): pd.Categorical(["i", "k", "j"], categories=["i", "j", "k"])},
    ...    index=[10, 20, 30])
    >>> prepared, index_columns, index_fields, data_columns, data_fields = (
    ...     InternalFrame.prepare_pandas_frame(pdf)
    ... )
    >>> prepared
       __index_level_0__ (x, a)  (y, b)
    0                 10      a       0
    1                 20      b       2
    2                 30      c       1
    >>> index_columns
    ['__index_level_0__']
    >>> index_fields  # doctest: +NORMALIZE_WHITESPACE
    [InternalField(dtype=int64, struct_field=StructField('__index_level_0__',
                    LongType(), False))]
    >>> data_columns
    ['(x, a)', '(y, b)']
    >>> data_fields  # doctest: +NORMALIZE_WHITESPACE
    [InternalField(dtype=object, struct_field=StructField('(x, a)', StringType(), False)),
     InternalField(dtype=category, struct_field=StructField('(y, b)', ByteType(), False))]

    >>> import datetime
    >>> pdf = pd.DataFrame({
    ...     "dt": [datetime.datetime(1970, 1, 1)], "dt_obj": [datetime.datetime(1970, 1, 1)]
    ... })
    >>> pdf.dt_obj = pdf.dt_obj.astype("object")
    >>> _, _, _, _, data_fields = (
    ...     InternalFrame.prepare_pandas_frame(pdf, prefer_timestamp_ntz=True)
    ... )
    >>> data_fields  # doctest: +NORMALIZE_WHITESPACE
    [InternalField(dtype=datetime64[ns],
        struct_field=StructField('dt', TimestampNTZType(), False)),
     InternalField(dtype=object,
        struct_field=StructField('dt_obj', TimestampNTZType(), False))]

    >>> pdf = pd.DataFrame({
    ...     "td": [datetime.timedelta(0)], "td_obj": [datetime.timedelta(0)]
    ... })
    >>> pdf.td_obj = pdf.td_obj.astype("object")
    >>> _, _, _, _, data_fields = (
    ...     InternalFrame.prepare_pandas_frame(pdf)
    ... )
    >>> data_fields  # doctest: +NORMALIZE_WHITESPACE
    [InternalField(dtype=timedelta64[ns],
        struct_field=StructField('td', DayTimeIntervalType(0, 3), False)),
     InternalField(dtype=object,
        struct_field=StructField('td_obj', DayTimeIntervalType(0, 3), False))]
    """
    pdf = pdf.copy()

    # Flatten every column label into its string form.
    data_columns = [name_like_string(label) for label in pdf.columns]
    pdf.columns = data_columns

    if not retain_index:
        index_nlevels = 0
        index_columns = []
        reset_index = pdf
    else:
        index_nlevels = pdf.index.nlevels
        index_columns = [SPARK_INDEX_NAME_FORMAT(level) for level in range(index_nlevels)]
        pdf.index.names = index_columns
        # Move the (renamed) index levels in front of the data columns.
        reset_index = pdf.reset_index()

    # Snapshot the dtypes before the columns are rewritten below; the fields are
    # described with these original dtypes.
    original_dtypes = list(reset_index.dtypes)

    for column_name, dtype in zip(reset_index.columns, original_dtypes):
        spark_type = infer_pd_series_spark_type(
            reset_index[column_name], dtype, prefer_timestamp_ntz
        )
        reset_index[column_name] = DataTypeOps(dtype, spark_type).prepare(
            reset_index[column_name]
        )

    fields = []
    for (name, series), dtype in zip(reset_index.items(), original_dtypes):
        struct_field = StructField(
            name=str(name),
            dataType=infer_pd_series_spark_type(series, dtype, prefer_timestamp_ntz),
            nullable=bool(series.isnull().any()),
        )
        fields.append(InternalField(dtype=dtype, struct_field=struct_field))

    index_fields = fields[:index_nlevels]
    data_fields = fields[index_nlevels:]
    return (reset_index, index_columns, index_fields, data_columns, data_fields)
1651
+
1652
+
1653
def _test() -> None:
    """Run the doctests of ``pyspark.pandas.internal`` against a local Spark session.

    Changes the working directory to ``$SPARK_HOME``, runs the doctests with
    ``ps`` bound to ``pyspark.pandas`` in the doctest globals, stops the session,
    and exits the process with status -1 if any doctest failed.
    """
    import os
    import doctest
    import sys
    from pyspark.sql import SparkSession
    import pyspark.pandas.internal

    os.chdir(os.environ["SPARK_HOME"])

    doctest_globals = pyspark.pandas.internal.__dict__.copy()
    doctest_globals["ps"] = pyspark.pandas

    builder = SparkSession.builder.master("local[4]").appName("pyspark.pandas.internal tests")
    spark = builder.getOrCreate()

    flags = doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE
    failed, _attempted = doctest.testmod(
        pyspark.pandas.internal,
        globs=doctest_globals,
        optionflags=flags,
    )
    spark.stop()

    if failed:
        sys.exit(-1)


if __name__ == "__main__":
    _test()