snowpark_connect-0.20.2-py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of snowpark-connect might be problematic.

Files changed (879)
  1. snowflake/snowpark_connect/__init__.py +23 -0
  2. snowflake/snowpark_connect/analyze_plan/__init__.py +3 -0
  3. snowflake/snowpark_connect/analyze_plan/map_tree_string.py +38 -0
  4. snowflake/snowpark_connect/column_name_handler.py +735 -0
  5. snowflake/snowpark_connect/config.py +576 -0
  6. snowflake/snowpark_connect/constants.py +47 -0
  7. snowflake/snowpark_connect/control_server.py +52 -0
  8. snowflake/snowpark_connect/dataframe_name_handler.py +54 -0
  9. snowflake/snowpark_connect/date_time_format_mapping.py +399 -0
  10. snowflake/snowpark_connect/empty_dataframe.py +18 -0
  11. snowflake/snowpark_connect/error/__init__.py +11 -0
  12. snowflake/snowpark_connect/error/error_mapping.py +6174 -0
  13. snowflake/snowpark_connect/error/error_utils.py +321 -0
  14. snowflake/snowpark_connect/error/exceptions.py +24 -0
  15. snowflake/snowpark_connect/execute_plan/__init__.py +3 -0
  16. snowflake/snowpark_connect/execute_plan/map_execution_command.py +204 -0
  17. snowflake/snowpark_connect/execute_plan/map_execution_root.py +173 -0
  18. snowflake/snowpark_connect/execute_plan/utils.py +183 -0
  19. snowflake/snowpark_connect/expression/__init__.py +3 -0
  20. snowflake/snowpark_connect/expression/literal.py +90 -0
  21. snowflake/snowpark_connect/expression/map_cast.py +343 -0
  22. snowflake/snowpark_connect/expression/map_expression.py +293 -0
  23. snowflake/snowpark_connect/expression/map_extension.py +104 -0
  24. snowflake/snowpark_connect/expression/map_sql_expression.py +633 -0
  25. snowflake/snowpark_connect/expression/map_udf.py +142 -0
  26. snowflake/snowpark_connect/expression/map_unresolved_attribute.py +241 -0
  27. snowflake/snowpark_connect/expression/map_unresolved_extract_value.py +85 -0
  28. snowflake/snowpark_connect/expression/map_unresolved_function.py +9450 -0
  29. snowflake/snowpark_connect/expression/map_unresolved_star.py +218 -0
  30. snowflake/snowpark_connect/expression/map_update_fields.py +164 -0
  31. snowflake/snowpark_connect/expression/map_window_function.py +258 -0
  32. snowflake/snowpark_connect/expression/typer.py +125 -0
  33. snowflake/snowpark_connect/includes/__init__.py +0 -0
  34. snowflake/snowpark_connect/includes/jars/antlr4-runtime-4.9.3.jar +0 -0
  35. snowflake/snowpark_connect/includes/jars/commons-cli-1.5.0.jar +0 -0
  36. snowflake/snowpark_connect/includes/jars/commons-codec-1.16.1.jar +0 -0
  37. snowflake/snowpark_connect/includes/jars/commons-collections-3.2.2.jar +0 -0
  38. snowflake/snowpark_connect/includes/jars/commons-collections4-4.4.jar +0 -0
  39. snowflake/snowpark_connect/includes/jars/commons-compiler-3.1.9.jar +0 -0
  40. snowflake/snowpark_connect/includes/jars/commons-compress-1.26.0.jar +0 -0
  41. snowflake/snowpark_connect/includes/jars/commons-crypto-1.1.0.jar +0 -0
  42. snowflake/snowpark_connect/includes/jars/commons-dbcp-1.4.jar +0 -0
  43. snowflake/snowpark_connect/includes/jars/commons-io-2.16.1.jar +0 -0
  44. snowflake/snowpark_connect/includes/jars/commons-lang-2.6.jar +0 -0
  45. snowflake/snowpark_connect/includes/jars/commons-lang3-3.12.0.jar +0 -0
  46. snowflake/snowpark_connect/includes/jars/commons-logging-1.1.3.jar +0 -0
  47. snowflake/snowpark_connect/includes/jars/commons-math3-3.6.1.jar +0 -0
  48. snowflake/snowpark_connect/includes/jars/commons-pool-1.5.4.jar +0 -0
  49. snowflake/snowpark_connect/includes/jars/commons-text-1.10.0.jar +0 -0
  50. snowflake/snowpark_connect/includes/jars/hadoop-client-api-3.3.4.jar +0 -0
  51. snowflake/snowpark_connect/includes/jars/jackson-annotations-2.15.2.jar +0 -0
  52. snowflake/snowpark_connect/includes/jars/jackson-core-2.15.2.jar +0 -0
  53. snowflake/snowpark_connect/includes/jars/jackson-core-asl-1.9.13.jar +0 -0
  54. snowflake/snowpark_connect/includes/jars/jackson-databind-2.15.2.jar +0 -0
  55. snowflake/snowpark_connect/includes/jars/jackson-dataformat-yaml-2.15.2.jar +0 -0
  56. snowflake/snowpark_connect/includes/jars/jackson-datatype-jsr310-2.15.2.jar +0 -0
  57. snowflake/snowpark_connect/includes/jars/jackson-mapper-asl-1.9.13.jar +0 -0
  58. snowflake/snowpark_connect/includes/jars/jackson-module-scala_2.12-2.15.2.jar +0 -0
  59. snowflake/snowpark_connect/includes/jars/json4s-ast_2.12-3.7.0-M11.jar +0 -0
  60. snowflake/snowpark_connect/includes/jars/json4s-core_2.12-3.7.0-M11.jar +0 -0
  61. snowflake/snowpark_connect/includes/jars/json4s-jackson_2.12-3.7.0-M11.jar +0 -0
  62. snowflake/snowpark_connect/includes/jars/json4s-scalap_2.12-3.7.0-M11.jar +0 -0
  63. snowflake/snowpark_connect/includes/jars/kryo-shaded-4.0.2.jar +0 -0
  64. snowflake/snowpark_connect/includes/jars/log4j-1.2-api-2.20.0.jar +0 -0
  65. snowflake/snowpark_connect/includes/jars/log4j-api-2.20.0.jar +0 -0
  66. snowflake/snowpark_connect/includes/jars/log4j-core-2.20.0.jar +0 -0
  67. snowflake/snowpark_connect/includes/jars/log4j-slf4j2-impl-2.20.0.jar +0 -0
  68. snowflake/snowpark_connect/includes/jars/paranamer-2.8.jar +0 -0
  69. snowflake/snowpark_connect/includes/jars/scala-collection-compat_2.12-2.7.0.jar +0 -0
  70. snowflake/snowpark_connect/includes/jars/scala-compiler-2.12.18.jar +0 -0
  71. snowflake/snowpark_connect/includes/jars/scala-library-2.12.18.jar +0 -0
  72. snowflake/snowpark_connect/includes/jars/scala-parser-combinators_2.12-2.3.0.jar +0 -0
  73. snowflake/snowpark_connect/includes/jars/scala-reflect-2.12.18.jar +0 -0
  74. snowflake/snowpark_connect/includes/jars/scala-xml_2.12-2.1.0.jar +0 -0
  75. snowflake/snowpark_connect/includes/jars/slf4j-api-2.0.7.jar +0 -0
  76. snowflake/snowpark_connect/includes/jars/spark-catalyst_2.12-3.5.6.jar +0 -0
  77. snowflake/snowpark_connect/includes/jars/spark-common-utils_2.12-3.5.6.jar +0 -0
  78. snowflake/snowpark_connect/includes/jars/spark-core_2.12-3.5.6.jar +0 -0
  79. snowflake/snowpark_connect/includes/jars/spark-graphx_2.12-3.5.6.jar +0 -0
  80. snowflake/snowpark_connect/includes/jars/spark-hive-thriftserver_2.12-3.5.6.jar +0 -0
  81. snowflake/snowpark_connect/includes/jars/spark-hive_2.12-3.5.6.jar +0 -0
  82. snowflake/snowpark_connect/includes/jars/spark-kubernetes_2.12-3.5.6.jar +0 -0
  83. snowflake/snowpark_connect/includes/jars/spark-kvstore_2.12-3.5.6.jar +0 -0
  84. snowflake/snowpark_connect/includes/jars/spark-launcher_2.12-3.5.6.jar +0 -0
  85. snowflake/snowpark_connect/includes/jars/spark-mesos_2.12-3.5.6.jar +0 -0
  86. snowflake/snowpark_connect/includes/jars/spark-mllib-local_2.12-3.5.6.jar +0 -0
  87. snowflake/snowpark_connect/includes/jars/spark-mllib_2.12-3.5.6.jar +0 -0
  88. snowflake/snowpark_connect/includes/jars/spark-network-common_2.12-3.5.6.jar +0 -0
  89. snowflake/snowpark_connect/includes/jars/spark-network-shuffle_2.12-3.5.6.jar +0 -0
  90. snowflake/snowpark_connect/includes/jars/spark-repl_2.12-3.5.6.jar +0 -0
  91. snowflake/snowpark_connect/includes/jars/spark-sketch_2.12-3.5.6.jar +0 -0
  92. snowflake/snowpark_connect/includes/jars/spark-sql-api_2.12-3.5.6.jar +0 -0
  93. snowflake/snowpark_connect/includes/jars/spark-sql_2.12-3.5.6.jar +0 -0
  94. snowflake/snowpark_connect/includes/jars/spark-streaming_2.12-3.5.6.jar +0 -0
  95. snowflake/snowpark_connect/includes/jars/spark-tags_2.12-3.5.6.jar +0 -0
  96. snowflake/snowpark_connect/includes/jars/spark-unsafe_2.12-3.5.6.jar +0 -0
  97. snowflake/snowpark_connect/includes/jars/spark-yarn_2.12-3.5.6.jar +0 -0
  98. snowflake/snowpark_connect/includes/python/__init__.py +21 -0
  99. snowflake/snowpark_connect/includes/python/pyspark/__init__.py +173 -0
  100. snowflake/snowpark_connect/includes/python/pyspark/_globals.py +71 -0
  101. snowflake/snowpark_connect/includes/python/pyspark/_typing.pyi +43 -0
  102. snowflake/snowpark_connect/includes/python/pyspark/accumulators.py +341 -0
  103. snowflake/snowpark_connect/includes/python/pyspark/broadcast.py +383 -0
  104. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/__init__.py +8 -0
  105. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/cloudpickle.py +948 -0
  106. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/cloudpickle_fast.py +844 -0
  107. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/compat.py +18 -0
  108. snowflake/snowpark_connect/includes/python/pyspark/conf.py +276 -0
  109. snowflake/snowpark_connect/includes/python/pyspark/context.py +2601 -0
  110. snowflake/snowpark_connect/includes/python/pyspark/daemon.py +218 -0
  111. snowflake/snowpark_connect/includes/python/pyspark/errors/__init__.py +70 -0
  112. snowflake/snowpark_connect/includes/python/pyspark/errors/error_classes.py +889 -0
  113. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/__init__.py +16 -0
  114. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/base.py +228 -0
  115. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/captured.py +307 -0
  116. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/connect.py +190 -0
  117. snowflake/snowpark_connect/includes/python/pyspark/errors/tests/__init__.py +16 -0
  118. snowflake/snowpark_connect/includes/python/pyspark/errors/tests/test_errors.py +60 -0
  119. snowflake/snowpark_connect/includes/python/pyspark/errors/utils.py +116 -0
  120. snowflake/snowpark_connect/includes/python/pyspark/files.py +165 -0
  121. snowflake/snowpark_connect/includes/python/pyspark/find_spark_home.py +95 -0
  122. snowflake/snowpark_connect/includes/python/pyspark/install.py +203 -0
  123. snowflake/snowpark_connect/includes/python/pyspark/instrumentation_utils.py +190 -0
  124. snowflake/snowpark_connect/includes/python/pyspark/java_gateway.py +248 -0
  125. snowflake/snowpark_connect/includes/python/pyspark/join.py +118 -0
  126. snowflake/snowpark_connect/includes/python/pyspark/ml/__init__.py +71 -0
  127. snowflake/snowpark_connect/includes/python/pyspark/ml/_typing.pyi +84 -0
  128. snowflake/snowpark_connect/includes/python/pyspark/ml/base.py +414 -0
  129. snowflake/snowpark_connect/includes/python/pyspark/ml/classification.py +4332 -0
  130. snowflake/snowpark_connect/includes/python/pyspark/ml/clustering.py +2188 -0
  131. snowflake/snowpark_connect/includes/python/pyspark/ml/common.py +146 -0
  132. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/__init__.py +44 -0
  133. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/base.py +346 -0
  134. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/classification.py +382 -0
  135. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/evaluation.py +291 -0
  136. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/feature.py +258 -0
  137. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/functions.py +77 -0
  138. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/io_utils.py +335 -0
  139. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/pipeline.py +262 -0
  140. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/summarizer.py +120 -0
  141. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/tuning.py +579 -0
  142. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/util.py +173 -0
  143. snowflake/snowpark_connect/includes/python/pyspark/ml/deepspeed/__init__.py +16 -0
  144. snowflake/snowpark_connect/includes/python/pyspark/ml/deepspeed/deepspeed_distributor.py +165 -0
  145. snowflake/snowpark_connect/includes/python/pyspark/ml/deepspeed/tests/test_deepspeed_distributor.py +306 -0
  146. snowflake/snowpark_connect/includes/python/pyspark/ml/dl_util.py +150 -0
  147. snowflake/snowpark_connect/includes/python/pyspark/ml/evaluation.py +1166 -0
  148. snowflake/snowpark_connect/includes/python/pyspark/ml/feature.py +7474 -0
  149. snowflake/snowpark_connect/includes/python/pyspark/ml/fpm.py +543 -0
  150. snowflake/snowpark_connect/includes/python/pyspark/ml/functions.py +842 -0
  151. snowflake/snowpark_connect/includes/python/pyspark/ml/image.py +271 -0
  152. snowflake/snowpark_connect/includes/python/pyspark/ml/linalg/__init__.py +1382 -0
  153. snowflake/snowpark_connect/includes/python/pyspark/ml/model_cache.py +55 -0
  154. snowflake/snowpark_connect/includes/python/pyspark/ml/param/__init__.py +602 -0
  155. snowflake/snowpark_connect/includes/python/pyspark/ml/param/_shared_params_code_gen.py +368 -0
  156. snowflake/snowpark_connect/includes/python/pyspark/ml/param/shared.py +878 -0
  157. snowflake/snowpark_connect/includes/python/pyspark/ml/pipeline.py +451 -0
  158. snowflake/snowpark_connect/includes/python/pyspark/ml/recommendation.py +748 -0
  159. snowflake/snowpark_connect/includes/python/pyspark/ml/regression.py +3335 -0
  160. snowflake/snowpark_connect/includes/python/pyspark/ml/stat.py +523 -0
  161. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/__init__.py +16 -0
  162. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_classification.py +53 -0
  163. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_evaluation.py +50 -0
  164. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_feature.py +43 -0
  165. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_function.py +114 -0
  166. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_pipeline.py +47 -0
  167. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_summarizer.py +43 -0
  168. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_tuning.py +46 -0
  169. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_classification.py +238 -0
  170. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_evaluation.py +194 -0
  171. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_feature.py +156 -0
  172. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_pipeline.py +184 -0
  173. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_summarizer.py +78 -0
  174. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_tuning.py +292 -0
  175. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_parity_torch_data_loader.py +50 -0
  176. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_parity_torch_distributor.py +152 -0
  177. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_algorithms.py +456 -0
  178. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_base.py +96 -0
  179. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_dl_util.py +186 -0
  180. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_evaluation.py +77 -0
  181. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_feature.py +401 -0
  182. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_functions.py +528 -0
  183. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_image.py +82 -0
  184. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_linalg.py +409 -0
  185. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_model_cache.py +55 -0
  186. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_param.py +441 -0
  187. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_persistence.py +546 -0
  188. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_pipeline.py +71 -0
  189. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_stat.py +52 -0
  190. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_training_summary.py +494 -0
  191. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_util.py +85 -0
  192. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_wrapper.py +138 -0
  193. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/__init__.py +16 -0
  194. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_basic.py +151 -0
  195. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_nested.py +97 -0
  196. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_pipeline.py +143 -0
  197. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tuning.py +551 -0
  198. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_basic.py +137 -0
  199. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_nested.py +96 -0
  200. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_pipeline.py +142 -0
  201. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/__init__.py +16 -0
  202. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/data.py +100 -0
  203. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/distributor.py +1133 -0
  204. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/log_communication.py +198 -0
  205. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/__init__.py +16 -0
  206. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_data_loader.py +137 -0
  207. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_distributor.py +561 -0
  208. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_log_communication.py +172 -0
  209. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/torch_run_process_wrapper.py +83 -0
  210. snowflake/snowpark_connect/includes/python/pyspark/ml/tree.py +434 -0
  211. snowflake/snowpark_connect/includes/python/pyspark/ml/tuning.py +1741 -0
  212. snowflake/snowpark_connect/includes/python/pyspark/ml/util.py +749 -0
  213. snowflake/snowpark_connect/includes/python/pyspark/ml/wrapper.py +465 -0
  214. snowflake/snowpark_connect/includes/python/pyspark/mllib/__init__.py +44 -0
  215. snowflake/snowpark_connect/includes/python/pyspark/mllib/_typing.pyi +33 -0
  216. snowflake/snowpark_connect/includes/python/pyspark/mllib/classification.py +989 -0
  217. snowflake/snowpark_connect/includes/python/pyspark/mllib/clustering.py +1318 -0
  218. snowflake/snowpark_connect/includes/python/pyspark/mllib/common.py +174 -0
  219. snowflake/snowpark_connect/includes/python/pyspark/mllib/evaluation.py +691 -0
  220. snowflake/snowpark_connect/includes/python/pyspark/mllib/feature.py +1085 -0
  221. snowflake/snowpark_connect/includes/python/pyspark/mllib/fpm.py +233 -0
  222. snowflake/snowpark_connect/includes/python/pyspark/mllib/linalg/__init__.py +1653 -0
  223. snowflake/snowpark_connect/includes/python/pyspark/mllib/linalg/distributed.py +1662 -0
  224. snowflake/snowpark_connect/includes/python/pyspark/mllib/random.py +698 -0
  225. snowflake/snowpark_connect/includes/python/pyspark/mllib/recommendation.py +389 -0
  226. snowflake/snowpark_connect/includes/python/pyspark/mllib/regression.py +1067 -0
  227. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/KernelDensity.py +59 -0
  228. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/__init__.py +34 -0
  229. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/_statistics.py +409 -0
  230. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/distribution.py +39 -0
  231. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/test.py +86 -0
  232. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/__init__.py +16 -0
  233. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_algorithms.py +353 -0
  234. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_feature.py +192 -0
  235. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_linalg.py +680 -0
  236. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_stat.py +206 -0
  237. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_streaming_algorithms.py +471 -0
  238. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_util.py +108 -0
  239. snowflake/snowpark_connect/includes/python/pyspark/mllib/tree.py +888 -0
  240. snowflake/snowpark_connect/includes/python/pyspark/mllib/util.py +659 -0
  241. snowflake/snowpark_connect/includes/python/pyspark/pandas/__init__.py +165 -0
  242. snowflake/snowpark_connect/includes/python/pyspark/pandas/_typing.py +52 -0
  243. snowflake/snowpark_connect/includes/python/pyspark/pandas/accessors.py +989 -0
  244. snowflake/snowpark_connect/includes/python/pyspark/pandas/base.py +1804 -0
  245. snowflake/snowpark_connect/includes/python/pyspark/pandas/categorical.py +822 -0
  246. snowflake/snowpark_connect/includes/python/pyspark/pandas/config.py +539 -0
  247. snowflake/snowpark_connect/includes/python/pyspark/pandas/correlation.py +262 -0
  248. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/__init__.py +16 -0
  249. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/base.py +519 -0
  250. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/binary_ops.py +98 -0
  251. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/boolean_ops.py +426 -0
  252. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/categorical_ops.py +141 -0
  253. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/complex_ops.py +145 -0
  254. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/date_ops.py +127 -0
  255. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/datetime_ops.py +171 -0
  256. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/null_ops.py +83 -0
  257. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/num_ops.py +588 -0
  258. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/string_ops.py +154 -0
  259. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/timedelta_ops.py +101 -0
  260. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/udt_ops.py +29 -0
  261. snowflake/snowpark_connect/includes/python/pyspark/pandas/datetimes.py +891 -0
  262. snowflake/snowpark_connect/includes/python/pyspark/pandas/exceptions.py +150 -0
  263. snowflake/snowpark_connect/includes/python/pyspark/pandas/extensions.py +388 -0
  264. snowflake/snowpark_connect/includes/python/pyspark/pandas/frame.py +13738 -0
  265. snowflake/snowpark_connect/includes/python/pyspark/pandas/generic.py +3560 -0
  266. snowflake/snowpark_connect/includes/python/pyspark/pandas/groupby.py +4448 -0
  267. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/__init__.py +21 -0
  268. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/base.py +2783 -0
  269. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/category.py +773 -0
  270. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/datetimes.py +843 -0
  271. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/multi.py +1323 -0
  272. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/numeric.py +210 -0
  273. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/timedelta.py +197 -0
  274. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexing.py +1862 -0
  275. snowflake/snowpark_connect/includes/python/pyspark/pandas/internal.py +1680 -0
  276. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/__init__.py +48 -0
  277. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/common.py +76 -0
  278. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/frame.py +63 -0
  279. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/general_functions.py +43 -0
  280. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/groupby.py +93 -0
  281. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/indexes.py +184 -0
  282. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/resample.py +101 -0
  283. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/scalars.py +29 -0
  284. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/series.py +69 -0
  285. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/window.py +168 -0
  286. snowflake/snowpark_connect/includes/python/pyspark/pandas/mlflow.py +238 -0
  287. snowflake/snowpark_connect/includes/python/pyspark/pandas/namespace.py +3807 -0
  288. snowflake/snowpark_connect/includes/python/pyspark/pandas/numpy_compat.py +260 -0
  289. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/__init__.py +17 -0
  290. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/core.py +1213 -0
  291. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/matplotlib.py +928 -0
  292. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/plotly.py +261 -0
  293. snowflake/snowpark_connect/includes/python/pyspark/pandas/resample.py +816 -0
  294. snowflake/snowpark_connect/includes/python/pyspark/pandas/series.py +7440 -0
  295. snowflake/snowpark_connect/includes/python/pyspark/pandas/sql_formatter.py +308 -0
  296. snowflake/snowpark_connect/includes/python/pyspark/pandas/sql_processor.py +394 -0
  297. snowflake/snowpark_connect/includes/python/pyspark/pandas/strings.py +2371 -0
  298. snowflake/snowpark_connect/includes/python/pyspark/pandas/supported_api_gen.py +378 -0
  299. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/__init__.py +16 -0
  300. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/__init__.py +16 -0
  301. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_any_all.py +177 -0
  302. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_apply_func.py +575 -0
  303. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_binary_ops.py +235 -0
  304. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_combine.py +653 -0
  305. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_compute.py +463 -0
  306. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_corrwith.py +86 -0
  307. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_cov.py +151 -0
  308. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_cumulative.py +139 -0
  309. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_describe.py +458 -0
  310. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_eval.py +86 -0
  311. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_melt.py +202 -0
  312. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_missing_data.py +520 -0
  313. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_pivot.py +361 -0
  314. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/__init__.py +16 -0
  315. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/__init__.py +16 -0
  316. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_any_all.py +40 -0
  317. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_apply_func.py +42 -0
  318. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_binary_ops.py +40 -0
  319. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_combine.py +37 -0
  320. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_compute.py +60 -0
  321. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_corrwith.py +40 -0
  322. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_cov.py +40 -0
  323. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_cumulative.py +90 -0
  324. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_describe.py +40 -0
  325. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_eval.py +40 -0
  326. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_melt.py +40 -0
  327. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_missing_data.py +42 -0
  328. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_pivot.py +37 -0
  329. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/__init__.py +16 -0
  330. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_base.py +36 -0
  331. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_binary_ops.py +42 -0
  332. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_boolean_ops.py +47 -0
  333. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_categorical_ops.py +55 -0
  334. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_complex_ops.py +40 -0
  335. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_date_ops.py +47 -0
  336. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_datetime_ops.py +47 -0
  337. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_null_ops.py +42 -0
  338. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_arithmetic.py +43 -0
  339. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_ops.py +47 -0
  340. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_reverse.py +43 -0
  341. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_string_ops.py +47 -0
  342. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_timedelta_ops.py +47 -0
  343. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_udt_ops.py +40 -0
  344. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/testing_utils.py +226 -0
  345. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/__init__.py +16 -0
  346. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_align.py +39 -0
  347. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_basic_slow.py +55 -0
  348. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_cov_corrwith.py +39 -0
  349. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_dot_frame.py +39 -0
  350. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_dot_series.py +39 -0
  351. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_index.py +39 -0
  352. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_series.py +39 -0
  353. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_setitem_frame.py +43 -0
  354. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_setitem_series.py +43 -0
  355. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/__init__.py +16 -0
  356. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_attrs.py +40 -0
  357. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_constructor.py +39 -0
  358. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_conversion.py +42 -0
  359. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_reindexing.py +42 -0
  360. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_reshaping.py +37 -0
  361. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_spark.py +40 -0
  362. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_take.py +42 -0
  363. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_time_series.py +48 -0
  364. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_truncate.py +40 -0
  365. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/__init__.py +16 -0
  366. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_aggregate.py +40 -0
  367. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_apply_func.py +41 -0
  368. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_cumulative.py +67 -0
  369. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_describe.py +40 -0
  370. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_groupby.py +55 -0
  371. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_head_tail.py +40 -0
  372. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_index.py +38 -0
  373. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_missing_data.py +55 -0
  374. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py +39 -0
  375. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_stat.py +38 -0
  376. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/__init__.py +16 -0
  377. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_align.py +40 -0
  378. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_base.py +50 -0
  379. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_category.py +73 -0
  380. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_datetime.py +39 -0
  381. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_indexing.py +40 -0
  382. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_reindex.py +40 -0
  383. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_rename.py +40 -0
  384. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_reset_index.py +48 -0
  385. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_timedelta.py +39 -0
  386. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/io/__init__.py +16 -0
  387. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/io/test_parity_io.py +40 -0
  388. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/__init__.py +16 -0
  389. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot.py +45 -0
  390. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot_matplotlib.py +45 -0
  391. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot_plotly.py +49 -0
  392. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot.py +37 -0
  393. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot_matplotlib.py +53 -0
  394. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot_plotly.py +45 -0
  395. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/__init__.py +16 -0
  396. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_all_any.py +38 -0
  397. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_arg_ops.py +37 -0
  398. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_as_of.py +37 -0
  399. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_as_type.py +38 -0
  400. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_compute.py +37 -0
  401. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_conversion.py +40 -0
  402. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_cumulative.py +40 -0
  403. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_index.py +38 -0
  404. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_missing_data.py +40 -0
  405. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_series.py +37 -0
  406. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_sort.py +38 -0
  407. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_stat.py +38 -0
  408. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_categorical.py +66 -0
  409. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_config.py +37 -0
  410. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_csv.py +37 -0
  411. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_dataframe_conversion.py +42 -0
  412. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_dataframe_spark_io.py +39 -0
  413. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_default_index.py +49 -0
  414. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ewm.py +37 -0
  415. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_expanding.py +39 -0
  416. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_extension.py +49 -0
  417. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_frame_spark.py +53 -0
  418. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_generic_functions.py +43 -0
  419. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_indexing.py +49 -0
  420. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_indexops_spark.py +39 -0
  421. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_internal.py +41 -0
  422. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_namespace.py +39 -0
  423. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_numpy_compat.py +60 -0
  424. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames.py +48 -0
  425. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby.py +39 -0
  426. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_expanding.py +44 -0
  427. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_rolling.py +84 -0
  428. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_repr.py +37 -0
  429. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_resample.py +45 -0
  430. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_reshape.py +39 -0
  431. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_rolling.py +39 -0
  432. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_scalars.py +37 -0
  433. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_conversion.py +39 -0
  434. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_datetime.py +39 -0
  435. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_string.py +39 -0
  436. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_spark_functions.py +39 -0
  437. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_sql.py +43 -0
  438. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_stats.py +37 -0
  439. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_typedef.py +36 -0
  440. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_utils.py +37 -0
  441. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_window.py +39 -0
  442. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/__init__.py +16 -0
  443. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_base.py +107 -0
  444. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_binary_ops.py +224 -0
  445. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py +825 -0
  446. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_categorical_ops.py +562 -0
  447. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_complex_ops.py +368 -0
  448. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_date_ops.py +257 -0
  449. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_datetime_ops.py +260 -0
  450. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_null_ops.py +178 -0
  451. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_arithmetic.py +184 -0
  452. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py +497 -0
  453. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_reverse.py +140 -0
  454. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_string_ops.py +354 -0
  455. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_timedelta_ops.py +219 -0
  456. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_udt_ops.py +192 -0
  457. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/testing_utils.py +228 -0
  458. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/__init__.py +16 -0
  459. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_align.py +118 -0
  460. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_basic_slow.py +198 -0
  461. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_cov_corrwith.py +181 -0
  462. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_dot_frame.py +103 -0
  463. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_dot_series.py +141 -0
  464. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_index.py +109 -0
  465. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_series.py +136 -0
  466. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_setitem_frame.py +125 -0
  467. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_setitem_series.py +217 -0
  468. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/__init__.py +16 -0
  469. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_attrs.py +384 -0
  470. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_constructor.py +598 -0
  471. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_conversion.py +73 -0
  472. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_reindexing.py +869 -0
  473. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_reshaping.py +487 -0
  474. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_spark.py +309 -0
  475. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_take.py +156 -0
  476. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_time_series.py +149 -0
  477. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_truncate.py +163 -0
  478. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/__init__.py +16 -0
  479. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_aggregate.py +311 -0
  480. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_apply_func.py +524 -0
  481. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_cumulative.py +419 -0
  482. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_describe.py +144 -0
  483. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_groupby.py +979 -0
  484. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_head_tail.py +234 -0
  485. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_index.py +206 -0
  486. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_missing_data.py +421 -0
  487. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_split_apply.py +187 -0
  488. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_stat.py +397 -0
  489. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/__init__.py +16 -0
  490. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_align.py +100 -0
  491. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_base.py +2743 -0
  492. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_category.py +484 -0
  493. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_datetime.py +276 -0
  494. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_indexing.py +432 -0
  495. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_reindex.py +310 -0
  496. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_rename.py +257 -0
  497. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_reset_index.py +160 -0
  498. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_timedelta.py +128 -0
  499. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/io/__init__.py +16 -0
  500. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/io/test_io.py +137 -0
  501. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/__init__.py +16 -0
  502. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot.py +170 -0
  503. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot_matplotlib.py +547 -0
  504. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot_plotly.py +285 -0
  505. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot.py +106 -0
  506. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot_matplotlib.py +409 -0
  507. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot_plotly.py +247 -0
  508. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/__init__.py +16 -0
  509. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_all_any.py +105 -0
  510. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_arg_ops.py +197 -0
  511. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_as_of.py +137 -0
  512. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_as_type.py +227 -0
  513. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_compute.py +634 -0
  514. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_conversion.py +88 -0
  515. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_cumulative.py +139 -0
  516. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_index.py +475 -0
  517. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_missing_data.py +265 -0
  518. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_series.py +818 -0
  519. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_sort.py +162 -0
  520. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_stat.py +780 -0
  521. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_categorical.py +741 -0
  522. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_config.py +160 -0
  523. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_csv.py +453 -0
  524. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_dataframe_conversion.py +281 -0
  525. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_dataframe_spark_io.py +487 -0
  526. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_default_index.py +109 -0
  527. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ewm.py +434 -0
  528. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_expanding.py +253 -0
  529. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_extension.py +152 -0
  530. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_frame_spark.py +162 -0
  531. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_generic_functions.py +234 -0
  532. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_indexing.py +1339 -0
  533. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_indexops_spark.py +82 -0
  534. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_internal.py +124 -0
  535. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_namespace.py +638 -0
  536. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_numpy_compat.py +200 -0
  537. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames.py +1355 -0
  538. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py +655 -0
  539. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_expanding.py +113 -0
  540. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_rolling.py +118 -0
  541. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_repr.py +192 -0
  542. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_resample.py +346 -0
  543. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_reshape.py +495 -0
  544. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_rolling.py +263 -0
  545. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_scalars.py +59 -0
  546. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_conversion.py +85 -0
  547. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_datetime.py +364 -0
  548. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_string.py +362 -0
  549. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_spark_functions.py +46 -0
  550. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_sql.py +123 -0
  551. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_stats.py +581 -0
  552. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_typedef.py +447 -0
  553. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_utils.py +301 -0
  554. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_window.py +465 -0
  555. snowflake/snowpark_connect/includes/python/pyspark/pandas/typedef/__init__.py +18 -0
  556. snowflake/snowpark_connect/includes/python/pyspark/pandas/typedef/typehints.py +874 -0
  557. snowflake/snowpark_connect/includes/python/pyspark/pandas/usage_logging/__init__.py +143 -0
  558. snowflake/snowpark_connect/includes/python/pyspark/pandas/usage_logging/usage_logger.py +132 -0
  559. snowflake/snowpark_connect/includes/python/pyspark/pandas/utils.py +1063 -0
  560. snowflake/snowpark_connect/includes/python/pyspark/pandas/window.py +2702 -0
  561. snowflake/snowpark_connect/includes/python/pyspark/profiler.py +489 -0
  562. snowflake/snowpark_connect/includes/python/pyspark/py.typed +1 -0
  563. snowflake/snowpark_connect/includes/python/pyspark/python/pyspark/shell.py +123 -0
  564. snowflake/snowpark_connect/includes/python/pyspark/rdd.py +5518 -0
  565. snowflake/snowpark_connect/includes/python/pyspark/rddsampler.py +115 -0
  566. snowflake/snowpark_connect/includes/python/pyspark/resource/__init__.py +38 -0
  567. snowflake/snowpark_connect/includes/python/pyspark/resource/information.py +69 -0
  568. snowflake/snowpark_connect/includes/python/pyspark/resource/profile.py +317 -0
  569. snowflake/snowpark_connect/includes/python/pyspark/resource/requests.py +539 -0
  570. snowflake/snowpark_connect/includes/python/pyspark/resource/tests/__init__.py +16 -0
  571. snowflake/snowpark_connect/includes/python/pyspark/resource/tests/test_resources.py +83 -0
  572. snowflake/snowpark_connect/includes/python/pyspark/resultiterable.py +45 -0
  573. snowflake/snowpark_connect/includes/python/pyspark/serializers.py +681 -0
  574. snowflake/snowpark_connect/includes/python/pyspark/shell.py +123 -0
  575. snowflake/snowpark_connect/includes/python/pyspark/shuffle.py +854 -0
  576. snowflake/snowpark_connect/includes/python/pyspark/sql/__init__.py +75 -0
  577. snowflake/snowpark_connect/includes/python/pyspark/sql/_typing.pyi +80 -0
  578. snowflake/snowpark_connect/includes/python/pyspark/sql/avro/__init__.py +18 -0
  579. snowflake/snowpark_connect/includes/python/pyspark/sql/avro/functions.py +188 -0
  580. snowflake/snowpark_connect/includes/python/pyspark/sql/catalog.py +1270 -0
  581. snowflake/snowpark_connect/includes/python/pyspark/sql/column.py +1431 -0
  582. snowflake/snowpark_connect/includes/python/pyspark/sql/conf.py +99 -0
  583. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/__init__.py +18 -0
  584. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/_typing.py +90 -0
  585. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/avro/__init__.py +18 -0
  586. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/avro/functions.py +107 -0
  587. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/catalog.py +356 -0
  588. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/__init__.py +22 -0
  589. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/artifact.py +412 -0
  590. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/core.py +1689 -0
  591. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/reattach.py +340 -0
  592. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/column.py +514 -0
  593. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/conf.py +128 -0
  594. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/conversion.py +490 -0
  595. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/dataframe.py +2172 -0
  596. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/expressions.py +1056 -0
  597. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/functions.py +3937 -0
  598. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/group.py +418 -0
  599. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/plan.py +2289 -0
  600. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/__init__.py +25 -0
  601. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/base_pb2.py +203 -0
  602. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/base_pb2.pyi +2718 -0
  603. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/base_pb2_grpc.py +423 -0
  604. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/catalog_pb2.py +109 -0
  605. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/catalog_pb2.pyi +1130 -0
  606. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/commands_pb2.py +141 -0
  607. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/commands_pb2.pyi +1766 -0
  608. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/common_pb2.py +47 -0
  609. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/common_pb2.pyi +123 -0
  610. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/example_plugins_pb2.py +53 -0
  611. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/example_plugins_pb2.pyi +112 -0
  612. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/expressions_pb2.py +107 -0
  613. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/expressions_pb2.pyi +1507 -0
  614. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/relations_pb2.py +195 -0
  615. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/relations_pb2.pyi +3613 -0
  616. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/types_pb2.py +95 -0
  617. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/types_pb2.pyi +980 -0
  618. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/protobuf/__init__.py +18 -0
  619. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/protobuf/functions.py +166 -0
  620. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/readwriter.py +861 -0
  621. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/session.py +952 -0
  622. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/__init__.py +22 -0
  623. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/query.py +295 -0
  624. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/readwriter.py +618 -0
  625. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/__init__.py +18 -0
  626. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/foreach_batch_worker.py +87 -0
  627. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/listener_worker.py +100 -0
  628. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/types.py +301 -0
  629. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/udf.py +296 -0
  630. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/udtf.py +200 -0
  631. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/utils.py +58 -0
  632. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/window.py +266 -0
  633. snowflake/snowpark_connect/includes/python/pyspark/sql/context.py +818 -0
  634. snowflake/snowpark_connect/includes/python/pyspark/sql/dataframe.py +5973 -0
  635. snowflake/snowpark_connect/includes/python/pyspark/sql/functions.py +15889 -0
  636. snowflake/snowpark_connect/includes/python/pyspark/sql/group.py +547 -0
  637. snowflake/snowpark_connect/includes/python/pyspark/sql/observation.py +152 -0
  638. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/__init__.py +21 -0
  639. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/__init__.pyi +344 -0
  640. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/protocols/__init__.pyi +17 -0
  641. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/protocols/frame.pyi +20 -0
  642. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/protocols/series.pyi +20 -0
  643. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/conversion.py +671 -0
  644. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/functions.py +480 -0
  645. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/functions.pyi +132 -0
  646. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/group_ops.py +523 -0
  647. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/map_ops.py +216 -0
  648. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/serializers.py +1019 -0
  649. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/typehints.py +172 -0
  650. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/types.py +972 -0
  651. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/utils.py +86 -0
  652. snowflake/snowpark_connect/includes/python/pyspark/sql/protobuf/__init__.py +18 -0
  653. snowflake/snowpark_connect/includes/python/pyspark/sql/protobuf/functions.py +334 -0
  654. snowflake/snowpark_connect/includes/python/pyspark/sql/readwriter.py +2159 -0
  655. snowflake/snowpark_connect/includes/python/pyspark/sql/session.py +2088 -0
  656. snowflake/snowpark_connect/includes/python/pyspark/sql/sql_formatter.py +84 -0
  657. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/__init__.py +21 -0
  658. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/listener.py +1050 -0
  659. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/query.py +746 -0
  660. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/readwriter.py +1652 -0
  661. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/state.py +288 -0
  662. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/__init__.py +16 -0
  663. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/__init__.py +16 -0
  664. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/__init__.py +16 -0
  665. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/test_artifact.py +420 -0
  666. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/test_client.py +358 -0
  667. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/__init__.py +16 -0
  668. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_foreach.py +36 -0
  669. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_foreach_batch.py +44 -0
  670. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_listener.py +116 -0
  671. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_streaming.py +35 -0
  672. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_basic.py +3612 -0
  673. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_column.py +1042 -0
  674. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_function.py +2381 -0
  675. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_plan.py +1060 -0
  676. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow.py +163 -0
  677. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow_map.py +38 -0
  678. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow_python_udf.py +48 -0
  679. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_catalog.py +36 -0
  680. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_column.py +55 -0
  681. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_conf.py +36 -0
  682. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_dataframe.py +96 -0
  683. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_datasources.py +44 -0
  684. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_errors.py +36 -0
  685. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_functions.py +59 -0
  686. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_group.py +36 -0
  687. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_cogrouped_map.py +59 -0
  688. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_grouped_map.py +74 -0
  689. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_grouped_map_with_state.py +62 -0
  690. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_map.py +58 -0
  691. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf.py +70 -0
  692. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_grouped_agg.py +50 -0
  693. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_scalar.py +68 -0
  694. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_window.py +40 -0
  695. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_readwriter.py +46 -0
  696. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_serde.py +44 -0
  697. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_types.py +100 -0
  698. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_udf.py +100 -0
  699. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_udtf.py +163 -0
  700. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_session.py +181 -0
  701. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_utils.py +42 -0
  702. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/__init__.py +16 -0
  703. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_cogrouped_map.py +623 -0
  704. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_grouped_map.py +869 -0
  705. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_grouped_map_with_state.py +342 -0
  706. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_map.py +436 -0
  707. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf.py +363 -0
  708. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_grouped_agg.py +592 -0
  709. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_scalar.py +1503 -0
  710. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_typehints.py +392 -0
  711. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_typehints_with_future_annotations.py +375 -0
  712. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_window.py +411 -0
  713. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/__init__.py +16 -0
  714. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming.py +401 -0
  715. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_foreach.py +295 -0
  716. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_foreach_batch.py +106 -0
  717. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_listener.py +558 -0
  718. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow.py +1346 -0
  719. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow_map.py +182 -0
  720. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow_python_udf.py +202 -0
  721. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_catalog.py +503 -0
  722. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_column.py +225 -0
  723. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_conf.py +83 -0
  724. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_context.py +201 -0
  725. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_dataframe.py +1931 -0
  726. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_datasources.py +256 -0
  727. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_errors.py +69 -0
  728. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_functions.py +1349 -0
  729. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_group.py +53 -0
  730. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_pandas_sqlmetrics.py +68 -0
  731. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_readwriter.py +283 -0
  732. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_serde.py +155 -0
  733. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_session.py +412 -0
  734. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_types.py +1581 -0
  735. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udf.py +961 -0
  736. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udf_profiler.py +165 -0
  737. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udtf.py +1456 -0
  738. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_utils.py +1686 -0
  739. snowflake/snowpark_connect/includes/python/pyspark/sql/types.py +2558 -0
  740. snowflake/snowpark_connect/includes/python/pyspark/sql/udf.py +714 -0
  741. snowflake/snowpark_connect/includes/python/pyspark/sql/udtf.py +325 -0
  742. snowflake/snowpark_connect/includes/python/pyspark/sql/utils.py +339 -0
  743. snowflake/snowpark_connect/includes/python/pyspark/sql/window.py +492 -0
  744. snowflake/snowpark_connect/includes/python/pyspark/statcounter.py +165 -0
  745. snowflake/snowpark_connect/includes/python/pyspark/status.py +112 -0
  746. snowflake/snowpark_connect/includes/python/pyspark/storagelevel.py +97 -0
  747. snowflake/snowpark_connect/includes/python/pyspark/streaming/__init__.py +22 -0
  748. snowflake/snowpark_connect/includes/python/pyspark/streaming/context.py +471 -0
  749. snowflake/snowpark_connect/includes/python/pyspark/streaming/dstream.py +933 -0
  750. snowflake/snowpark_connect/includes/python/pyspark/streaming/kinesis.py +205 -0
  751. snowflake/snowpark_connect/includes/python/pyspark/streaming/listener.py +83 -0
  752. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/__init__.py +16 -0
  753. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_context.py +184 -0
  754. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_dstream.py +706 -0
  755. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_kinesis.py +118 -0
  756. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_listener.py +160 -0
  757. snowflake/snowpark_connect/includes/python/pyspark/streaming/util.py +168 -0
  758. snowflake/snowpark_connect/includes/python/pyspark/taskcontext.py +502 -0
  759. snowflake/snowpark_connect/includes/python/pyspark/testing/__init__.py +21 -0
  760. snowflake/snowpark_connect/includes/python/pyspark/testing/connectutils.py +199 -0
  761. snowflake/snowpark_connect/includes/python/pyspark/testing/mllibutils.py +30 -0
  762. snowflake/snowpark_connect/includes/python/pyspark/testing/mlutils.py +275 -0
  763. snowflake/snowpark_connect/includes/python/pyspark/testing/objects.py +121 -0
  764. snowflake/snowpark_connect/includes/python/pyspark/testing/pandasutils.py +714 -0
  765. snowflake/snowpark_connect/includes/python/pyspark/testing/sqlutils.py +168 -0
  766. snowflake/snowpark_connect/includes/python/pyspark/testing/streamingutils.py +178 -0
  767. snowflake/snowpark_connect/includes/python/pyspark/testing/utils.py +636 -0
  768. snowflake/snowpark_connect/includes/python/pyspark/tests/__init__.py +16 -0
  769. snowflake/snowpark_connect/includes/python/pyspark/tests/test_appsubmit.py +306 -0
  770. snowflake/snowpark_connect/includes/python/pyspark/tests/test_broadcast.py +196 -0
  771. snowflake/snowpark_connect/includes/python/pyspark/tests/test_conf.py +44 -0
  772. snowflake/snowpark_connect/includes/python/pyspark/tests/test_context.py +346 -0
  773. snowflake/snowpark_connect/includes/python/pyspark/tests/test_daemon.py +89 -0
  774. snowflake/snowpark_connect/includes/python/pyspark/tests/test_install_spark.py +124 -0
  775. snowflake/snowpark_connect/includes/python/pyspark/tests/test_join.py +69 -0
  776. snowflake/snowpark_connect/includes/python/pyspark/tests/test_memory_profiler.py +167 -0
  777. snowflake/snowpark_connect/includes/python/pyspark/tests/test_pin_thread.py +194 -0
  778. snowflake/snowpark_connect/includes/python/pyspark/tests/test_profiler.py +168 -0
  779. snowflake/snowpark_connect/includes/python/pyspark/tests/test_rdd.py +939 -0
  780. snowflake/snowpark_connect/includes/python/pyspark/tests/test_rddbarrier.py +52 -0
  781. snowflake/snowpark_connect/includes/python/pyspark/tests/test_rddsampler.py +66 -0
  782. snowflake/snowpark_connect/includes/python/pyspark/tests/test_readwrite.py +368 -0
  783. snowflake/snowpark_connect/includes/python/pyspark/tests/test_serializers.py +257 -0
  784. snowflake/snowpark_connect/includes/python/pyspark/tests/test_shuffle.py +267 -0
  785. snowflake/snowpark_connect/includes/python/pyspark/tests/test_stage_sched.py +153 -0
  786. snowflake/snowpark_connect/includes/python/pyspark/tests/test_statcounter.py +130 -0
  787. snowflake/snowpark_connect/includes/python/pyspark/tests/test_taskcontext.py +350 -0
  788. snowflake/snowpark_connect/includes/python/pyspark/tests/test_util.py +97 -0
  789. snowflake/snowpark_connect/includes/python/pyspark/tests/test_worker.py +271 -0
  790. snowflake/snowpark_connect/includes/python/pyspark/traceback_utils.py +81 -0
  791. snowflake/snowpark_connect/includes/python/pyspark/util.py +416 -0
  792. snowflake/snowpark_connect/includes/python/pyspark/version.py +19 -0
  793. snowflake/snowpark_connect/includes/python/pyspark/worker.py +1307 -0
  794. snowflake/snowpark_connect/includes/python/pyspark/worker_util.py +46 -0
  795. snowflake/snowpark_connect/proto/__init__.py +10 -0
  796. snowflake/snowpark_connect/proto/control_pb2.py +35 -0
  797. snowflake/snowpark_connect/proto/control_pb2.pyi +38 -0
  798. snowflake/snowpark_connect/proto/control_pb2_grpc.py +183 -0
  799. snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.py +35 -0
  800. snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.pyi +53 -0
  801. snowflake/snowpark_connect/proto/snowflake_rdd_pb2.pyi +39 -0
  802. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.py +47 -0
  803. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.pyi +111 -0
  804. snowflake/snowpark_connect/relation/__init__.py +3 -0
  805. snowflake/snowpark_connect/relation/catalogs/__init__.py +12 -0
  806. snowflake/snowpark_connect/relation/catalogs/abstract_spark_catalog.py +287 -0
  807. snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py +467 -0
  808. snowflake/snowpark_connect/relation/catalogs/utils.py +51 -0
  809. snowflake/snowpark_connect/relation/io_utils.py +76 -0
  810. snowflake/snowpark_connect/relation/map_aggregate.py +322 -0
  811. snowflake/snowpark_connect/relation/map_catalog.py +151 -0
  812. snowflake/snowpark_connect/relation/map_column_ops.py +1068 -0
  813. snowflake/snowpark_connect/relation/map_crosstab.py +48 -0
  814. snowflake/snowpark_connect/relation/map_extension.py +412 -0
  815. snowflake/snowpark_connect/relation/map_join.py +341 -0
  816. snowflake/snowpark_connect/relation/map_local_relation.py +326 -0
  817. snowflake/snowpark_connect/relation/map_map_partitions.py +146 -0
  818. snowflake/snowpark_connect/relation/map_relation.py +253 -0
  819. snowflake/snowpark_connect/relation/map_row_ops.py +716 -0
  820. snowflake/snowpark_connect/relation/map_sample_by.py +35 -0
  821. snowflake/snowpark_connect/relation/map_show_string.py +50 -0
  822. snowflake/snowpark_connect/relation/map_sql.py +1874 -0
  823. snowflake/snowpark_connect/relation/map_stats.py +324 -0
  824. snowflake/snowpark_connect/relation/map_subquery_alias.py +32 -0
  825. snowflake/snowpark_connect/relation/map_udtf.py +288 -0
  826. snowflake/snowpark_connect/relation/read/__init__.py +7 -0
  827. snowflake/snowpark_connect/relation/read/jdbc_read_dbapi.py +668 -0
  828. snowflake/snowpark_connect/relation/read/map_read.py +367 -0
  829. snowflake/snowpark_connect/relation/read/map_read_csv.py +142 -0
  830. snowflake/snowpark_connect/relation/read/map_read_jdbc.py +108 -0
  831. snowflake/snowpark_connect/relation/read/map_read_json.py +344 -0
  832. snowflake/snowpark_connect/relation/read/map_read_parquet.py +194 -0
  833. snowflake/snowpark_connect/relation/read/map_read_socket.py +59 -0
  834. snowflake/snowpark_connect/relation/read/map_read_table.py +109 -0
  835. snowflake/snowpark_connect/relation/read/map_read_text.py +106 -0
  836. snowflake/snowpark_connect/relation/read/reader_config.py +399 -0
  837. snowflake/snowpark_connect/relation/read/utils.py +155 -0
  838. snowflake/snowpark_connect/relation/stage_locator.py +161 -0
  839. snowflake/snowpark_connect/relation/utils.py +219 -0
  840. snowflake/snowpark_connect/relation/write/__init__.py +3 -0
  841. snowflake/snowpark_connect/relation/write/jdbc_write_dbapi.py +339 -0
  842. snowflake/snowpark_connect/relation/write/map_write.py +436 -0
  843. snowflake/snowpark_connect/relation/write/map_write_jdbc.py +48 -0
  844. snowflake/snowpark_connect/resources/java_udfs-1.0-SNAPSHOT.jar +0 -0
  845. snowflake/snowpark_connect/resources_initializer.py +75 -0
  846. snowflake/snowpark_connect/server.py +1136 -0
  847. snowflake/snowpark_connect/start_server.py +32 -0
  848. snowflake/snowpark_connect/tcm.py +8 -0
  849. snowflake/snowpark_connect/type_mapping.py +1003 -0
  850. snowflake/snowpark_connect/typed_column.py +94 -0
  851. snowflake/snowpark_connect/utils/__init__.py +3 -0
  852. snowflake/snowpark_connect/utils/artifacts.py +48 -0
  853. snowflake/snowpark_connect/utils/attribute_handling.py +72 -0
  854. snowflake/snowpark_connect/utils/cache.py +84 -0
  855. snowflake/snowpark_connect/utils/concurrent.py +124 -0
  856. snowflake/snowpark_connect/utils/context.py +390 -0
  857. snowflake/snowpark_connect/utils/describe_query_cache.py +231 -0
  858. snowflake/snowpark_connect/utils/interrupt.py +85 -0
  859. snowflake/snowpark_connect/utils/io_utils.py +35 -0
  860. snowflake/snowpark_connect/utils/pandas_udtf_utils.py +117 -0
  861. snowflake/snowpark_connect/utils/profiling.py +47 -0
  862. snowflake/snowpark_connect/utils/session.py +180 -0
  863. snowflake/snowpark_connect/utils/snowpark_connect_logging.py +38 -0
  864. snowflake/snowpark_connect/utils/telemetry.py +513 -0
  865. snowflake/snowpark_connect/utils/udf_cache.py +392 -0
  866. snowflake/snowpark_connect/utils/udf_helper.py +328 -0
  867. snowflake/snowpark_connect/utils/udf_utils.py +310 -0
  868. snowflake/snowpark_connect/utils/udtf_helper.py +420 -0
  869. snowflake/snowpark_connect/utils/udtf_utils.py +799 -0
  870. snowflake/snowpark_connect/utils/xxhash64.py +247 -0
  871. snowflake/snowpark_connect/version.py +6 -0
  872. snowpark_connect-0.20.2.data/scripts/snowpark-connect +71 -0
  873. snowpark_connect-0.20.2.data/scripts/snowpark-session +11 -0
  874. snowpark_connect-0.20.2.data/scripts/snowpark-submit +354 -0
  875. snowpark_connect-0.20.2.dist-info/METADATA +37 -0
  876. snowpark_connect-0.20.2.dist-info/RECORD +879 -0
  877. snowpark_connect-0.20.2.dist-info/WHEEL +5 -0
  878. snowpark_connect-0.20.2.dist-info/licenses/LICENSE.txt +202 -0
  879. snowpark_connect-0.20.2.dist-info/top_level.txt +1 -0
snowflake/snowpark_connect/includes/python/pyspark/sql/connect/dataframe.py
@@ -0,0 +1,2172 @@
1
+ #
2
+ # Licensed to the Apache Software Foundation (ASF) under one or more
3
+ # contributor license agreements. See the NOTICE file distributed with
4
+ # this work for additional information regarding copyright ownership.
5
+ # The ASF licenses this file to You under the Apache License, Version 2.0
6
+ # (the "License"); you may not use this file except in compliance with
7
+ # the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ #
17
+ from pyspark.errors.exceptions.base import SessionNotSameException
18
+ from pyspark.sql.connect.utils import check_dependencies
19
+
20
+ check_dependencies(__name__)
21
+
22
+ from typing import (
23
+ Any,
24
+ Dict,
25
+ Iterator,
26
+ List,
27
+ Optional,
28
+ Tuple,
29
+ Union,
30
+ Sequence,
31
+ TYPE_CHECKING,
32
+ overload,
33
+ Callable,
34
+ cast,
35
+ Type,
36
+ )
37
+
38
+ import sys
39
+ import random
40
+ import pandas
41
+ import pyarrow as pa
42
+ import json
43
+ import warnings
44
+ from collections.abc import Iterable
45
+
46
+ from pyspark import _NoValue
47
+ from pyspark._globals import _NoValueType
48
+ from pyspark.sql.observation import Observation
49
+ from pyspark.sql.types import Row, StructType
50
+ from pyspark.sql.dataframe import (
51
+ DataFrame as PySparkDataFrame,
52
+ DataFrameNaFunctions as PySparkDataFrameNaFunctions,
53
+ DataFrameStatFunctions as PySparkDataFrameStatFunctions,
54
+ )
55
+
56
+ from pyspark.errors import (
57
+ PySparkTypeError,
58
+ PySparkAttributeError,
59
+ PySparkValueError,
60
+ PySparkNotImplementedError,
61
+ )
62
+ from pyspark.errors.exceptions.connect import SparkConnectException
63
+ from pyspark.rdd import PythonEvalType
64
+ from pyspark.storagelevel import StorageLevel
65
+ import pyspark.sql.connect.plan as plan
66
+ from pyspark.sql.connect.group import GroupedData
67
+ from pyspark.sql.connect.readwriter import DataFrameWriter, DataFrameWriterV2
68
+ from pyspark.sql.connect.streaming.readwriter import DataStreamWriter
69
+ from pyspark.sql.connect.column import Column
70
+ from pyspark.sql.connect.expressions import UnresolvedRegex
71
+ from pyspark.sql.connect.functions import (
72
+ _to_col_with_plan_id,
73
+ _to_col,
74
+ _invoke_function,
75
+ col,
76
+ lit,
77
+ expr as sql_expression,
78
+ )
79
+ from pyspark.sql.pandas.types import from_arrow_schema
80
+
81
+
82
+ if TYPE_CHECKING:
83
+ from pyspark.sql.connect._typing import (
84
+ ColumnOrName,
85
+ LiteralType,
86
+ PrimitiveType,
87
+ OptionalPrimitiveType,
88
+ PandasMapIterFunction,
89
+ ArrowMapIterFunction,
90
+ )
91
+ from pyspark.sql.connect.session import SparkSession
92
+ from pyspark.pandas.frame import DataFrame as PandasOnSparkDataFrame
93
+
94
+
95
+ class DataFrame:
96
+ def __init__(
97
+ self,
98
+ session: "SparkSession",
99
+ schema: Optional[StructType] = None,
100
+ ):
101
+ """Creates a new data frame"""
102
+ self._schema = schema
103
+ self._plan: Optional[plan.LogicalPlan] = None
104
+ self._session: "SparkSession" = session
105
+ # Check whether _repr_html is supported or not, we use it to avoid calling RPC twice
106
+ # by __repr__ and _repr_html_ while eager evaluation opens.
107
+ self._support_repr_html = False
108
+
109
+ def __repr__(self) -> str:
110
+ if not self._support_repr_html:
111
+ (
112
+ repl_eager_eval_enabled,
113
+ repl_eager_eval_max_num_rows,
114
+ repl_eager_eval_truncate,
115
+ ) = self._session._client.get_configs(
116
+ "spark.sql.repl.eagerEval.enabled",
117
+ "spark.sql.repl.eagerEval.maxNumRows",
118
+ "spark.sql.repl.eagerEval.truncate",
119
+ )
120
+ if repl_eager_eval_enabled == "true":
121
+ return self._show_string(
122
+ n=int(cast(str, repl_eager_eval_max_num_rows)),
123
+ truncate=int(cast(str, repl_eager_eval_truncate)),
124
+ vertical=False,
125
+ )
126
+ return "DataFrame[%s]" % (", ".join("%s: %s" % c for c in self.dtypes))
127
+
128
+ def _repr_html_(self) -> Optional[str]:
129
+ if not self._support_repr_html:
130
+ self._support_repr_html = True
131
+ (
132
+ repl_eager_eval_enabled,
133
+ repl_eager_eval_max_num_rows,
134
+ repl_eager_eval_truncate,
135
+ ) = self._session._client.get_configs(
136
+ "spark.sql.repl.eagerEval.enabled",
137
+ "spark.sql.repl.eagerEval.maxNumRows",
138
+ "spark.sql.repl.eagerEval.truncate",
139
+ )
140
+ if repl_eager_eval_enabled == "true":
141
+ pdf = DataFrame.withPlan(
142
+ plan.HtmlString(
143
+ child=self._plan,
144
+ num_rows=int(cast(str, repl_eager_eval_max_num_rows)),
145
+ truncate=int(cast(str, repl_eager_eval_truncate)),
146
+ ),
147
+ session=self._session,
148
+ ).toPandas()
149
+ assert pdf is not None
150
+ return pdf["html_string"][0]
151
+ else:
152
+ return None
153
+
154
+ _repr_html_.__doc__ = PySparkDataFrame._repr_html_.__doc__
155
+
156
+ @property
157
+ def write(self) -> "DataFrameWriter":
158
+ assert self._plan is not None
159
+ return DataFrameWriter(self._plan, self._session)
160
+
161
+ write.__doc__ = PySparkDataFrame.write.__doc__
162
+
163
+ def isEmpty(self) -> bool:
164
+ return len(self.take(1)) == 0
165
+
166
+ isEmpty.__doc__ = PySparkDataFrame.isEmpty.__doc__
167
+
168
+ def select(self, *cols: "ColumnOrName") -> "DataFrame":
169
+ if len(cols) == 1 and isinstance(cols[0], list):
170
+ cols = cols[0]
171
+
172
+ return DataFrame.withPlan(plan.Project(self._plan, *cols), session=self._session)
173
+
174
+ select.__doc__ = PySparkDataFrame.select.__doc__
175
+
176
+ def selectExpr(self, *expr: Union[str, List[str]]) -> "DataFrame":
177
+ sql_expr = []
178
+ if len(expr) == 1 and isinstance(expr[0], list):
179
+ expr = expr[0] # type: ignore[assignment]
180
+ for element in expr:
181
+ if isinstance(element, str):
182
+ sql_expr.append(sql_expression(element))
183
+ else:
184
+ sql_expr.extend([sql_expression(e) for e in element])
185
+
186
+ return DataFrame.withPlan(plan.Project(self._plan, *sql_expr), session=self._session)
187
+
188
+ selectExpr.__doc__ = PySparkDataFrame.selectExpr.__doc__
189
+
190
+ def agg(self, *exprs: Union[Column, Dict[str, str]]) -> "DataFrame":
191
+ if not exprs:
192
+ raise PySparkValueError(
193
+ error_class="CANNOT_BE_EMPTY",
194
+ message_parameters={"item": "exprs"},
195
+ )
196
+
197
+ if len(exprs) == 1 and isinstance(exprs[0], dict):
198
+ measures = [_invoke_function(f, col(e)) for e, f in exprs[0].items()]
199
+ return self.groupBy().agg(*measures)
200
+ else:
201
+ # other expressions
202
+ assert all(isinstance(c, Column) for c in exprs), "all exprs should be Expression"
203
+ exprs = cast(Tuple[Column, ...], exprs)
204
+ return self.groupBy().agg(*exprs)
205
+
206
+ agg.__doc__ = PySparkDataFrame.agg.__doc__
207
+
208
+ def alias(self, alias: str) -> "DataFrame":
209
+ return DataFrame.withPlan(plan.SubqueryAlias(self._plan, alias), session=self._session)
210
+
211
+ alias.__doc__ = PySparkDataFrame.alias.__doc__
212
+
213
+ def colRegex(self, colName: str) -> Column:
214
+ if not isinstance(colName, str):
215
+ raise PySparkTypeError(
216
+ error_class="NOT_STR",
217
+ message_parameters={"arg_name": "colName", "arg_type": type(colName).__name__},
218
+ )
219
+ if self._plan is not None:
220
+ return Column(UnresolvedRegex(colName, self._plan._plan_id))
221
+ else:
222
+ return Column(UnresolvedRegex(colName))
223
+
224
+ colRegex.__doc__ = PySparkDataFrame.colRegex.__doc__
225
+
226
+ @property
227
+ def dtypes(self) -> List[Tuple[str, str]]:
228
+ return [(str(f.name), f.dataType.simpleString()) for f in self.schema.fields]
229
+
230
+ dtypes.__doc__ = PySparkDataFrame.dtypes.__doc__
231
+
232
+ @property
233
+ def columns(self) -> List[str]:
234
+ if self._plan is None:
235
+ return []
236
+
237
+ return self.schema.names
238
+
239
+ columns.__doc__ = PySparkDataFrame.columns.__doc__
240
+
241
+ @property
242
+ def sparkSession(self) -> "SparkSession":
243
+ return self._session
244
+
245
+ sparkSession.__doc__ = PySparkDataFrame.sparkSession.__doc__
246
+
247
+ def count(self) -> int:
248
+ pdd = self.agg(_invoke_function("count", lit(1))).toPandas()
249
+ return pdd.iloc[0, 0]
250
+
251
+ count.__doc__ = PySparkDataFrame.count.__doc__
252
+
253
+ def crossJoin(self, other: "DataFrame") -> "DataFrame":
254
+ if self._plan is None:
255
+ raise Exception("Cannot cartesian join when self._plan is empty.")
256
+ if other._plan is None:
257
+ raise Exception("Cannot cartesian join when other._plan is empty.")
258
+ self.checkSameSparkSession(other)
259
+ return DataFrame.withPlan(
260
+ plan.Join(left=self._plan, right=other._plan, on=None, how="cross"),
261
+ session=self._session,
262
+ )
263
+
264
+ crossJoin.__doc__ = PySparkDataFrame.crossJoin.__doc__
265
+
266
+ def checkSameSparkSession(self, other: "DataFrame") -> None:
267
+ if self._session.session_id != other._session.session_id:
268
+ raise SessionNotSameException(
269
+ error_class="SESSION_NOT_SAME",
270
+ message_parameters={},
271
+ )
272
+
273
+ def coalesce(self, numPartitions: int) -> "DataFrame":
274
+ if not numPartitions > 0:
275
+ raise PySparkValueError(
276
+ error_class="VALUE_NOT_POSITIVE",
277
+ message_parameters={"arg_name": "numPartitions", "arg_value": str(numPartitions)},
278
+ )
279
+ return DataFrame.withPlan(
280
+ plan.Repartition(self._plan, num_partitions=numPartitions, shuffle=False),
281
+ self._session,
282
+ )
283
+
284
+ coalesce.__doc__ = PySparkDataFrame.coalesce.__doc__
285
+
286
+ @overload
287
+ def repartition(self, numPartitions: int, *cols: "ColumnOrName") -> "DataFrame":
288
+ ...
289
+
290
+ @overload
291
+ def repartition(self, *cols: "ColumnOrName") -> "DataFrame":
292
+ ...
293
+
294
+ def repartition( # type: ignore[misc]
295
+ self, numPartitions: Union[int, "ColumnOrName"], *cols: "ColumnOrName"
296
+ ) -> "DataFrame":
297
+ if isinstance(numPartitions, int):
298
+ if not numPartitions > 0:
299
+ raise PySparkValueError(
300
+ error_class="VALUE_NOT_POSITIVE",
301
+ message_parameters={
302
+ "arg_name": "numPartitions",
303
+ "arg_value": str(numPartitions),
304
+ },
305
+ )
306
+ if len(cols) == 0:
307
+ return DataFrame.withPlan(
308
+ plan.Repartition(self._plan, num_partitions=numPartitions, shuffle=True),
309
+ self._session,
310
+ )
311
+ else:
312
+ return DataFrame.withPlan(
313
+ plan.RepartitionByExpression(self._plan, numPartitions, list(cols)),
314
+ self.sparkSession,
315
+ )
316
+ elif isinstance(numPartitions, (str, Column)):
317
+ cols = (numPartitions,) + cols
318
+ return DataFrame.withPlan(
319
+ plan.RepartitionByExpression(self._plan, None, list(cols)),
320
+ self.sparkSession,
321
+ )
322
+ else:
323
+ raise PySparkTypeError(
324
+ error_class="NOT_COLUMN_OR_STR",
325
+ message_parameters={
326
+ "arg_name": "numPartitions",
327
+ "arg_type": type(numPartitions).__name__,
328
+ },
329
+ )
330
+
331
+ repartition.__doc__ = PySparkDataFrame.repartition.__doc__
332
+
333
+ @overload
334
+ def repartitionByRange(self, numPartitions: int, *cols: "ColumnOrName") -> "DataFrame":
335
+ ...
336
+
337
+ @overload
338
+ def repartitionByRange(self, *cols: "ColumnOrName") -> "DataFrame":
339
+ ...
340
+
341
+ def repartitionByRange( # type: ignore[misc]
342
+ self, numPartitions: Union[int, "ColumnOrName"], *cols: "ColumnOrName"
343
+ ) -> "DataFrame":
344
+ def _convert_col(col: "ColumnOrName") -> "ColumnOrName":
345
+ from pyspark.sql.connect.expressions import SortOrder, ColumnReference
346
+
347
+ if isinstance(col, Column):
348
+ if isinstance(col._expr, SortOrder):
349
+ return col
350
+ else:
351
+ return Column(SortOrder(col._expr))
352
+ else:
353
+ return Column(SortOrder(ColumnReference(col)))
354
+
355
+ if isinstance(numPartitions, int):
356
+ if not numPartitions > 0:
357
+ raise PySparkValueError(
358
+ error_class="VALUE_NOT_POSITIVE",
359
+ message_parameters={
360
+ "arg_name": "numPartitions",
361
+ "arg_value": str(numPartitions),
362
+ },
363
+ )
364
+ if len(cols) == 0:
365
+ raise PySparkValueError(
366
+ error_class="CANNOT_BE_EMPTY",
367
+ message_parameters={"item": "cols"},
368
+ )
369
+ else:
370
+ sort = []
371
+ sort.extend([_convert_col(c) for c in cols])
372
+ return DataFrame.withPlan(
373
+ plan.RepartitionByExpression(self._plan, numPartitions, sort),
374
+ self.sparkSession,
375
+ )
376
+ elif isinstance(numPartitions, (str, Column)):
377
+ cols = (numPartitions,) + cols
378
+ sort = []
379
+ sort.extend([_convert_col(c) for c in cols])
380
+ return DataFrame.withPlan(
381
+ plan.RepartitionByExpression(self._plan, None, sort),
382
+ self.sparkSession,
383
+ )
384
+ else:
385
+ raise PySparkTypeError(
386
+ error_class="NOT_COLUMN_OR_INT_OR_STR",
387
+ message_parameters={
388
+ "arg_name": "numPartitions",
389
+ "arg_type": type(numPartitions).__name__,
390
+ },
391
+ )
392
+
393
+ repartitionByRange.__doc__ = PySparkDataFrame.repartitionByRange.__doc__
394
+
395
+ def dropDuplicates(self, subset: Optional[List[str]] = None) -> "DataFrame":
396
+ if subset is not None and not isinstance(subset, (list, tuple)):
397
+ raise PySparkTypeError(
398
+ error_class="NOT_LIST_OR_TUPLE",
399
+ message_parameters={"arg_name": "subset", "arg_type": type(subset).__name__},
400
+ )
401
+
402
+ if subset is None:
403
+ return DataFrame.withPlan(
404
+ plan.Deduplicate(child=self._plan, all_columns_as_keys=True), session=self._session
405
+ )
406
+ else:
407
+ return DataFrame.withPlan(
408
+ plan.Deduplicate(child=self._plan, column_names=subset), session=self._session
409
+ )
410
+
411
+ dropDuplicates.__doc__ = PySparkDataFrame.dropDuplicates.__doc__
412
+
413
+ drop_duplicates = dropDuplicates
414
+
415
+ def dropDuplicatesWithinWatermark(self, subset: Optional[List[str]] = None) -> "DataFrame":
416
+ if subset is not None and not isinstance(subset, (list, tuple)):
417
+ raise PySparkTypeError(
418
+ error_class="NOT_LIST_OR_TUPLE",
419
+ message_parameters={"arg_name": "subset", "arg_type": type(subset).__name__},
420
+ )
421
+
422
+ if subset is None:
423
+ return DataFrame.withPlan(
424
+ plan.Deduplicate(child=self._plan, all_columns_as_keys=True, within_watermark=True),
425
+ session=self._session,
426
+ )
427
+ else:
428
+ return DataFrame.withPlan(
429
+ plan.Deduplicate(child=self._plan, column_names=subset, within_watermark=True),
430
+ session=self._session,
431
+ )
432
+
433
+ dropDuplicatesWithinWatermark.__doc__ = PySparkDataFrame.dropDuplicatesWithinWatermark.__doc__
434
+
435
+ drop_duplicates_within_watermark = dropDuplicatesWithinWatermark
436
+
437
+ def distinct(self) -> "DataFrame":
438
+ return DataFrame.withPlan(
439
+ plan.Deduplicate(child=self._plan, all_columns_as_keys=True), session=self._session
440
+ )
441
+
442
+ distinct.__doc__ = PySparkDataFrame.distinct.__doc__
443
+
444
+ def drop(self, *cols: "ColumnOrName") -> "DataFrame":
445
+ _cols = list(cols)
446
+ if any(not isinstance(c, (str, Column)) for c in _cols):
447
+ raise PySparkTypeError(
448
+ error_class="NOT_COLUMN_OR_STR",
449
+ message_parameters={"arg_name": "cols", "arg_type": type(cols).__name__},
450
+ )
451
+
452
+ return DataFrame.withPlan(
453
+ plan.Drop(
454
+ child=self._plan,
455
+ columns=_cols,
456
+ ),
457
+ session=self._session,
458
+ )
459
+
460
+ drop.__doc__ = PySparkDataFrame.drop.__doc__
461
+
462
+ def filter(self, condition: Union[Column, str]) -> "DataFrame":
463
+ if isinstance(condition, str):
464
+ expr = sql_expression(condition)
465
+ else:
466
+ expr = condition
467
+ return DataFrame.withPlan(plan.Filter(child=self._plan, filter=expr), session=self._session)
468
+
469
+ filter.__doc__ = PySparkDataFrame.filter.__doc__
470
+
471
+ def first(self) -> Optional[Row]:
472
+ return self.head()
473
+
474
+ first.__doc__ = PySparkDataFrame.first.__doc__
475
+
476
+ def groupBy(self, *cols: "ColumnOrName") -> GroupedData:
477
+ if len(cols) == 1 and isinstance(cols[0], list):
478
+ cols = cols[0]
479
+
480
+ _cols: List[Column] = []
481
+ for c in cols:
482
+ if isinstance(c, Column):
483
+ _cols.append(c)
484
+ elif isinstance(c, str):
485
+ _cols.append(self[c])
486
+ else:
487
+ raise PySparkTypeError(
488
+ error_class="NOT_COLUMN_OR_STR",
489
+ message_parameters={"arg_name": "groupBy", "arg_type": type(c).__name__},
490
+ )
491
+
492
+ return GroupedData(df=self, group_type="groupby", grouping_cols=_cols)
493
+
494
+ groupBy.__doc__ = PySparkDataFrame.groupBy.__doc__
495
+
496
+ groupby = groupBy
497
+
498
+ def rollup(self, *cols: "ColumnOrName") -> "GroupedData":
499
+ _cols: List[Column] = []
500
+ for c in cols:
501
+ if isinstance(c, Column):
502
+ _cols.append(c)
503
+ elif isinstance(c, str):
504
+ _cols.append(self[c])
505
+ else:
506
+ raise PySparkTypeError(
507
+ error_class="NOT_COLUMN_OR_STR",
508
+ message_parameters={"arg_name": "rollup", "arg_type": type(c).__name__},
509
+ )
510
+
511
+ return GroupedData(df=self, group_type="rollup", grouping_cols=_cols)
512
+
513
+ rollup.__doc__ = PySparkDataFrame.rollup.__doc__
514
+
515
+ def cube(self, *cols: "ColumnOrName") -> "GroupedData":
516
+ _cols: List[Column] = []
517
+ for c in cols:
518
+ if isinstance(c, Column):
519
+ _cols.append(c)
520
+ elif isinstance(c, str):
521
+ _cols.append(self[c])
522
+ else:
523
+ raise PySparkTypeError(
524
+ error_class="NOT_COLUMN_OR_STR",
525
+ message_parameters={"arg_name": "cube", "arg_type": type(c).__name__},
526
+ )
527
+
528
+ return GroupedData(df=self, group_type="cube", grouping_cols=_cols)
529
+
530
+ cube.__doc__ = PySparkDataFrame.cube.__doc__
531
+
532
+ @overload
533
+ def head(self) -> Optional[Row]:
534
+ ...
535
+
536
+ @overload
537
+ def head(self, n: int) -> List[Row]:
538
+ ...
539
+
540
+ def head(self, n: Optional[int] = None) -> Union[Optional[Row], List[Row]]:
541
+ if n is None:
542
+ rs = self.head(1)
543
+ return rs[0] if rs else None
544
+ return self.take(n)
545
+
546
+ head.__doc__ = PySparkDataFrame.head.__doc__
547
+
548
+ def take(self, num: int) -> List[Row]:
549
+ return self.limit(num).collect()
550
+
551
+ take.__doc__ = PySparkDataFrame.take.__doc__
552
+
553
+ # TODO: extend `on` to also be type List[Column].
554
+ def join(
555
+ self,
556
+ other: "DataFrame",
557
+ on: Optional[Union[str, List[str], Column, List[Column]]] = None,
558
+ how: Optional[str] = None,
559
+ ) -> "DataFrame":
560
+ if self._plan is None:
561
+ raise Exception("Cannot join when self._plan is empty.")
562
+ if other._plan is None:
563
+ raise Exception("Cannot join when other._plan is empty.")
564
+ if how is not None and isinstance(how, str):
565
+ how = how.lower().replace("_", "")
566
+ self.checkSameSparkSession(other)
567
+ return DataFrame.withPlan(
568
+ plan.Join(left=self._plan, right=other._plan, on=on, how=how),
569
+ session=self._session,
570
+ )
571
+
572
+ join.__doc__ = PySparkDataFrame.join.__doc__
573
+
574
+ def limit(self, n: int) -> "DataFrame":
575
+ return DataFrame.withPlan(plan.Limit(child=self._plan, limit=n), session=self._session)
576
+
577
+ limit.__doc__ = PySparkDataFrame.limit.__doc__
578
+
579
+ def tail(self, num: int) -> List[Row]:
580
+ return DataFrame.withPlan(
581
+ plan.Tail(child=self._plan, limit=num), session=self._session
582
+ ).collect()
583
+
584
+ tail.__doc__ = PySparkDataFrame.tail.__doc__
585
+
586
+ def _sort_cols(
587
+ self, cols: Sequence[Union[str, Column, List[Union[str, Column]]]], kwargs: Dict[str, Any]
588
+ ) -> List[Column]:
589
+ """Return a JVM Seq of Columns that describes the sort order"""
590
+ if cols is None:
591
+ raise PySparkValueError(
592
+ error_class="CANNOT_BE_EMPTY",
593
+ message_parameters={"item": "cols"},
594
+ )
595
+
596
+ _cols: List[Column] = []
597
+ if len(cols) == 1 and isinstance(cols[0], list):
598
+ _cols = [_to_col(c) for c in cols[0]]
599
+ else:
600
+ _cols = [_to_col(cast("ColumnOrName", c)) for c in cols]
601
+
602
+ ascending = kwargs.get("ascending", True)
603
+ if isinstance(ascending, (bool, int)):
604
+ if not ascending:
605
+ _cols = [c.desc() for c in _cols]
606
+ elif isinstance(ascending, list):
607
+ _cols = [c if asc else c.desc() for asc, c in zip(ascending, _cols)]
608
+ else:
609
+ raise PySparkTypeError(
610
+ error_class="NOT_BOOL_OR_LIST",
611
+ message_parameters={"arg_name": "ascending", "arg_type": type(ascending).__name__},
612
+ )
613
+
614
+ return _cols
615
+
616
+ def sort(
617
+ self, *cols: Union[str, Column, List[Union[str, Column]]], **kwargs: Any
618
+ ) -> "DataFrame":
619
+ return DataFrame.withPlan(
620
+ plan.Sort(
621
+ self._plan,
622
+ columns=self._sort_cols(cols, kwargs),
623
+ is_global=True,
624
+ ),
625
+ session=self._session,
626
+ )
627
+
628
+ sort.__doc__ = PySparkDataFrame.sort.__doc__
629
+
630
+ orderBy = sort
631
+
632
+ def sortWithinPartitions(
633
+ self, *cols: Union[str, Column, List[Union[str, Column]]], **kwargs: Any
634
+ ) -> "DataFrame":
635
+ return DataFrame.withPlan(
636
+ plan.Sort(
637
+ self._plan,
638
+ columns=self._sort_cols(cols, kwargs),
639
+ is_global=False,
640
+ ),
641
+ session=self._session,
642
+ )
643
+
644
+ sortWithinPartitions.__doc__ = PySparkDataFrame.sortWithinPartitions.__doc__
645
+
646
+ def sample(
647
+ self,
648
+ withReplacement: Optional[Union[float, bool]] = None,
649
+ fraction: Optional[Union[int, float]] = None,
650
+ seed: Optional[int] = None,
651
+ ) -> "DataFrame":
652
+ # For the cases below:
653
+ # sample(True, 0.5 [, seed])
654
+ # sample(True, fraction=0.5 [, seed])
655
+ # sample(withReplacement=False, fraction=0.5 [, seed])
656
+ is_withReplacement_set = type(withReplacement) == bool and isinstance(fraction, float)
657
+
658
+ # For the case below:
659
+ # sample(faction=0.5 [, seed])
660
+ is_withReplacement_omitted_kwargs = withReplacement is None and isinstance(fraction, float)
661
+
662
+ # For the case below:
663
+ # sample(0.5 [, seed])
664
+ is_withReplacement_omitted_args = isinstance(withReplacement, float)
665
+
666
+ if not (
667
+ is_withReplacement_set
668
+ or is_withReplacement_omitted_kwargs
669
+ or is_withReplacement_omitted_args
670
+ ):
671
+ argtypes = [type(arg).__name__ for arg in [withReplacement, fraction, seed]]
672
+ raise PySparkTypeError(
673
+ error_class="NOT_BOOL_OR_FLOAT_OR_INT",
674
+ message_parameters={
675
+ "arg_name": "withReplacement (optional), "
676
+ + "fraction (required) and seed (optional)",
677
+ "arg_type": ", ".join(argtypes),
678
+ },
679
+ )
680
+
681
+ if is_withReplacement_omitted_args:
682
+ if fraction is not None:
683
+ seed = cast(int, fraction)
684
+ fraction = withReplacement
685
+ withReplacement = None
686
+
687
+ if withReplacement is None:
688
+ withReplacement = False
689
+
690
+ seed = int(seed) if seed is not None else random.randint(0, sys.maxsize)
691
+
692
+ return DataFrame.withPlan(
693
+ plan.Sample(
694
+ child=self._plan,
695
+ lower_bound=0.0,
696
+ upper_bound=fraction, # type: ignore[arg-type]
697
+ with_replacement=withReplacement, # type: ignore[arg-type]
698
+ seed=seed,
699
+ ),
700
+ session=self._session,
701
+ )
702
+
703
+ sample.__doc__ = PySparkDataFrame.sample.__doc__
704
+
705
+ def withColumnRenamed(self, existing: str, new: str) -> "DataFrame":
706
+ return self.withColumnsRenamed({existing: new})
707
+
708
+ withColumnRenamed.__doc__ = PySparkDataFrame.withColumnRenamed.__doc__
709
+
710
+ def withColumnsRenamed(self, colsMap: Dict[str, str]) -> "DataFrame":
711
+ if not isinstance(colsMap, dict):
712
+ raise PySparkTypeError(
713
+ error_class="NOT_DICT",
714
+ message_parameters={"arg_name": "colsMap", "arg_type": type(colsMap).__name__},
715
+ )
716
+
717
+ return DataFrame.withPlan(plan.WithColumnsRenamed(self._plan, colsMap), self._session)
718
+
719
+ withColumnsRenamed.__doc__ = PySparkDataFrame.withColumnsRenamed.__doc__
720
+
721
+ def _show_string(
722
+ self, n: int = 20, truncate: Union[bool, int] = True, vertical: bool = False
723
+ ) -> str:
724
+ if not isinstance(n, int) or isinstance(n, bool):
725
+ raise PySparkTypeError(
726
+ error_class="NOT_INT",
727
+ message_parameters={"arg_name": "n", "arg_type": type(n).__name__},
728
+ )
729
+ if not isinstance(vertical, bool):
730
+ raise PySparkTypeError(
731
+ error_class="NOT_BOOL",
732
+ message_parameters={"arg_name": "vertical", "arg_type": type(vertical).__name__},
733
+ )
734
+
735
+ _truncate: int = -1
736
+ if isinstance(truncate, bool) and truncate:
737
+ _truncate = 20
738
+ else:
739
+ try:
740
+ _truncate = int(truncate)
741
+ except ValueError:
742
+ raise PySparkTypeError(
743
+ error_class="NOT_BOOL",
744
+ message_parameters={
745
+ "arg_name": "truncate",
746
+ "arg_type": type(truncate).__name__,
747
+ },
748
+ )
749
+
750
+ pdf = DataFrame.withPlan(
751
+ plan.ShowString(child=self._plan, num_rows=n, truncate=_truncate, vertical=vertical),
752
+ session=self._session,
753
+ ).toPandas()
754
+ assert pdf is not None
755
+ return pdf["show_string"][0]
756
+
757
+ def withColumns(self, colsMap: Dict[str, Column]) -> "DataFrame":
758
+ if not isinstance(colsMap, dict):
759
+ raise PySparkTypeError(
760
+ error_class="NOT_DICT",
761
+ message_parameters={"arg_name": "colsMap", "arg_type": type(colsMap).__name__},
762
+ )
763
+
764
+ names: List[str] = []
765
+ columns: List[Column] = []
766
+ for columnName, column in colsMap.items():
767
+ names.append(columnName)
768
+ columns.append(column)
769
+
770
+ return DataFrame.withPlan(
771
+ plan.WithColumns(
772
+ self._plan,
773
+ columnNames=names,
774
+ columns=columns,
775
+ ),
776
+ session=self._session,
777
+ )
778
+
779
+ withColumns.__doc__ = PySparkDataFrame.withColumns.__doc__
780
+
781
+ def withColumn(self, colName: str, col: Column) -> "DataFrame":
782
+ if not isinstance(col, Column):
783
+ raise PySparkTypeError(
784
+ error_class="NOT_COLUMN",
785
+ message_parameters={"arg_name": "col", "arg_type": type(col).__name__},
786
+ )
787
+ return DataFrame.withPlan(
788
+ plan.WithColumns(
789
+ self._plan,
790
+ columnNames=[colName],
791
+ columns=[col],
792
+ ),
793
+ session=self._session,
794
+ )
795
+
796
+ withColumn.__doc__ = PySparkDataFrame.withColumn.__doc__
797
+
798
+ def withMetadata(self, columnName: str, metadata: Dict[str, Any]) -> "DataFrame":
799
+ if not isinstance(metadata, dict):
800
+ raise PySparkTypeError(
801
+ error_class="NOT_DICT",
802
+ message_parameters={"arg_name": "metadata", "arg_type": type(metadata).__name__},
803
+ )
804
+
805
+ return DataFrame.withPlan(
806
+ plan.WithColumns(
807
+ self._plan,
808
+ columnNames=[columnName],
809
+ columns=[self[columnName]],
810
+ metadata=[json.dumps(metadata)],
811
+ ),
812
+ session=self._session,
813
+ )
814
+
815
+ withMetadata.__doc__ = PySparkDataFrame.withMetadata.__doc__
816
+
817
+ def unpivot(
818
+ self,
819
+ ids: Union["ColumnOrName", List["ColumnOrName"], Tuple["ColumnOrName", ...]],
820
+ values: Optional[Union["ColumnOrName", List["ColumnOrName"], Tuple["ColumnOrName", ...]]],
821
+ variableColumnName: str,
822
+ valueColumnName: str,
823
+ ) -> "DataFrame":
824
+ assert ids is not None, "ids must not be None"
825
+
826
+ def to_jcols(
827
+ cols: Optional[Union["ColumnOrName", List["ColumnOrName"], Tuple["ColumnOrName", ...]]]
828
+ ) -> List["ColumnOrName"]:
829
+ if cols is None:
830
+ lst = []
831
+ elif isinstance(cols, tuple):
832
+ lst = list(cols)
833
+ elif isinstance(cols, list):
834
+ lst = cols
835
+ else:
836
+ lst = [cols]
837
+ return lst
838
+
839
+ return DataFrame.withPlan(
840
+ plan.Unpivot(
841
+ self._plan,
842
+ to_jcols(ids),
843
+ to_jcols(values) if values is not None else None,
844
+ variableColumnName,
845
+ valueColumnName,
846
+ ),
847
+ self._session,
848
+ )
849
+
850
+ unpivot.__doc__ = PySparkDataFrame.unpivot.__doc__
851
+
852
+ melt = unpivot
853
+
854
+ def withWatermark(self, eventTime: str, delayThreshold: str) -> "DataFrame":
855
+ # TODO: reuse error handling code in sql.DataFrame.withWatermark()
856
+ if not eventTime or type(eventTime) is not str:
857
+ raise PySparkTypeError(
858
+ error_class="NOT_STR",
859
+ message_parameters={"arg_name": "eventTime", "arg_type": type(eventTime).__name__},
860
+ )
861
+ if not delayThreshold or type(delayThreshold) is not str:
862
+ raise PySparkTypeError(
863
+ error_class="NOT_STR",
864
+ message_parameters={
865
+ "arg_name": "delayThreshold",
866
+ "arg_type": type(delayThreshold).__name__,
867
+ },
868
+ )
869
+
870
+ return DataFrame.withPlan(
871
+ plan.WithWatermark(
872
+ self._plan,
873
+ event_time=eventTime,
874
+ delay_threshold=delayThreshold,
875
+ ),
876
+ session=self._session,
877
+ )
878
+
879
+ withWatermark.__doc__ = PySparkDataFrame.withWatermark.__doc__
880
+
881
+ def hint(
882
+ self, name: str, *parameters: Union["PrimitiveType", List["PrimitiveType"]]
883
+ ) -> "DataFrame":
884
+ if len(parameters) == 1 and isinstance(parameters[0], list):
885
+ parameters = parameters[0] # type: ignore[assignment]
886
+
887
+ if not isinstance(name, str):
888
+ raise PySparkTypeError(
889
+ error_class="NOT_STR",
890
+ message_parameters={"arg_name": "name", "arg_type": type(name).__name__},
891
+ )
892
+
893
+ allowed_types = (str, list, float, int)
894
+ for p in parameters:
895
+ if not isinstance(p, allowed_types):
896
+ raise PySparkTypeError(
897
+ error_class="INVALID_ITEM_FOR_CONTAINER",
898
+ message_parameters={
899
+ "arg_name": "parameters",
900
+ "allowed_types": ", ".join([t.__name__ for t in allowed_types]),
901
+ "item_type": type(p).__name__,
902
+ },
903
+ )
904
+
905
+ return DataFrame.withPlan(
906
+ plan.Hint(self._plan, name, list(parameters)),
907
+ session=self._session,
908
+ )
909
+
910
+ hint.__doc__ = PySparkDataFrame.hint.__doc__
911
+
912
+    def randomSplit(
+        self,
+        weights: List[float],
+        seed: Optional[int] = None,
+    ) -> List["DataFrame"]:
+        for w in weights:
+            if w < 0.0:
+                raise PySparkValueError(
+                    error_class="VALUE_NOT_POSITIVE",
+                    message_parameters={"arg_name": "weights", "arg_value": str(w)},
+                )
+        seed = seed if seed is not None else random.randint(0, sys.maxsize)
+        total = sum(weights)
+        if total <= 0:
+            raise PySparkValueError(
+                error_class="VALUE_NOT_POSITIVE",
+                message_parameters={"arg_name": "sum(weights)", "arg_value": str(total)},
+            )
+        proportions = list(map(lambda x: x / total, weights))
+        normalizedCumWeights = [0.0]
+        for v in proportions:
+            normalizedCumWeights.append(normalizedCumWeights[-1] + v)
+        j = 1
+        length = len(normalizedCumWeights)
+        splits = []
+        while j < length:
+            lowerBound = normalizedCumWeights[j - 1]
+            upperBound = normalizedCumWeights[j]
+            samplePlan = DataFrame.withPlan(
+                plan.Sample(
+                    child=self._plan,
+                    lower_bound=lowerBound,
+                    upper_bound=upperBound,
+                    with_replacement=False,
+                    seed=int(seed),
+                    deterministic_order=True,
+                ),
+                session=self._session,
+            )
+            splits.append(samplePlan)
+            j += 1
+
+        return splits
+
+    randomSplit.__doc__ = PySparkDataFrame.randomSplit.__doc__
+
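+    # Worked example (illustrative): weights [1.0, 3.0] normalize to cumulative
+    # bounds [0.0, 0.25, 1.0], so the two splits sample the deterministic-order
+    # intervals [0.0, 0.25) and [0.25, 1.0) with a shared seed.
+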
+    def observe(
+        self,
+        observation: Union["Observation", str],
+        *exprs: Column,
+    ) -> "DataFrame":
+        if len(exprs) == 0:
+            raise PySparkValueError(
+                error_class="CANNOT_BE_EMPTY",
+                message_parameters={"item": "exprs"},
+            )
+        if not all(isinstance(c, Column) for c in exprs):
+            raise PySparkTypeError(
+                error_class="NOT_LIST_OF_COLUMN",
+                message_parameters={"arg_name": "exprs"},
+            )
+
+        if isinstance(observation, Observation):
+            return DataFrame.withPlan(
+                plan.CollectMetrics(self._plan, str(observation._name), list(exprs)),
+                self._session,
+            )
+        elif isinstance(observation, str):
+            return DataFrame.withPlan(
+                plan.CollectMetrics(self._plan, observation, list(exprs)),
+                self._session,
+            )
+        else:
+            raise PySparkTypeError(
+                error_class="NOT_OBSERVATION_OR_STR",
+                message_parameters={
+                    "arg_name": "observation",
+                    "arg_type": type(observation).__name__,
+                },
+            )
+
+    observe.__doc__ = PySparkDataFrame.observe.__doc__
+
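+    # Example (illustrative, assuming `count` and `lit` from
+    # pyspark.sql.functions): df.observe("stats", count(lit(1)).alias("rows"))
+    # attaches a metrics collector named "stats" without changing the rows; an
+    # Observation instance contributes its name the same way.
+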
+    def show(self, n: int = 20, truncate: Union[bool, int] = True, vertical: bool = False) -> None:
+        print(self._show_string(n, truncate, vertical))
+
+    show.__doc__ = PySparkDataFrame.show.__doc__
+
+    def union(self, other: "DataFrame") -> "DataFrame":
+        return self.unionAll(other)
+
+    union.__doc__ = PySparkDataFrame.union.__doc__
+
+    def unionAll(self, other: "DataFrame") -> "DataFrame":
+        if other._plan is None:
+            raise PySparkValueError(
+                error_class="MISSING_VALID_PLAN",
+                message_parameters={"operator": "Union"},
+            )
+        self.checkSameSparkSession(other)
+        return DataFrame.withPlan(
+            plan.SetOperation(self._plan, other._plan, "union", is_all=True), session=self._session
+        )
+
+    unionAll.__doc__ = PySparkDataFrame.unionAll.__doc__
+
+    def unionByName(self, other: "DataFrame", allowMissingColumns: bool = False) -> "DataFrame":
+        if other._plan is None:
+            raise PySparkValueError(
+                error_class="MISSING_VALID_PLAN",
+                message_parameters={"operator": "UnionByName"},
+            )
+        self.checkSameSparkSession(other)
+        return DataFrame.withPlan(
+            plan.SetOperation(
+                self._plan,
+                other._plan,
+                "union",
+                by_name=True,
+                allow_missing_columns=allowMissingColumns,
+            ),
+            session=self._session,
+        )
+
+    unionByName.__doc__ = PySparkDataFrame.unionByName.__doc__
+
+    def subtract(self, other: "DataFrame") -> "DataFrame":
+        return DataFrame.withPlan(
+            plan.SetOperation(self._plan, other._plan, "except", is_all=False),
+            session=self._session,
+        )
+
+    subtract.__doc__ = PySparkDataFrame.subtract.__doc__
+
+    def exceptAll(self, other: "DataFrame") -> "DataFrame":
+        return DataFrame.withPlan(
+            plan.SetOperation(self._plan, other._plan, "except", is_all=True), session=self._session
+        )
+
+    exceptAll.__doc__ = PySparkDataFrame.exceptAll.__doc__
+
+    def intersect(self, other: "DataFrame") -> "DataFrame":
+        return DataFrame.withPlan(
+            plan.SetOperation(self._plan, other._plan, "intersect", is_all=False),
+            session=self._session,
+        )
+
+    intersect.__doc__ = PySparkDataFrame.intersect.__doc__
+
+    def intersectAll(self, other: "DataFrame") -> "DataFrame":
+        return DataFrame.withPlan(
+            plan.SetOperation(self._plan, other._plan, "intersect", is_all=True),
+            session=self._session,
+        )
+
+    intersectAll.__doc__ = PySparkDataFrame.intersectAll.__doc__
+
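+    # Note (illustrative): union()/unionAll() match columns by position, while
+    # unionByName() matches by name; with allowMissingColumns=True, columns
+    # absent from one side are filled with nulls rather than raising an error.
+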
+    def where(self, condition: Union[Column, str]) -> "DataFrame":
+        if not isinstance(condition, (str, Column)):
+            raise PySparkTypeError(
+                error_class="NOT_COLUMN_OR_STR",
+                message_parameters={"arg_name": "condition", "arg_type": type(condition).__name__},
+            )
+        return self.filter(condition)
+
+    where.__doc__ = PySparkDataFrame.where.__doc__
+
+    @property
+    def na(self) -> "DataFrameNaFunctions":
+        return DataFrameNaFunctions(self)
+
+    na.__doc__ = PySparkDataFrame.na.__doc__
+
+    def fillna(
+        self,
+        value: Union["LiteralType", Dict[str, "LiteralType"]],
+        subset: Optional[Union[str, Tuple[str, ...], List[str]]] = None,
+    ) -> "DataFrame":
+        if not isinstance(value, (float, int, str, bool, dict)):
+            raise PySparkTypeError(
+                error_class="NOT_BOOL_OR_DICT_OR_FLOAT_OR_INT_OR_STR",
+                message_parameters={"arg_name": "value", "arg_type": type(value).__name__},
+            )
+        if isinstance(value, dict):
+            if len(value) == 0:
+                raise PySparkValueError(
+                    error_class="CANNOT_BE_EMPTY",
+                    message_parameters={"item": "value"},
+                )
+            for c, v in value.items():
+                if not isinstance(c, str):
+                    raise PySparkTypeError(
+                        error_class="NOT_STR",
+                        message_parameters={
+                            "arg_name": "key type of dict",
+                            "arg_type": type(c).__name__,
+                        },
+                    )
+                if not isinstance(v, (bool, int, float, str)):
+                    raise PySparkTypeError(
+                        error_class="NOT_BOOL_OR_FLOAT_OR_INT_OR_STR",
+                        message_parameters={
+                            "arg_name": "value type of dict",
+                            "arg_type": type(v).__name__,
+                        },
+                    )
+
+        _cols: List[str] = []
+        if subset is not None:
+            if isinstance(subset, str):
+                _cols = [subset]
+            elif isinstance(subset, (tuple, list)):
+                for c in subset:
+                    if not isinstance(c, str):
+                        raise PySparkTypeError(
+                            error_class="NOT_LIST_OR_STR_OR_TUPLE",
+                            message_parameters={"arg_name": "cols", "arg_type": type(c).__name__},
+                        )
+                _cols = list(subset)
+            else:
+                raise PySparkTypeError(
+                    error_class="NOT_LIST_OR_TUPLE",
+                    message_parameters={"arg_name": "subset", "arg_type": type(subset).__name__},
+                )
+
+        if isinstance(value, dict):
+            _cols = list(value.keys())
+            _values = [value[c] for c in _cols]
+        else:
+            _values = [value]
+
+        return DataFrame.withPlan(
+            plan.NAFill(child=self._plan, cols=_cols, values=_values),
+            session=self._session,
+        )
+
+    fillna.__doc__ = PySparkDataFrame.fillna.__doc__
+
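+    # Example (illustrative): df.fillna(0, subset=["age"]) fills nulls in one
+    # column; df.fillna({"age": 0, "name": "unknown"}) fills per column and, as
+    # the code above shows, a dict value overrides any `subset` argument.
+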
+    def dropna(
+        self,
+        how: str = "any",
+        thresh: Optional[int] = None,
+        subset: Optional[Union[str, Tuple[str, ...], List[str]]] = None,
+    ) -> "DataFrame":
+        min_non_nulls: Optional[int] = None
+
+        if how is not None:
+            if not isinstance(how, str):
+                raise PySparkTypeError(
+                    error_class="NOT_STR",
+                    message_parameters={"arg_name": "how", "arg_type": type(how).__name__},
+                )
+            if how == "all":
+                min_non_nulls = 1
+            elif how == "any":
+                min_non_nulls = None
+            else:
+                raise PySparkValueError(
+                    error_class="VALUE_NOT_ANY_OR_ALL",
+                    message_parameters={"arg_name": "how", "arg_value": str(how)},
+                )
+
+        if thresh is not None:
+            if not isinstance(thresh, int):
+                raise PySparkTypeError(
+                    error_class="NOT_INT",
+                    message_parameters={"arg_name": "thresh", "arg_type": type(thresh).__name__},
+                )
+
+            # 'thresh' overwrites 'how'
+            min_non_nulls = thresh
+
+        _cols: List[str] = []
+        if subset is not None:
+            if isinstance(subset, str):
+                _cols = [subset]
+            elif isinstance(subset, (tuple, list)):
+                for c in subset:
+                    if not isinstance(c, str):
+                        raise PySparkTypeError(
+                            error_class="NOT_LIST_OR_STR_OR_TUPLE",
+                            message_parameters={"arg_name": "cols", "arg_type": type(c).__name__},
+                        )
+                _cols = list(subset)
+            else:
+                raise PySparkTypeError(
+                    error_class="NOT_LIST_OR_STR_OR_TUPLE",
+                    message_parameters={"arg_name": "subset", "arg_type": type(subset).__name__},
+                )
+
+        return DataFrame.withPlan(
+            plan.NADrop(child=self._plan, cols=_cols, min_non_nulls=min_non_nulls),
+            session=self._session,
+        )
+
+    dropna.__doc__ = PySparkDataFrame.dropna.__doc__
+
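+    # Example (illustrative): df.dropna(how="all") keeps rows with at least one
+    # non-null value, while df.dropna(thresh=2) keeps rows with at least two;
+    # when both are given, thresh wins, as noted in the code above.
+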
+    def replace(
+        self,
+        to_replace: Union[
+            "LiteralType", List["LiteralType"], Dict["LiteralType", "OptionalPrimitiveType"]
+        ],
+        value: Optional[
+            Union["OptionalPrimitiveType", List["OptionalPrimitiveType"], _NoValueType]
+        ] = _NoValue,
+        subset: Optional[List[str]] = None,
+    ) -> "DataFrame":
+        if value is _NoValue:
+            if isinstance(to_replace, dict):
+                value = None
+            else:
+                raise PySparkTypeError(
+                    error_class="ARGUMENT_REQUIRED",
+                    message_parameters={"arg_name": "value", "condition": "`to_replace` is dict"},
+                )
+
+        # Helper functions
+        def all_of(types: Union[Type, Tuple[Type, ...]]) -> Callable[[Iterable], bool]:
+            """Given a type or tuple of types and a sequence of xs
+            check if each x is instance of type(s)
+
+            >>> all_of(bool)([True, False])
+            True
+            >>> all_of(str)(["a", 1])
+            False
+            """
+
+            def all_of_(xs: Iterable) -> bool:
+                return all(isinstance(x, types) for x in xs)
+
+            return all_of_
+
+        all_of_bool = all_of(bool)
+        all_of_str = all_of(str)
+        all_of_numeric = all_of((float, int))
+
+        # Validate input types
+        valid_types = (bool, float, int, str, list, tuple)
+        if not isinstance(to_replace, valid_types + (dict,)):
+            raise PySparkTypeError(
+                error_class="NOT_BOOL_OR_DICT_OR_FLOAT_OR_INT_OR_LIST_OR_STR_OR_TUPLE",
+                message_parameters={
+                    "arg_name": "to_replace",
+                    "arg_type": type(to_replace).__name__,
+                },
+            )
+
+        if (
+            not isinstance(value, valid_types)
+            and value is not None
+            and not isinstance(to_replace, dict)
+        ):
+            raise PySparkTypeError(
+                error_class="NOT_BOOL_OR_FLOAT_OR_INT_OR_LIST_OR_NONE_OR_STR_OR_TUPLE",
+                message_parameters={"arg_name": "value", "arg_type": type(value).__name__},
+            )
+
+        if isinstance(to_replace, (list, tuple)) and isinstance(value, (list, tuple)):
+            if len(to_replace) != len(value):
+                raise PySparkValueError(
+                    error_class="LENGTH_SHOULD_BE_THE_SAME",
+                    message_parameters={
+                        "arg1": "to_replace",
+                        "arg2": "value",
+                        "arg1_length": str(len(to_replace)),
+                        "arg2_length": str(len(value)),
+                    },
+                )
+
+        if not (subset is None or isinstance(subset, (list, tuple, str))):
+            raise PySparkTypeError(
+                error_class="NOT_LIST_OR_STR_OR_TUPLE",
+                message_parameters={"arg_name": "subset", "arg_type": type(subset).__name__},
+            )
+
+        # Reshape input arguments if necessary
+        if isinstance(to_replace, (float, int, str)):
+            to_replace = [to_replace]
+
+        if isinstance(to_replace, dict):
+            rep_dict = to_replace
+            if value is not None:
+                warnings.warn("to_replace is a dict and value is not None. value will be ignored.")
+        else:
+            if isinstance(value, (float, int, str)) or value is None:
+                value = [value for _ in range(len(to_replace))]
+            rep_dict = dict(zip(to_replace, cast("Iterable[Optional[Union[float, str]]]", value)))
+
+        if isinstance(subset, str):
+            subset = [subset]
+
+        # Verify we were not passed in mixed type generics.
+        if not any(
+            all_of_type(rep_dict.keys())
+            and all_of_type(x for x in rep_dict.values() if x is not None)
+            for all_of_type in [all_of_bool, all_of_str, all_of_numeric]
+        ):
+            raise PySparkValueError(
+                error_class="MIXED_TYPE_REPLACEMENT",
+                message_parameters={},
+            )
+
+        return DataFrame.withPlan(
+            plan.NAReplace(child=self._plan, cols=subset, replacements=rep_dict),
+            session=self._session,
+        )
+
+    replace.__doc__ = PySparkDataFrame.replace.__doc__
+
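+    # Example (illustrative): df.replace(10, 20) rewrites a single literal,
+    # df.replace([1, 2], [10, 20]) maps pairwise, and df.replace({1: 10, 2: 20})
+    # passes the mapping directly; keys and values must share one type family.
+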
+    @property
+    def stat(self) -> "DataFrameStatFunctions":
+        return DataFrameStatFunctions(self)
+
+    stat.__doc__ = PySparkDataFrame.stat.__doc__
+
+    def summary(self, *statistics: str) -> "DataFrame":
+        _statistics: List[str] = list(statistics)
+        for s in _statistics:
+            if not isinstance(s, str):
+                raise PySparkTypeError(
+                    error_class="NOT_LIST_OF_STR",
+                    message_parameters={"arg_name": "statistics", "arg_type": type(s).__name__},
+                )
+        return DataFrame.withPlan(
+            plan.StatSummary(child=self._plan, statistics=_statistics),
+            session=self._session,
+        )
+
+    summary.__doc__ = PySparkDataFrame.summary.__doc__
+
+    def describe(self, *cols: Union[str, List[str]]) -> "DataFrame":
+        if len(cols) == 1 and isinstance(cols[0], list):
+            cols = cols[0]  # type: ignore[assignment]
+
+        _cols = []
+        for column in cols:
+            if isinstance(column, str):
+                _cols.append(column)
+            else:
+                _cols.extend([s for s in column])
+        return DataFrame.withPlan(
+            plan.StatDescribe(child=self._plan, cols=_cols),
+            session=self._session,
+        )
+
+    describe.__doc__ = PySparkDataFrame.describe.__doc__
+
+    def cov(self, col1: str, col2: str) -> float:
+        if not isinstance(col1, str):
+            raise PySparkTypeError(
+                error_class="NOT_STR",
+                message_parameters={"arg_name": "col1", "arg_type": type(col1).__name__},
+            )
+        if not isinstance(col2, str):
+            raise PySparkTypeError(
+                error_class="NOT_STR",
+                message_parameters={"arg_name": "col2", "arg_type": type(col2).__name__},
+            )
+        pdf = DataFrame.withPlan(
+            plan.StatCov(child=self._plan, col1=col1, col2=col2),
+            session=self._session,
+        ).toPandas()
+
+        assert pdf is not None
+        return pdf["cov"][0]
+
+    cov.__doc__ = PySparkDataFrame.cov.__doc__
+
+    def corr(self, col1: str, col2: str, method: Optional[str] = None) -> float:
+        if not isinstance(col1, str):
+            raise PySparkTypeError(
+                error_class="NOT_STR",
+                message_parameters={"arg_name": "col1", "arg_type": type(col1).__name__},
+            )
+        if not isinstance(col2, str):
+            raise PySparkTypeError(
+                error_class="NOT_STR",
+                message_parameters={"arg_name": "col2", "arg_type": type(col2).__name__},
+            )
+        if not method:
+            method = "pearson"
+        if not method == "pearson":
+            raise PySparkValueError(
+                error_class="VALUE_NOT_PEARSON",
+                message_parameters={"arg_name": "method", "arg_value": method},
+            )
+        pdf = DataFrame.withPlan(
+            plan.StatCorr(child=self._plan, col1=col1, col2=col2, method=method),
+            session=self._session,
+        ).toPandas()
+
+        assert pdf is not None
+        return pdf["corr"][0]
+
+    corr.__doc__ = PySparkDataFrame.corr.__doc__
+
+    def approxQuantile(
+        self,
+        col: Union[str, List[str], Tuple[str]],
+        probabilities: Union[List[float], Tuple[float]],
+        relativeError: float,
+    ) -> Union[List[float], List[List[float]]]:
+        if not isinstance(col, (str, list, tuple)):
+            raise PySparkTypeError(
+                error_class="NOT_LIST_OR_STR_OR_TUPLE",
+                message_parameters={"arg_name": "col", "arg_type": type(col).__name__},
+            )
+
+        isStr = isinstance(col, str)
+
+        if isinstance(col, tuple):
+            col = list(col)
+        elif isStr:
+            col = [cast(str, col)]
+
+        for c in col:
+            if not isinstance(c, str):
+                raise PySparkTypeError(
+                    error_class="NOT_LIST_OF_STR",
+                    message_parameters={"arg_name": "columns", "arg_type": type(c).__name__},
+                )
+
+        if not isinstance(probabilities, (list, tuple)):
+            raise PySparkTypeError(
+                error_class="NOT_LIST_OR_TUPLE",
+                message_parameters={
+                    "arg_name": "probabilities",
+                    "arg_type": type(probabilities).__name__,
+                },
+            )
+        if isinstance(probabilities, tuple):
+            probabilities = list(probabilities)
+        for p in probabilities:
+            if not isinstance(p, (float, int)) or p < 0 or p > 1:
+                raise PySparkTypeError(
+                    error_class="NOT_LIST_OF_FLOAT_OR_INT",
+                    message_parameters={
+                        "arg_name": "probabilities",
+                        "arg_type": type(p).__name__,
+                    },
+                )
+
+        if not isinstance(relativeError, (float, int)):
+            raise PySparkTypeError(
+                error_class="NOT_FLOAT_OR_INT",
+                message_parameters={
+                    "arg_name": "relativeError",
+                    "arg_type": type(relativeError).__name__,
+                },
+            )
+        if relativeError < 0:
+            raise PySparkValueError(
+                error_class="NEGATIVE_VALUE",
+                message_parameters={
+                    "arg_name": "relativeError",
+                    "arg_value": str(relativeError),
+                },
+            )
+        relativeError = float(relativeError)
+        pdf = DataFrame.withPlan(
+            plan.StatApproxQuantile(
+                child=self._plan,
+                cols=list(col),
+                probabilities=probabilities,
+                relativeError=relativeError,
+            ),
+            session=self._session,
+        ).toPandas()
+
+        assert pdf is not None
+        jaq = pdf["approx_quantile"][0]
+        jaq_list = [list(j) for j in jaq]
+        return jaq_list[0] if isStr else jaq_list
+
+    approxQuantile.__doc__ = PySparkDataFrame.approxQuantile.__doc__
+
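+    # Example (illustrative): df.approxQuantile("age", [0.25, 0.5, 0.75], 0.05)
+    # returns a flat list of three quantiles, while passing ["age", "height"]
+    # returns one such list per column, per the isStr branch above.
+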
+    def crosstab(self, col1: str, col2: str) -> "DataFrame":
+        if not isinstance(col1, str):
+            raise PySparkTypeError(
+                error_class="NOT_STR",
+                message_parameters={"arg_name": "col1", "arg_type": type(col1).__name__},
+            )
+        if not isinstance(col2, str):
+            raise PySparkTypeError(
+                error_class="NOT_STR",
+                message_parameters={"arg_name": "col2", "arg_type": type(col2).__name__},
+            )
+        return DataFrame.withPlan(
+            plan.StatCrosstab(child=self._plan, col1=col1, col2=col2),
+            session=self._session,
+        )
+
+    crosstab.__doc__ = PySparkDataFrame.crosstab.__doc__
+
+    def freqItems(
+        self, cols: Union[List[str], Tuple[str]], support: Optional[float] = None
+    ) -> "DataFrame":
+        if isinstance(cols, tuple):
+            cols = list(cols)
+        if not isinstance(cols, list):
+            raise PySparkTypeError(
+                error_class="NOT_LIST_OR_TUPLE",
+                message_parameters={"arg_name": "cols", "arg_type": type(cols).__name__},
+            )
+        if not support:
+            support = 0.01
+        return DataFrame.withPlan(
+            plan.StatFreqItems(child=self._plan, cols=cols, support=support),
+            session=self._session,
+        )
+
+    freqItems.__doc__ = PySparkDataFrame.freqItems.__doc__
+
+    def sampleBy(
+        self, col: "ColumnOrName", fractions: Dict[Any, float], seed: Optional[int] = None
+    ) -> "DataFrame":
+        from pyspark.sql.connect.expressions import ColumnReference
+
+        if isinstance(col, str):
+            col = Column(ColumnReference(col))
+        elif not isinstance(col, Column):
+            raise PySparkTypeError(
+                error_class="NOT_COLUMN_OR_STR",
+                message_parameters={"arg_name": "col", "arg_type": type(col).__name__},
+            )
+        if not isinstance(fractions, dict):
+            raise PySparkTypeError(
+                error_class="NOT_DICT",
+                message_parameters={"arg_name": "fractions", "arg_type": type(fractions).__name__},
+            )
+        for k, v in fractions.items():
+            if not isinstance(k, (float, int, str)):
+                raise PySparkTypeError(
+                    error_class="DISALLOWED_TYPE_FOR_CONTAINER",
+                    message_parameters={
+                        "arg_name": "fractions",
+                        "arg_type": type(fractions).__name__,
+                        "allowed_types": "float, int, str",
+                        "return_type": type(k).__name__,
+                    },
+                )
+            fractions[k] = float(v)
+        seed = seed if seed is not None else random.randint(0, sys.maxsize)
+        return DataFrame.withPlan(
+            plan.StatSampleBy(child=self._plan, col=col, fractions=fractions, seed=seed),
+            session=self._session,
+        )
+
+    sampleBy.__doc__ = PySparkDataFrame.sampleBy.__doc__
+
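+    # Example (illustrative): df.sampleBy("key", {0: 0.1, 1: 0.2}, seed=0)
+    # keeps roughly 10% of rows whose key is 0 and 20% of rows whose key is 1;
+    # strata absent from `fractions` are treated as fraction 0.0.
+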
+    def _get_alias(self) -> Optional[str]:
+        p = self._plan
+        while p is not None:
+            if isinstance(p, plan.Project) and p.alias:
+                return p.alias
+            p = p._child
+        return None
+
+    def __getattr__(self, name: str) -> "Column":
+        if name in ["_jseq", "_jdf", "_jmap", "_jcols"]:
+            raise PySparkAttributeError(
+                error_class="JVM_ATTRIBUTE_NOT_SUPPORTED", message_parameters={"attr_name": name}
+            )
+        elif name in [
+            "rdd",
+            "toJSON",
+            "foreach",
+            "foreachPartition",
+            "checkpoint",
+            "localCheckpoint",
+        ]:
+            raise PySparkNotImplementedError(
+                error_class="NOT_IMPLEMENTED",
+                message_parameters={"feature": f"{name}()"},
+            )
+
+        if name not in self.columns:
+            raise AttributeError(
+                "'%s' object has no attribute '%s'" % (self.__class__.__name__, name)
+            )
+
+        return self[name]
+
+    __getattr__.__doc__ = PySparkDataFrame.__getattr__.__doc__
+
+    @overload
+    def __getitem__(self, item: Union[int, str]) -> Column:
+        ...
+
+    @overload
+    def __getitem__(self, item: Union[Column, List, Tuple]) -> "DataFrame":
+        ...
+
+    def __getitem__(self, item: Union[int, str, Column, List, Tuple]) -> Union[Column, "DataFrame"]:
+        if isinstance(item, str):
+            # Check for alias
+            alias = self._get_alias()
+            if self._plan is None:
+                raise SparkConnectException("Cannot analyze on empty plan.")
+            return _to_col_with_plan_id(
+                col=alias if alias is not None else item,
+                plan_id=self._plan._plan_id,
+            )
+        elif isinstance(item, Column):
+            return self.filter(item)
+        elif isinstance(item, (list, tuple)):
+            return self.select(*item)
+        elif isinstance(item, int):
+            return col(self.columns[item])
+        else:
+            raise PySparkTypeError(
+                error_class="NOT_COLUMN_OR_INT_OR_LIST_OR_STR_OR_TUPLE",
+                message_parameters={"arg_name": "item", "arg_type": type(item).__name__},
+            )
+
+    def __dir__(self) -> List[str]:
+        attrs = set(super().__dir__())
+        attrs.update(self.columns)
+        return sorted(attrs)
+
+    __dir__.__doc__ = PySparkDataFrame.__dir__.__doc__
+
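+    # Dispatch summary (illustrative): df["name"] yields a Column, df[df.age > 1]
+    # filters rows, df[["a", "b"]] selects columns, and df[0] resolves the
+    # column at that positional index, per the branches above.
+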
+    def _print_plan(self) -> str:
+        if self._plan:
+            return self._plan.print()
+        return ""
+
+    def collect(self) -> List[Row]:
+        if self._plan is None:
+            raise Exception("Cannot collect on empty plan.")
+        if self._session is None:
+            raise Exception("Cannot collect on empty session.")
+        query = self._plan.to_proto(self._session.client)
+        table, schema = self._session.client.to_table(query)
+
+        schema = schema or from_arrow_schema(table.schema, prefer_timestamp_ntz=True)
+
+        assert schema is not None and isinstance(schema, StructType)
+
+        from pyspark.sql.connect.conversion import ArrowTableToRowsConversion
+
+        return ArrowTableToRowsConversion.convert(table, schema)
+
+    collect.__doc__ = PySparkDataFrame.collect.__doc__
+
+    def toPandas(self) -> "pandas.DataFrame":
+        if self._plan is None:
+            raise Exception("Cannot collect on empty plan.")
+        if self._session is None:
+            raise Exception("Cannot collect on empty session.")
+        query = self._plan.to_proto(self._session.client)
+        return self._session.client.to_pandas(query)
+
+    toPandas.__doc__ = PySparkDataFrame.toPandas.__doc__
+
+    @property
+    def schema(self) -> StructType:
+        if self._plan is not None:
+            query = self._plan.to_proto(self._session.client)
+            if self._session is None:
+                raise Exception("Cannot analyze without SparkSession.")
+            return self._session.client.schema(query)
+        else:
+            raise Exception("Empty plan.")
+
+    schema.__doc__ = PySparkDataFrame.schema.__doc__
+
+    def isLocal(self) -> bool:
+        if self._plan is None:
+            raise Exception("Cannot analyze on empty plan.")
+        query = self._plan.to_proto(self._session.client)
+        result = self._session.client._analyze(method="is_local", plan=query).is_local
+        assert result is not None
+        return result
+
+    isLocal.__doc__ = PySparkDataFrame.isLocal.__doc__
+
+    @property
+    def isStreaming(self) -> bool:
+        if self._plan is None:
+            raise Exception("Cannot analyze on empty plan.")
+        query = self._plan.to_proto(self._session.client)
+        result = self._session.client._analyze(method="is_streaming", plan=query).is_streaming
+        assert result is not None
+        return result
+
+    isStreaming.__doc__ = PySparkDataFrame.isStreaming.__doc__
+
+    def _tree_string(self, level: Optional[int] = None) -> str:
+        if self._plan is None:
+            raise Exception("Cannot analyze on empty plan.")
+        query = self._plan.to_proto(self._session.client)
+        result = self._session.client._analyze(
+            method="tree_string", plan=query, level=level
+        ).tree_string
+        assert result is not None
+        return result
+
+    def printSchema(self, level: Optional[int] = None) -> None:
+        print(self._tree_string(level))
+
+    printSchema.__doc__ = PySparkDataFrame.printSchema.__doc__
+
+    def inputFiles(self) -> List[str]:
+        if self._plan is None:
+            raise Exception("Cannot analyze on empty plan.")
+        query = self._plan.to_proto(self._session.client)
+        result = self._session.client._analyze(method="input_files", plan=query).input_files
+        assert result is not None
+        return result
+
+    inputFiles.__doc__ = PySparkDataFrame.inputFiles.__doc__
+
+    def to(self, schema: StructType) -> "DataFrame":
+        assert schema is not None
+        return DataFrame.withPlan(
+            plan.ToSchema(child=self._plan, schema=schema),
+            session=self._session,
+        )
+
+    to.__doc__ = PySparkDataFrame.to.__doc__
+
+    def toDF(self, *cols: str) -> "DataFrame":
+        for col_ in cols:
+            if not isinstance(col_, str):
+                raise PySparkTypeError(
+                    error_class="NOT_LIST_OF_STR",
+                    message_parameters={"arg_name": "cols", "arg_type": type(col_).__name__},
+                )
+        return DataFrame.withPlan(plan.ToDF(self._plan, list(cols)), self._session)
+
+    toDF.__doc__ = PySparkDataFrame.toDF.__doc__
+
+    def transform(self, func: Callable[..., "DataFrame"], *args: Any, **kwargs: Any) -> "DataFrame":
+        result = func(self, *args, **kwargs)
+        assert isinstance(
+            result, DataFrame
+        ), "Func returned an instance of type [%s], should have been DataFrame." % type(result)
+        return result
+
+    transform.__doc__ = PySparkDataFrame.transform.__doc__
+
+    def _explain_string(
+        self, extended: Optional[Union[bool, str]] = None, mode: Optional[str] = None
+    ) -> str:
+        if extended is not None and mode is not None:
+            raise PySparkValueError(
+                error_class="CANNOT_SET_TOGETHER",
+                message_parameters={"arg_list": "extended and mode"},
+            )
+
+        # For the no argument case: df.explain()
+        is_no_argument = extended is None and mode is None
+
+        # For the cases below:
+        # explain(True)
+        # explain(extended=False)
+        is_extended_case = isinstance(extended, bool) and mode is None
+
+        # For the case when extended is mode:
+        # df.explain("formatted")
+        is_extended_as_mode = isinstance(extended, str) and mode is None
+
+        # For the mode specified:
+        # df.explain(mode="formatted")
+        is_mode_case = extended is None and isinstance(mode, str)
+
+        if not (is_no_argument or is_extended_case or is_extended_as_mode or is_mode_case):
+            argtypes = [str(type(arg)) for arg in [extended, mode] if arg is not None]
+            raise PySparkTypeError(
+                error_class="NOT_BOOL_OR_STR",
+                message_parameters={
+                    "arg_name": "extended (optional) and mode (optional)",
+                    "arg_type": ", ".join(argtypes),
+                },
+            )
+
+        # Sets an explain mode depending on a given argument
+        if is_no_argument:
+            explain_mode = "simple"
+        elif is_extended_case:
+            explain_mode = "extended" if extended else "simple"
+        elif is_mode_case:
+            explain_mode = cast(str, mode)
+        elif is_extended_as_mode:
+            explain_mode = cast(str, extended)
+
+        if self._plan is not None:
+            query = self._plan.to_proto(self._session.client)
+            if self._session is None:
+                raise Exception("Cannot analyze without SparkSession.")
+            return self._session.client.explain_string(query, explain_mode)
+        else:
+            return ""
+
+    def explain(
+        self, extended: Optional[Union[bool, str]] = None, mode: Optional[str] = None
+    ) -> None:
+        print(self._explain_string(extended=extended, mode=mode))
+
+    explain.__doc__ = PySparkDataFrame.explain.__doc__
+
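+    # Example (illustrative): df.explain() prints the simple plan,
+    # df.explain(True) the extended plan, and df.explain(mode="formatted") or
+    # df.explain("formatted") a formatted plan; per the guard above, `extended`
+    # and `mode` cannot be passed together.
+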
+    def createTempView(self, name: str) -> None:
+        command = plan.CreateView(
+            child=self._plan, name=name, is_global=False, replace=False
+        ).command(session=self._session.client)
+        self._session.client.execute_command(command)
+
+    createTempView.__doc__ = PySparkDataFrame.createTempView.__doc__
+
+    def createOrReplaceTempView(self, name: str) -> None:
+        command = plan.CreateView(
+            child=self._plan, name=name, is_global=False, replace=True
+        ).command(session=self._session.client)
+        self._session.client.execute_command(command)
+
+    createOrReplaceTempView.__doc__ = PySparkDataFrame.createOrReplaceTempView.__doc__
+
+    def createGlobalTempView(self, name: str) -> None:
+        command = plan.CreateView(
+            child=self._plan, name=name, is_global=True, replace=False
+        ).command(session=self._session.client)
+        self._session.client.execute_command(command)
+
+    createGlobalTempView.__doc__ = PySparkDataFrame.createGlobalTempView.__doc__
+
+    def createOrReplaceGlobalTempView(self, name: str) -> None:
+        command = plan.CreateView(
+            child=self._plan, name=name, is_global=True, replace=True
+        ).command(session=self._session.client)
+        self._session.client.execute_command(command)
+
+    createOrReplaceGlobalTempView.__doc__ = PySparkDataFrame.createOrReplaceGlobalTempView.__doc__
+
+    def cache(self) -> "DataFrame":
+        if self._plan is None:
+            raise Exception("Cannot cache on empty plan.")
+        return self.persist()
+
+    cache.__doc__ = PySparkDataFrame.cache.__doc__
+
+    def persist(
+        self,
+        storageLevel: StorageLevel = (StorageLevel.MEMORY_AND_DISK_DESER),
+    ) -> "DataFrame":
+        if self._plan is None:
+            raise Exception("Cannot persist on empty plan.")
+        relation = self._plan.plan(self._session.client)
+        self._session.client._analyze(
+            method="persist", relation=relation, storage_level=storageLevel
+        )
+        return self
+
+    persist.__doc__ = PySparkDataFrame.persist.__doc__
+
+    @property
+    def storageLevel(self) -> StorageLevel:
+        if self._plan is None:
+            raise Exception("Cannot persist on empty plan.")
+        relation = self._plan.plan(self._session.client)
+        storage_level = self._session.client._analyze(
+            method="get_storage_level", relation=relation
+        ).storage_level
+        assert storage_level is not None
+        return storage_level
+
+    storageLevel.__doc__ = PySparkDataFrame.storageLevel.__doc__
+
+    def unpersist(self, blocking: bool = False) -> "DataFrame":
+        if self._plan is None:
+            raise Exception("Cannot unpersist on empty plan.")
+        relation = self._plan.plan(self._session.client)
+        self._session.client._analyze(method="unpersist", relation=relation, blocking=blocking)
+        return self
+
+    unpersist.__doc__ = PySparkDataFrame.unpersist.__doc__
+
+    @property
+    def is_cached(self) -> bool:
+        return self.storageLevel != StorageLevel.NONE
+
+    def toLocalIterator(self, prefetchPartitions: bool = False) -> Iterator[Row]:
+        from pyspark.sql.connect.conversion import ArrowTableToRowsConversion
+
+        if self._plan is None:
+            raise Exception("Cannot collect on empty plan.")
+        if self._session is None:
+            raise Exception("Cannot collect on empty session.")
+        query = self._plan.to_proto(self._session.client)
+
+        schema: Optional[StructType] = None
+        for schema_or_table in self._session.client.to_table_as_iterator(query):
+            if isinstance(schema_or_table, StructType):
+                assert schema is None
+                schema = schema_or_table
+            else:
+                assert isinstance(schema_or_table, pa.Table)
+                table = schema_or_table
+                if schema is None:
+                    schema = from_arrow_schema(table.schema, prefer_timestamp_ntz=True)
+                yield from ArrowTableToRowsConversion.convert(table, schema)
+
+    toLocalIterator.__doc__ = PySparkDataFrame.toLocalIterator.__doc__
+
+    def to_pandas_on_spark(
+        self, index_col: Optional[Union[str, List[str]]] = None
+    ) -> "PandasOnSparkDataFrame":
+        warnings.warn(
+            "DataFrame.to_pandas_on_spark is deprecated. Use DataFrame.pandas_api instead.",
+            FutureWarning,
+        )
+        return self.pandas_api(index_col)
+
+    def pandas_api(
+        self, index_col: Optional[Union[str, List[str]]] = None
+    ) -> "PandasOnSparkDataFrame":
+        from pyspark.pandas.namespace import _get_index_map
+        from pyspark.pandas.frame import DataFrame as PandasOnSparkDataFrame
+        from pyspark.pandas.internal import InternalFrame
+
+        index_spark_columns, index_names = _get_index_map(self, index_col)  # type: ignore[arg-type]
+        internal = InternalFrame(
+            spark_frame=self,  # type: ignore[arg-type]
+            index_spark_columns=index_spark_columns,
+            index_names=index_names,  # type: ignore[arg-type]
+        )
+        return PandasOnSparkDataFrame(internal)
+
+    pandas_api.__doc__ = PySparkDataFrame.pandas_api.__doc__
+
+    def registerTempTable(self, name: str) -> None:
+        warnings.warn("Deprecated in 2.0, use createOrReplaceTempView instead.", FutureWarning)
+        self.createOrReplaceTempView(name)
+
+    registerTempTable.__doc__ = PySparkDataFrame.registerTempTable.__doc__
+
+    def _map_partitions(
+        self,
+        func: "PandasMapIterFunction",
+        schema: Union[StructType, str],
+        evalType: int,
+        barrier: bool,
+    ) -> "DataFrame":
+        from pyspark.sql.connect.udf import UserDefinedFunction
+
+        if self._plan is None:
+            raise Exception("Cannot mapInPandas when self._plan is empty.")
+
+        udf_obj = UserDefinedFunction(
+            func,
+            returnType=schema,
+            evalType=evalType,
+        )
+
+        return DataFrame.withPlan(
+            plan.MapPartitions(
+                child=self._plan, function=udf_obj, cols=self.columns, is_barrier=barrier
+            ),
+            session=self._session,
+        )
+
+    def mapInPandas(
+        self,
+        func: "PandasMapIterFunction",
+        schema: Union[StructType, str],
+        barrier: bool = False,
+    ) -> "DataFrame":
+        return self._map_partitions(func, schema, PythonEvalType.SQL_MAP_PANDAS_ITER_UDF, barrier)
+
+    mapInPandas.__doc__ = PySparkDataFrame.mapInPandas.__doc__
+
+    def mapInArrow(
+        self,
+        func: "ArrowMapIterFunction",
+        schema: Union[StructType, str],
+        barrier: bool = False,
+    ) -> "DataFrame":
+        return self._map_partitions(func, schema, PythonEvalType.SQL_MAP_ARROW_ITER_UDF, barrier)
+
+    mapInArrow.__doc__ = PySparkDataFrame.mapInArrow.__doc__
+
+    @property
+    def writeStream(self) -> DataStreamWriter:
+        assert self._plan is not None
+        return DataStreamWriter(plan=self._plan, session=self._session)
+
+    writeStream.__doc__ = PySparkDataFrame.writeStream.__doc__
+
+    def sameSemantics(self, other: "DataFrame") -> bool:
+        assert self._plan is not None
+        assert other._plan is not None
+        return self._session.client.same_semantics(
+            plan=self._plan.to_proto(self._session.client),
+            other=other._plan.to_proto(other._session.client),
+        )
+
+    sameSemantics.__doc__ = PySparkDataFrame.sameSemantics.__doc__
+
+    def semanticHash(self) -> int:
+        assert self._plan is not None
+        return self._session.client.semantic_hash(
+            plan=self._plan.to_proto(self._session.client),
+        )
+
+    semanticHash.__doc__ = PySparkDataFrame.semanticHash.__doc__
+
+    def writeTo(self, table: str) -> "DataFrameWriterV2":
+        assert self._plan is not None
+        return DataFrameWriterV2(self._plan, self._session, table)
+
+    writeTo.__doc__ = PySparkDataFrame.writeTo.__doc__
+
+    # Spark Connect-specific API
+    def offset(self, n: int) -> "DataFrame":
+        return DataFrame.withPlan(plan.Offset(child=self._plan, offset=n), session=self._session)
+
+    offset.__doc__ = PySparkDataFrame.offset.__doc__
+
+    @classmethod
+    def withPlan(cls, plan: plan.LogicalPlan, session: "SparkSession") -> "DataFrame":
+        """
+        Main initialization method used to construct a new data frame with a child plan.
+        This is for internal purposes only.
+        """
+        new_frame = DataFrame(session=session)
+        new_frame._plan = plan
+        return new_frame
+
+
+class DataFrameNaFunctions:
+    def __init__(self, df: DataFrame):
+        self.df = df
+
+    def fill(
+        self,
+        value: Union["LiteralType", Dict[str, "LiteralType"]],
+        subset: Optional[Union[str, Tuple[str, ...], List[str]]] = None,
+    ) -> DataFrame:
+        return self.df.fillna(value=value, subset=subset)
+
+    fill.__doc__ = DataFrame.fillna.__doc__
+
+    def drop(
+        self,
+        how: str = "any",
+        thresh: Optional[int] = None,
+        subset: Optional[Union[str, Tuple[str, ...], List[str]]] = None,
+    ) -> DataFrame:
+        return self.df.dropna(how=how, thresh=thresh, subset=subset)
+
+    drop.__doc__ = DataFrame.dropna.__doc__
+
+    def replace(
+        self,
+        to_replace: Union[List["LiteralType"], Dict["LiteralType", "OptionalPrimitiveType"]],
+        value: Optional[
+            Union["OptionalPrimitiveType", List["OptionalPrimitiveType"], _NoValueType]
+        ] = _NoValue,
+        subset: Optional[List[str]] = None,
+    ) -> DataFrame:
+        return self.df.replace(to_replace, value, subset)
+
+    replace.__doc__ = DataFrame.replace.__doc__
+
+
+DataFrameNaFunctions.__doc__ = PySparkDataFrameNaFunctions.__doc__
+
+
+class DataFrameStatFunctions:
+    def __init__(self, df: DataFrame):
+        self.df = df
+
+    def cov(self, col1: str, col2: str) -> float:
+        return self.df.cov(col1, col2)
+
+    cov.__doc__ = DataFrame.cov.__doc__
+
+    def corr(self, col1: str, col2: str, method: Optional[str] = None) -> float:
+        return self.df.corr(col1, col2, method)
+
+    corr.__doc__ = DataFrame.corr.__doc__
+
+    def approxQuantile(
+        self,
+        col: Union[str, List[str], Tuple[str]],
+        probabilities: Union[List[float], Tuple[float]],
+        relativeError: float,
+    ) -> Union[List[float], List[List[float]]]:
+        return self.df.approxQuantile(col, probabilities, relativeError)
+
+    approxQuantile.__doc__ = DataFrame.approxQuantile.__doc__
+
+    def crosstab(self, col1: str, col2: str) -> DataFrame:
+        return self.df.crosstab(col1, col2)
+
+    crosstab.__doc__ = DataFrame.crosstab.__doc__
+
+    def freqItems(
+        self, cols: Union[List[str], Tuple[str]], support: Optional[float] = None
+    ) -> DataFrame:
+        return self.df.freqItems(cols, support)
+
+    freqItems.__doc__ = DataFrame.freqItems.__doc__
+
+    def sampleBy(
+        self, col: str, fractions: Dict[Any, float], seed: Optional[int] = None
+    ) -> DataFrame:
+        return self.df.sampleBy(col, fractions, seed)
+
+    sampleBy.__doc__ = DataFrame.sampleBy.__doc__
+
+
+DataFrameStatFunctions.__doc__ = PySparkDataFrameStatFunctions.__doc__
+
+
+def _test() -> None:
+    import os
+    import sys
+    import doctest
+    from pyspark.sql import SparkSession as PySparkSession
+    import pyspark.sql.connect.dataframe
+
+    os.chdir(os.environ["SPARK_HOME"])
+
+    globs = pyspark.sql.connect.dataframe.__dict__.copy()
+    # Spark Connect does not support RDDs, but these tests depend on them.
+    del pyspark.sql.connect.dataframe.DataFrame.coalesce.__doc__
+    del pyspark.sql.connect.dataframe.DataFrame.repartition.__doc__
+    del pyspark.sql.connect.dataframe.DataFrame.repartitionByRange.__doc__
+
+    # TODO(SPARK-41625): Support Structured Streaming
+    del pyspark.sql.connect.dataframe.DataFrame.isStreaming.__doc__
+
+    # TODO(SPARK-41888): Support StreamingQueryListener for DataFrame.observe
+    del pyspark.sql.connect.dataframe.DataFrame.observe.__doc__
+
+    # TODO(SPARK-43435): should reenable this test
+    del pyspark.sql.connect.dataframe.DataFrame.writeStream.__doc__
+
+    globs["spark"] = (
+        PySparkSession.builder.appName("sql.connect.dataframe tests")
+        .remote(os.environ.get("SPARK_CONNECT_TESTING_REMOTE", "local[4]"))
+        .getOrCreate()
+    )
+
+    (failure_count, test_count) = doctest.testmod(
+        pyspark.sql.connect.dataframe,
+        globs=globs,
+        optionflags=doctest.ELLIPSIS
+        | doctest.NORMALIZE_WHITESPACE
+        | doctest.IGNORE_EXCEPTION_DETAIL,
+    )
+
+    globs["spark"].stop()
+
+    if failure_count:
+        sys.exit(-1)
+
+
+if __name__ == "__main__":
+    _test()