snowpark-connect 0.20.2 (snowpark_connect-0.20.2-py3-none-any.whl)

This diff shows the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only and reflects the changes between those package versions.

Potentially problematic release: this version of snowpark-connect might be problematic.
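Since a .whl is a standard zip archive, the file inventory below can also be reproduced locally once the wheel has been downloaded from the registry. The following is a minimal sketch only; the local filename is an assumption about where the download landed, not part of the release itself.

```python
import zipfile

# Hypothetical local path to the downloaded wheel; adjust to wherever it was saved.
WHEEL_PATH = "snowpark_connect-0.20.2-py3-none-any.whl"

with zipfile.ZipFile(WHEEL_PATH) as whl:
    members = whl.infolist()
    # Print each bundled file with its uncompressed size, mirroring the listing below.
    for info in members:
        print(f"{info.filename}  ({info.file_size} bytes)")
    print(f"total files: {len(members)}")
```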

Files changed (879)
  1. snowflake/snowpark_connect/__init__.py +23 -0
  2. snowflake/snowpark_connect/analyze_plan/__init__.py +3 -0
  3. snowflake/snowpark_connect/analyze_plan/map_tree_string.py +38 -0
  4. snowflake/snowpark_connect/column_name_handler.py +735 -0
  5. snowflake/snowpark_connect/config.py +576 -0
  6. snowflake/snowpark_connect/constants.py +47 -0
  7. snowflake/snowpark_connect/control_server.py +52 -0
  8. snowflake/snowpark_connect/dataframe_name_handler.py +54 -0
  9. snowflake/snowpark_connect/date_time_format_mapping.py +399 -0
  10. snowflake/snowpark_connect/empty_dataframe.py +18 -0
  11. snowflake/snowpark_connect/error/__init__.py +11 -0
  12. snowflake/snowpark_connect/error/error_mapping.py +6174 -0
  13. snowflake/snowpark_connect/error/error_utils.py +321 -0
  14. snowflake/snowpark_connect/error/exceptions.py +24 -0
  15. snowflake/snowpark_connect/execute_plan/__init__.py +3 -0
  16. snowflake/snowpark_connect/execute_plan/map_execution_command.py +204 -0
  17. snowflake/snowpark_connect/execute_plan/map_execution_root.py +173 -0
  18. snowflake/snowpark_connect/execute_plan/utils.py +183 -0
  19. snowflake/snowpark_connect/expression/__init__.py +3 -0
  20. snowflake/snowpark_connect/expression/literal.py +90 -0
  21. snowflake/snowpark_connect/expression/map_cast.py +343 -0
  22. snowflake/snowpark_connect/expression/map_expression.py +293 -0
  23. snowflake/snowpark_connect/expression/map_extension.py +104 -0
  24. snowflake/snowpark_connect/expression/map_sql_expression.py +633 -0
  25. snowflake/snowpark_connect/expression/map_udf.py +142 -0
  26. snowflake/snowpark_connect/expression/map_unresolved_attribute.py +241 -0
  27. snowflake/snowpark_connect/expression/map_unresolved_extract_value.py +85 -0
  28. snowflake/snowpark_connect/expression/map_unresolved_function.py +9450 -0
  29. snowflake/snowpark_connect/expression/map_unresolved_star.py +218 -0
  30. snowflake/snowpark_connect/expression/map_update_fields.py +164 -0
  31. snowflake/snowpark_connect/expression/map_window_function.py +258 -0
  32. snowflake/snowpark_connect/expression/typer.py +125 -0
  33. snowflake/snowpark_connect/includes/__init__.py +0 -0
  34. snowflake/snowpark_connect/includes/jars/antlr4-runtime-4.9.3.jar +0 -0
  35. snowflake/snowpark_connect/includes/jars/commons-cli-1.5.0.jar +0 -0
  36. snowflake/snowpark_connect/includes/jars/commons-codec-1.16.1.jar +0 -0
  37. snowflake/snowpark_connect/includes/jars/commons-collections-3.2.2.jar +0 -0
  38. snowflake/snowpark_connect/includes/jars/commons-collections4-4.4.jar +0 -0
  39. snowflake/snowpark_connect/includes/jars/commons-compiler-3.1.9.jar +0 -0
  40. snowflake/snowpark_connect/includes/jars/commons-compress-1.26.0.jar +0 -0
  41. snowflake/snowpark_connect/includes/jars/commons-crypto-1.1.0.jar +0 -0
  42. snowflake/snowpark_connect/includes/jars/commons-dbcp-1.4.jar +0 -0
  43. snowflake/snowpark_connect/includes/jars/commons-io-2.16.1.jar +0 -0
  44. snowflake/snowpark_connect/includes/jars/commons-lang-2.6.jar +0 -0
  45. snowflake/snowpark_connect/includes/jars/commons-lang3-3.12.0.jar +0 -0
  46. snowflake/snowpark_connect/includes/jars/commons-logging-1.1.3.jar +0 -0
  47. snowflake/snowpark_connect/includes/jars/commons-math3-3.6.1.jar +0 -0
  48. snowflake/snowpark_connect/includes/jars/commons-pool-1.5.4.jar +0 -0
  49. snowflake/snowpark_connect/includes/jars/commons-text-1.10.0.jar +0 -0
  50. snowflake/snowpark_connect/includes/jars/hadoop-client-api-3.3.4.jar +0 -0
  51. snowflake/snowpark_connect/includes/jars/jackson-annotations-2.15.2.jar +0 -0
  52. snowflake/snowpark_connect/includes/jars/jackson-core-2.15.2.jar +0 -0
  53. snowflake/snowpark_connect/includes/jars/jackson-core-asl-1.9.13.jar +0 -0
  54. snowflake/snowpark_connect/includes/jars/jackson-databind-2.15.2.jar +0 -0
  55. snowflake/snowpark_connect/includes/jars/jackson-dataformat-yaml-2.15.2.jar +0 -0
  56. snowflake/snowpark_connect/includes/jars/jackson-datatype-jsr310-2.15.2.jar +0 -0
  57. snowflake/snowpark_connect/includes/jars/jackson-mapper-asl-1.9.13.jar +0 -0
  58. snowflake/snowpark_connect/includes/jars/jackson-module-scala_2.12-2.15.2.jar +0 -0
  59. snowflake/snowpark_connect/includes/jars/json4s-ast_2.12-3.7.0-M11.jar +0 -0
  60. snowflake/snowpark_connect/includes/jars/json4s-core_2.12-3.7.0-M11.jar +0 -0
  61. snowflake/snowpark_connect/includes/jars/json4s-jackson_2.12-3.7.0-M11.jar +0 -0
  62. snowflake/snowpark_connect/includes/jars/json4s-scalap_2.12-3.7.0-M11.jar +0 -0
  63. snowflake/snowpark_connect/includes/jars/kryo-shaded-4.0.2.jar +0 -0
  64. snowflake/snowpark_connect/includes/jars/log4j-1.2-api-2.20.0.jar +0 -0
  65. snowflake/snowpark_connect/includes/jars/log4j-api-2.20.0.jar +0 -0
  66. snowflake/snowpark_connect/includes/jars/log4j-core-2.20.0.jar +0 -0
  67. snowflake/snowpark_connect/includes/jars/log4j-slf4j2-impl-2.20.0.jar +0 -0
  68. snowflake/snowpark_connect/includes/jars/paranamer-2.8.jar +0 -0
  69. snowflake/snowpark_connect/includes/jars/scala-collection-compat_2.12-2.7.0.jar +0 -0
  70. snowflake/snowpark_connect/includes/jars/scala-compiler-2.12.18.jar +0 -0
  71. snowflake/snowpark_connect/includes/jars/scala-library-2.12.18.jar +0 -0
  72. snowflake/snowpark_connect/includes/jars/scala-parser-combinators_2.12-2.3.0.jar +0 -0
  73. snowflake/snowpark_connect/includes/jars/scala-reflect-2.12.18.jar +0 -0
  74. snowflake/snowpark_connect/includes/jars/scala-xml_2.12-2.1.0.jar +0 -0
  75. snowflake/snowpark_connect/includes/jars/slf4j-api-2.0.7.jar +0 -0
  76. snowflake/snowpark_connect/includes/jars/spark-catalyst_2.12-3.5.6.jar +0 -0
  77. snowflake/snowpark_connect/includes/jars/spark-common-utils_2.12-3.5.6.jar +0 -0
  78. snowflake/snowpark_connect/includes/jars/spark-core_2.12-3.5.6.jar +0 -0
  79. snowflake/snowpark_connect/includes/jars/spark-graphx_2.12-3.5.6.jar +0 -0
  80. snowflake/snowpark_connect/includes/jars/spark-hive-thriftserver_2.12-3.5.6.jar +0 -0
  81. snowflake/snowpark_connect/includes/jars/spark-hive_2.12-3.5.6.jar +0 -0
  82. snowflake/snowpark_connect/includes/jars/spark-kubernetes_2.12-3.5.6.jar +0 -0
  83. snowflake/snowpark_connect/includes/jars/spark-kvstore_2.12-3.5.6.jar +0 -0
  84. snowflake/snowpark_connect/includes/jars/spark-launcher_2.12-3.5.6.jar +0 -0
  85. snowflake/snowpark_connect/includes/jars/spark-mesos_2.12-3.5.6.jar +0 -0
  86. snowflake/snowpark_connect/includes/jars/spark-mllib-local_2.12-3.5.6.jar +0 -0
  87. snowflake/snowpark_connect/includes/jars/spark-mllib_2.12-3.5.6.jar +0 -0
  88. snowflake/snowpark_connect/includes/jars/spark-network-common_2.12-3.5.6.jar +0 -0
  89. snowflake/snowpark_connect/includes/jars/spark-network-shuffle_2.12-3.5.6.jar +0 -0
  90. snowflake/snowpark_connect/includes/jars/spark-repl_2.12-3.5.6.jar +0 -0
  91. snowflake/snowpark_connect/includes/jars/spark-sketch_2.12-3.5.6.jar +0 -0
  92. snowflake/snowpark_connect/includes/jars/spark-sql-api_2.12-3.5.6.jar +0 -0
  93. snowflake/snowpark_connect/includes/jars/spark-sql_2.12-3.5.6.jar +0 -0
  94. snowflake/snowpark_connect/includes/jars/spark-streaming_2.12-3.5.6.jar +0 -0
  95. snowflake/snowpark_connect/includes/jars/spark-tags_2.12-3.5.6.jar +0 -0
  96. snowflake/snowpark_connect/includes/jars/spark-unsafe_2.12-3.5.6.jar +0 -0
  97. snowflake/snowpark_connect/includes/jars/spark-yarn_2.12-3.5.6.jar +0 -0
  98. snowflake/snowpark_connect/includes/python/__init__.py +21 -0
  99. snowflake/snowpark_connect/includes/python/pyspark/__init__.py +173 -0
  100. snowflake/snowpark_connect/includes/python/pyspark/_globals.py +71 -0
  101. snowflake/snowpark_connect/includes/python/pyspark/_typing.pyi +43 -0
  102. snowflake/snowpark_connect/includes/python/pyspark/accumulators.py +341 -0
  103. snowflake/snowpark_connect/includes/python/pyspark/broadcast.py +383 -0
  104. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/__init__.py +8 -0
  105. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/cloudpickle.py +948 -0
  106. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/cloudpickle_fast.py +844 -0
  107. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/compat.py +18 -0
  108. snowflake/snowpark_connect/includes/python/pyspark/conf.py +276 -0
  109. snowflake/snowpark_connect/includes/python/pyspark/context.py +2601 -0
  110. snowflake/snowpark_connect/includes/python/pyspark/daemon.py +218 -0
  111. snowflake/snowpark_connect/includes/python/pyspark/errors/__init__.py +70 -0
  112. snowflake/snowpark_connect/includes/python/pyspark/errors/error_classes.py +889 -0
  113. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/__init__.py +16 -0
  114. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/base.py +228 -0
  115. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/captured.py +307 -0
  116. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/connect.py +190 -0
  117. snowflake/snowpark_connect/includes/python/pyspark/errors/tests/__init__.py +16 -0
  118. snowflake/snowpark_connect/includes/python/pyspark/errors/tests/test_errors.py +60 -0
  119. snowflake/snowpark_connect/includes/python/pyspark/errors/utils.py +116 -0
  120. snowflake/snowpark_connect/includes/python/pyspark/files.py +165 -0
  121. snowflake/snowpark_connect/includes/python/pyspark/find_spark_home.py +95 -0
  122. snowflake/snowpark_connect/includes/python/pyspark/install.py +203 -0
  123. snowflake/snowpark_connect/includes/python/pyspark/instrumentation_utils.py +190 -0
  124. snowflake/snowpark_connect/includes/python/pyspark/java_gateway.py +248 -0
  125. snowflake/snowpark_connect/includes/python/pyspark/join.py +118 -0
  126. snowflake/snowpark_connect/includes/python/pyspark/ml/__init__.py +71 -0
  127. snowflake/snowpark_connect/includes/python/pyspark/ml/_typing.pyi +84 -0
  128. snowflake/snowpark_connect/includes/python/pyspark/ml/base.py +414 -0
  129. snowflake/snowpark_connect/includes/python/pyspark/ml/classification.py +4332 -0
  130. snowflake/snowpark_connect/includes/python/pyspark/ml/clustering.py +2188 -0
  131. snowflake/snowpark_connect/includes/python/pyspark/ml/common.py +146 -0
  132. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/__init__.py +44 -0
  133. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/base.py +346 -0
  134. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/classification.py +382 -0
  135. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/evaluation.py +291 -0
  136. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/feature.py +258 -0
  137. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/functions.py +77 -0
  138. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/io_utils.py +335 -0
  139. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/pipeline.py +262 -0
  140. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/summarizer.py +120 -0
  141. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/tuning.py +579 -0
  142. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/util.py +173 -0
  143. snowflake/snowpark_connect/includes/python/pyspark/ml/deepspeed/__init__.py +16 -0
  144. snowflake/snowpark_connect/includes/python/pyspark/ml/deepspeed/deepspeed_distributor.py +165 -0
  145. snowflake/snowpark_connect/includes/python/pyspark/ml/deepspeed/tests/test_deepspeed_distributor.py +306 -0
  146. snowflake/snowpark_connect/includes/python/pyspark/ml/dl_util.py +150 -0
  147. snowflake/snowpark_connect/includes/python/pyspark/ml/evaluation.py +1166 -0
  148. snowflake/snowpark_connect/includes/python/pyspark/ml/feature.py +7474 -0
  149. snowflake/snowpark_connect/includes/python/pyspark/ml/fpm.py +543 -0
  150. snowflake/snowpark_connect/includes/python/pyspark/ml/functions.py +842 -0
  151. snowflake/snowpark_connect/includes/python/pyspark/ml/image.py +271 -0
  152. snowflake/snowpark_connect/includes/python/pyspark/ml/linalg/__init__.py +1382 -0
  153. snowflake/snowpark_connect/includes/python/pyspark/ml/model_cache.py +55 -0
  154. snowflake/snowpark_connect/includes/python/pyspark/ml/param/__init__.py +602 -0
  155. snowflake/snowpark_connect/includes/python/pyspark/ml/param/_shared_params_code_gen.py +368 -0
  156. snowflake/snowpark_connect/includes/python/pyspark/ml/param/shared.py +878 -0
  157. snowflake/snowpark_connect/includes/python/pyspark/ml/pipeline.py +451 -0
  158. snowflake/snowpark_connect/includes/python/pyspark/ml/recommendation.py +748 -0
  159. snowflake/snowpark_connect/includes/python/pyspark/ml/regression.py +3335 -0
  160. snowflake/snowpark_connect/includes/python/pyspark/ml/stat.py +523 -0
  161. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/__init__.py +16 -0
  162. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_classification.py +53 -0
  163. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_evaluation.py +50 -0
  164. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_feature.py +43 -0
  165. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_function.py +114 -0
  166. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_pipeline.py +47 -0
  167. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_summarizer.py +43 -0
  168. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_tuning.py +46 -0
  169. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_classification.py +238 -0
  170. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_evaluation.py +194 -0
  171. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_feature.py +156 -0
  172. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_pipeline.py +184 -0
  173. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_summarizer.py +78 -0
  174. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_tuning.py +292 -0
  175. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_parity_torch_data_loader.py +50 -0
  176. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_parity_torch_distributor.py +152 -0
  177. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_algorithms.py +456 -0
  178. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_base.py +96 -0
  179. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_dl_util.py +186 -0
  180. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_evaluation.py +77 -0
  181. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_feature.py +401 -0
  182. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_functions.py +528 -0
  183. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_image.py +82 -0
  184. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_linalg.py +409 -0
  185. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_model_cache.py +55 -0
  186. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_param.py +441 -0
  187. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_persistence.py +546 -0
  188. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_pipeline.py +71 -0
  189. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_stat.py +52 -0
  190. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_training_summary.py +494 -0
  191. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_util.py +85 -0
  192. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_wrapper.py +138 -0
  193. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/__init__.py +16 -0
  194. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_basic.py +151 -0
  195. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_nested.py +97 -0
  196. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_pipeline.py +143 -0
  197. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tuning.py +551 -0
  198. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_basic.py +137 -0
  199. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_nested.py +96 -0
  200. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_pipeline.py +142 -0
  201. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/__init__.py +16 -0
  202. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/data.py +100 -0
  203. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/distributor.py +1133 -0
  204. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/log_communication.py +198 -0
  205. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/__init__.py +16 -0
  206. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_data_loader.py +137 -0
  207. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_distributor.py +561 -0
  208. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_log_communication.py +172 -0
  209. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/torch_run_process_wrapper.py +83 -0
  210. snowflake/snowpark_connect/includes/python/pyspark/ml/tree.py +434 -0
  211. snowflake/snowpark_connect/includes/python/pyspark/ml/tuning.py +1741 -0
  212. snowflake/snowpark_connect/includes/python/pyspark/ml/util.py +749 -0
  213. snowflake/snowpark_connect/includes/python/pyspark/ml/wrapper.py +465 -0
  214. snowflake/snowpark_connect/includes/python/pyspark/mllib/__init__.py +44 -0
  215. snowflake/snowpark_connect/includes/python/pyspark/mllib/_typing.pyi +33 -0
  216. snowflake/snowpark_connect/includes/python/pyspark/mllib/classification.py +989 -0
  217. snowflake/snowpark_connect/includes/python/pyspark/mllib/clustering.py +1318 -0
  218. snowflake/snowpark_connect/includes/python/pyspark/mllib/common.py +174 -0
  219. snowflake/snowpark_connect/includes/python/pyspark/mllib/evaluation.py +691 -0
  220. snowflake/snowpark_connect/includes/python/pyspark/mllib/feature.py +1085 -0
  221. snowflake/snowpark_connect/includes/python/pyspark/mllib/fpm.py +233 -0
  222. snowflake/snowpark_connect/includes/python/pyspark/mllib/linalg/__init__.py +1653 -0
  223. snowflake/snowpark_connect/includes/python/pyspark/mllib/linalg/distributed.py +1662 -0
  224. snowflake/snowpark_connect/includes/python/pyspark/mllib/random.py +698 -0
  225. snowflake/snowpark_connect/includes/python/pyspark/mllib/recommendation.py +389 -0
  226. snowflake/snowpark_connect/includes/python/pyspark/mllib/regression.py +1067 -0
  227. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/KernelDensity.py +59 -0
  228. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/__init__.py +34 -0
  229. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/_statistics.py +409 -0
  230. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/distribution.py +39 -0
  231. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/test.py +86 -0
  232. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/__init__.py +16 -0
  233. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_algorithms.py +353 -0
  234. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_feature.py +192 -0
  235. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_linalg.py +680 -0
  236. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_stat.py +206 -0
  237. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_streaming_algorithms.py +471 -0
  238. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_util.py +108 -0
  239. snowflake/snowpark_connect/includes/python/pyspark/mllib/tree.py +888 -0
  240. snowflake/snowpark_connect/includes/python/pyspark/mllib/util.py +659 -0
  241. snowflake/snowpark_connect/includes/python/pyspark/pandas/__init__.py +165 -0
  242. snowflake/snowpark_connect/includes/python/pyspark/pandas/_typing.py +52 -0
  243. snowflake/snowpark_connect/includes/python/pyspark/pandas/accessors.py +989 -0
  244. snowflake/snowpark_connect/includes/python/pyspark/pandas/base.py +1804 -0
  245. snowflake/snowpark_connect/includes/python/pyspark/pandas/categorical.py +822 -0
  246. snowflake/snowpark_connect/includes/python/pyspark/pandas/config.py +539 -0
  247. snowflake/snowpark_connect/includes/python/pyspark/pandas/correlation.py +262 -0
  248. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/__init__.py +16 -0
  249. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/base.py +519 -0
  250. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/binary_ops.py +98 -0
  251. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/boolean_ops.py +426 -0
  252. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/categorical_ops.py +141 -0
  253. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/complex_ops.py +145 -0
  254. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/date_ops.py +127 -0
  255. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/datetime_ops.py +171 -0
  256. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/null_ops.py +83 -0
  257. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/num_ops.py +588 -0
  258. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/string_ops.py +154 -0
  259. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/timedelta_ops.py +101 -0
  260. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/udt_ops.py +29 -0
  261. snowflake/snowpark_connect/includes/python/pyspark/pandas/datetimes.py +891 -0
  262. snowflake/snowpark_connect/includes/python/pyspark/pandas/exceptions.py +150 -0
  263. snowflake/snowpark_connect/includes/python/pyspark/pandas/extensions.py +388 -0
  264. snowflake/snowpark_connect/includes/python/pyspark/pandas/frame.py +13738 -0
  265. snowflake/snowpark_connect/includes/python/pyspark/pandas/generic.py +3560 -0
  266. snowflake/snowpark_connect/includes/python/pyspark/pandas/groupby.py +4448 -0
  267. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/__init__.py +21 -0
  268. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/base.py +2783 -0
  269. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/category.py +773 -0
  270. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/datetimes.py +843 -0
  271. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/multi.py +1323 -0
  272. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/numeric.py +210 -0
  273. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/timedelta.py +197 -0
  274. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexing.py +1862 -0
  275. snowflake/snowpark_connect/includes/python/pyspark/pandas/internal.py +1680 -0
  276. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/__init__.py +48 -0
  277. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/common.py +76 -0
  278. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/frame.py +63 -0
  279. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/general_functions.py +43 -0
  280. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/groupby.py +93 -0
  281. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/indexes.py +184 -0
  282. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/resample.py +101 -0
  283. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/scalars.py +29 -0
  284. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/series.py +69 -0
  285. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/window.py +168 -0
  286. snowflake/snowpark_connect/includes/python/pyspark/pandas/mlflow.py +238 -0
  287. snowflake/snowpark_connect/includes/python/pyspark/pandas/namespace.py +3807 -0
  288. snowflake/snowpark_connect/includes/python/pyspark/pandas/numpy_compat.py +260 -0
  289. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/__init__.py +17 -0
  290. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/core.py +1213 -0
  291. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/matplotlib.py +928 -0
  292. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/plotly.py +261 -0
  293. snowflake/snowpark_connect/includes/python/pyspark/pandas/resample.py +816 -0
  294. snowflake/snowpark_connect/includes/python/pyspark/pandas/series.py +7440 -0
  295. snowflake/snowpark_connect/includes/python/pyspark/pandas/sql_formatter.py +308 -0
  296. snowflake/snowpark_connect/includes/python/pyspark/pandas/sql_processor.py +394 -0
  297. snowflake/snowpark_connect/includes/python/pyspark/pandas/strings.py +2371 -0
  298. snowflake/snowpark_connect/includes/python/pyspark/pandas/supported_api_gen.py +378 -0
  299. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/__init__.py +16 -0
  300. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/__init__.py +16 -0
  301. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_any_all.py +177 -0
  302. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_apply_func.py +575 -0
  303. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_binary_ops.py +235 -0
  304. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_combine.py +653 -0
  305. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_compute.py +463 -0
  306. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_corrwith.py +86 -0
  307. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_cov.py +151 -0
  308. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_cumulative.py +139 -0
  309. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_describe.py +458 -0
  310. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_eval.py +86 -0
  311. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_melt.py +202 -0
  312. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_missing_data.py +520 -0
  313. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_pivot.py +361 -0
  314. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/__init__.py +16 -0
  315. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/__init__.py +16 -0
  316. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_any_all.py +40 -0
  317. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_apply_func.py +42 -0
  318. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_binary_ops.py +40 -0
  319. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_combine.py +37 -0
  320. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_compute.py +60 -0
  321. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_corrwith.py +40 -0
  322. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_cov.py +40 -0
  323. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_cumulative.py +90 -0
  324. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_describe.py +40 -0
  325. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_eval.py +40 -0
  326. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_melt.py +40 -0
  327. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_missing_data.py +42 -0
  328. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_pivot.py +37 -0
  329. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/__init__.py +16 -0
  330. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_base.py +36 -0
  331. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_binary_ops.py +42 -0
  332. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_boolean_ops.py +47 -0
  333. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_categorical_ops.py +55 -0
  334. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_complex_ops.py +40 -0
  335. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_date_ops.py +47 -0
  336. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_datetime_ops.py +47 -0
  337. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_null_ops.py +42 -0
  338. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_arithmetic.py +43 -0
  339. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_ops.py +47 -0
  340. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_reverse.py +43 -0
  341. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_string_ops.py +47 -0
  342. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_timedelta_ops.py +47 -0
  343. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_udt_ops.py +40 -0
  344. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/testing_utils.py +226 -0
  345. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/__init__.py +16 -0
  346. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_align.py +39 -0
  347. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_basic_slow.py +55 -0
  348. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_cov_corrwith.py +39 -0
  349. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_dot_frame.py +39 -0
  350. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_dot_series.py +39 -0
  351. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_index.py +39 -0
  352. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_series.py +39 -0
  353. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_setitem_frame.py +43 -0
  354. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_setitem_series.py +43 -0
  355. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/__init__.py +16 -0
  356. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_attrs.py +40 -0
  357. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_constructor.py +39 -0
  358. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_conversion.py +42 -0
  359. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_reindexing.py +42 -0
  360. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_reshaping.py +37 -0
  361. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_spark.py +40 -0
  362. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_take.py +42 -0
  363. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_time_series.py +48 -0
  364. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_truncate.py +40 -0
  365. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/__init__.py +16 -0
  366. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_aggregate.py +40 -0
  367. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_apply_func.py +41 -0
  368. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_cumulative.py +67 -0
  369. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_describe.py +40 -0
  370. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_groupby.py +55 -0
  371. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_head_tail.py +40 -0
  372. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_index.py +38 -0
  373. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_missing_data.py +55 -0
  374. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py +39 -0
  375. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_stat.py +38 -0
  376. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/__init__.py +16 -0
  377. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_align.py +40 -0
  378. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_base.py +50 -0
  379. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_category.py +73 -0
  380. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_datetime.py +39 -0
  381. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_indexing.py +40 -0
  382. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_reindex.py +40 -0
  383. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_rename.py +40 -0
  384. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_reset_index.py +48 -0
  385. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_timedelta.py +39 -0
  386. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/io/__init__.py +16 -0
  387. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/io/test_parity_io.py +40 -0
  388. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/__init__.py +16 -0
  389. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot.py +45 -0
  390. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot_matplotlib.py +45 -0
  391. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot_plotly.py +49 -0
  392. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot.py +37 -0
  393. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot_matplotlib.py +53 -0
  394. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot_plotly.py +45 -0
  395. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/__init__.py +16 -0
  396. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_all_any.py +38 -0
  397. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_arg_ops.py +37 -0
  398. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_as_of.py +37 -0
  399. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_as_type.py +38 -0
  400. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_compute.py +37 -0
  401. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_conversion.py +40 -0
  402. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_cumulative.py +40 -0
  403. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_index.py +38 -0
  404. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_missing_data.py +40 -0
  405. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_series.py +37 -0
  406. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_sort.py +38 -0
  407. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_stat.py +38 -0
  408. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_categorical.py +66 -0
  409. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_config.py +37 -0
  410. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_csv.py +37 -0
  411. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_dataframe_conversion.py +42 -0
  412. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_dataframe_spark_io.py +39 -0
  413. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_default_index.py +49 -0
  414. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ewm.py +37 -0
  415. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_expanding.py +39 -0
  416. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_extension.py +49 -0
  417. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_frame_spark.py +53 -0
  418. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_generic_functions.py +43 -0
  419. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_indexing.py +49 -0
  420. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_indexops_spark.py +39 -0
  421. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_internal.py +41 -0
  422. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_namespace.py +39 -0
  423. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_numpy_compat.py +60 -0
  424. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames.py +48 -0
  425. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby.py +39 -0
  426. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_expanding.py +44 -0
  427. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_rolling.py +84 -0
  428. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_repr.py +37 -0
  429. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_resample.py +45 -0
  430. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_reshape.py +39 -0
  431. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_rolling.py +39 -0
  432. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_scalars.py +37 -0
  433. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_conversion.py +39 -0
  434. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_datetime.py +39 -0
  435. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_string.py +39 -0
  436. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_spark_functions.py +39 -0
  437. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_sql.py +43 -0
  438. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_stats.py +37 -0
  439. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_typedef.py +36 -0
  440. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_utils.py +37 -0
  441. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_window.py +39 -0
  442. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/__init__.py +16 -0
  443. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_base.py +107 -0
  444. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_binary_ops.py +224 -0
  445. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py +825 -0
  446. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_categorical_ops.py +562 -0
  447. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_complex_ops.py +368 -0
  448. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_date_ops.py +257 -0
  449. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_datetime_ops.py +260 -0
  450. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_null_ops.py +178 -0
  451. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_arithmetic.py +184 -0
  452. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py +497 -0
  453. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_reverse.py +140 -0
  454. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_string_ops.py +354 -0
  455. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_timedelta_ops.py +219 -0
  456. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_udt_ops.py +192 -0
  457. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/testing_utils.py +228 -0
  458. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/__init__.py +16 -0
  459. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_align.py +118 -0
  460. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_basic_slow.py +198 -0
  461. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_cov_corrwith.py +181 -0
  462. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_dot_frame.py +103 -0
  463. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_dot_series.py +141 -0
  464. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_index.py +109 -0
  465. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_series.py +136 -0
  466. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_setitem_frame.py +125 -0
  467. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_setitem_series.py +217 -0
  468. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/__init__.py +16 -0
  469. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_attrs.py +384 -0
  470. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_constructor.py +598 -0
  471. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_conversion.py +73 -0
  472. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_reindexing.py +869 -0
  473. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_reshaping.py +487 -0
  474. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_spark.py +309 -0
  475. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_take.py +156 -0
  476. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_time_series.py +149 -0
  477. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_truncate.py +163 -0
  478. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/__init__.py +16 -0
  479. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_aggregate.py +311 -0
  480. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_apply_func.py +524 -0
  481. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_cumulative.py +419 -0
  482. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_describe.py +144 -0
  483. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_groupby.py +979 -0
  484. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_head_tail.py +234 -0
  485. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_index.py +206 -0
  486. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_missing_data.py +421 -0
  487. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_split_apply.py +187 -0
  488. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_stat.py +397 -0
  489. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/__init__.py +16 -0
  490. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_align.py +100 -0
  491. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_base.py +2743 -0
  492. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_category.py +484 -0
  493. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_datetime.py +276 -0
  494. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_indexing.py +432 -0
  495. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_reindex.py +310 -0
  496. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_rename.py +257 -0
  497. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_reset_index.py +160 -0
  498. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_timedelta.py +128 -0
  499. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/io/__init__.py +16 -0
  500. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/io/test_io.py +137 -0
  501. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/__init__.py +16 -0
  502. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot.py +170 -0
  503. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot_matplotlib.py +547 -0
  504. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot_plotly.py +285 -0
  505. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot.py +106 -0
  506. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot_matplotlib.py +409 -0
  507. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot_plotly.py +247 -0
  508. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/__init__.py +16 -0
  509. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_all_any.py +105 -0
  510. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_arg_ops.py +197 -0
  511. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_as_of.py +137 -0
  512. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_as_type.py +227 -0
  513. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_compute.py +634 -0
  514. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_conversion.py +88 -0
  515. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_cumulative.py +139 -0
  516. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_index.py +475 -0
  517. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_missing_data.py +265 -0
  518. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_series.py +818 -0
  519. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_sort.py +162 -0
  520. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_stat.py +780 -0
  521. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_categorical.py +741 -0
  522. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_config.py +160 -0
  523. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_csv.py +453 -0
  524. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_dataframe_conversion.py +281 -0
  525. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_dataframe_spark_io.py +487 -0
  526. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_default_index.py +109 -0
  527. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ewm.py +434 -0
  528. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_expanding.py +253 -0
  529. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_extension.py +152 -0
  530. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_frame_spark.py +162 -0
  531. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_generic_functions.py +234 -0
  532. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_indexing.py +1339 -0
  533. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_indexops_spark.py +82 -0
  534. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_internal.py +124 -0
  535. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_namespace.py +638 -0
  536. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_numpy_compat.py +200 -0
  537. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames.py +1355 -0
  538. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py +655 -0
  539. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_expanding.py +113 -0
  540. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_rolling.py +118 -0
  541. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_repr.py +192 -0
  542. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_resample.py +346 -0
  543. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_reshape.py +495 -0
  544. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_rolling.py +263 -0
  545. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_scalars.py +59 -0
  546. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_conversion.py +85 -0
  547. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_datetime.py +364 -0
  548. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_string.py +362 -0
  549. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_spark_functions.py +46 -0
  550. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_sql.py +123 -0
  551. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_stats.py +581 -0
  552. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_typedef.py +447 -0
  553. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_utils.py +301 -0
  554. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_window.py +465 -0
  555. snowflake/snowpark_connect/includes/python/pyspark/pandas/typedef/__init__.py +18 -0
  556. snowflake/snowpark_connect/includes/python/pyspark/pandas/typedef/typehints.py +874 -0
  557. snowflake/snowpark_connect/includes/python/pyspark/pandas/usage_logging/__init__.py +143 -0
  558. snowflake/snowpark_connect/includes/python/pyspark/pandas/usage_logging/usage_logger.py +132 -0
  559. snowflake/snowpark_connect/includes/python/pyspark/pandas/utils.py +1063 -0
  560. snowflake/snowpark_connect/includes/python/pyspark/pandas/window.py +2702 -0
  561. snowflake/snowpark_connect/includes/python/pyspark/profiler.py +489 -0
  562. snowflake/snowpark_connect/includes/python/pyspark/py.typed +1 -0
  563. snowflake/snowpark_connect/includes/python/pyspark/python/pyspark/shell.py +123 -0
  564. snowflake/snowpark_connect/includes/python/pyspark/rdd.py +5518 -0
  565. snowflake/snowpark_connect/includes/python/pyspark/rddsampler.py +115 -0
  566. snowflake/snowpark_connect/includes/python/pyspark/resource/__init__.py +38 -0
  567. snowflake/snowpark_connect/includes/python/pyspark/resource/information.py +69 -0
  568. snowflake/snowpark_connect/includes/python/pyspark/resource/profile.py +317 -0
  569. snowflake/snowpark_connect/includes/python/pyspark/resource/requests.py +539 -0
  570. snowflake/snowpark_connect/includes/python/pyspark/resource/tests/__init__.py +16 -0
  571. snowflake/snowpark_connect/includes/python/pyspark/resource/tests/test_resources.py +83 -0
  572. snowflake/snowpark_connect/includes/python/pyspark/resultiterable.py +45 -0
  573. snowflake/snowpark_connect/includes/python/pyspark/serializers.py +681 -0
  574. snowflake/snowpark_connect/includes/python/pyspark/shell.py +123 -0
  575. snowflake/snowpark_connect/includes/python/pyspark/shuffle.py +854 -0
  576. snowflake/snowpark_connect/includes/python/pyspark/sql/__init__.py +75 -0
  577. snowflake/snowpark_connect/includes/python/pyspark/sql/_typing.pyi +80 -0
  578. snowflake/snowpark_connect/includes/python/pyspark/sql/avro/__init__.py +18 -0
  579. snowflake/snowpark_connect/includes/python/pyspark/sql/avro/functions.py +188 -0
  580. snowflake/snowpark_connect/includes/python/pyspark/sql/catalog.py +1270 -0
  581. snowflake/snowpark_connect/includes/python/pyspark/sql/column.py +1431 -0
  582. snowflake/snowpark_connect/includes/python/pyspark/sql/conf.py +99 -0
  583. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/__init__.py +18 -0
  584. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/_typing.py +90 -0
  585. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/avro/__init__.py +18 -0
  586. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/avro/functions.py +107 -0
  587. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/catalog.py +356 -0
  588. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/__init__.py +22 -0
  589. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/artifact.py +412 -0
  590. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/core.py +1689 -0
  591. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/reattach.py +340 -0
  592. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/column.py +514 -0
  593. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/conf.py +128 -0
  594. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/conversion.py +490 -0
  595. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/dataframe.py +2172 -0
  596. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/expressions.py +1056 -0
  597. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/functions.py +3937 -0
  598. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/group.py +418 -0
  599. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/plan.py +2289 -0
  600. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/__init__.py +25 -0
  601. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/base_pb2.py +203 -0
  602. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/base_pb2.pyi +2718 -0
  603. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/base_pb2_grpc.py +423 -0
  604. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/catalog_pb2.py +109 -0
  605. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/catalog_pb2.pyi +1130 -0
  606. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/commands_pb2.py +141 -0
  607. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/commands_pb2.pyi +1766 -0
  608. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/common_pb2.py +47 -0
  609. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/common_pb2.pyi +123 -0
  610. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/example_plugins_pb2.py +53 -0
  611. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/example_plugins_pb2.pyi +112 -0
  612. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/expressions_pb2.py +107 -0
  613. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/expressions_pb2.pyi +1507 -0
  614. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/relations_pb2.py +195 -0
  615. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/relations_pb2.pyi +3613 -0
  616. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/types_pb2.py +95 -0
  617. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/types_pb2.pyi +980 -0
  618. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/protobuf/__init__.py +18 -0
  619. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/protobuf/functions.py +166 -0
  620. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/readwriter.py +861 -0
  621. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/session.py +952 -0
  622. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/__init__.py +22 -0
  623. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/query.py +295 -0
  624. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/readwriter.py +618 -0
  625. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/__init__.py +18 -0
  626. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/foreach_batch_worker.py +87 -0
  627. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/listener_worker.py +100 -0
  628. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/types.py +301 -0
  629. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/udf.py +296 -0
  630. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/udtf.py +200 -0
  631. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/utils.py +58 -0
  632. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/window.py +266 -0
  633. snowflake/snowpark_connect/includes/python/pyspark/sql/context.py +818 -0
  634. snowflake/snowpark_connect/includes/python/pyspark/sql/dataframe.py +5973 -0
  635. snowflake/snowpark_connect/includes/python/pyspark/sql/functions.py +15889 -0
  636. snowflake/snowpark_connect/includes/python/pyspark/sql/group.py +547 -0
  637. snowflake/snowpark_connect/includes/python/pyspark/sql/observation.py +152 -0
  638. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/__init__.py +21 -0
  639. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/__init__.pyi +344 -0
  640. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/protocols/__init__.pyi +17 -0
  641. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/protocols/frame.pyi +20 -0
  642. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/protocols/series.pyi +20 -0
  643. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/conversion.py +671 -0
  644. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/functions.py +480 -0
  645. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/functions.pyi +132 -0
  646. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/group_ops.py +523 -0
  647. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/map_ops.py +216 -0
  648. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/serializers.py +1019 -0
  649. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/typehints.py +172 -0
  650. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/types.py +972 -0
  651. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/utils.py +86 -0
  652. snowflake/snowpark_connect/includes/python/pyspark/sql/protobuf/__init__.py +18 -0
  653. snowflake/snowpark_connect/includes/python/pyspark/sql/protobuf/functions.py +334 -0
  654. snowflake/snowpark_connect/includes/python/pyspark/sql/readwriter.py +2159 -0
  655. snowflake/snowpark_connect/includes/python/pyspark/sql/session.py +2088 -0
  656. snowflake/snowpark_connect/includes/python/pyspark/sql/sql_formatter.py +84 -0
  657. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/__init__.py +21 -0
  658. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/listener.py +1050 -0
  659. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/query.py +746 -0
  660. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/readwriter.py +1652 -0
  661. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/state.py +288 -0
  662. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/__init__.py +16 -0
  663. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/__init__.py +16 -0
  664. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/__init__.py +16 -0
  665. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/test_artifact.py +420 -0
  666. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/test_client.py +358 -0
  667. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/__init__.py +16 -0
  668. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_foreach.py +36 -0
  669. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_foreach_batch.py +44 -0
  670. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_listener.py +116 -0
  671. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_streaming.py +35 -0
  672. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_basic.py +3612 -0
  673. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_column.py +1042 -0
  674. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_function.py +2381 -0
  675. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_plan.py +1060 -0
  676. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow.py +163 -0
  677. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow_map.py +38 -0
  678. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow_python_udf.py +48 -0
  679. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_catalog.py +36 -0
  680. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_column.py +55 -0
  681. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_conf.py +36 -0
  682. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_dataframe.py +96 -0
  683. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_datasources.py +44 -0
  684. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_errors.py +36 -0
  685. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_functions.py +59 -0
  686. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_group.py +36 -0
  687. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_cogrouped_map.py +59 -0
  688. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_grouped_map.py +74 -0
  689. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_grouped_map_with_state.py +62 -0
  690. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_map.py +58 -0
  691. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf.py +70 -0
  692. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_grouped_agg.py +50 -0
  693. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_scalar.py +68 -0
  694. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_window.py +40 -0
  695. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_readwriter.py +46 -0
  696. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_serde.py +44 -0
  697. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_types.py +100 -0
  698. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_udf.py +100 -0
  699. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_udtf.py +163 -0
  700. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_session.py +181 -0
  701. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_utils.py +42 -0
  702. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/__init__.py +16 -0
  703. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_cogrouped_map.py +623 -0
  704. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_grouped_map.py +869 -0
  705. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_grouped_map_with_state.py +342 -0
  706. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_map.py +436 -0
  707. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf.py +363 -0
  708. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_grouped_agg.py +592 -0
  709. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_scalar.py +1503 -0
  710. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_typehints.py +392 -0
  711. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_typehints_with_future_annotations.py +375 -0
  712. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_window.py +411 -0
  713. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/__init__.py +16 -0
  714. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming.py +401 -0
  715. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_foreach.py +295 -0
  716. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_foreach_batch.py +106 -0
  717. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_listener.py +558 -0
  718. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow.py +1346 -0
  719. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow_map.py +182 -0
  720. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow_python_udf.py +202 -0
  721. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_catalog.py +503 -0
  722. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_column.py +225 -0
  723. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_conf.py +83 -0
  724. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_context.py +201 -0
  725. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_dataframe.py +1931 -0
  726. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_datasources.py +256 -0
  727. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_errors.py +69 -0
  728. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_functions.py +1349 -0
  729. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_group.py +53 -0
  730. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_pandas_sqlmetrics.py +68 -0
  731. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_readwriter.py +283 -0
  732. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_serde.py +155 -0
  733. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_session.py +412 -0
  734. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_types.py +1581 -0
  735. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udf.py +961 -0
  736. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udf_profiler.py +165 -0
  737. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udtf.py +1456 -0
  738. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_utils.py +1686 -0
  739. snowflake/snowpark_connect/includes/python/pyspark/sql/types.py +2558 -0
  740. snowflake/snowpark_connect/includes/python/pyspark/sql/udf.py +714 -0
  741. snowflake/snowpark_connect/includes/python/pyspark/sql/udtf.py +325 -0
  742. snowflake/snowpark_connect/includes/python/pyspark/sql/utils.py +339 -0
  743. snowflake/snowpark_connect/includes/python/pyspark/sql/window.py +492 -0
  744. snowflake/snowpark_connect/includes/python/pyspark/statcounter.py +165 -0
  745. snowflake/snowpark_connect/includes/python/pyspark/status.py +112 -0
  746. snowflake/snowpark_connect/includes/python/pyspark/storagelevel.py +97 -0
  747. snowflake/snowpark_connect/includes/python/pyspark/streaming/__init__.py +22 -0
  748. snowflake/snowpark_connect/includes/python/pyspark/streaming/context.py +471 -0
  749. snowflake/snowpark_connect/includes/python/pyspark/streaming/dstream.py +933 -0
  750. snowflake/snowpark_connect/includes/python/pyspark/streaming/kinesis.py +205 -0
  751. snowflake/snowpark_connect/includes/python/pyspark/streaming/listener.py +83 -0
  752. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/__init__.py +16 -0
  753. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_context.py +184 -0
  754. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_dstream.py +706 -0
  755. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_kinesis.py +118 -0
  756. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_listener.py +160 -0
  757. snowflake/snowpark_connect/includes/python/pyspark/streaming/util.py +168 -0
  758. snowflake/snowpark_connect/includes/python/pyspark/taskcontext.py +502 -0
  759. snowflake/snowpark_connect/includes/python/pyspark/testing/__init__.py +21 -0
  760. snowflake/snowpark_connect/includes/python/pyspark/testing/connectutils.py +199 -0
  761. snowflake/snowpark_connect/includes/python/pyspark/testing/mllibutils.py +30 -0
  762. snowflake/snowpark_connect/includes/python/pyspark/testing/mlutils.py +275 -0
  763. snowflake/snowpark_connect/includes/python/pyspark/testing/objects.py +121 -0
  764. snowflake/snowpark_connect/includes/python/pyspark/testing/pandasutils.py +714 -0
  765. snowflake/snowpark_connect/includes/python/pyspark/testing/sqlutils.py +168 -0
  766. snowflake/snowpark_connect/includes/python/pyspark/testing/streamingutils.py +178 -0
  767. snowflake/snowpark_connect/includes/python/pyspark/testing/utils.py +636 -0
  768. snowflake/snowpark_connect/includes/python/pyspark/tests/__init__.py +16 -0
  769. snowflake/snowpark_connect/includes/python/pyspark/tests/test_appsubmit.py +306 -0
  770. snowflake/snowpark_connect/includes/python/pyspark/tests/test_broadcast.py +196 -0
  771. snowflake/snowpark_connect/includes/python/pyspark/tests/test_conf.py +44 -0
  772. snowflake/snowpark_connect/includes/python/pyspark/tests/test_context.py +346 -0
  773. snowflake/snowpark_connect/includes/python/pyspark/tests/test_daemon.py +89 -0
  774. snowflake/snowpark_connect/includes/python/pyspark/tests/test_install_spark.py +124 -0
  775. snowflake/snowpark_connect/includes/python/pyspark/tests/test_join.py +69 -0
  776. snowflake/snowpark_connect/includes/python/pyspark/tests/test_memory_profiler.py +167 -0
  777. snowflake/snowpark_connect/includes/python/pyspark/tests/test_pin_thread.py +194 -0
  778. snowflake/snowpark_connect/includes/python/pyspark/tests/test_profiler.py +168 -0
  779. snowflake/snowpark_connect/includes/python/pyspark/tests/test_rdd.py +939 -0
  780. snowflake/snowpark_connect/includes/python/pyspark/tests/test_rddbarrier.py +52 -0
  781. snowflake/snowpark_connect/includes/python/pyspark/tests/test_rddsampler.py +66 -0
  782. snowflake/snowpark_connect/includes/python/pyspark/tests/test_readwrite.py +368 -0
  783. snowflake/snowpark_connect/includes/python/pyspark/tests/test_serializers.py +257 -0
  784. snowflake/snowpark_connect/includes/python/pyspark/tests/test_shuffle.py +267 -0
  785. snowflake/snowpark_connect/includes/python/pyspark/tests/test_stage_sched.py +153 -0
  786. snowflake/snowpark_connect/includes/python/pyspark/tests/test_statcounter.py +130 -0
  787. snowflake/snowpark_connect/includes/python/pyspark/tests/test_taskcontext.py +350 -0
  788. snowflake/snowpark_connect/includes/python/pyspark/tests/test_util.py +97 -0
  789. snowflake/snowpark_connect/includes/python/pyspark/tests/test_worker.py +271 -0
  790. snowflake/snowpark_connect/includes/python/pyspark/traceback_utils.py +81 -0
  791. snowflake/snowpark_connect/includes/python/pyspark/util.py +416 -0
  792. snowflake/snowpark_connect/includes/python/pyspark/version.py +19 -0
  793. snowflake/snowpark_connect/includes/python/pyspark/worker.py +1307 -0
  794. snowflake/snowpark_connect/includes/python/pyspark/worker_util.py +46 -0
  795. snowflake/snowpark_connect/proto/__init__.py +10 -0
  796. snowflake/snowpark_connect/proto/control_pb2.py +35 -0
  797. snowflake/snowpark_connect/proto/control_pb2.pyi +38 -0
  798. snowflake/snowpark_connect/proto/control_pb2_grpc.py +183 -0
  799. snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.py +35 -0
  800. snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.pyi +53 -0
  801. snowflake/snowpark_connect/proto/snowflake_rdd_pb2.pyi +39 -0
  802. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.py +47 -0
  803. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.pyi +111 -0
  804. snowflake/snowpark_connect/relation/__init__.py +3 -0
  805. snowflake/snowpark_connect/relation/catalogs/__init__.py +12 -0
  806. snowflake/snowpark_connect/relation/catalogs/abstract_spark_catalog.py +287 -0
  807. snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py +467 -0
  808. snowflake/snowpark_connect/relation/catalogs/utils.py +51 -0
  809. snowflake/snowpark_connect/relation/io_utils.py +76 -0
  810. snowflake/snowpark_connect/relation/map_aggregate.py +322 -0
  811. snowflake/snowpark_connect/relation/map_catalog.py +151 -0
  812. snowflake/snowpark_connect/relation/map_column_ops.py +1068 -0
  813. snowflake/snowpark_connect/relation/map_crosstab.py +48 -0
  814. snowflake/snowpark_connect/relation/map_extension.py +412 -0
  815. snowflake/snowpark_connect/relation/map_join.py +341 -0
  816. snowflake/snowpark_connect/relation/map_local_relation.py +326 -0
  817. snowflake/snowpark_connect/relation/map_map_partitions.py +146 -0
  818. snowflake/snowpark_connect/relation/map_relation.py +253 -0
  819. snowflake/snowpark_connect/relation/map_row_ops.py +716 -0
  820. snowflake/snowpark_connect/relation/map_sample_by.py +35 -0
  821. snowflake/snowpark_connect/relation/map_show_string.py +50 -0
  822. snowflake/snowpark_connect/relation/map_sql.py +1874 -0
  823. snowflake/snowpark_connect/relation/map_stats.py +324 -0
  824. snowflake/snowpark_connect/relation/map_subquery_alias.py +32 -0
  825. snowflake/snowpark_connect/relation/map_udtf.py +288 -0
  826. snowflake/snowpark_connect/relation/read/__init__.py +7 -0
  827. snowflake/snowpark_connect/relation/read/jdbc_read_dbapi.py +668 -0
  828. snowflake/snowpark_connect/relation/read/map_read.py +367 -0
  829. snowflake/snowpark_connect/relation/read/map_read_csv.py +142 -0
  830. snowflake/snowpark_connect/relation/read/map_read_jdbc.py +108 -0
  831. snowflake/snowpark_connect/relation/read/map_read_json.py +344 -0
  832. snowflake/snowpark_connect/relation/read/map_read_parquet.py +194 -0
  833. snowflake/snowpark_connect/relation/read/map_read_socket.py +59 -0
  834. snowflake/snowpark_connect/relation/read/map_read_table.py +109 -0
  835. snowflake/snowpark_connect/relation/read/map_read_text.py +106 -0
  836. snowflake/snowpark_connect/relation/read/reader_config.py +399 -0
  837. snowflake/snowpark_connect/relation/read/utils.py +155 -0
  838. snowflake/snowpark_connect/relation/stage_locator.py +161 -0
  839. snowflake/snowpark_connect/relation/utils.py +219 -0
  840. snowflake/snowpark_connect/relation/write/__init__.py +3 -0
  841. snowflake/snowpark_connect/relation/write/jdbc_write_dbapi.py +339 -0
  842. snowflake/snowpark_connect/relation/write/map_write.py +436 -0
  843. snowflake/snowpark_connect/relation/write/map_write_jdbc.py +48 -0
  844. snowflake/snowpark_connect/resources/java_udfs-1.0-SNAPSHOT.jar +0 -0
  845. snowflake/snowpark_connect/resources_initializer.py +75 -0
  846. snowflake/snowpark_connect/server.py +1136 -0
  847. snowflake/snowpark_connect/start_server.py +32 -0
  848. snowflake/snowpark_connect/tcm.py +8 -0
  849. snowflake/snowpark_connect/type_mapping.py +1003 -0
  850. snowflake/snowpark_connect/typed_column.py +94 -0
  851. snowflake/snowpark_connect/utils/__init__.py +3 -0
  852. snowflake/snowpark_connect/utils/artifacts.py +48 -0
  853. snowflake/snowpark_connect/utils/attribute_handling.py +72 -0
  854. snowflake/snowpark_connect/utils/cache.py +84 -0
  855. snowflake/snowpark_connect/utils/concurrent.py +124 -0
  856. snowflake/snowpark_connect/utils/context.py +390 -0
  857. snowflake/snowpark_connect/utils/describe_query_cache.py +231 -0
  858. snowflake/snowpark_connect/utils/interrupt.py +85 -0
  859. snowflake/snowpark_connect/utils/io_utils.py +35 -0
  860. snowflake/snowpark_connect/utils/pandas_udtf_utils.py +117 -0
  861. snowflake/snowpark_connect/utils/profiling.py +47 -0
  862. snowflake/snowpark_connect/utils/session.py +180 -0
  863. snowflake/snowpark_connect/utils/snowpark_connect_logging.py +38 -0
  864. snowflake/snowpark_connect/utils/telemetry.py +513 -0
  865. snowflake/snowpark_connect/utils/udf_cache.py +392 -0
  866. snowflake/snowpark_connect/utils/udf_helper.py +328 -0
  867. snowflake/snowpark_connect/utils/udf_utils.py +310 -0
  868. snowflake/snowpark_connect/utils/udtf_helper.py +420 -0
  869. snowflake/snowpark_connect/utils/udtf_utils.py +799 -0
  870. snowflake/snowpark_connect/utils/xxhash64.py +247 -0
  871. snowflake/snowpark_connect/version.py +6 -0
  872. snowpark_connect-0.20.2.data/scripts/snowpark-connect +71 -0
  873. snowpark_connect-0.20.2.data/scripts/snowpark-session +11 -0
  874. snowpark_connect-0.20.2.data/scripts/snowpark-submit +354 -0
  875. snowpark_connect-0.20.2.dist-info/METADATA +37 -0
  876. snowpark_connect-0.20.2.dist-info/RECORD +879 -0
  877. snowpark_connect-0.20.2.dist-info/WHEEL +5 -0
  878. snowpark_connect-0.20.2.dist-info/licenses/LICENSE.txt +202 -0
  879. snowpark_connect-0.20.2.dist-info/top_level.txt +1 -0
snowflake/snowpark_connect/includes/python/pyspark/sql/connect/plan.py
@@ -0,0 +1,2289 @@
1
+ #
2
+ # Licensed to the Apache Software Foundation (ASF) under one or more
3
+ # contributor license agreements. See the NOTICE file distributed with
4
+ # this work for additional information regarding copyright ownership.
5
+ # The ASF licenses this file to You under the Apache License, Version 2.0
6
+ # (the "License"); you may not use this file except in compliance with
7
+ # the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ #
17
+ from pyspark.sql.connect.utils import check_dependencies
18
+
19
+ check_dependencies(__name__)
20
+
21
+ from typing import Any, List, Optional, Type, Sequence, Union, cast, TYPE_CHECKING, Mapping, Dict
22
+ import functools
23
+ import json
24
+ import pickle
25
+ from threading import Lock
26
+ from inspect import signature, isclass
27
+
28
+ import pyarrow as pa
29
+
30
+ from pyspark.serializers import CloudPickleSerializer
31
+ from pyspark.storagelevel import StorageLevel
32
+ from pyspark.sql.types import DataType
33
+
34
+ import pyspark.sql.connect.proto as proto
35
+ from pyspark.sql.connect.conversion import storage_level_to_proto
36
+ from pyspark.sql.connect.column import Column
37
+ from pyspark.sql.connect.expressions import (
38
+ Expression,
39
+ SortOrder,
40
+ ColumnReference,
41
+ LiteralExpression,
42
+ )
43
+ from pyspark.sql.connect.types import pyspark_types_to_proto_types, UnparsedDataType
44
+ from pyspark.errors import (
45
+ PySparkTypeError,
46
+ PySparkNotImplementedError,
47
+ PySparkRuntimeError,
48
+ IllegalArgumentException,
49
+ )
50
+
51
+ if TYPE_CHECKING:
52
+ from pyspark.sql.connect._typing import ColumnOrName
53
+ from pyspark.sql.connect.client import SparkConnectClient
54
+ from pyspark.sql.connect.udf import UserDefinedFunction
55
+
56
+
57
+ class LogicalPlan:
58
+
59
+ _lock: Lock = Lock()
60
+ _nextPlanId: int = 0
61
+
62
+ INDENT = 2
63
+
64
+ def __init__(self, child: Optional["LogicalPlan"]) -> None:
65
+ self._child = child
66
+
67
+ plan_id: Optional[int] = None
68
+ with LogicalPlan._lock:
69
+ plan_id = LogicalPlan._nextPlanId
70
+ LogicalPlan._nextPlanId += 1
71
+
72
+ assert plan_id is not None
73
+ self._plan_id = plan_id
74
+
75
+ def _create_proto_relation(self) -> proto.Relation:
76
+ plan = proto.Relation()
77
+ plan.common.plan_id = self._plan_id
78
+ return plan
79
+
80
+ def unresolved_attr(self, colName: str) -> proto.Expression:
81
+ """Creates an unresolved attribute from a column name."""
82
+ exp = proto.Expression()
83
+ exp.unresolved_attribute.unparsed_identifier = colName
84
+ return exp
85
+
86
+ def to_attr_or_expression(
87
+ self, col: "ColumnOrName", session: "SparkConnectClient"
88
+ ) -> proto.Expression:
89
+ """Returns either an instance of an unresolved attribute or the serialized
90
+ expression value of the column."""
91
+ if type(col) is str:
92
+ return self.unresolved_attr(col)
93
+ else:
94
+ return cast(Column, col).to_plan(session)
95
+
96
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
97
+ ...
98
+
99
+ def command(self, session: "SparkConnectClient") -> proto.Command:
100
+ ...
101
+
102
+ def _verify(self, session: "SparkConnectClient") -> bool:
103
+ """This method is used to verify that the current logical plan
104
+ can be serialized to Proto and back and afterwards is identical."""
105
+ plan = proto.Plan()
106
+ plan.root.CopyFrom(self.plan(session))
107
+
108
+ serialized_plan = plan.SerializeToString()
109
+ test_plan = proto.Plan()
110
+ test_plan.ParseFromString(serialized_plan)
111
+
112
+ return test_plan == plan
113
+
114
+ def to_proto(self, session: "SparkConnectClient", debug: bool = False) -> proto.Plan:
115
+ """
116
+ Generates connect proto plan based on this LogicalPlan.
117
+
118
+ Parameters
119
+ ----------
120
+ session : :class:`SparkConnectClient`
121
+ a session that connects to the remote Spark cluster.
122
+ debug: bool
123
+ if enabled, the proto plan will be printed.
124
+ """
125
+ plan = proto.Plan()
126
+ plan.root.CopyFrom(self.plan(session))
127
+
128
+ if debug:
129
+ print(plan)
130
+
131
+ return plan
132
+
133
+ def _parameters_to_print(self, parameters: Mapping[str, Any]) -> Mapping[str, Any]:
134
+ """
135
+ Extracts the parameters that can be printed. It looks up the signature
136
+ in the constructor of this :class:`LogicalPlan`, and retrieves the variables
137
+ from this instance by the same name (or the name with prefix `_`) defined
138
+ in the constructor.
139
+
140
+ Parameters
141
+ ----------
142
+ parameters : map
143
+ Parameter mapping from ``inspect.signature(...).parameters``
144
+
145
+ Returns
146
+ -------
147
+ dict
148
+ A dictionary consisting of a string name and variable found in this
149
+ :class:`LogicalPlan`.
150
+
151
+ Notes
152
+ -----
153
+ :class:`LogicalPlan` itself is filtered out and considered as a non-printable
154
+ parameter.
155
+
156
+ Examples
157
+ --------
158
+ The example below returns a dictionary from `self._start`, `self._end`,
159
+ `self._num_partitions`.
160
+
161
+ >>> rg = Range(0, 10, 1)
162
+ >>> rg._parameters_to_print(signature(rg.__class__.__init__).parameters)
163
+ {'start': 0, 'end': 10, 'step': 1, 'num_partitions': None}
164
+
165
+ If the child is defined, it is not considered as a printable instance
166
+
167
+ >>> project = Project(rg, "value")
168
+ >>> project._parameters_to_print(signature(project.__class__.__init__).parameters)
169
+ {'columns': ['value']}
170
+ """
171
+ params = {}
172
+ for name, tpe in parameters.items():
173
+ # LogicalPlan is not to print, e.g., LogicalPlan
174
+ is_logical_plan = isclass(tpe.annotation) and isinstance(tpe.annotation, LogicalPlan)
175
+ # Look up the string argument defined as a forward reference e.g., "LogicalPlan"
176
+ is_forwardref_logical_plan = getattr(tpe.annotation, "__forward_arg__", "").endswith(
177
+ "LogicalPlan"
178
+ )
179
+ # Wrapped LogicalPlan, e.g., Optional[LogicalPlan]
180
+ is_nested_logical_plan = any(
181
+ isclass(a) and issubclass(a, LogicalPlan)
182
+ for a in getattr(tpe.annotation, "__args__", ())
183
+ )
184
+ # Wrapped forward reference of LogicalPlan, e.g., Optional["LogicalPlan"].
185
+ is_nested_forwardref_logical_plan = any(
186
+ getattr(a, "__forward_arg__", "").endswith("LogicalPlan")
187
+ for a in getattr(tpe.annotation, "__args__", ())
188
+ )
189
+ if (
190
+ not is_logical_plan
191
+ and not is_forwardref_logical_plan
192
+ and not is_nested_logical_plan
193
+ and not is_nested_forwardref_logical_plan
194
+ ):
195
+ # Searches self.name or self._name
196
+ try:
197
+ params[name] = getattr(self, name)
198
+ except AttributeError:
199
+ try:
200
+ params[name] = getattr(self, "_" + name)
201
+ except AttributeError:
202
+ pass  # Simply ignore
203
+ return params
204
+
205
+ def print(self, indent: int = 0) -> str:
206
+ """
207
+ Print the simple string representation of the current :class:`LogicalPlan`.
208
+
209
+ Parameters
210
+ ----------
211
+ indent : int
212
+ The number of leading spaces for the output string.
213
+
214
+ Returns
215
+ -------
216
+ str
217
+ Simple string representation of this :class:`LogicalPlan`.
218
+ """
219
+ params = self._parameters_to_print(signature(self.__class__.__init__).parameters)
220
+ pretty_params = [f"{name}='{param}'" for name, param in params.items()]
221
+ if len(pretty_params) == 0:
222
+ pretty_str = ""
223
+ else:
224
+ pretty_str = " " + ", ".join(pretty_params)
225
+ return f"{' ' * indent}<{self.__class__.__name__}{pretty_str}>\n{self._child_print(indent)}"
226
+
227
+ def _repr_html_(self) -> str:
228
+ """Returns a :class:`LogicalPlan` with HTML code. This is generally called in third-party
229
+ systems such as Jupyter.
230
+
231
+ Returns
232
+ -------
233
+ str
234
+ HTML representation of this :class:`LogicalPlan`.
235
+ """
236
+ params = self._parameters_to_print(signature(self.__class__.__init__).parameters)
237
+ pretty_params = [
238
+ f"\n {name}: " f"{param} <br/>" for name, param in params.items()
239
+ ]
240
+ if len(pretty_params) == 0:
241
+ pretty_str = ""
242
+ else:
243
+ pretty_str = "".join(pretty_params)
244
+ return f"""
245
+ <ul>
246
+ <li>
247
+ <b>{self.__class__.__name__}</b><br/>{pretty_str}
248
+ {self._child_repr()}
249
+ </li>
250
+ </ul>
251
+ """
252
+
253
+ def _child_print(self, indent: int) -> str:
254
+ return self._child.print(indent + LogicalPlan.INDENT) if self._child else ""
255
+
256
+ def _child_repr(self) -> str:
257
+ return self._child._repr_html_() if self._child is not None else ""
258
+
259
+
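For orientation, a minimal sketch of how these plan nodes are typically composed and serialized. The table and column names are made up for the example, and a connected SparkConnectClient is assumed to be available as client:

# Illustrative sketch only: compose a small plan tree and serialize it.
# Assumes a connected SparkConnectClient is available as `client`; the table
# and column names are invented for this example.
from pyspark.sql.connect.plan import Read, Project, Filter
from pyspark.sql.connect.functions import col

read = Read("samples.lineitem")                    # leaf node, no child
proj = Project(read, "l_orderkey", "l_quantity")   # projection over the read
filt = Filter(proj, col("l_quantity") > 10)        # filter on top of the projection

print(filt.print())           # indented tree built by LogicalPlan.print/_child_print
plan = filt.to_proto(client)  # proto.Plan whose root is the Filter relation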
260
+ class DataSource(LogicalPlan):
261
+ """A datasource with a format and optional a schema from which Spark reads data"""
262
+
263
+ def __init__(
264
+ self,
265
+ format: Optional[str] = None,
266
+ schema: Optional[str] = None,
267
+ options: Optional[Mapping[str, str]] = None,
268
+ paths: Optional[List[str]] = None,
269
+ predicates: Optional[List[str]] = None,
270
+ is_streaming: Optional[bool] = None,
271
+ ) -> None:
272
+ super().__init__(None)
273
+
274
+ assert format is None or isinstance(format, str)
275
+ assert schema is None or isinstance(schema, str)
276
+
277
+ if options is not None:
278
+ for k, v in options.items():
279
+ assert isinstance(k, str)
280
+ assert isinstance(v, str)
281
+
282
+ if paths is not None:
283
+ assert isinstance(paths, list)
284
+ assert all(isinstance(path, str) for path in paths)
285
+
286
+ if predicates is not None:
287
+ assert isinstance(predicates, list)
288
+ assert all(isinstance(predicate, str) for predicate in predicates)
289
+
290
+ self._format = format
291
+ self._schema = schema
292
+ self._options = options
293
+ self._paths = paths
294
+ self._predicates = predicates
295
+ self._is_streaming = is_streaming
296
+
297
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
298
+ plan = self._create_proto_relation()
299
+ if self._format is not None:
300
+ plan.read.data_source.format = self._format
301
+ if self._schema is not None:
302
+ plan.read.data_source.schema = self._schema
303
+ if self._options is not None and len(self._options) > 0:
304
+ for k, v in self._options.items():
305
+ plan.read.data_source.options[k] = v
306
+ if self._paths is not None and len(self._paths) > 0:
307
+ plan.read.data_source.paths.extend(self._paths)
308
+ if self._predicates is not None and len(self._predicates) > 0:
309
+ plan.read.data_source.predicates.extend(self._predicates)
310
+ if self._is_streaming is not None:
311
+ plan.read.is_streaming = self._is_streaming
312
+ return plan
313
+
314
+
315
+ class Read(LogicalPlan):
316
+ def __init__(
317
+ self,
318
+ table_name: str,
319
+ options: Optional[Dict[str, str]] = None,
320
+ is_streaming: Optional[bool] = None,
321
+ ) -> None:
322
+ super().__init__(None)
323
+ self.table_name = table_name
324
+ self.options = options or {}
325
+ self._is_streaming = is_streaming
326
+
327
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
328
+ plan = self._create_proto_relation()
329
+ plan.read.named_table.unparsed_identifier = self.table_name
330
+ if self._is_streaming is not None:
331
+ plan.read.is_streaming = self._is_streaming
332
+ for k, v in self.options.items():
333
+ plan.read.named_table.options[k] = v
334
+ return plan
335
+
336
+ def print(self, indent: int = 0) -> str:
337
+ return f"{' ' * indent}<Read table_name={self.table_name}>\n"
338
+
339
+
340
+ class LocalRelation(LogicalPlan):
341
+ """Creates a LocalRelation plan object based on a PyArrow Table."""
342
+
343
+ def __init__(
344
+ self,
345
+ table: Optional["pa.Table"],
346
+ schema: Optional[str] = None,
347
+ ) -> None:
348
+ super().__init__(None)
349
+
350
+ if table is None:
351
+ assert schema is not None
352
+ else:
353
+ assert isinstance(table, pa.Table)
354
+
355
+ assert schema is None or isinstance(schema, str)
356
+
357
+ self._table = table
358
+
359
+ self._schema = schema
360
+
361
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
362
+ plan = self._create_proto_relation()
363
+ if self._table is not None:
364
+ sink = pa.BufferOutputStream()
365
+ with pa.ipc.new_stream(sink, self._table.schema) as writer:
366
+ for b in self._table.to_batches():
367
+ writer.write_batch(b)
368
+ plan.local_relation.data = sink.getvalue().to_pybytes()
369
+
370
+ if self._schema is not None:
371
+ plan.local_relation.schema = self._schema
372
+ return plan
373
+
374
+ def serialize(self, session: "SparkConnectClient") -> bytes:
375
+ p = self.plan(session)
376
+ return bytes(p.local_relation.SerializeToString())
377
+
378
+ def print(self, indent: int = 0) -> str:
379
+ return f"{' ' * indent}<LocalRelation>\n"
380
+
381
+ def _repr_html_(self) -> str:
382
+ return """
383
+ <ul>
384
+ <li><b>LocalRelation</b></li>
385
+ </ul>
386
+ """
387
+
388
+
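A standalone sketch of the Arrow IPC round trip that LocalRelation.plan performs: the table is streamed batch-by-batch into an in-memory sink, and the resulting bytes are what end up in plan.local_relation.data (the sample data below is illustrative):

import pyarrow as pa

table = pa.table({"id": [1, 2, 3], "name": ["a", "b", "c"]})  # illustrative data

sink = pa.BufferOutputStream()
with pa.ipc.new_stream(sink, table.schema) as writer:
    for batch in table.to_batches():
        writer.write_batch(batch)
data = sink.getvalue().to_pybytes()   # bytes placed into local_relation.data

# Reading the stream back recovers an identical table.
restored = pa.ipc.open_stream(pa.BufferReader(data)).read_all()
assert restored.equals(table)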
389
+ class CachedLocalRelation(LogicalPlan):
390
+ """Creates a CachedLocalRelation plan object based on a hash of a LocalRelation."""
391
+
392
+ def __init__(self, hash: str) -> None:
393
+ super().__init__(None)
394
+
395
+ self._hash = hash
396
+
397
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
398
+ plan = self._create_proto_relation()
399
+ clr = plan.cached_local_relation
400
+
401
+ clr.hash = self._hash
402
+
403
+ return plan
404
+
405
+ def print(self, indent: int = 0) -> str:
406
+ return f"{' ' * indent}<CachedLocalRelation>\n"
407
+
408
+ def _repr_html_(self) -> str:
409
+ return """
410
+ <ul>
411
+ <li><b>CachedLocalRelation</b></li>
412
+ </ul>
413
+ """
414
+
415
+
416
+ class ShowString(LogicalPlan):
417
+ def __init__(
418
+ self, child: Optional["LogicalPlan"], num_rows: int, truncate: int, vertical: bool
419
+ ) -> None:
420
+ super().__init__(child)
421
+ self.num_rows = num_rows
422
+ self.truncate = truncate
423
+ self.vertical = vertical
424
+
425
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
426
+ assert self._child is not None
427
+ plan = self._create_proto_relation()
428
+ plan.show_string.input.CopyFrom(self._child.plan(session))
429
+ plan.show_string.num_rows = self.num_rows
430
+ plan.show_string.truncate = self.truncate
431
+ plan.show_string.vertical = self.vertical
432
+ return plan
433
+
434
+
435
+ class HtmlString(LogicalPlan):
436
+ def __init__(self, child: Optional["LogicalPlan"], num_rows: int, truncate: int) -> None:
437
+ super().__init__(child)
438
+ self.num_rows = num_rows
439
+ self.truncate = truncate
440
+
441
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
442
+ assert self._child is not None
443
+ plan = self._create_proto_relation()
444
+ plan.html_string.input.CopyFrom(self._child.plan(session))
445
+ plan.html_string.num_rows = self.num_rows
446
+ plan.html_string.truncate = self.truncate
447
+ return plan
448
+
449
+
450
+ class Project(LogicalPlan):
451
+ """Logical plan object for a projection.
452
+
453
+ All input arguments are directly serialized into the corresponding protocol buffer
454
+ objects. This class only provides very limited error handling and input validation.
455
+
456
+ To be compatible with PySpark, we validate that the input arguments are all
457
+ expressions so that they can be serialized and sent to the server.
458
+
459
+ """
460
+
461
+ def __init__(self, child: Optional["LogicalPlan"], *columns: "ColumnOrName") -> None:
462
+ super().__init__(child)
463
+ self._columns = list(columns)
464
+ self.alias: Optional[str] = None
465
+ self._verify_expressions()
466
+
467
+ def _verify_expressions(self) -> None:
468
+ """Ensures that all input arguments are instances of Expression or String."""
469
+ for c in self._columns:
470
+ if not isinstance(c, (Column, str)):
471
+ raise PySparkTypeError(
472
+ error_class="NOT_LIST_OF_COLUMN_OR_STR",
473
+ message_parameters={"arg_name": "columns"},
474
+ )
475
+
476
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
477
+ from pyspark.sql.connect.functions import col
478
+
479
+ assert self._child is not None
480
+ plan = self._create_proto_relation()
481
+ plan.project.input.CopyFrom(self._child.plan(session))
482
+
483
+ proj_exprs = []
484
+ for c in self._columns:
485
+ if isinstance(c, Column):
486
+ proj_exprs.append(c.to_plan(session))
487
+ else:
488
+ proj_exprs.append(col(c).to_plan(session))
489
+
490
+ plan.project.expressions.extend(proj_exprs)
491
+ return plan
492
+
493
+
494
+ class WithColumns(LogicalPlan):
495
+ """Logical plan object for a withColumns operation."""
496
+
497
+ def __init__(
498
+ self,
499
+ child: Optional["LogicalPlan"],
500
+ columnNames: Sequence[str],
501
+ columns: Sequence[Column],
502
+ metadata: Optional[Sequence[str]] = None,
503
+ ) -> None:
504
+ super().__init__(child)
505
+
506
+ assert isinstance(columnNames, list)
507
+ assert len(columnNames) > 0
508
+ assert all(isinstance(c, str) for c in columnNames)
509
+
510
+ assert isinstance(columns, list)
511
+ assert len(columns) == len(columnNames)
512
+ assert all(isinstance(c, Column) for c in columns)
513
+
514
+ if metadata is not None:
515
+ assert isinstance(metadata, list)
516
+ assert len(metadata) == len(columnNames)
517
+ for m in metadata:
518
+ assert isinstance(m, str)
519
+ # validate json string
520
+ assert m == "" or json.loads(m) is not None
521
+
522
+ self._columnNames = columnNames
523
+ self._columns = columns
524
+ self._metadata = metadata
525
+
526
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
527
+ assert self._child is not None
528
+ plan = self._create_proto_relation()
529
+ plan.with_columns.input.CopyFrom(self._child.plan(session))
530
+
531
+ for i in range(0, len(self._columnNames)):
532
+ alias = proto.Expression.Alias()
533
+ alias.expr.CopyFrom(self._columns[i].to_plan(session))
534
+ alias.name.append(self._columnNames[i])
535
+ if self._metadata is not None:
536
+ alias.metadata = self._metadata[i]
537
+ plan.with_columns.aliases.append(alias)
538
+
539
+ return plan
540
+
541
+
542
+ class WithWatermark(LogicalPlan):
543
+ """Logical plan object for a WithWatermark operation."""
544
+
545
+ def __init__(self, child: Optional["LogicalPlan"], event_time: str, delay_threshold: str):
546
+ super().__init__(child)
547
+ self._event_time = event_time
548
+ self._delay_threshold = delay_threshold
549
+
550
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
551
+ assert self._child is not None
552
+ plan = self._create_proto_relation()
553
+ plan.with_watermark.input.CopyFrom(self._child.plan(session))
554
+ plan.with_watermark.event_time = self._event_time
555
+ plan.with_watermark.delay_threshold = self._delay_threshold
556
+ return plan
557
+
558
+
559
+ class CachedRemoteRelation(LogicalPlan):
560
+ """Logical plan object for a DataFrame reference which represents a DataFrame that's been
561
+ cached on the server with a given id."""
562
+
563
+ def __init__(self, relationId: str):
564
+ super().__init__(None)
565
+ self._relationId = relationId
566
+
567
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
568
+ plan = self._create_proto_relation()
569
+ plan.cached_remote_relation.relation_id = self._relationId
570
+ return plan
571
+
572
+
573
+ class Hint(LogicalPlan):
574
+ """Logical plan object for a Hint operation."""
575
+
576
+ def __init__(self, child: Optional["LogicalPlan"], name: str, parameters: List[Any]) -> None:
577
+ super().__init__(child)
578
+
579
+ assert isinstance(name, str)
580
+
581
+ self._name = name
582
+
583
+ for param in parameters:
584
+ assert isinstance(param, (list, str, float, int))
585
+ if isinstance(param, list):
586
+ assert all(isinstance(p, (str, float, int)) for p in param)
587
+
588
+ self._parameters = parameters
589
+
590
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
591
+ from pyspark.sql.connect.functions import array, lit
592
+
593
+ assert self._child is not None
594
+ plan = self._create_proto_relation()
595
+ plan.hint.input.CopyFrom(self._child.plan(session))
596
+ plan.hint.name = self._name
597
+ for param in self._parameters:
598
+ if isinstance(param, list):
599
+ plan.hint.parameters.append(array(*[lit(p) for p in param]).to_plan(session))
600
+ else:
601
+ plan.hint.parameters.append(lit(param).to_plan(session))
602
+ return plan
603
+
604
+
605
+ class Filter(LogicalPlan):
606
+ def __init__(self, child: Optional["LogicalPlan"], filter: Column) -> None:
607
+ super().__init__(child)
608
+ self.filter = filter
609
+
610
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
611
+ assert self._child is not None
612
+ plan = self._create_proto_relation()
613
+ plan.filter.input.CopyFrom(self._child.plan(session))
614
+ plan.filter.condition.CopyFrom(self.filter.to_plan(session))
615
+ return plan
616
+
617
+
618
+ class Limit(LogicalPlan):
619
+ def __init__(self, child: Optional["LogicalPlan"], limit: int) -> None:
620
+ super().__init__(child)
621
+ self.limit = limit
622
+
623
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
624
+ assert self._child is not None
625
+ plan = self._create_proto_relation()
626
+ plan.limit.input.CopyFrom(self._child.plan(session))
627
+ plan.limit.limit = self.limit
628
+ return plan
629
+
630
+
631
+ class Tail(LogicalPlan):
632
+ def __init__(self, child: Optional["LogicalPlan"], limit: int) -> None:
633
+ super().__init__(child)
634
+ self.limit = limit
635
+
636
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
637
+ assert self._child is not None
638
+ plan = self._create_proto_relation()
639
+ plan.tail.input.CopyFrom(self._child.plan(session))
640
+ plan.tail.limit = self.limit
641
+ return plan
642
+
643
+
644
+ class Offset(LogicalPlan):
645
+ def __init__(self, child: Optional["LogicalPlan"], offset: int = 0) -> None:
646
+ super().__init__(child)
647
+ self.offset = offset
648
+
649
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
650
+ assert self._child is not None
651
+ plan = self._create_proto_relation()
652
+ plan.offset.input.CopyFrom(self._child.plan(session))
653
+ plan.offset.offset = self.offset
654
+ return plan
655
+
656
+
657
+ class Deduplicate(LogicalPlan):
658
+ def __init__(
659
+ self,
660
+ child: Optional["LogicalPlan"],
661
+ all_columns_as_keys: bool = False,
662
+ column_names: Optional[List[str]] = None,
663
+ within_watermark: bool = False,
664
+ ) -> None:
665
+ super().__init__(child)
666
+ self.all_columns_as_keys = all_columns_as_keys
667
+ self.column_names = column_names
668
+ self.within_watermark = within_watermark
669
+
670
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
671
+ assert self._child is not None
672
+ plan = self._create_proto_relation()
673
+ plan.deduplicate.input.CopyFrom(self._child.plan(session))
674
+ plan.deduplicate.all_columns_as_keys = self.all_columns_as_keys
675
+ plan.deduplicate.within_watermark = self.within_watermark
676
+ if self.column_names is not None:
677
+ plan.deduplicate.column_names.extend(self.column_names)
678
+ return plan
679
+
680
+
681
+ class Sort(LogicalPlan):
682
+ def __init__(
683
+ self,
684
+ child: Optional["LogicalPlan"],
685
+ columns: List[Column],
686
+ is_global: bool,
687
+ ) -> None:
688
+ super().__init__(child)
689
+
690
+ assert all(isinstance(c, Column) for c in columns)
691
+ assert isinstance(is_global, bool)
692
+
693
+ self.columns = columns
694
+ self.is_global = is_global
695
+
696
+ def _convert_col(
697
+ self, col: Column, session: "SparkConnectClient"
698
+ ) -> proto.Expression.SortOrder:
699
+ if isinstance(col._expr, SortOrder):
700
+ return col._expr.to_plan(session).sort_order
701
+ else:
702
+ return SortOrder(col._expr).to_plan(session).sort_order
703
+
704
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
705
+ assert self._child is not None
706
+ plan = self._create_proto_relation()
707
+ plan.sort.input.CopyFrom(self._child.plan(session))
708
+ plan.sort.order.extend([self._convert_col(c, session) for c in self.columns])
709
+ plan.sort.is_global = self.is_global
710
+ return plan
711
+
712
+
713
+ class Drop(LogicalPlan):
714
+ def __init__(
715
+ self,
716
+ child: Optional["LogicalPlan"],
717
+ columns: List[Union[Column, str]],
718
+ ) -> None:
719
+ super().__init__(child)
720
+ if len(columns) > 0:
721
+ assert all(isinstance(c, (Column, str)) for c in columns)
722
+ self._columns = columns
723
+
724
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
725
+ assert self._child is not None
726
+ plan = self._create_proto_relation()
727
+ plan.drop.input.CopyFrom(self._child.plan(session))
728
+ for c in self._columns:
729
+ if isinstance(c, Column):
730
+ plan.drop.columns.append(c.to_plan(session))
731
+ else:
732
+ plan.drop.column_names.append(c)
733
+ return plan
734
+
735
+
736
+ class Sample(LogicalPlan):
737
+ def __init__(
738
+ self,
739
+ child: Optional["LogicalPlan"],
740
+ lower_bound: float,
741
+ upper_bound: float,
742
+ with_replacement: bool,
743
+ seed: Optional[int],
744
+ deterministic_order: bool = False,
745
+ ) -> None:
746
+ super().__init__(child)
747
+ self.lower_bound = lower_bound
748
+ self.upper_bound = upper_bound
749
+ self.with_replacement = with_replacement
750
+ self.seed = seed
751
+ self.deterministic_order = deterministic_order
752
+
753
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
754
+ assert self._child is not None
755
+ plan = self._create_proto_relation()
756
+ plan.sample.input.CopyFrom(self._child.plan(session))
757
+ plan.sample.lower_bound = self.lower_bound
758
+ plan.sample.upper_bound = self.upper_bound
759
+ plan.sample.with_replacement = self.with_replacement
760
+ if self.seed is not None:
761
+ plan.sample.seed = self.seed
762
+ plan.sample.deterministic_order = self.deterministic_order
763
+ return plan
764
+
765
+
766
+ class Aggregate(LogicalPlan):
767
+ def __init__(
768
+ self,
769
+ child: Optional["LogicalPlan"],
770
+ group_type: str,
771
+ grouping_cols: Sequence[Column],
772
+ aggregate_cols: Sequence[Column],
773
+ pivot_col: Optional[Column],
774
+ pivot_values: Optional[Sequence[Any]],
775
+ ) -> None:
776
+ super().__init__(child)
777
+
778
+ assert isinstance(group_type, str) and group_type in ["groupby", "rollup", "cube", "pivot"]
779
+ self._group_type = group_type
780
+
781
+ assert isinstance(grouping_cols, list) and all(isinstance(c, Column) for c in grouping_cols)
782
+ self._grouping_cols = grouping_cols
783
+
784
+ assert isinstance(aggregate_cols, list) and all(
785
+ isinstance(c, Column) for c in aggregate_cols
786
+ )
787
+ self._aggregate_cols = aggregate_cols
788
+
789
+ if group_type == "pivot":
790
+ assert pivot_col is not None and isinstance(pivot_col, Column)
791
+ assert pivot_values is None or isinstance(pivot_values, list)
792
+ else:
793
+ assert pivot_col is None
794
+ assert pivot_values is None
795
+
796
+ self._pivot_col = pivot_col
797
+ self._pivot_values = pivot_values
798
+
799
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
800
+ from pyspark.sql.connect.functions import lit
801
+
802
+ assert self._child is not None
803
+ plan = self._create_proto_relation()
804
+ plan.aggregate.input.CopyFrom(self._child.plan(session))
805
+ plan.aggregate.grouping_expressions.extend(
806
+ [c.to_plan(session) for c in self._grouping_cols]
807
+ )
808
+ plan.aggregate.aggregate_expressions.extend(
809
+ [c.to_plan(session) for c in self._aggregate_cols]
810
+ )
811
+
812
+ if self._group_type == "groupby":
813
+ plan.aggregate.group_type = proto.Aggregate.GroupType.GROUP_TYPE_GROUPBY
814
+ elif self._group_type == "rollup":
815
+ plan.aggregate.group_type = proto.Aggregate.GroupType.GROUP_TYPE_ROLLUP
816
+ elif self._group_type == "cube":
817
+ plan.aggregate.group_type = proto.Aggregate.GroupType.GROUP_TYPE_CUBE
818
+ elif self._group_type == "pivot":
819
+ plan.aggregate.group_type = proto.Aggregate.GroupType.GROUP_TYPE_PIVOT
820
+ assert self._pivot_col is not None
821
+ plan.aggregate.pivot.col.CopyFrom(self._pivot_col.to_plan(session))
822
+ if self._pivot_values is not None and len(self._pivot_values) > 0:
823
+ plan.aggregate.pivot.values.extend(
824
+ [lit(v).to_plan(session).literal for v in self._pivot_values]
825
+ )
826
+
827
+ return plan
828
+
829
+
830
+ class Join(LogicalPlan):
831
+ def __init__(
832
+ self,
833
+ left: Optional["LogicalPlan"],
834
+ right: "LogicalPlan",
835
+ on: Optional[Union[str, List[str], Column, List[Column]]],
836
+ how: Optional[str],
837
+ ) -> None:
838
+ super().__init__(left)
839
+ self.left = cast(LogicalPlan, left)
840
+ self.right = right
841
+ self.on = on
842
+ if how is None:
843
+ join_type = proto.Join.JoinType.JOIN_TYPE_INNER
844
+ elif how == "inner":
845
+ join_type = proto.Join.JoinType.JOIN_TYPE_INNER
846
+ elif how in ["outer", "full", "fullouter"]:
847
+ join_type = proto.Join.JoinType.JOIN_TYPE_FULL_OUTER
848
+ elif how in ["leftouter", "left"]:
849
+ join_type = proto.Join.JoinType.JOIN_TYPE_LEFT_OUTER
850
+ elif how in ["rightouter", "right"]:
851
+ join_type = proto.Join.JoinType.JOIN_TYPE_RIGHT_OUTER
852
+ elif how in ["leftsemi", "semi"]:
853
+ join_type = proto.Join.JoinType.JOIN_TYPE_LEFT_SEMI
854
+ elif how in ["leftanti", "anti"]:
855
+ join_type = proto.Join.JoinType.JOIN_TYPE_LEFT_ANTI
856
+ elif how == "cross":
857
+ join_type = proto.Join.JoinType.JOIN_TYPE_CROSS
858
+ else:
859
+ raise IllegalArgumentException(
860
+ error_class="UNSUPPORTED_JOIN_TYPE",
861
+ message_parameters={"join_type": how},
862
+ )
863
+ self.how = join_type
864
+
865
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
866
+ plan = self._create_proto_relation()
867
+ plan.join.left.CopyFrom(self.left.plan(session))
868
+ plan.join.right.CopyFrom(self.right.plan(session))
869
+ if self.on is not None:
870
+ if not isinstance(self.on, list):
871
+ if isinstance(self.on, str):
872
+ plan.join.using_columns.append(self.on)
873
+ else:
874
+ plan.join.join_condition.CopyFrom(self.to_attr_or_expression(self.on, session))
875
+ elif len(self.on) > 0:
876
+ if isinstance(self.on[0], str):
877
+ plan.join.using_columns.extend(cast(str, self.on))
878
+ else:
879
+ merge_column = functools.reduce(lambda c1, c2: c1 & c2, self.on)
880
+ plan.join.join_condition.CopyFrom(cast(Column, merge_column).to_plan(session))
881
+ plan.join.join_type = self.how
882
+ return plan
883
+
884
+ def print(self, indent: int = 0) -> str:
885
+ i = " " * indent
886
+ o = " " * (indent + LogicalPlan.INDENT)
887
+ n = indent + LogicalPlan.INDENT * 2
888
+ return (
889
+ f"{i}<Join on={self.on} how={self.how}>\n{o}"
890
+ f"left=\n{self.left.print(n)}\n{o}right=\n{self.right.print(n)}"
891
+ )
892
+
893
+ def _repr_html_(self) -> str:
894
+ return f"""
895
+ <ul>
896
+ <li>
897
+ <b>Join</b><br />
898
+ Left: {self.left._repr_html_()}
899
+ Right: {self.right._repr_html_()}
900
+ </li>
901
+ </ul>
902
+ """
903
+
904
+
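A short sketch of how user-level join calls line up with the `how` strings accepted above; `left` and `right` are hypothetical DataFrames, not identifiers from this package.

    left.join(right, on="id")                      # how=None -> JOIN_TYPE_INNER
    left.join(right, on=["id"], how="left")        # JOIN_TYPE_LEFT_OUTER
    left.join(right, on=["id"], how="fullouter")   # JOIN_TYPE_FULL_OUTER
    left.join(right, on=["id"], how="semi")        # JOIN_TYPE_LEFT_SEMI
    left.crossJoin(right)                          # JOIN_TYPE_CROSS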
905
+ class SetOperation(LogicalPlan):
906
+ def __init__(
907
+ self,
908
+ child: Optional["LogicalPlan"],
909
+ other: Optional["LogicalPlan"],
910
+ set_op: str,
911
+ is_all: bool = True,
912
+ by_name: bool = False,
913
+ allow_missing_columns: bool = False,
914
+ ) -> None:
915
+ super().__init__(child)
916
+ self.other = other
917
+ self.by_name = by_name
918
+ self.is_all = is_all
919
+ self.set_op = set_op
920
+ self.allow_missing_columns = allow_missing_columns
921
+
922
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
923
+ assert self._child is not None
924
+ plan = self._create_proto_relation()
925
+ if self._child is not None:
926
+ plan.set_op.left_input.CopyFrom(self._child.plan(session))
927
+ if self.other is not None:
928
+ plan.set_op.right_input.CopyFrom(self.other.plan(session))
929
+ if self.set_op == "union":
930
+ plan.set_op.set_op_type = proto.SetOperation.SET_OP_TYPE_UNION
931
+ elif self.set_op == "intersect":
932
+ plan.set_op.set_op_type = proto.SetOperation.SET_OP_TYPE_INTERSECT
933
+ elif self.set_op == "except":
934
+ plan.set_op.set_op_type = proto.SetOperation.SET_OP_TYPE_EXCEPT
935
+ else:
936
+ raise PySparkNotImplementedError(
937
+ error_class="UNSUPPORTED_OPERATION",
938
+ message_parameters={"feature": self.set_op},
939
+ )
940
+
941
+ plan.set_op.is_all = self.is_all
942
+ plan.set_op.by_name = self.by_name
943
+ plan.set_op.allow_missing_columns = self.allow_missing_columns
944
+ return plan
945
+
946
+ def print(self, indent: int = 0) -> str:
947
+ assert self._child is not None
948
+ assert self.other is not None
949
+
950
+ i = " " * indent
951
+ o = " " * (indent + LogicalPlan.INDENT)
952
+ n = indent + LogicalPlan.INDENT * 2
953
+ return (
954
+ f"{i}SetOperation\n{o}child1=\n{self._child.print(n)}"
955
+ f"\n{o}child2=\n{self.other.print(n)}"
956
+ )
957
+
958
+ def _repr_html_(self) -> str:
959
+ assert self._child is not None
960
+ assert self.other is not None
961
+
962
+ return f"""
963
+ <ul>
964
+ <li>
965
+ <b>SetOperation</b><br />
966
+ Left: {self._child._repr_html_()}
967
+ Right: {self.other._repr_html_()}
968
+ </li>
969
+ </ul>
970
+ """
971
+
972
+
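A sketch of the DataFrame methods a SetOperation plan typically comes from; `df1` and `df2` are assumed DataFrames with compatible schemas.

    df1.union(df2)                                  # set_op="union", is_all=True
    df1.unionByName(df2, allowMissingColumns=True)  # by_name=True, allow_missing_columns=True
    df1.intersect(df2)                              # set_op="intersect", is_all=False
    df1.exceptAll(df2)                              # set_op="except", is_all=True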
973
+ class Repartition(LogicalPlan):
974
+ """Repartition Relation into a different number of partitions."""
975
+
976
+ def __init__(self, child: Optional["LogicalPlan"], num_partitions: int, shuffle: bool) -> None:
977
+ super().__init__(child)
978
+ self._num_partitions = num_partitions
979
+ self._shuffle = shuffle
980
+
981
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
982
+ plan = self._create_proto_relation()
983
+ if self._child is not None:
984
+ plan.repartition.input.CopyFrom(self._child.plan(session))
985
+ plan.repartition.shuffle = self._shuffle
986
+ plan.repartition.num_partitions = self._num_partitions
987
+ return plan
988
+
989
+
990
+ class RepartitionByExpression(LogicalPlan):
991
+ """Repartition Relation into a different number of partitions using Expression"""
992
+
993
+ def __init__(
994
+ self,
995
+ child: Optional["LogicalPlan"],
996
+ num_partitions: Optional[int],
997
+ columns: List["ColumnOrName"],
998
+ ) -> None:
999
+ super().__init__(child)
1000
+ self.num_partitions = num_partitions
1001
+ self.columns = columns
1002
+
1003
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
1004
+ plan = self._create_proto_relation()
1005
+
1006
+ part_exprs = []
1007
+ for c in self.columns:
1008
+ if isinstance(c, Column):
1009
+ part_exprs.append(c.to_plan(session))
1010
+ elif c == "*":
1011
+ exp = proto.Expression()
1012
+ exp.unresolved_star.SetInParent()
1013
+ part_exprs.append(exp)
1014
+ else:
1015
+ part_exprs.append(self.unresolved_attr(c))
1016
+ plan.repartition_by_expression.partition_exprs.extend(part_exprs)
1017
+
1018
+ if self._child is not None:
1019
+ plan.repartition_by_expression.input.CopyFrom(self._child.plan(session))
1020
+ if self.num_partitions is not None:
1021
+ plan.repartition_by_expression.num_partitions = self.num_partitions
1022
+ return plan
1023
+
1024
+
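Roughly, the repartitioning APIs map onto the two plans above as follows (a sketch; `df` and the column name are assumptions):

    df.repartition(8)                 # Repartition: num_partitions=8, shuffle=True
    df.coalesce(2)                    # Repartition: num_partitions=2, shuffle=False
    df.repartition(4, "customer_id")  # RepartitionByExpression with a column expression
    df.repartition("customer_id")     # RepartitionByExpression, num_partitions left unset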
1025
+ class SubqueryAlias(LogicalPlan):
1026
+ """Alias for a relation."""
1027
+
1028
+ def __init__(self, child: Optional["LogicalPlan"], alias: str) -> None:
1029
+ super().__init__(child)
1030
+ self._alias = alias
1031
+
1032
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
1033
+ plan = self._create_proto_relation()
1034
+ if self._child is not None:
1035
+ plan.subquery_alias.input.CopyFrom(self._child.plan(session))
1036
+ plan.subquery_alias.alias = self._alias
1037
+ return plan
1038
+
1039
+
1040
+ class SQL(LogicalPlan):
1041
+ def __init__(self, query: str, args: Optional[Union[Dict[str, Any], List]] = None) -> None:
1042
+ super().__init__(None)
1043
+
1044
+ if args is not None:
1045
+ if isinstance(args, Dict):
1046
+ for k, v in args.items():
1047
+ assert isinstance(k, str)
1048
+ else:
1049
+ assert isinstance(args, List)
1050
+
1051
+ self._query = query
1052
+ self._args = args
1053
+
1054
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
1055
+ plan = self._create_proto_relation()
1056
+ plan.sql.query = self._query
1057
+
1058
+ if self._args is not None and len(self._args) > 0:
1059
+ if isinstance(self._args, Dict):
1060
+ for k, v in self._args.items():
1061
+ plan.sql.args[k].CopyFrom(
1062
+ LiteralExpression._from_value(v).to_plan(session).literal
1063
+ )
1064
+ else:
1065
+ for v in self._args:
1066
+ plan.sql.pos_args.append(
1067
+ LiteralExpression._from_value(v).to_plan(session).literal
1068
+ )
1069
+
1070
+ return plan
1071
+
1072
+ def command(self, session: "SparkConnectClient") -> proto.Command:
1073
+ cmd = proto.Command()
1074
+ cmd.sql_command.sql = self._query
1075
+ if self._args is not None and len(self._args) > 0:
1076
+ if isinstance(self._args, Dict):
1077
+ for k, v in self._args.items():
1078
+ cmd.sql_command.args[k].CopyFrom(
1079
+ LiteralExpression._from_value(v).to_plan(session).literal
1080
+ )
1081
+ else:
1082
+ for v in self._args:
1083
+ cmd.sql_command.pos_args.append(
1084
+ LiteralExpression._from_value(v).to_plan(session).literal
1085
+ )
1086
+
1087
+ return cmd
1088
+
1089
+
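A sketch of parameterized spark.sql calls that populate the named `args` map and the positional `pos_args` list above (assumes an active session `spark` on Spark 3.4+/3.5+; the table name is made up):

    spark.sql(
        "SELECT * FROM events WHERE id = :id AND ts > :ts",
        args={"id": 7, "ts": "2024-01-01"},
    )
    spark.sql("SELECT * FROM events WHERE id = ? AND ts > ?", args=[7, "2024-01-01"])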
1090
+ class Range(LogicalPlan):
1091
+ def __init__(
1092
+ self,
1093
+ start: int,
1094
+ end: int,
1095
+ step: int,
1096
+ num_partitions: Optional[int] = None,
1097
+ ) -> None:
1098
+ super().__init__(None)
1099
+ self._start = start
1100
+ self._end = end
1101
+ self._step = step
1102
+ self._num_partitions = num_partitions
1103
+
1104
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
1105
+ plan = self._create_proto_relation()
1106
+ plan.range.start = self._start
1107
+ plan.range.end = self._end
1108
+ plan.range.step = self._step
1109
+ if self._num_partitions is not None:
1110
+ plan.range.num_partitions = self._num_partitions
1111
+ return plan
1112
+
1113
+
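For example, `spark.range` feeds this plan directly (session `spark` assumed):

    spark.range(10)                                # start=0, end=10, step=1
    spark.range(0, 1000, step=2, numPartitions=4)  # num_partitions set explicitly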
1114
+ class ToSchema(LogicalPlan):
1115
+ def __init__(self, child: Optional["LogicalPlan"], schema: DataType) -> None:
1116
+ super().__init__(child)
1117
+ self._schema = schema
1118
+
1119
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
1120
+ assert self._child is not None
1121
+ plan = self._create_proto_relation()
1122
+ plan.to_schema.input.CopyFrom(self._child.plan(session))
1123
+ plan.to_schema.schema.CopyFrom(pyspark_types_to_proto_types(self._schema))
1124
+ return plan
1125
+
1126
+
1127
+ class WithColumnsRenamed(LogicalPlan):
1128
+ def __init__(self, child: Optional["LogicalPlan"], colsMap: Mapping[str, str]) -> None:
1129
+ super().__init__(child)
1130
+ self._colsMap = colsMap
1131
+
1132
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
1133
+ assert self._child is not None
1134
+ plan = self._create_proto_relation()
1135
+ plan.with_columns_renamed.input.CopyFrom(self._child.plan(session))
1136
+ for k, v in self._colsMap.items():
1137
+ plan.with_columns_renamed.rename_columns_map[k] = v
1138
+ return plan
1139
+
1140
+
1141
+ class Unpivot(LogicalPlan):
1142
+ """Logical plan object for a unpivot operation."""
1143
+
1144
+ def __init__(
1145
+ self,
1146
+ child: Optional["LogicalPlan"],
1147
+ ids: List["ColumnOrName"],
1148
+ values: Optional[List["ColumnOrName"]],
1149
+ variable_column_name: str,
1150
+ value_column_name: str,
1151
+ ) -> None:
1152
+ super().__init__(child)
1153
+ self.ids = ids
1154
+ self.values = values
1155
+ self.variable_column_name = variable_column_name
1156
+ self.value_column_name = value_column_name
1157
+
1158
+ def col_to_expr(self, col: "ColumnOrName", session: "SparkConnectClient") -> proto.Expression:
1159
+ if isinstance(col, Column):
1160
+ return col.to_plan(session)
1161
+ else:
1162
+ return self.unresolved_attr(col)
1163
+
1164
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
1165
+ assert self._child is not None
1166
+ plan = self._create_proto_relation()
1167
+ plan.unpivot.input.CopyFrom(self._child.plan(session))
1168
+ plan.unpivot.ids.extend([self.col_to_expr(x, session) for x in self.ids])
1169
+ if self.values is not None:
1170
+ plan.unpivot.values.values.extend([self.col_to_expr(x, session) for x in self.values])
1171
+ plan.unpivot.variable_column_name = self.variable_column_name
1172
+ plan.unpivot.value_column_name = self.value_column_name
1173
+ return plan
1174
+
1175
+
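A sketch of the user-level call that builds an Unpivot plan; the column names here are illustrative only.

    df.unpivot(
        ids=["id"],
        values=["q1", "q2"],
        variableColumnName="quarter",
        valueColumnName="sales",
    )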
1176
+ class CollectMetrics(LogicalPlan):
1177
+ """Logical plan object for a CollectMetrics operation."""
1178
+
1179
+ def __init__(
1180
+ self,
1181
+ child: Optional["LogicalPlan"],
1182
+ name: str,
1183
+ exprs: List["ColumnOrName"],
1184
+ ) -> None:
1185
+ super().__init__(child)
1186
+ self._name = name
1187
+ self._exprs = exprs
1188
+
1189
+ def col_to_expr(self, col: "ColumnOrName", session: "SparkConnectClient") -> proto.Expression:
1190
+ if isinstance(col, Column):
1191
+ return col.to_plan(session)
1192
+ else:
1193
+ return self.unresolved_attr(col)
1194
+
1195
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
1196
+ assert self._child is not None
1197
+
1198
+ plan = proto.Relation()
1199
+ plan.common.plan_id = self._child._plan_id
1200
+ plan.collect_metrics.input.CopyFrom(self._child.plan(session))
1201
+ plan.collect_metrics.name = self._name
1202
+ plan.collect_metrics.metrics.extend([self.col_to_expr(x, session) for x in self._exprs])
1203
+ return plan
1204
+
1205
+
1206
+ class NAFill(LogicalPlan):
1207
+ def __init__(
1208
+ self, child: Optional["LogicalPlan"], cols: Optional[List[str]], values: List[Any]
1209
+ ) -> None:
1210
+ super().__init__(child)
1211
+
1212
+ assert (
1213
+ isinstance(values, list)
1214
+ and len(values) > 0
1215
+ and all(isinstance(v, (bool, int, float, str)) for v in values)
1216
+ )
1217
+
1218
+ if cols is not None and len(cols) > 0:
1219
+ assert isinstance(cols, list) and all(isinstance(c, str) for c in cols)
1220
+ if len(values) > 1:
1221
+ assert len(cols) == len(values)
1222
+
1223
+ self.cols = cols
1224
+ self.values = values
1225
+
1226
+ def _convert_value(self, v: Any) -> proto.Expression.Literal:
1227
+ value = proto.Expression.Literal()
1228
+ if isinstance(v, bool):
1229
+ value.boolean = v
1230
+ elif isinstance(v, int):
1231
+ value.long = v
1232
+ elif isinstance(v, float):
1233
+ value.double = v
1234
+ else:
1235
+ value.string = v
1236
+ return value
1237
+
1238
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
1239
+ assert self._child is not None
1240
+ plan = self._create_proto_relation()
1241
+ plan.fill_na.input.CopyFrom(self._child.plan(session))
1242
+ if self.cols is not None and len(self.cols) > 0:
1243
+ plan.fill_na.cols.extend(self.cols)
1244
+ plan.fill_na.values.extend([self._convert_value(v) for v in self.values])
1245
+ return plan
1246
+
1247
+
1248
+ class NADrop(LogicalPlan):
1249
+ def __init__(
1250
+ self,
1251
+ child: Optional["LogicalPlan"],
1252
+ cols: Optional[List[str]],
1253
+ min_non_nulls: Optional[int],
1254
+ ) -> None:
1255
+ super().__init__(child)
1256
+
1257
+ self.cols = cols
1258
+ self.min_non_nulls = min_non_nulls
1259
+
1260
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
1261
+ assert self._child is not None
1262
+ plan = self._create_proto_relation()
1263
+ plan.drop_na.input.CopyFrom(self._child.plan(session))
1264
+ if self.cols is not None and len(self.cols) > 0:
1265
+ plan.drop_na.cols.extend(self.cols)
1266
+ if self.min_non_nulls is not None:
1267
+ plan.drop_na.min_non_nulls = self.min_non_nulls
1268
+ return plan
1269
+
1270
+
1271
+ class NAReplace(LogicalPlan):
1272
+ def __init__(
1273
+ self,
1274
+ child: Optional["LogicalPlan"],
1275
+ cols: Optional[List[str]],
1276
+ replacements: Dict[Any, Any],
1277
+ ) -> None:
1278
+ super().__init__(child)
1279
+
1280
+ for old_value, new_value in replacements.items():
1281
+ if old_value is not None:
1282
+ assert isinstance(old_value, (bool, int, float, str))
1283
+ if new_value is not None:
1284
+ assert isinstance(new_value, (bool, int, float, str))
1285
+
1286
+ self.cols = cols
1287
+ self.replacements = replacements
1288
+
1289
+ def _convert_int_to_float(self, v: Any) -> Any:
1290
+ # a bool is also an int
1291
+ if v is not None and not isinstance(v, bool) and isinstance(v, int):
1292
+ return float(v)
1293
+ else:
1294
+ return v
1295
+
1296
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
1297
+ assert self._child is not None
1298
+ plan = self._create_proto_relation()
1299
+ plan.replace.input.CopyFrom(self._child.plan(session))
1300
+ if self.cols is not None and len(self.cols) > 0:
1301
+ plan.replace.cols.extend(self.cols)
1302
+ if len(self.replacements) > 0:
1303
+ for old_value, new_value in self.replacements.items():
1304
+ replacement = proto.NAReplace.Replacement()
1305
+ replacement.old_value.CopyFrom(
1306
+ LiteralExpression._from_value(self._convert_int_to_float(old_value))
1307
+ .to_plan(session)
1308
+ .literal
1309
+ )
1310
+ replacement.new_value.CopyFrom(
1311
+ LiteralExpression._from_value(self._convert_int_to_float(new_value))
1312
+ .to_plan(session)
1313
+ .literal
1314
+ )
1315
+ plan.replace.replacements.append(replacement)
1316
+ return plan
1317
+
1318
+
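The three NA* plans above roughly correspond to the `df.na` helpers; a sketch with hypothetical column names:

    df.na.fill(0, subset=["age"])                        # NAFill: cols=["age"], values=[0]
    df.na.fill({"age": 0, "name": "unknown"})            # NAFill with per-column values
    df.na.drop(thresh=2)                                 # NADrop: min_non_nulls=2
    df.na.replace({"N/A": "unknown"}, subset=["name"])   # NAReplace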
1319
+ class StatSummary(LogicalPlan):
1320
+ def __init__(self, child: Optional["LogicalPlan"], statistics: List[str]) -> None:
1321
+ super().__init__(child)
1322
+ self.statistics = statistics
1323
+
1324
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
1325
+ assert self._child is not None
1326
+ plan = self._create_proto_relation()
1327
+ plan.summary.input.CopyFrom(self._child.plan(session))
1328
+ plan.summary.statistics.extend(self.statistics)
1329
+ return plan
1330
+
1331
+
1332
+ class StatDescribe(LogicalPlan):
1333
+ def __init__(self, child: Optional["LogicalPlan"], cols: List[str]) -> None:
1334
+ super().__init__(child)
1335
+ self.cols = cols
1336
+
1337
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
1338
+ assert self._child is not None
1339
+ plan = self._create_proto_relation()
1340
+ plan.describe.input.CopyFrom(self._child.plan(session))
1341
+ plan.describe.cols.extend(self.cols)
1342
+ return plan
1343
+
1344
+
1345
+ class StatCov(LogicalPlan):
1346
+ def __init__(self, child: Optional["LogicalPlan"], col1: str, col2: str) -> None:
1347
+ super().__init__(child)
1348
+ self._col1 = col1
1349
+ self._col2 = col2
1350
+
1351
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
1352
+ assert self._child is not None
1353
+ plan = self._create_proto_relation()
1354
+ plan.cov.input.CopyFrom(self._child.plan(session))
1355
+ plan.cov.col1 = self._col1
1356
+ plan.cov.col2 = self._col2
1357
+ return plan
1358
+
1359
+
1360
+ class StatApproxQuantile(LogicalPlan):
1361
+ def __init__(
1362
+ self,
1363
+ child: Optional["LogicalPlan"],
1364
+ cols: List[str],
1365
+ probabilities: List[float],
1366
+ relativeError: float,
1367
+ ) -> None:
1368
+ super().__init__(child)
1369
+ self._cols = cols
1370
+ self._probabilities = probabilities
1371
+ self._relativeError = relativeError
1372
+
1373
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
1374
+ assert self._child is not None
1375
+ plan = self._create_proto_relation()
1376
+ plan.approx_quantile.input.CopyFrom(self._child.plan(session))
1377
+ plan.approx_quantile.cols.extend(self._cols)
1378
+ plan.approx_quantile.probabilities.extend(self._probabilities)
1379
+ plan.approx_quantile.relative_error = self._relativeError
1380
+ return plan
1381
+
1382
+
1383
+ class StatCrosstab(LogicalPlan):
1384
+ def __init__(self, child: Optional["LogicalPlan"], col1: str, col2: str) -> None:
1385
+ super().__init__(child)
1386
+ self.col1 = col1
1387
+ self.col2 = col2
1388
+
1389
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
1390
+ assert self._child is not None
1391
+ plan = self._create_proto_relation()
1392
+ plan.crosstab.input.CopyFrom(self._child.plan(session))
1393
+ plan.crosstab.col1 = self.col1
1394
+ plan.crosstab.col2 = self.col2
1395
+ return plan
1396
+
1397
+
1398
+ class StatFreqItems(LogicalPlan):
1399
+ def __init__(
1400
+ self,
1401
+ child: Optional["LogicalPlan"],
1402
+ cols: List[str],
1403
+ support: float,
1404
+ ) -> None:
1405
+ super().__init__(child)
1406
+ self._cols = cols
1407
+ self._support = support
1408
+
1409
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
1410
+ assert self._child is not None
1411
+ plan = self._create_proto_relation()
1412
+ plan.freq_items.input.CopyFrom(self._child.plan(session))
1413
+ plan.freq_items.cols.extend(self._cols)
1414
+ plan.freq_items.support = self._support
1415
+ return plan
1416
+
1417
+
1418
+ class StatSampleBy(LogicalPlan):
1419
+ def __init__(
1420
+ self,
1421
+ child: Optional["LogicalPlan"],
1422
+ col: "ColumnOrName",
1423
+ fractions: Dict[Any, float],
1424
+ seed: Optional[int],
1425
+ ) -> None:
1426
+ super().__init__(child)
1427
+
1428
+ assert col is not None and isinstance(col, (Column, str))
1429
+
1430
+ assert fractions is not None and isinstance(fractions, dict)
1431
+ for k, v in fractions.items():
1432
+ assert v is not None and isinstance(v, float)
1433
+
1434
+ assert seed is None or isinstance(seed, int)
1435
+
1436
+ if isinstance(col, Column):
1437
+ self._col = col
1438
+ else:
1439
+ self._col = Column(ColumnReference(col))
1440
+
1441
+ self._fractions = fractions
1442
+
1443
+ self._seed = seed
1444
+
1445
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
1446
+ assert self._child is not None
1447
+ plan = self._create_proto_relation()
1448
+ plan.sample_by.input.CopyFrom(self._child.plan(session))
1449
+ plan.sample_by.col.CopyFrom(self._col._expr.to_plan(session))
1450
+ if len(self._fractions) > 0:
1451
+ for k, v in self._fractions.items():
1452
+ fraction = proto.StatSampleBy.Fraction()
1453
+ fraction.stratum.CopyFrom(LiteralExpression._from_value(k).to_plan(session).literal)
1454
+ fraction.fraction = float(v)
1455
+ plan.sample_by.fractions.append(fraction)
1456
+ if self._seed is not None:
1457
+ plan.sample_by.seed = self._seed
1458
+ return plan
1459
+
1460
+
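A sketch of the stratified-sampling call behind StatSampleBy (the column name and fractions are made up):

    df.sampleBy("label", fractions={0: 0.1, 1: 0.2}, seed=42)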
1461
+ class StatCorr(LogicalPlan):
1462
+ def __init__(self, child: Optional["LogicalPlan"], col1: str, col2: str, method: str) -> None:
1463
+ super().__init__(child)
1464
+ self._col1 = col1
1465
+ self._col2 = col2
1466
+ self._method = method
1467
+
1468
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
1469
+ assert self._child is not None
1470
+ plan = self._create_proto_relation()
1471
+ plan.corr.input.CopyFrom(self._child.plan(session))
1472
+ plan.corr.col1 = self._col1
1473
+ plan.corr.col2 = self._col2
1474
+ plan.corr.method = self._method
1475
+ return plan
1476
+
1477
+
1478
+ class ToDF(LogicalPlan):
1479
+ def __init__(self, child: Optional["LogicalPlan"], cols: Sequence[str]) -> None:
1480
+ super().__init__(child)
1481
+ self._cols = cols
1482
+
1483
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
1484
+ assert self._child is not None
1485
+ plan = self._create_proto_relation()
1486
+ plan.to_df.input.CopyFrom(self._child.plan(session))
1487
+ plan.to_df.column_names.extend(self._cols)
1488
+ return plan
1489
+
1490
+
1491
+ class CreateView(LogicalPlan):
1492
+ def __init__(
1493
+ self, child: Optional["LogicalPlan"], name: str, is_global: bool, replace: bool
1494
+ ) -> None:
1495
+ super().__init__(child)
1496
+ self._name = name
1497
+ self._is_global = is_global
1498
+ self._replace = replace
1499
+
1500
+ def command(self, session: "SparkConnectClient") -> proto.Command:
1501
+ assert self._child is not None
1502
+ plan = proto.Command()
1503
+
1504
+ plan.create_dataframe_view.replace = self._replace
1505
+ plan.create_dataframe_view.is_global = self._is_global
1506
+ plan.create_dataframe_view.name = self._name
1507
+ plan.create_dataframe_view.input.CopyFrom(self._child.plan(session))
1508
+ return plan
1509
+
1510
+
1511
+ class WriteOperation(LogicalPlan):
1512
+ def __init__(self, child: "LogicalPlan") -> None:
1513
+ super(WriteOperation, self).__init__(child)
1514
+ self.source: Optional[str] = None
1515
+ self.path: Optional[str] = None
1516
+ self.table_name: Optional[str] = None
1517
+ self.table_save_method: Optional[str] = None
1518
+ self.mode: Optional[str] = None
1519
+ self.sort_cols: List[str] = []
1520
+ self.partitioning_cols: List[str] = []
1521
+ self.options: Dict[str, Optional[str]] = {}
1522
+ self.num_buckets: int = -1
1523
+ self.bucket_cols: List[str] = []
1524
+
1525
+ def command(self, session: "SparkConnectClient") -> proto.Command:
1526
+ assert self._child is not None
1527
+ plan = proto.Command()
1528
+
1529
+ plan.write_operation.input.CopyFrom(self._child.plan(session))
1530
+ if self.source is not None:
1531
+ plan.write_operation.source = self.source
1532
+ plan.write_operation.sort_column_names.extend(self.sort_cols)
1533
+ plan.write_operation.partitioning_columns.extend(self.partitioning_cols)
1534
+
1535
+ if self.num_buckets > 0:
1536
+ plan.write_operation.bucket_by.bucket_column_names.extend(self.bucket_cols)
1537
+ plan.write_operation.bucket_by.num_buckets = self.num_buckets
1538
+
1539
+ for k in self.options:
1540
+ if self.options[k] is None:
1541
+ plan.write_operation.options.pop(k, None)
1542
+ else:
1543
+ plan.write_operation.options[k] = cast(str, self.options[k])
1544
+
1545
+ if self.table_name is not None:
1546
+ plan.write_operation.table.table_name = self.table_name
1547
+ if self.table_save_method is not None:
1548
+ tsm = self.table_save_method.lower()
1549
+ if tsm == "save_as_table":
1550
+ plan.write_operation.table.save_method = (
1551
+ proto.WriteOperation.SaveTable.TableSaveMethod.TABLE_SAVE_METHOD_SAVE_AS_TABLE # noqa: E501
1552
+ )
1553
+ elif tsm == "insert_into":
1554
+ plan.write_operation.table.save_method = (
1555
+ proto.WriteOperation.SaveTable.TableSaveMethod.TABLE_SAVE_METHOD_INSERT_INTO
1556
+ )
1557
+ else:
1558
+ raise ValueError(
1559
+ f"Unknown TestSaveMethod value for DataFrame: {self.table_save_method}"
1560
+ )
1561
+ elif self.path is not None:
1562
+ plan.write_operation.path = self.path
1563
+
1564
+ if self.mode is not None:
1565
+ wm = self.mode.lower()
1566
+ if wm == "append":
1567
+ plan.write_operation.mode = proto.WriteOperation.SaveMode.SAVE_MODE_APPEND
1568
+ elif wm == "overwrite":
1569
+ plan.write_operation.mode = proto.WriteOperation.SaveMode.SAVE_MODE_OVERWRITE
1570
+ elif wm == "error":
1571
+ plan.write_operation.mode = proto.WriteOperation.SaveMode.SAVE_MODE_ERROR_IF_EXISTS
1572
+ elif wm == "ignore":
1573
+ plan.write_operation.mode = proto.WriteOperation.SaveMode.SAVE_MODE_IGNORE
1574
+ else:
1575
+ raise ValueError(f"Unknown SaveMode value for DataFrame: {self.mode}")
1576
+ return plan
1577
+
1578
+ def print(self, indent: int = 0) -> str:
1579
+ i = " " * indent
1580
+ return (
1581
+ f"{i}"
1582
+ f"<WriteOperation source='{self.source}' "
1583
+ f"path='{self.path} "
1584
+ f"table_name='{self.table_name}' "
1585
+ f"table_save_method='{self.table_save_method}' "
1586
+ f"mode='{self.mode}' "
1587
+ f"sort_cols='{self.sort_cols}' "
1588
+ f"partitioning_cols='{self.partitioning_cols}' "
1589
+ f"num_buckets='{self.num_buckets}' "
1590
+ f"bucket_cols='{self.bucket_cols}' "
1591
+ f"options='{self.options}'>"
1592
+ )
1593
+
1594
+ def _repr_html_(self) -> str:
1595
+ return (
1596
+ f"<uL><li>WriteOperation <br />source='{self.source}'<br />"
1597
+ f"path: '{self.path}<br />"
1598
+ f"table_name: '{self.table_name}' <br />"
1599
+ f"table_save_method: '{self.table_save_method}' <br />"
1600
+ f"mode: '{self.mode}' <br />"
1601
+ f"sort_cols: '{self.sort_cols}' <br />"
1602
+ f"partitioning_cols: '{self.partitioning_cols}' <br />"
1603
+ f"num_buckets: '{self.num_buckets}' <br />"
1604
+ f"bucket_cols: '{self.bucket_cols}' <br />"
1605
+ f"options: '{self.options}'<br />"
1606
+ f"</li></ul>"
1607
+ )
1608
+
1609
+
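A sketch of DataFrameWriter (v1) calls whose settings end up in a WriteOperation command; the paths and table names are hypothetical.

    df.write.format("parquet").mode("overwrite").save("/tmp/out")        # path + SAVE_MODE_OVERWRITE
    df.write.mode("append").saveAsTable("db.events")                     # TABLE_SAVE_METHOD_SAVE_AS_TABLE
    df.write.insertInto("db.events")                                     # TABLE_SAVE_METHOD_INSERT_INTO
    df.write.bucketBy(8, "id").sortBy("ts").saveAsTable("db.bucketed")   # bucket_by + sort_column_names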
1610
+ class WriteOperationV2(LogicalPlan):
1611
+ def __init__(self, child: "LogicalPlan", table_name: str) -> None:
1612
+ super(WriteOperationV2, self).__init__(child)
1613
+ self.table_name: Optional[str] = table_name
1614
+ self.provider: Optional[str] = None
1615
+ self.partitioning_columns: List["ColumnOrName"] = []
1616
+ self.options: dict[str, Optional[str]] = {}
1617
+ self.table_properties: dict[str, Optional[str]] = {}
1618
+ self.mode: Optional[str] = None
1619
+ self.overwrite_condition: Optional["ColumnOrName"] = None
1620
+
1621
+ def col_to_expr(self, col: "ColumnOrName", session: "SparkConnectClient") -> proto.Expression:
1622
+ if isinstance(col, Column):
1623
+ return col.to_plan(session)
1624
+ else:
1625
+ return self.unresolved_attr(col)
1626
+
1627
+ def command(self, session: "SparkConnectClient") -> proto.Command:
1628
+ assert self._child is not None
1629
+ plan = proto.Command()
1630
+ plan.write_operation_v2.input.CopyFrom(self._child.plan(session))
1631
+ if self.table_name is not None:
1632
+ plan.write_operation_v2.table_name = self.table_name
1633
+ if self.provider is not None:
1634
+ plan.write_operation_v2.provider = self.provider
1635
+
1636
+ plan.write_operation_v2.partitioning_columns.extend(
1637
+ [self.col_to_expr(x, session) for x in self.partitioning_columns]
1638
+ )
1639
+
1640
+ for k in self.options:
1641
+ if self.options[k] is None:
1642
+ plan.write_operation_v2.options.pop(k, None)
1643
+ else:
1644
+ plan.write_operation_v2.options[k] = cast(str, self.options[k])
1645
+
1646
+ for k in self.table_properties:
1647
+ if self.table_properties[k] is None:
1648
+ plan.write_operation_v2.table_properties.pop(k, None)
1649
+ else:
1650
+ plan.write_operation_v2.table_properties[k] = cast(str, self.table_properties[k])
1651
+
1652
+ if self.mode is not None:
1653
+ wm = self.mode.lower()
1654
+ if wm == "create":
1655
+ plan.write_operation_v2.mode = proto.WriteOperationV2.Mode.MODE_CREATE
1656
+ elif wm == "overwrite":
1657
+ plan.write_operation_v2.mode = proto.WriteOperationV2.Mode.MODE_OVERWRITE
1658
+ if self.overwrite_condition is not None:
1659
+ plan.write_operation_v2.overwrite_condition.CopyFrom(
1660
+ self.col_to_expr(self.overwrite_condition, session)
1661
+ )
1662
+ elif wm == "overwrite_partitions":
1663
+ plan.write_operation_v2.mode = proto.WriteOperationV2.Mode.MODE_OVERWRITE_PARTITIONS
1664
+ elif wm == "append":
1665
+ plan.write_operation_v2.mode = proto.WriteOperationV2.Mode.MODE_APPEND
1666
+ elif wm == "replace":
1667
+ plan.write_operation_v2.mode = proto.WriteOperationV2.Mode.MODE_REPLACE
1668
+ elif wm == "create_or_replace":
1669
+ plan.write_operation_v2.mode = proto.WriteOperationV2.Mode.MODE_CREATE_OR_REPLACE
1670
+ else:
1671
+ raise ValueError(f"Unknown Mode value for DataFrame: {self.mode}")
1672
+ return plan
1673
+
1674
+
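Correspondingly, a sketch of DataFrameWriterV2 calls and the modes above (the table name is hypothetical):

    df.writeTo("catalog.db.events").using("parquet").create()   # MODE_CREATE
    df.writeTo("catalog.db.events").append()                    # MODE_APPEND
    df.writeTo("catalog.db.events").createOrReplace()           # MODE_CREATE_OR_REPLACE
    df.writeTo("catalog.db.events").overwritePartitions()       # MODE_OVERWRITE_PARTITIONS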
1675
+ class WriteStreamOperation(LogicalPlan):
1676
+ def __init__(self, child: "LogicalPlan") -> None:
1677
+ super(WriteStreamOperation, self).__init__(child)
1678
+ self.write_op = proto.WriteStreamOperationStart()
1679
+
1680
+ def command(self, session: "SparkConnectClient") -> proto.Command:
1681
+ assert self._child is not None
1682
+ self.write_op.input.CopyFrom(self._child.plan(session))
1683
+ cmd = proto.Command()
1684
+ cmd.write_stream_operation_start.CopyFrom(self.write_op)
1685
+ return cmd
1686
+
1687
+
1688
+ # Catalog API (internal-only)
1689
+
1690
+
1691
+ class CurrentDatabase(LogicalPlan):
1692
+ def __init__(self) -> None:
1693
+ super().__init__(None)
1694
+
1695
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
1696
+ return proto.Relation(catalog=proto.Catalog(current_database=proto.CurrentDatabase()))
1697
+
1698
+
1699
+ class SetCurrentDatabase(LogicalPlan):
1700
+ def __init__(self, db_name: str) -> None:
1701
+ super().__init__(None)
1702
+ self._db_name = db_name
1703
+
1704
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
1705
+ plan = proto.Relation()
1706
+ plan.catalog.set_current_database.db_name = self._db_name
1707
+ return plan
1708
+
1709
+
1710
+ class ListDatabases(LogicalPlan):
1711
+ def __init__(self, pattern: Optional[str] = None) -> None:
1712
+ super().__init__(None)
1713
+ self._pattern = pattern
1714
+
1715
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
1716
+ plan = proto.Relation(catalog=proto.Catalog(list_databases=proto.ListDatabases()))
1717
+ if self._pattern is not None:
1718
+ plan.catalog.list_databases.pattern = self._pattern
1719
+ return plan
1720
+
1721
+
1722
+ class ListTables(LogicalPlan):
1723
+ def __init__(self, db_name: Optional[str] = None, pattern: Optional[str] = None) -> None:
1724
+ super().__init__(None)
1725
+ self._db_name = db_name
1726
+ self._pattern = pattern
1727
+
1728
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
1729
+ plan = proto.Relation(catalog=proto.Catalog(list_tables=proto.ListTables()))
1730
+ if self._db_name is not None:
1731
+ plan.catalog.list_tables.db_name = self._db_name
1732
+ if self._pattern is not None:
1733
+ plan.catalog.list_tables.pattern = self._pattern
1734
+ return plan
1735
+
1736
+
1737
+ class ListFunctions(LogicalPlan):
1738
+ def __init__(self, db_name: Optional[str] = None, pattern: Optional[str] = None) -> None:
1739
+ super().__init__(None)
1740
+ self._db_name = db_name
1741
+ self._pattern = pattern
1742
+
1743
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
1744
+ plan = proto.Relation(catalog=proto.Catalog(list_functions=proto.ListFunctions()))
1745
+ if self._db_name is not None:
1746
+ plan.catalog.list_functions.db_name = self._db_name
1747
+ if self._pattern is not None:
1748
+ plan.catalog.list_functions.pattern = self._pattern
1749
+ return plan
1750
+
1751
+
1752
+ class ListColumns(LogicalPlan):
1753
+ def __init__(self, table_name: str, db_name: Optional[str] = None) -> None:
1754
+ super().__init__(None)
1755
+ self._table_name = table_name
1756
+ self._db_name = db_name
1757
+
1758
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
1759
+ plan = proto.Relation(catalog=proto.Catalog(list_columns=proto.ListColumns()))
1760
+ plan.catalog.list_columns.table_name = self._table_name
1761
+ if self._db_name is not None:
1762
+ plan.catalog.list_columns.db_name = self._db_name
1763
+ return plan
1764
+
1765
+
1766
+ class GetDatabase(LogicalPlan):
1767
+ def __init__(self, db_name: str) -> None:
1768
+ super().__init__(None)
1769
+ self._db_name = db_name
1770
+
1771
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
1772
+ plan = proto.Relation(catalog=proto.Catalog(get_database=proto.GetDatabase()))
1773
+ plan.catalog.get_database.db_name = self._db_name
1774
+ return plan
1775
+
1776
+
1777
+ class GetTable(LogicalPlan):
1778
+ def __init__(self, table_name: str, db_name: Optional[str] = None) -> None:
1779
+ super().__init__(None)
1780
+ self._table_name = table_name
1781
+ self._db_name = db_name
1782
+
1783
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
1784
+ plan = proto.Relation(catalog=proto.Catalog(get_table=proto.GetTable()))
1785
+ plan.catalog.get_table.table_name = self._table_name
1786
+ if self._db_name is not None:
1787
+ plan.catalog.get_table.db_name = self._db_name
1788
+ return plan
1789
+
1790
+
1791
+ class GetFunction(LogicalPlan):
1792
+ def __init__(self, function_name: str, db_name: Optional[str] = None) -> None:
1793
+ super().__init__(None)
1794
+ self._function_name = function_name
1795
+ self._db_name = db_name
1796
+
1797
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
1798
+ plan = proto.Relation(catalog=proto.Catalog(get_function=proto.GetFunction()))
1799
+ plan.catalog.get_function.function_name = self._function_name
1800
+ if self._db_name is not None:
1801
+ plan.catalog.get_function.db_name = self._db_name
1802
+ return plan
1803
+
1804
+
1805
+ class DatabaseExists(LogicalPlan):
1806
+ def __init__(self, db_name: str) -> None:
1807
+ super().__init__(None)
1808
+ self._db_name = db_name
1809
+
1810
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
1811
+ plan = proto.Relation(catalog=proto.Catalog(database_exists=proto.DatabaseExists()))
1812
+ plan.catalog.database_exists.db_name = self._db_name
1813
+ return plan
1814
+
1815
+
1816
+ class TableExists(LogicalPlan):
1817
+ def __init__(self, table_name: str, db_name: Optional[str] = None) -> None:
1818
+ super().__init__(None)
1819
+ self._table_name = table_name
1820
+ self._db_name = db_name
1821
+
1822
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
1823
+ plan = proto.Relation(catalog=proto.Catalog(table_exists=proto.TableExists()))
1824
+ plan.catalog.table_exists.table_name = self._table_name
1825
+ if self._db_name is not None:
1826
+ plan.catalog.table_exists.db_name = self._db_name
1827
+ return plan
1828
+
1829
+
1830
+ class FunctionExists(LogicalPlan):
1831
+ def __init__(self, function_name: str, db_name: Optional[str] = None) -> None:
1832
+ super().__init__(None)
1833
+ self._function_name = function_name
1834
+ self._db_name = db_name
1835
+
1836
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
1837
+ plan = proto.Relation(catalog=proto.Catalog(function_exists=proto.FunctionExists()))
1838
+ plan.catalog.function_exists.function_name = self._function_name
1839
+ if self._db_name is not None:
1840
+ plan.catalog.function_exists.db_name = self._db_name
1841
+ return plan
1842
+
1843
+
1844
+ class CreateExternalTable(LogicalPlan):
1845
+ def __init__(
1846
+ self,
1847
+ table_name: str,
1848
+ path: str,
1849
+ source: Optional[str] = None,
1850
+ schema: Optional[DataType] = None,
1851
+ options: Mapping[str, str] = {},
1852
+ ) -> None:
1853
+ super().__init__(None)
1854
+ self._table_name = table_name
1855
+ self._path = path
1856
+ self._source = source
1857
+ self._schema = schema
1858
+ self._options = options
1859
+
1860
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
1861
+ plan = proto.Relation(
1862
+ catalog=proto.Catalog(create_external_table=proto.CreateExternalTable())
1863
+ )
1864
+ plan.catalog.create_external_table.table_name = self._table_name
1865
+ if self._path is not None:
1866
+ plan.catalog.create_external_table.path = self._path
1867
+ if self._source is not None:
1868
+ plan.catalog.create_external_table.source = self._source
1869
+ if self._schema is not None:
1870
+ plan.catalog.create_external_table.schema.CopyFrom(
1871
+ pyspark_types_to_proto_types(self._schema)
1872
+ )
1873
+ for k in self._options.keys():
1874
+ v = self._options.get(k)
1875
+ if v is not None:
1876
+ plan.catalog.create_external_table.options[k] = v
1877
+ return plan
1878
+
1879
+
1880
+ class CreateTable(LogicalPlan):
1881
+ def __init__(
1882
+ self,
1883
+ table_name: str,
1884
+ path: str,
1885
+ source: Optional[str] = None,
1886
+ description: Optional[str] = None,
1887
+ schema: Optional[DataType] = None,
1888
+ options: Mapping[str, str] = {},
1889
+ ) -> None:
1890
+ super().__init__(None)
1891
+ self._table_name = table_name
1892
+ self._path = path
1893
+ self._source = source
1894
+ self._description = description
1895
+ self._schema = schema
1896
+ self._options = options
1897
+
1898
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
1899
+ plan = proto.Relation(catalog=proto.Catalog(create_table=proto.CreateTable()))
1900
+ plan.catalog.create_table.table_name = self._table_name
1901
+ if self._path is not None:
1902
+ plan.catalog.create_table.path = self._path
1903
+ if self._source is not None:
1904
+ plan.catalog.create_table.source = self._source
1905
+ if self._description is not None:
1906
+ plan.catalog.create_table.description = self._description
1907
+ if self._schema is not None:
1908
+ plan.catalog.create_table.schema.CopyFrom(pyspark_types_to_proto_types(self._schema))
1909
+ for k in self._options.keys():
1910
+ v = self._options.get(k)
1911
+ if v is not None:
1912
+ plan.catalog.create_table.options[k] = v
1913
+ return plan
1914
+
1915
+
1916
+ class DropTempView(LogicalPlan):
1917
+ def __init__(self, view_name: str) -> None:
1918
+ super().__init__(None)
1919
+ self._view_name = view_name
1920
+
1921
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
1922
+ plan = proto.Relation(catalog=proto.Catalog(drop_temp_view=proto.DropTempView()))
1923
+ plan.catalog.drop_temp_view.view_name = self._view_name
1924
+ return plan
1925
+
1926
+
1927
+ class DropGlobalTempView(LogicalPlan):
1928
+ def __init__(self, view_name: str) -> None:
1929
+ super().__init__(None)
1930
+ self._view_name = view_name
1931
+
1932
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
1933
+ plan = proto.Relation(
1934
+ catalog=proto.Catalog(drop_global_temp_view=proto.DropGlobalTempView())
1935
+ )
1936
+ plan.catalog.drop_global_temp_view.view_name = self._view_name
1937
+ return plan
1938
+
1939
+
1940
+ class RecoverPartitions(LogicalPlan):
1941
+ def __init__(self, table_name: str) -> None:
1942
+ super().__init__(None)
1943
+ self._table_name = table_name
1944
+
1945
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
1946
+ plan = proto.Relation(
1947
+ catalog=proto.Catalog(
1948
+ recover_partitions=proto.RecoverPartitions(table_name=self._table_name)
1949
+ )
1950
+ )
1951
+ return plan
1952
+
1953
+
1954
+ class IsCached(LogicalPlan):
1955
+ def __init__(self, table_name: str) -> None:
1956
+ super().__init__(None)
1957
+ self._table_name = table_name
1958
+
1959
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
1960
+ plan = proto.Relation(
1961
+ catalog=proto.Catalog(is_cached=proto.IsCached(table_name=self._table_name))
1962
+ )
1963
+ return plan
1964
+
1965
+
1966
+ class CacheTable(LogicalPlan):
1967
+ def __init__(self, table_name: str, storage_level: Optional[StorageLevel] = None) -> None:
1968
+ super().__init__(None)
1969
+ self._table_name = table_name
1970
+ self._storage_level = storage_level
1971
+
1972
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
1973
+ _cache_table = proto.CacheTable(table_name=self._table_name)
1974
+ if self._storage_level:
1975
+ _cache_table.storage_level.CopyFrom(storage_level_to_proto(self._storage_level))
1976
+ plan = proto.Relation(catalog=proto.Catalog(cache_table=_cache_table))
1977
+ return plan
1978
+
1979
+
1980
+ class UncacheTable(LogicalPlan):
1981
+ def __init__(self, table_name: str) -> None:
1982
+ super().__init__(None)
1983
+ self._table_name = table_name
1984
+
1985
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
1986
+ plan = proto.Relation(catalog=proto.Catalog(uncache_table=proto.UncacheTable()))
1987
+ plan.catalog.uncache_table.table_name = self._table_name
1988
+ return plan
1989
+
1990
+
1991
+ class ClearCache(LogicalPlan):
1992
+ def __init__(self) -> None:
1993
+ super().__init__(None)
1994
+
1995
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
1996
+ return proto.Relation(catalog=proto.Catalog(clear_cache=proto.ClearCache()))
1997
+
1998
+
1999
+ class RefreshTable(LogicalPlan):
2000
+ def __init__(self, table_name: str) -> None:
2001
+ super().__init__(None)
2002
+ self._table_name = table_name
2003
+
2004
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
2005
+ plan = proto.Relation(catalog=proto.Catalog(refresh_table=proto.RefreshTable()))
2006
+ plan.catalog.refresh_table.table_name = self._table_name
2007
+ return plan
2008
+
2009
+
2010
+ class RefreshByPath(LogicalPlan):
2011
+ def __init__(self, path: str) -> None:
2012
+ super().__init__(None)
2013
+ self._path = path
2014
+
2015
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
2016
+ plan = proto.Relation(catalog=proto.Catalog(refresh_by_path=proto.RefreshByPath()))
2017
+ plan.catalog.refresh_by_path.path = self._path
2018
+ return plan
2019
+
2020
+
2021
+ class CurrentCatalog(LogicalPlan):
2022
+ def __init__(self) -> None:
2023
+ super().__init__(None)
2024
+
2025
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
2026
+ return proto.Relation(catalog=proto.Catalog(current_catalog=proto.CurrentCatalog()))
2027
+
2028
+
2029
+ class SetCurrentCatalog(LogicalPlan):
2030
+ def __init__(self, catalog_name: str) -> None:
2031
+ super().__init__(None)
2032
+ self._catalog_name = catalog_name
2033
+
2034
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
2035
+ plan = proto.Relation(catalog=proto.Catalog(set_current_catalog=proto.SetCurrentCatalog()))
2036
+ plan.catalog.set_current_catalog.catalog_name = self._catalog_name
2037
+ return plan
2038
+
2039
+
2040
+ class ListCatalogs(LogicalPlan):
2041
+ def __init__(self, pattern: Optional[str] = None) -> None:
2042
+ super().__init__(None)
2043
+ self._pattern = pattern
2044
+
2045
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
2046
+ plan = proto.Relation(catalog=proto.Catalog(list_catalogs=proto.ListCatalogs()))
2047
+ if self._pattern is not None:
2048
+ plan.catalog.list_catalogs.pattern = self._pattern
2049
+ return plan
2050
+
2051
+
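These catalog relations back the spark.catalog API; a brief sketch (session `spark` assumed, object names made up):

    spark.catalog.currentDatabase()
    spark.catalog.listTables("default")
    spark.catalog.tableExists("events", dbName="default")
    spark.catalog.cacheTable("events")
    spark.catalog.clearCache()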
2052
+ class MapPartitions(LogicalPlan):
2053
+ """Logical plan object for a mapPartitions-equivalent API: mapInPandas, mapInArrow."""
2054
+
2055
+ def __init__(
2056
+ self,
2057
+ child: Optional["LogicalPlan"],
2058
+ function: "UserDefinedFunction",
2059
+ cols: List[str],
2060
+ is_barrier: bool,
2061
+ ) -> None:
2062
+ super().__init__(child)
2063
+
2064
+ self._func = function._build_common_inline_user_defined_function(*cols)
2065
+ self._is_barrier = is_barrier
2066
+
2067
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
2068
+ assert self._child is not None
2069
+ plan = self._create_proto_relation()
2070
+ plan.map_partitions.input.CopyFrom(self._child.plan(session))
2071
+ plan.map_partitions.func.CopyFrom(self._func.to_plan_udf(session))
2072
+ plan.map_partitions.is_barrier = self._is_barrier
2073
+ return plan
2074
+
2075
+
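A sketch of a mapInPandas call that would be planned as MapPartitions; the schema string and column name are illustrative.

    import pandas as pd

    def add_one(batches):
        # receives an iterator of pandas DataFrames, yields transformed batches
        for pdf in batches:
            yield pdf.assign(id=pdf["id"] + 1)

    df.mapInPandas(add_one, schema="id long", barrier=False)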
2076
+ class GroupMap(LogicalPlan):
2077
+ """Logical plan object for a Group Map API: apply, applyInPandas."""
2078
+
2079
+ def __init__(
2080
+ self,
2081
+ child: Optional["LogicalPlan"],
2082
+ grouping_cols: Sequence[Column],
2083
+ function: "UserDefinedFunction",
2084
+ cols: List[str],
2085
+ ):
2086
+ assert isinstance(grouping_cols, list) and all(isinstance(c, Column) for c in grouping_cols)
2087
+
2088
+ super().__init__(child)
2089
+ self._grouping_cols = grouping_cols
2090
+ self._func = function._build_common_inline_user_defined_function(*cols)
2091
+
2092
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
2093
+ assert self._child is not None
2094
+ plan = self._create_proto_relation()
2095
+ plan.group_map.input.CopyFrom(self._child.plan(session))
2096
+ plan.group_map.grouping_expressions.extend(
2097
+ [c.to_plan(session) for c in self._grouping_cols]
2098
+ )
2099
+ plan.group_map.func.CopyFrom(self._func.to_plan_udf(session))
2100
+ return plan
2101
+
2102
+
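And a sketch of grouped applyInPandas, which is planned as GroupMap (names illustrative):

    import pandas as pd

    def center(pdf: pd.DataFrame) -> pd.DataFrame:
        return pdf.assign(v=pdf["v"] - pdf["v"].mean())

    df.groupBy("key").applyInPandas(center, schema="key long, v double")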
2103
+ class CoGroupMap(LogicalPlan):
2104
+ """Logical plan object for a CoGroup Map API: applyInPandas."""
2105
+
2106
+ def __init__(
2107
+ self,
2108
+ input: Optional["LogicalPlan"],
2109
+ input_grouping_cols: Sequence[Column],
2110
+ other: Optional["LogicalPlan"],
2111
+ other_grouping_cols: Sequence[Column],
2112
+ function: "UserDefinedFunction",
2113
+ cols: List[Column],
2114
+ ):
2115
+ assert isinstance(input_grouping_cols, list) and all(
2116
+ isinstance(c, Column) for c in input_grouping_cols
2117
+ )
2118
+ assert isinstance(other_grouping_cols, list) and all(
2119
+ isinstance(c, Column) for c in other_grouping_cols
2120
+ )
2121
+
2122
+ super().__init__(input)
2123
+ self._input_grouping_cols = input_grouping_cols
2124
+ self._other_grouping_cols = other_grouping_cols
2125
+ self._other = cast(LogicalPlan, other)
2126
+ # The function takes the entire DataFrame as input, so no column
2127
+ # binding is needed (there are no input columns).
2128
+ self._func = function._build_common_inline_user_defined_function()
2129
+
2130
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
2131
+ assert self._child is not None
2132
+ plan = self._create_proto_relation()
2133
+ plan.co_group_map.input.CopyFrom(self._child.plan(session))
2134
+ plan.co_group_map.input_grouping_expressions.extend(
2135
+ [c.to_plan(session) for c in self._input_grouping_cols]
2136
+ )
2137
+ plan.co_group_map.other.CopyFrom(self._other.plan(session))
2138
+ plan.co_group_map.other_grouping_expressions.extend(
2139
+ [c.to_plan(session) for c in self._other_grouping_cols]
2140
+ )
2141
+ plan.co_group_map.func.CopyFrom(self._func.to_plan_udf(session))
2142
+ return plan
2143
+
2144
+
2145
+ class ApplyInPandasWithState(LogicalPlan):
2146
+ """Logical plan object for a applyInPandasWithState."""
2147
+
2148
+ def __init__(
2149
+ self,
2150
+ child: Optional["LogicalPlan"],
2151
+ grouping_cols: Sequence[Column],
2152
+ function: "UserDefinedFunction",
2153
+ output_schema: str,
2154
+ state_schema: str,
2155
+ output_mode: str,
2156
+ timeout_conf: str,
2157
+ cols: List[str],
2158
+ ):
2159
+ assert isinstance(grouping_cols, list) and all(isinstance(c, Column) for c in grouping_cols)
2160
+
2161
+ super().__init__(child)
2162
+ self._grouping_cols = grouping_cols
2163
+ self._func = function._build_common_inline_user_defined_function(*cols)
2164
+ self._output_schema = output_schema
2165
+ self._state_schema = state_schema
2166
+ self._output_mode = output_mode
2167
+ self._timeout_conf = timeout_conf
2168
+
2169
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
2170
+ assert self._child is not None
2171
+ plan = self._create_proto_relation()
2172
+ plan.apply_in_pandas_with_state.input.CopyFrom(self._child.plan(session))
2173
+ plan.apply_in_pandas_with_state.grouping_expressions.extend(
2174
+ [c.to_plan(session) for c in self._grouping_cols]
2175
+ )
2176
+ plan.apply_in_pandas_with_state.func.CopyFrom(self._func.to_plan_udf(session))
2177
+ plan.apply_in_pandas_with_state.output_schema = self._output_schema
2178
+ plan.apply_in_pandas_with_state.state_schema = self._state_schema
2179
+ plan.apply_in_pandas_with_state.output_mode = self._output_mode
2180
+ plan.apply_in_pandas_with_state.timeout_conf = self._timeout_conf
2181
+ return plan
2182
+
2183
+
2184
+ class PythonUDTF:
2185
+ """Represents a Python user-defined table function."""
2186
+
2187
+ def __init__(
2188
+ self,
2189
+ func: Type,
2190
+ return_type: Union[DataType, str],
2191
+ eval_type: int,
2192
+ python_ver: str,
2193
+ ) -> None:
2194
+ self._func = func
2195
+ self._name = func.__name__
2196
+ self._return_type: DataType = (
2197
+ UnparsedDataType(return_type) if isinstance(return_type, str) else return_type
2198
+ )
2199
+ self._eval_type = eval_type
2200
+ self._python_ver = python_ver
2201
+
2202
+ def to_plan(self, session: "SparkConnectClient") -> proto.PythonUDTF:
2203
+ udtf = proto.PythonUDTF()
2204
+ # Currently the return type cannot be None.
2205
+ # TODO(SPARK-44380): support `analyze` in Python UDTFs
2206
+ assert self._return_type is not None
2207
+ udtf.return_type.CopyFrom(pyspark_types_to_proto_types(self._return_type))
2208
+ udtf.eval_type = self._eval_type
2209
+ try:
2210
+ udtf.command = CloudPickleSerializer().dumps(self._func)
2211
+ except pickle.PicklingError:
2212
+ raise PySparkRuntimeError(
2213
+ error_class="UDTF_SERIALIZATION_ERROR",
2214
+ message_parameters={
2215
+ "name": self._name,
2216
+ "message": "Please check the stack trace and "
2217
+ "make sure the function is serializable.",
2218
+ },
2219
+ )
2220
+ udtf.python_ver = self._python_ver
2221
+ return udtf
2222
+
2223
+ def __repr__(self) -> str:
2224
+ return (
2225
+ f"PythonUDTF({self._name}, {self._return_type}, "
2226
+ f"{self._eval_type}, {self._python_ver})"
2227
+ )
2228
+
2229
+
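A sketch of a Python UDTF whose handler class and return type would be carried in the proto.PythonUDTF message above (assumes the PySpark 3.5 `udtf` decorator; the class and schema are made up):

    from pyspark.sql.functions import udtf

    @udtf(returnType="num int, squared int")
    class SquareNumbers:
        def eval(self, start: int, end: int):
            for n in range(start, end + 1):
                yield (n, n * n)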
2230
+ class CommonInlineUserDefinedTableFunction(LogicalPlan):
2231
+ """
2232
+ Logical plan object for a user-defined table function with
2233
+ an inline-defined function body.
2234
+ """
2235
+
2236
+ def __init__(
2237
+ self,
2238
+ function_name: str,
2239
+ function: PythonUDTF,
2240
+ deterministic: bool,
2241
+ arguments: Sequence[Expression],
2242
+ ) -> None:
2243
+ super().__init__(None)
2244
+ self._function_name = function_name
2245
+ self._deterministic = deterministic
2246
+ self._arguments = arguments
2247
+ self._function = function
2248
+
2249
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
2250
+ plan = self._create_proto_relation()
2251
+ plan.common_inline_user_defined_table_function.function_name = self._function_name
2252
+ plan.common_inline_user_defined_table_function.deterministic = self._deterministic
2253
+ if len(self._arguments) > 0:
2254
+ plan.common_inline_user_defined_table_function.arguments.extend(
2255
+ [arg.to_plan(session) for arg in self._arguments]
2256
+ )
2257
+ plan.common_inline_user_defined_table_function.python_udtf.CopyFrom(
2258
+ self._function.to_plan(session)
2259
+ )
2260
+ return plan
2261
+
2262
+ def udtf_plan(
2263
+ self, session: "SparkConnectClient"
2264
+ ) -> "proto.CommonInlineUserDefinedTableFunction":
2265
+ """
2266
+ Compared to `plan`, it returns a `proto.CommonInlineUserDefinedTableFunction`
2267
+ instead of a `proto.Relation`.
2268
+ """
2269
+ plan = proto.CommonInlineUserDefinedTableFunction()
2270
+ plan.function_name = self._function_name
2271
+ plan.deterministic = self._deterministic
2272
+ if len(self._arguments) > 0:
2273
+ plan.arguments.extend([arg.to_plan(session) for arg in self._arguments])
2274
+ plan.python_udtf.CopyFrom(cast(proto.PythonUDTF, self._function.to_plan(session)))
2275
+ return plan
2276
+
2277
+ def __repr__(self) -> str:
2278
+ return f"{self._function_name}({', '.join([str(arg) for arg in self._arguments])})"
2279
+
2280
+
2281
+ class CachedRelation(LogicalPlan):
2282
+ def __init__(self, plan: proto.Relation) -> None:
2283
+ super(CachedRelation, self).__init__(None)
2284
+ self._plan = plan
2285
+ # Update the plan ID based on the incremented counter.
2286
+ self._plan.common.plan_id = self._plan_id
2287
+
2288
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
2289
+ return self._plan