snowpark-connect 0.20.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of snowpark-connect might be problematic. Click here for more details.

Files changed (879):
  1. snowflake/snowpark_connect/__init__.py +23 -0
  2. snowflake/snowpark_connect/analyze_plan/__init__.py +3 -0
  3. snowflake/snowpark_connect/analyze_plan/map_tree_string.py +38 -0
  4. snowflake/snowpark_connect/column_name_handler.py +735 -0
  5. snowflake/snowpark_connect/config.py +576 -0
  6. snowflake/snowpark_connect/constants.py +47 -0
  7. snowflake/snowpark_connect/control_server.py +52 -0
  8. snowflake/snowpark_connect/dataframe_name_handler.py +54 -0
  9. snowflake/snowpark_connect/date_time_format_mapping.py +399 -0
  10. snowflake/snowpark_connect/empty_dataframe.py +18 -0
  11. snowflake/snowpark_connect/error/__init__.py +11 -0
  12. snowflake/snowpark_connect/error/error_mapping.py +6174 -0
  13. snowflake/snowpark_connect/error/error_utils.py +321 -0
  14. snowflake/snowpark_connect/error/exceptions.py +24 -0
  15. snowflake/snowpark_connect/execute_plan/__init__.py +3 -0
  16. snowflake/snowpark_connect/execute_plan/map_execution_command.py +204 -0
  17. snowflake/snowpark_connect/execute_plan/map_execution_root.py +173 -0
  18. snowflake/snowpark_connect/execute_plan/utils.py +183 -0
  19. snowflake/snowpark_connect/expression/__init__.py +3 -0
  20. snowflake/snowpark_connect/expression/literal.py +90 -0
  21. snowflake/snowpark_connect/expression/map_cast.py +343 -0
  22. snowflake/snowpark_connect/expression/map_expression.py +293 -0
  23. snowflake/snowpark_connect/expression/map_extension.py +104 -0
  24. snowflake/snowpark_connect/expression/map_sql_expression.py +633 -0
  25. snowflake/snowpark_connect/expression/map_udf.py +142 -0
  26. snowflake/snowpark_connect/expression/map_unresolved_attribute.py +241 -0
  27. snowflake/snowpark_connect/expression/map_unresolved_extract_value.py +85 -0
  28. snowflake/snowpark_connect/expression/map_unresolved_function.py +9450 -0
  29. snowflake/snowpark_connect/expression/map_unresolved_star.py +218 -0
  30. snowflake/snowpark_connect/expression/map_update_fields.py +164 -0
  31. snowflake/snowpark_connect/expression/map_window_function.py +258 -0
  32. snowflake/snowpark_connect/expression/typer.py +125 -0
  33. snowflake/snowpark_connect/includes/__init__.py +0 -0
  34. snowflake/snowpark_connect/includes/jars/antlr4-runtime-4.9.3.jar +0 -0
  35. snowflake/snowpark_connect/includes/jars/commons-cli-1.5.0.jar +0 -0
  36. snowflake/snowpark_connect/includes/jars/commons-codec-1.16.1.jar +0 -0
  37. snowflake/snowpark_connect/includes/jars/commons-collections-3.2.2.jar +0 -0
  38. snowflake/snowpark_connect/includes/jars/commons-collections4-4.4.jar +0 -0
  39. snowflake/snowpark_connect/includes/jars/commons-compiler-3.1.9.jar +0 -0
  40. snowflake/snowpark_connect/includes/jars/commons-compress-1.26.0.jar +0 -0
  41. snowflake/snowpark_connect/includes/jars/commons-crypto-1.1.0.jar +0 -0
  42. snowflake/snowpark_connect/includes/jars/commons-dbcp-1.4.jar +0 -0
  43. snowflake/snowpark_connect/includes/jars/commons-io-2.16.1.jar +0 -0
  44. snowflake/snowpark_connect/includes/jars/commons-lang-2.6.jar +0 -0
  45. snowflake/snowpark_connect/includes/jars/commons-lang3-3.12.0.jar +0 -0
  46. snowflake/snowpark_connect/includes/jars/commons-logging-1.1.3.jar +0 -0
  47. snowflake/snowpark_connect/includes/jars/commons-math3-3.6.1.jar +0 -0
  48. snowflake/snowpark_connect/includes/jars/commons-pool-1.5.4.jar +0 -0
  49. snowflake/snowpark_connect/includes/jars/commons-text-1.10.0.jar +0 -0
  50. snowflake/snowpark_connect/includes/jars/hadoop-client-api-3.3.4.jar +0 -0
  51. snowflake/snowpark_connect/includes/jars/jackson-annotations-2.15.2.jar +0 -0
  52. snowflake/snowpark_connect/includes/jars/jackson-core-2.15.2.jar +0 -0
  53. snowflake/snowpark_connect/includes/jars/jackson-core-asl-1.9.13.jar +0 -0
  54. snowflake/snowpark_connect/includes/jars/jackson-databind-2.15.2.jar +0 -0
  55. snowflake/snowpark_connect/includes/jars/jackson-dataformat-yaml-2.15.2.jar +0 -0
  56. snowflake/snowpark_connect/includes/jars/jackson-datatype-jsr310-2.15.2.jar +0 -0
  57. snowflake/snowpark_connect/includes/jars/jackson-mapper-asl-1.9.13.jar +0 -0
  58. snowflake/snowpark_connect/includes/jars/jackson-module-scala_2.12-2.15.2.jar +0 -0
  59. snowflake/snowpark_connect/includes/jars/json4s-ast_2.12-3.7.0-M11.jar +0 -0
  60. snowflake/snowpark_connect/includes/jars/json4s-core_2.12-3.7.0-M11.jar +0 -0
  61. snowflake/snowpark_connect/includes/jars/json4s-jackson_2.12-3.7.0-M11.jar +0 -0
  62. snowflake/snowpark_connect/includes/jars/json4s-scalap_2.12-3.7.0-M11.jar +0 -0
  63. snowflake/snowpark_connect/includes/jars/kryo-shaded-4.0.2.jar +0 -0
  64. snowflake/snowpark_connect/includes/jars/log4j-1.2-api-2.20.0.jar +0 -0
  65. snowflake/snowpark_connect/includes/jars/log4j-api-2.20.0.jar +0 -0
  66. snowflake/snowpark_connect/includes/jars/log4j-core-2.20.0.jar +0 -0
  67. snowflake/snowpark_connect/includes/jars/log4j-slf4j2-impl-2.20.0.jar +0 -0
  68. snowflake/snowpark_connect/includes/jars/paranamer-2.8.jar +0 -0
  69. snowflake/snowpark_connect/includes/jars/scala-collection-compat_2.12-2.7.0.jar +0 -0
  70. snowflake/snowpark_connect/includes/jars/scala-compiler-2.12.18.jar +0 -0
  71. snowflake/snowpark_connect/includes/jars/scala-library-2.12.18.jar +0 -0
  72. snowflake/snowpark_connect/includes/jars/scala-parser-combinators_2.12-2.3.0.jar +0 -0
  73. snowflake/snowpark_connect/includes/jars/scala-reflect-2.12.18.jar +0 -0
  74. snowflake/snowpark_connect/includes/jars/scala-xml_2.12-2.1.0.jar +0 -0
  75. snowflake/snowpark_connect/includes/jars/slf4j-api-2.0.7.jar +0 -0
  76. snowflake/snowpark_connect/includes/jars/spark-catalyst_2.12-3.5.6.jar +0 -0
  77. snowflake/snowpark_connect/includes/jars/spark-common-utils_2.12-3.5.6.jar +0 -0
  78. snowflake/snowpark_connect/includes/jars/spark-core_2.12-3.5.6.jar +0 -0
  79. snowflake/snowpark_connect/includes/jars/spark-graphx_2.12-3.5.6.jar +0 -0
  80. snowflake/snowpark_connect/includes/jars/spark-hive-thriftserver_2.12-3.5.6.jar +0 -0
  81. snowflake/snowpark_connect/includes/jars/spark-hive_2.12-3.5.6.jar +0 -0
  82. snowflake/snowpark_connect/includes/jars/spark-kubernetes_2.12-3.5.6.jar +0 -0
  83. snowflake/snowpark_connect/includes/jars/spark-kvstore_2.12-3.5.6.jar +0 -0
  84. snowflake/snowpark_connect/includes/jars/spark-launcher_2.12-3.5.6.jar +0 -0
  85. snowflake/snowpark_connect/includes/jars/spark-mesos_2.12-3.5.6.jar +0 -0
  86. snowflake/snowpark_connect/includes/jars/spark-mllib-local_2.12-3.5.6.jar +0 -0
  87. snowflake/snowpark_connect/includes/jars/spark-mllib_2.12-3.5.6.jar +0 -0
  88. snowflake/snowpark_connect/includes/jars/spark-network-common_2.12-3.5.6.jar +0 -0
  89. snowflake/snowpark_connect/includes/jars/spark-network-shuffle_2.12-3.5.6.jar +0 -0
  90. snowflake/snowpark_connect/includes/jars/spark-repl_2.12-3.5.6.jar +0 -0
  91. snowflake/snowpark_connect/includes/jars/spark-sketch_2.12-3.5.6.jar +0 -0
  92. snowflake/snowpark_connect/includes/jars/spark-sql-api_2.12-3.5.6.jar +0 -0
  93. snowflake/snowpark_connect/includes/jars/spark-sql_2.12-3.5.6.jar +0 -0
  94. snowflake/snowpark_connect/includes/jars/spark-streaming_2.12-3.5.6.jar +0 -0
  95. snowflake/snowpark_connect/includes/jars/spark-tags_2.12-3.5.6.jar +0 -0
  96. snowflake/snowpark_connect/includes/jars/spark-unsafe_2.12-3.5.6.jar +0 -0
  97. snowflake/snowpark_connect/includes/jars/spark-yarn_2.12-3.5.6.jar +0 -0
  98. snowflake/snowpark_connect/includes/python/__init__.py +21 -0
  99. snowflake/snowpark_connect/includes/python/pyspark/__init__.py +173 -0
  100. snowflake/snowpark_connect/includes/python/pyspark/_globals.py +71 -0
  101. snowflake/snowpark_connect/includes/python/pyspark/_typing.pyi +43 -0
  102. snowflake/snowpark_connect/includes/python/pyspark/accumulators.py +341 -0
  103. snowflake/snowpark_connect/includes/python/pyspark/broadcast.py +383 -0
  104. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/__init__.py +8 -0
  105. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/cloudpickle.py +948 -0
  106. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/cloudpickle_fast.py +844 -0
  107. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/compat.py +18 -0
  108. snowflake/snowpark_connect/includes/python/pyspark/conf.py +276 -0
  109. snowflake/snowpark_connect/includes/python/pyspark/context.py +2601 -0
  110. snowflake/snowpark_connect/includes/python/pyspark/daemon.py +218 -0
  111. snowflake/snowpark_connect/includes/python/pyspark/errors/__init__.py +70 -0
  112. snowflake/snowpark_connect/includes/python/pyspark/errors/error_classes.py +889 -0
  113. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/__init__.py +16 -0
  114. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/base.py +228 -0
  115. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/captured.py +307 -0
  116. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/connect.py +190 -0
  117. snowflake/snowpark_connect/includes/python/pyspark/errors/tests/__init__.py +16 -0
  118. snowflake/snowpark_connect/includes/python/pyspark/errors/tests/test_errors.py +60 -0
  119. snowflake/snowpark_connect/includes/python/pyspark/errors/utils.py +116 -0
  120. snowflake/snowpark_connect/includes/python/pyspark/files.py +165 -0
  121. snowflake/snowpark_connect/includes/python/pyspark/find_spark_home.py +95 -0
  122. snowflake/snowpark_connect/includes/python/pyspark/install.py +203 -0
  123. snowflake/snowpark_connect/includes/python/pyspark/instrumentation_utils.py +190 -0
  124. snowflake/snowpark_connect/includes/python/pyspark/java_gateway.py +248 -0
  125. snowflake/snowpark_connect/includes/python/pyspark/join.py +118 -0
  126. snowflake/snowpark_connect/includes/python/pyspark/ml/__init__.py +71 -0
  127. snowflake/snowpark_connect/includes/python/pyspark/ml/_typing.pyi +84 -0
  128. snowflake/snowpark_connect/includes/python/pyspark/ml/base.py +414 -0
  129. snowflake/snowpark_connect/includes/python/pyspark/ml/classification.py +4332 -0
  130. snowflake/snowpark_connect/includes/python/pyspark/ml/clustering.py +2188 -0
  131. snowflake/snowpark_connect/includes/python/pyspark/ml/common.py +146 -0
  132. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/__init__.py +44 -0
  133. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/base.py +346 -0
  134. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/classification.py +382 -0
  135. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/evaluation.py +291 -0
  136. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/feature.py +258 -0
  137. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/functions.py +77 -0
  138. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/io_utils.py +335 -0
  139. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/pipeline.py +262 -0
  140. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/summarizer.py +120 -0
  141. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/tuning.py +579 -0
  142. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/util.py +173 -0
  143. snowflake/snowpark_connect/includes/python/pyspark/ml/deepspeed/__init__.py +16 -0
  144. snowflake/snowpark_connect/includes/python/pyspark/ml/deepspeed/deepspeed_distributor.py +165 -0
  145. snowflake/snowpark_connect/includes/python/pyspark/ml/deepspeed/tests/test_deepspeed_distributor.py +306 -0
  146. snowflake/snowpark_connect/includes/python/pyspark/ml/dl_util.py +150 -0
  147. snowflake/snowpark_connect/includes/python/pyspark/ml/evaluation.py +1166 -0
  148. snowflake/snowpark_connect/includes/python/pyspark/ml/feature.py +7474 -0
  149. snowflake/snowpark_connect/includes/python/pyspark/ml/fpm.py +543 -0
  150. snowflake/snowpark_connect/includes/python/pyspark/ml/functions.py +842 -0
  151. snowflake/snowpark_connect/includes/python/pyspark/ml/image.py +271 -0
  152. snowflake/snowpark_connect/includes/python/pyspark/ml/linalg/__init__.py +1382 -0
  153. snowflake/snowpark_connect/includes/python/pyspark/ml/model_cache.py +55 -0
  154. snowflake/snowpark_connect/includes/python/pyspark/ml/param/__init__.py +602 -0
  155. snowflake/snowpark_connect/includes/python/pyspark/ml/param/_shared_params_code_gen.py +368 -0
  156. snowflake/snowpark_connect/includes/python/pyspark/ml/param/shared.py +878 -0
  157. snowflake/snowpark_connect/includes/python/pyspark/ml/pipeline.py +451 -0
  158. snowflake/snowpark_connect/includes/python/pyspark/ml/recommendation.py +748 -0
  159. snowflake/snowpark_connect/includes/python/pyspark/ml/regression.py +3335 -0
  160. snowflake/snowpark_connect/includes/python/pyspark/ml/stat.py +523 -0
  161. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/__init__.py +16 -0
  162. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_classification.py +53 -0
  163. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_evaluation.py +50 -0
  164. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_feature.py +43 -0
  165. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_function.py +114 -0
  166. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_pipeline.py +47 -0
  167. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_summarizer.py +43 -0
  168. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_tuning.py +46 -0
  169. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_classification.py +238 -0
  170. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_evaluation.py +194 -0
  171. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_feature.py +156 -0
  172. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_pipeline.py +184 -0
  173. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_summarizer.py +78 -0
  174. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_tuning.py +292 -0
  175. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_parity_torch_data_loader.py +50 -0
  176. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_parity_torch_distributor.py +152 -0
  177. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_algorithms.py +456 -0
  178. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_base.py +96 -0
  179. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_dl_util.py +186 -0
  180. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_evaluation.py +77 -0
  181. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_feature.py +401 -0
  182. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_functions.py +528 -0
  183. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_image.py +82 -0
  184. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_linalg.py +409 -0
  185. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_model_cache.py +55 -0
  186. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_param.py +441 -0
  187. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_persistence.py +546 -0
  188. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_pipeline.py +71 -0
  189. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_stat.py +52 -0
  190. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_training_summary.py +494 -0
  191. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_util.py +85 -0
  192. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_wrapper.py +138 -0
  193. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/__init__.py +16 -0
  194. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_basic.py +151 -0
  195. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_nested.py +97 -0
  196. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_pipeline.py +143 -0
  197. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tuning.py +551 -0
  198. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_basic.py +137 -0
  199. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_nested.py +96 -0
  200. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_pipeline.py +142 -0
  201. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/__init__.py +16 -0
  202. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/data.py +100 -0
  203. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/distributor.py +1133 -0
  204. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/log_communication.py +198 -0
  205. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/__init__.py +16 -0
  206. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_data_loader.py +137 -0
  207. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_distributor.py +561 -0
  208. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_log_communication.py +172 -0
  209. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/torch_run_process_wrapper.py +83 -0
  210. snowflake/snowpark_connect/includes/python/pyspark/ml/tree.py +434 -0
  211. snowflake/snowpark_connect/includes/python/pyspark/ml/tuning.py +1741 -0
  212. snowflake/snowpark_connect/includes/python/pyspark/ml/util.py +749 -0
  213. snowflake/snowpark_connect/includes/python/pyspark/ml/wrapper.py +465 -0
  214. snowflake/snowpark_connect/includes/python/pyspark/mllib/__init__.py +44 -0
  215. snowflake/snowpark_connect/includes/python/pyspark/mllib/_typing.pyi +33 -0
  216. snowflake/snowpark_connect/includes/python/pyspark/mllib/classification.py +989 -0
  217. snowflake/snowpark_connect/includes/python/pyspark/mllib/clustering.py +1318 -0
  218. snowflake/snowpark_connect/includes/python/pyspark/mllib/common.py +174 -0
  219. snowflake/snowpark_connect/includes/python/pyspark/mllib/evaluation.py +691 -0
  220. snowflake/snowpark_connect/includes/python/pyspark/mllib/feature.py +1085 -0
  221. snowflake/snowpark_connect/includes/python/pyspark/mllib/fpm.py +233 -0
  222. snowflake/snowpark_connect/includes/python/pyspark/mllib/linalg/__init__.py +1653 -0
  223. snowflake/snowpark_connect/includes/python/pyspark/mllib/linalg/distributed.py +1662 -0
  224. snowflake/snowpark_connect/includes/python/pyspark/mllib/random.py +698 -0
  225. snowflake/snowpark_connect/includes/python/pyspark/mllib/recommendation.py +389 -0
  226. snowflake/snowpark_connect/includes/python/pyspark/mllib/regression.py +1067 -0
  227. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/KernelDensity.py +59 -0
  228. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/__init__.py +34 -0
  229. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/_statistics.py +409 -0
  230. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/distribution.py +39 -0
  231. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/test.py +86 -0
  232. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/__init__.py +16 -0
  233. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_algorithms.py +353 -0
  234. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_feature.py +192 -0
  235. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_linalg.py +680 -0
  236. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_stat.py +206 -0
  237. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_streaming_algorithms.py +471 -0
  238. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_util.py +108 -0
  239. snowflake/snowpark_connect/includes/python/pyspark/mllib/tree.py +888 -0
  240. snowflake/snowpark_connect/includes/python/pyspark/mllib/util.py +659 -0
  241. snowflake/snowpark_connect/includes/python/pyspark/pandas/__init__.py +165 -0
  242. snowflake/snowpark_connect/includes/python/pyspark/pandas/_typing.py +52 -0
  243. snowflake/snowpark_connect/includes/python/pyspark/pandas/accessors.py +989 -0
  244. snowflake/snowpark_connect/includes/python/pyspark/pandas/base.py +1804 -0
  245. snowflake/snowpark_connect/includes/python/pyspark/pandas/categorical.py +822 -0
  246. snowflake/snowpark_connect/includes/python/pyspark/pandas/config.py +539 -0
  247. snowflake/snowpark_connect/includes/python/pyspark/pandas/correlation.py +262 -0
  248. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/__init__.py +16 -0
  249. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/base.py +519 -0
  250. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/binary_ops.py +98 -0
  251. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/boolean_ops.py +426 -0
  252. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/categorical_ops.py +141 -0
  253. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/complex_ops.py +145 -0
  254. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/date_ops.py +127 -0
  255. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/datetime_ops.py +171 -0
  256. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/null_ops.py +83 -0
  257. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/num_ops.py +588 -0
  258. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/string_ops.py +154 -0
  259. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/timedelta_ops.py +101 -0
  260. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/udt_ops.py +29 -0
  261. snowflake/snowpark_connect/includes/python/pyspark/pandas/datetimes.py +891 -0
  262. snowflake/snowpark_connect/includes/python/pyspark/pandas/exceptions.py +150 -0
  263. snowflake/snowpark_connect/includes/python/pyspark/pandas/extensions.py +388 -0
  264. snowflake/snowpark_connect/includes/python/pyspark/pandas/frame.py +13738 -0
  265. snowflake/snowpark_connect/includes/python/pyspark/pandas/generic.py +3560 -0
  266. snowflake/snowpark_connect/includes/python/pyspark/pandas/groupby.py +4448 -0
  267. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/__init__.py +21 -0
  268. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/base.py +2783 -0
  269. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/category.py +773 -0
  270. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/datetimes.py +843 -0
  271. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/multi.py +1323 -0
  272. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/numeric.py +210 -0
  273. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/timedelta.py +197 -0
  274. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexing.py +1862 -0
  275. snowflake/snowpark_connect/includes/python/pyspark/pandas/internal.py +1680 -0
  276. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/__init__.py +48 -0
  277. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/common.py +76 -0
  278. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/frame.py +63 -0
  279. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/general_functions.py +43 -0
  280. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/groupby.py +93 -0
  281. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/indexes.py +184 -0
  282. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/resample.py +101 -0
  283. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/scalars.py +29 -0
  284. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/series.py +69 -0
  285. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/window.py +168 -0
  286. snowflake/snowpark_connect/includes/python/pyspark/pandas/mlflow.py +238 -0
  287. snowflake/snowpark_connect/includes/python/pyspark/pandas/namespace.py +3807 -0
  288. snowflake/snowpark_connect/includes/python/pyspark/pandas/numpy_compat.py +260 -0
  289. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/__init__.py +17 -0
  290. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/core.py +1213 -0
  291. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/matplotlib.py +928 -0
  292. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/plotly.py +261 -0
  293. snowflake/snowpark_connect/includes/python/pyspark/pandas/resample.py +816 -0
  294. snowflake/snowpark_connect/includes/python/pyspark/pandas/series.py +7440 -0
  295. snowflake/snowpark_connect/includes/python/pyspark/pandas/sql_formatter.py +308 -0
  296. snowflake/snowpark_connect/includes/python/pyspark/pandas/sql_processor.py +394 -0
  297. snowflake/snowpark_connect/includes/python/pyspark/pandas/strings.py +2371 -0
  298. snowflake/snowpark_connect/includes/python/pyspark/pandas/supported_api_gen.py +378 -0
  299. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/__init__.py +16 -0
  300. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/__init__.py +16 -0
  301. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_any_all.py +177 -0
  302. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_apply_func.py +575 -0
  303. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_binary_ops.py +235 -0
  304. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_combine.py +653 -0
  305. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_compute.py +463 -0
  306. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_corrwith.py +86 -0
  307. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_cov.py +151 -0
  308. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_cumulative.py +139 -0
  309. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_describe.py +458 -0
  310. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_eval.py +86 -0
  311. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_melt.py +202 -0
  312. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_missing_data.py +520 -0
  313. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_pivot.py +361 -0
  314. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/__init__.py +16 -0
  315. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/__init__.py +16 -0
  316. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_any_all.py +40 -0
  317. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_apply_func.py +42 -0
  318. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_binary_ops.py +40 -0
  319. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_combine.py +37 -0
  320. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_compute.py +60 -0
  321. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_corrwith.py +40 -0
  322. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_cov.py +40 -0
  323. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_cumulative.py +90 -0
  324. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_describe.py +40 -0
  325. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_eval.py +40 -0
  326. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_melt.py +40 -0
  327. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_missing_data.py +42 -0
  328. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_pivot.py +37 -0
  329. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/__init__.py +16 -0
  330. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_base.py +36 -0
  331. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_binary_ops.py +42 -0
  332. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_boolean_ops.py +47 -0
  333. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_categorical_ops.py +55 -0
  334. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_complex_ops.py +40 -0
  335. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_date_ops.py +47 -0
  336. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_datetime_ops.py +47 -0
  337. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_null_ops.py +42 -0
  338. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_arithmetic.py +43 -0
  339. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_ops.py +47 -0
  340. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_reverse.py +43 -0
  341. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_string_ops.py +47 -0
  342. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_timedelta_ops.py +47 -0
  343. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_udt_ops.py +40 -0
  344. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/testing_utils.py +226 -0
  345. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/__init__.py +16 -0
  346. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_align.py +39 -0
  347. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_basic_slow.py +55 -0
  348. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_cov_corrwith.py +39 -0
  349. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_dot_frame.py +39 -0
  350. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_dot_series.py +39 -0
  351. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_index.py +39 -0
  352. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_series.py +39 -0
  353. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_setitem_frame.py +43 -0
  354. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_setitem_series.py +43 -0
  355. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/__init__.py +16 -0
  356. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_attrs.py +40 -0
  357. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_constructor.py +39 -0
  358. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_conversion.py +42 -0
  359. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_reindexing.py +42 -0
  360. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_reshaping.py +37 -0
  361. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_spark.py +40 -0
  362. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_take.py +42 -0
  363. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_time_series.py +48 -0
  364. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_truncate.py +40 -0
  365. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/__init__.py +16 -0
  366. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_aggregate.py +40 -0
  367. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_apply_func.py +41 -0
  368. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_cumulative.py +67 -0
  369. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_describe.py +40 -0
  370. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_groupby.py +55 -0
  371. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_head_tail.py +40 -0
  372. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_index.py +38 -0
  373. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_missing_data.py +55 -0
  374. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py +39 -0
  375. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_stat.py +38 -0
  376. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/__init__.py +16 -0
  377. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_align.py +40 -0
  378. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_base.py +50 -0
  379. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_category.py +73 -0
  380. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_datetime.py +39 -0
  381. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_indexing.py +40 -0
  382. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_reindex.py +40 -0
  383. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_rename.py +40 -0
  384. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_reset_index.py +48 -0
  385. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_timedelta.py +39 -0
  386. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/io/__init__.py +16 -0
  387. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/io/test_parity_io.py +40 -0
  388. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/__init__.py +16 -0
  389. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot.py +45 -0
  390. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot_matplotlib.py +45 -0
  391. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot_plotly.py +49 -0
  392. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot.py +37 -0
  393. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot_matplotlib.py +53 -0
  394. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot_plotly.py +45 -0
  395. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/__init__.py +16 -0
  396. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_all_any.py +38 -0
  397. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_arg_ops.py +37 -0
  398. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_as_of.py +37 -0
  399. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_as_type.py +38 -0
  400. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_compute.py +37 -0
  401. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_conversion.py +40 -0
  402. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_cumulative.py +40 -0
  403. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_index.py +38 -0
  404. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_missing_data.py +40 -0
  405. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_series.py +37 -0
  406. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_sort.py +38 -0
  407. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_stat.py +38 -0
  408. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_categorical.py +66 -0
  409. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_config.py +37 -0
  410. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_csv.py +37 -0
  411. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_dataframe_conversion.py +42 -0
  412. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_dataframe_spark_io.py +39 -0
  413. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_default_index.py +49 -0
  414. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ewm.py +37 -0
  415. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_expanding.py +39 -0
  416. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_extension.py +49 -0
  417. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_frame_spark.py +53 -0
  418. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_generic_functions.py +43 -0
  419. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_indexing.py +49 -0
  420. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_indexops_spark.py +39 -0
  421. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_internal.py +41 -0
  422. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_namespace.py +39 -0
  423. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_numpy_compat.py +60 -0
  424. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames.py +48 -0
  425. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby.py +39 -0
  426. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_expanding.py +44 -0
  427. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_rolling.py +84 -0
  428. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_repr.py +37 -0
  429. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_resample.py +45 -0
  430. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_reshape.py +39 -0
  431. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_rolling.py +39 -0
  432. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_scalars.py +37 -0
  433. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_conversion.py +39 -0
  434. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_datetime.py +39 -0
  435. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_string.py +39 -0
  436. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_spark_functions.py +39 -0
  437. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_sql.py +43 -0
  438. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_stats.py +37 -0
  439. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_typedef.py +36 -0
  440. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_utils.py +37 -0
  441. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_window.py +39 -0
  442. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/__init__.py +16 -0
  443. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_base.py +107 -0
  444. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_binary_ops.py +224 -0
  445. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py +825 -0
  446. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_categorical_ops.py +562 -0
  447. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_complex_ops.py +368 -0
  448. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_date_ops.py +257 -0
  449. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_datetime_ops.py +260 -0
  450. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_null_ops.py +178 -0
  451. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_arithmetic.py +184 -0
  452. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py +497 -0
  453. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_reverse.py +140 -0
  454. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_string_ops.py +354 -0
  455. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_timedelta_ops.py +219 -0
  456. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_udt_ops.py +192 -0
  457. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/testing_utils.py +228 -0
  458. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/__init__.py +16 -0
  459. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_align.py +118 -0
  460. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_basic_slow.py +198 -0
  461. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_cov_corrwith.py +181 -0
  462. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_dot_frame.py +103 -0
  463. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_dot_series.py +141 -0
  464. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_index.py +109 -0
  465. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_series.py +136 -0
  466. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_setitem_frame.py +125 -0
  467. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_setitem_series.py +217 -0
  468. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/__init__.py +16 -0
  469. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_attrs.py +384 -0
  470. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_constructor.py +598 -0
  471. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_conversion.py +73 -0
  472. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_reindexing.py +869 -0
  473. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_reshaping.py +487 -0
  474. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_spark.py +309 -0
  475. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_take.py +156 -0
  476. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_time_series.py +149 -0
  477. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_truncate.py +163 -0
  478. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/__init__.py +16 -0
  479. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_aggregate.py +311 -0
  480. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_apply_func.py +524 -0
  481. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_cumulative.py +419 -0
  482. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_describe.py +144 -0
  483. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_groupby.py +979 -0
  484. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_head_tail.py +234 -0
  485. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_index.py +206 -0
  486. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_missing_data.py +421 -0
  487. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_split_apply.py +187 -0
  488. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_stat.py +397 -0
  489. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/__init__.py +16 -0
  490. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_align.py +100 -0
  491. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_base.py +2743 -0
  492. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_category.py +484 -0
  493. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_datetime.py +276 -0
  494. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_indexing.py +432 -0
  495. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_reindex.py +310 -0
  496. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_rename.py +257 -0
  497. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_reset_index.py +160 -0
  498. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_timedelta.py +128 -0
  499. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/io/__init__.py +16 -0
  500. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/io/test_io.py +137 -0
  501. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/__init__.py +16 -0
  502. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot.py +170 -0
  503. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot_matplotlib.py +547 -0
  504. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot_plotly.py +285 -0
  505. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot.py +106 -0
  506. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot_matplotlib.py +409 -0
  507. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot_plotly.py +247 -0
  508. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/__init__.py +16 -0
  509. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_all_any.py +105 -0
  510. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_arg_ops.py +197 -0
  511. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_as_of.py +137 -0
  512. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_as_type.py +227 -0
  513. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_compute.py +634 -0
  514. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_conversion.py +88 -0
  515. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_cumulative.py +139 -0
  516. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_index.py +475 -0
  517. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_missing_data.py +265 -0
  518. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_series.py +818 -0
  519. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_sort.py +162 -0
  520. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_stat.py +780 -0
  521. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_categorical.py +741 -0
  522. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_config.py +160 -0
  523. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_csv.py +453 -0
  524. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_dataframe_conversion.py +281 -0
  525. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_dataframe_spark_io.py +487 -0
  526. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_default_index.py +109 -0
  527. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ewm.py +434 -0
  528. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_expanding.py +253 -0
  529. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_extension.py +152 -0
  530. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_frame_spark.py +162 -0
  531. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_generic_functions.py +234 -0
  532. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_indexing.py +1339 -0
  533. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_indexops_spark.py +82 -0
  534. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_internal.py +124 -0
  535. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_namespace.py +638 -0
  536. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_numpy_compat.py +200 -0
  537. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames.py +1355 -0
  538. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py +655 -0
  539. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_expanding.py +113 -0
  540. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_rolling.py +118 -0
  541. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_repr.py +192 -0
  542. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_resample.py +346 -0
  543. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_reshape.py +495 -0
  544. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_rolling.py +263 -0
  545. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_scalars.py +59 -0
  546. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_conversion.py +85 -0
  547. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_datetime.py +364 -0
  548. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_string.py +362 -0
  549. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_spark_functions.py +46 -0
  550. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_sql.py +123 -0
  551. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_stats.py +581 -0
  552. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_typedef.py +447 -0
  553. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_utils.py +301 -0
  554. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_window.py +465 -0
  555. snowflake/snowpark_connect/includes/python/pyspark/pandas/typedef/__init__.py +18 -0
  556. snowflake/snowpark_connect/includes/python/pyspark/pandas/typedef/typehints.py +874 -0
  557. snowflake/snowpark_connect/includes/python/pyspark/pandas/usage_logging/__init__.py +143 -0
  558. snowflake/snowpark_connect/includes/python/pyspark/pandas/usage_logging/usage_logger.py +132 -0
  559. snowflake/snowpark_connect/includes/python/pyspark/pandas/utils.py +1063 -0
  560. snowflake/snowpark_connect/includes/python/pyspark/pandas/window.py +2702 -0
  561. snowflake/snowpark_connect/includes/python/pyspark/profiler.py +489 -0
  562. snowflake/snowpark_connect/includes/python/pyspark/py.typed +1 -0
  563. snowflake/snowpark_connect/includes/python/pyspark/python/pyspark/shell.py +123 -0
  564. snowflake/snowpark_connect/includes/python/pyspark/rdd.py +5518 -0
  565. snowflake/snowpark_connect/includes/python/pyspark/rddsampler.py +115 -0
  566. snowflake/snowpark_connect/includes/python/pyspark/resource/__init__.py +38 -0
  567. snowflake/snowpark_connect/includes/python/pyspark/resource/information.py +69 -0
  568. snowflake/snowpark_connect/includes/python/pyspark/resource/profile.py +317 -0
  569. snowflake/snowpark_connect/includes/python/pyspark/resource/requests.py +539 -0
  570. snowflake/snowpark_connect/includes/python/pyspark/resource/tests/__init__.py +16 -0
  571. snowflake/snowpark_connect/includes/python/pyspark/resource/tests/test_resources.py +83 -0
  572. snowflake/snowpark_connect/includes/python/pyspark/resultiterable.py +45 -0
  573. snowflake/snowpark_connect/includes/python/pyspark/serializers.py +681 -0
  574. snowflake/snowpark_connect/includes/python/pyspark/shell.py +123 -0
  575. snowflake/snowpark_connect/includes/python/pyspark/shuffle.py +854 -0
  576. snowflake/snowpark_connect/includes/python/pyspark/sql/__init__.py +75 -0
  577. snowflake/snowpark_connect/includes/python/pyspark/sql/_typing.pyi +80 -0
  578. snowflake/snowpark_connect/includes/python/pyspark/sql/avro/__init__.py +18 -0
  579. snowflake/snowpark_connect/includes/python/pyspark/sql/avro/functions.py +188 -0
  580. snowflake/snowpark_connect/includes/python/pyspark/sql/catalog.py +1270 -0
  581. snowflake/snowpark_connect/includes/python/pyspark/sql/column.py +1431 -0
  582. snowflake/snowpark_connect/includes/python/pyspark/sql/conf.py +99 -0
  583. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/__init__.py +18 -0
  584. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/_typing.py +90 -0
  585. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/avro/__init__.py +18 -0
  586. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/avro/functions.py +107 -0
  587. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/catalog.py +356 -0
  588. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/__init__.py +22 -0
  589. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/artifact.py +412 -0
  590. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/core.py +1689 -0
  591. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/reattach.py +340 -0
  592. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/column.py +514 -0
  593. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/conf.py +128 -0
  594. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/conversion.py +490 -0
  595. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/dataframe.py +2172 -0
  596. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/expressions.py +1056 -0
  597. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/functions.py +3937 -0
  598. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/group.py +418 -0
  599. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/plan.py +2289 -0
  600. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/__init__.py +25 -0
  601. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/base_pb2.py +203 -0
  602. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/base_pb2.pyi +2718 -0
  603. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/base_pb2_grpc.py +423 -0
  604. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/catalog_pb2.py +109 -0
  605. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/catalog_pb2.pyi +1130 -0
  606. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/commands_pb2.py +141 -0
  607. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/commands_pb2.pyi +1766 -0
  608. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/common_pb2.py +47 -0
  609. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/common_pb2.pyi +123 -0
  610. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/example_plugins_pb2.py +53 -0
  611. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/example_plugins_pb2.pyi +112 -0
  612. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/expressions_pb2.py +107 -0
  613. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/expressions_pb2.pyi +1507 -0
  614. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/relations_pb2.py +195 -0
  615. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/relations_pb2.pyi +3613 -0
  616. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/types_pb2.py +95 -0
  617. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/types_pb2.pyi +980 -0
  618. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/protobuf/__init__.py +18 -0
  619. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/protobuf/functions.py +166 -0
  620. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/readwriter.py +861 -0
  621. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/session.py +952 -0
  622. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/__init__.py +22 -0
  623. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/query.py +295 -0
  624. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/readwriter.py +618 -0
  625. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/__init__.py +18 -0
  626. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/foreach_batch_worker.py +87 -0
  627. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/listener_worker.py +100 -0
  628. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/types.py +301 -0
  629. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/udf.py +296 -0
  630. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/udtf.py +200 -0
  631. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/utils.py +58 -0
  632. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/window.py +266 -0
  633. snowflake/snowpark_connect/includes/python/pyspark/sql/context.py +818 -0
  634. snowflake/snowpark_connect/includes/python/pyspark/sql/dataframe.py +5973 -0
  635. snowflake/snowpark_connect/includes/python/pyspark/sql/functions.py +15889 -0
  636. snowflake/snowpark_connect/includes/python/pyspark/sql/group.py +547 -0
  637. snowflake/snowpark_connect/includes/python/pyspark/sql/observation.py +152 -0
  638. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/__init__.py +21 -0
  639. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/__init__.pyi +344 -0
  640. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/protocols/__init__.pyi +17 -0
  641. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/protocols/frame.pyi +20 -0
  642. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/protocols/series.pyi +20 -0
  643. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/conversion.py +671 -0
  644. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/functions.py +480 -0
  645. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/functions.pyi +132 -0
  646. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/group_ops.py +523 -0
  647. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/map_ops.py +216 -0
  648. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/serializers.py +1019 -0
  649. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/typehints.py +172 -0
  650. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/types.py +972 -0
  651. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/utils.py +86 -0
  652. snowflake/snowpark_connect/includes/python/pyspark/sql/protobuf/__init__.py +18 -0
  653. snowflake/snowpark_connect/includes/python/pyspark/sql/protobuf/functions.py +334 -0
  654. snowflake/snowpark_connect/includes/python/pyspark/sql/readwriter.py +2159 -0
  655. snowflake/snowpark_connect/includes/python/pyspark/sql/session.py +2088 -0
  656. snowflake/snowpark_connect/includes/python/pyspark/sql/sql_formatter.py +84 -0
  657. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/__init__.py +21 -0
  658. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/listener.py +1050 -0
  659. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/query.py +746 -0
  660. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/readwriter.py +1652 -0
  661. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/state.py +288 -0
  662. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/__init__.py +16 -0
  663. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/__init__.py +16 -0
  664. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/__init__.py +16 -0
  665. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/test_artifact.py +420 -0
  666. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/test_client.py +358 -0
  667. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/__init__.py +16 -0
  668. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_foreach.py +36 -0
  669. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_foreach_batch.py +44 -0
  670. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_listener.py +116 -0
  671. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_streaming.py +35 -0
  672. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_basic.py +3612 -0
  673. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_column.py +1042 -0
  674. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_function.py +2381 -0
  675. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_plan.py +1060 -0
  676. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow.py +163 -0
  677. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow_map.py +38 -0
  678. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow_python_udf.py +48 -0
  679. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_catalog.py +36 -0
  680. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_column.py +55 -0
  681. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_conf.py +36 -0
  682. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_dataframe.py +96 -0
  683. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_datasources.py +44 -0
  684. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_errors.py +36 -0
  685. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_functions.py +59 -0
  686. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_group.py +36 -0
  687. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_cogrouped_map.py +59 -0
  688. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_grouped_map.py +74 -0
  689. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_grouped_map_with_state.py +62 -0
  690. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_map.py +58 -0
  691. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf.py +70 -0
  692. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_grouped_agg.py +50 -0
  693. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_scalar.py +68 -0
  694. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_window.py +40 -0
  695. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_readwriter.py +46 -0
  696. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_serde.py +44 -0
  697. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_types.py +100 -0
  698. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_udf.py +100 -0
  699. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_udtf.py +163 -0
  700. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_session.py +181 -0
  701. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_utils.py +42 -0
  702. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/__init__.py +16 -0
  703. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_cogrouped_map.py +623 -0
  704. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_grouped_map.py +869 -0
  705. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_grouped_map_with_state.py +342 -0
  706. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_map.py +436 -0
  707. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf.py +363 -0
  708. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_grouped_agg.py +592 -0
  709. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_scalar.py +1503 -0
  710. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_typehints.py +392 -0
  711. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_typehints_with_future_annotations.py +375 -0
  712. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_window.py +411 -0
  713. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/__init__.py +16 -0
  714. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming.py +401 -0
  715. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_foreach.py +295 -0
  716. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_foreach_batch.py +106 -0
  717. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_listener.py +558 -0
  718. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow.py +1346 -0
  719. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow_map.py +182 -0
  720. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow_python_udf.py +202 -0
  721. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_catalog.py +503 -0
  722. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_column.py +225 -0
  723. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_conf.py +83 -0
  724. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_context.py +201 -0
  725. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_dataframe.py +1931 -0
  726. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_datasources.py +256 -0
  727. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_errors.py +69 -0
  728. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_functions.py +1349 -0
  729. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_group.py +53 -0
  730. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_pandas_sqlmetrics.py +68 -0
  731. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_readwriter.py +283 -0
  732. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_serde.py +155 -0
  733. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_session.py +412 -0
  734. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_types.py +1581 -0
  735. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udf.py +961 -0
  736. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udf_profiler.py +165 -0
  737. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udtf.py +1456 -0
  738. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_utils.py +1686 -0
  739. snowflake/snowpark_connect/includes/python/pyspark/sql/types.py +2558 -0
  740. snowflake/snowpark_connect/includes/python/pyspark/sql/udf.py +714 -0
  741. snowflake/snowpark_connect/includes/python/pyspark/sql/udtf.py +325 -0
  742. snowflake/snowpark_connect/includes/python/pyspark/sql/utils.py +339 -0
  743. snowflake/snowpark_connect/includes/python/pyspark/sql/window.py +492 -0
  744. snowflake/snowpark_connect/includes/python/pyspark/statcounter.py +165 -0
  745. snowflake/snowpark_connect/includes/python/pyspark/status.py +112 -0
  746. snowflake/snowpark_connect/includes/python/pyspark/storagelevel.py +97 -0
  747. snowflake/snowpark_connect/includes/python/pyspark/streaming/__init__.py +22 -0
  748. snowflake/snowpark_connect/includes/python/pyspark/streaming/context.py +471 -0
  749. snowflake/snowpark_connect/includes/python/pyspark/streaming/dstream.py +933 -0
  750. snowflake/snowpark_connect/includes/python/pyspark/streaming/kinesis.py +205 -0
  751. snowflake/snowpark_connect/includes/python/pyspark/streaming/listener.py +83 -0
  752. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/__init__.py +16 -0
  753. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_context.py +184 -0
  754. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_dstream.py +706 -0
  755. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_kinesis.py +118 -0
  756. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_listener.py +160 -0
  757. snowflake/snowpark_connect/includes/python/pyspark/streaming/util.py +168 -0
  758. snowflake/snowpark_connect/includes/python/pyspark/taskcontext.py +502 -0
  759. snowflake/snowpark_connect/includes/python/pyspark/testing/__init__.py +21 -0
  760. snowflake/snowpark_connect/includes/python/pyspark/testing/connectutils.py +199 -0
  761. snowflake/snowpark_connect/includes/python/pyspark/testing/mllibutils.py +30 -0
  762. snowflake/snowpark_connect/includes/python/pyspark/testing/mlutils.py +275 -0
  763. snowflake/snowpark_connect/includes/python/pyspark/testing/objects.py +121 -0
  764. snowflake/snowpark_connect/includes/python/pyspark/testing/pandasutils.py +714 -0
  765. snowflake/snowpark_connect/includes/python/pyspark/testing/sqlutils.py +168 -0
  766. snowflake/snowpark_connect/includes/python/pyspark/testing/streamingutils.py +178 -0
  767. snowflake/snowpark_connect/includes/python/pyspark/testing/utils.py +636 -0
  768. snowflake/snowpark_connect/includes/python/pyspark/tests/__init__.py +16 -0
  769. snowflake/snowpark_connect/includes/python/pyspark/tests/test_appsubmit.py +306 -0
  770. snowflake/snowpark_connect/includes/python/pyspark/tests/test_broadcast.py +196 -0
  771. snowflake/snowpark_connect/includes/python/pyspark/tests/test_conf.py +44 -0
  772. snowflake/snowpark_connect/includes/python/pyspark/tests/test_context.py +346 -0
  773. snowflake/snowpark_connect/includes/python/pyspark/tests/test_daemon.py +89 -0
  774. snowflake/snowpark_connect/includes/python/pyspark/tests/test_install_spark.py +124 -0
  775. snowflake/snowpark_connect/includes/python/pyspark/tests/test_join.py +69 -0
  776. snowflake/snowpark_connect/includes/python/pyspark/tests/test_memory_profiler.py +167 -0
  777. snowflake/snowpark_connect/includes/python/pyspark/tests/test_pin_thread.py +194 -0
  778. snowflake/snowpark_connect/includes/python/pyspark/tests/test_profiler.py +168 -0
  779. snowflake/snowpark_connect/includes/python/pyspark/tests/test_rdd.py +939 -0
  780. snowflake/snowpark_connect/includes/python/pyspark/tests/test_rddbarrier.py +52 -0
  781. snowflake/snowpark_connect/includes/python/pyspark/tests/test_rddsampler.py +66 -0
  782. snowflake/snowpark_connect/includes/python/pyspark/tests/test_readwrite.py +368 -0
  783. snowflake/snowpark_connect/includes/python/pyspark/tests/test_serializers.py +257 -0
  784. snowflake/snowpark_connect/includes/python/pyspark/tests/test_shuffle.py +267 -0
  785. snowflake/snowpark_connect/includes/python/pyspark/tests/test_stage_sched.py +153 -0
  786. snowflake/snowpark_connect/includes/python/pyspark/tests/test_statcounter.py +130 -0
  787. snowflake/snowpark_connect/includes/python/pyspark/tests/test_taskcontext.py +350 -0
  788. snowflake/snowpark_connect/includes/python/pyspark/tests/test_util.py +97 -0
  789. snowflake/snowpark_connect/includes/python/pyspark/tests/test_worker.py +271 -0
  790. snowflake/snowpark_connect/includes/python/pyspark/traceback_utils.py +81 -0
  791. snowflake/snowpark_connect/includes/python/pyspark/util.py +416 -0
  792. snowflake/snowpark_connect/includes/python/pyspark/version.py +19 -0
  793. snowflake/snowpark_connect/includes/python/pyspark/worker.py +1307 -0
  794. snowflake/snowpark_connect/includes/python/pyspark/worker_util.py +46 -0
  795. snowflake/snowpark_connect/proto/__init__.py +10 -0
  796. snowflake/snowpark_connect/proto/control_pb2.py +35 -0
  797. snowflake/snowpark_connect/proto/control_pb2.pyi +38 -0
  798. snowflake/snowpark_connect/proto/control_pb2_grpc.py +183 -0
  799. snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.py +35 -0
  800. snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.pyi +53 -0
  801. snowflake/snowpark_connect/proto/snowflake_rdd_pb2.pyi +39 -0
  802. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.py +47 -0
  803. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.pyi +111 -0
  804. snowflake/snowpark_connect/relation/__init__.py +3 -0
  805. snowflake/snowpark_connect/relation/catalogs/__init__.py +12 -0
  806. snowflake/snowpark_connect/relation/catalogs/abstract_spark_catalog.py +287 -0
  807. snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py +467 -0
  808. snowflake/snowpark_connect/relation/catalogs/utils.py +51 -0
  809. snowflake/snowpark_connect/relation/io_utils.py +76 -0
  810. snowflake/snowpark_connect/relation/map_aggregate.py +322 -0
  811. snowflake/snowpark_connect/relation/map_catalog.py +151 -0
  812. snowflake/snowpark_connect/relation/map_column_ops.py +1068 -0
  813. snowflake/snowpark_connect/relation/map_crosstab.py +48 -0
  814. snowflake/snowpark_connect/relation/map_extension.py +412 -0
  815. snowflake/snowpark_connect/relation/map_join.py +341 -0
  816. snowflake/snowpark_connect/relation/map_local_relation.py +326 -0
  817. snowflake/snowpark_connect/relation/map_map_partitions.py +146 -0
  818. snowflake/snowpark_connect/relation/map_relation.py +253 -0
  819. snowflake/snowpark_connect/relation/map_row_ops.py +716 -0
  820. snowflake/snowpark_connect/relation/map_sample_by.py +35 -0
  821. snowflake/snowpark_connect/relation/map_show_string.py +50 -0
  822. snowflake/snowpark_connect/relation/map_sql.py +1874 -0
  823. snowflake/snowpark_connect/relation/map_stats.py +324 -0
  824. snowflake/snowpark_connect/relation/map_subquery_alias.py +32 -0
  825. snowflake/snowpark_connect/relation/map_udtf.py +288 -0
  826. snowflake/snowpark_connect/relation/read/__init__.py +7 -0
  827. snowflake/snowpark_connect/relation/read/jdbc_read_dbapi.py +668 -0
  828. snowflake/snowpark_connect/relation/read/map_read.py +367 -0
  829. snowflake/snowpark_connect/relation/read/map_read_csv.py +142 -0
  830. snowflake/snowpark_connect/relation/read/map_read_jdbc.py +108 -0
  831. snowflake/snowpark_connect/relation/read/map_read_json.py +344 -0
  832. snowflake/snowpark_connect/relation/read/map_read_parquet.py +194 -0
  833. snowflake/snowpark_connect/relation/read/map_read_socket.py +59 -0
  834. snowflake/snowpark_connect/relation/read/map_read_table.py +109 -0
  835. snowflake/snowpark_connect/relation/read/map_read_text.py +106 -0
  836. snowflake/snowpark_connect/relation/read/reader_config.py +399 -0
  837. snowflake/snowpark_connect/relation/read/utils.py +155 -0
  838. snowflake/snowpark_connect/relation/stage_locator.py +161 -0
  839. snowflake/snowpark_connect/relation/utils.py +219 -0
  840. snowflake/snowpark_connect/relation/write/__init__.py +3 -0
  841. snowflake/snowpark_connect/relation/write/jdbc_write_dbapi.py +339 -0
  842. snowflake/snowpark_connect/relation/write/map_write.py +436 -0
  843. snowflake/snowpark_connect/relation/write/map_write_jdbc.py +48 -0
  844. snowflake/snowpark_connect/resources/java_udfs-1.0-SNAPSHOT.jar +0 -0
  845. snowflake/snowpark_connect/resources_initializer.py +75 -0
  846. snowflake/snowpark_connect/server.py +1136 -0
  847. snowflake/snowpark_connect/start_server.py +32 -0
  848. snowflake/snowpark_connect/tcm.py +8 -0
  849. snowflake/snowpark_connect/type_mapping.py +1003 -0
  850. snowflake/snowpark_connect/typed_column.py +94 -0
  851. snowflake/snowpark_connect/utils/__init__.py +3 -0
  852. snowflake/snowpark_connect/utils/artifacts.py +48 -0
  853. snowflake/snowpark_connect/utils/attribute_handling.py +72 -0
  854. snowflake/snowpark_connect/utils/cache.py +84 -0
  855. snowflake/snowpark_connect/utils/concurrent.py +124 -0
  856. snowflake/snowpark_connect/utils/context.py +390 -0
  857. snowflake/snowpark_connect/utils/describe_query_cache.py +231 -0
  858. snowflake/snowpark_connect/utils/interrupt.py +85 -0
  859. snowflake/snowpark_connect/utils/io_utils.py +35 -0
  860. snowflake/snowpark_connect/utils/pandas_udtf_utils.py +117 -0
  861. snowflake/snowpark_connect/utils/profiling.py +47 -0
  862. snowflake/snowpark_connect/utils/session.py +180 -0
  863. snowflake/snowpark_connect/utils/snowpark_connect_logging.py +38 -0
  864. snowflake/snowpark_connect/utils/telemetry.py +513 -0
  865. snowflake/snowpark_connect/utils/udf_cache.py +392 -0
  866. snowflake/snowpark_connect/utils/udf_helper.py +328 -0
  867. snowflake/snowpark_connect/utils/udf_utils.py +310 -0
  868. snowflake/snowpark_connect/utils/udtf_helper.py +420 -0
  869. snowflake/snowpark_connect/utils/udtf_utils.py +799 -0
  870. snowflake/snowpark_connect/utils/xxhash64.py +247 -0
  871. snowflake/snowpark_connect/version.py +6 -0
  872. snowpark_connect-0.20.2.data/scripts/snowpark-connect +71 -0
  873. snowpark_connect-0.20.2.data/scripts/snowpark-session +11 -0
  874. snowpark_connect-0.20.2.data/scripts/snowpark-submit +354 -0
  875. snowpark_connect-0.20.2.dist-info/METADATA +37 -0
  876. snowpark_connect-0.20.2.dist-info/RECORD +879 -0
  877. snowpark_connect-0.20.2.dist-info/WHEEL +5 -0
  878. snowpark_connect-0.20.2.dist-info/licenses/LICENSE.txt +202 -0
  879. snowpark_connect-0.20.2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1686 @@
1
+ # -*- encoding: utf-8 -*-
2
+ #
3
+ # Licensed to the Apache Software Foundation (ASF) under one or more
4
+ # contributor license agreements. See the NOTICE file distributed with
5
+ # this work for additional information regarding copyright ownership.
6
+ # The ASF licenses this file to You under the Apache License, Version 2.0
7
+ # (the "License"); you may not use this file except in compliance with
8
+ # the License. You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+ #
18
+ import unittest
19
+ import difflib
20
+ from itertools import zip_longest
21
+
22
+ from pyspark.sql.functions import sha2, to_timestamp
23
+ from pyspark.errors import (
24
+ AnalysisException,
25
+ ParseException,
26
+ PySparkAssertionError,
27
+ IllegalArgumentException,
28
+ SparkUpgradeException,
29
+ )
30
+ from pyspark.testing.utils import assertDataFrameEqual, assertSchemaEqual, _context_diff, have_numpy
31
+ from pyspark.testing.sqlutils import ReusedSQLTestCase
32
+ from pyspark.sql import Row
33
+ import pyspark.sql.functions as F
34
+ from pyspark.sql.functions import to_date, unix_timestamp, from_unixtime
35
+ from pyspark.sql.types import (
36
+ StringType,
37
+ ArrayType,
38
+ LongType,
39
+ StructType,
40
+ MapType,
41
+ FloatType,
42
+ DoubleType,
43
+ StructField,
44
+ IntegerType,
45
+ BooleanType,
46
+ )
47
+ from pyspark.testing.sqlutils import have_pandas
48
+
49
+
50
+ class UtilsTestsMixin:
51
def test_assert_equal_inttype(self):
    # Two frames built independently from the same (id, amount) rows must
    # compare equal both when row order is ignored and when it is enforced.
    def make_frame():
        return self.spark.createDataFrame(
            data=[("1", 1000), ("2", 3000)],
            schema=["id", "amount"],
        )

    left = make_frame()
    right = make_frame()

    for ordered in (False, True):
        assertDataFrameEqual(left, right, checkRowOrder=ordered)
69
+
70
def test_assert_equal_arraytype(self):
    # Frames carrying an array-of-strings column, built twice from the same
    # rows and the same explicit schema, must compare equal with and without
    # row-order checking.
    def languages_frame():
        fields = [
            StructField("name", StringType(), True),
            StructField("languages", ArrayType(StringType()), True),
        ]
        rows = [
            ("john", ["Python", "Java"]),
            ("jane", ["Scala", "SQL", "Java"]),
        ]
        return self.spark.createDataFrame(data=rows, schema=StructType(fields))

    left = languages_frame()
    right = languages_frame()

    for ordered in (False, True):
        assertDataFrameEqual(left, right, checkRowOrder=ordered)
98
+
99
def test_assert_approx_equal_arraytype_float(self):
    # The second frame differs from the first only in one float element
    # (84.34 vs 84.339999). No tolerance is passed, so the frames must be
    # accepted as equal under assertDataFrameEqual's default settings.
    def grades_frame(rows):
        fields = [
            StructField("student", StringType(), True),
            StructField("grades", ArrayType(FloatType()), True),
        ]
        return self.spark.createDataFrame(data=rows, schema=StructType(fields))

    reference = grades_frame(
        [
            ("student1", [97.01, 89.23]),
            ("student2", [91.86, 84.34]),
        ]
    )
    nearby = grades_frame(
        [
            ("student1", [97.01, 89.23]),
            ("student2", [91.86, 84.339999]),
        ]
    )

    for ordered in (False, True):
        assertDataFrameEqual(reference, nearby, checkRowOrder=ordered)
127
+
128
def test_assert_approx_equal_arraytype_float_default_rtol_fail(self):
    # fails with default rtol, 1e-5
    def grades_frame(rows):
        fields = [
            StructField("student", StringType(), True),
            StructField("grades", ArrayType(FloatType()), True),
        ]
        return self.spark.createDataFrame(data=rows, schema=StructType(fields))

    df1 = grades_frame(
        [
            ("student1", [97.01, 89.23]),
            ("student2", [91.86, 84.34]),
        ]
    )
    df2 = grades_frame(
        [
            ("student1", [97.01, 89.23]),
            ("student2", [91.86, 84.341]),
        ]
    )

    # Render both collected results one row per line (padding the shorter
    # side with None) so the expected diff text can be rebuilt here.
    actual_chunks = []
    expected_chunks = []
    for actual_row, expected_row in zip_longest(df1.collect(), df2.collect()):
        actual_chunks.append(str(actual_row) + "\n")
        expected_chunks.append(str(expected_row) + "\n")

    generated_diff = _context_diff(
        actual="".join(actual_chunks).splitlines(),
        expected="".join(expected_chunks).splitlines(),
        n=2,
    )

    # One of the two rows differs, hence the 50% figure in the message.
    percent_diff = (1 / 2) * 100
    error_msg = (
        "Results do not match: "
        + "( %.5f %% )" % percent_diff
        + "\n"
        + "\n".join(generated_diff)
    )

    # The same error is expected with the default call and with explicit
    # row-order checking.
    for extra_kwargs in ({}, {"checkRowOrder": True}):
        with self.assertRaises(PySparkAssertionError) as pe:
            assertDataFrameEqual(df1, df2, **extra_kwargs)

        self.check_error(
            exception=pe.exception,
            error_class="DIFFERENT_ROWS",
            message_parameters={"error_msg": error_msg},
        )
189
+
190
def test_assert_approx_equal_arraytype_float_custom_rtol_pass(self):
    # passes with custom rtol, 1e-2
    def grades_frame(rows):
        fields = [
            StructField("student", StringType(), True),
            StructField("grades", ArrayType(FloatType()), True),
        ]
        return self.spark.createDataFrame(data=rows, schema=StructType(fields))

    reference = grades_frame(
        [
            ("student1", [97.01, 89.23]),
            ("student2", [91.86, 84.34]),
        ]
    )
    # Only the last grade differs (84.34 vs 84.341).
    nearby = grades_frame(
        [
            ("student1", [97.01, 89.23]),
            ("student2", [91.86, 84.341]),
        ]
    )

    assertDataFrameEqual(reference, nearby, rtol=1e-2)
218
+
219
def test_assert_approx_equal_doubletype_custom_rtol_pass(self):
    # passes with custom rtol, 1e-2
    def grade_frame(rows):
        fields = [
            StructField("student", StringType(), True),
            StructField("grade", DoubleType(), True),
        ]
        return self.spark.createDataFrame(data=rows, schema=StructType(fields))

    reference = grade_frame(
        [
            ("student1", 97.01),
            ("student2", 84.34),
        ]
    )
    # Only student2's grade differs (84.34 vs 84.341).
    nearby = grade_frame(
        [
            ("student1", 97.01),
            ("student2", 84.341),
        ]
    )

    assertDataFrameEqual(reference, nearby, rtol=1e-2)
247
+
248
def test_assert_approx_equal_decimaltype_custom_rtol_pass(self):
    # passes with custom rtol, 1e-1
    def grade_frame(rows):
        fields = [
            StructField("student", StringType(), True),
            StructField("grade", DoubleType(), True),
        ]
        return self.spark.createDataFrame(data=rows, schema=StructType(fields))

    df1 = grade_frame(
        [
            ("student1", 83.14),
            ("student2", 97.12),
        ]
    )
    df2 = grade_frame(
        [
            ("student1", 83.14),
            ("student2", 97.111),
        ]
    )

    # cast to DecimalType
    # decimal(4,3) tops out at 9.999, so none of the grades above would fit
    # and the derived column would never hold a real decimal value (an
    # overflowing cast gives NULL or an error depending on ANSI mode).
    # decimal(5,3) keeps three fractional digits and has room for two
    # integer digits, so the decimal comparison is actually exercised.
    decimal_type = "decimal(5,3)"
    df1 = df1.withColumn("col_1", F.col("grade").cast(decimal_type))
    df2 = df2.withColumn("col_1", F.col("grade").cast(decimal_type))

    assertDataFrameEqual(df1, df2, rtol=1e-1)
280
+
281
def test_assert_notequal_arraytype(self):
    # The frames differ in Jane's language list, so assertDataFrameEqual must
    # raise DIFFERENT_ROWS. The expected message is rebuilt from the collected
    # rows: sorted by their string form for the default call, and in collected
    # order when checkRowOrder=True.
    def languages_frame(rows):
        fields = [
            StructField("name", StringType(), True),
            StructField("languages", ArrayType(StringType()), True),
        ]
        return self.spark.createDataFrame(data=rows, schema=StructType(fields))

    def expected_message(actual_rows, expected_rows):
        # Render each side one row per line, padding the shorter with None.
        actual_chunks = []
        expected_chunks = []
        for actual_row, expected_row in zip_longest(actual_rows, expected_rows):
            actual_chunks.append(str(actual_row) + "\n")
            expected_chunks.append(str(expected_row) + "\n")

        generated_diff = _context_diff(
            actual="".join(actual_chunks).splitlines(),
            expected="".join(expected_chunks).splitlines(),
            n=3,
        )

        # One of the three rows differs.
        percent_diff = (1 / 3) * 100
        return (
            "Results do not match: "
            + "( %.5f %% )" % percent_diff
            + "\n"
            + "\n".join(generated_diff)
        )

    df1 = languages_frame(
        [
            ("Amy", ["C++", "Rust"]),
            ("John", ["Python", "Java"]),
            ("Jane", ["Scala", "SQL", "Java"]),
        ]
    )
    df2 = languages_frame(
        [
            ("Amy", ["C++", "Rust"]),
            ("John", ["Python", "Java"]),
            ("Jane", ["Scala", "Java"]),
        ]
    )

    # Default call: expected diff is over rows sorted by string form.
    error_msg = expected_message(
        sorted(df1.collect(), key=str),
        sorted(df2.collect(), key=str),
    )

    with self.assertRaises(PySparkAssertionError) as pe:
        assertDataFrameEqual(df1, df2)

    self.check_error(
        exception=pe.exception,
        error_class="DIFFERENT_ROWS",
        message_parameters={"error_msg": error_msg},
    )

    # Ordered call: expected diff is over rows exactly as collected.
    error_msg = expected_message(df1.collect(), df2.collect())

    with self.assertRaises(PySparkAssertionError) as pe:
        assertDataFrameEqual(df1, df2, checkRowOrder=True)

    self.check_error(
        exception=pe.exception,
        error_class="DIFFERENT_ROWS",
        message_parameters={"error_msg": error_msg},
    )
363
+
364
def test_assert_equal_maptype(self):
    # Frames with a map<string, long> column, built twice from the same rows
    # and schema, must compare equal with and without row-order checking.
    def properties_frame():
        fields = [
            StructField("student", StringType(), True),
            StructField("properties", MapType(StringType(), LongType()), True),
        ]
        rows = [
            ("student1", {"id": 222342203655477580}),
            ("student2", {"id": 422322203155477692}),
        ]
        return self.spark.createDataFrame(data=rows, schema=StructType(fields))

    left = properties_frame()
    right = properties_frame()

    for ordered in (False, True):
        assertDataFrameEqual(left, right, checkRowOrder=ordered)
392
+
393
def test_assert_approx_equal_maptype_double(self):
    # One double map value differs slightly (92.64 vs 92.63999999). No
    # tolerance is passed, so the frames must be accepted as equal under
    # assertDataFrameEqual's default settings.
    def grades_frame(rows):
        fields = [
            StructField("student", StringType(), True),
            StructField("grades", MapType(StringType(), DoubleType()), True),
        ]
        return self.spark.createDataFrame(data=rows, schema=StructType(fields))

    reference = grades_frame(
        [
            ("student1", {"math": 76.23, "english": 92.64}),
            ("student2", {"math": 87.89, "english": 84.48}),
        ]
    )
    nearby = grades_frame(
        [
            ("student1", {"math": 76.23, "english": 92.63999999}),
            ("student2", {"math": 87.89, "english": 84.48}),
        ]
    )

    for ordered in (False, True):
        assertDataFrameEqual(reference, nearby, checkRowOrder=ordered)
421
+
422
def test_assert_approx_equal_nested_struct_double(self):
    # One double field inside the nested struct differs slightly
    # (97.81 vs 97.81000001). No tolerance is passed, so the frames must be
    # accepted as equal under assertDataFrameEqual's default settings.
    def grades_frame(rows):
        grade_fields = [
            StructField("math", DoubleType(), True),
            StructField("english", DoubleType(), True),
            StructField("biology", DoubleType(), True),
        ]
        top_level_fields = [
            StructField("name", StringType(), True),
            StructField("grades", StructType(grade_fields)),
        ]
        return self.spark.createDataFrame(
            data=rows, schema=StructType(top_level_fields)
        )

    reference = grades_frame(
        [
            ("jane", (64.57, 76.63, 97.81)),
            ("john", (93.92, 91.57, 84.36)),
        ]
    )
    nearby = grades_frame(
        [
            ("jane", (64.57, 76.63, 97.81000001)),
            ("john", (93.92, 91.57, 84.36)),
        ]
    )

    for ordered in (False, True):
        assertDataFrameEqual(reference, nearby, checkRowOrder=ordered)
469
+
470
def test_assert_equal_nested_struct_str(self):
    """Identical frames with a struct-of-strings column compare equal."""

    def make_frame():
        name_type = StructType(
            [
                StructField("first", StringType(), True),
                StructField("middle", StringType(), True),
                StructField("last", StringType(), True),
            ]
        )
        return self.spark.createDataFrame(
            data=[
                (1, ("jane", "anne", "doe")),
                (2, ("john", "bob", "smith")),
            ],
            schema=StructType(
                [
                    StructField("id", IntegerType(), True),
                    StructField("name", name_type),
                ]
            ),
        )

    df1 = make_frame()
    df2 = make_frame()

    for check_order in (False, True):
        assertDataFrameEqual(df1, df2, checkRowOrder=check_order)
517
+
518
def test_assert_equal_nested_struct_str_duplicate(self):
    """Identical frames compare equal when a nested struct repeats a field name.

    The ``full name`` struct declares two fields that are both called
    ``name``.
    """

    def make_frame():
        full_name_type = StructType(
            [
                StructField("name", StringType(), True),
                StructField("name", StringType(), True),
            ]
        )
        return self.spark.createDataFrame(
            data=[
                (1, ("jane doe", "jane doe")),
                (2, ("john smith", "john smith")),
            ],
            schema=StructType(
                [
                    StructField("id", IntegerType(), True),
                    StructField("full name", full_name_type),
                ]
            ),
        )

    df1 = make_frame()
    df2 = make_frame()

    for check_order in (False, True):
        assertDataFrameEqual(df1, df2, checkRowOrder=check_order)
563
+
564
def test_assert_equal_duplicate_col(self):
    """Identical frames compare equal even when a column name occurs three times."""

    def make_frame():
        return self.spark.createDataFrame(
            data=[
                (1, "Python", 1, 1),
                (2, "Scala", 2, 2),
            ],
            schema=["number", "language", "number", "number"],
        )

    df1 = make_frame()
    df2 = make_frame()

    for check_order in (False, True):
        assertDataFrameEqual(df1, df2, checkRowOrder=check_order)
582
+
583
def test_assert_equal_timestamp(self):
    """Frames whose string column was converted with ``to_timestamp`` compare equal."""
    # Build both string-typed frames first, then convert each one.
    raw_frames = [
        self.spark.createDataFrame(
            data=[("1", "2023-01-01 12:01:01.000")], schema=["id", "timestamp"]
        )
        for _ in range(2)
    ]
    df1, df2 = [
        frame.withColumn("timestamp", to_timestamp("timestamp")) for frame in raw_frames
    ]

    for check_order in (False, True):
        assertDataFrameEqual(df1, df2, checkRowOrder=check_order)
597
+
598
def test_assert_equal_nullrow(self):
    """Identical frames that each contain an all-null row compare equal."""

    def make_frame():
        return self.spark.createDataFrame(
            data=[
                ("1", 1000),
                (None, None),
            ],
            schema=["id", "amount"],
        )

    df1 = make_frame()
    df2 = make_frame()

    for check_order in (False, True):
        assertDataFrameEqual(df1, df2, checkRowOrder=check_order)
616
+
617
def test_assert_notequal_nullval(self):
    """A null in place of a value is reported as a DIFFERENT_ROWS error.

    One of the two rows differs, so the expected message announces a 50 %
    mismatch followed by a context diff of the stringified rows. The same
    error is expected with the default settings and with
    ``checkRowOrder=True``.
    """

    def make_frame(second_amount):
        return self.spark.createDataFrame(
            data=[
                ("1", 1000),
                ("2", second_amount),
            ],
            schema=["id", "amount"],
        )

    df1 = make_frame(2000)
    df2 = make_frame(None)

    # Stringify the rows pairwise; zip_longest pads the shorter side with None.
    row_pairs = list(zip_longest(df1.collect(), df2.collect()))
    actual_lines = [str(left) for left, _ in row_pairs]
    expected_lines = [str(right) for _, right in row_pairs]

    generated_diff = _context_diff(actual=actual_lines, expected=expected_lines, n=2)

    percent_diff = (1 / 2) * 100
    error_msg = "Results do not match: " + "( %.5f %% )" % percent_diff
    error_msg += "\n" + "\n".join(generated_diff)

    for extra_kwargs in ({}, {"checkRowOrder": True}):
        with self.assertRaises(PySparkAssertionError) as pe:
            assertDataFrameEqual(df1, df2, **extra_kwargs)

        self.check_error(
            exception=pe.exception,
            error_class="DIFFERENT_ROWS",
            message_parameters={"error_msg": error_msg},
        )
667
+
668
def test_assert_equal_nulldf(self):
    """Two ``None`` arguments are accepted as equal in both row-order modes."""
    df1 = df2 = None

    for check_order in (False, True):
        assertDataFrameEqual(df1, df2, checkRowOrder=check_order)
674
+
675
def test_assert_unequal_null_actual(self):
    """``None`` as the actual argument raises INVALID_TYPE_DF_EQUALITY_ARG.

    The error must name ``actual`` as the offending argument and carry an
    ``actual_type`` of ``None``, both with the default settings and with
    ``checkRowOrder=True``.
    """
    df1 = None
    df2 = self.spark.createDataFrame(
        data=[
            ("1", 1000),
            ("2", 3000),
        ],
        schema=["id", "amount"],
    )

    for extra_kwargs in ({}, {"checkRowOrder": True}):
        with self.assertRaises(PySparkAssertionError) as pe:
            assertDataFrameEqual(df1, df2, **extra_kwargs)

        self.check_error(
            exception=pe.exception,
            error_class="INVALID_TYPE_DF_EQUALITY_ARG",
            message_parameters={
                "expected_type": "Union[DataFrame, ps.DataFrame, List[Row]]",
                "arg_name": "actual",
                "actual_type": None,
            },
        )
710
+
711
def test_assert_unequal_null_expected(self):
    """``None`` as the expected argument raises INVALID_TYPE_DF_EQUALITY_ARG.

    The error must name ``expected`` as the offending argument and carry an
    ``actual_type`` of ``None``, both with the default settings and with
    ``checkRowOrder=True``.
    """
    df1 = self.spark.createDataFrame(
        data=[
            ("1", 1000),
            ("2", 3000),
        ],
        schema=["id", "amount"],
    )
    df2 = None

    for extra_kwargs in ({}, {"checkRowOrder": True}):
        with self.assertRaises(PySparkAssertionError) as pe:
            assertDataFrameEqual(df1, df2, **extra_kwargs)

        self.check_error(
            exception=pe.exception,
            error_class="INVALID_TYPE_DF_EQUALITY_ARG",
            message_parameters={
                "expected_type": "Union[DataFrame, ps.DataFrame, List[Row]]",
                "arg_name": "expected",
                "actual_type": None,
            },
        )
746
+
747
@unittest.skipIf(not have_pandas or not have_numpy, "no pandas or numpy dependency")
def test_assert_equal_exact_pandas_df(self):
    """Two pandas DataFrames built from identical integer arrays compare equal."""
    import pandas as pd
    import numpy as np

    def make_frame():
        values = np.array([(1, 2, 3), (4, 5, 6), (7, 8, 9)])
        return pd.DataFrame(data=values, columns=["a", "b", "c"])

    df1 = make_frame()
    df2 = make_frame()

    for check_order in (False, True):
        assertDataFrameEqual(df1, df2, checkRowOrder=check_order)
761
+
762
@unittest.skipIf(not have_pandas or not have_numpy, "no pandas or numpy dependency")
def test_assert_approx_equal_pandas_df(self):
    """pandas DataFrames that differ by ``59`` versus ``59.0001`` compare equal.

    No tolerance is passed, so the comparison relies on the defaults of
    ``assertDataFrameEqual``.
    """
    import pandas as pd
    import numpy as np

    def make_frame(last_value):
        values = np.array([(1, 2, 3), (4, 5, 6), (7, 8, last_value)])
        return pd.DataFrame(data=values, columns=["a", "b", "c"])

    df1 = make_frame(59)
    df2 = make_frame(59.0001)

    for check_order in (False, True):
        assertDataFrameEqual(df1, df2, checkRowOrder=check_order)
777
+
778
@unittest.skipIf(not have_pandas or not have_numpy, "no pandas or numpy dependency")
def test_assert_approx_equal_fail_exact_pandas_df(self):
    """With ``rtol=0`` and ``atol=0`` a ``59`` versus ``59.0001`` difference fails.

    The expected DIFFERENT_PANDAS_DATAFRAME error carries the string
    rendering and dtypes of both frames; it is checked for both row-order
    modes.
    """
    import pandas as pd
    import numpy as np

    def make_frame(last_value):
        values = np.array([(1, 2, 3), (4, 5, 6), (7, 8, last_value)])
        return pd.DataFrame(data=values, columns=["a", "b", "c"])

    df1 = make_frame(59)
    df2 = make_frame(59.0001)

    for check_order in (False, True):
        with self.assertRaises(PySparkAssertionError) as pe:
            assertDataFrameEqual(df1, df2, checkRowOrder=check_order, rtol=0, atol=0)

        self.check_error(
            exception=pe.exception,
            error_class="DIFFERENT_PANDAS_DATAFRAME",
            message_parameters={
                "left": df1.to_string(),
                "left_dtype": str(df1.dtypes),
                "right": df2.to_string(),
                "right_dtype": str(df2.dtypes),
            },
        )
818
+
819
@unittest.skipIf(not have_pandas or not have_numpy, "no pandas or numpy dependency")
def test_assert_unequal_pandas_df(self):
    """pandas DataFrames with a different last row raise DIFFERENT_PANDAS_DATAFRAME.

    The error parameters must contain the string rendering and dtypes of
    both frames, in both row-order modes.
    """
    import pandas as pd
    import numpy as np

    def make_frame(last_row):
        values = np.array([(1, 2, 3), (4, 5, 6), last_row])
        return pd.DataFrame(data=values, columns=["a", "b", "c"])

    df1 = make_frame((6, 5, 4))
    df2 = make_frame((7, 8, 9))

    for check_order in (False, True):
        with self.assertRaises(PySparkAssertionError) as pe:
            assertDataFrameEqual(df1, df2, checkRowOrder=check_order)

        self.check_error(
            exception=pe.exception,
            error_class="DIFFERENT_PANDAS_DATAFRAME",
            message_parameters={
                "left": df1.to_string(),
                "left_dtype": str(df1.dtypes),
                "right": df2.to_string(),
                "right_dtype": str(df2.dtypes),
            },
        )
858
+
859
@unittest.skipIf(not have_pandas or not have_numpy, "no pandas or numpy dependency")
def test_assert_type_error_pandas_df(self):
    """A pandas-on-Spark frame versus a differently shaped pandas frame fails.

    The expected error class is DIFFERENT_PANDAS_DATAFRAME, with the string
    rendering and dtypes of both inputs as parameters, in both row-order
    modes.
    """
    import pyspark.pandas as ps
    import pandas as pd
    import numpy as np

    df1 = ps.DataFrame(data=[10, 20, 30], columns=["Numbers"])
    pandas_values = np.array([(1, 2, 3), (4, 5, 6), (6, 5, 4)])
    df2 = pd.DataFrame(data=pandas_values, columns=["a", "b", "c"])

    for check_order in (False, True):
        with self.assertRaises(PySparkAssertionError) as pe:
            assertDataFrameEqual(df1, df2, checkRowOrder=check_order)

        self.check_error(
            exception=pe.exception,
            error_class="DIFFERENT_PANDAS_DATAFRAME",
            message_parameters={
                "left": df1.to_string(),
                "left_dtype": str(df1.dtypes),
                "right": df2.to_string(),
                "right_dtype": str(df2.dtypes),
            },
        )
897
+
898
@unittest.skipIf(not have_pandas, "no pandas dependency")
def test_assert_equal_exact_pandas_on_spark_df(self):
    """Two pandas-on-Spark frames with identical values and order compare equal."""
    import pyspark.pandas as ps

    def make_frame():
        return ps.DataFrame(data=[10, 20, 30], columns=["Numbers"])

    df1 = make_frame()
    df2 = make_frame()

    for check_order in (False, True):
        assertDataFrameEqual(df1, df2, checkRowOrder=check_order)
907
+
908
@unittest.skipIf(not have_pandas, "no pandas dependency")
def test_assert_equal_pandas_on_spark_df_ignore_row_order(self):
    """Pandas-on-Spark frames holding the same values in reverse order compare equal.

    No ``checkRowOrder`` argument is passed, so this exercises the default
    mode of ``assertDataFrameEqual``.

    This method was previously also named
    ``test_assert_equal_exact_pandas_on_spark_df``. Because a class body
    keeps only the last definition bound to a name, that duplicate silently
    replaced the earlier test of the same name, which therefore never ran.
    The distinct name lets both tests be collected.
    """
    import pyspark.pandas as ps

    df1 = ps.DataFrame(data=[10, 20, 30], columns=["Numbers"])
    df2 = ps.DataFrame(data=[30, 20, 10], columns=["Numbers"])

    assertDataFrameEqual(df1, df2)
916
+
917
@unittest.skipIf(not have_pandas, "no pandas dependency")
def test_assert_equal_approx_pandas_on_spark_df(self):
    """Pandas-on-Spark frames differing by ``10.0001`` versus ``10.0`` compare equal."""
    import pyspark.pandas as ps

    def make_frame(first_value):
        return ps.DataFrame(data=[first_value, 20.32, 30.1], columns=["Numbers"])

    df1 = make_frame(10.0001)
    df2 = make_frame(10.0)

    for check_order in (False, True):
        assertDataFrameEqual(df1, df2, checkRowOrder=check_order)
926
+
927
@unittest.skipIf(not have_pandas, "no pandas dependency")
def test_assert_error_pandas_pyspark_df(self):
    """Mixing a pandas-on-Spark frame with a Spark DataFrame is a type error.

    INVALID_TYPE_DF_EQUALITY_ARG must be raised for the ``expected``
    argument, reporting the Spark DataFrame's type, in both row-order modes.
    """
    import pyspark.pandas as ps
    import pandas as pd

    df1 = ps.DataFrame(data=[10, 20, 30], columns=["Numbers"])
    df2 = self.spark.createDataFrame([(10,), (11,), (13,)], ["Numbers"])

    # The pieces are concatenated exactly as written: there is no ", "
    # between the two Index names and the text ends with ", ".
    # NOTE(review): presumably this mirrors the message produced by the
    # library; confirm before normalizing the separators.
    expected_type = (
        f"{ps.DataFrame.__name__}, "
        f"{pd.DataFrame.__name__}, "
        f"{ps.Series.__name__}, "
        f"{pd.Series.__name__}, "
        f"{ps.Index.__name__}"
        f"{pd.Index.__name__}, "
    )

    for check_order in (False, True):
        with self.assertRaises(PySparkAssertionError) as pe:
            assertDataFrameEqual(df1, df2, checkRowOrder=check_order)

        self.check_error(
            exception=pe.exception,
            error_class="INVALID_TYPE_DF_EQUALITY_ARG",
            message_parameters={
                "expected_type": expected_type,
                "arg_name": "expected",
                "actual_type": type(df2),
            },
        )
970
+
971
def test_assert_error_non_pyspark_df(self):
    """Plain dicts are rejected with INVALID_TYPE_DF_EQUALITY_ARG.

    The error must point at the ``actual`` argument and report ``dict`` as
    its type, with the default settings and with ``checkRowOrder=True``.
    """
    dict1 = {"a": 1, "b": 2}
    dict2 = {"a": 1, "b": 2}

    for extra_kwargs in ({}, {"checkRowOrder": True}):
        with self.assertRaises(PySparkAssertionError) as pe:
            assertDataFrameEqual(dict1, dict2, **extra_kwargs)

        self.check_error(
            exception=pe.exception,
            error_class="INVALID_TYPE_DF_EQUALITY_ARG",
            message_parameters={
                "expected_type": "Union[DataFrame, ps.DataFrame, List[Row]]",
                "arg_name": "actual",
                "actual_type": type(dict1),
            },
        )
1000
+
1001
def test_row_order_ignored(self):
    """Frames with the same rows in a different order compare equal by default."""

    def make_frame(rows):
        return self.spark.createDataFrame(data=rows, schema=["id", "amount"])

    df1 = make_frame([("2", 3000.00), ("1", 1000.00)])
    df2 = make_frame([("1", 1000.00), ("2", 3000.00)])

    # No checkRowOrder argument: the default mode is under test.
    assertDataFrameEqual(df1, df2)
1019
+
1020
def test_check_row_order_error(self):
    """With ``checkRowOrder=True`` the same rows in a different order fail.

    Both positions mismatch, so the expected DIFFERENT_ROWS message reports
    a 100 % difference followed by a context diff of the stringified rows.
    """

    def make_frame(rows):
        return self.spark.createDataFrame(data=rows, schema=["id", "amount"])

    df1 = make_frame([("2", 3000.00), ("1", 1000.00)])
    df2 = make_frame([("1", 1000.00), ("2", 3000.00)])

    # Stringify the rows pairwise; zip_longest pads the shorter side with None.
    row_pairs = list(zip_longest(df1.collect(), df2.collect()))
    actual_lines = [str(left) for left, _ in row_pairs]
    expected_lines = [str(right) for _, right in row_pairs]

    generated_diff = _context_diff(actual=actual_lines, expected=expected_lines, n=2)

    percent_diff = (2 / 2) * 100
    error_msg = "Results do not match: " + "( %.5f %% )" % percent_diff
    error_msg += "\n" + "\n".join(generated_diff)

    with self.assertRaises(PySparkAssertionError) as pe:
        assertDataFrameEqual(df1, df2, checkRowOrder=True)

    self.check_error(
        exception=pe.exception,
        error_class="DIFFERENT_ROWS",
        message_parameters={"error_msg": error_msg},
    )
1062
+
1063
def test_remove_non_word_characters_long(self):
    """A ``regexp_replace`` clean-up column matches the hand-written expectation.

    Every run of characters that are neither word characters nor whitespace
    is replaced by the empty string; the null input row is expected to stay
    null.
    """
    source_df = self.spark.createDataFrame(
        [("jo&&se",), ("**li**",), ("#::luisa",), (None,)], ["name"]
    )

    cleaned = F.regexp_replace(F.col("name"), "[^\\w\\s]+", "")
    actual_df = source_df.withColumn("clean_name", cleaned)

    expected_df = self.spark.createDataFrame(
        [("jo&&se", "jose"), ("**li**", "li"), ("#::luisa", "luisa"), (None, None)],
        ["name", "clean_name"],
    )

    assertDataFrameEqual(actual_df, expected_df)
1076
+
1077
def test_assert_pyspark_approx_equal(self):
    """Amounts of ``1000.00`` and ``1000.0000001`` compare equal in both modes."""

    def make_frame(first_amount):
        return self.spark.createDataFrame(
            data=[
                ("1", first_amount),
                ("2", 3000.00),
            ],
            schema=["id", "amount"],
        )

    df1 = make_frame(1000.00)
    df2 = make_frame(1000.0000001)

    for check_order in (False, True):
        assertDataFrameEqual(df1, df2, checkRowOrder=check_order)
1095
+
1096
def test_assert_pyspark_approx_equal_custom_rtol(self):
    """With ``rtol=1e-2`` amounts of ``1000.00`` and ``1000.01`` compare equal."""

    def make_frame(first_amount):
        return self.spark.createDataFrame(
            data=[
                ("1", first_amount),
                ("2", 3000.00),
            ],
            schema=["id", "amount"],
        )

    df1 = make_frame(1000.00)
    df2 = make_frame(1000.01)

    assertDataFrameEqual(df1, df2, rtol=1e-2)
1113
+
1114
def test_assert_pyspark_df_not_equal(self):
    """Two of three differing amounts are reported as a DIFFERENT_ROWS error.

    The expected message announces a two-thirds mismatch followed by a
    context diff of the stringified rows; it is checked with the default
    settings and with ``checkRowOrder=True``.
    """

    def make_frame(first_amount, third_amount):
        return self.spark.createDataFrame(
            data=[
                ("1", first_amount),
                ("2", 3000.00),
                ("3", third_amount),
            ],
            schema=["id", "amount"],
        )

    df1 = make_frame(1000.00, 2000.00)
    df2 = make_frame(1001.00, 2003.00)

    # Stringify the rows pairwise; zip_longest pads the shorter side with None.
    row_pairs = list(zip_longest(df1.collect(), df2.collect()))
    actual_lines = [str(left) for left, _ in row_pairs]
    expected_lines = [str(right) for _, right in row_pairs]

    generated_diff = _context_diff(actual=actual_lines, expected=expected_lines, n=3)

    percent_diff = (2 / 3) * 100
    error_msg = "Results do not match: " + "( %.5f %% )" % percent_diff
    error_msg += "\n" + "\n".join(generated_diff)

    for extra_kwargs in ({}, {"checkRowOrder": True}):
        with self.assertRaises(PySparkAssertionError) as pe:
            assertDataFrameEqual(df1, df2, **extra_kwargs)

        self.check_error(
            exception=pe.exception,
            error_class="DIFFERENT_ROWS",
            message_parameters={"error_msg": error_msg},
        )
1166
+
1167
def test_assert_notequal_schema(self):
    """Frames with different column names and types raise DIFFERENT_SCHEMA.

    The expected message is the ``difflib.ndiff`` of the two schemas'
    string representations.
    """
    df1 = self.spark.createDataFrame(
        data=[
            (1, 1000),
            (2, 3000),
        ],
        schema=["id", "number"],
    )
    df2 = self.spark.createDataFrame(
        data=[
            ("1", 1000),
            ("2", 5000),
        ],
        schema=["id", "amount"],
    )

    schema_lines = [str(frame.schema).splitlines() for frame in (df1, df2)]
    expected_error_msg = "\n".join(difflib.ndiff(*schema_lines))

    with self.assertRaises(PySparkAssertionError) as pe:
        assertDataFrameEqual(df1, df2)

    self.check_error(
        exception=pe.exception,
        error_class="DIFFERENT_SCHEMA",
        message_parameters={"error_msg": expected_error_msg},
    )
1195
+
1196
def test_diff_schema_lens(self):
    """Frames with a different number of columns raise DIFFERENT_SCHEMA.

    The expected message is the ``difflib.ndiff`` of the two schemas'
    string representations.
    """
    df1 = self.spark.createDataFrame(
        data=[
            (1, 3000),
            (2, 1000),
        ],
        schema=["id", "amount"],
    )
    df2 = self.spark.createDataFrame(
        data=[
            (1, 3000, "a"),
            (2, 1000, "b"),
        ],
        schema=["id", "amount", "letter"],
    )

    schema_lines = [str(frame.schema).splitlines() for frame in (df1, df2)]
    expected_error_msg = "\n".join(difflib.ndiff(*schema_lines))

    with self.assertRaises(PySparkAssertionError) as pe:
        assertDataFrameEqual(df1, df2)

    self.check_error(
        exception=pe.exception,
        error_class="DIFFERENT_SCHEMA",
        message_parameters={"error_msg": expected_error_msg},
    )
1225
+
1226
def test_schema_ignore_nullable(self):
    """Frames whose schemas differ only in a field's nullable flag compare equal."""

    def make_frame(name_nullable):
        schema = StructType(
            [
                StructField("id", IntegerType(), True),
                StructField("name", StringType(), name_nullable),
            ]
        )
        return self.spark.createDataFrame([(1, "jane"), (2, "john")], schema)

    df1 = make_frame(True)
    df2 = make_frame(False)

    assertDataFrameEqual(df1, df2)
1240
+
1241
def test_schema_ignore_nullable_array_equal(self):
    """Array schemas differing only in their nullability flags are equal."""
    s1, s2 = [
        StructType([StructField("names", ArrayType(DoubleType(), flag), flag)])
        for flag in (True, False)
    ]

    assertSchemaEqual(s1, s2)
1246
+
1247
def test_schema_ignore_nullable_struct_equal(self):
    """Nested struct schemas differing only in nullability flags are equal."""
    s1, s2 = [
        StructType(
            [StructField("names", StructType([StructField("age", IntegerType(), flag)]), flag)]
        )
        for flag in (True, False)
    ]

    assertSchemaEqual(s1, s2)
1255
+
1256
def test_schema_array_unequal(self):
    """Array schemas with different element types raise DIFFERENT_SCHEMA.

    The expected message is the ``difflib.ndiff`` of the two schemas'
    string representations.
    """
    s1 = StructType([StructField("names", ArrayType(IntegerType(), True), True)])
    s2 = StructType([StructField("names", ArrayType(DoubleType(), False), False)])

    schema_lines = [str(schema).splitlines() for schema in (s1, s2)]
    expected_error_msg = "\n".join(difflib.ndiff(*schema_lines))

    with self.assertRaises(PySparkAssertionError) as pe:
        assertSchemaEqual(s1, s2)

    self.check_error(
        exception=pe.exception,
        error_class="DIFFERENT_SCHEMA",
        message_parameters={"error_msg": expected_error_msg},
    )
1272
+
1273
def test_schema_struct_unequal(self):
    """Nested struct schemas with different field types raise DIFFERENT_SCHEMA.

    The expected message is the ``difflib.ndiff`` of the two schemas'
    string representations.
    """

    def names_schema(age_type):
        return StructType(
            [StructField("names", StructType([StructField("age", age_type, True)]), True)]
        )

    s1 = names_schema(DoubleType())
    s2 = names_schema(IntegerType())

    schema_lines = [str(schema).splitlines() for schema in (s1, s2)]
    expected_error_msg = "\n".join(difflib.ndiff(*schema_lines))

    with self.assertRaises(PySparkAssertionError) as pe:
        assertSchemaEqual(s1, s2)

    self.check_error(
        exception=pe.exception,
        error_class="DIFFERENT_SCHEMA",
        message_parameters={"error_msg": expected_error_msg},
    )
1293
+
1294
def test_schema_more_nested_struct_unequal(self):
    """A type change in one field of a three-field nested struct is detected.

    Only ``middlename`` differs (string versus boolean); DIFFERENT_SCHEMA is
    expected with the ``difflib.ndiff`` of the schema strings as message.
    """

    def name_schema(middlename_type):
        return StructType(
            [
                StructField(
                    "name",
                    StructType(
                        [
                            StructField("firstname", StringType(), True),
                            StructField("middlename", middlename_type, True),
                            StructField("lastname", StringType(), True),
                        ]
                    ),
                ),
            ]
        )

    s1 = name_schema(StringType())
    s2 = name_schema(BooleanType())

    schema_lines = [str(schema).splitlines() for schema in (s1, s2)]
    expected_error_msg = "\n".join(difflib.ndiff(*schema_lines))

    with self.assertRaises(PySparkAssertionError) as pe:
        assertSchemaEqual(s1, s2)

    self.check_error(
        exception=pe.exception,
        error_class="DIFFERENT_SCHEMA",
        message_parameters={"error_msg": expected_error_msg},
    )
1337
+
1338
def test_schema_unsupported_type(self):
    """Passing schema strings to ``assertSchemaEqual`` raises UNSUPPORTED_DATA_TYPE.

    The reported ``data_type`` parameter must be the type of the first
    argument (``str``).
    """
    s1 = s2 = "names: int"

    with self.assertRaises(PySparkAssertionError) as pe:
        assertSchemaEqual(s1, s2)

    self.check_error(
        exception=pe.exception,
        error_class="UNSUPPORTED_DATA_TYPE",
        message_parameters={"data_type": type(s1)},
    )
1350
+
1351
def test_spark_sql(self):
    """Results of two SQL queries that both yield ``x = 3`` compare equal.

    The queries are re-issued for each comparison: once with the default
    settings and once with ``checkRowOrder=True``.
    """
    for extra_kwargs in ({}, {"checkRowOrder": True}):
        assertDataFrameEqual(
            self.spark.sql("select 1 + 2 AS x"),
            self.spark.sql("select 3 AS x"),
            **extra_kwargs,
        )
1358
+
1359
def test_spark_sql_sort_rows(self):
    """An ``order by amount`` query over one view matches a plain select of another.

    View ``df1`` holds the rows in the opposite order to view ``df2``;
    sorting ``df1`` by ``amount`` is expected to line the rows up, so the
    comparison must pass with the default settings and with
    ``checkRowOrder=True``.
    """
    df1 = self.spark.createDataFrame(
        data=[
            (1, 3000),
            (2, 1000),
        ],
        schema=["id", "amount"],
    )
    df2 = self.spark.createDataFrame(
        data=[
            (2, 1000),
            (1, 3000),
        ],
        schema=["id", "amount"],
    )

    for view_name, frame in (("df1", df1), ("df2", df2)):
        frame.createOrReplaceTempView(view_name)

    for extra_kwargs in ({}, {"checkRowOrder": True}):
        assertDataFrameEqual(
            self.spark.sql("select * from df1 order by amount"),
            self.spark.sql("select * from df2"),
            **extra_kwargs,
        )
1388
+
1389
def test_empty_dataset(self):
    """Two frames limited to zero rows compare equal in both row-order modes."""
    df1, df2 = [self.spark.range(0, 10).limit(0) for _ in range(2)]

    for check_order in (False, True):
        assertDataFrameEqual(df1, df2, checkRowOrder=check_order)
1396
+
1397
def test_no_column(self):
    """Two frames whose only column was dropped compare equal in both modes."""
    df1, df2 = [self.spark.range(0, 10).drop("id") for _ in range(2)]

    for check_order in (False, True):
        assertDataFrameEqual(df1, df2, checkRowOrder=check_order)
1404
+
1405
def test_empty_no_column(self):
    """Two frames with no column and zero rows compare equal in both modes."""
    df1, df2 = [self.spark.range(0, 10).drop("id").limit(0) for _ in range(2)]

    for check_order in (False, True):
        assertDataFrameEqual(df1, df2, checkRowOrder=check_order)
1412
+
1413
def test_empty_expected_list(self):
    """A five-row frame without columns equals a list of five empty ``Row`` objects."""
    df1 = self.spark.range(0, 5).drop("id")
    df2 = [Row() for _ in range(5)]

    for check_order in (False, True):
        assertDataFrameEqual(df1, df2, checkRowOrder=check_order)
1420
+
1421
def test_no_column_expected_list(self):
    """A frame limited to zero rows equals an empty expected list."""
    df1 = self.spark.range(0, 10).limit(0)
    df2 = []

    for check_order in (False, True):
        assertDataFrameEqual(df1, df2, checkRowOrder=check_order)
1428
+
1429
def test_empty_no_column_expected_list(self):
    """A frame with no column and zero rows equals an empty expected list."""
    df1 = self.spark.range(0, 10).drop("id").limit(0)
    df2 = []

    for check_order in (False, True):
        assertDataFrameEqual(df1, df2, checkRowOrder=check_order)
1436
+
1437
def test_special_vals(self):
    """Identical frames containing NaN, +inf and -inf compare equal in both modes."""

    def make_frame():
        return self.spark.createDataFrame(
            data=[
                (1, float("nan")),
                (2, float("inf")),
                (2, float("-inf")),
            ],
            schema=["id", "amount"],
        )

    df1 = make_frame()
    df2 = make_frame()

    for check_order in (False, True):
        assertDataFrameEqual(df1, df2, checkRowOrder=check_order)
1458
+
1459
def test_df_list_row_equal(self):
    """A DataFrame equals a list of positional ``Row`` objects with the same values."""
    df1 = self.spark.createDataFrame(
        data=[
            (1, 3000),
            (2, 1000),
        ],
        schema=["id", "amount"],
    )
    list_of_rows = [Row(1, 3000), Row(2, 1000)]

    for check_order in (False, True):
        assertDataFrameEqual(df1, list_of_rows, checkRowOrder=check_order)
1472
+
1473
def test_list_rows_equal(self):
    """Two lists of ``Row`` objects with identical values compare equal."""

    def make_rows():
        return [Row(1, "abc", 5000), Row(2, "def", 1000)]

    list_of_rows1 = make_rows()
    list_of_rows2 = make_rows()

    for check_order in (False, True):
        assertDataFrameEqual(list_of_rows1, list_of_rows2, checkRowOrder=check_order)
1479
+
1480
def test_list_rows_unequal(self):
    """Lists of rows differing in one string value raise DIFFERENT_ROWS.

    One of two rows differs, so the expected message reports a 50 %
    mismatch followed by a context diff of the stringified rows; it is
    checked with the default settings and with ``checkRowOrder=True``.
    """
    list_of_rows1 = [Row(1, "abc", 5000), Row(2, "def", 1000)]
    list_of_rows2 = [Row(1, "abc", 5000), Row(2, "defg", 1000)]

    # Stringify the rows pairwise; zip_longest pads the shorter side with None.
    row_pairs = list(zip_longest(list_of_rows1, list_of_rows2))
    actual_lines = [str(left) for left, _ in row_pairs]
    expected_lines = [str(right) for _, right in row_pairs]

    generated_diff = _context_diff(actual=actual_lines, expected=expected_lines, n=2)

    percent_diff = (1 / 2) * 100
    error_msg = "Results do not match: " + "( %.5f %% )" % percent_diff
    error_msg += "\n" + "\n".join(generated_diff)

    for extra_kwargs in ({}, {"checkRowOrder": True}):
        with self.assertRaises(PySparkAssertionError) as pe:
            assertDataFrameEqual(list_of_rows1, list_of_rows2, **extra_kwargs)

        self.check_error(
            exception=pe.exception,
            error_class="DIFFERENT_ROWS",
            message_parameters={"error_msg": error_msg},
        )
1518
+
1519
def test_list_row_unequal_values(self):
    """A DataFrame versus a list of named rows with other amounts raises DIFFERENT_ROWS.

    Two of the three rows differ, so the expected message reports a
    two-thirds mismatch followed by a context diff of the stringified rows;
    it is checked with the default settings and with ``checkRowOrder=True``.

    Two defects of the previous version are fixed here:

    * It was named ``test_list_row_unequal_schema``, a name that a later
      method in this file reuses. A class body keeps only the last
      definition bound to a name, so this test never ran. The new name
      describes what differs here (the values, not the schema).
    * The expected diff was built by iterating the DataFrame object itself
      instead of its collected rows. ``df1.collect()`` is used now, as in
      the neighbouring tests that build the same kind of message.
    """
    df1 = self.spark.createDataFrame(
        data=[
            (1, 3000),
            (2, 1000),
            (3, 10),
        ],
        schema=["id", "amount"],
    )

    list_of_rows = [Row(id=1, amount=300), Row(id=2, amount=100), Row(id=3, amount=10)]

    # Stringify the rows pairwise; zip_longest pads the shorter side with None.
    row_pairs = list(zip_longest(df1.collect(), list_of_rows))
    actual_lines = [str(left) for left, _ in row_pairs]
    expected_lines = [str(right) for _, right in row_pairs]

    generated_diff = _context_diff(actual=actual_lines, expected=expected_lines, n=3)

    percent_diff = (2 / 3) * 100
    error_msg = "Results do not match: " + "( %.5f %% )" % percent_diff
    error_msg += "\n" + "\n".join(generated_diff)

    for extra_kwargs in ({}, {"checkRowOrder": True}):
        with self.assertRaises(PySparkAssertionError) as pe:
            assertDataFrameEqual(df1, list_of_rows, **extra_kwargs)

        self.check_error(
            exception=pe.exception,
            error_class="DIFFERENT_ROWS",
            message_parameters={"error_msg": error_msg},
        )
1565
+
1566
def test_list_row_unequal_schema(self):
    from pyspark.sql import Row

    # The DataFrame is built from integer amounts while the Rows carry the same
    # amounts as strings; both rows are expected to be reported as different.
    int_amount_df = self.spark.createDataFrame(
        data=[
            (1, 3000),
            (2, 1000),
        ],
        schema=["id", "amount"],
    )
    str_amount_rows = [Row(1, "3000"), Row(2, "1000")]

    # Render both sides one row per line, pairing collected DataFrame rows
    # with the list rows by position.
    paired = list(zip_longest(int_amount_df.collect(), str_amount_rows))
    df_side = "".join(str(df_row) + "\n" for df_row, _ in paired)
    list_side = "".join(str(list_row) + "\n" for _, list_row in paired)

    context_lines = _context_diff(
        actual=df_side.splitlines(), expected=list_side.splitlines(), n=2
    )

    # Both rows differ.
    mismatch_pct = (2 / 2) * 100
    expected_msg = "".join(
        [
            "Results do not match: ",
            "( %.5f %% )" % mismatch_pct,
            "\n",
            "\n".join(context_lines),
        ]
    )

    # Default comparison.
    with self.assertRaises(PySparkAssertionError) as unordered:
        assertDataFrameEqual(int_amount_df, str_amount_rows)

    self.check_error(
        exception=unordered.exception,
        error_class="DIFFERENT_ROWS",
        message_parameters={"error_msg": expected_msg},
    )

    # Comparison with row-order checking enabled.
    with self.assertRaises(PySparkAssertionError) as ordered:
        assertDataFrameEqual(int_amount_df, str_amount_rows, checkRowOrder=True)

    self.check_error(
        exception=ordered.exception,
        error_class="DIFFERENT_ROWS",
        message_parameters={"error_msg": expected_msg},
    )
1613
+
1614
def test_assert_data_frame_equal_not_support_streaming(self):
    # Comparing two streaming DataFrames must raise a PySparkAssertionError
    # whose error class is UNSUPPORTED_OPERATION.
    stream_df1 = self.spark.readStream.format("rate").load()
    stream_df2 = self.spark.readStream.format("rate").load()

    # assertRaises fails the test when nothing is raised, which replaces the
    # manual "exception_thrown" flag and matches the other tests in this file.
    with self.assertRaises(PySparkAssertionError) as pe:
        assertDataFrameEqual(stream_df1, stream_df2)

    self.assertEqual(pe.exception.getErrorClass(), "UNSUPPORTED_OPERATION")
1625
+
1626
+
1627
class UtilsTests(ReusedSQLTestCase, UtilsTestsMixin):
    """Exception-capture tests.

    Each case triggers a failing Spark call and checks the Python exception
    type and, where relevant, its message, stack trace, error class and SQL
    state.
    """

    def test_capture_analysis_exception(self):
        self.assertRaises(AnalysisException, lambda: self.spark.sql("select abc"))
        self.assertRaises(AnalysisException, lambda: self.df.selectExpr("a + b"))

    def test_capture_user_friendly_exception(self):
        # assertRaises makes the test fail when no exception is raised; a bare
        # try/except would pass without checking anything in that case.
        with self.assertRaises(AnalysisException) as cm:
            self.spark.sql("select `中文字段`")
        self.assertRegex(str(cm.exception), ".*UNRESOLVED_COLUMN.*`中文字段`.*")

    def test_spark_upgrade_exception(self):
        # SPARK-32161 : Test case to Handle SparkUpgradeException in pythonic way
        df = self.spark.createDataFrame([("2014-31-12",)], ["date_str"])
        df2 = df.select(
            "date_str", to_date(from_unixtime(unix_timestamp("date_str", "yyyy-dd-aa")))
        )
        self.assertRaises(SparkUpgradeException, df2.collect)

    def test_capture_parse_exception(self):
        self.assertRaises(ParseException, lambda: self.spark.sql("abc"))

    def test_capture_illegalargument_exception(self):
        self.assertRaisesRegex(
            IllegalArgumentException,
            "Setting negative mapred.reduce.tasks",
            lambda: self.spark.sql("SET mapred.reduce.tasks=-1"),
        )
        df = self.spark.createDataFrame([(1, 2)], ["a", "b"])
        self.assertRaisesRegex(
            IllegalArgumentException,
            "1024 is not in the permitted values",
            lambda: df.select(sha2(df.a, 1024)).collect(),
        )
        # Capture the exception object itself to inspect its description and
        # stack trace attributes.
        with self.assertRaises(IllegalArgumentException) as cm:
            df.select(sha2(df.a, 1024)).collect()
        self.assertRegex(cm.exception.desc, "1024 is not in the permitted values")
        self.assertRegex(cm.exception.stackTrace, "org.apache.spark.sql.functions")

    def test_get_error_class_state(self):
        # SPARK-36953: test CapturedException.getErrorClass and getSqlState (from SparkThrowable)
        with self.assertRaises(AnalysisException) as cm:
            self.spark.sql("""SELECT a""")
        # assertEqual, not the deprecated assertEquals alias, which was removed
        # from unittest in Python 3.12.
        self.assertEqual(cm.exception.getErrorClass(), "UNRESOLVED_COLUMN.WITHOUT_SUGGESTION")
        self.assertEqual(cm.exception.getSqlState(), "42703")
1674
+
1675
+
1676
if __name__ == "__main__":
    import unittest
    from pyspark.sql.tests.test_utils import *  # noqa: F401

    # Start with no explicit runner (unittest.main then uses its default) and
    # switch to the XML-report runner only when xmlrunner can be imported.
    testRunner = None
    try:
        import xmlrunner

        testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
    except ImportError:
        # xmlrunner is optional; keep the default runner.
        pass

    unittest.main(testRunner=testRunner, verbosity=2)