snowpark-connect 0.24.0__py3-none-any.whl → 0.25.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of snowpark-connect might be problematic. Click here for more details.

Files changed (474)
  1. snowflake/snowpark_connect/column_name_handler.py +116 -4
  2. snowflake/snowpark_connect/config.py +13 -0
  3. snowflake/snowpark_connect/constants.py +0 -29
  4. snowflake/snowpark_connect/dataframe_container.py +6 -0
  5. snowflake/snowpark_connect/execute_plan/map_execution_command.py +56 -1
  6. snowflake/snowpark_connect/expression/literal.py +13 -2
  7. snowflake/snowpark_connect/expression/map_cast.py +5 -8
  8. snowflake/snowpark_connect/expression/map_sql_expression.py +23 -1
  9. snowflake/snowpark_connect/expression/map_udf.py +26 -8
  10. snowflake/snowpark_connect/expression/map_unresolved_attribute.py +199 -15
  11. snowflake/snowpark_connect/expression/map_unresolved_extract_value.py +44 -16
  12. snowflake/snowpark_connect/expression/map_unresolved_function.py +825 -353
  13. snowflake/snowpark_connect/expression/map_unresolved_star.py +3 -2
  14. snowflake/snowpark_connect/hidden_column.py +39 -0
  15. snowflake/snowpark_connect/includes/jars/hadoop-client-api-trimmed-3.3.4.jar +0 -0
  16. snowflake/snowpark_connect/includes/jars/{hadoop-client-api-3.3.4.jar → spark-connect-client-jvm_2.12-3.5.6.jar} +0 -0
  17. snowflake/snowpark_connect/relation/map_column_ops.py +17 -4
  18. snowflake/snowpark_connect/relation/map_extension.py +52 -11
  19. snowflake/snowpark_connect/relation/map_join.py +258 -62
  20. snowflake/snowpark_connect/relation/map_sql.py +88 -11
  21. snowflake/snowpark_connect/relation/map_udtf.py +4 -2
  22. snowflake/snowpark_connect/relation/read/map_read.py +3 -3
  23. snowflake/snowpark_connect/relation/read/map_read_jdbc.py +1 -1
  24. snowflake/snowpark_connect/relation/read/map_read_json.py +8 -1
  25. snowflake/snowpark_connect/relation/read/map_read_table.py +1 -9
  26. snowflake/snowpark_connect/relation/read/reader_config.py +3 -1
  27. snowflake/snowpark_connect/relation/write/map_write.py +62 -53
  28. snowflake/snowpark_connect/resources_initializer.py +29 -1
  29. snowflake/snowpark_connect/server.py +18 -3
  30. snowflake/snowpark_connect/type_mapping.py +29 -25
  31. snowflake/snowpark_connect/typed_column.py +14 -0
  32. snowflake/snowpark_connect/utils/artifacts.py +23 -0
  33. snowflake/snowpark_connect/utils/context.py +6 -1
  34. snowflake/snowpark_connect/utils/scala_udf_utils.py +588 -0
  35. snowflake/snowpark_connect/utils/telemetry.py +6 -17
  36. snowflake/snowpark_connect/utils/udf_helper.py +2 -0
  37. snowflake/snowpark_connect/utils/udf_utils.py +38 -7
  38. snowflake/snowpark_connect/utils/udtf_utils.py +17 -3
  39. snowflake/snowpark_connect/version.py +1 -1
  40. {snowpark_connect-0.24.0.dist-info → snowpark_connect-0.25.0.dist-info}/METADATA +1 -1
  41. snowpark_connect-0.25.0.dist-info/RECORD +477 -0
  42. snowflake/snowpark_connect/includes/jars/scala-compiler-2.12.18.jar +0 -0
  43. snowflake/snowpark_connect/includes/jars/spark-kubernetes_2.12-3.5.6.jar +0 -0
  44. snowflake/snowpark_connect/includes/jars/spark-mllib_2.12-3.5.6.jar +0 -0
  45. snowflake/snowpark_connect/includes/jars/spark-streaming_2.12-3.5.6.jar +0 -0
  46. snowflake/snowpark_connect/includes/python/pyspark/errors/tests/__init__.py +0 -16
  47. snowflake/snowpark_connect/includes/python/pyspark/errors/tests/test_errors.py +0 -60
  48. snowflake/snowpark_connect/includes/python/pyspark/ml/deepspeed/tests/test_deepspeed_distributor.py +0 -306
  49. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/__init__.py +0 -16
  50. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_classification.py +0 -53
  51. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_evaluation.py +0 -50
  52. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_feature.py +0 -43
  53. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_function.py +0 -114
  54. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_pipeline.py +0 -47
  55. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_summarizer.py +0 -43
  56. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_tuning.py +0 -46
  57. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_classification.py +0 -238
  58. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_evaluation.py +0 -194
  59. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_feature.py +0 -156
  60. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_pipeline.py +0 -184
  61. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_summarizer.py +0 -78
  62. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_tuning.py +0 -292
  63. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_parity_torch_data_loader.py +0 -50
  64. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_parity_torch_distributor.py +0 -152
  65. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_algorithms.py +0 -456
  66. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_base.py +0 -96
  67. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_dl_util.py +0 -186
  68. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_evaluation.py +0 -77
  69. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_feature.py +0 -401
  70. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_functions.py +0 -528
  71. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_image.py +0 -82
  72. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_linalg.py +0 -409
  73. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_model_cache.py +0 -55
  74. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_param.py +0 -441
  75. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_persistence.py +0 -546
  76. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_pipeline.py +0 -71
  77. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_stat.py +0 -52
  78. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_training_summary.py +0 -494
  79. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_util.py +0 -85
  80. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_wrapper.py +0 -138
  81. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/__init__.py +0 -16
  82. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_basic.py +0 -151
  83. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_nested.py +0 -97
  84. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_pipeline.py +0 -143
  85. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tuning.py +0 -551
  86. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_basic.py +0 -137
  87. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_nested.py +0 -96
  88. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_pipeline.py +0 -142
  89. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/__init__.py +0 -16
  90. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_data_loader.py +0 -137
  91. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_distributor.py +0 -561
  92. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_log_communication.py +0 -172
  93. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/__init__.py +0 -16
  94. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_algorithms.py +0 -353
  95. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_feature.py +0 -192
  96. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_linalg.py +0 -680
  97. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_stat.py +0 -206
  98. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_streaming_algorithms.py +0 -471
  99. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_util.py +0 -108
  100. snowflake/snowpark_connect/includes/python/pyspark/pandas/spark/__init__.py +0 -16
  101. snowflake/snowpark_connect/includes/python/pyspark/pandas/spark/accessors.py +0 -1281
  102. snowflake/snowpark_connect/includes/python/pyspark/pandas/spark/functions.py +0 -203
  103. snowflake/snowpark_connect/includes/python/pyspark/pandas/spark/utils.py +0 -202
  104. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/__init__.py +0 -16
  105. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/__init__.py +0 -16
  106. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_any_all.py +0 -177
  107. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_apply_func.py +0 -575
  108. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_binary_ops.py +0 -235
  109. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_combine.py +0 -653
  110. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_compute.py +0 -463
  111. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_corrwith.py +0 -86
  112. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_cov.py +0 -151
  113. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_cumulative.py +0 -139
  114. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_describe.py +0 -458
  115. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_eval.py +0 -86
  116. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_melt.py +0 -202
  117. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_missing_data.py +0 -520
  118. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_pivot.py +0 -361
  119. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/__init__.py +0 -16
  120. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/__init__.py +0 -16
  121. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_any_all.py +0 -40
  122. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_apply_func.py +0 -42
  123. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_binary_ops.py +0 -40
  124. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_combine.py +0 -37
  125. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_compute.py +0 -60
  126. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_corrwith.py +0 -40
  127. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_cov.py +0 -40
  128. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_cumulative.py +0 -90
  129. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_describe.py +0 -40
  130. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_eval.py +0 -40
  131. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_melt.py +0 -40
  132. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_missing_data.py +0 -42
  133. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_pivot.py +0 -37
  134. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/__init__.py +0 -16
  135. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_base.py +0 -36
  136. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_binary_ops.py +0 -42
  137. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_boolean_ops.py +0 -47
  138. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_categorical_ops.py +0 -55
  139. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_complex_ops.py +0 -40
  140. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_date_ops.py +0 -47
  141. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_datetime_ops.py +0 -47
  142. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_null_ops.py +0 -42
  143. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_arithmetic.py +0 -43
  144. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_ops.py +0 -47
  145. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_reverse.py +0 -43
  146. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_string_ops.py +0 -47
  147. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_timedelta_ops.py +0 -47
  148. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_udt_ops.py +0 -40
  149. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/testing_utils.py +0 -226
  150. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/__init__.py +0 -16
  151. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_align.py +0 -39
  152. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_basic_slow.py +0 -55
  153. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_cov_corrwith.py +0 -39
  154. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_dot_frame.py +0 -39
  155. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_dot_series.py +0 -39
  156. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_index.py +0 -39
  157. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_series.py +0 -39
  158. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_setitem_frame.py +0 -43
  159. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_setitem_series.py +0 -43
  160. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/__init__.py +0 -16
  161. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_attrs.py +0 -40
  162. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_constructor.py +0 -39
  163. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_conversion.py +0 -42
  164. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_reindexing.py +0 -42
  165. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_reshaping.py +0 -37
  166. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_spark.py +0 -40
  167. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_take.py +0 -42
  168. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_time_series.py +0 -48
  169. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_truncate.py +0 -40
  170. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/__init__.py +0 -16
  171. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_aggregate.py +0 -40
  172. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_apply_func.py +0 -41
  173. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_cumulative.py +0 -67
  174. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_describe.py +0 -40
  175. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_groupby.py +0 -55
  176. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_head_tail.py +0 -40
  177. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_index.py +0 -38
  178. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_missing_data.py +0 -55
  179. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py +0 -39
  180. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_stat.py +0 -38
  181. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/__init__.py +0 -16
  182. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_align.py +0 -40
  183. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_base.py +0 -50
  184. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_category.py +0 -73
  185. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_datetime.py +0 -39
  186. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_indexing.py +0 -40
  187. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_reindex.py +0 -40
  188. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_rename.py +0 -40
  189. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_reset_index.py +0 -48
  190. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_timedelta.py +0 -39
  191. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/io/__init__.py +0 -16
  192. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/io/test_parity_io.py +0 -40
  193. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/__init__.py +0 -16
  194. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot.py +0 -45
  195. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot_matplotlib.py +0 -45
  196. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot_plotly.py +0 -49
  197. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot.py +0 -37
  198. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot_matplotlib.py +0 -53
  199. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot_plotly.py +0 -45
  200. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/__init__.py +0 -16
  201. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_all_any.py +0 -38
  202. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_arg_ops.py +0 -37
  203. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_as_of.py +0 -37
  204. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_as_type.py +0 -38
  205. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_compute.py +0 -37
  206. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_conversion.py +0 -40
  207. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_cumulative.py +0 -40
  208. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_index.py +0 -38
  209. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_missing_data.py +0 -40
  210. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_series.py +0 -37
  211. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_sort.py +0 -38
  212. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_stat.py +0 -38
  213. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_categorical.py +0 -66
  214. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_config.py +0 -37
  215. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_csv.py +0 -37
  216. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_dataframe_conversion.py +0 -42
  217. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_dataframe_spark_io.py +0 -39
  218. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_default_index.py +0 -49
  219. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ewm.py +0 -37
  220. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_expanding.py +0 -39
  221. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_extension.py +0 -49
  222. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_frame_spark.py +0 -53
  223. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_generic_functions.py +0 -43
  224. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_indexing.py +0 -49
  225. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_indexops_spark.py +0 -39
  226. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_internal.py +0 -41
  227. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_namespace.py +0 -39
  228. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_numpy_compat.py +0 -60
  229. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames.py +0 -48
  230. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby.py +0 -39
  231. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_expanding.py +0 -44
  232. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_rolling.py +0 -84
  233. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_repr.py +0 -37
  234. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_resample.py +0 -45
  235. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_reshape.py +0 -39
  236. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_rolling.py +0 -39
  237. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_scalars.py +0 -37
  238. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_conversion.py +0 -39
  239. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_datetime.py +0 -39
  240. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_string.py +0 -39
  241. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_spark_functions.py +0 -39
  242. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_sql.py +0 -43
  243. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_stats.py +0 -37
  244. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_typedef.py +0 -36
  245. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_utils.py +0 -37
  246. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_window.py +0 -39
  247. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/__init__.py +0 -16
  248. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_base.py +0 -107
  249. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_binary_ops.py +0 -224
  250. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py +0 -825
  251. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_categorical_ops.py +0 -562
  252. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_complex_ops.py +0 -368
  253. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_date_ops.py +0 -257
  254. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_datetime_ops.py +0 -260
  255. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_null_ops.py +0 -178
  256. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_arithmetic.py +0 -184
  257. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py +0 -497
  258. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_reverse.py +0 -140
  259. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_string_ops.py +0 -354
  260. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_timedelta_ops.py +0 -219
  261. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_udt_ops.py +0 -192
  262. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/testing_utils.py +0 -228
  263. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/__init__.py +0 -16
  264. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_align.py +0 -118
  265. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_basic_slow.py +0 -198
  266. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_cov_corrwith.py +0 -181
  267. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_dot_frame.py +0 -103
  268. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_dot_series.py +0 -141
  269. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_index.py +0 -109
  270. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_series.py +0 -136
  271. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_setitem_frame.py +0 -125
  272. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_setitem_series.py +0 -217
  273. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/__init__.py +0 -16
  274. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_attrs.py +0 -384
  275. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_constructor.py +0 -598
  276. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_conversion.py +0 -73
  277. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_reindexing.py +0 -869
  278. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_reshaping.py +0 -487
  279. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_spark.py +0 -309
  280. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_take.py +0 -156
  281. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_time_series.py +0 -149
  282. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_truncate.py +0 -163
  283. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/__init__.py +0 -16
  284. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_aggregate.py +0 -311
  285. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_apply_func.py +0 -524
  286. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_cumulative.py +0 -419
  287. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_describe.py +0 -144
  288. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_groupby.py +0 -979
  289. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_head_tail.py +0 -234
  290. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_index.py +0 -206
  291. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_missing_data.py +0 -421
  292. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_split_apply.py +0 -187
  293. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_stat.py +0 -397
  294. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/__init__.py +0 -16
  295. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_align.py +0 -100
  296. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_base.py +0 -2743
  297. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_category.py +0 -484
  298. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_datetime.py +0 -276
  299. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_indexing.py +0 -432
  300. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_reindex.py +0 -310
  301. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_rename.py +0 -257
  302. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_reset_index.py +0 -160
  303. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_timedelta.py +0 -128
  304. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/io/__init__.py +0 -16
  305. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/io/test_io.py +0 -137
  306. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/__init__.py +0 -16
  307. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot.py +0 -170
  308. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot_matplotlib.py +0 -547
  309. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot_plotly.py +0 -285
  310. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot.py +0 -106
  311. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot_matplotlib.py +0 -409
  312. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot_plotly.py +0 -247
  313. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/__init__.py +0 -16
  314. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_all_any.py +0 -105
  315. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_arg_ops.py +0 -197
  316. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_as_of.py +0 -137
  317. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_as_type.py +0 -227
  318. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_compute.py +0 -634
  319. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_conversion.py +0 -88
  320. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_cumulative.py +0 -139
  321. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_index.py +0 -475
  322. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_missing_data.py +0 -265
  323. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_series.py +0 -818
  324. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_sort.py +0 -162
  325. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_stat.py +0 -780
  326. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_categorical.py +0 -741
  327. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_config.py +0 -160
  328. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_csv.py +0 -453
  329. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_dataframe_conversion.py +0 -281
  330. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_dataframe_spark_io.py +0 -487
  331. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_default_index.py +0 -109
  332. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ewm.py +0 -434
  333. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_expanding.py +0 -253
  334. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_extension.py +0 -152
  335. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_frame_spark.py +0 -162
  336. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_generic_functions.py +0 -234
  337. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_indexing.py +0 -1339
  338. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_indexops_spark.py +0 -82
  339. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_internal.py +0 -124
  340. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_namespace.py +0 -638
  341. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_numpy_compat.py +0 -200
  342. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames.py +0 -1355
  343. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py +0 -655
  344. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_expanding.py +0 -113
  345. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_rolling.py +0 -118
  346. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_repr.py +0 -192
  347. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_resample.py +0 -346
  348. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_reshape.py +0 -495
  349. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_rolling.py +0 -263
  350. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_scalars.py +0 -59
  351. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_conversion.py +0 -85
  352. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_datetime.py +0 -364
  353. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_string.py +0 -362
  354. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_spark_functions.py +0 -46
  355. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_sql.py +0 -123
  356. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_stats.py +0 -581
  357. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_typedef.py +0 -447
  358. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_utils.py +0 -301
  359. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_window.py +0 -465
  360. snowflake/snowpark_connect/includes/python/pyspark/resource/tests/__init__.py +0 -16
  361. snowflake/snowpark_connect/includes/python/pyspark/resource/tests/test_resources.py +0 -83
  362. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/__init__.py +0 -16
  363. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/__init__.py +0 -16
  364. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/__init__.py +0 -16
  365. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/test_artifact.py +0 -420
  366. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/test_client.py +0 -358
  367. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/__init__.py +0 -16
  368. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_foreach.py +0 -36
  369. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_foreach_batch.py +0 -44
  370. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_listener.py +0 -116
  371. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_streaming.py +0 -35
  372. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_basic.py +0 -3612
  373. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_column.py +0 -1042
  374. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_function.py +0 -2381
  375. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_plan.py +0 -1060
  376. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow.py +0 -163
  377. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow_map.py +0 -38
  378. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow_python_udf.py +0 -48
  379. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_catalog.py +0 -36
  380. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_column.py +0 -55
  381. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_conf.py +0 -36
  382. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_dataframe.py +0 -96
  383. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_datasources.py +0 -44
  384. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_errors.py +0 -36
  385. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_functions.py +0 -59
  386. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_group.py +0 -36
  387. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_cogrouped_map.py +0 -59
  388. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_grouped_map.py +0 -74
  389. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_grouped_map_with_state.py +0 -62
  390. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_map.py +0 -58
  391. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf.py +0 -70
  392. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_grouped_agg.py +0 -50
  393. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_scalar.py +0 -68
  394. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_window.py +0 -40
  395. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_readwriter.py +0 -46
  396. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_serde.py +0 -44
  397. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_types.py +0 -100
  398. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_udf.py +0 -100
  399. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_udtf.py +0 -163
  400. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_session.py +0 -181
  401. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_utils.py +0 -42
  402. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/__init__.py +0 -16
  403. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_cogrouped_map.py +0 -623
  404. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_grouped_map.py +0 -869
  405. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_grouped_map_with_state.py +0 -342
  406. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_map.py +0 -436
  407. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf.py +0 -363
  408. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_grouped_agg.py +0 -592
  409. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_scalar.py +0 -1503
  410. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_typehints.py +0 -392
  411. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_typehints_with_future_annotations.py +0 -375
  412. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_window.py +0 -411
  413. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/__init__.py +0 -16
  414. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming.py +0 -401
  415. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_foreach.py +0 -295
  416. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_foreach_batch.py +0 -106
  417. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_listener.py +0 -558
  418. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow.py +0 -1346
  419. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow_map.py +0 -182
  420. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow_python_udf.py +0 -202
  421. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_catalog.py +0 -503
  422. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_column.py +0 -225
  423. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_conf.py +0 -83
  424. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_context.py +0 -201
  425. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_dataframe.py +0 -1931
  426. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_datasources.py +0 -256
  427. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_errors.py +0 -69
  428. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_functions.py +0 -1349
  429. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_group.py +0 -53
  430. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_pandas_sqlmetrics.py +0 -68
  431. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_readwriter.py +0 -283
  432. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_serde.py +0 -155
  433. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_session.py +0 -412
  434. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_types.py +0 -1581
  435. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udf.py +0 -961
  436. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udf_profiler.py +0 -165
  437. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udtf.py +0 -1456
  438. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_utils.py +0 -1686
  439. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/__init__.py +0 -16
  440. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_context.py +0 -184
  441. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_dstream.py +0 -706
  442. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_kinesis.py +0 -118
  443. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_listener.py +0 -160
  444. snowflake/snowpark_connect/includes/python/pyspark/tests/__init__.py +0 -16
  445. snowflake/snowpark_connect/includes/python/pyspark/tests/test_appsubmit.py +0 -306
  446. snowflake/snowpark_connect/includes/python/pyspark/tests/test_broadcast.py +0 -196
  447. snowflake/snowpark_connect/includes/python/pyspark/tests/test_conf.py +0 -44
  448. snowflake/snowpark_connect/includes/python/pyspark/tests/test_context.py +0 -346
  449. snowflake/snowpark_connect/includes/python/pyspark/tests/test_daemon.py +0 -89
  450. snowflake/snowpark_connect/includes/python/pyspark/tests/test_install_spark.py +0 -124
  451. snowflake/snowpark_connect/includes/python/pyspark/tests/test_join.py +0 -69
  452. snowflake/snowpark_connect/includes/python/pyspark/tests/test_memory_profiler.py +0 -167
  453. snowflake/snowpark_connect/includes/python/pyspark/tests/test_pin_thread.py +0 -194
  454. snowflake/snowpark_connect/includes/python/pyspark/tests/test_profiler.py +0 -168
  455. snowflake/snowpark_connect/includes/python/pyspark/tests/test_rdd.py +0 -939
  456. snowflake/snowpark_connect/includes/python/pyspark/tests/test_rddbarrier.py +0 -52
  457. snowflake/snowpark_connect/includes/python/pyspark/tests/test_rddsampler.py +0 -66
  458. snowflake/snowpark_connect/includes/python/pyspark/tests/test_readwrite.py +0 -368
  459. snowflake/snowpark_connect/includes/python/pyspark/tests/test_serializers.py +0 -257
  460. snowflake/snowpark_connect/includes/python/pyspark/tests/test_shuffle.py +0 -267
  461. snowflake/snowpark_connect/includes/python/pyspark/tests/test_stage_sched.py +0 -153
  462. snowflake/snowpark_connect/includes/python/pyspark/tests/test_statcounter.py +0 -130
  463. snowflake/snowpark_connect/includes/python/pyspark/tests/test_taskcontext.py +0 -350
  464. snowflake/snowpark_connect/includes/python/pyspark/tests/test_util.py +0 -97
  465. snowflake/snowpark_connect/includes/python/pyspark/tests/test_worker.py +0 -271
  466. snowpark_connect-0.24.0.dist-info/RECORD +0 -898
  467. {snowpark_connect-0.24.0.data → snowpark_connect-0.25.0.data}/scripts/snowpark-connect +0 -0
  468. {snowpark_connect-0.24.0.data → snowpark_connect-0.25.0.data}/scripts/snowpark-session +0 -0
  469. {snowpark_connect-0.24.0.data → snowpark_connect-0.25.0.data}/scripts/snowpark-submit +0 -0
  470. {snowpark_connect-0.24.0.dist-info → snowpark_connect-0.25.0.dist-info}/WHEEL +0 -0
  471. {snowpark_connect-0.24.0.dist-info → snowpark_connect-0.25.0.dist-info}/licenses/LICENSE-binary +0 -0
  472. {snowpark_connect-0.24.0.dist-info → snowpark_connect-0.25.0.dist-info}/licenses/LICENSE.txt +0 -0
  473. {snowpark_connect-0.24.0.dist-info → snowpark_connect-0.25.0.dist-info}/licenses/NOTICE-binary +0 -0
  474. {snowpark_connect-0.24.0.dist-info → snowpark_connect-0.25.0.dist-info}/top_level.txt +0 -0
@@ -1,1349 +0,0 @@
1
- #
2
- # Licensed to the Apache Software Foundation (ASF) under one or more
3
- # contributor license agreements. See the NOTICE file distributed with
4
- # this work for additional information regarding copyright ownership.
5
- # The ASF licenses this file to You under the Apache License, Version 2.0
6
- # (the "License"); you may not use this file except in compliance with
7
- # the License. You may obtain a copy of the License at
8
- #
9
- # http://www.apache.org/licenses/LICENSE-2.0
10
- #
11
- # Unless required by applicable law or agreed to in writing, software
12
- # distributed under the License is distributed on an "AS IS" BASIS,
13
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
- # See the License for the specific language governing permissions and
15
- # limitations under the License.
16
- #
17
-
18
- from contextlib import redirect_stdout
19
- import datetime
20
- from inspect import getmembers, isfunction
21
- import io
22
- from itertools import chain
23
- import math
24
- import re
25
- import unittest
26
-
27
- from py4j.protocol import Py4JJavaError
28
-
29
- from pyspark.errors import PySparkTypeError, PySparkValueError
30
- from pyspark.sql import Row, Window, functions as F, types
31
- from pyspark.sql.column import Column
32
- from pyspark.testing.sqlutils import ReusedSQLTestCase, SQLTestUtils
33
- from pyspark.testing.utils import have_numpy
34
-
35
-
36
- class FunctionsTestsMixin:
37
- def test_function_parity(self):
38
- # This test compares the available list of functions in pyspark.sql.functions with those
39
- # available in the Scala/Java DataFrame API in org.apache.spark.sql.functions.
40
- #
41
- # NOTE FOR DEVELOPERS:
42
- # If this test fails one of the following needs to happen
43
- # * If a function was added to org.apache.spark.sql.functions it either needs to be added to
44
- # pyspark.sql.functions or added to the below expected_missing_in_py set.
45
- # * If a function was added to pyspark.sql.functions that was already in
46
- # org.apache.spark.sql.functions then it needs to be removed from expected_missing_in_py
47
- # below. If the function has a different name it needs to be added to py_equiv_jvm
48
- # mapping.
49
- # * If it's not related to an added/removed function then likely the exclusion list
50
- # jvm_excluded_fn needs to be updated.
51
-
52
- jvm_fn_set = {name for (name, value) in getmembers(self.sc._jvm.functions)}
53
- py_fn_set = {name for (name, value) in getmembers(F, isfunction) if name[0] != "_"}
54
-
55
- # Functions on the JVM side we do not expect to be available in python because they are
56
- # depreciated, irrelevant to python, or have equivalents.
57
- jvm_excluded_fn = [
58
- "callUDF", # depreciated, use call_udf
59
- "typedlit", # Scala only
60
- "typedLit", # Scala only
61
- "monotonicallyIncreasingId", # depreciated, use monotonically_increasing_id
62
- "not", # equivalent to python ~expression
63
- "any", # equivalent to python ~some
64
- "len", # equivalent to python ~length
65
- "udaf", # used for creating UDAF's which are not supported in PySpark
66
- "random", # namespace conflict with python built-in module
67
- "uuid", # namespace conflict with python built-in module
68
- "chr", # namespace conflict with python built-in function
69
- ]
70
-
71
- jvm_fn_set.difference_update(jvm_excluded_fn)
72
-
73
- # For functions that are named differently in pyspark this is the mapping of their
74
- # python name to the JVM equivalent
75
- py_equiv_jvm = {"create_map": "map"}
76
- for py_name, jvm_name in py_equiv_jvm.items():
77
- if py_name in py_fn_set:
78
- py_fn_set.remove(py_name)
79
- py_fn_set.add(jvm_name)
80
-
81
- missing_in_py = jvm_fn_set.difference(py_fn_set)
82
-
83
- # Functions that we expect to be missing in python until they are added to pyspark
84
- expected_missing_in_py = set()
85
-
86
- self.assertEqual(
87
- expected_missing_in_py, missing_in_py, "Missing functions in pyspark not as expected"
88
- )
89
-
90
- def test_explode(self):
91
- d = [
92
- Row(a=1, intlist=[1, 2, 3], mapfield={"a": "b"}),
93
- Row(a=1, intlist=[], mapfield={}),
94
- Row(a=1, intlist=None, mapfield=None),
95
- ]
96
- data = self.spark.createDataFrame(d)
97
-
98
- result = data.select(F.explode(data.intlist).alias("a")).select("a").collect()
99
- self.assertEqual(result[0][0], 1)
100
- self.assertEqual(result[1][0], 2)
101
- self.assertEqual(result[2][0], 3)
102
-
103
- result = data.select(F.explode(data.mapfield).alias("a", "b")).select("a", "b").collect()
104
- self.assertEqual(result[0][0], "a")
105
- self.assertEqual(result[0][1], "b")
106
-
107
- result = [tuple(x) for x in data.select(F.posexplode_outer("intlist")).collect()]
108
- self.assertEqual(result, [(0, 1), (1, 2), (2, 3), (None, None), (None, None)])
109
-
110
- result = [tuple(x) for x in data.select(F.posexplode_outer("mapfield")).collect()]
111
- self.assertEqual(result, [(0, "a", "b"), (None, None, None), (None, None, None)])
112
-
113
- result = [x[0] for x in data.select(F.explode_outer("intlist")).collect()]
114
- self.assertEqual(result, [1, 2, 3, None, None])
115
-
116
- result = [tuple(x) for x in data.select(F.explode_outer("mapfield")).collect()]
117
- self.assertEqual(result, [("a", "b"), (None, None), (None, None)])
118
-
119
- def test_inline(self):
120
- d = [
121
- Row(structlist=[Row(b=1, c=2), Row(b=3, c=4)]),
122
- Row(structlist=[Row(b=None, c=5), None]),
123
- Row(structlist=[]),
124
- ]
125
- data = self.spark.createDataFrame(d)
126
-
127
- result = [tuple(x) for x in data.select(F.inline(data.structlist)).collect()]
128
- self.assertEqual(result, [(1, 2), (3, 4), (None, 5), (None, None)])
129
-
130
- result = [tuple(x) for x in data.select(F.inline_outer(data.structlist)).collect()]
131
- self.assertEqual(result, [(1, 2), (3, 4), (None, 5), (None, None), (None, None)])
132
-
133
- def test_basic_functions(self):
134
- rdd = self.sc.parallelize(['{"foo":"bar"}', '{"foo":"baz"}'])
135
- df = self.spark.read.json(rdd)
136
- df.count()
137
- df.collect()
138
- df.schema
139
-
140
- # cache and checkpoint
141
- self.assertFalse(df.is_cached)
142
- df.persist()
143
- df.unpersist(True)
144
- df.cache()
145
- self.assertTrue(df.is_cached)
146
- self.assertEqual(2, df.count())
147
-
148
- with self.tempView("temp"):
149
- df.createOrReplaceTempView("temp")
150
- df = self.spark.sql("select foo from temp")
151
- df.count()
152
- df.collect()
153
-
154
- def test_corr(self):
155
- df = self.spark.createDataFrame([Row(a=i, b=math.sqrt(i)) for i in range(10)])
156
- corr = df.stat.corr("a", "b")
157
- self.assertTrue(abs(corr - 0.95734012) < 1e-6)
158
-
159
- def test_sampleby(self):
160
- df = self.spark.createDataFrame([Row(a=i, b=(i % 3)) for i in range(100)])
161
- sampled = df.stat.sampleBy("b", fractions={0: 0.5, 1: 0.5}, seed=0)
162
- self.assertTrue(35 <= sampled.count() <= 36)
163
-
164
- with self.assertRaises(PySparkTypeError) as pe:
165
- df.sampleBy(10, fractions={0: 0.5, 1: 0.5})
166
-
167
- self.check_error(
168
- exception=pe.exception,
169
- error_class="NOT_COLUMN_OR_STR",
170
- message_parameters={"arg_name": "col", "arg_type": "int"},
171
- )
172
-
173
- with self.assertRaises(PySparkTypeError) as pe:
174
- df.sampleBy("b", fractions=[0.5, 0.5])
175
-
176
- self.check_error(
177
- exception=pe.exception,
178
- error_class="NOT_DICT",
179
- message_parameters={"arg_name": "fractions", "arg_type": "list"},
180
- )
181
-
182
- with self.assertRaises(PySparkTypeError) as pe:
183
- df.sampleBy("b", fractions={None: 0.5, 1: 0.5})
184
-
185
- self.check_error(
186
- exception=pe.exception,
187
- error_class="DISALLOWED_TYPE_FOR_CONTAINER",
188
- message_parameters={
189
- "arg_name": "fractions",
190
- "arg_type": "dict",
191
- "allowed_types": "float, int, str",
192
- "return_type": "NoneType",
193
- },
194
- )
195
-
196
- def test_cov(self):
197
- df = self.spark.createDataFrame([Row(a=i, b=2 * i) for i in range(10)])
198
- cov = df.stat.cov("a", "b")
199
- self.assertTrue(abs(cov - 55.0 / 3) < 1e-6)
200
-
201
- with self.assertRaises(PySparkTypeError) as pe:
202
- df.stat.cov(10, "b")
203
-
204
- self.check_error(
205
- exception=pe.exception,
206
- error_class="NOT_STR",
207
- message_parameters={"arg_name": "col1", "arg_type": "int"},
208
- )
209
-
210
- with self.assertRaises(PySparkTypeError) as pe:
211
- df.stat.cov("a", True)
212
-
213
- self.check_error(
214
- exception=pe.exception,
215
- error_class="NOT_STR",
216
- message_parameters={"arg_name": "col2", "arg_type": "bool"},
217
- )
218
-
219
- def test_crosstab(self):
220
- df = self.spark.createDataFrame([Row(a=i % 3, b=i % 2) for i in range(1, 7)])
221
- ct = df.stat.crosstab("a", "b").collect()
222
- ct = sorted(ct, key=lambda x: x[0])
223
- for i, row in enumerate(ct):
224
- self.assertEqual(row[0], str(i))
225
- self.assertTrue(row[1], 1)
226
- self.assertTrue(row[2], 1)
227
-
228
- def test_math_functions(self):
229
- df = self.spark.createDataFrame([Row(a=i, b=2 * i) for i in range(10)])
230
-
231
- SQLTestUtils.assert_close(
232
- [math.cos(i) for i in range(10)], df.select(F.cos(df.a)).collect()
233
- )
234
- SQLTestUtils.assert_close([math.cos(i) for i in range(10)], df.select(F.cos("a")).collect())
235
- SQLTestUtils.assert_close(
236
- [math.sin(i) for i in range(10)], df.select(F.sin(df.a)).collect()
237
- )
238
- SQLTestUtils.assert_close(
239
- [math.sin(i) for i in range(10)], df.select(F.sin(df["a"])).collect()
240
- )
241
- SQLTestUtils.assert_close(
242
- [math.pow(i, 2 * i) for i in range(10)], df.select(F.pow(df.a, df.b)).collect()
243
- )
244
- SQLTestUtils.assert_close(
245
- [math.pow(i, 2) for i in range(10)], df.select(F.pow(df.a, 2)).collect()
246
- )
247
- SQLTestUtils.assert_close(
248
- [math.pow(i, 2) for i in range(10)], df.select(F.pow(df.a, 2.0)).collect()
249
- )
250
- SQLTestUtils.assert_close(
251
- [math.hypot(i, 2 * i) for i in range(10)], df.select(F.hypot(df.a, df.b)).collect()
252
- )
253
- SQLTestUtils.assert_close(
254
- [math.hypot(i, 2 * i) for i in range(10)], df.select(F.hypot("a", "b")).collect()
255
- )
256
- SQLTestUtils.assert_close(
257
- [math.hypot(i, 2) for i in range(10)], df.select(F.hypot("a", 2)).collect()
258
- )
259
- SQLTestUtils.assert_close(
260
- [math.hypot(i, 2) for i in range(10)], df.select(F.hypot(df.a, 2)).collect()
261
- )
262
-
263
- def test_inverse_trig_functions(self):
264
- df = self.spark.createDataFrame([Row(a=i * 0.2, b=i * -0.2) for i in range(10)])
265
-
266
- def check(trig, inv, y_axis_symmetrical):
267
- SQLTestUtils.assert_close(
268
- [n * 0.2 for n in range(10)],
269
- df.select(inv(trig(df.a))).collect(),
270
- )
271
- if y_axis_symmetrical:
272
- SQLTestUtils.assert_close(
273
- [n * 0.2 for n in range(10)],
274
- df.select(inv(trig(df.b))).collect(),
275
- )
276
- else:
277
- SQLTestUtils.assert_close(
278
- [n * -0.2 for n in range(10)],
279
- df.select(inv(trig(df.b))).collect(),
280
- )
281
-
282
- check(F.cosh, F.acosh, y_axis_symmetrical=True)
283
- check(F.sinh, F.asinh, y_axis_symmetrical=False)
284
- check(F.tanh, F.atanh, y_axis_symmetrical=False)
285
-
286
- def test_reciprocal_trig_functions(self):
287
- # SPARK-36683: Tests for reciprocal trig functions (SEC, CSC and COT)
288
- lst = [
289
- 0.0,
290
- math.pi / 6,
291
- math.pi / 4,
292
- math.pi / 3,
293
- math.pi / 2,
294
- math.pi,
295
- 3 * math.pi / 2,
296
- 2 * math.pi,
297
- ]
298
-
299
- df = self.spark.createDataFrame(lst, types.DoubleType())
300
-
301
- def to_reciprocal_trig(func):
302
- return [1.0 / func(i) if func(i) != 0 else math.inf for i in lst]
303
-
304
- SQLTestUtils.assert_close(
305
- to_reciprocal_trig(math.cos), df.select(F.sec(df.value)).collect()
306
- )
307
- SQLTestUtils.assert_close(
308
- to_reciprocal_trig(math.sin), df.select(F.csc(df.value)).collect()
309
- )
310
- SQLTestUtils.assert_close(
311
- to_reciprocal_trig(math.tan), df.select(F.cot(df.value)).collect()
312
- )
313
-
314
- def test_rand_functions(self):
315
- df = self.spark.createDataFrame([Row(key=i, value=str(i)) for i in range(100)])
316
-
317
- rnd = df.select("key", F.rand()).collect()
318
- for row in rnd:
319
- assert row[1] >= 0.0 and row[1] <= 1.0, "got: %s" % row[1]
320
- rndn = df.select("key", F.randn(5)).collect()
321
- for row in rndn:
322
- assert row[1] >= -4.0 and row[1] <= 4.0, "got: %s" % row[1]
323
-
324
- # If the specified seed is 0, we should use it.
325
- # https://issues.apache.org/jira/browse/SPARK-9691
326
- rnd1 = df.select("key", F.rand(0)).collect()
327
- rnd2 = df.select("key", F.rand(0)).collect()
328
- self.assertEqual(sorted(rnd1), sorted(rnd2))
329
-
330
- rndn1 = df.select("key", F.randn(0)).collect()
331
- rndn2 = df.select("key", F.randn(0)).collect()
332
- self.assertEqual(sorted(rndn1), sorted(rndn2))
333
-
334
- def test_string_functions(self):
335
- string_functions = [
336
- "upper",
337
- "lower",
338
- "ascii",
339
- "base64",
340
- "unbase64",
341
- "ltrim",
342
- "rtrim",
343
- "trim",
344
- ]
345
-
346
- df = self.spark.createDataFrame([["nick"]], schema=["name"])
347
- with self.assertRaises(PySparkTypeError) as pe:
348
- df.select(F.col("name").substr(0, F.lit(1)))
349
-
350
- self.check_error(
351
- exception=pe.exception,
352
- error_class="NOT_SAME_TYPE",
353
- message_parameters={
354
- "arg_name1": "startPos",
355
- "arg_name2": "length",
356
- "arg_type1": "int",
357
- "arg_type2": "Column",
358
- },
359
- )
360
-
361
- for name in string_functions:
362
- self.assertEqual(
363
- df.select(getattr(F, name)("name")).first()[0],
364
- df.select(getattr(F, name)(F.col("name"))).first()[0],
365
- )
366
-
367
- def test_octet_length_function(self):
368
- # SPARK-36751: add octet length api for python
369
- df = self.spark.createDataFrame([("cat",), ("\U0001F408",)], ["cat"])
370
- actual = df.select(F.octet_length("cat")).collect()
371
- self.assertEqual([Row(3), Row(4)], actual)
372
-
373
- def test_bit_length_function(self):
374
- # SPARK-36751: add bit length api for python
375
- df = self.spark.createDataFrame([("cat",), ("\U0001F408",)], ["cat"])
376
- actual = df.select(F.bit_length("cat")).collect()
377
- self.assertEqual([Row(24), Row(32)], actual)
378
-
379
- def test_array_contains_function(self):
380
- df = self.spark.createDataFrame([(["1", "2", "3"],), ([],)], ["data"])
381
- actual = df.select(F.array_contains(df.data, "1").alias("b")).collect()
382
- self.assertEqual([Row(b=True), Row(b=False)], actual)
383
-
384
- def test_levenshtein_function(self):
385
- df = self.spark.createDataFrame([("kitten", "sitting")], ["l", "r"])
386
- actual_without_threshold = df.select(F.levenshtein(df.l, df.r).alias("b")).collect()
387
- self.assertEqual([Row(b=3)], actual_without_threshold)
388
- actual_with_threshold = df.select(F.levenshtein(df.l, df.r, 2).alias("b")).collect()
389
- self.assertEqual([Row(b=-1)], actual_with_threshold)
390
-
391
- def test_between_function(self):
392
- df = self.spark.createDataFrame(
393
- [Row(a=1, b=2, c=3), Row(a=2, b=1, c=3), Row(a=4, b=1, c=4)]
394
- )
395
- self.assertEqual(
396
- [Row(a=2, b=1, c=3), Row(a=4, b=1, c=4)], df.filter(df.a.between(df.b, df.c)).collect()
397
- )
398
-
399
- def test_dayofweek(self):
400
- dt = datetime.datetime(2017, 11, 6)
401
- df = self.spark.createDataFrame([Row(date=dt)])
402
- row = df.select(F.dayofweek(df.date)).first()
403
- self.assertEqual(row[0], 2)
404
-
405
- # Test added for SPARK-37738; change Python API to accept both col & int as input
406
- def test_date_add_function(self):
407
- dt = datetime.date(2021, 12, 27)
408
-
409
- # Note; number var in Python gets converted to LongType column;
410
- # this is not supported by the function, so cast to Integer explicitly
411
- df = self.spark.createDataFrame([Row(date=dt, add=2)], "date date, add integer")
412
-
413
- self.assertTrue(
414
- all(
415
- df.select(
416
- F.date_add(df.date, df.add) == datetime.date(2021, 12, 29),
417
- F.date_add(df.date, "add") == datetime.date(2021, 12, 29),
418
- F.date_add(df.date, 3) == datetime.date(2021, 12, 30),
419
- ).first()
420
- )
421
- )
422
-
423
- # Test added for SPARK-37738; change Python API to accept both col & int as input
424
- def test_date_sub_function(self):
425
- dt = datetime.date(2021, 12, 27)
426
-
427
- # Note; number var in Python gets converted to LongType column;
428
- # this is not supported by the function, so cast to Integer explicitly
429
- df = self.spark.createDataFrame([Row(date=dt, sub=2)], "date date, sub integer")
430
-
431
- self.assertTrue(
432
- all(
433
- df.select(
434
- F.date_sub(df.date, df.sub) == datetime.date(2021, 12, 25),
435
- F.date_sub(df.date, "sub") == datetime.date(2021, 12, 25),
436
- F.date_sub(df.date, 3) == datetime.date(2021, 12, 24),
437
- ).first()
438
- )
439
- )
440
-
441
- # Test added for SPARK-37738; change Python API to accept both col & int as input
442
- def test_add_months_function(self):
443
- dt = datetime.date(2021, 12, 27)
444
-
445
- # Note; number in Python gets converted to LongType column;
446
- # this is not supported by the function, so cast to Integer explicitly
447
- df = self.spark.createDataFrame([Row(date=dt, add=2)], "date date, add integer")
448
-
449
- self.assertTrue(
450
- all(
451
- df.select(
452
- F.add_months(df.date, df.add) == datetime.date(2022, 2, 27),
453
- F.add_months(df.date, "add") == datetime.date(2022, 2, 27),
454
- F.add_months(df.date, 3) == datetime.date(2022, 3, 27),
455
- ).first()
456
- )
457
- )
458
-
459
- def test_make_date(self):
460
- # SPARK-36554: expose make_date expression
461
- df = self.spark.createDataFrame([(2020, 6, 26)], ["Y", "M", "D"])
462
- row_from_col = df.select(F.make_date(df.Y, df.M, df.D)).first()
463
- self.assertEqual(row_from_col[0], datetime.date(2020, 6, 26))
464
- row_from_name = df.select(F.make_date("Y", "M", "D")).first()
465
- self.assertEqual(row_from_name[0], datetime.date(2020, 6, 26))
466
-
467
- def test_expr(self):
468
- row = Row(a="length string", b=75)
469
- df = self.spark.createDataFrame([row])
470
- result = df.select(F.expr("length(a)")).collect()[0].asDict()
471
- self.assertEqual(13, result["length(a)"])
472
-
473
- # add test for SPARK-10577 (test broadcast join hint)
474
- def test_functions_broadcast(self):
475
- df1 = self.spark.createDataFrame([(1, "1"), (2, "2")], ("key", "value"))
476
- df2 = self.spark.createDataFrame([(1, "1"), (2, "2")], ("key", "value"))
477
-
478
- # equijoin - should be converted into broadcast join
479
- with io.StringIO() as buf, redirect_stdout(buf):
480
- df1.join(F.broadcast(df2), "key").explain(True)
481
- self.assertGreaterEqual(buf.getvalue().count("Broadcast"), 1)
482
-
483
- # no join key -- should not be a broadcast join
484
- with io.StringIO() as buf, redirect_stdout(buf):
485
- df1.crossJoin(F.broadcast(df2)).explain(True)
486
- self.assertGreaterEqual(buf.getvalue().count("Broadcast"), 1)
487
-
488
- # planner should not crash without a join
489
- F.broadcast(df1).explain(True)
490
-
491
- def test_first_last_ignorenulls(self):
492
- df = self.spark.range(0, 100)
493
- df2 = df.select(F.when(df.id % 3 == 0, None).otherwise(df.id).alias("id"))
494
- df3 = df2.select(
495
- F.first(df2.id, False).alias("a"),
496
- F.first(df2.id, True).alias("b"),
497
- F.last(df2.id, False).alias("c"),
498
- F.last(df2.id, True).alias("d"),
499
- )
500
- self.assertEqual([Row(a=None, b=1, c=None, d=98)], df3.collect())
501
-
502
- def test_approxQuantile(self):
503
- df = self.spark.createDataFrame([Row(a=i, b=i + 10) for i in range(10)])
504
- for f in ["a", "a"]:
505
- aq = df.stat.approxQuantile(f, [0.1, 0.5, 0.9], 0.1)
506
- self.assertTrue(isinstance(aq, list))
507
- self.assertEqual(len(aq), 3)
508
- self.assertTrue(all(isinstance(q, float) for q in aq))
509
- aqs = df.stat.approxQuantile(["a", "b"], [0.1, 0.5, 0.9], 0.1)
510
- self.assertTrue(isinstance(aqs, list))
511
- self.assertEqual(len(aqs), 2)
512
- self.assertTrue(isinstance(aqs[0], list))
513
- self.assertEqual(len(aqs[0]), 3)
514
- self.assertTrue(all(isinstance(q, float) for q in aqs[0]))
515
- self.assertTrue(isinstance(aqs[1], list))
516
- self.assertEqual(len(aqs[1]), 3)
517
- self.assertTrue(all(isinstance(q, float) for q in aqs[1]))
518
- aqt = df.stat.approxQuantile(("a", "b"), [0.1, 0.5, 0.9], 0.1)
519
- self.assertTrue(isinstance(aqt, list))
520
- self.assertEqual(len(aqt), 2)
521
- self.assertTrue(isinstance(aqt[0], list))
522
- self.assertEqual(len(aqt[0]), 3)
523
- self.assertTrue(all(isinstance(q, float) for q in aqt[0]))
524
- self.assertTrue(isinstance(aqt[1], list))
525
- self.assertEqual(len(aqt[1]), 3)
526
- self.assertTrue(all(isinstance(q, float) for q in aqt[1]))
527
- self.assertRaises(TypeError, lambda: df.stat.approxQuantile(123, [0.1, 0.9], 0.1))
528
- self.assertRaises(TypeError, lambda: df.stat.approxQuantile(("a", 123), [0.1, 0.9], 0.1))
529
- self.assertRaises(TypeError, lambda: df.stat.approxQuantile(["a", 123], [0.1, 0.9], 0.1))
530
-
531
- def test_sorting_functions_with_column(self):
532
- self.check_sorting_functions_with_column(Column)
533
-
534
- def check_sorting_functions_with_column(self, tpe):
535
- funs = [F.asc_nulls_first, F.asc_nulls_last, F.desc_nulls_first, F.desc_nulls_last]
536
- exprs = [F.col("x"), "x"]
537
-
538
- for fun in funs:
539
- for _expr in exprs:
540
- res = fun(_expr)
541
- self.assertIsInstance(res, tpe)
542
- self.assertIn(f"""'x {fun.__name__.replace("_", " ").upper()}'""", str(res))
543
-
544
- for _expr in exprs:
545
- res = F.asc(_expr)
546
- self.assertIsInstance(res, tpe)
547
- self.assertIn("""'x ASC NULLS FIRST'""", str(res))
548
-
549
- for _expr in exprs:
550
- res = F.desc(_expr)
551
- self.assertIsInstance(res, tpe)
552
- self.assertIn("""'x DESC NULLS LAST'""", str(res))
553
-
554
- def test_sort_with_nulls_order(self):
555
- df = self.spark.createDataFrame(
556
- [("Tom", 80), (None, 60), ("Alice", 50)], ["name", "height"]
557
- )
558
- self.assertEqual(
559
- df.select(df.name).orderBy(F.asc_nulls_first("name")).collect(),
560
- [Row(name=None), Row(name="Alice"), Row(name="Tom")],
561
- )
562
- self.assertEqual(
563
- df.select(df.name).orderBy(F.asc_nulls_last("name")).collect(),
564
- [Row(name="Alice"), Row(name="Tom"), Row(name=None)],
565
- )
566
- self.assertEqual(
567
- df.select(df.name).orderBy(F.desc_nulls_first("name")).collect(),
568
- [Row(name=None), Row(name="Tom"), Row(name="Alice")],
569
- )
570
- self.assertEqual(
571
- df.select(df.name).orderBy(F.desc_nulls_last("name")).collect(),
572
- [Row(name="Tom"), Row(name="Alice"), Row(name=None)],
573
- )
574
-
575
- def test_input_file_name_reset_for_rdd(self):
576
- rdd = self.sc.textFile("python/test_support/hello/hello.txt").map(lambda x: {"data": x})
577
- df = self.spark.createDataFrame(rdd, "data STRING")
578
- df.select(F.input_file_name().alias("file")).collect()
579
-
580
- non_file_df = self.spark.range(100).select(F.input_file_name())
581
-
582
- results = non_file_df.collect()
583
- self.assertTrue(len(results) == 100)
584
-
585
- # [SPARK-24605]: if everything was properly reset after the last job, this should return
586
- # empty string rather than the file read in the last job.
587
- for result in results:
588
- self.assertEqual(result[0], "")
589
-
590
- def test_slice(self):
591
- df = self.spark.createDataFrame(
592
- [
593
- (
594
- [1, 2, 3],
595
- 2,
596
- 2,
597
- ),
598
- (
599
- [4, 5],
600
- 2,
601
- 2,
602
- ),
603
- ],
604
- ["x", "index", "len"],
605
- )
606
-
607
- expected = [Row(sliced=[2, 3]), Row(sliced=[5])]
608
- self.assertEqual(df.select(F.slice(df.x, 2, 2).alias("sliced")).collect(), expected)
609
- self.assertEqual(
610
- df.select(F.slice(df.x, F.lit(2), F.lit(2)).alias("sliced")).collect(), expected
611
- )
612
- self.assertEqual(
613
- df.select(F.slice("x", "index", "len").alias("sliced")).collect(), expected
614
- )
615
-
616
- self.assertEqual(
617
- df.select(F.slice(df.x, F.size(df.x) - 1, F.lit(1)).alias("sliced")).collect(),
618
- [Row(sliced=[2]), Row(sliced=[4])],
619
- )
620
- self.assertEqual(
621
- df.select(F.slice(df.x, F.lit(1), F.size(df.x) - 1).alias("sliced")).collect(),
622
- [Row(sliced=[1, 2]), Row(sliced=[4])],
623
- )
624
-
625
- def test_array_repeat(self):
626
- df = self.spark.range(1)
627
- df = df.withColumn("repeat_n", F.lit(3))
628
-
629
- expected = [Row(val=[0, 0, 0])]
630
- self.assertEqual(df.select(F.array_repeat("id", 3).alias("val")).collect(), expected)
631
- self.assertEqual(df.select(F.array_repeat("id", F.lit(3)).alias("val")).collect(), expected)
632
- self.assertEqual(
633
- df.select(F.array_repeat("id", "repeat_n").alias("val")).collect(), expected
634
- )
635
-
636
- def test_input_file_name_udf(self):
637
- df = self.spark.read.text("python/test_support/hello/hello.txt")
638
- df = df.select(F.udf(lambda x: x)("value"), F.input_file_name().alias("file"))
639
- file_name = df.collect()[0].file
640
- self.assertTrue("python/test_support/hello/hello.txt" in file_name)
641
-
642
- def test_least(self):
643
- df = self.spark.createDataFrame([(1, 4, 3)], ["a", "b", "c"])
644
-
645
- expected = [Row(least=1)]
646
- self.assertEqual(df.select(F.least(df.a, df.b, df.c).alias("least")).collect(), expected)
647
- self.assertEqual(
648
- df.select(F.least(F.lit(3), F.lit(5), F.lit(1)).alias("least")).collect(), expected
649
- )
650
- self.assertEqual(df.select(F.least("a", "b", "c").alias("least")).collect(), expected)
651
-
652
- with self.assertRaises(PySparkValueError) as pe:
653
- df.select(F.least(df.a).alias("least")).collect()
654
-
655
- self.check_error(
656
- exception=pe.exception,
657
- error_class="WRONG_NUM_COLUMNS",
658
- message_parameters={"func_name": "least", "num_cols": "2"},
659
- )
660
-
661
- def test_overlay(self):
662
- actual = list(
663
- chain.from_iterable(
664
- [
665
- re.findall("(overlay\\(.*\\))", str(x))
666
- for x in [
667
- F.overlay(F.col("foo"), F.col("bar"), 1),
668
- F.overlay("x", "y", 3),
669
- F.overlay(F.col("x"), F.col("y"), 1, 3),
670
- F.overlay("x", "y", 2, 5),
671
- F.overlay("x", "y", F.lit(11)),
672
- F.overlay("x", "y", F.lit(2), F.lit(5)),
673
- ]
674
- ]
675
- )
676
- )
677
-
678
- expected = [
679
- "overlay(foo, bar, 1, -1)",
680
- "overlay(x, y, 3, -1)",
681
- "overlay(x, y, 1, 3)",
682
- "overlay(x, y, 2, 5)",
683
- "overlay(x, y, 11, -1)",
684
- "overlay(x, y, 2, 5)",
685
- ]
686
-
687
- self.assertListEqual(actual, expected)
688
-
689
- df = self.spark.createDataFrame([("SPARK_SQL", "CORE", 7, 0)], ("x", "y", "pos", "len"))
690
-
691
- exp = [Row(ol="SPARK_CORESQL")]
692
- self.assertEqual(df.select(F.overlay(df.x, df.y, 7, 0).alias("ol")).collect(), exp)
693
- self.assertEqual(
694
- df.select(F.overlay(df.x, df.y, F.lit(7), F.lit(0)).alias("ol")).collect(), exp
695
- )
696
- self.assertEqual(df.select(F.overlay("x", "y", "pos", "len").alias("ol")).collect(), exp)
697
-
698
- with self.assertRaises(PySparkTypeError) as pe:
699
- df.select(F.overlay(df.x, df.y, 7.5, 0).alias("ol")).collect()
700
-
701
- self.check_error(
702
- exception=pe.exception,
703
- error_class="NOT_COLUMN_OR_INT_OR_STR",
704
- message_parameters={"arg_name": "pos", "arg_type": "float"},
705
- )
706
-
707
- with self.assertRaises(PySparkTypeError) as pe:
708
- df.select(F.overlay(df.x, df.y, 7, 0.5).alias("ol")).collect()
709
-
710
- self.check_error(
711
- exception=pe.exception,
712
- error_class="NOT_COLUMN_OR_INT_OR_STR",
713
- message_parameters={"arg_name": "len", "arg_type": "float"},
714
- )
715
-
716
- def test_percentile(self):
717
- actual = list(
718
- chain.from_iterable(
719
- [
720
- re.findall("(percentile\\(.*\\))", str(x))
721
- for x in [
722
- F.percentile(F.col("foo"), F.lit(0.5)),
723
- F.percentile(F.col("bar"), 0.25, 2),
724
- F.percentile(F.col("bar"), [0.25, 0.5, 0.75]),
725
- F.percentile(F.col("foo"), (0.05, 0.95), 100),
726
- F.percentile("foo", 0.5),
727
- F.percentile("bar", [0.1, 0.9], F.lit(10)),
728
- ]
729
- ]
730
- )
731
- )
732
-
733
- expected = [
734
- "percentile(foo, 0.5, 1)",
735
- "percentile(bar, 0.25, 2)",
736
- "percentile(bar, array(0.25, 0.5, 0.75), 1)",
737
- "percentile(foo, array(0.05, 0.95), 100)",
738
- "percentile(foo, 0.5, 1)",
739
- "percentile(bar, array(0.1, 0.9), 10)",
740
- ]
741
-
742
- self.assertListEqual(actual, expected)
743
-
744
- def test_median(self):
745
- actual = list(
746
- chain.from_iterable(
747
- [
748
- re.findall("(median\\(.*\\))", str(x))
749
- for x in [
750
- F.median(F.col("foo")),
751
- ]
752
- ]
753
- )
754
- )
755
-
756
- expected = [
757
- "median(foo)",
758
- ]
759
-
760
- self.assertListEqual(actual, expected)
761
-
762
- def test_percentile_approx(self):
763
- actual = list(
764
- chain.from_iterable(
765
- [
766
- re.findall("(percentile_approx\\(.*\\))", str(x))
767
- for x in [
768
- F.percentile_approx(F.col("foo"), F.lit(0.5)),
769
- F.percentile_approx(F.col("bar"), 0.25, 42),
770
- F.percentile_approx(F.col("bar"), [0.25, 0.5, 0.75]),
771
- F.percentile_approx(F.col("foo"), (0.05, 0.95), 100),
772
- F.percentile_approx("foo", 0.5),
773
- F.percentile_approx("bar", [0.1, 0.9], F.lit(10)),
774
- ]
775
- ]
776
- )
777
- )
778
-
779
- expected = [
780
- "percentile_approx(foo, 0.5, 10000)",
781
- "percentile_approx(bar, 0.25, 42)",
782
- "percentile_approx(bar, array(0.25, 0.5, 0.75), 10000)",
783
- "percentile_approx(foo, array(0.05, 0.95), 100)",
784
- "percentile_approx(foo, 0.5, 10000)",
785
- "percentile_approx(bar, array(0.1, 0.9), 10)",
786
- ]
787
-
788
- self.assertListEqual(actual, expected)
789
-
790
- def test_nth_value(self):
791
- df = self.spark.createDataFrame(
792
- [
793
- ("a", 0, None),
794
- ("a", 1, "x"),
795
- ("a", 2, "y"),
796
- ("a", 3, "z"),
797
- ("a", 4, None),
798
- ("b", 1, None),
799
- ("b", 2, None),
800
- ],
801
- schema=("key", "order", "value"),
802
- )
803
- w = Window.partitionBy("key").orderBy("order")
804
-
805
- rs = df.select(
806
- df.key,
807
- df.order,
808
- F.nth_value("value", 2).over(w),
809
- F.nth_value("value", 2, False).over(w),
810
- F.nth_value("value", 2, True).over(w),
811
- ).collect()
812
-
813
- expected = [
814
- ("a", 0, None, None, None),
815
- ("a", 1, "x", "x", None),
816
- ("a", 2, "x", "x", "y"),
817
- ("a", 3, "x", "x", "y"),
818
- ("a", 4, "x", "x", "y"),
819
- ("b", 1, None, None, None),
820
- ("b", 2, None, None, None),
821
- ]
822
-
823
- for r, ex in zip(sorted(rs), sorted(expected)):
824
- self.assertEqual(tuple(r), ex[: len(r)])
825
-
826
- def test_higher_order_function_failures(self):
827
- # Should fail with varargs
828
- with self.assertRaises(PySparkValueError) as pe:
829
- F.transform(F.col("foo"), lambda *x: F.lit(1))
830
-
831
- self.check_error(
832
- exception=pe.exception,
833
- error_class="UNSUPPORTED_PARAM_TYPE_FOR_HIGHER_ORDER_FUNCTION",
834
- message_parameters={"func_name": "<lambda>"},
835
- )
836
-
837
- # Should fail with kwargs
838
- with self.assertRaises(PySparkValueError) as pe:
839
- F.transform(F.col("foo"), lambda **x: F.lit(1))
840
-
841
- self.check_error(
842
- exception=pe.exception,
843
- error_class="UNSUPPORTED_PARAM_TYPE_FOR_HIGHER_ORDER_FUNCTION",
844
- message_parameters={"func_name": "<lambda>"},
845
- )
846
-
847
- # Should fail with nullary function
848
- with self.assertRaises(PySparkValueError) as pe:
849
- F.transform(F.col("foo"), lambda: F.lit(1))
850
-
851
- self.check_error(
852
- exception=pe.exception,
853
- error_class="WRONG_NUM_ARGS_FOR_HIGHER_ORDER_FUNCTION",
854
- message_parameters={"func_name": "<lambda>", "num_args": "0"},
855
- )
856
-
857
- # Should fail with quaternary function
858
- with self.assertRaises(PySparkValueError) as pe:
859
- F.transform(F.col("foo"), lambda x1, x2, x3, x4: F.lit(1))
860
-
861
- self.check_error(
862
- exception=pe.exception,
863
- error_class="WRONG_NUM_ARGS_FOR_HIGHER_ORDER_FUNCTION",
864
- message_parameters={"func_name": "<lambda>", "num_args": "4"},
865
- )
866
-
867
- # Should fail if function doesn't return Column
868
- with self.assertRaises(PySparkValueError) as pe:
869
- F.transform(F.col("foo"), lambda x: 1)
870
-
871
- self.check_error(
872
- exception=pe.exception,
873
- error_class="HIGHER_ORDER_FUNCTION_SHOULD_RETURN_COLUMN",
874
- message_parameters={"func_name": "<lambda>", "return_type": "int"},
875
- )
876
-
877
- def test_nested_higher_order_function(self):
878
- # SPARK-35382: lambda vars must be resolved properly in nested higher order functions
879
- df = self.spark.sql("SELECT array(1, 2, 3) as numbers, array('a', 'b', 'c') as letters")
880
-
881
- actual = df.select(
882
- F.flatten(
883
- F.transform(
884
- "numbers",
885
- lambda number: F.transform(
886
- "letters", lambda letter: F.struct(number.alias("n"), letter.alias("l"))
887
- ),
888
- )
889
- )
890
- ).first()[0]
891
-
892
- expected = [
893
- (1, "a"),
894
- (1, "b"),
895
- (1, "c"),
896
- (2, "a"),
897
- (2, "b"),
898
- (2, "c"),
899
- (3, "a"),
900
- (3, "b"),
901
- (3, "c"),
902
- ]
903
-
904
- self.assertEquals(actual, expected)
905
-
906
- def test_window_functions(self):
907
- df = self.spark.createDataFrame([(1, "1"), (2, "2"), (1, "2"), (1, "2")], ["key", "value"])
908
- w = Window.partitionBy("value").orderBy("key")
909
-
910
- sel = df.select(
911
- df.value,
912
- df.key,
913
- F.max("key").over(w.rowsBetween(0, 1)),
914
- F.min("key").over(w.rowsBetween(0, 1)),
915
- F.count("key").over(w.rowsBetween(float("-inf"), float("inf"))),
916
- F.row_number().over(w),
917
- F.rank().over(w),
918
- F.dense_rank().over(w),
919
- F.ntile(2).over(w),
920
- )
921
- rs = sorted(sel.collect())
922
- expected = [
923
- ("1", 1, 1, 1, 1, 1, 1, 1, 1),
924
- ("2", 1, 1, 1, 3, 1, 1, 1, 1),
925
- ("2", 1, 2, 1, 3, 2, 1, 1, 1),
926
- ("2", 2, 2, 2, 3, 3, 3, 2, 2),
927
- ]
928
- for r, ex in zip(rs, expected):
929
- self.assertEqual(tuple(r), ex[: len(r)])
930
-
931
- def test_window_functions_without_partitionBy(self):
932
- df = self.spark.createDataFrame([(1, "1"), (2, "2"), (1, "2"), (1, "2")], ["key", "value"])
933
- w = Window.orderBy("key", df.value)
934
-
935
- sel = df.select(
936
- df.value,
937
- df.key,
938
- F.max("key").over(w.rowsBetween(0, 1)),
939
- F.min("key").over(w.rowsBetween(0, 1)),
940
- F.count("key").over(w.rowsBetween(float("-inf"), float("inf"))),
941
- F.row_number().over(w),
942
- F.rank().over(w),
943
- F.dense_rank().over(w),
944
- F.ntile(2).over(w),
945
- )
946
- rs = sorted(sel.collect())
947
- expected = [
948
- ("1", 1, 1, 1, 4, 1, 1, 1, 1),
949
- ("2", 1, 1, 1, 4, 2, 2, 2, 1),
950
- ("2", 1, 2, 1, 4, 3, 2, 2, 2),
951
- ("2", 2, 2, 2, 4, 4, 4, 3, 2),
952
- ]
953
- for r, ex in zip(rs, expected):
954
- self.assertEqual(tuple(r), ex[: len(r)])
955
-
956
- def test_window_functions_cumulative_sum(self):
957
- df = self.spark.createDataFrame([("one", 1), ("two", 2)], ["key", "value"])
958
-
959
- # Test cumulative sum
960
- sel = df.select(
961
- df.key, F.sum(df.value).over(Window.rowsBetween(Window.unboundedPreceding, 0))
962
- )
963
- rs = sorted(sel.collect())
964
- expected = [("one", 1), ("two", 3)]
965
- for r, ex in zip(rs, expected):
966
- self.assertEqual(tuple(r), ex[: len(r)])
967
-
968
- # Test boundary values less than JVM's Long.MinValue and make sure we don't overflow
969
- sel = df.select(
970
- df.key, F.sum(df.value).over(Window.rowsBetween(Window.unboundedPreceding - 1, 0))
971
- )
972
- rs = sorted(sel.collect())
973
- expected = [("one", 1), ("two", 3)]
974
- for r, ex in zip(rs, expected):
975
- self.assertEqual(tuple(r), ex[: len(r)])
976
-
977
- # Test boundary values greater than JVM's Long.MaxValue and make sure we don't overflow
978
- frame_end = Window.unboundedFollowing + 1
979
- sel = df.select(
980
- df.key, F.sum(df.value).over(Window.rowsBetween(Window.currentRow, frame_end))
981
- )
982
- rs = sorted(sel.collect())
983
- expected = [("one", 3), ("two", 2)]
984
- for r, ex in zip(rs, expected):
985
- self.assertEqual(tuple(r), ex[: len(r)])
986
-
987
- def test_window_time(self):
988
- df = self.spark.createDataFrame(
989
- [(datetime.datetime(2016, 3, 11, 9, 0, 7), 1)], ["date", "val"]
990
- )
991
-
992
- w = df.groupBy(F.window("date", "5 seconds")).agg(F.sum("val").alias("sum"))
993
- r = w.select(
994
- w.window.end.cast("string").alias("end"),
995
- F.window_time(w.window).cast("string").alias("window_time"),
996
- "sum",
997
- ).collect()
998
- self.assertEqual(
999
- r[0], Row(end="2016-03-11 09:00:10", window_time="2016-03-11 09:00:09.999999", sum=1)
1000
- )
1001
-
1002
- def test_collect_functions(self):
1003
- df = self.spark.createDataFrame([(1, "1"), (2, "2"), (1, "2"), (1, "2")], ["key", "value"])
1004
-
1005
- self.assertEqual(sorted(df.select(F.collect_set(df.key).alias("r")).collect()[0].r), [1, 2])
1006
- self.assertEqual(
1007
- sorted(df.select(F.collect_list(df.key).alias("r")).collect()[0].r), [1, 1, 1, 2]
1008
- )
1009
- self.assertEqual(
1010
- sorted(df.select(F.collect_set(df.value).alias("r")).collect()[0].r), ["1", "2"]
1011
- )
1012
- self.assertEqual(
1013
- sorted(df.select(F.collect_list(df.value).alias("r")).collect()[0].r),
1014
- ["1", "2", "2", "2"],
1015
- )
1016
-
1017
- def test_datetime_functions(self):
1018
- df = self.spark.range(1).selectExpr("'2017-01-22' as dateCol")
1019
- parse_result = df.select(F.to_date(F.col("dateCol"))).first()
1020
- self.assertEqual(datetime.date(2017, 1, 22), parse_result["to_date(dateCol)"])
1021
-
1022
- def test_assert_true(self):
1023
- self.check_assert_true(Py4JJavaError)
1024
-
1025
- def check_assert_true(self, tpe):
1026
- df = self.spark.range(3)
1027
-
1028
- self.assertEqual(
1029
- df.select(F.assert_true(df.id < 3)).toDF("val").collect(),
1030
- [Row(val=None), Row(val=None), Row(val=None)],
1031
- )
1032
-
1033
- with self.assertRaisesRegex(tpe, "too big"):
1034
- df.select(F.assert_true(df.id < 2, "too big")).toDF("val").collect()
1035
-
1036
- with self.assertRaisesRegex(tpe, "2000000"):
1037
- df.select(F.assert_true(df.id < 2, df.id * 1e6)).toDF("val").collect()
1038
-
1039
- with self.assertRaises(PySparkTypeError) as pe:
1040
- df.select(F.assert_true(df.id < 2, 5))
1041
-
1042
- self.check_error(
1043
- exception=pe.exception,
1044
- error_class="NOT_COLUMN_OR_STR",
1045
- message_parameters={"arg_name": "errMsg", "arg_type": "int"},
1046
- )
1047
-
1048
- def test_raise_error(self):
1049
- self.check_raise_error(Py4JJavaError)
1050
-
1051
- def check_raise_error(self, tpe):
1052
- df = self.spark.createDataFrame([Row(id="foobar")])
1053
-
1054
- with self.assertRaisesRegex(tpe, "foobar"):
1055
- df.select(F.raise_error(df.id)).collect()
1056
-
1057
- with self.assertRaisesRegex(tpe, "barfoo"):
1058
- df.select(F.raise_error("barfoo")).collect()
1059
-
1060
- with self.assertRaises(PySparkTypeError) as pe:
1061
- df.select(F.raise_error(None))
1062
-
1063
- self.check_error(
1064
- exception=pe.exception,
1065
- error_class="NOT_COLUMN_OR_STR",
1066
- message_parameters={"arg_name": "errMsg", "arg_type": "NoneType"},
1067
- )
1068
-
1069
- def test_sum_distinct(self):
1070
- self.spark.range(10).select(
1071
- F.assert_true(F.sum_distinct(F.col("id")) == F.sumDistinct(F.col("id")))
1072
- ).collect()
1073
-
1074
- def test_shiftleft(self):
1075
- self.spark.range(10).select(
1076
- F.assert_true(F.shiftLeft(F.col("id"), 2) == F.shiftleft(F.col("id"), 2))
1077
- ).collect()
1078
-
1079
- def test_shiftright(self):
1080
- self.spark.range(10).select(
1081
- F.assert_true(F.shiftRight(F.col("id"), 2) == F.shiftright(F.col("id"), 2))
1082
- ).collect()
1083
-
1084
- def test_shiftrightunsigned(self):
1085
- self.spark.range(10).select(
1086
- F.assert_true(
1087
- F.shiftRightUnsigned(F.col("id"), 2) == F.shiftrightunsigned(F.col("id"), 2)
1088
- )
1089
- ).collect()
1090
-
1091
- def test_lit_day_time_interval(self):
1092
- td = datetime.timedelta(days=1, hours=12, milliseconds=123)
1093
- actual = self.spark.range(1).select(F.lit(td)).first()[0]
1094
- self.assertEqual(actual, td)
1095
-
1096
- def test_lit_list(self):
1097
- # SPARK-40271: added list type supporting
1098
- test_list = [1, 2, 3]
1099
- expected = [1, 2, 3]
1100
- actual = self.spark.range(1).select(F.lit(test_list)).first()[0]
1101
- self.assertEqual(actual, expected)
1102
-
1103
- test_list = [[1, 2, 3], [3, 4]]
1104
- expected = [[1, 2, 3], [3, 4]]
1105
- actual = self.spark.range(1).select(F.lit(test_list)).first()[0]
1106
- self.assertEqual(actual, expected)
1107
-
1108
- with self.sql_conf({"spark.sql.ansi.enabled": False}):
1109
- test_list = ["a", 1, None, 1.0]
1110
- expected = ["a", "1", None, "1.0"]
1111
- actual = self.spark.range(1).select(F.lit(test_list)).first()[0]
1112
- self.assertEqual(actual, expected)
1113
-
1114
- test_list = [["a", 1, None, 1.0], [1, None, "b"]]
1115
- expected = [["a", "1", None, "1.0"], ["1", None, "b"]]
1116
- actual = self.spark.range(1).select(F.lit(test_list)).first()[0]
1117
- self.assertEqual(actual, expected)
1118
-
1119
- df = self.spark.range(10)
1120
- with self.assertRaises(PySparkValueError) as pe:
1121
- F.lit([df.id, df.id])
1122
-
1123
- self.check_error(
1124
- exception=pe.exception,
1125
- error_class="COLUMN_IN_LIST",
1126
- message_parameters={"func_name": "lit"},
1127
- )
1128
-
1129
- # Test added for SPARK-39832; change Python API to accept both col & str as input
1130
- def test_regexp_replace(self):
1131
- df = self.spark.createDataFrame(
1132
- [("100-200", r"(\d+)", "--")], ["str", "pattern", "replacement"]
1133
- )
1134
- self.assertTrue(
1135
- all(
1136
- df.select(
1137
- F.regexp_replace("str", r"(\d+)", "--") == "-----",
1138
- F.regexp_replace("str", F.col("pattern"), F.col("replacement")) == "-----",
1139
- ).first()
1140
- )
1141
- )
1142
-
1143
- @unittest.skipIf(not have_numpy, "NumPy not installed")
1144
- def test_lit_np_scalar(self):
1145
- import numpy as np
1146
-
1147
- dtype_to_spark_dtypes = [
1148
- (np.int8, [("1", "tinyint")]),
1149
- (np.int16, [("1", "smallint")]),
1150
- (np.int32, [("1", "int")]),
1151
- (np.int64, [("1", "bigint")]),
1152
- (np.float32, [("1.0", "float")]),
1153
- (np.float64, [("1.0", "double")]),
1154
- (np.bool_, [("true", "boolean")]),
1155
- ]
1156
- for dtype, spark_dtypes in dtype_to_spark_dtypes:
1157
- with self.subTest(dtype):
1158
- self.assertEqual(self.spark.range(1).select(F.lit(dtype(1))).dtypes, spark_dtypes)
1159
-
1160
- @unittest.skipIf(not have_numpy, "NumPy not installed")
1161
- def test_np_scalar_input(self):
1162
- import numpy as np
1163
-
1164
- df = self.spark.createDataFrame([([1, 2, 3],), ([],)], ["data"])
1165
- for dtype in [np.int8, np.int16, np.int32, np.int64]:
1166
- res = df.select(F.array_contains(df.data, dtype(1)).alias("b")).collect()
1167
- self.assertEqual([Row(b=True), Row(b=False)], res)
1168
- res = df.select(F.array_position(df.data, dtype(1)).alias("c")).collect()
1169
- self.assertEqual([Row(c=1), Row(c=0)], res)
1170
-
1171
- df = self.spark.createDataFrame([([1.0, 2.0, 3.0],), ([],)], ["data"])
1172
- for dtype in [np.float32, np.float64]:
1173
- res = df.select(F.array_contains(df.data, dtype(1)).alias("b")).collect()
1174
- self.assertEqual([Row(b=True), Row(b=False)], res)
1175
- res = df.select(F.array_position(df.data, dtype(1)).alias("c")).collect()
1176
- self.assertEqual([Row(c=1), Row(c=0)], res)
1177
-
1178
- @unittest.skipIf(not have_numpy, "NumPy not installed")
1179
- def test_ndarray_input(self):
1180
- import numpy as np
1181
-
1182
- arr_dtype_to_spark_dtypes = [
1183
- ("int8", [("b", "array<smallint>")]),
1184
- ("int16", [("b", "array<smallint>")]),
1185
- ("int32", [("b", "array<int>")]),
1186
- ("int64", [("b", "array<bigint>")]),
1187
- ("float32", [("b", "array<float>")]),
1188
- ("float64", [("b", "array<double>")]),
1189
- ]
1190
- for t, expected_spark_dtypes in arr_dtype_to_spark_dtypes:
1191
- arr = np.array([1, 2]).astype(t)
1192
- self.assertEqual(
1193
- expected_spark_dtypes, self.spark.range(1).select(F.lit(arr).alias("b")).dtypes
1194
- )
1195
- arr = np.array([1, 2]).astype(np.uint)
1196
- with self.assertRaises(PySparkTypeError) as pe:
1197
- self.spark.range(1).select(F.lit(arr).alias("b"))
1198
-
1199
- self.check_error(
1200
- exception=pe.exception,
1201
- error_class="UNSUPPORTED_NUMPY_ARRAY_SCALAR",
1202
- message_parameters={
1203
- "dtype": "uint64",
1204
- },
1205
- )
1206
-
1207
- def test_binary_math_function(self):
1208
- funcs, expected = zip(
1209
- *[(F.atan2, 0.13664), (F.hypot, 8.07527), (F.pow, 2.14359), (F.pmod, 1.1)]
1210
- )
1211
- df = self.spark.range(1).select(*(func(1.1, 8) for func in funcs))
1212
- for a, e in zip(df.first(), expected):
1213
- self.assertAlmostEqual(a, e, 5)
1214
-
1215
- def test_map_functions(self):
1216
- # SPARK-38496: Check basic functionality of all "map" type related functions
1217
- expected = {"a": 1, "b": 2}
1218
- expected2 = {"c": 3, "d": 4}
1219
- df = self.spark.createDataFrame(
1220
- [(list(expected.keys()), list(expected.values()))], ["k", "v"]
1221
- )
1222
- actual = (
1223
- df.select(
1224
- F.expr("map('c', 3, 'd', 4) as dict2"),
1225
- F.map_from_arrays(df.k, df.v).alias("dict"),
1226
- "*",
1227
- )
1228
- .select(
1229
- F.map_contains_key("dict", "a").alias("one"),
1230
- F.map_contains_key("dict", "d").alias("not_exists"),
1231
- F.map_keys("dict").alias("keys"),
1232
- F.map_values("dict").alias("values"),
1233
- F.map_entries("dict").alias("items"),
1234
- "*",
1235
- )
1236
- .select(
1237
- F.map_concat("dict", "dict2").alias("merged"),
1238
- F.map_from_entries(F.arrays_zip("keys", "values")).alias("from_items"),
1239
- "*",
1240
- )
1241
- .first()
1242
- )
1243
- self.assertEqual(expected, actual["dict"])
1244
- self.assertTrue(actual["one"])
1245
- self.assertFalse(actual["not_exists"])
1246
- self.assertEqual(list(expected.keys()), actual["keys"])
1247
- self.assertEqual(list(expected.values()), actual["values"])
1248
- self.assertEqual(expected, dict(actual["items"]))
1249
- self.assertEqual({**expected, **expected2}, dict(actual["merged"]))
1250
- self.assertEqual(expected, actual["from_items"])
1251
-
1252
- def test_schema_of_json(self):
1253
- with self.assertRaises(PySparkTypeError) as pe:
1254
- F.schema_of_json(1)
1255
-
1256
- self.check_error(
1257
- exception=pe.exception,
1258
- error_class="NOT_COLUMN_OR_STR",
1259
- message_parameters={"arg_name": "json", "arg_type": "int"},
1260
- )
1261
-
1262
- def test_schema_of_csv(self):
1263
- with self.assertRaises(PySparkTypeError) as pe:
1264
- F.schema_of_csv(1)
1265
-
1266
- self.check_error(
1267
- exception=pe.exception,
1268
- error_class="NOT_COLUMN_OR_STR",
1269
- message_parameters={"arg_name": "csv", "arg_type": "int"},
1270
- )
1271
-
1272
- def test_from_csv(self):
1273
- df = self.spark.range(10)
1274
- with self.assertRaises(PySparkTypeError) as pe:
1275
- F.from_csv(df.id, 1)
1276
-
1277
- self.check_error(
1278
- exception=pe.exception,
1279
- error_class="NOT_COLUMN_OR_STR",
1280
- message_parameters={"arg_name": "schema", "arg_type": "int"},
1281
- )
1282
-
1283
- def test_greatest(self):
1284
- df = self.spark.range(10)
1285
- with self.assertRaises(PySparkValueError) as pe:
1286
- F.greatest(df.id)
1287
-
1288
- self.check_error(
1289
- exception=pe.exception,
1290
- error_class="WRONG_NUM_COLUMNS",
1291
- message_parameters={"func_name": "greatest", "num_cols": "2"},
1292
- )
1293
-
1294
- def test_when(self):
1295
- with self.assertRaises(PySparkTypeError) as pe:
1296
- F.when("id", 1)
1297
-
1298
- self.check_error(
1299
- exception=pe.exception,
1300
- error_class="NOT_COLUMN",
1301
- message_parameters={"arg_name": "condition", "arg_type": "str"},
1302
- )
1303
-
1304
- def test_window(self):
1305
- with self.assertRaises(PySparkTypeError) as pe:
1306
- F.window("date", 5)
1307
-
1308
- self.check_error(
1309
- exception=pe.exception,
1310
- error_class="NOT_STR",
1311
- message_parameters={"arg_name": "windowDuration", "arg_type": "int"},
1312
- )
1313
-
1314
- def test_session_window(self):
1315
- with self.assertRaises(PySparkTypeError) as pe:
1316
- F.session_window("date", 5)
1317
-
1318
- self.check_error(
1319
- exception=pe.exception,
1320
- error_class="NOT_COLUMN_OR_STR",
1321
- message_parameters={"arg_name": "gapDuration", "arg_type": "int"},
1322
- )
1323
-
1324
- def test_bucket(self):
1325
- with self.assertRaises(PySparkTypeError) as pe:
1326
- F.bucket("5", "id")
1327
-
1328
- self.check_error(
1329
- exception=pe.exception,
1330
- error_class="NOT_COLUMN_OR_INT",
1331
- message_parameters={"arg_name": "numBuckets", "arg_type": "str"},
1332
- )
1333
-
1334
-
1335
- class FunctionsTests(ReusedSQLTestCase, FunctionsTestsMixin):
1336
- pass
1337
-
1338
-
1339
- if __name__ == "__main__":
1340
- import unittest
1341
- from pyspark.sql.tests.test_functions import * # noqa: F401
1342
-
1343
- try:
1344
- import xmlrunner
1345
-
1346
- testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
1347
- except ImportError:
1348
- testRunner = None
1349
- unittest.main(testRunner=testRunner, verbosity=2)