snowpark-connect 0.24.0__py3-none-any.whl → 0.26.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of snowpark-connect might be problematic. Click here for more details.

Files changed (484)
  1. snowflake/snowpark_connect/column_name_handler.py +116 -4
  2. snowflake/snowpark_connect/config.py +23 -0
  3. snowflake/snowpark_connect/constants.py +0 -29
  4. snowflake/snowpark_connect/dataframe_container.py +22 -0
  5. snowflake/snowpark_connect/execute_plan/map_execution_command.py +56 -1
  6. snowflake/snowpark_connect/expression/literal.py +13 -2
  7. snowflake/snowpark_connect/expression/map_cast.py +5 -8
  8. snowflake/snowpark_connect/expression/map_sql_expression.py +23 -1
  9. snowflake/snowpark_connect/expression/map_udf.py +88 -29
  10. snowflake/snowpark_connect/expression/map_unresolved_attribute.py +199 -15
  11. snowflake/snowpark_connect/expression/map_unresolved_extract_value.py +44 -16
  12. snowflake/snowpark_connect/expression/map_unresolved_function.py +840 -367
  13. snowflake/snowpark_connect/expression/map_unresolved_star.py +3 -2
  14. snowflake/snowpark_connect/hidden_column.py +39 -0
  15. snowflake/snowpark_connect/includes/jars/hadoop-client-api-trimmed-3.3.4.jar +0 -0
  16. snowflake/snowpark_connect/includes/jars/json4s-native_2.12-3.7.0-M11.jar +0 -0
  17. snowflake/snowpark_connect/includes/jars/paranamer-2.8.3.jar +0 -0
  18. snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.12-0.1.0.jar +0 -0
  19. snowflake/snowpark_connect/includes/jars/{hadoop-client-api-3.3.4.jar → spark-connect-client-jvm_2.12-3.5.6.jar} +0 -0
  20. snowflake/snowpark_connect/relation/map_column_ops.py +17 -4
  21. snowflake/snowpark_connect/relation/map_extension.py +52 -11
  22. snowflake/snowpark_connect/relation/map_join.py +258 -62
  23. snowflake/snowpark_connect/relation/map_map_partitions.py +9 -4
  24. snowflake/snowpark_connect/relation/map_relation.py +12 -1
  25. snowflake/snowpark_connect/relation/map_row_ops.py +8 -1
  26. snowflake/snowpark_connect/relation/map_sql.py +88 -11
  27. snowflake/snowpark_connect/relation/map_udtf.py +100 -46
  28. snowflake/snowpark_connect/relation/read/map_read.py +3 -3
  29. snowflake/snowpark_connect/relation/read/map_read_jdbc.py +1 -1
  30. snowflake/snowpark_connect/relation/read/map_read_json.py +8 -1
  31. snowflake/snowpark_connect/relation/read/map_read_table.py +1 -9
  32. snowflake/snowpark_connect/relation/read/reader_config.py +3 -1
  33. snowflake/snowpark_connect/relation/utils.py +44 -0
  34. snowflake/snowpark_connect/relation/write/map_write.py +175 -75
  35. snowflake/snowpark_connect/resources_initializer.py +47 -6
  36. snowflake/snowpark_connect/server.py +26 -4
  37. snowflake/snowpark_connect/type_mapping.py +29 -25
  38. snowflake/snowpark_connect/typed_column.py +14 -0
  39. snowflake/snowpark_connect/utils/artifacts.py +23 -0
  40. snowflake/snowpark_connect/utils/concurrent.py +4 -0
  41. snowflake/snowpark_connect/utils/context.py +6 -1
  42. snowflake/snowpark_connect/utils/external_udxf_cache.py +36 -0
  43. snowflake/snowpark_connect/utils/scala_udf_utils.py +596 -0
  44. snowflake/snowpark_connect/utils/session.py +4 -0
  45. snowflake/snowpark_connect/utils/telemetry.py +6 -17
  46. snowflake/snowpark_connect/utils/udf_helper.py +2 -0
  47. snowflake/snowpark_connect/utils/udf_utils.py +22 -1
  48. snowflake/snowpark_connect/utils/udtf_utils.py +1 -0
  49. snowflake/snowpark_connect/version.py +1 -1
  50. {snowpark_connect-0.24.0.dist-info → snowpark_connect-0.26.0.dist-info}/METADATA +1 -1
  51. snowpark_connect-0.26.0.dist-info/RECORD +481 -0
  52. snowflake/snowpark_connect/includes/jars/scala-compiler-2.12.18.jar +0 -0
  53. snowflake/snowpark_connect/includes/jars/spark-kubernetes_2.12-3.5.6.jar +0 -0
  54. snowflake/snowpark_connect/includes/jars/spark-mllib_2.12-3.5.6.jar +0 -0
  55. snowflake/snowpark_connect/includes/jars/spark-streaming_2.12-3.5.6.jar +0 -0
  56. snowflake/snowpark_connect/includes/python/pyspark/errors/tests/__init__.py +0 -16
  57. snowflake/snowpark_connect/includes/python/pyspark/errors/tests/test_errors.py +0 -60
  58. snowflake/snowpark_connect/includes/python/pyspark/ml/deepspeed/tests/test_deepspeed_distributor.py +0 -306
  59. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/__init__.py +0 -16
  60. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_classification.py +0 -53
  61. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_evaluation.py +0 -50
  62. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_feature.py +0 -43
  63. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_function.py +0 -114
  64. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_pipeline.py +0 -47
  65. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_summarizer.py +0 -43
  66. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_tuning.py +0 -46
  67. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_classification.py +0 -238
  68. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_evaluation.py +0 -194
  69. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_feature.py +0 -156
  70. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_pipeline.py +0 -184
  71. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_summarizer.py +0 -78
  72. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_tuning.py +0 -292
  73. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_parity_torch_data_loader.py +0 -50
  74. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_parity_torch_distributor.py +0 -152
  75. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_algorithms.py +0 -456
  76. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_base.py +0 -96
  77. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_dl_util.py +0 -186
  78. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_evaluation.py +0 -77
  79. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_feature.py +0 -401
  80. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_functions.py +0 -528
  81. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_image.py +0 -82
  82. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_linalg.py +0 -409
  83. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_model_cache.py +0 -55
  84. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_param.py +0 -441
  85. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_persistence.py +0 -546
  86. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_pipeline.py +0 -71
  87. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_stat.py +0 -52
  88. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_training_summary.py +0 -494
  89. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_util.py +0 -85
  90. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_wrapper.py +0 -138
  91. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/__init__.py +0 -16
  92. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_basic.py +0 -151
  93. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_nested.py +0 -97
  94. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_pipeline.py +0 -143
  95. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tuning.py +0 -551
  96. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_basic.py +0 -137
  97. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_nested.py +0 -96
  98. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_pipeline.py +0 -142
  99. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/__init__.py +0 -16
  100. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_data_loader.py +0 -137
  101. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_distributor.py +0 -561
  102. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_log_communication.py +0 -172
  103. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/__init__.py +0 -16
  104. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_algorithms.py +0 -353
  105. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_feature.py +0 -192
  106. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_linalg.py +0 -680
  107. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_stat.py +0 -206
  108. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_streaming_algorithms.py +0 -471
  109. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_util.py +0 -108
  110. snowflake/snowpark_connect/includes/python/pyspark/pandas/spark/__init__.py +0 -16
  111. snowflake/snowpark_connect/includes/python/pyspark/pandas/spark/accessors.py +0 -1281
  112. snowflake/snowpark_connect/includes/python/pyspark/pandas/spark/functions.py +0 -203
  113. snowflake/snowpark_connect/includes/python/pyspark/pandas/spark/utils.py +0 -202
  114. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/__init__.py +0 -16
  115. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/__init__.py +0 -16
  116. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_any_all.py +0 -177
  117. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_apply_func.py +0 -575
  118. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_binary_ops.py +0 -235
  119. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_combine.py +0 -653
  120. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_compute.py +0 -463
  121. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_corrwith.py +0 -86
  122. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_cov.py +0 -151
  123. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_cumulative.py +0 -139
  124. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_describe.py +0 -458
  125. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_eval.py +0 -86
  126. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_melt.py +0 -202
  127. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_missing_data.py +0 -520
  128. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_pivot.py +0 -361
  129. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/__init__.py +0 -16
  130. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/__init__.py +0 -16
  131. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_any_all.py +0 -40
  132. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_apply_func.py +0 -42
  133. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_binary_ops.py +0 -40
  134. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_combine.py +0 -37
  135. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_compute.py +0 -60
  136. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_corrwith.py +0 -40
  137. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_cov.py +0 -40
  138. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_cumulative.py +0 -90
  139. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_describe.py +0 -40
  140. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_eval.py +0 -40
  141. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_melt.py +0 -40
  142. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_missing_data.py +0 -42
  143. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_pivot.py +0 -37
  144. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/__init__.py +0 -16
  145. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_base.py +0 -36
  146. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_binary_ops.py +0 -42
  147. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_boolean_ops.py +0 -47
  148. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_categorical_ops.py +0 -55
  149. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_complex_ops.py +0 -40
  150. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_date_ops.py +0 -47
  151. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_datetime_ops.py +0 -47
  152. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_null_ops.py +0 -42
  153. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_arithmetic.py +0 -43
  154. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_ops.py +0 -47
  155. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_reverse.py +0 -43
  156. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_string_ops.py +0 -47
  157. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_timedelta_ops.py +0 -47
  158. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_udt_ops.py +0 -40
  159. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/testing_utils.py +0 -226
  160. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/__init__.py +0 -16
  161. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_align.py +0 -39
  162. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_basic_slow.py +0 -55
  163. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_cov_corrwith.py +0 -39
  164. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_dot_frame.py +0 -39
  165. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_dot_series.py +0 -39
  166. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_index.py +0 -39
  167. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_series.py +0 -39
  168. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_setitem_frame.py +0 -43
  169. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_setitem_series.py +0 -43
  170. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/__init__.py +0 -16
  171. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_attrs.py +0 -40
  172. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_constructor.py +0 -39
  173. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_conversion.py +0 -42
  174. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_reindexing.py +0 -42
  175. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_reshaping.py +0 -37
  176. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_spark.py +0 -40
  177. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_take.py +0 -42
  178. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_time_series.py +0 -48
  179. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_truncate.py +0 -40
  180. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/__init__.py +0 -16
  181. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_aggregate.py +0 -40
  182. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_apply_func.py +0 -41
  183. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_cumulative.py +0 -67
  184. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_describe.py +0 -40
  185. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_groupby.py +0 -55
  186. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_head_tail.py +0 -40
  187. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_index.py +0 -38
  188. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_missing_data.py +0 -55
  189. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py +0 -39
  190. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_stat.py +0 -38
  191. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/__init__.py +0 -16
  192. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_align.py +0 -40
  193. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_base.py +0 -50
  194. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_category.py +0 -73
  195. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_datetime.py +0 -39
  196. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_indexing.py +0 -40
  197. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_reindex.py +0 -40
  198. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_rename.py +0 -40
  199. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_reset_index.py +0 -48
  200. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_timedelta.py +0 -39
  201. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/io/__init__.py +0 -16
  202. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/io/test_parity_io.py +0 -40
  203. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/__init__.py +0 -16
  204. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot.py +0 -45
  205. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot_matplotlib.py +0 -45
  206. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot_plotly.py +0 -49
  207. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot.py +0 -37
  208. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot_matplotlib.py +0 -53
  209. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot_plotly.py +0 -45
  210. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/__init__.py +0 -16
  211. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_all_any.py +0 -38
  212. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_arg_ops.py +0 -37
  213. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_as_of.py +0 -37
  214. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_as_type.py +0 -38
  215. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_compute.py +0 -37
  216. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_conversion.py +0 -40
  217. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_cumulative.py +0 -40
  218. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_index.py +0 -38
  219. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_missing_data.py +0 -40
  220. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_series.py +0 -37
  221. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_sort.py +0 -38
  222. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_stat.py +0 -38
  223. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_categorical.py +0 -66
  224. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_config.py +0 -37
  225. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_csv.py +0 -37
  226. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_dataframe_conversion.py +0 -42
  227. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_dataframe_spark_io.py +0 -39
  228. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_default_index.py +0 -49
  229. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ewm.py +0 -37
  230. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_expanding.py +0 -39
  231. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_extension.py +0 -49
  232. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_frame_spark.py +0 -53
  233. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_generic_functions.py +0 -43
  234. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_indexing.py +0 -49
  235. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_indexops_spark.py +0 -39
  236. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_internal.py +0 -41
  237. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_namespace.py +0 -39
  238. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_numpy_compat.py +0 -60
  239. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames.py +0 -48
  240. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby.py +0 -39
  241. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_expanding.py +0 -44
  242. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_rolling.py +0 -84
  243. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_repr.py +0 -37
  244. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_resample.py +0 -45
  245. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_reshape.py +0 -39
  246. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_rolling.py +0 -39
  247. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_scalars.py +0 -37
  248. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_conversion.py +0 -39
  249. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_datetime.py +0 -39
  250. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_string.py +0 -39
  251. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_spark_functions.py +0 -39
  252. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_sql.py +0 -43
  253. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_stats.py +0 -37
  254. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_typedef.py +0 -36
  255. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_utils.py +0 -37
  256. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_window.py +0 -39
  257. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/__init__.py +0 -16
  258. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_base.py +0 -107
  259. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_binary_ops.py +0 -224
  260. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py +0 -825
  261. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_categorical_ops.py +0 -562
  262. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_complex_ops.py +0 -368
  263. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_date_ops.py +0 -257
  264. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_datetime_ops.py +0 -260
  265. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_null_ops.py +0 -178
  266. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_arithmetic.py +0 -184
  267. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py +0 -497
  268. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_reverse.py +0 -140
  269. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_string_ops.py +0 -354
  270. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_timedelta_ops.py +0 -219
  271. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_udt_ops.py +0 -192
  272. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/testing_utils.py +0 -228
  273. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/__init__.py +0 -16
  274. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_align.py +0 -118
  275. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_basic_slow.py +0 -198
  276. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_cov_corrwith.py +0 -181
  277. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_dot_frame.py +0 -103
  278. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_dot_series.py +0 -141
  279. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_index.py +0 -109
  280. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_series.py +0 -136
  281. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_setitem_frame.py +0 -125
  282. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_setitem_series.py +0 -217
  283. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/__init__.py +0 -16
  284. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_attrs.py +0 -384
  285. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_constructor.py +0 -598
  286. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_conversion.py +0 -73
  287. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_reindexing.py +0 -869
  288. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_reshaping.py +0 -487
  289. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_spark.py +0 -309
  290. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_take.py +0 -156
  291. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_time_series.py +0 -149
  292. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_truncate.py +0 -163
  293. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/__init__.py +0 -16
  294. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_aggregate.py +0 -311
  295. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_apply_func.py +0 -524
  296. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_cumulative.py +0 -419
  297. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_describe.py +0 -144
  298. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_groupby.py +0 -979
  299. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_head_tail.py +0 -234
  300. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_index.py +0 -206
  301. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_missing_data.py +0 -421
  302. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_split_apply.py +0 -187
  303. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_stat.py +0 -397
  304. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/__init__.py +0 -16
  305. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_align.py +0 -100
  306. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_base.py +0 -2743
  307. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_category.py +0 -484
  308. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_datetime.py +0 -276
  309. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_indexing.py +0 -432
  310. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_reindex.py +0 -310
  311. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_rename.py +0 -257
  312. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_reset_index.py +0 -160
  313. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_timedelta.py +0 -128
  314. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/io/__init__.py +0 -16
  315. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/io/test_io.py +0 -137
  316. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/__init__.py +0 -16
  317. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot.py +0 -170
  318. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot_matplotlib.py +0 -547
  319. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot_plotly.py +0 -285
  320. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot.py +0 -106
  321. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot_matplotlib.py +0 -409
  322. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot_plotly.py +0 -247
  323. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/__init__.py +0 -16
  324. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_all_any.py +0 -105
  325. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_arg_ops.py +0 -197
  326. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_as_of.py +0 -137
  327. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_as_type.py +0 -227
  328. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_compute.py +0 -634
  329. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_conversion.py +0 -88
  330. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_cumulative.py +0 -139
  331. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_index.py +0 -475
  332. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_missing_data.py +0 -265
  333. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_series.py +0 -818
  334. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_sort.py +0 -162
  335. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_stat.py +0 -780
  336. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_categorical.py +0 -741
  337. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_config.py +0 -160
  338. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_csv.py +0 -453
  339. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_dataframe_conversion.py +0 -281
  340. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_dataframe_spark_io.py +0 -487
  341. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_default_index.py +0 -109
  342. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ewm.py +0 -434
  343. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_expanding.py +0 -253
  344. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_extension.py +0 -152
  345. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_frame_spark.py +0 -162
  346. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_generic_functions.py +0 -234
  347. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_indexing.py +0 -1339
  348. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_indexops_spark.py +0 -82
  349. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_internal.py +0 -124
  350. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_namespace.py +0 -638
  351. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_numpy_compat.py +0 -200
  352. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames.py +0 -1355
  353. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py +0 -655
  354. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_expanding.py +0 -113
  355. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_rolling.py +0 -118
  356. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_repr.py +0 -192
  357. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_resample.py +0 -346
  358. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_reshape.py +0 -495
  359. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_rolling.py +0 -263
  360. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_scalars.py +0 -59
  361. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_conversion.py +0 -85
  362. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_datetime.py +0 -364
  363. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_string.py +0 -362
  364. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_spark_functions.py +0 -46
  365. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_sql.py +0 -123
  366. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_stats.py +0 -581
  367. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_typedef.py +0 -447
  368. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_utils.py +0 -301
  369. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_window.py +0 -465
  370. snowflake/snowpark_connect/includes/python/pyspark/resource/tests/__init__.py +0 -16
  371. snowflake/snowpark_connect/includes/python/pyspark/resource/tests/test_resources.py +0 -83
  372. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/__init__.py +0 -16
  373. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/__init__.py +0 -16
  374. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/__init__.py +0 -16
  375. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/test_artifact.py +0 -420
  376. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/test_client.py +0 -358
  377. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/__init__.py +0 -16
  378. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_foreach.py +0 -36
  379. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_foreach_batch.py +0 -44
  380. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_listener.py +0 -116
  381. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_streaming.py +0 -35
  382. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_basic.py +0 -3612
  383. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_column.py +0 -1042
  384. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_function.py +0 -2381
  385. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_plan.py +0 -1060
  386. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow.py +0 -163
  387. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow_map.py +0 -38
  388. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow_python_udf.py +0 -48
  389. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_catalog.py +0 -36
  390. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_column.py +0 -55
  391. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_conf.py +0 -36
  392. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_dataframe.py +0 -96
  393. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_datasources.py +0 -44
  394. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_errors.py +0 -36
  395. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_functions.py +0 -59
  396. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_group.py +0 -36
  397. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_cogrouped_map.py +0 -59
  398. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_grouped_map.py +0 -74
  399. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_grouped_map_with_state.py +0 -62
  400. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_map.py +0 -58
  401. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf.py +0 -70
  402. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_grouped_agg.py +0 -50
  403. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_scalar.py +0 -68
  404. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_window.py +0 -40
  405. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_readwriter.py +0 -46
  406. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_serde.py +0 -44
  407. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_types.py +0 -100
  408. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_udf.py +0 -100
  409. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_udtf.py +0 -163
  410. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_session.py +0 -181
  411. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_utils.py +0 -42
  412. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/__init__.py +0 -16
  413. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_cogrouped_map.py +0 -623
  414. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_grouped_map.py +0 -869
  415. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_grouped_map_with_state.py +0 -342
  416. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_map.py +0 -436
  417. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf.py +0 -363
  418. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_grouped_agg.py +0 -592
  419. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_scalar.py +0 -1503
  420. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_typehints.py +0 -392
  421. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_typehints_with_future_annotations.py +0 -375
  422. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_window.py +0 -411
  423. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/__init__.py +0 -16
  424. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming.py +0 -401
  425. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_foreach.py +0 -295
  426. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_foreach_batch.py +0 -106
  427. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_listener.py +0 -558
  428. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow.py +0 -1346
  429. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow_map.py +0 -182
  430. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow_python_udf.py +0 -202
  431. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_catalog.py +0 -503
  432. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_column.py +0 -225
  433. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_conf.py +0 -83
  434. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_context.py +0 -201
  435. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_dataframe.py +0 -1931
  436. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_datasources.py +0 -256
  437. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_errors.py +0 -69
  438. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_functions.py +0 -1349
  439. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_group.py +0 -53
  440. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_pandas_sqlmetrics.py +0 -68
  441. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_readwriter.py +0 -283
  442. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_serde.py +0 -155
  443. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_session.py +0 -412
  444. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_types.py +0 -1581
  445. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udf.py +0 -961
  446. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udf_profiler.py +0 -165
  447. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udtf.py +0 -1456
  448. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_utils.py +0 -1686
  449. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/__init__.py +0 -16
  450. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_context.py +0 -184
  451. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_dstream.py +0 -706
  452. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_kinesis.py +0 -118
  453. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_listener.py +0 -160
  454. snowflake/snowpark_connect/includes/python/pyspark/tests/__init__.py +0 -16
  455. snowflake/snowpark_connect/includes/python/pyspark/tests/test_appsubmit.py +0 -306
  456. snowflake/snowpark_connect/includes/python/pyspark/tests/test_broadcast.py +0 -196
  457. snowflake/snowpark_connect/includes/python/pyspark/tests/test_conf.py +0 -44
  458. snowflake/snowpark_connect/includes/python/pyspark/tests/test_context.py +0 -346
  459. snowflake/snowpark_connect/includes/python/pyspark/tests/test_daemon.py +0 -89
  460. snowflake/snowpark_connect/includes/python/pyspark/tests/test_install_spark.py +0 -124
  461. snowflake/snowpark_connect/includes/python/pyspark/tests/test_join.py +0 -69
  462. snowflake/snowpark_connect/includes/python/pyspark/tests/test_memory_profiler.py +0 -167
  463. snowflake/snowpark_connect/includes/python/pyspark/tests/test_pin_thread.py +0 -194
  464. snowflake/snowpark_connect/includes/python/pyspark/tests/test_profiler.py +0 -168
  465. snowflake/snowpark_connect/includes/python/pyspark/tests/test_rdd.py +0 -939
  466. snowflake/snowpark_connect/includes/python/pyspark/tests/test_rddbarrier.py +0 -52
  467. snowflake/snowpark_connect/includes/python/pyspark/tests/test_rddsampler.py +0 -66
  468. snowflake/snowpark_connect/includes/python/pyspark/tests/test_readwrite.py +0 -368
  469. snowflake/snowpark_connect/includes/python/pyspark/tests/test_serializers.py +0 -257
  470. snowflake/snowpark_connect/includes/python/pyspark/tests/test_shuffle.py +0 -267
  471. snowflake/snowpark_connect/includes/python/pyspark/tests/test_stage_sched.py +0 -153
  472. snowflake/snowpark_connect/includes/python/pyspark/tests/test_statcounter.py +0 -130
  473. snowflake/snowpark_connect/includes/python/pyspark/tests/test_taskcontext.py +0 -350
  474. snowflake/snowpark_connect/includes/python/pyspark/tests/test_util.py +0 -97
  475. snowflake/snowpark_connect/includes/python/pyspark/tests/test_worker.py +0 -271
  476. snowpark_connect-0.24.0.dist-info/RECORD +0 -898
  477. {snowpark_connect-0.24.0.data → snowpark_connect-0.26.0.data}/scripts/snowpark-connect +0 -0
  478. {snowpark_connect-0.24.0.data → snowpark_connect-0.26.0.data}/scripts/snowpark-session +0 -0
  479. {snowpark_connect-0.24.0.data → snowpark_connect-0.26.0.data}/scripts/snowpark-submit +0 -0
  480. {snowpark_connect-0.24.0.dist-info → snowpark_connect-0.26.0.dist-info}/WHEEL +0 -0
  481. {snowpark_connect-0.24.0.dist-info → snowpark_connect-0.26.0.dist-info}/licenses/LICENSE-binary +0 -0
  482. {snowpark_connect-0.24.0.dist-info → snowpark_connect-0.26.0.dist-info}/licenses/LICENSE.txt +0 -0
  483. {snowpark_connect-0.24.0.dist-info → snowpark_connect-0.26.0.dist-info}/licenses/NOTICE-binary +0 -0
  484. {snowpark_connect-0.24.0.dist-info → snowpark_connect-0.26.0.dist-info}/top_level.txt +0 -0
@@ -1,1503 +0,0 @@
1
- #
2
- # Licensed to the Apache Software Foundation (ASF) under one or more
3
- # contributor license agreements. See the NOTICE file distributed with
4
- # this work for additional information regarding copyright ownership.
5
- # The ASF licenses this file to You under the Apache License, Version 2.0
6
- # (the "License"); you may not use this file except in compliance with
7
- # the License. You may obtain a copy of the License at
8
- #
9
- # http://www.apache.org/licenses/LICENSE-2.0
10
- #
11
- # Unless required by applicable law or agreed to in writing, software
12
- # distributed under the License is distributed on an "AS IS" BASIS,
13
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
- # See the License for the specific language governing permissions and
15
- # limitations under the License.
16
- #
17
- import os
18
- import random
19
- import shutil
20
- import tempfile
21
- import time
22
- import unittest
23
- from datetime import date, datetime
24
- from decimal import Decimal
25
- from distutils.version import LooseVersion
26
- from typing import cast
27
-
28
- from pyspark import TaskContext
29
- from pyspark.rdd import PythonEvalType
30
- from pyspark.sql import Column
31
- from pyspark.sql.functions import array, col, expr, lit, sum, struct, udf, pandas_udf, PandasUDFType
32
- from pyspark.sql.pandas.utils import pyarrow_version_less_than_minimum
33
- from pyspark.sql.types import (
34
- IntegerType,
35
- ByteType,
36
- StructType,
37
- ShortType,
38
- BooleanType,
39
- LongType,
40
- FloatType,
41
- DoubleType,
42
- DecimalType,
43
- StringType,
44
- ArrayType,
45
- StructField,
46
- Row,
47
- TimestampType,
48
- MapType,
49
- DateType,
50
- BinaryType,
51
- YearMonthIntervalType,
52
- )
53
- from pyspark.errors import AnalysisException
54
- from pyspark.testing.sqlutils import (
55
- ReusedSQLTestCase,
56
- test_compiled,
57
- test_not_compiled_message,
58
- have_pandas,
59
- have_pyarrow,
60
- pandas_requirement_message,
61
- pyarrow_requirement_message,
62
- )
63
- from pyspark.testing.utils import QuietTest, assertDataFrameEqual
64
-
65
- if have_pandas:
66
- import pandas as pd
67
-
68
- if have_pyarrow:
69
- import pyarrow as pa # noqa: F401
70
-
71
-
72
- @unittest.skipIf(
73
- not have_pandas or not have_pyarrow,
74
- cast(str, pandas_requirement_message or pyarrow_requirement_message),
75
- )
76
- class ScalarPandasUDFTestsMixin:
77
- @property
78
- def nondeterministic_vectorized_udf(self):
79
- import numpy as np
80
-
81
- @pandas_udf("double")
82
- def random_udf(v):
83
- return pd.Series(np.random.random(len(v)))
84
-
85
- random_udf = random_udf.asNondeterministic()
86
- return random_udf
87
-
88
- @property
89
- def nondeterministic_vectorized_iter_udf(self):
90
- import numpy as np
91
-
92
- @pandas_udf("double", PandasUDFType.SCALAR_ITER)
93
- def random_udf(it):
94
- for v in it:
95
- yield pd.Series(np.random.random(len(v)))
96
-
97
- random_udf = random_udf.asNondeterministic()
98
- return random_udf
99
-
100
- @property
101
- def df_with_nested_structs(self):
102
- schema = StructType(
103
- [
104
- StructField("id", IntegerType(), False),
105
- StructField(
106
- "info",
107
- StructType(
108
- [
109
- StructField("name", StringType(), False),
110
- StructField("age", IntegerType(), False),
111
- StructField(
112
- "details",
113
- StructType(
114
- [
115
- StructField("field1", StringType(), False),
116
- StructField("field2", IntegerType(), False),
117
- ]
118
- ),
119
- False,
120
- ),
121
- ]
122
- ),
123
- False,
124
- ),
125
- ]
126
- )
127
- data = [(1, ("John", 30, ("Value1", 10)))]
128
- df = self.spark.createDataFrame(data, schema)
129
- struct_df = df.select(struct(df.columns).alias("struct"))
130
- # struct_df.dtype:
131
- # [(
132
- # 'struct',
133
- # 'struct<id:int,info:
134
- # struct<name:string,age:int,details:
135
- # struct<field1:string, field2:int>>>'
136
- # )]
137
- return struct_df
138
-
139
- @property
140
- def df_with_nested_maps(self):
141
- schema = StructType(
142
- [
143
- StructField("id", StringType(), True),
144
- StructField(
145
- "attributes", MapType(StringType(), MapType(StringType(), StringType())), True
146
- ),
147
- ]
148
- )
149
- data = [("1", {"personal": {"name": "John", "city": "New York"}})]
150
- return self.spark.createDataFrame(data, schema)
151
-
152
- @property
153
- def df_with_nested_arrays(self):
154
- schema = StructType(
155
- [
156
- StructField("id", IntegerType(), nullable=False),
157
- StructField("nested_array", ArrayType(ArrayType(IntegerType())), nullable=False),
158
- ]
159
- )
160
- data = [(1, [[1, 2, 3], [4, 5]])]
161
- return self.spark.createDataFrame(data, schema)
162
-
163
- def test_pandas_udf_tokenize(self):
164
- tokenize = pandas_udf(
165
- lambda s: s.apply(lambda str: str.split(" ")), ArrayType(StringType())
166
- )
167
- self.assertEqual(tokenize.returnType, ArrayType(StringType()))
168
- df = self.spark.createDataFrame([("hi boo",), ("bye boo",)], ["vals"])
169
- result = df.select(tokenize("vals").alias("hi"))
170
- self.assertEqual([Row(hi=["hi", "boo"]), Row(hi=["bye", "boo"])], result.collect())
171
-
172
- def test_pandas_udf_nested_arrays(self):
173
- tokenize = pandas_udf(
174
- lambda s: s.apply(lambda str: [str.split(" ")]), ArrayType(ArrayType(StringType()))
175
- )
176
- self.assertEqual(tokenize.returnType, ArrayType(ArrayType(StringType())))
177
- df = self.spark.createDataFrame([("hi boo",), ("bye boo",)], ["vals"])
178
- result = df.select(tokenize("vals").alias("hi"))
179
- self.assertEqual([Row(hi=[["hi", "boo"]]), Row(hi=[["bye", "boo"]])], result.collect())
180
-
181
- def test_input_nested_structs(self):
182
- df = self.df_with_nested_structs
183
-
184
- mirror = pandas_udf(lambda s: s, df.dtypes[0][1])
185
-
186
- self.assertEquals(
187
- df.select(mirror(df.struct).alias("res")).first(),
188
- Row(
189
- res=Row(
190
- id=1, info=Row(name="John", age=30, details=Row(field1="Value1", field2=10))
191
- )
192
- ),
193
- )
194
-
195
- def test_input_nested_maps(self):
196
- df = self.df_with_nested_maps
197
-
198
- str_repr = pandas_udf(lambda s: s.astype(str), StringType())
199
- self.assertEquals(
200
- df.select(str_repr(df.attributes).alias("res")).first(),
201
- Row(res="{'personal': {'name': 'John', 'city': 'New York'}}"),
202
- )
203
-
204
- extract_name = pandas_udf(lambda s: s.apply(lambda x: x["personal"]["name"]), StringType())
205
- self.assertEquals(
206
- df.select(extract_name(df.attributes).alias("res")).first(),
207
- Row(res="John"),
208
- )
209
-
210
- def test_input_nested_arrays(self):
211
- df = self.df_with_nested_arrays
212
-
213
- str_repr = pandas_udf(lambda s: s.astype(str), StringType())
214
- self.assertEquals(
215
- df.select(str_repr(df.nested_array).alias("res")).first(),
216
- Row(res="[array([1, 2, 3], dtype=int32) array([4, 5], dtype=int32)]"),
217
- )
218
-
219
- @unittest.skipIf(
220
- pyarrow_version_less_than_minimum("2.0.0"),
221
- "Pyarrow version must be 2.0.0 or higher",
222
- )
223
- def test_pandas_array_struct(self):
224
- # SPARK-38098: Support Array of Struct for Pandas UDFs and toPandas
225
- import numpy as np
226
-
227
- @pandas_udf("Array<struct<col1:string, col2:long, col3:double>>")
228
- def return_cols(cols):
229
- assert type(cols) == pd.Series
230
- assert type(cols[0]) == np.ndarray
231
- assert type(cols[0][0]) == dict
232
- return cols
233
-
234
- df = self.spark.createDataFrame(
235
- [[[("a", 2, 3.0), ("a", 2, 3.0)]], [[("b", 5, 6.0), ("b", 5, 6.0)]]],
236
- "array_struct_col Array<struct<col1:string, col2:long, col3:double>>",
237
- )
238
- result = df.select(return_cols("array_struct_col"))
239
- self.assertEqual(
240
- [
241
- Row(output=[Row(col1="a", col2=2, col3=3.0), Row(col1="a", col2=2, col3=3.0)]),
242
- Row(output=[Row(col1="b", col2=5, col3=6.0), Row(col1="b", col2=5, col3=6.0)]),
243
- ],
244
- result.collect(),
245
- )
246
-
247
- def test_vectorized_udf_basic(self):
248
- df = self.spark.range(10).select(
249
- col("id").cast("string").alias("str"),
250
- col("id").cast("int").alias("int"),
251
- col("id").alias("long"),
252
- col("id").cast("float").alias("float"),
253
- col("id").cast("double").alias("double"),
254
- col("id").cast("decimal").alias("decimal"),
255
- col("id").cast("boolean").alias("bool"),
256
- array(col("id")).alias("array_long"),
257
- )
258
-
259
- def f(x):
260
- return x
261
-
262
- for udf_type in [PandasUDFType.SCALAR, PandasUDFType.SCALAR_ITER]:
263
- str_f = pandas_udf(f, StringType(), udf_type)
264
- int_f = pandas_udf(f, IntegerType(), udf_type)
265
- long_f = pandas_udf(f, LongType(), udf_type)
266
- float_f = pandas_udf(f, FloatType(), udf_type)
267
- double_f = pandas_udf(f, DoubleType(), udf_type)
268
- decimal_f = pandas_udf(f, DecimalType(), udf_type)
269
- bool_f = pandas_udf(f, BooleanType(), udf_type)
270
- array_long_f = pandas_udf(f, ArrayType(LongType()), udf_type)
271
- res = df.select(
272
- str_f(col("str")),
273
- int_f(col("int")),
274
- long_f(col("long")),
275
- float_f(col("float")),
276
- double_f(col("double")),
277
- decimal_f("decimal"),
278
- bool_f(col("bool")),
279
- array_long_f("array_long"),
280
- )
281
- self.assertEqual(df.collect(), res.collect())
282
-
283
- def test_register_nondeterministic_vectorized_udf_basic(self):
284
- random_pandas_udf = pandas_udf(
285
- lambda x: random.randint(6, 6) + x, IntegerType()
286
- ).asNondeterministic()
287
- self.assertEqual(random_pandas_udf.deterministic, False)
288
- self.assertEqual(random_pandas_udf.evalType, PythonEvalType.SQL_SCALAR_PANDAS_UDF)
289
- nondeterministic_pandas_udf = self.spark.catalog.registerFunction(
290
- "randomPandasUDF", random_pandas_udf
291
- )
292
- self.assertEqual(nondeterministic_pandas_udf.deterministic, False)
293
- self.assertEqual(nondeterministic_pandas_udf.evalType, PythonEvalType.SQL_SCALAR_PANDAS_UDF)
294
- [row] = self.spark.sql("SELECT randomPandasUDF(1)").collect()
295
- self.assertEqual(row[0], 7)
296
-
297
- def random_iter_udf(it):
298
- for i in it:
299
- yield random.randint(6, 6) + i
300
-
301
- random_pandas_iter_udf = pandas_udf(
302
- random_iter_udf, IntegerType(), PandasUDFType.SCALAR_ITER
303
- ).asNondeterministic()
304
- self.assertEqual(random_pandas_iter_udf.deterministic, False)
305
- self.assertEqual(random_pandas_iter_udf.evalType, PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF)
306
- nondeterministic_pandas_iter_udf = self.spark.catalog.registerFunction(
307
- "randomPandasIterUDF", random_pandas_iter_udf
308
- )
309
- self.assertEqual(nondeterministic_pandas_iter_udf.deterministic, False)
310
- self.assertEqual(
311
- nondeterministic_pandas_iter_udf.evalType, PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF
312
- )
313
- [row] = self.spark.sql("SELECT randomPandasIterUDF(1)").collect()
314
- self.assertEqual(row[0], 7)
315
-
316
- def test_vectorized_udf_null_boolean(self):
317
- data = [(True,), (True,), (None,), (False,)]
318
- schema = StructType().add("bool", BooleanType())
319
- df = self.spark.createDataFrame(data, schema)
320
- for udf_type in [PandasUDFType.SCALAR, PandasUDFType.SCALAR_ITER]:
321
- bool_f = pandas_udf(lambda x: x, BooleanType(), udf_type)
322
- res = df.select(bool_f(col("bool")))
323
- self.assertEqual(df.collect(), res.collect())
324
-
325
- def test_vectorized_udf_null_byte(self):
326
- data = [(None,), (2,), (3,), (4,)]
327
- schema = StructType().add("byte", ByteType())
328
- df = self.spark.createDataFrame(data, schema)
329
- for udf_type in [PandasUDFType.SCALAR, PandasUDFType.SCALAR_ITER]:
330
- byte_f = pandas_udf(lambda x: x, ByteType(), udf_type)
331
- res = df.select(byte_f(col("byte")))
332
- self.assertEqual(df.collect(), res.collect())
333
-
334
- def test_vectorized_udf_null_short(self):
335
- data = [(None,), (2,), (3,), (4,)]
336
- schema = StructType().add("short", ShortType())
337
- df = self.spark.createDataFrame(data, schema)
338
- for udf_type in [PandasUDFType.SCALAR, PandasUDFType.SCALAR_ITER]:
339
- short_f = pandas_udf(lambda x: x, ShortType(), udf_type)
340
- res = df.select(short_f(col("short")))
341
- self.assertEqual(df.collect(), res.collect())
342
-
343
- def test_vectorized_udf_null_int(self):
344
- data = [(None,), (2,), (3,), (4,)]
345
- schema = StructType().add("int", IntegerType())
346
- df = self.spark.createDataFrame(data, schema)
347
- for udf_type in [PandasUDFType.SCALAR, PandasUDFType.SCALAR_ITER]:
348
- int_f = pandas_udf(lambda x: x, IntegerType(), udf_type)
349
- res = df.select(int_f(col("int")))
350
- self.assertEqual(df.collect(), res.collect())
351
-
352
- def test_vectorized_udf_null_long(self):
353
- data = [(None,), (2,), (3,), (4,)]
354
- schema = StructType().add("long", LongType())
355
- df = self.spark.createDataFrame(data, schema)
356
- for udf_type in [PandasUDFType.SCALAR, PandasUDFType.SCALAR_ITER]:
357
- long_f = pandas_udf(lambda x: x, LongType(), udf_type)
358
- res = df.select(long_f(col("long")))
359
- self.assertEqual(df.collect(), res.collect())
360
-
361
- def test_vectorized_udf_null_float(self):
362
- data = [(3.0,), (5.0,), (-1.0,), (None,)]
363
- schema = StructType().add("float", FloatType())
364
- df = self.spark.createDataFrame(data, schema)
365
- for udf_type in [PandasUDFType.SCALAR, PandasUDFType.SCALAR_ITER]:
366
- float_f = pandas_udf(lambda x: x, FloatType(), udf_type)
367
- res = df.select(float_f(col("float")))
368
- self.assertEqual(df.collect(), res.collect())
369
-
370
- def test_vectorized_udf_null_double(self):
371
- data = [(3.0,), (5.0,), (-1.0,), (None,)]
372
- schema = StructType().add("double", DoubleType())
373
- df = self.spark.createDataFrame(data, schema)
374
- for udf_type in [PandasUDFType.SCALAR, PandasUDFType.SCALAR_ITER]:
375
- double_f = pandas_udf(lambda x: x, DoubleType(), udf_type)
376
- res = df.select(double_f(col("double")))
377
- self.assertEqual(df.collect(), res.collect())
378
-
379
- def test_vectorized_udf_null_decimal(self):
380
- data = [(Decimal(3.0),), (Decimal(5.0),), (Decimal(-1.0),), (None,)]
381
- schema = StructType().add("decimal", DecimalType(38, 18))
382
- df = self.spark.createDataFrame(data, schema)
383
- for udf_type in [PandasUDFType.SCALAR, PandasUDFType.SCALAR_ITER]:
384
- decimal_f = pandas_udf(lambda x: x, DecimalType(38, 18), udf_type)
385
- res = df.select(decimal_f(col("decimal")))
386
- self.assertEqual(df.collect(), res.collect())
387
-
388
- def test_vectorized_udf_null_string(self):
389
- data = [("foo",), (None,), ("bar",), ("bar",)]
390
- schema = StructType().add("str", StringType())
391
- df = self.spark.createDataFrame(data, schema)
392
- for udf_type in [PandasUDFType.SCALAR, PandasUDFType.SCALAR_ITER]:
393
- str_f = pandas_udf(lambda x: x, StringType(), udf_type)
394
- res = df.select(str_f(col("str")))
395
- self.assertEqual(df.collect(), res.collect())
396
-
397
- def test_vectorized_udf_string_in_udf(self):
398
- df = self.spark.range(10)
399
-
400
- def scalar_f(x):
401
- return pd.Series(map(str, x))
402
-
403
- def iter_f(it):
404
- for i in it:
405
- yield scalar_f(i)
406
-
407
- for f, udf_type in [(scalar_f, PandasUDFType.SCALAR), (iter_f, PandasUDFType.SCALAR_ITER)]:
408
- str_f = pandas_udf(f, StringType(), udf_type)
409
- actual = df.select(str_f(col("id")))
410
- expected = df.select(col("id").cast("string"))
411
- self.assertEqual(expected.collect(), actual.collect())
412
-
413
- def test_vectorized_udf_datatype_string(self):
414
- df = self.spark.range(10).select(
415
- col("id").cast("string").alias("str"),
416
- col("id").cast("int").alias("int"),
417
- col("id").alias("long"),
418
- col("id").cast("float").alias("float"),
419
- col("id").cast("double").alias("double"),
420
- col("id").cast("decimal").alias("decimal"),
421
- col("id").cast("boolean").alias("bool"),
422
- )
423
-
424
- def f(x):
425
- return x
426
-
427
- for udf_type in [PandasUDFType.SCALAR, PandasUDFType.SCALAR_ITER]:
428
- str_f = pandas_udf(f, "string", udf_type)
429
- int_f = pandas_udf(f, "integer", udf_type)
430
- long_f = pandas_udf(f, "long", udf_type)
431
- float_f = pandas_udf(f, "float", udf_type)
432
- double_f = pandas_udf(f, "double", udf_type)
433
- decimal_f = pandas_udf(f, "decimal(38, 18)", udf_type)
434
- bool_f = pandas_udf(f, "boolean", udf_type)
435
- res = df.select(
436
- str_f(col("str")),
437
- int_f(col("int")),
438
- long_f(col("long")),
439
- float_f(col("float")),
440
- double_f(col("double")),
441
- decimal_f("decimal"),
442
- bool_f(col("bool")),
443
- )
444
- self.assertEqual(df.collect(), res.collect())
445
-
446
- def test_vectorized_udf_null_binary(self):
447
- data = [(bytearray(b"a"),), (None,), (bytearray(b"bb"),), (bytearray(b"ccc"),)]
448
- schema = StructType().add("binary", BinaryType())
449
- df = self.spark.createDataFrame(data, schema)
450
- for udf_type in [PandasUDFType.SCALAR, PandasUDFType.SCALAR_ITER]:
451
- str_f = pandas_udf(lambda x: x, BinaryType(), udf_type)
452
- res = df.select(str_f(col("binary")))
453
- self.assertEqual(df.collect(), res.collect())
454
-
455
- def test_vectorized_udf_array_type(self):
456
- data = [([1, 2],), ([3, 4],)]
457
- array_schema = StructType([StructField("array", ArrayType(IntegerType()))])
458
- df = self.spark.createDataFrame(data, schema=array_schema)
459
- for udf_type in [PandasUDFType.SCALAR, PandasUDFType.SCALAR_ITER]:
460
- array_f = pandas_udf(lambda x: x, ArrayType(IntegerType()), udf_type)
461
- result = df.select(array_f(col("array")))
462
- self.assertEqual(df.collect(), result.collect())
463
-
464
- def test_vectorized_udf_null_array(self):
465
- data = [([1, 2],), (None,), (None,), ([3, 4],), (None,)]
466
- array_schema = StructType([StructField("array", ArrayType(IntegerType()))])
467
- df = self.spark.createDataFrame(data, schema=array_schema)
468
- for udf_type in [PandasUDFType.SCALAR, PandasUDFType.SCALAR_ITER]:
469
- array_f = pandas_udf(lambda x: x, ArrayType(IntegerType()), udf_type)
470
- result = df.select(array_f(col("array")))
471
- self.assertEqual(df.collect(), result.collect())
472
-
473
- def test_vectorized_udf_struct_type(self):
474
- df = self.spark.range(10)
475
- return_type = StructType([StructField("id", LongType()), StructField("str", StringType())])
476
-
477
- def scalar_func(id):
478
- return pd.DataFrame({"id": id, "str": id.apply(str)})
479
-
480
- def iter_func(it):
481
- for id in it:
482
- yield scalar_func(id)
483
-
484
- for func, udf_type in [
485
- (scalar_func, PandasUDFType.SCALAR),
486
- (iter_func, PandasUDFType.SCALAR_ITER),
487
- ]:
488
- f = pandas_udf(func, returnType=return_type, functionType=udf_type)
489
-
490
- expected = df.select(
491
- struct(col("id"), col("id").cast("string").alias("str")).alias("struct")
492
- ).collect()
493
-
494
- actual = df.select(f(col("id")).alias("struct")).collect()
495
- self.assertEqual(expected, actual)
496
-
497
- g = pandas_udf(func, "id: long, str: string", functionType=udf_type)
498
- actual = df.select(g(col("id")).alias("struct")).collect()
499
- self.assertEqual(expected, actual)
500
-
501
- struct_f = pandas_udf(lambda x: x, return_type, functionType=udf_type)
502
- actual = df.select(struct_f(struct(col("id"), col("id").cast("string").alias("str"))))
503
- self.assertEqual(expected, actual.collect())
504
-
505
- def test_vectorized_udf_struct_complex(self):
506
- df = self.spark.range(10)
507
- return_type = StructType(
508
- [StructField("ts", TimestampType()), StructField("arr", ArrayType(LongType()))]
509
- )
510
-
511
- def _scalar_f(id):
512
- return pd.DataFrame(
513
- {"ts": id.apply(lambda i: pd.Timestamp(i)), "arr": id.apply(lambda i: [i, i + 1])}
514
- )
515
-
516
- scalar_f = pandas_udf(_scalar_f, returnType=return_type)
517
-
518
- @pandas_udf(returnType=return_type, functionType=PandasUDFType.SCALAR_ITER)
519
- def iter_f(it):
520
- for id in it:
521
- yield _scalar_f(id)
522
-
523
- for f, udf_type in [(scalar_f, PandasUDFType.SCALAR), (iter_f, PandasUDFType.SCALAR_ITER)]:
524
- actual = df.withColumn("f", f(col("id"))).collect()
525
- for i, row in enumerate(actual):
526
- id, f = row
527
- self.assertEqual(i, id)
528
- self.assertEqual(pd.Timestamp(i).to_pydatetime(), f[0])
529
- self.assertListEqual([i, i + 1], f[1])
530
-
531
- def test_vectorized_udf_struct_empty(self):
532
- df = self.spark.range(3)
533
- return_type = StructType()
534
-
535
- def _scalar_f(id):
536
- return pd.DataFrame(index=id)
537
-
538
- scalar_f = pandas_udf(_scalar_f, returnType=return_type)
539
-
540
- @pandas_udf(returnType=return_type, functionType=PandasUDFType.SCALAR_ITER)
541
- def iter_f(it):
542
- for id in it:
543
- yield _scalar_f(id)
544
-
545
- for f, udf_type in [(scalar_f, "SCALAR"), (iter_f, "SCALAR_ITER")]:
546
- with self.subTest(udf_type=udf_type):
547
- assertDataFrameEqual(
548
- df.withColumn("f", f(col("id"))),
549
- [Row(id=0, f=Row()), Row(id=1, f=Row()), Row(id=2, f=Row())],
550
- )
551
-
552
- def test_vectorized_udf_nested_struct(self):
553
- with QuietTest(self.sc):
554
- self.check_vectorized_udf_nested_struct()
555
-
556
- def check_vectorized_udf_nested_struct(self):
557
- df = self.spark.range(2)
558
-
559
- nested_type = StructType(
560
- [
561
- StructField("id", IntegerType()),
562
- StructField(
563
- "nested",
564
- StructType([StructField("foo", StringType()), StructField("bar", FloatType())]),
565
- ),
566
- ]
567
- )
568
-
569
- def func_dict(pser: pd.Series) -> pd.DataFrame:
570
- return pd.DataFrame(
571
- {"id": pser, "nested": pser.apply(lambda x: {"foo": str(x), "bar": float(x)})}
572
- )
573
-
574
- def func_row(pser: pd.Series) -> pd.DataFrame:
575
- return pd.DataFrame(
576
- {"id": pser, "nested": pser.apply(lambda x: Row(foo=str(x), bar=float(x)))}
577
- )
578
-
579
- expected = [
580
- Row(udf=Row(id=0, nested=Row(foo="0", bar=0.0))),
581
- Row(udf=Row(id=1, nested=Row(foo="1", bar=1.0))),
582
- ]
583
-
584
- for f in [func_dict, func_row]:
585
- for udf_type, func in [
586
- (PandasUDFType.SCALAR, f),
587
- (PandasUDFType.SCALAR_ITER, lambda iter: (f(pser) for pser in iter)),
588
- ]:
589
- with self.subTest(udf_type=udf_type, udf=f.__name__):
590
- result = df.select(
591
- pandas_udf(func, returnType=nested_type, functionType=udf_type)(
592
- col("id")
593
- ).alias("udf")
594
- ).collect()
595
- self.assertEqual(result, expected)
596
-
597
- def test_vectorized_udf_map_type(self):
598
- data = [({},), ({"a": 1},), ({"a": 1, "b": 2},), ({"a": 1, "b": 2, "c": 3},)]
599
- schema = StructType([StructField("map", MapType(StringType(), LongType()))])
600
- df = self.spark.createDataFrame(data, schema=schema)
601
- for udf_type in [PandasUDFType.SCALAR, PandasUDFType.SCALAR_ITER]:
602
- if LooseVersion(pa.__version__) < LooseVersion("2.0.0"):
603
- with QuietTest(self.sc):
604
- with self.assertRaisesRegex(Exception, "MapType.*not supported"):
605
- pandas_udf(lambda x: x, MapType(StringType(), LongType()), udf_type)
606
- else:
607
- map_f = pandas_udf(lambda x: x, MapType(StringType(), LongType()), udf_type)
608
- result = df.select(map_f(col("map")))
609
- self.assertEqual(df.collect(), result.collect())
610
-
611
- def test_vectorized_udf_complex(self):
612
- df = self.spark.range(10).select(
613
- col("id").cast("int").alias("a"),
614
- col("id").cast("int").alias("b"),
615
- col("id").cast("double").alias("c"),
616
- )
617
- scalar_add = pandas_udf(lambda x, y: x + y, IntegerType())
618
- scalar_power2 = pandas_udf(lambda x: 2**x, IntegerType())
619
- scalar_mul = pandas_udf(lambda x, y: x * y, DoubleType())
620
-
621
- @pandas_udf(IntegerType(), PandasUDFType.SCALAR_ITER)
622
- def iter_add(it):
623
- for x, y in it:
624
- yield x + y
625
-
626
- @pandas_udf(IntegerType(), PandasUDFType.SCALAR_ITER)
627
- def iter_power2(it):
628
- for x in it:
629
- yield 2**x
630
-
631
- @pandas_udf(DoubleType(), PandasUDFType.SCALAR_ITER)
632
- def iter_mul(it):
633
- for x, y in it:
634
- yield x * y
635
-
636
- for add, power2, mul in [
637
- (scalar_add, scalar_power2, scalar_mul),
638
- (iter_add, iter_power2, iter_mul),
639
- ]:
640
- res = df.select(add(col("a"), col("b")), power2(col("a")), mul(col("b"), col("c")))
641
- expected = df.select(expr("a + b"), expr("power(2, a)"), expr("b * c"))
642
- self.assertEqual(expected.collect(), res.collect())
643
-
644
- def test_vectorized_udf_exception(self):
645
- with QuietTest(self.sc):
646
- self.check_vectorized_udf_exception()
647
-
648
- def check_vectorized_udf_exception(self):
649
- df = self.spark.range(10)
650
- scalar_raise_exception = pandas_udf(lambda x: x * (1 / 0), LongType())
651
-
652
- @pandas_udf(LongType(), PandasUDFType.SCALAR_ITER)
653
- def iter_raise_exception(it):
654
- for x in it:
655
- yield x * (1 / 0)
656
-
657
- for raise_exception in [scalar_raise_exception, iter_raise_exception]:
658
- with self.assertRaisesRegex(Exception, "division( or modulo)? by zero"):
659
- df.select(raise_exception(col("id"))).collect()
660
-
661
- def test_vectorized_udf_invalid_length(self):
662
- with QuietTest(self.sc):
663
- self.check_vectorized_udf_invalid_length()
664
-
665
- def check_vectorized_udf_invalid_length(self):
666
- df = self.spark.range(10)
667
- raise_exception = pandas_udf(lambda _: pd.Series(1), LongType())
668
- with self.assertRaisesRegex(
669
- Exception, "Result vector from pandas_udf was not the required length"
670
- ):
671
- df.select(raise_exception(col("id"))).collect()
672
-
673
- @pandas_udf(LongType(), PandasUDFType.SCALAR_ITER)
674
- def iter_udf_wong_output_size(it):
675
- for _ in it:
676
- yield pd.Series(1)
677
-
678
- with self.assertRaisesRegex(
679
- Exception, "The length of output in Scalar iterator.*" "the length of output was 1"
680
- ):
681
- df.select(iter_udf_wong_output_size(col("id"))).collect()
682
-
683
- @pandas_udf(LongType(), PandasUDFType.SCALAR_ITER)
684
- def iter_udf_not_reading_all_input(it):
685
- for batch in it:
686
- batch_len = len(batch)
687
- yield pd.Series([1] * batch_len)
688
- break
689
-
690
- with self.sql_conf({"spark.sql.execution.arrow.maxRecordsPerBatch": 3}):
691
- df1 = self.spark.range(10).repartition(1)
692
- with self.assertRaisesRegex(Exception, "pandas iterator UDF should exhaust"):
693
- df1.select(iter_udf_not_reading_all_input(col("id"))).collect()
694
-
695
- def test_vectorized_udf_chained(self):
696
- df = self.spark.range(10)
697
- scalar_f = pandas_udf(lambda x: x + 1, LongType())
698
- scalar_g = pandas_udf(lambda x: x - 1, LongType())
699
-
700
- iter_f = pandas_udf(
701
- lambda it: map(lambda x: x + 1, it), LongType(), PandasUDFType.SCALAR_ITER
702
- )
703
- iter_g = pandas_udf(
704
- lambda it: map(lambda x: x - 1, it), LongType(), PandasUDFType.SCALAR_ITER
705
- )
706
-
707
- for f, g in [(scalar_f, scalar_g), (iter_f, iter_g)]:
708
- res = df.select(g(f(col("id"))))
709
- self.assertEqual(df.collect(), res.collect())
710
-
711
- def test_vectorized_udf_chained_struct_type(self):
712
- df = self.spark.range(10)
713
- return_type = StructType([StructField("id", LongType()), StructField("str", StringType())])
714
-
715
- @pandas_udf(return_type)
716
- def scalar_f(id):
717
- return pd.DataFrame({"id": id, "str": id.apply(str)})
718
-
719
- scalar_g = pandas_udf(lambda x: x, return_type)
720
-
721
- @pandas_udf(return_type, PandasUDFType.SCALAR_ITER)
722
- def iter_f(it):
723
- for id in it:
724
- yield pd.DataFrame({"id": id, "str": id.apply(str)})
725
-
726
- iter_g = pandas_udf(lambda x: x, return_type, PandasUDFType.SCALAR_ITER)
727
-
728
- expected = df.select(
729
- struct(col("id"), col("id").cast("string").alias("str")).alias("struct")
730
- ).collect()
731
-
732
- for f, g in [(scalar_f, scalar_g), (iter_f, iter_g)]:
733
- actual = df.select(g(f(col("id"))).alias("struct")).collect()
734
- self.assertEqual(expected, actual)
735
-
736
- def test_vectorized_udf_wrong_return_type(self):
737
- with QuietTest(self.sc):
738
- self.check_vectorized_udf_wrong_return_type()
739
-
740
- def check_vectorized_udf_wrong_return_type(self):
741
- for udf_type in [PandasUDFType.SCALAR, PandasUDFType.SCALAR_ITER]:
742
- with self.assertRaisesRegex(
743
- NotImplementedError,
744
- "Invalid return type.*scalar Pandas UDF.*ArrayType.*YearMonthIntervalType",
745
- ):
746
- pandas_udf(lambda x: x, ArrayType(YearMonthIntervalType()), udf_type)
747
-
748
- def test_vectorized_udf_return_scalar(self):
749
- with QuietTest(self.sc):
750
- self.check_vectorized_udf_return_scalar()
751
-
752
- def check_vectorized_udf_return_scalar(self):
753
- df = self.spark.range(10)
754
- scalar_f = pandas_udf(lambda x: 1.0, DoubleType())
755
- iter_f = pandas_udf(
756
- lambda it: map(lambda x: 1.0, it), DoubleType(), PandasUDFType.SCALAR_ITER
757
- )
758
- for f in [scalar_f, iter_f]:
759
- with self.assertRaisesRegex(Exception, "Return.*type.*Series"):
760
- df.select(f(col("id"))).collect()
761
-
762
- def test_vectorized_udf_decorator(self):
763
- df = self.spark.range(10)
764
-
765
- @pandas_udf(returnType=LongType())
766
- def scalar_identity(x):
767
- return x
768
-
769
- @pandas_udf(returnType=LongType(), functionType=PandasUDFType.SCALAR_ITER)
770
- def iter_identity(x):
771
- return x
772
-
773
- for identity in [scalar_identity, iter_identity]:
774
- res = df.select(identity(col("id")))
775
- self.assertEqual(df.collect(), res.collect())
776
-
777
- def test_vectorized_udf_empty_partition(self):
778
- df = self.spark.createDataFrame(self.sc.parallelize([Row(id=1)], 2))
779
- for udf_type in [PandasUDFType.SCALAR, PandasUDFType.SCALAR_ITER]:
780
- f = pandas_udf(lambda x: x, LongType(), udf_type)
781
- res = df.select(f(col("id")))
782
- self.assertEqual(df.collect(), res.collect())
783
-
784
- def test_vectorized_udf_struct_with_empty_partition(self):
785
- df = self.spark.createDataFrame(self.sc.parallelize([Row(id=1)], 2)).withColumn(
786
- "name", lit("John Doe")
787
- )
788
-
789
- @pandas_udf("first string, last string")
790
- def scalar_split_expand(n):
791
- return n.str.split(expand=True)
792
-
793
- @pandas_udf("first string, last string", PandasUDFType.SCALAR_ITER)
794
- def iter_split_expand(it):
795
- for n in it:
796
- yield n.str.split(expand=True)
797
-
798
- for split_expand in [scalar_split_expand, iter_split_expand]:
799
- result = df.select(split_expand("name")).collect()
800
- self.assertEqual(1, len(result))
801
- row = result[0]
802
- self.assertEqual("John", row[0]["first"])
803
- self.assertEqual("Doe", row[0]["last"])
804
-
805
- def test_vectorized_udf_varargs(self):
806
- df = self.spark.range(start=1, end=2)
807
- scalar_f = pandas_udf(lambda *v: v[0], LongType())
808
-
809
- @pandas_udf(LongType(), PandasUDFType.SCALAR_ITER)
810
- def iter_f(it):
811
- for v in it:
812
- yield v[0]
813
-
814
- for f in [scalar_f, iter_f]:
815
- res = df.select(f(col("id"), col("id")))
816
- self.assertEqual(df.collect(), res.collect())
817
-
818
- def test_vectorized_udf_dates(self):
819
- schema = StructType().add("idx", LongType()).add("date", DateType())
820
- data = [
821
- (
822
- 0,
823
- date(1969, 1, 1),
824
- ),
825
- (
826
- 1,
827
- date(2012, 2, 2),
828
- ),
829
- (
830
- 2,
831
- None,
832
- ),
833
- (
834
- 3,
835
- date(2100, 4, 4),
836
- ),
837
- (
838
- 4,
839
- date(2262, 4, 12),
840
- ),
841
- ]
842
- df = self.spark.createDataFrame(data, schema=schema)
843
-
844
- def scalar_check_data(idx, date, date_copy):
845
- msgs = []
846
- is_equal = date.isnull()
847
- for i in range(len(idx)):
848
- if (is_equal[i] and data[idx[i]][1] is None) or date[i] == data[idx[i]][1]:
849
- msgs.append(None)
850
- else:
851
- msgs.append(
852
- "date values are not equal (date='%s': data[%d][1]='%s')"
853
- % (date[i], idx[i], data[idx[i]][1])
854
- )
855
- return pd.Series(msgs)
856
-
857
- def iter_check_data(it):
858
- for idx, test_date, date_copy in it:
859
- yield scalar_check_data(idx, test_date, date_copy)
860
-
861
- pandas_scalar_check_data = pandas_udf(scalar_check_data, StringType())
862
- pandas_iter_check_data = pandas_udf(
863
- iter_check_data, StringType(), PandasUDFType.SCALAR_ITER
864
- )
865
-
866
- for check_data, udf_type in [
867
- (pandas_scalar_check_data, PandasUDFType.SCALAR),
868
- (pandas_iter_check_data, PandasUDFType.SCALAR_ITER),
869
- ]:
870
- date_copy = pandas_udf(lambda t: t, returnType=DateType(), functionType=udf_type)
871
- df = df.withColumn("date_copy", date_copy(col("date")))
872
- result = df.withColumn(
873
- "check_data", check_data(col("idx"), col("date"), col("date_copy"))
874
- ).collect()
875
-
876
- self.assertEqual(len(data), len(result))
877
- for i in range(len(result)):
878
- self.assertEqual(data[i][1], result[i][1]) # "date" col
879
- self.assertEqual(data[i][1], result[i][2]) # "date_copy" col
880
- self.assertIsNone(result[i][3]) # "check_data" col
881
-
882
- def test_vectorized_udf_timestamps(self):
883
- schema = StructType(
884
- [StructField("idx", LongType(), True), StructField("timestamp", TimestampType(), True)]
885
- )
886
- data = [
887
- (0, datetime(1969, 1, 1, 1, 1, 1)),
888
- (1, datetime(2012, 2, 2, 2, 2, 2)),
889
- (2, None),
890
- (3, datetime(2100, 3, 3, 3, 3, 3)),
891
- ]
892
-
893
- df = self.spark.createDataFrame(data, schema=schema)
894
-
895
- def scalar_check_data(idx, timestamp, timestamp_copy):
896
- msgs = []
897
- is_equal = timestamp.isnull() # use this array to check values are equal
898
- for i in range(len(idx)):
899
- # Check that timestamps are as expected in the UDF
900
- if (is_equal[i] and data[idx[i]][1] is None) or timestamp[
901
- i
902
- ].to_pydatetime() == data[idx[i]][1]:
903
- msgs.append(None)
904
- else:
905
- msgs.append(
906
- "timestamp values are not equal (timestamp='%s': data[%d][1]='%s')"
907
- % (timestamp[i], idx[i], data[idx[i]][1])
908
- )
909
- return pd.Series(msgs)
910
-
911
- def iter_check_data(it):
912
- for idx, timestamp, timestamp_copy in it:
913
- yield scalar_check_data(idx, timestamp, timestamp_copy)
914
-
915
- pandas_scalar_check_data = pandas_udf(scalar_check_data, StringType())
916
- pandas_iter_check_data = pandas_udf(
917
- iter_check_data, StringType(), PandasUDFType.SCALAR_ITER
918
- )
919
-
920
- for check_data, udf_type in [
921
- (pandas_scalar_check_data, PandasUDFType.SCALAR),
922
- (pandas_iter_check_data, PandasUDFType.SCALAR_ITER),
923
- ]:
924
- # Check that a timestamp passed through a pandas_udf will not be altered by timezone
925
- # calc
926
- f_timestamp_copy = pandas_udf(
927
- lambda t: t, returnType=TimestampType(), functionType=udf_type
928
- )
929
- df = df.withColumn("timestamp_copy", f_timestamp_copy(col("timestamp")))
930
- result = df.withColumn(
931
- "check_data", check_data(col("idx"), col("timestamp"), col("timestamp_copy"))
932
- ).collect()
933
- # Check that collection values are correct
934
- self.assertEqual(len(data), len(result))
935
- for i in range(len(result)):
936
- self.assertEqual(data[i][1], result[i][1]) # "timestamp" col
937
- self.assertEqual(data[i][1], result[i][2]) # "timestamp_copy" col
938
- self.assertIsNone(result[i][3]) # "check_data" col
939
-
940
- def test_vectorized_udf_return_timestamp_tz(self):
941
- df = self.spark.range(10)
942
-
943
- @pandas_udf(returnType=TimestampType())
944
- def scalar_gen_timestamps(id):
945
- ts = [pd.Timestamp(i, unit="D", tz="America/Los_Angeles") for i in id]
946
- return pd.Series(ts)
947
-
948
- @pandas_udf(returnType=TimestampType(), functionType=PandasUDFType.SCALAR_ITER)
949
- def iter_gen_timestamps(it):
950
- for id in it:
951
- ts = [pd.Timestamp(i, unit="D", tz="America/Los_Angeles") for i in id]
952
- yield pd.Series(ts)
953
-
954
- for gen_timestamps in [scalar_gen_timestamps, iter_gen_timestamps]:
955
- result = df.withColumn("ts", gen_timestamps(col("id"))).collect()
956
- spark_ts_t = TimestampType()
957
- for r in result:
958
- i, ts = r
959
- ts_tz = pd.Timestamp(i, unit="D", tz="America/Los_Angeles").to_pydatetime()
960
- expected = spark_ts_t.fromInternal(spark_ts_t.toInternal(ts_tz))
961
- self.assertEqual(expected, ts)
962
-
963
- def test_vectorized_udf_check_config(self):
964
- with self.sql_conf({"spark.sql.execution.arrow.maxRecordsPerBatch": 3}):
965
- df = self.spark.range(10, numPartitions=1)
966
-
967
- @pandas_udf(returnType=LongType())
968
- def scalar_check_records_per_batch(x):
969
- return pd.Series(x.size).repeat(x.size)
970
-
971
- @pandas_udf(returnType=LongType(), functionType=PandasUDFType.SCALAR_ITER)
972
- def iter_check_records_per_batch(it):
973
- for x in it:
974
- yield pd.Series(x.size).repeat(x.size)
975
-
976
- for check_records_per_batch in [
977
- scalar_check_records_per_batch,
978
- iter_check_records_per_batch,
979
- ]:
980
- result = df.select(check_records_per_batch(col("id"))).collect()
981
- for (r,) in result:
982
- self.assertTrue(r <= 3)
983
-
984
- def test_vectorized_udf_timestamps_respect_session_timezone(self):
985
- schema = StructType(
986
- [StructField("idx", LongType(), True), StructField("timestamp", TimestampType(), True)]
987
- )
988
- data = [
989
- (1, datetime(1969, 1, 1, 1, 1, 1)),
990
- (2, datetime(2012, 2, 2, 2, 2, 2)),
991
- (3, None),
992
- (4, datetime(2100, 3, 3, 3, 3, 3)),
993
- ]
994
- df = self.spark.createDataFrame(data, schema=schema)
995
-
996
- scalar_internal_value = pandas_udf(
997
- lambda ts: ts.apply(lambda ts: ts.value if ts is not pd.NaT else None), LongType()
998
- )
999
-
1000
- @pandas_udf(LongType(), PandasUDFType.SCALAR_ITER)
1001
- def iter_internal_value(it):
1002
- for ts in it:
1003
- yield ts.apply(lambda ts: ts.value if ts is not pd.NaT else None)
1004
-
1005
- for internal_value, udf_type in [
1006
- (scalar_internal_value, PandasUDFType.SCALAR),
1007
- (iter_internal_value, PandasUDFType.SCALAR_ITER),
1008
- ]:
1009
- f_timestamp_copy = pandas_udf(lambda ts: ts, TimestampType(), udf_type)
1010
- timezone = "America/Los_Angeles"
1011
- with self.sql_conf({"spark.sql.session.timeZone": timezone}):
1012
- df_la = df.withColumn("tscopy", f_timestamp_copy(col("timestamp"))).withColumn(
1013
- "internal_value", internal_value(col("timestamp"))
1014
- )
1015
- result_la = df_la.select(col("idx"), col("internal_value")).collect()
1016
- # Correct result_la by adjusting 3 hours difference between Los Angeles and New York
1017
- diff = 3 * 60 * 60 * 1000 * 1000 * 1000
1018
- result_la_corrected = df_la.select(
1019
- col("idx"), col("tscopy"), col("internal_value") + diff
1020
- ).collect()
1021
-
1022
- timezone = "America/New_York"
1023
- with self.sql_conf({"spark.sql.session.timeZone": timezone}):
1024
- df_ny = df.withColumn("tscopy", f_timestamp_copy(col("timestamp"))).withColumn(
1025
- "internal_value", internal_value(col("timestamp"))
1026
- )
1027
- result_ny = df_ny.select(col("idx"), col("tscopy"), col("internal_value")).collect()
1028
-
1029
- self.assertNotEqual(result_ny, result_la)
1030
- self.assertEqual(result_ny, result_la_corrected)
1031
-
1032
- def test_nondeterministic_vectorized_udf(self):
1033
- # Test that nondeterministic UDFs are evaluated only once in chained UDF evaluations
1034
- @pandas_udf("double")
1035
- def scalar_plus_ten(v):
1036
- return v + 10
1037
-
1038
- @pandas_udf("double", PandasUDFType.SCALAR_ITER)
1039
- def iter_plus_ten(it):
1040
- for v in it:
1041
- yield v + 10
1042
-
1043
- for plus_ten in [scalar_plus_ten, iter_plus_ten]:
1044
- random_udf = self.nondeterministic_vectorized_udf
1045
-
1046
- df = self.spark.range(10).withColumn("rand", random_udf(col("id")))
1047
- result1 = df.withColumn("plus_ten(rand)", plus_ten(df["rand"])).toPandas()
1048
-
1049
- self.assertEqual(random_udf.deterministic, False)
1050
- self.assertTrue(result1["plus_ten(rand)"].equals(result1["rand"] + 10))
1051
-
1052
- def test_nondeterministic_vectorized_udf_in_aggregate(self):
1053
- with QuietTest(self.sc):
1054
- self.check_nondeterministic_analysis_exception()
1055
-
1056
- def check_nondeterministic_analysis_exception(self):
1057
- df = self.spark.range(10)
1058
- for random_udf in [
1059
- self.nondeterministic_vectorized_udf,
1060
- self.nondeterministic_vectorized_iter_udf,
1061
- ]:
1062
- with self.assertRaisesRegex(AnalysisException, "Non-deterministic"):
1063
- df.groupby(df.id).agg(sum(random_udf(df.id))).collect()
1064
- with self.assertRaisesRegex(AnalysisException, "Non-deterministic"):
1065
- df.agg(sum(random_udf(df.id))).collect()
1066
-
1067
- def test_register_vectorized_udf_basic(self):
1068
- df = self.spark.range(10).select(
1069
- col("id").cast("int").alias("a"), col("id").cast("int").alias("b")
1070
- )
1071
- scalar_original_add = pandas_udf(lambda x, y: x + y, IntegerType())
1072
- self.assertEqual(scalar_original_add.evalType, PythonEvalType.SQL_SCALAR_PANDAS_UDF)
1073
-
1074
- @pandas_udf(IntegerType(), PandasUDFType.SCALAR_ITER)
1075
- def iter_original_add(it):
1076
- for x, y in it:
1077
- yield x + y
1078
-
1079
- self.assertEqual(iter_original_add.evalType, PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF)
1080
-
1081
- for original_add in [scalar_original_add, iter_original_add]:
1082
- self.assertEqual(original_add.deterministic, True)
1083
- new_add = self.spark.catalog.registerFunction("add1", original_add)
1084
- res1 = df.select(new_add(col("a"), col("b")))
1085
- res2 = self.spark.sql(
1086
- "SELECT add1(t.a, t.b) FROM (SELECT id as a, id as b FROM range(10)) t"
1087
- )
1088
- expected = df.select(expr("a + b"))
1089
- self.assertEqual(expected.collect(), res1.collect())
1090
- self.assertEqual(expected.collect(), res2.collect())
1091
-
1092
- def test_scalar_iter_udf_init(self):
1093
- import numpy as np
1094
-
1095
- @pandas_udf("int", PandasUDFType.SCALAR_ITER)
1096
- def rng(batch_iter):
1097
- context = TaskContext.get()
1098
- part = context.partitionId()
1099
- np.random.seed(part)
1100
- for batch in batch_iter:
1101
- yield pd.Series(np.random.randint(100, size=len(batch)))
1102
-
1103
- with self.sql_conf({"spark.sql.execution.arrow.maxRecordsPerBatch": 2}):
1104
- df = self.spark.range(10, numPartitions=2).select(rng(col("id").alias("v")))
1105
- result1 = df.collect()
1106
- result2 = df.collect()
1107
- self.assertEqual(
1108
- result1,
1109
- result2,
1110
- "SCALAR ITER UDF can initialize state and produce deterministic RNG",
1111
- )
1112
-
1113
- def test_scalar_iter_udf_close(self):
1114
- with QuietTest(self.sc):
1115
- self.check_scalar_iter_udf_close()
1116
-
1117
- def check_scalar_iter_udf_close(self):
1118
- @pandas_udf("int", PandasUDFType.SCALAR_ITER)
1119
- def test_close(batch_iter):
1120
- try:
1121
- for batch in batch_iter:
1122
- yield batch
1123
- finally:
1124
- raise RuntimeError("reached finally block")
1125
-
1126
- with self.assertRaisesRegex(Exception, "reached finally block"):
1127
- self.spark.range(1).select(test_close(col("id"))).collect()
1128
-
1129
- @unittest.skip("LimitPushDown should push limits through Python UDFs so this won't occur")
1130
- def test_scalar_iter_udf_close_early(self):
1131
- tmp_dir = tempfile.mkdtemp()
1132
- try:
1133
- tmp_file = tmp_dir + "/reach_finally_block"
1134
-
1135
- @pandas_udf("int", PandasUDFType.SCALAR_ITER)
1136
- def test_close(batch_iter):
1137
- generator_exit_caught = False
1138
- try:
1139
- for batch in batch_iter:
1140
- yield batch
1141
- time.sleep(1.0) # avoid the function finish too fast.
1142
- except GeneratorExit as ge:
1143
- generator_exit_caught = True
1144
- raise ge
1145
- finally:
1146
- assert generator_exit_caught, "Generator exit exception was not caught."
1147
- open(tmp_file, "a").close()
1148
-
1149
- with QuietTest(self.sc):
1150
- with self.sql_conf(
1151
- {
1152
- "spark.sql.execution.arrow.maxRecordsPerBatch": 1,
1153
- "spark.sql.execution.pandas.udf.buffer.size": 4,
1154
- }
1155
- ):
1156
- self.spark.range(10).repartition(1).select(test_close(col("id"))).limit(
1157
- 2
1158
- ).collect()
1159
- # wait here because python udf worker will take some time to detect
1160
- # jvm side socket closed and then will trigger `GenerateExit` raised.
1161
- # wait timeout is 10s.
1162
- for i in range(100):
1163
- time.sleep(0.1)
1164
- if os.path.exists(tmp_file):
1165
- break
1166
-
1167
- assert os.path.exists(tmp_file), "finally block not reached."
1168
-
1169
- finally:
1170
- shutil.rmtree(tmp_dir)
1171
-
1172
- # Regression test for SPARK-23314
1173
- def test_timestamp_dst(self):
1174
- # Daylight saving time for Los Angeles for 2015 is Sun, Nov 1 at 2:00 am
1175
- dt = [
1176
- datetime(2015, 11, 1, 0, 30),
1177
- datetime(2015, 11, 1, 1, 30),
1178
- datetime(2015, 11, 1, 2, 30),
1179
- ]
1180
- df = self.spark.createDataFrame(dt, "timestamp").toDF("time")
1181
-
1182
- for udf_type in [PandasUDFType.SCALAR, PandasUDFType.SCALAR_ITER]:
1183
- foo_udf = pandas_udf(lambda x: x, "timestamp", udf_type)
1184
- result = df.withColumn("time", foo_udf(df.time))
1185
- self.assertEqual(df.collect(), result.collect())
1186
-
1187
- def test_udf_category_type(self):
1188
- @pandas_udf("string")
1189
- def to_category_func(x):
1190
- return x.astype("category")
1191
-
1192
- pdf = pd.DataFrame({"A": ["a", "b", "c", "a"]})
1193
- df = self.spark.createDataFrame(pdf)
1194
- df = df.withColumn("B", to_category_func(df["A"]))
1195
- result_spark = df.toPandas()
1196
-
1197
- spark_type = df.dtypes[1][1]
1198
- # spark data frame and arrow execution mode enabled data frame type must match pandas
1199
- self.assertEqual(spark_type, "string")
1200
-
1201
- # Check result of column 'B' must be equal to column 'A' in type and values
1202
- pd.testing.assert_series_equal(result_spark["A"], result_spark["B"], check_names=False)
1203
-
1204
- def test_type_annotation(self):
1205
- # Regression test to check if type hints can be used. See SPARK-23569.
1206
- def noop(col: pd.Series) -> pd.Series:
1207
- return col
1208
-
1209
- df = self.spark.range(1).select(pandas_udf(f=noop, returnType="bigint")("id"))
1210
- self.assertEqual(df.first()[0], 0)
1211
-
1212
- def test_mixed_udf(self):
1213
- df = self.spark.range(0, 1).toDF("v")
1214
-
1215
- # Test mixture of multiple UDFs and Pandas UDFs.
1216
-
1217
- @udf("int")
1218
- def f1(x):
1219
- assert type(x) == int
1220
- return x + 1
1221
-
1222
- @pandas_udf("int")
1223
- def f2_scalar(x):
1224
- assert type(x) == pd.Series
1225
- return x + 10
1226
-
1227
- @pandas_udf("int", PandasUDFType.SCALAR_ITER)
1228
- def f2_iter(it):
1229
- for x in it:
1230
- assert type(x) == pd.Series
1231
- yield x + 10
1232
-
1233
- @udf("int")
1234
- def f3(x):
1235
- assert type(x) == int
1236
- return x + 100
1237
-
1238
- @pandas_udf("int")
1239
- def f4_scalar(x):
1240
- assert type(x) == pd.Series
1241
- return x + 1000
1242
-
1243
- @pandas_udf("int", PandasUDFType.SCALAR_ITER)
1244
- def f4_iter(it):
1245
- for x in it:
1246
- assert type(x) == pd.Series
1247
- yield x + 1000
1248
-
1249
- expected_chained_1 = df.withColumn("f2_f1", df["v"] + 11).collect()
1250
- expected_chained_2 = df.withColumn("f3_f2_f1", df["v"] + 111).collect()
1251
- expected_chained_3 = df.withColumn("f4_f3_f2_f1", df["v"] + 1111).collect()
1252
- expected_chained_4 = df.withColumn("f4_f2_f1", df["v"] + 1011).collect()
1253
- expected_chained_5 = df.withColumn("f4_f3_f1", df["v"] + 1101).collect()
1254
-
1255
- expected_multi = (
1256
- df.withColumn("f1", df["v"] + 1)
1257
- .withColumn("f2", df["v"] + 10)
1258
- .withColumn("f3", df["v"] + 100)
1259
- .withColumn("f4", df["v"] + 1000)
1260
- .withColumn("f2_f1", df["v"] + 11)
1261
- .withColumn("f3_f1", df["v"] + 101)
1262
- .withColumn("f4_f1", df["v"] + 1001)
1263
- .withColumn("f3_f2", df["v"] + 110)
1264
- .withColumn("f4_f2", df["v"] + 1010)
1265
- .withColumn("f4_f3", df["v"] + 1100)
1266
- .withColumn("f3_f2_f1", df["v"] + 111)
1267
- .withColumn("f4_f2_f1", df["v"] + 1011)
1268
- .withColumn("f4_f3_f1", df["v"] + 1101)
1269
- .withColumn("f4_f3_f2", df["v"] + 1110)
1270
- .withColumn("f4_f3_f2_f1", df["v"] + 1111)
1271
- .collect()
1272
- )
1273
-
1274
- for f2, f4 in [
1275
- (f2_scalar, f4_scalar),
1276
- (f2_scalar, f4_iter),
1277
- (f2_iter, f4_scalar),
1278
- (f2_iter, f4_iter),
1279
- ]:
1280
- # Test single expression with chained UDFs
1281
- df_chained_1 = df.withColumn("f2_f1", f2(f1(df["v"])))
1282
- df_chained_2 = df.withColumn("f3_f2_f1", f3(f2(f1(df["v"]))))
1283
- df_chained_3 = df.withColumn("f4_f3_f2_f1", f4(f3(f2(f1(df["v"])))))
1284
- df_chained_4 = df.withColumn("f4_f2_f1", f4(f2(f1(df["v"]))))
1285
- df_chained_5 = df.withColumn("f4_f3_f1", f4(f3(f1(df["v"]))))
1286
-
1287
- self.assertEqual(expected_chained_1, df_chained_1.collect())
1288
- self.assertEqual(expected_chained_2, df_chained_2.collect())
1289
- self.assertEqual(expected_chained_3, df_chained_3.collect())
1290
- self.assertEqual(expected_chained_4, df_chained_4.collect())
1291
- self.assertEqual(expected_chained_5, df_chained_5.collect())
1292
-
1293
- # Test multiple mixed UDF expressions in a single projection
1294
- df_multi_1 = (
1295
- df.withColumn("f1", f1(col("v")))
1296
- .withColumn("f2", f2(col("v")))
1297
- .withColumn("f3", f3(col("v")))
1298
- .withColumn("f4", f4(col("v")))
1299
- .withColumn("f2_f1", f2(col("f1")))
1300
- .withColumn("f3_f1", f3(col("f1")))
1301
- .withColumn("f4_f1", f4(col("f1")))
1302
- .withColumn("f3_f2", f3(col("f2")))
1303
- .withColumn("f4_f2", f4(col("f2")))
1304
- .withColumn("f4_f3", f4(col("f3")))
1305
- .withColumn("f3_f2_f1", f3(col("f2_f1")))
1306
- .withColumn("f4_f2_f1", f4(col("f2_f1")))
1307
- .withColumn("f4_f3_f1", f4(col("f3_f1")))
1308
- .withColumn("f4_f3_f2", f4(col("f3_f2")))
1309
- .withColumn("f4_f3_f2_f1", f4(col("f3_f2_f1")))
1310
- )
1311
-
1312
- # Test mixed udfs in a single expression
1313
- df_multi_2 = (
1314
- df.withColumn("f1", f1(col("v")))
1315
- .withColumn("f2", f2(col("v")))
1316
- .withColumn("f3", f3(col("v")))
1317
- .withColumn("f4", f4(col("v")))
1318
- .withColumn("f2_f1", f2(f1(col("v"))))
1319
- .withColumn("f3_f1", f3(f1(col("v"))))
1320
- .withColumn("f4_f1", f4(f1(col("v"))))
1321
- .withColumn("f3_f2", f3(f2(col("v"))))
1322
- .withColumn("f4_f2", f4(f2(col("v"))))
1323
- .withColumn("f4_f3", f4(f3(col("v"))))
1324
- .withColumn("f3_f2_f1", f3(f2(f1(col("v")))))
1325
- .withColumn("f4_f2_f1", f4(f2(f1(col("v")))))
1326
- .withColumn("f4_f3_f1", f4(f3(f1(col("v")))))
1327
- .withColumn("f4_f3_f2", f4(f3(f2(col("v")))))
1328
- .withColumn("f4_f3_f2_f1", f4(f3(f2(f1(col("v"))))))
1329
- )
1330
-
1331
- self.assertEqual(expected_multi, df_multi_1.collect())
1332
- self.assertEqual(expected_multi, df_multi_2.collect())
1333
-
1334
- def test_mixed_udf_and_sql(self):
1335
- from pyspark.sql.connect.column import Column as ConnectColumn
1336
-
1337
- df = self.spark.range(0, 1).toDF("v")
1338
-
1339
- # Test mixture of UDFs, Pandas UDFs and SQL expression.
1340
-
1341
- @udf("int")
1342
- def f1(x):
1343
- assert type(x) == int
1344
- return x + 1
1345
-
1346
- def f2(x):
1347
- assert type(x) in (Column, ConnectColumn)
1348
- return x + 10
1349
-
1350
- @pandas_udf("int")
1351
- def f3s(x):
1352
- assert type(x) == pd.Series
1353
- return x + 100
1354
-
1355
- @pandas_udf("int", PandasUDFType.SCALAR_ITER)
1356
- def f3i(it):
1357
- for x in it:
1358
- assert type(x) == pd.Series
1359
- yield x + 100
1360
-
1361
- expected = (
1362
- df.withColumn("f1", df["v"] + 1)
1363
- .withColumn("f2", df["v"] + 10)
1364
- .withColumn("f3", df["v"] + 100)
1365
- .withColumn("f1_f2", df["v"] + 11)
1366
- .withColumn("f1_f3", df["v"] + 101)
1367
- .withColumn("f2_f1", df["v"] + 11)
1368
- .withColumn("f2_f3", df["v"] + 110)
1369
- .withColumn("f3_f1", df["v"] + 101)
1370
- .withColumn("f3_f2", df["v"] + 110)
1371
- .withColumn("f1_f2_f3", df["v"] + 111)
1372
- .withColumn("f1_f3_f2", df["v"] + 111)
1373
- .withColumn("f2_f1_f3", df["v"] + 111)
1374
- .withColumn("f2_f3_f1", df["v"] + 111)
1375
- .withColumn("f3_f1_f2", df["v"] + 111)
1376
- .withColumn("f3_f2_f1", df["v"] + 111)
1377
- .collect()
1378
- )
1379
-
1380
- for f3 in [f3s, f3i]:
1381
- df1 = (
1382
- df.withColumn("f1", f1(df["v"]))
1383
- .withColumn("f2", f2(df["v"]))
1384
- .withColumn("f3", f3(df["v"]))
1385
- .withColumn("f1_f2", f1(f2(df["v"])))
1386
- .withColumn("f1_f3", f1(f3(df["v"])))
1387
- .withColumn("f2_f1", f2(f1(df["v"])))
1388
- .withColumn("f2_f3", f2(f3(df["v"])))
1389
- .withColumn("f3_f1", f3(f1(df["v"])))
1390
- .withColumn("f3_f2", f3(f2(df["v"])))
1391
- .withColumn("f1_f2_f3", f1(f2(f3(df["v"]))))
1392
- .withColumn("f1_f3_f2", f1(f3(f2(df["v"]))))
1393
- .withColumn("f2_f1_f3", f2(f1(f3(df["v"]))))
1394
- .withColumn("f2_f3_f1", f2(f3(f1(df["v"]))))
1395
- .withColumn("f3_f1_f2", f3(f1(f2(df["v"]))))
1396
- .withColumn("f3_f2_f1", f3(f2(f1(df["v"]))))
1397
- )
1398
-
1399
- self.assertEqual(expected, df1.collect())
1400
-
1401
- # SPARK-24721
1402
- @unittest.skipIf(not test_compiled, test_not_compiled_message) # type: ignore
1403
- def test_datasource_with_udf(self):
1404
- # Same as SQLTests.test_datasource_with_udf, but with Pandas UDF
1405
- # This needs to a separate test because Arrow dependency is optional
1406
- import numpy as np
1407
-
1408
- path = tempfile.mkdtemp()
1409
- shutil.rmtree(path)
1410
-
1411
- try:
1412
- self.spark.range(1).write.mode("overwrite").format("csv").save(path)
1413
- filesource_df = self.spark.read.option("inferSchema", True).csv(path).toDF("i")
1414
- datasource_df = (
1415
- self.spark.read.format("org.apache.spark.sql.sources.SimpleScanSource")
1416
- .option("from", 0)
1417
- .option("to", 1)
1418
- .load()
1419
- .toDF("i")
1420
- )
1421
- datasource_v2_df = (
1422
- self.spark.read.format("org.apache.spark.sql.connector.SimpleDataSourceV2")
1423
- .load()
1424
- .toDF("i", "j")
1425
- )
1426
-
1427
- c1 = pandas_udf(lambda x: x + 1, "int")(lit(1))
1428
- c2 = pandas_udf(lambda x: x + 1, "int")(col("i"))
1429
-
1430
- f1 = pandas_udf(lambda x: pd.Series(np.repeat(False, len(x))), "boolean")(lit(1))
1431
- f2 = pandas_udf(lambda x: pd.Series(np.repeat(False, len(x))), "boolean")(col("i"))
1432
-
1433
- for df in [filesource_df, datasource_df, datasource_v2_df]:
1434
- result = df.withColumn("c", c1)
1435
- expected = df.withColumn("c", lit(2))
1436
- self.assertEqual(expected.collect(), result.collect())
1437
-
1438
- for df in [filesource_df, datasource_df, datasource_v2_df]:
1439
- result = df.withColumn("c", c2)
1440
- expected = df.withColumn("c", col("i") + 1)
1441
- self.assertEqual(expected.collect(), result.collect())
1442
-
1443
- for df in [filesource_df, datasource_df, datasource_v2_df]:
1444
- for f in [f1, f2]:
1445
- result = df.filter(f)
1446
- self.assertEqual(0, result.count())
1447
- finally:
1448
- shutil.rmtree(path)
1449
-
1450
- # SPARK-33277
1451
- def test_pandas_udf_with_column_vector(self):
1452
- path = tempfile.mkdtemp()
1453
- shutil.rmtree(path)
1454
-
1455
- try:
1456
- self.spark.range(0, 200000, 1, 1).write.parquet(path)
1457
-
1458
- @pandas_udf(LongType())
1459
- def udf(x):
1460
- return pd.Series([0] * len(x))
1461
-
1462
- for offheap in ["true", "false"]:
1463
- with self.sql_conf({"spark.sql.columnVector.offheap.enabled": offheap}):
1464
- self.assertEquals(
1465
- self.spark.read.parquet(path).select(udf("id")).head(), Row(0)
1466
- )
1467
- finally:
1468
- shutil.rmtree(path)
1469
-
1470
-
1471
- class ScalarPandasUDFTests(ScalarPandasUDFTestsMixin, ReusedSQLTestCase):
1472
- @classmethod
1473
- def setUpClass(cls):
1474
- ReusedSQLTestCase.setUpClass()
1475
-
1476
- # Synchronize default timezone between Python and Java
1477
- cls.tz_prev = os.environ.get("TZ", None) # save current tz if set
1478
- tz = "America/Los_Angeles"
1479
- os.environ["TZ"] = tz
1480
- time.tzset()
1481
-
1482
- cls.sc.environment["TZ"] = tz
1483
- cls.spark.conf.set("spark.sql.session.timeZone", tz)
1484
-
1485
- @classmethod
1486
- def tearDownClass(cls):
1487
- del os.environ["TZ"]
1488
- if cls.tz_prev is not None:
1489
- os.environ["TZ"] = cls.tz_prev
1490
- time.tzset()
1491
- ReusedSQLTestCase.tearDownClass()
1492
-
1493
-
1494
- if __name__ == "__main__":
1495
- from pyspark.sql.tests.pandas.test_pandas_udf_scalar import * # noqa: F401
1496
-
1497
- try:
1498
- import xmlrunner
1499
-
1500
- testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
1501
- except ImportError:
1502
- testRunner = None
1503
- unittest.main(testRunner=testRunner, verbosity=2)