snowpark-connect 0.24.0__py3-none-any.whl → 0.26.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of snowpark-connect might be problematic.

Files changed (484)
  1. snowflake/snowpark_connect/column_name_handler.py +116 -4
  2. snowflake/snowpark_connect/config.py +23 -0
  3. snowflake/snowpark_connect/constants.py +0 -29
  4. snowflake/snowpark_connect/dataframe_container.py +22 -0
  5. snowflake/snowpark_connect/execute_plan/map_execution_command.py +56 -1
  6. snowflake/snowpark_connect/expression/literal.py +13 -2
  7. snowflake/snowpark_connect/expression/map_cast.py +5 -8
  8. snowflake/snowpark_connect/expression/map_sql_expression.py +23 -1
  9. snowflake/snowpark_connect/expression/map_udf.py +88 -29
  10. snowflake/snowpark_connect/expression/map_unresolved_attribute.py +199 -15
  11. snowflake/snowpark_connect/expression/map_unresolved_extract_value.py +44 -16
  12. snowflake/snowpark_connect/expression/map_unresolved_function.py +840 -367
  13. snowflake/snowpark_connect/expression/map_unresolved_star.py +3 -2
  14. snowflake/snowpark_connect/hidden_column.py +39 -0
  15. snowflake/snowpark_connect/includes/jars/hadoop-client-api-trimmed-3.3.4.jar +0 -0
  16. snowflake/snowpark_connect/includes/jars/json4s-native_2.12-3.7.0-M11.jar +0 -0
  17. snowflake/snowpark_connect/includes/jars/paranamer-2.8.3.jar +0 -0
  18. snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.12-0.1.0.jar +0 -0
  19. snowflake/snowpark_connect/includes/jars/{hadoop-client-api-3.3.4.jar → spark-connect-client-jvm_2.12-3.5.6.jar} +0 -0
  20. snowflake/snowpark_connect/relation/map_column_ops.py +17 -4
  21. snowflake/snowpark_connect/relation/map_extension.py +52 -11
  22. snowflake/snowpark_connect/relation/map_join.py +258 -62
  23. snowflake/snowpark_connect/relation/map_map_partitions.py +9 -4
  24. snowflake/snowpark_connect/relation/map_relation.py +12 -1
  25. snowflake/snowpark_connect/relation/map_row_ops.py +8 -1
  26. snowflake/snowpark_connect/relation/map_sql.py +88 -11
  27. snowflake/snowpark_connect/relation/map_udtf.py +100 -46
  28. snowflake/snowpark_connect/relation/read/map_read.py +3 -3
  29. snowflake/snowpark_connect/relation/read/map_read_jdbc.py +1 -1
  30. snowflake/snowpark_connect/relation/read/map_read_json.py +8 -1
  31. snowflake/snowpark_connect/relation/read/map_read_table.py +1 -9
  32. snowflake/snowpark_connect/relation/read/reader_config.py +3 -1
  33. snowflake/snowpark_connect/relation/utils.py +44 -0
  34. snowflake/snowpark_connect/relation/write/map_write.py +175 -75
  35. snowflake/snowpark_connect/resources_initializer.py +47 -6
  36. snowflake/snowpark_connect/server.py +26 -4
  37. snowflake/snowpark_connect/type_mapping.py +29 -25
  38. snowflake/snowpark_connect/typed_column.py +14 -0
  39. snowflake/snowpark_connect/utils/artifacts.py +23 -0
  40. snowflake/snowpark_connect/utils/concurrent.py +4 -0
  41. snowflake/snowpark_connect/utils/context.py +6 -1
  42. snowflake/snowpark_connect/utils/external_udxf_cache.py +36 -0
  43. snowflake/snowpark_connect/utils/scala_udf_utils.py +596 -0
  44. snowflake/snowpark_connect/utils/session.py +4 -0
  45. snowflake/snowpark_connect/utils/telemetry.py +6 -17
  46. snowflake/snowpark_connect/utils/udf_helper.py +2 -0
  47. snowflake/snowpark_connect/utils/udf_utils.py +22 -1
  48. snowflake/snowpark_connect/utils/udtf_utils.py +1 -0
  49. snowflake/snowpark_connect/version.py +1 -1
  50. {snowpark_connect-0.24.0.dist-info → snowpark_connect-0.26.0.dist-info}/METADATA +1 -1
  51. snowpark_connect-0.26.0.dist-info/RECORD +481 -0
  52. snowflake/snowpark_connect/includes/jars/scala-compiler-2.12.18.jar +0 -0
  53. snowflake/snowpark_connect/includes/jars/spark-kubernetes_2.12-3.5.6.jar +0 -0
  54. snowflake/snowpark_connect/includes/jars/spark-mllib_2.12-3.5.6.jar +0 -0
  55. snowflake/snowpark_connect/includes/jars/spark-streaming_2.12-3.5.6.jar +0 -0
  56. snowflake/snowpark_connect/includes/python/pyspark/errors/tests/__init__.py +0 -16
  57. snowflake/snowpark_connect/includes/python/pyspark/errors/tests/test_errors.py +0 -60
  58. snowflake/snowpark_connect/includes/python/pyspark/ml/deepspeed/tests/test_deepspeed_distributor.py +0 -306
  59. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/__init__.py +0 -16
  60. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_classification.py +0 -53
  61. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_evaluation.py +0 -50
  62. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_feature.py +0 -43
  63. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_function.py +0 -114
  64. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_pipeline.py +0 -47
  65. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_summarizer.py +0 -43
  66. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_tuning.py +0 -46
  67. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_classification.py +0 -238
  68. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_evaluation.py +0 -194
  69. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_feature.py +0 -156
  70. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_pipeline.py +0 -184
  71. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_summarizer.py +0 -78
  72. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_tuning.py +0 -292
  73. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_parity_torch_data_loader.py +0 -50
  74. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_parity_torch_distributor.py +0 -152
  75. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_algorithms.py +0 -456
  76. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_base.py +0 -96
  77. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_dl_util.py +0 -186
  78. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_evaluation.py +0 -77
  79. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_feature.py +0 -401
  80. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_functions.py +0 -528
  81. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_image.py +0 -82
  82. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_linalg.py +0 -409
  83. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_model_cache.py +0 -55
  84. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_param.py +0 -441
  85. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_persistence.py +0 -546
  86. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_pipeline.py +0 -71
  87. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_stat.py +0 -52
  88. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_training_summary.py +0 -494
  89. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_util.py +0 -85
  90. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_wrapper.py +0 -138
  91. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/__init__.py +0 -16
  92. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_basic.py +0 -151
  93. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_nested.py +0 -97
  94. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_pipeline.py +0 -143
  95. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tuning.py +0 -551
  96. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_basic.py +0 -137
  97. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_nested.py +0 -96
  98. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_pipeline.py +0 -142
  99. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/__init__.py +0 -16
  100. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_data_loader.py +0 -137
  101. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_distributor.py +0 -561
  102. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_log_communication.py +0 -172
  103. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/__init__.py +0 -16
  104. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_algorithms.py +0 -353
  105. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_feature.py +0 -192
  106. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_linalg.py +0 -680
  107. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_stat.py +0 -206
  108. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_streaming_algorithms.py +0 -471
  109. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_util.py +0 -108
  110. snowflake/snowpark_connect/includes/python/pyspark/pandas/spark/__init__.py +0 -16
  111. snowflake/snowpark_connect/includes/python/pyspark/pandas/spark/accessors.py +0 -1281
  112. snowflake/snowpark_connect/includes/python/pyspark/pandas/spark/functions.py +0 -203
  113. snowflake/snowpark_connect/includes/python/pyspark/pandas/spark/utils.py +0 -202
  114. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/__init__.py +0 -16
  115. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/__init__.py +0 -16
  116. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_any_all.py +0 -177
  117. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_apply_func.py +0 -575
  118. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_binary_ops.py +0 -235
  119. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_combine.py +0 -653
  120. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_compute.py +0 -463
  121. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_corrwith.py +0 -86
  122. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_cov.py +0 -151
  123. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_cumulative.py +0 -139
  124. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_describe.py +0 -458
  125. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_eval.py +0 -86
  126. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_melt.py +0 -202
  127. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_missing_data.py +0 -520
  128. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_pivot.py +0 -361
  129. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/__init__.py +0 -16
  130. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/__init__.py +0 -16
  131. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_any_all.py +0 -40
  132. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_apply_func.py +0 -42
  133. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_binary_ops.py +0 -40
  134. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_combine.py +0 -37
  135. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_compute.py +0 -60
  136. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_corrwith.py +0 -40
  137. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_cov.py +0 -40
  138. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_cumulative.py +0 -90
  139. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_describe.py +0 -40
  140. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_eval.py +0 -40
  141. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_melt.py +0 -40
  142. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_missing_data.py +0 -42
  143. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_pivot.py +0 -37
  144. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/__init__.py +0 -16
  145. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_base.py +0 -36
  146. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_binary_ops.py +0 -42
  147. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_boolean_ops.py +0 -47
  148. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_categorical_ops.py +0 -55
  149. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_complex_ops.py +0 -40
  150. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_date_ops.py +0 -47
  151. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_datetime_ops.py +0 -47
  152. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_null_ops.py +0 -42
  153. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_arithmetic.py +0 -43
  154. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_ops.py +0 -47
  155. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_reverse.py +0 -43
  156. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_string_ops.py +0 -47
  157. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_timedelta_ops.py +0 -47
  158. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_udt_ops.py +0 -40
  159. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/testing_utils.py +0 -226
  160. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/__init__.py +0 -16
  161. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_align.py +0 -39
  162. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_basic_slow.py +0 -55
  163. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_cov_corrwith.py +0 -39
  164. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_dot_frame.py +0 -39
  165. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_dot_series.py +0 -39
  166. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_index.py +0 -39
  167. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_series.py +0 -39
  168. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_setitem_frame.py +0 -43
  169. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_setitem_series.py +0 -43
  170. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/__init__.py +0 -16
  171. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_attrs.py +0 -40
  172. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_constructor.py +0 -39
  173. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_conversion.py +0 -42
  174. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_reindexing.py +0 -42
  175. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_reshaping.py +0 -37
  176. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_spark.py +0 -40
  177. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_take.py +0 -42
  178. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_time_series.py +0 -48
  179. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_truncate.py +0 -40
  180. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/__init__.py +0 -16
  181. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_aggregate.py +0 -40
  182. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_apply_func.py +0 -41
  183. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_cumulative.py +0 -67
  184. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_describe.py +0 -40
  185. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_groupby.py +0 -55
  186. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_head_tail.py +0 -40
  187. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_index.py +0 -38
  188. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_missing_data.py +0 -55
  189. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py +0 -39
  190. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_stat.py +0 -38
  191. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/__init__.py +0 -16
  192. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_align.py +0 -40
  193. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_base.py +0 -50
  194. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_category.py +0 -73
  195. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_datetime.py +0 -39
  196. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_indexing.py +0 -40
  197. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_reindex.py +0 -40
  198. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_rename.py +0 -40
  199. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_reset_index.py +0 -48
  200. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_timedelta.py +0 -39
  201. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/io/__init__.py +0 -16
  202. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/io/test_parity_io.py +0 -40
  203. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/__init__.py +0 -16
  204. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot.py +0 -45
  205. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot_matplotlib.py +0 -45
  206. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot_plotly.py +0 -49
  207. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot.py +0 -37
  208. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot_matplotlib.py +0 -53
  209. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot_plotly.py +0 -45
  210. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/__init__.py +0 -16
  211. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_all_any.py +0 -38
  212. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_arg_ops.py +0 -37
  213. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_as_of.py +0 -37
  214. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_as_type.py +0 -38
  215. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_compute.py +0 -37
  216. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_conversion.py +0 -40
  217. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_cumulative.py +0 -40
  218. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_index.py +0 -38
  219. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_missing_data.py +0 -40
  220. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_series.py +0 -37
  221. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_sort.py +0 -38
  222. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_stat.py +0 -38
  223. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_categorical.py +0 -66
  224. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_config.py +0 -37
  225. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_csv.py +0 -37
  226. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_dataframe_conversion.py +0 -42
  227. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_dataframe_spark_io.py +0 -39
  228. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_default_index.py +0 -49
  229. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ewm.py +0 -37
  230. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_expanding.py +0 -39
  231. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_extension.py +0 -49
  232. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_frame_spark.py +0 -53
  233. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_generic_functions.py +0 -43
  234. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_indexing.py +0 -49
  235. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_indexops_spark.py +0 -39
  236. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_internal.py +0 -41
  237. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_namespace.py +0 -39
  238. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_numpy_compat.py +0 -60
  239. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames.py +0 -48
  240. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby.py +0 -39
  241. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_expanding.py +0 -44
  242. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_rolling.py +0 -84
  243. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_repr.py +0 -37
  244. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_resample.py +0 -45
  245. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_reshape.py +0 -39
  246. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_rolling.py +0 -39
  247. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_scalars.py +0 -37
  248. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_conversion.py +0 -39
  249. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_datetime.py +0 -39
  250. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_string.py +0 -39
  251. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_spark_functions.py +0 -39
  252. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_sql.py +0 -43
  253. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_stats.py +0 -37
  254. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_typedef.py +0 -36
  255. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_utils.py +0 -37
  256. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_window.py +0 -39
  257. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/__init__.py +0 -16
  258. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_base.py +0 -107
  259. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_binary_ops.py +0 -224
  260. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py +0 -825
  261. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_categorical_ops.py +0 -562
  262. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_complex_ops.py +0 -368
  263. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_date_ops.py +0 -257
  264. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_datetime_ops.py +0 -260
  265. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_null_ops.py +0 -178
  266. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_arithmetic.py +0 -184
  267. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py +0 -497
  268. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_reverse.py +0 -140
  269. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_string_ops.py +0 -354
  270. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_timedelta_ops.py +0 -219
  271. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_udt_ops.py +0 -192
  272. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/testing_utils.py +0 -228
  273. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/__init__.py +0 -16
  274. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_align.py +0 -118
  275. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_basic_slow.py +0 -198
  276. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_cov_corrwith.py +0 -181
  277. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_dot_frame.py +0 -103
  278. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_dot_series.py +0 -141
  279. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_index.py +0 -109
  280. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_series.py +0 -136
  281. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_setitem_frame.py +0 -125
  282. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_setitem_series.py +0 -217
  283. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/__init__.py +0 -16
  284. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_attrs.py +0 -384
  285. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_constructor.py +0 -598
  286. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_conversion.py +0 -73
  287. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_reindexing.py +0 -869
  288. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_reshaping.py +0 -487
  289. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_spark.py +0 -309
  290. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_take.py +0 -156
  291. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_time_series.py +0 -149
  292. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_truncate.py +0 -163
  293. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/__init__.py +0 -16
  294. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_aggregate.py +0 -311
  295. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_apply_func.py +0 -524
  296. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_cumulative.py +0 -419
  297. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_describe.py +0 -144
  298. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_groupby.py +0 -979
  299. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_head_tail.py +0 -234
  300. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_index.py +0 -206
  301. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_missing_data.py +0 -421
  302. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_split_apply.py +0 -187
  303. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_stat.py +0 -397
  304. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/__init__.py +0 -16
  305. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_align.py +0 -100
  306. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_base.py +0 -2743
  307. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_category.py +0 -484
  308. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_datetime.py +0 -276
  309. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_indexing.py +0 -432
  310. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_reindex.py +0 -310
  311. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_rename.py +0 -257
  312. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_reset_index.py +0 -160
  313. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_timedelta.py +0 -128
  314. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/io/__init__.py +0 -16
  315. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/io/test_io.py +0 -137
  316. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/__init__.py +0 -16
  317. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot.py +0 -170
  318. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot_matplotlib.py +0 -547
  319. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot_plotly.py +0 -285
  320. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot.py +0 -106
  321. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot_matplotlib.py +0 -409
  322. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot_plotly.py +0 -247
  323. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/__init__.py +0 -16
  324. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_all_any.py +0 -105
  325. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_arg_ops.py +0 -197
  326. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_as_of.py +0 -137
  327. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_as_type.py +0 -227
  328. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_compute.py +0 -634
  329. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_conversion.py +0 -88
  330. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_cumulative.py +0 -139
  331. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_index.py +0 -475
  332. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_missing_data.py +0 -265
  333. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_series.py +0 -818
  334. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_sort.py +0 -162
  335. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_stat.py +0 -780
  336. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_categorical.py +0 -741
  337. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_config.py +0 -160
  338. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_csv.py +0 -453
  339. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_dataframe_conversion.py +0 -281
  340. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_dataframe_spark_io.py +0 -487
  341. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_default_index.py +0 -109
  342. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ewm.py +0 -434
  343. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_expanding.py +0 -253
  344. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_extension.py +0 -152
  345. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_frame_spark.py +0 -162
  346. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_generic_functions.py +0 -234
  347. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_indexing.py +0 -1339
  348. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_indexops_spark.py +0 -82
  349. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_internal.py +0 -124
  350. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_namespace.py +0 -638
  351. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_numpy_compat.py +0 -200
  352. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames.py +0 -1355
  353. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py +0 -655
  354. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_expanding.py +0 -113
  355. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_rolling.py +0 -118
  356. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_repr.py +0 -192
  357. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_resample.py +0 -346
  358. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_reshape.py +0 -495
  359. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_rolling.py +0 -263
  360. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_scalars.py +0 -59
  361. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_conversion.py +0 -85
  362. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_datetime.py +0 -364
  363. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_string.py +0 -362
  364. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_spark_functions.py +0 -46
  365. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_sql.py +0 -123
  366. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_stats.py +0 -581
  367. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_typedef.py +0 -447
  368. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_utils.py +0 -301
  369. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_window.py +0 -465
  370. snowflake/snowpark_connect/includes/python/pyspark/resource/tests/__init__.py +0 -16
  371. snowflake/snowpark_connect/includes/python/pyspark/resource/tests/test_resources.py +0 -83
  372. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/__init__.py +0 -16
  373. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/__init__.py +0 -16
  374. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/__init__.py +0 -16
  375. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/test_artifact.py +0 -420
  376. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/test_client.py +0 -358
  377. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/__init__.py +0 -16
  378. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_foreach.py +0 -36
  379. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_foreach_batch.py +0 -44
  380. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_listener.py +0 -116
  381. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_streaming.py +0 -35
  382. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_basic.py +0 -3612
  383. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_column.py +0 -1042
  384. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_function.py +0 -2381
  385. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_plan.py +0 -1060
  386. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow.py +0 -163
  387. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow_map.py +0 -38
  388. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow_python_udf.py +0 -48
  389. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_catalog.py +0 -36
  390. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_column.py +0 -55
  391. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_conf.py +0 -36
  392. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_dataframe.py +0 -96
  393. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_datasources.py +0 -44
  394. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_errors.py +0 -36
  395. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_functions.py +0 -59
  396. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_group.py +0 -36
  397. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_cogrouped_map.py +0 -59
  398. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_grouped_map.py +0 -74
  399. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_grouped_map_with_state.py +0 -62
  400. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_map.py +0 -58
  401. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf.py +0 -70
  402. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_grouped_agg.py +0 -50
  403. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_scalar.py +0 -68
  404. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_window.py +0 -40
  405. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_readwriter.py +0 -46
  406. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_serde.py +0 -44
  407. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_types.py +0 -100
  408. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_udf.py +0 -100
  409. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_udtf.py +0 -163
  410. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_session.py +0 -181
  411. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_utils.py +0 -42
  412. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/__init__.py +0 -16
  413. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_cogrouped_map.py +0 -623
  414. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_grouped_map.py +0 -869
  415. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_grouped_map_with_state.py +0 -342
  416. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_map.py +0 -436
  417. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf.py +0 -363
  418. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_grouped_agg.py +0 -592
  419. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_scalar.py +0 -1503
  420. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_typehints.py +0 -392
  421. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_typehints_with_future_annotations.py +0 -375
  422. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_window.py +0 -411
  423. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/__init__.py +0 -16
  424. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming.py +0 -401
  425. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_foreach.py +0 -295
  426. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_foreach_batch.py +0 -106
  427. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_listener.py +0 -558
  428. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow.py +0 -1346
  429. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow_map.py +0 -182
  430. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow_python_udf.py +0 -202
  431. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_catalog.py +0 -503
  432. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_column.py +0 -225
  433. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_conf.py +0 -83
  434. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_context.py +0 -201
  435. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_dataframe.py +0 -1931
  436. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_datasources.py +0 -256
  437. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_errors.py +0 -69
  438. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_functions.py +0 -1349
  439. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_group.py +0 -53
  440. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_pandas_sqlmetrics.py +0 -68
  441. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_readwriter.py +0 -283
  442. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_serde.py +0 -155
  443. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_session.py +0 -412
  444. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_types.py +0 -1581
  445. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udf.py +0 -961
  446. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udf_profiler.py +0 -165
  447. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udtf.py +0 -1456
  448. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_utils.py +0 -1686
  449. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/__init__.py +0 -16
  450. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_context.py +0 -184
  451. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_dstream.py +0 -706
  452. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_kinesis.py +0 -118
  453. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_listener.py +0 -160
  454. snowflake/snowpark_connect/includes/python/pyspark/tests/__init__.py +0 -16
  455. snowflake/snowpark_connect/includes/python/pyspark/tests/test_appsubmit.py +0 -306
  456. snowflake/snowpark_connect/includes/python/pyspark/tests/test_broadcast.py +0 -196
  457. snowflake/snowpark_connect/includes/python/pyspark/tests/test_conf.py +0 -44
  458. snowflake/snowpark_connect/includes/python/pyspark/tests/test_context.py +0 -346
  459. snowflake/snowpark_connect/includes/python/pyspark/tests/test_daemon.py +0 -89
  460. snowflake/snowpark_connect/includes/python/pyspark/tests/test_install_spark.py +0 -124
  461. snowflake/snowpark_connect/includes/python/pyspark/tests/test_join.py +0 -69
  462. snowflake/snowpark_connect/includes/python/pyspark/tests/test_memory_profiler.py +0 -167
  463. snowflake/snowpark_connect/includes/python/pyspark/tests/test_pin_thread.py +0 -194
  464. snowflake/snowpark_connect/includes/python/pyspark/tests/test_profiler.py +0 -168
  465. snowflake/snowpark_connect/includes/python/pyspark/tests/test_rdd.py +0 -939
  466. snowflake/snowpark_connect/includes/python/pyspark/tests/test_rddbarrier.py +0 -52
  467. snowflake/snowpark_connect/includes/python/pyspark/tests/test_rddsampler.py +0 -66
  468. snowflake/snowpark_connect/includes/python/pyspark/tests/test_readwrite.py +0 -368
  469. snowflake/snowpark_connect/includes/python/pyspark/tests/test_serializers.py +0 -257
  470. snowflake/snowpark_connect/includes/python/pyspark/tests/test_shuffle.py +0 -267
  471. snowflake/snowpark_connect/includes/python/pyspark/tests/test_stage_sched.py +0 -153
  472. snowflake/snowpark_connect/includes/python/pyspark/tests/test_statcounter.py +0 -130
  473. snowflake/snowpark_connect/includes/python/pyspark/tests/test_taskcontext.py +0 -350
  474. snowflake/snowpark_connect/includes/python/pyspark/tests/test_util.py +0 -97
  475. snowflake/snowpark_connect/includes/python/pyspark/tests/test_worker.py +0 -271
  476. snowpark_connect-0.24.0.dist-info/RECORD +0 -898
  477. {snowpark_connect-0.24.0.data → snowpark_connect-0.26.0.data}/scripts/snowpark-connect +0 -0
  478. {snowpark_connect-0.24.0.data → snowpark_connect-0.26.0.data}/scripts/snowpark-session +0 -0
  479. {snowpark_connect-0.24.0.data → snowpark_connect-0.26.0.data}/scripts/snowpark-submit +0 -0
  480. {snowpark_connect-0.24.0.dist-info → snowpark_connect-0.26.0.dist-info}/WHEEL +0 -0
  481. {snowpark_connect-0.24.0.dist-info → snowpark_connect-0.26.0.dist-info}/licenses/LICENSE-binary +0 -0
  482. {snowpark_connect-0.24.0.dist-info → snowpark_connect-0.26.0.dist-info}/licenses/LICENSE.txt +0 -0
  483. {snowpark_connect-0.24.0.dist-info → snowpark_connect-0.26.0.dist-info}/licenses/NOTICE-binary +0 -0
  484. {snowpark_connect-0.24.0.dist-info → snowpark_connect-0.26.0.dist-info}/top_level.txt +0 -0
snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_dataframe.py (removed; excerpt)
@@ -1,1931 +0,0 @@
- #
- # Licensed to the Apache Software Foundation (ASF) under one or more
- # contributor license agreements. See the NOTICE file distributed with
- # this work for additional information regarding copyright ownership.
- # The ASF licenses this file to You under the Apache License, Version 2.0
- # (the "License"); you may not use this file except in compliance with
- # the License. You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- #
- import platform
- from decimal import Decimal
- import os
- import pydoc
- import shutil
- import tempfile
- import time
- import unittest
- from typing import cast
- import io
- from contextlib import redirect_stdout
-
- from pyspark import StorageLevel
- from pyspark.sql import SparkSession, Row
- from pyspark.sql.functions import col, lit, count, sum, mean, struct
- from pyspark.sql.pandas.utils import pyarrow_version_less_than_minimum
- from pyspark.sql.types import (
-     StringType,
-     IntegerType,
-     DoubleType,
-     LongType,
-     StructType,
-     StructField,
-     BooleanType,
-     DateType,
-     TimestampType,
-     TimestampNTZType,
-     FloatType,
-     DayTimeIntervalType,
- )
- from pyspark.storagelevel import StorageLevel
- from pyspark.errors import (
-     AnalysisException,
-     IllegalArgumentException,
-     PySparkTypeError,
-     PySparkValueError,
- )
- from pyspark.testing.sqlutils import (
-     ReusedSQLTestCase,
-     SQLTestUtils,
-     have_pyarrow,
-     have_pandas,
-     pandas_requirement_message,
-     pyarrow_requirement_message,
- )
- from pyspark.testing.utils import QuietTest
-
-
- class DataFrameTestsMixin:
-     def test_range(self):
-         self.assertEqual(self.spark.range(1, 1).count(), 0)
-         self.assertEqual(self.spark.range(1, 0, -1).count(), 1)
-         self.assertEqual(self.spark.range(0, 1 << 40, 1 << 39).count(), 2)
-         self.assertEqual(self.spark.range(-2).count(), 0)
-         self.assertEqual(self.spark.range(3).count(), 3)
-
-     def test_duplicated_column_names(self):
-         df = self.spark.createDataFrame([(1, 2)], ["c", "c"])
-         row = df.select("*").first()
-         self.assertEqual(1, row[0])
-         self.assertEqual(2, row[1])
-         self.assertEqual("Row(c=1, c=2)", str(row))
-         # Cannot access columns
-         self.assertRaises(AnalysisException, lambda: df.select(df[0]).first())
-         self.assertRaises(AnalysisException, lambda: df.select(df.c).first())
-         self.assertRaises(AnalysisException, lambda: df.select(df["c"]).first())
-
-     def test_freqItems(self):
-         vals = [Row(a=1, b=-2.0) if i % 2 == 0 else Row(a=i, b=i * 1.0) for i in range(100)]
-         df = self.spark.createDataFrame(vals)
-         items = df.stat.freqItems(("a", "b"), 0.4).collect()[0]
-         self.assertTrue(1 in items[0])
-         self.assertTrue(-2.0 in items[1])
-
-     def test_help_command(self):
-         # Regression test for SPARK-5464
-         rdd = self.sc.parallelize(['{"foo":"bar"}', '{"foo":"baz"}'])
-         df = self.spark.read.json(rdd)
-         # render_doc() reproduces the help() exception without printing output
-         pydoc.render_doc(df)
-         pydoc.render_doc(df.foo)
-         pydoc.render_doc(df.take(1))
-
-     def test_drop(self):
-         df = self.spark.createDataFrame([("A", 50, "Y"), ("B", 60, "Y")], ["name", "age", "active"])
-         self.assertEqual(df.drop("active").columns, ["name", "age"])
-         self.assertEqual(df.drop("active", "nonexistent_column").columns, ["name", "age"])
-         self.assertEqual(df.drop("name", "age", "active").columns, [])
-         self.assertEqual(df.drop(col("name")).columns, ["age", "active"])
-         self.assertEqual(df.drop(col("name"), col("age")).columns, ["active"])
-         self.assertEqual(df.drop(col("name"), col("age"), col("random")).columns, ["active"])
-
-     def test_drop_join(self):
-         left_df = self.spark.createDataFrame(
-             [(1, "a"), (2, "b"), (3, "c")],
-             ["join_key", "value1"],
-         )
-         right_df = self.spark.createDataFrame(
-             [(1, "aa"), (2, "bb"), (4, "dd")],
-             ["join_key", "value2"],
-         )
-         joined_df = left_df.join(
-             right_df,
-             on=left_df["join_key"] == right_df["join_key"],
-             how="left",
-         )
-
-         dropped_1 = joined_df.drop(left_df["join_key"])
-         self.assertEqual(dropped_1.columns, ["value1", "join_key", "value2"])
-         self.assertEqual(
-             dropped_1.sort("value1").collect(),
-             [
-                 Row(value1="a", join_key=1, value2="aa"),
-                 Row(value1="b", join_key=2, value2="bb"),
-                 Row(value1="c", join_key=None, value2=None),
-             ],
-         )
-
-         dropped_2 = joined_df.drop(right_df["join_key"])
-         self.assertEqual(dropped_2.columns, ["join_key", "value1", "value2"])
-         self.assertEqual(
-             dropped_2.sort("value1").collect(),
-             [
-                 Row(join_key=1, value1="a", value2="aa"),
-                 Row(join_key=2, value1="b", value2="bb"),
-                 Row(join_key=3, value1="c", value2=None),
-             ],
-         )
-
-     def test_with_columns_renamed(self):
-         df = self.spark.createDataFrame([("Alice", 50), ("Alice", 60)], ["name", "age"])
-
-         # rename both columns
-         renamed_df1 = df.withColumnsRenamed({"name": "naam", "age": "leeftijd"})
-         self.assertEqual(renamed_df1.columns, ["naam", "leeftijd"])
-
-         # rename one column with one missing name
-         renamed_df2 = df.withColumnsRenamed({"name": "naam", "address": "adres"})
-         self.assertEqual(renamed_df2.columns, ["naam", "age"])
-
-         # negative test for incorrect type
-         with self.assertRaises(PySparkTypeError) as pe:
-             df.withColumnsRenamed(("name", "x"))
-
-         self.check_error(
-             exception=pe.exception,
-             error_class="NOT_DICT",
-             message_parameters={"arg_name": "colsMap", "arg_type": "tuple"},
-         )
-
-     def test_drop_duplicates(self):
-         # SPARK-36034 test that drop duplicates throws a type error when in correct type provided
-         df = self.spark.createDataFrame([("Alice", 50), ("Alice", 60)], ["name", "age"])
-
-         # shouldn't drop a non-null row
-         self.assertEqual(df.dropDuplicates().count(), 2)
-
-         self.assertEqual(df.dropDuplicates(["name"]).count(), 1)
-
-         self.assertEqual(df.dropDuplicates(["name", "age"]).count(), 2)
-
-         with self.assertRaises(PySparkTypeError) as pe:
-             df.dropDuplicates("name")
-
-         self.check_error(
-             exception=pe.exception,
-             error_class="NOT_LIST_OR_TUPLE",
-             message_parameters={"arg_name": "subset", "arg_type": "str"},
-         )
-
-     def test_drop_duplicates_with_ambiguous_reference(self):
-         df1 = self.spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
-         df2 = self.spark.createDataFrame([Row(height=80, name="Tom"), Row(height=85, name="Bob")])
-         df3 = df1.join(df2, df1.name == df2.name, "inner")
-
-         self.assertEqual(df3.drop("name", "age").columns, ["height"])
-         self.assertEqual(df3.drop("name", df3.age, "unknown").columns, ["height"])
-         self.assertEqual(df3.drop("name", "age", df3.height).columns, [])
-
-     def test_drop_empty_column(self):
-         df = self.spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
-
-         self.assertEqual(df.drop().columns, ["age", "name"])
-         self.assertEqual(df.drop(*[]).columns, ["age", "name"])
-
-     def test_drop_column_name_with_dot(self):
-         df = (
-             self.spark.range(1, 3)
-             .withColumn("first.name", lit("Peter"))
-             .withColumn("city.name", lit("raleigh"))
-             .withColumn("state", lit("nc"))
-         )
-
-         self.assertEqual(df.drop("first.name").columns, ["id", "city.name", "state"])
-         self.assertEqual(df.drop("city.name").columns, ["id", "first.name", "state"])
-         self.assertEqual(df.drop("first.name", "city.name").columns, ["id", "state"])
-         self.assertEqual(
-             df.drop("first.name", "city.name", "unknown.unknown").columns, ["id", "state"]
-         )
-         self.assertEqual(
-             df.drop("unknown.unknown").columns, ["id", "first.name", "city.name", "state"]
-         )
-
-     def test_dropna(self):
-         schema = StructType(
-             [
-                 StructField("name", StringType(), True),
-                 StructField("age", IntegerType(), True),
-                 StructField("height", DoubleType(), True),
-             ]
-         )
-
-         # shouldn't drop a non-null row
-         self.assertEqual(
-             self.spark.createDataFrame([("Alice", 50, 80.1)], schema).dropna().count(), 1
-         )
-
-         # dropping rows with a single null value
-         self.assertEqual(
-             self.spark.createDataFrame([("Alice", None, 80.1)], schema).dropna().count(), 0
-         )
-         self.assertEqual(
-             self.spark.createDataFrame([("Alice", None, 80.1)], schema).dropna(how="any").count(), 0
-         )
-
-         # if how = 'all', only drop rows if all values are null
-         self.assertEqual(
-             self.spark.createDataFrame([("Alice", None, 80.1)], schema).dropna(how="all").count(), 1
-         )
-         self.assertEqual(
-             self.spark.createDataFrame([(None, None, None)], schema).dropna(how="all").count(), 0
-         )
-
-         # how and subset
-         self.assertEqual(
-             self.spark.createDataFrame([("Alice", 50, None)], schema)
-             .dropna(how="any", subset=["name", "age"])
-             .count(),
-             1,
-         )
-         self.assertEqual(
-             self.spark.createDataFrame([("Alice", None, None)], schema)
-             .dropna(how="any", subset=["name", "age"])
-             .count(),
-             0,
-         )
-
-         # threshold
-         self.assertEqual(
-             self.spark.createDataFrame([("Alice", None, 80.1)], schema).dropna(thresh=2).count(), 1
-         )
-         self.assertEqual(
-             self.spark.createDataFrame([("Alice", None, None)], schema).dropna(thresh=2).count(), 0
-         )
-
-         # threshold and subset
-         self.assertEqual(
-             self.spark.createDataFrame([("Alice", 50, None)], schema)
-             .dropna(thresh=2, subset=["name", "age"])
-             .count(),
-             1,
-         )
-         self.assertEqual(
-             self.spark.createDataFrame([("Alice", None, 180.9)], schema)
-             .dropna(thresh=2, subset=["name", "age"])
-             .count(),
-             0,
-         )
-
-         # thresh should take precedence over how
287
- self.assertEqual(
288
- self.spark.createDataFrame([("Alice", 50, None)], schema)
289
- .dropna(how="any", thresh=2, subset=["name", "age"])
290
- .count(),
291
- 1,
292
- )
293
-
294
- with self.assertRaises(PySparkTypeError) as pe:
295
- self.spark.createDataFrame([("Alice", 50, None)], schema).dropna(subset=10)
296
-
297
- self.check_error(
298
- exception=pe.exception,
299
- error_class="NOT_LIST_OR_STR_OR_TUPLE",
300
- message_parameters={"arg_name": "subset", "arg_type": "int"},
301
- )
302
-
303
- def test_fillna(self):
304
- schema = StructType(
305
- [
306
- StructField("name", StringType(), True),
307
- StructField("age", IntegerType(), True),
308
- StructField("height", DoubleType(), True),
309
- StructField("spy", BooleanType(), True),
310
- ]
311
- )
312
-
313
- # fillna shouldn't change non-null values
314
- row = self.spark.createDataFrame([("Alice", 10, 80.1, True)], schema).fillna(50).first()
315
- self.assertEqual(row.age, 10)
316
-
317
- # fillna with int
318
- row = self.spark.createDataFrame([("Alice", None, None, None)], schema).fillna(50).first()
319
- self.assertEqual(row.age, 50)
320
- self.assertEqual(row.height, 50.0)
321
-
322
- # fillna with double
323
- row = self.spark.createDataFrame([("Alice", None, None, None)], schema).fillna(50.1).first()
324
- self.assertEqual(row.age, 50)
325
- self.assertEqual(row.height, 50.1)
326
-
327
- # fillna with bool
328
- row = self.spark.createDataFrame([("Alice", None, None, None)], schema).fillna(True).first()
329
- self.assertEqual(row.age, None)
330
- self.assertEqual(row.spy, True)
331
-
332
- # fillna with string
333
- row = self.spark.createDataFrame([(None, None, None, None)], schema).fillna("hello").first()
334
- self.assertEqual(row.name, "hello")
335
- self.assertEqual(row.age, None)
336
-
337
- # fillna with subset specified for numeric cols
338
- row = (
339
- self.spark.createDataFrame([(None, None, None, None)], schema)
340
- .fillna(50, subset=["name", "age"])
341
- .first()
342
- )
343
- self.assertEqual(row.name, None)
344
- self.assertEqual(row.age, 50)
345
- self.assertEqual(row.height, None)
346
- self.assertEqual(row.spy, None)
347
-
348
- # fillna with subset specified for string cols
349
- row = (
350
- self.spark.createDataFrame([(None, None, None, None)], schema)
351
- .fillna("haha", subset=["name", "age"])
352
- .first()
353
- )
354
- self.assertEqual(row.name, "haha")
355
- self.assertEqual(row.age, None)
356
- self.assertEqual(row.height, None)
357
- self.assertEqual(row.spy, None)
358
-
359
- # fillna with subset specified for bool cols
360
- row = (
361
- self.spark.createDataFrame([(None, None, None, None)], schema)
362
- .fillna(True, subset=["name", "spy"])
363
- .first()
364
- )
365
- self.assertEqual(row.name, None)
366
- self.assertEqual(row.age, None)
367
- self.assertEqual(row.height, None)
368
- self.assertEqual(row.spy, True)
369
-
370
- # fillna with dictionary for boolean types
371
- row = self.spark.createDataFrame([Row(a=None), Row(a=True)]).fillna({"a": True}).first()
372
- self.assertEqual(row.a, True)
373
-
374
- with self.assertRaises(PySparkTypeError) as pe:
375
- self.spark.createDataFrame([Row(a=None), Row(a=True)]).fillna(["a", True])
376
-
377
- self.check_error(
378
- exception=pe.exception,
379
- error_class="NOT_BOOL_OR_DICT_OR_FLOAT_OR_INT_OR_STR",
380
- message_parameters={"arg_name": "value", "arg_type": "list"},
381
- )
382
-
383
- with self.assertRaises(PySparkTypeError) as pe:
384
- self.spark.createDataFrame([Row(a=None), Row(a=True)]).fillna(50, subset=10)
385
-
386
- self.check_error(
387
- exception=pe.exception,
388
- error_class="NOT_LIST_OR_TUPLE",
389
- message_parameters={"arg_name": "subset", "arg_type": "int"},
390
- )
391
-
392
- def test_repartitionByRange_dataframe(self):
393
- schema = StructType(
394
- [
395
- StructField("name", StringType(), True),
396
- StructField("age", IntegerType(), True),
397
- StructField("height", DoubleType(), True),
398
- ]
399
- )
400
-
401
- df1 = self.spark.createDataFrame(
402
- [("Bob", 27, 66.0), ("Alice", 10, 10.0), ("Bob", 10, 66.0)], schema
403
- )
404
- df2 = self.spark.createDataFrame(
405
- [("Alice", 10, 10.0), ("Bob", 10, 66.0), ("Bob", 27, 66.0)], schema
406
- )
407
-
408
- # test repartitionByRange(numPartitions, *cols)
409
- df3 = df1.repartitionByRange(2, "name", "age")
410
- self.assertEqual(df3.rdd.getNumPartitions(), 2)
411
- self.assertEqual(df3.rdd.first(), df2.rdd.first())
412
- self.assertEqual(df3.rdd.take(3), df2.rdd.take(3))
413
-
414
- # test repartitionByRange(numPartitions, *cols)
415
- df4 = df1.repartitionByRange(3, "name", "age")
416
- self.assertEqual(df4.rdd.getNumPartitions(), 3)
417
- self.assertEqual(df4.rdd.first(), df2.rdd.first())
418
- self.assertEqual(df4.rdd.take(3), df2.rdd.take(3))
419
-
420
- # test repartitionByRange(numPartitions, *cols) with more partitions than rows
421
- df5 = df1.repartitionByRange(5, "name", "age")
422
- self.assertEqual(df5.rdd.first(), df2.rdd.first())
423
- self.assertEqual(df5.rdd.take(3), df2.rdd.take(3))
424
-
425
- with self.assertRaises(PySparkTypeError) as pe:
426
- df1.repartitionByRange([10], "name", "age")
427
-
428
- self.check_error(
429
- exception=pe.exception,
430
- error_class="NOT_COLUMN_OR_INT_OR_STR",
431
- message_parameters={"arg_name": "numPartitions", "arg_type": "list"},
432
- )
433
-
434
- def test_replace(self):
435
- schema = StructType(
436
- [
437
- StructField("name", StringType(), True),
438
- StructField("age", IntegerType(), True),
439
- StructField("height", DoubleType(), True),
440
- ]
441
- )
442
-
443
- # replace with int
444
- row = self.spark.createDataFrame([("Alice", 10, 10.0)], schema).replace(10, 20).first()
445
- self.assertEqual(row.age, 20)
446
- self.assertEqual(row.height, 20.0)
447
-
448
- # replace with double
449
- row = self.spark.createDataFrame([("Alice", 80, 80.0)], schema).replace(80.0, 82.1).first()
450
- self.assertEqual(row.age, 82)
451
- self.assertEqual(row.height, 82.1)
452
-
453
- # replace with string
454
- row = (
455
- self.spark.createDataFrame([("Alice", 10, 80.1)], schema)
456
- .replace("Alice", "Ann")
457
- .first()
458
- )
459
- self.assertEqual(row.name, "Ann")
460
- self.assertEqual(row.age, 10)
461
-
462
- # replace with subset specified by a string of a column name w/ actual change
463
- row = (
464
- self.spark.createDataFrame([("Alice", 10, 80.1)], schema)
465
- .replace(10, 20, subset="age")
466
- .first()
467
- )
468
- self.assertEqual(row.age, 20)
469
-
470
- # replace with subset specified by a string of a column name w/o actual change
471
- row = (
472
- self.spark.createDataFrame([("Alice", 10, 80.1)], schema)
473
- .replace(10, 20, subset="height")
474
- .first()
475
- )
476
- self.assertEqual(row.age, 10)
477
-
478
- # replace with subset specified with one column replaced, another column not in subset
479
- # stays unchanged.
480
- row = (
481
- self.spark.createDataFrame([("Alice", 10, 10.0)], schema)
482
- .replace(10, 20, subset=["name", "age"])
483
- .first()
484
- )
485
- self.assertEqual(row.name, "Alice")
486
- self.assertEqual(row.age, 20)
487
- self.assertEqual(row.height, 10.0)
488
-
489
- # replace with subset specified but no column will be replaced
490
- row = (
491
- self.spark.createDataFrame([("Alice", 10, None)], schema)
492
- .replace(10, 20, subset=["name", "height"])
493
- .first()
494
- )
495
- self.assertEqual(row.name, "Alice")
496
- self.assertEqual(row.age, 10)
497
- self.assertEqual(row.height, None)
498
-
499
- # replace with lists
500
- row = (
501
- self.spark.createDataFrame([("Alice", 10, 80.1)], schema)
502
- .replace(["Alice"], ["Ann"])
503
- .first()
504
- )
505
- self.assertTupleEqual(row, ("Ann", 10, 80.1))
506
-
507
- # replace with dict
508
- row = self.spark.createDataFrame([("Alice", 10, 80.1)], schema).replace({10: 11}).first()
509
- self.assertTupleEqual(row, ("Alice", 11, 80.1))
510
-
511
- # test backward compatibility with dummy value
512
- dummy_value = 1
513
- row = (
514
- self.spark.createDataFrame([("Alice", 10, 80.1)], schema)
515
- .replace({"Alice": "Bob"}, dummy_value)
516
- .first()
517
- )
518
- self.assertTupleEqual(row, ("Bob", 10, 80.1))
519
-
520
- # test dict with mixed numerics
521
- row = (
522
- self.spark.createDataFrame([("Alice", 10, 80.1)], schema)
523
- .replace({10: -10, 80.1: 90.5})
524
- .first()
525
- )
526
- self.assertTupleEqual(row, ("Alice", -10, 90.5))
527
-
528
- # replace with tuples
529
- row = (
530
- self.spark.createDataFrame([("Alice", 10, 80.1)], schema)
531
- .replace(("Alice",), ("Bob",))
532
- .first()
533
- )
534
- self.assertTupleEqual(row, ("Bob", 10, 80.1))
535
-
536
- # replace multiple columns
537
- row = (
538
- self.spark.createDataFrame([("Alice", 10, 80.0)], schema)
539
- .replace((10, 80.0), (20, 90))
540
- .first()
541
- )
542
- self.assertTupleEqual(row, ("Alice", 20, 90.0))
543
-
544
- # test for mixed numerics
545
- row = (
546
- self.spark.createDataFrame([("Alice", 10, 80.0)], schema)
547
- .replace((10, 80), (20, 90.5))
548
- .first()
549
- )
550
- self.assertTupleEqual(row, ("Alice", 20, 90.5))
551
-
552
- row = (
553
- self.spark.createDataFrame([("Alice", 10, 80.0)], schema)
554
- .replace({10: 20, 80: 90.5})
555
- .first()
556
- )
557
- self.assertTupleEqual(row, ("Alice", 20, 90.5))
558
-
559
- # replace with boolean
560
- row = (
561
- self.spark.createDataFrame([("Alice", 10, 80.0)], schema)
562
- .selectExpr("name = 'Bob'", "age <= 15")
563
- .replace(False, True)
564
- .first()
565
- )
566
- self.assertTupleEqual(row, (True, True))
567
-
568
- # replace string with None and then drop None rows
569
- row = (
570
- self.spark.createDataFrame([("Alice", 10, 80.0)], schema)
571
- .replace("Alice", None)
572
- .dropna()
573
- )
574
- self.assertEqual(row.count(), 0)
575
-
576
- # replace with number and None
577
- row = (
578
- self.spark.createDataFrame([("Alice", 10, 80.0)], schema)
579
- .replace([10, 80], [20, None])
580
- .first()
581
- )
582
- self.assertTupleEqual(row, ("Alice", 20, None))
583
-
584
- # should fail if subset is not list, tuple or None
585
- with self.assertRaises(TypeError):
586
- self.spark.createDataFrame([("Alice", 10, 80.1)], schema).replace(
587
- {10: 11}, subset=1
588
- ).first()
589
-
590
- # should fail if to_replace and value have different lengths
591
- with self.assertRaises(ValueError):
592
- self.spark.createDataFrame([("Alice", 10, 80.1)], schema).replace(
593
- ["Alice", "Bob"], ["Eve"]
594
- ).first()
595
-
596
- # should fail when an unexpected type is received
597
- with self.assertRaises(TypeError):
598
- from datetime import datetime
599
-
600
- self.spark.createDataFrame([("Alice", 10, 80.1)], schema).replace(
601
- datetime.now(), datetime.now()
602
- ).first()
603
-
604
- # should fail if mixed-type replacements are provided
605
- with self.assertRaises(ValueError):
606
- self.spark.createDataFrame([("Alice", 10, 80.1)], schema).replace(
607
- ["Alice", 10], ["Eve", 20]
608
- ).first()
609
-
610
- with self.assertRaises(ValueError):
611
- self.spark.createDataFrame([("Alice", 10, 80.1)], schema).replace(
612
- {"Alice": "Bob", 10: 20}
613
- ).first()
614
-
615
- with self.assertRaises(PySparkTypeError) as pe:
616
- self.spark.createDataFrame([("Alice", 10, 80.0)], schema).replace(["Alice", "Bob"])
617
-
618
- self.check_error(
619
- exception=pe.exception,
620
- error_class="ARGUMENT_REQUIRED",
621
- message_parameters={"arg_name": "value", "condition": "`to_replace` is dict"},
622
- )
623
-
624
- with self.assertRaises(PySparkTypeError) as pe:
625
- self.spark.createDataFrame([("Alice", 10, 80.0)], schema).replace(lambda x: x + 1, 10)
626
-
627
- self.check_error(
628
- exception=pe.exception,
629
- error_class="NOT_BOOL_OR_DICT_OR_FLOAT_OR_INT_OR_LIST_OR_STR_OR_TUPLE",
630
- message_parameters={"arg_name": "to_replace", "arg_type": "function"},
631
- )
632
-
633
- def test_with_column_with_existing_name(self):
634
- keys = self.df.withColumn("key", self.df.key).select("key").collect()
635
- self.assertEqual([r.key for r in keys], list(range(100)))
636
-
637
- # regression test for SPARK-10417
638
- def test_column_iterator(self):
639
- def foo():
640
- for x in self.df.key:
641
- break
642
-
643
- self.assertRaises(TypeError, foo)
644
-
645
- def test_with_columns(self):
646
- # With single column
647
- keys = self.df.withColumns({"key": self.df.key}).select("key").collect()
648
- self.assertEqual([r.key for r in keys], list(range(100)))
649
-
650
- # With key and value columns
651
- kvs = (
652
- self.df.withColumns({"key": self.df.key, "value": self.df.value})
653
- .select("key", "value")
654
- .collect()
655
- )
656
- self.assertEqual([(r.key, r.value) for r in kvs], [(i, str(i)) for i in range(100)])
657
-
658
- # Columns rename
659
- kvs = (
660
- self.df.withColumns({"key_alias": self.df.key, "value_alias": self.df.value})
661
- .select("key_alias", "value_alias")
662
- .collect()
663
- )
664
- self.assertEqual(
665
- [(r.key_alias, r.value_alias) for r in kvs], [(i, str(i)) for i in range(100)]
666
- )
667
-
668
- # Type check
669
- self.assertRaises(TypeError, self.df.withColumns, ["key"])
670
- self.assertRaises(Exception, self.df.withColumns)
671
-
672
- def test_generic_hints(self):
673
- df1 = self.spark.range(10e10).toDF("id")
674
- df2 = self.spark.range(10e10).toDF("id")
675
-
676
- self.assertIsInstance(df1.hint("broadcast"), type(df1))
677
-
678
- # Dummy rules
679
- self.assertIsInstance(df1.hint("broadcast", "foo", "bar"), type(df1))
680
-
681
- with io.StringIO() as buf, redirect_stdout(buf):
682
- df1.join(df2.hint("broadcast"), "id").explain(True)
683
- self.assertEqual(1, buf.getvalue().count("BroadcastHashJoin"))
684
-
685
- # add tests for SPARK-23647 (test more types for hint)
686
- def test_extended_hint_types(self):
687
- df = self.spark.range(10e10).toDF("id")
688
- such_a_nice_list = ["itworks1", "itworks2", "itworks3"]
689
- hinted_df = df.hint("my awesome hint", 1.2345, "what", such_a_nice_list)
690
-
691
- self.assertIsInstance(df.hint("broadcast", []), type(df))
692
- self.assertIsInstance(df.hint("broadcast", ["foo", "bar"]), type(df))
693
-
694
- with io.StringIO() as buf, redirect_stdout(buf):
695
- hinted_df.explain(True)
696
- explain_output = buf.getvalue()
697
- self.assertGreaterEqual(explain_output.count("1.2345"), 1)
698
- self.assertGreaterEqual(explain_output.count("what"), 1)
699
- self.assertGreaterEqual(explain_output.count("itworks"), 1)
700
-
701
- def test_unpivot(self):
702
- # SPARK-39877: test the DataFrame.unpivot method
703
- df = self.spark.createDataFrame(
704
- [
705
- (1, 10, 1.0, "one"),
706
- (2, 20, 2.0, "two"),
707
- (3, 30, 3.0, "three"),
708
- ],
709
- ["id", "int", "double", "str"],
710
- )
711
-
712
- with self.subTest(desc="with none identifier"):
713
- with self.assertRaisesRegex(AssertionError, "ids must not be None"):
714
- df.unpivot(None, ["int", "double"], "var", "val")
715
-
716
- with self.subTest(desc="with no identifier"):
717
- for id in [[], ()]:
718
- with self.subTest(ids=id):
719
- actual = df.unpivot(id, ["int", "double"], "var", "val")
720
- self.assertEqual(actual.schema.simpleString(), "struct<var:string,val:double>")
721
- self.assertEqual(
722
- actual.collect(),
723
- [
724
- Row(var="int", value=10.0),
725
- Row(var="double", value=1.0),
726
- Row(var="int", value=20.0),
727
- Row(var="double", value=2.0),
728
- Row(var="int", value=30.0),
729
- Row(var="double", value=3.0),
730
- ],
731
- )
732
-
733
- with self.subTest(desc="with single identifier column"):
734
- for id in ["id", ["id"], ("id",)]:
735
- with self.subTest(ids=id):
736
- actual = df.unpivot(id, ["int", "double"], "var", "val")
737
- self.assertEqual(
738
- actual.schema.simpleString(),
739
- "struct<id:bigint,var:string,val:double>",
740
- )
741
- self.assertEqual(
742
- actual.collect(),
743
- [
744
- Row(id=1, var="int", value=10.0),
745
- Row(id=1, var="double", value=1.0),
746
- Row(id=2, var="int", value=20.0),
747
- Row(id=2, var="double", value=2.0),
748
- Row(id=3, var="int", value=30.0),
749
- Row(id=3, var="double", value=3.0),
750
- ],
751
- )
752
-
753
- with self.subTest(desc="with multiple identifier columns"):
754
- for ids in [["id", "double"], ("id", "double")]:
755
- with self.subTest(ids=ids):
756
- actual = df.unpivot(ids, ["int", "double"], "var", "val")
757
- self.assertEqual(
758
- actual.schema.simpleString(),
759
- "struct<id:bigint,double:double,var:string,val:double>",
760
- )
761
- self.assertEqual(
762
- actual.collect(),
763
- [
764
- Row(id=1, double=1.0, var="int", value=10.0),
765
- Row(id=1, double=1.0, var="double", value=1.0),
766
- Row(id=2, double=2.0, var="int", value=20.0),
767
- Row(id=2, double=2.0, var="double", value=2.0),
768
- Row(id=3, double=3.0, var="int", value=30.0),
769
- Row(id=3, double=3.0, var="double", value=3.0),
770
- ],
771
- )
772
-
773
- with self.subTest(desc="with no identifier columns but none value columns"):
774
- # select only columns that have common data type (double)
775
- actual = df.select("id", "int", "double").unpivot([], None, "var", "val")
776
- self.assertEqual(actual.schema.simpleString(), "struct<var:string,val:double>")
777
- self.assertEqual(
778
- actual.collect(),
779
- [
780
- Row(var="id", value=1.0),
781
- Row(var="int", value=10.0),
782
- Row(var="double", value=1.0),
783
- Row(var="id", value=2.0),
784
- Row(var="int", value=20.0),
785
- Row(var="double", value=2.0),
786
- Row(var="id", value=3.0),
787
- Row(var="int", value=30.0),
788
- Row(var="double", value=3.0),
789
- ],
790
- )
791
-
792
- with self.subTest(desc="with single identifier columns but none value columns"):
793
- for ids in ["id", ["id"], ("id",)]:
794
- with self.subTest(ids=ids):
795
- # select only columns that have common data type (double)
796
- actual = df.select("id", "int", "double").unpivot(ids, None, "var", "val")
797
- self.assertEqual(
798
- actual.schema.simpleString(), "struct<id:bigint,var:string,val:double>"
799
- )
800
- self.assertEqual(
801
- actual.collect(),
802
- [
803
- Row(id=1, var="int", value=10.0),
804
- Row(id=1, var="double", value=1.0),
805
- Row(id=2, var="int", value=20.0),
806
- Row(id=2, var="double", value=2.0),
807
- Row(id=3, var="int", value=30.0),
808
- Row(id=3, var="double", value=3.0),
809
- ],
810
- )
811
-
812
- with self.subTest(desc="with multiple identifier columns but none given value columns"):
813
- for ids in [["id", "str"], ("id", "str")]:
814
- with self.subTest(ids=ids):
815
- actual = df.unpivot(ids, None, "var", "val")
816
- self.assertEqual(
817
- actual.schema.simpleString(),
818
- "struct<id:bigint,str:string,var:string,val:double>",
819
- )
820
- self.assertEqual(
821
- actual.collect(),
822
- [
823
- Row(id=1, str="one", var="int", val=10.0),
824
- Row(id=1, str="one", var="double", val=1.0),
825
- Row(id=2, str="two", var="int", val=20.0),
826
- Row(id=2, str="two", var="double", val=2.0),
827
- Row(id=3, str="three", var="int", val=30.0),
828
- Row(id=3, str="three", var="double", val=3.0),
829
- ],
830
- )
831
-
832
- with self.subTest(desc="with single value column"):
833
- for values in ["int", ["int"], ("int",)]:
834
- with self.subTest(values=values):
835
- actual = df.unpivot("id", values, "var", "val")
836
- self.assertEqual(
837
- actual.schema.simpleString(), "struct<id:bigint,var:string,val:bigint>"
838
- )
839
- self.assertEqual(
840
- actual.collect(),
841
- [
842
- Row(id=1, var="int", val=10),
843
- Row(id=2, var="int", val=20),
844
- Row(id=3, var="int", val=30),
845
- ],
846
- )
847
-
848
- with self.subTest(desc="with multiple value columns"):
849
- for values in [["int", "double"], ("int", "double")]:
850
- with self.subTest(values=values):
851
- actual = df.unpivot("id", values, "var", "val")
852
- self.assertEqual(
853
- actual.schema.simpleString(), "struct<id:bigint,var:string,val:double>"
854
- )
855
- self.assertEqual(
856
- actual.collect(),
857
- [
858
- Row(id=1, var="int", val=10.0),
859
- Row(id=1, var="double", val=1.0),
860
- Row(id=2, var="int", val=20.0),
861
- Row(id=2, var="double", val=2.0),
862
- Row(id=3, var="int", val=30.0),
863
- Row(id=3, var="double", val=3.0),
864
- ],
865
- )
866
-
867
- with self.subTest(desc="with columns"):
868
- for id in [df.id, [df.id], (df.id,)]:
869
- for values in [[df.int, df.double], (df.int, df.double)]:
870
- with self.subTest(ids=id, values=values):
871
- self.assertEqual(
872
- df.unpivot(id, values, "var", "val").collect(),
873
- df.unpivot("id", ["int", "double"], "var", "val").collect(),
874
- )
875
-
876
- with self.subTest(desc="with column names and columns"):
877
- for ids in [[df.id, "str"], (df.id, "str")]:
878
- for values in [[df.int, "double"], (df.int, "double")]:
879
- with self.subTest(ids=ids, values=values):
880
- self.assertEqual(
881
- df.unpivot(ids, values, "var", "val").collect(),
882
- df.unpivot(["id", "str"], ["int", "double"], "var", "val").collect(),
883
- )
884
-
885
- with self.subTest(desc="melt alias"):
886
- self.assertEqual(
887
- df.unpivot("id", ["int", "double"], "var", "val").collect(),
888
- df.melt("id", ["int", "double"], "var", "val").collect(),
889
- )
890
-
891
- def test_unpivot_negative(self):
892
- # SPARK-39877: test the DataFrame.unpivot method
893
- df = self.spark.createDataFrame(
894
- [
895
- (1, 10, 1.0, "one"),
896
- (2, 20, 2.0, "two"),
897
- (3, 30, 3.0, "three"),
898
- ],
899
- ["id", "int", "double", "str"],
900
- )
901
-
902
- with self.subTest(desc="with no value columns"):
903
- for values in [[], ()]:
904
- with self.subTest(values=values):
905
- with self.assertRaisesRegex(
906
- AnalysisException,
907
- r"\[UNPIVOT_REQUIRES_VALUE_COLUMNS] At least one value column "
908
- r"needs to be specified for UNPIVOT, all columns specified as ids.*",
909
- ):
910
- df.unpivot("id", values, "var", "val").collect()
911
-
912
- with self.subTest(desc="with value columns without common data type"):
913
- with self.assertRaisesRegex(
914
- AnalysisException,
915
- r"\[UNPIVOT_VALUE_DATA_TYPE_MISMATCH\] Unpivot value columns must share "
916
- r"a least common type, some types do not: .*",
917
- ):
918
- df.unpivot("id", ["int", "str"], "var", "val").collect()
919
-
920
- def test_observe(self):
921
- # SPARK-36263: tests the DataFrame.observe(Observation, *Column) method
922
- from pyspark.sql import Observation
923
-
924
- df = self.spark.createDataFrame(
925
- [
926
- (1, 1.0, "one"),
927
- (2, 2.0, "two"),
928
- (3, 3.0, "three"),
929
- ],
930
- ["id", "val", "label"],
931
- )
932
-
933
- unnamed_observation = Observation()
934
- named_observation = Observation("metric")
935
- observed = (
936
- df.orderBy("id")
937
- .observe(
938
- named_observation,
939
- count(lit(1)).alias("cnt"),
940
- sum(col("id")).alias("sum"),
941
- mean(col("val")).alias("mean"),
942
- )
943
- .observe(unnamed_observation, count(lit(1)).alias("rows"))
944
- )
945
-
946
- # test that observe works transparently
947
- actual = observed.collect()
948
- self.assertEqual(
949
- [
950
- {"id": 1, "val": 1.0, "label": "one"},
951
- {"id": 2, "val": 2.0, "label": "two"},
952
- {"id": 3, "val": 3.0, "label": "three"},
953
- ],
954
- [row.asDict() for row in actual],
955
- )
956
-
957
- # test that we retrieve the metrics
958
- self.assertEqual(named_observation.get, dict(cnt=3, sum=6, mean=2.0))
959
- self.assertEqual(unnamed_observation.get, dict(rows=3))
960
-
961
- # observation requires the name (if given) to be a non-empty string
962
- with self.assertRaisesRegex(TypeError, "name should be a string"):
963
- Observation(123)
964
- with self.assertRaisesRegex(ValueError, "name should not be empty"):
965
- Observation("")
966
-
967
- # dataframe.observe requires at least one expr
968
- with self.assertRaises(PySparkValueError) as pe:
969
- df.observe(Observation())
970
-
971
- self.check_error(
972
- exception=pe.exception,
973
- error_class="CANNOT_BE_EMPTY",
974
- message_parameters={"item": "exprs"},
975
- )
976
-
977
- # dataframe.observe requires non-None Columns
978
- for args in [(None,), ("id",), (lit(1), None), (lit(1), "id")]:
979
- with self.subTest(args=args):
980
- with self.assertRaises(PySparkTypeError) as pe:
981
- df.observe(Observation(), *args)
982
-
983
- self.check_error(
984
- exception=pe.exception,
985
- error_class="NOT_LIST_OF_COLUMN",
986
- message_parameters={"arg_name": "exprs"},
987
- )
988
-
989
- def test_observe_str(self):
990
- # SPARK-38760: tests the DataFrame.observe(str, *Column) method
991
- from pyspark.sql.streaming import StreamingQueryListener
992
-
993
- observed_metrics = None
994
-
995
- class TestListener(StreamingQueryListener):
996
- def onQueryStarted(self, event):
997
- pass
998
-
999
- def onQueryProgress(self, event):
1000
- nonlocal observed_metrics
1001
- observed_metrics = event.progress.observedMetrics
1002
-
1003
- def onQueryIdle(self, event):
1004
- pass
1005
-
1006
- def onQueryTerminated(self, event):
1007
- pass
1008
-
1009
- self.spark.streams.addListener(TestListener())
1010
-
1011
- df = self.spark.readStream.format("rate").option("rowsPerSecond", 10).load()
1012
- df = df.observe("metric", count(lit(1)).alias("cnt"), sum(col("value")).alias("sum"))
1013
- q = df.writeStream.format("noop").queryName("test").start()
1014
- self.assertTrue(q.isActive)
1015
- time.sleep(10)
1016
- q.stop()
1017
-
1018
- self.assertTrue(isinstance(observed_metrics, dict))
1019
- self.assertTrue("metric" in observed_metrics)
1020
- row = observed_metrics["metric"]
1021
- self.assertTrue(isinstance(row, Row))
1022
- self.assertTrue(hasattr(row, "cnt"))
1023
- self.assertTrue(hasattr(row, "sum"))
1024
- self.assertGreaterEqual(row.cnt, 0)
1025
- self.assertGreaterEqual(row.sum, 0)
1026
-
1027
- def test_sample(self):
1028
- with self.assertRaises(PySparkTypeError) as pe:
1029
- self.spark.range(1).sample()
1030
-
1031
- self.check_error(
1032
- exception=pe.exception,
1033
- error_class="NOT_BOOL_OR_FLOAT_OR_INT",
1034
- message_parameters={
1035
- "arg_name": "withReplacement (optional), fraction (required) and seed (optional)",
1036
- "arg_type": "NoneType, NoneType, NoneType",
1037
- },
1038
- )
1039
-
1040
- self.assertRaises(TypeError, lambda: self.spark.range(1).sample("a"))
1041
-
1042
- self.assertRaises(TypeError, lambda: self.spark.range(1).sample(seed="abc"))
1043
-
1044
- self.assertRaises(
1045
- IllegalArgumentException, lambda: self.spark.range(1).sample(-1.0).count()
1046
- )
1047
-
1048
- def test_sample_with_random_seed(self):
1049
- df = self.spark.range(10000).sample(0.1)
1050
- cnts = [df.count() for i in range(10)]
1051
- self.assertEqual(1, len(set(cnts)))
1052
-
1053
- def test_toDF_with_string(self):
1054
- df = self.spark.createDataFrame([("John", 30), ("Alice", 25), ("Bob", 28)])
1055
- data = [("John", 30), ("Alice", 25), ("Bob", 28)]
1056
-
1057
- result = df.toDF("key", "value")
1058
- self.assertEqual(result.schema.simpleString(), "struct<key:string,value:bigint>")
1059
- self.assertEqual(result.collect(), data)
1060
-
1061
- with self.assertRaises(PySparkTypeError) as pe:
1062
- df.toDF("key", None)
1063
-
1064
- self.check_error(
1065
- exception=pe.exception,
1066
- error_class="NOT_LIST_OF_STR",
1067
- message_parameters={"arg_name": "cols", "arg_type": "NoneType"},
1068
- )
1069
-
1070
- def test_toDF_with_schema_string(self):
1071
- data = [Row(key=i, value=str(i)) for i in range(100)]
1072
- rdd = self.sc.parallelize(data, 5)
1073
-
1074
- df = rdd.toDF("key: int, value: string")
1075
- self.assertEqual(df.schema.simpleString(), "struct<key:int,value:string>")
1076
- self.assertEqual(df.collect(), data)
1077
-
1078
- # different but compatible field types can be used.
1079
- df = rdd.toDF("key: string, value: string")
1080
- self.assertEqual(df.schema.simpleString(), "struct<key:string,value:string>")
1081
- self.assertEqual(df.collect(), [Row(key=str(i), value=str(i)) for i in range(100)])
1082
-
1083
- # field names can differ.
1084
- df = rdd.toDF(" a: int, b: string ")
1085
- self.assertEqual(df.schema.simpleString(), "struct<a:int,b:string>")
1086
- self.assertEqual(df.collect(), data)
1087
-
1088
- # number of fields must match.
1089
- self.assertRaisesRegex(
1090
- Exception, "LENGTH_SHOULD_BE_THE_SAME", lambda: rdd.toDF("key: int").collect()
1091
- )
1092
-
1093
- # a field type mismatch will cause an exception at runtime.
1094
- self.assertRaisesRegex(
1095
- Exception,
1096
- "CANNOT_ACCEPT_OBJECT_IN_TYPE",
1097
- lambda: rdd.toDF("key: float, value: string").collect(),
1098
- )
1099
-
1100
- # flat schema values will be wrapped into row.
1101
- df = rdd.map(lambda row: row.key).toDF("int")
1102
- self.assertEqual(df.schema.simpleString(), "struct<value:int>")
1103
- self.assertEqual(df.collect(), [Row(key=i) for i in range(100)])
1104
-
1105
- # users can use DataType directly instead of data type string.
1106
- df = rdd.map(lambda row: row.key).toDF(IntegerType())
1107
- self.assertEqual(df.schema.simpleString(), "struct<value:int>")
1108
- self.assertEqual(df.collect(), [Row(key=i) for i in range(100)])
1109
-
1110
- def test_print_schema(self):
1111
- df = self.spark.createDataFrame([(1, (2, 2))], ["a", "b"])
1112
-
1113
- with io.StringIO() as buf, redirect_stdout(buf):
1114
- df.printSchema(1)
1115
- self.assertEqual(1, buf.getvalue().count("long"))
1116
- self.assertEqual(0, buf.getvalue().count("_1"))
1117
- self.assertEqual(0, buf.getvalue().count("_2"))
1118
-
1119
- buf.truncate(0)
1120
- buf.seek(0)
1121
-
1122
- df.printSchema(2)
1123
- self.assertEqual(3, buf.getvalue().count("long"))
1124
- self.assertEqual(1, buf.getvalue().count("_1"))
1125
- self.assertEqual(1, buf.getvalue().count("_2"))
1126
-
1127
- def test_join_without_on(self):
1128
- df1 = self.spark.range(1).toDF("a")
1129
- df2 = self.spark.range(1).toDF("b")
1130
-
1131
- with self.sql_conf({"spark.sql.crossJoin.enabled": False}):
1132
- self.assertRaises(AnalysisException, lambda: df1.join(df2, how="inner").collect())
1133
-
1134
- with self.sql_conf({"spark.sql.crossJoin.enabled": True}):
1135
- actual = df1.join(df2, how="inner").collect()
1136
- expected = [Row(a=0, b=0)]
1137
- self.assertEqual(actual, expected)
1138
-
1139
- # Regression test for invalid join methods when on is None, SPARK-14761
1140
- def test_invalid_join_method(self):
1141
- df1 = self.spark.createDataFrame([("Alice", 5), ("Bob", 8)], ["name", "age"])
1142
- df2 = self.spark.createDataFrame([("Alice", 80), ("Bob", 90)], ["name", "height"])
1143
- self.assertRaises(IllegalArgumentException, lambda: df1.join(df2, how="invalid-join-type"))
1144
-
1145
- # Cartesian products require cross join syntax
1146
- def test_require_cross(self):
1147
-
1148
- df1 = self.spark.createDataFrame([(1, "1")], ("key", "value"))
1149
- df2 = self.spark.createDataFrame([(1, "1")], ("key", "value"))
1150
-
1151
- with self.sql_conf({"spark.sql.crossJoin.enabled": False}):
1152
- # joins without conditions require cross join syntax
1153
- self.assertRaises(AnalysisException, lambda: df1.join(df2).collect())
1154
-
1155
- # works with crossJoin
1156
- self.assertEqual(1, df1.crossJoin(df2).count())
1157
-
1158
- def test_cache_dataframe(self):
1159
- df = self.spark.createDataFrame([(2, 2), (3, 3)])
1160
- try:
1161
- self.assertEqual(df.storageLevel, StorageLevel.NONE)
1162
-
1163
- df.cache()
1164
- self.assertEqual(df.storageLevel, StorageLevel.MEMORY_AND_DISK_DESER)
1165
-
1166
- df.unpersist()
1167
- self.assertEqual(df.storageLevel, StorageLevel.NONE)
1168
-
1169
- df.persist()
1170
- self.assertEqual(df.storageLevel, StorageLevel.MEMORY_AND_DISK_DESER)
1171
-
1172
- df.unpersist(blocking=True)
1173
- self.assertEqual(df.storageLevel, StorageLevel.NONE)
1174
-
1175
- df.persist(StorageLevel.DISK_ONLY)
1176
- self.assertEqual(df.storageLevel, StorageLevel.DISK_ONLY)
1177
- finally:
1178
- df.unpersist()
1179
- self.assertEqual(df.storageLevel, StorageLevel.NONE)
1180
-
1181
- def test_cache_table(self):
1182
- spark = self.spark
1183
- tables = ["tab1", "tab2", "tab3"]
1184
- with self.tempView(*tables):
1185
- for i, tab in enumerate(tables):
1186
- spark.createDataFrame([(2, i), (3, i)]).createOrReplaceTempView(tab)
1187
- self.assertFalse(spark.catalog.isCached(tab))
1188
- spark.catalog.cacheTable("tab1")
1189
- spark.catalog.cacheTable("tab3", StorageLevel.OFF_HEAP)
1190
- self.assertTrue(spark.catalog.isCached("tab1"))
1191
- self.assertFalse(spark.catalog.isCached("tab2"))
1192
- self.assertTrue(spark.catalog.isCached("tab3"))
1193
- spark.catalog.cacheTable("tab2")
1194
- spark.catalog.uncacheTable("tab1")
1195
- spark.catalog.uncacheTable("tab3")
1196
- self.assertFalse(spark.catalog.isCached("tab1"))
1197
- self.assertTrue(spark.catalog.isCached("tab2"))
1198
- self.assertFalse(spark.catalog.isCached("tab3"))
1199
- spark.catalog.clearCache()
1200
- self.assertFalse(spark.catalog.isCached("tab1"))
1201
- self.assertFalse(spark.catalog.isCached("tab2"))
1202
- self.assertFalse(spark.catalog.isCached("tab3"))
1203
- self.assertRaisesRegex(
1204
- AnalysisException,
1205
- "does_not_exist",
1206
- lambda: spark.catalog.isCached("does_not_exist"),
1207
- )
1208
- self.assertRaisesRegex(
1209
- AnalysisException,
1210
- "does_not_exist",
1211
- lambda: spark.catalog.cacheTable("does_not_exist"),
1212
- )
1213
- self.assertRaisesRegex(
1214
- AnalysisException,
1215
- "does_not_exist",
1216
- lambda: spark.catalog.uncacheTable("does_not_exist"),
1217
- )
1218
-
1219
- def _to_pandas(self):
1220
- from datetime import datetime, date, timedelta
1221
-
1222
- schema = (
1223
- StructType()
1224
- .add("a", IntegerType())
1225
- .add("b", StringType())
1226
- .add("c", BooleanType())
1227
- .add("d", FloatType())
1228
- .add("dt", DateType())
1229
- .add("ts", TimestampType())
1230
- .add("ts_ntz", TimestampNTZType())
1231
- .add("dt_interval", DayTimeIntervalType())
1232
- )
1233
- data = [
1234
- (
1235
- 1,
1236
- "foo",
1237
- True,
1238
- 3.0,
1239
- date(1969, 1, 1),
1240
- datetime(1969, 1, 1, 1, 1, 1),
1241
- datetime(1969, 1, 1, 1, 1, 1),
1242
- timedelta(days=1),
1243
- ),
1244
- (2, "foo", True, 5.0, None, None, None, None),
1245
- (
1246
- 3,
1247
- "bar",
1248
- False,
1249
- -1.0,
1250
- date(2012, 3, 3),
1251
- datetime(2012, 3, 3, 3, 3, 3),
1252
- datetime(2012, 3, 3, 3, 3, 3),
1253
- timedelta(hours=-1, milliseconds=421),
1254
- ),
1255
- (
1256
- 4,
1257
- "bar",
1258
- False,
1259
- 6.0,
1260
- date(2100, 4, 4),
1261
- datetime(2100, 4, 4, 4, 4, 4),
1262
- datetime(2100, 4, 4, 4, 4, 4),
1263
- timedelta(microseconds=123),
1264
- ),
1265
- ]
1266
- df = self.spark.createDataFrame(data, schema)
1267
- return df.toPandas()
1268
-
1269
- @unittest.skipIf(not have_pandas, pandas_requirement_message) # type: ignore
1270
- def test_to_pandas(self):
1271
- import numpy as np
1272
-
1273
- pdf = self._to_pandas()
1274
- types = pdf.dtypes
1275
- self.assertEqual(types[0], np.int32)
1276
- self.assertEqual(types[1], object)
1277
- self.assertEqual(types[2], bool)
1278
- self.assertEqual(types[3], np.float32)
1279
- self.assertEqual(types[4], object) # datetime.date
1280
- self.assertEqual(types[5], "datetime64[ns]")
1281
- self.assertEqual(types[6], "datetime64[ns]")
1282
- self.assertEqual(types[7], "timedelta64[ns]")
1283
-
1284
- @unittest.skipIf(not have_pandas, pandas_requirement_message) # type: ignore
1285
- def test_to_pandas_with_duplicated_column_names(self):
1286
- for arrow_enabled in [False, True]:
1287
- with self.sql_conf({"spark.sql.execution.arrow.pyspark.enabled": arrow_enabled}):
1288
- self.check_to_pandas_with_duplicated_column_names()
1289
-
1290
- def check_to_pandas_with_duplicated_column_names(self):
1291
- import numpy as np
1292
-
1293
- sql = "select 1 v, 1 v"
1294
- df = self.spark.sql(sql)
1295
- pdf = df.toPandas()
1296
- types = pdf.dtypes
1297
- self.assertEqual(types.iloc[0], np.int32)
1298
- self.assertEqual(types.iloc[1], np.int32)
1299
-
1300
- @unittest.skipIf(not have_pandas, pandas_requirement_message) # type: ignore
1301
- def test_to_pandas_on_cross_join(self):
1302
- for arrow_enabled in [False, True]:
1303
- with self.sql_conf({"spark.sql.execution.arrow.pyspark.enabled": arrow_enabled}):
1304
- self.check_to_pandas_on_cross_join()
1305
-
1306
- def check_to_pandas_on_cross_join(self):
1307
- import numpy as np
1308
-
1309
- sql = """
1310
- select t1.*, t2.* from (
1311
- select explode(sequence(1, 3)) v
1312
- ) t1 left join (
1313
- select explode(sequence(1, 3)) v
1314
- ) t2
1315
- """
1316
- with self.sql_conf({"spark.sql.crossJoin.enabled": True}):
1317
- df = self.spark.sql(sql)
1318
- pdf = df.toPandas()
1319
- types = pdf.dtypes
1320
- self.assertEqual(types.iloc[0], np.int32)
1321
- self.assertEqual(types.iloc[1], np.int32)
1322
-
1323
- @unittest.skipIf(have_pandas, "Required Pandas was found.")
1324
- def test_to_pandas_required_pandas_not_found(self):
1325
- with QuietTest(self.sc):
1326
- with self.assertRaisesRegex(ImportError, "Pandas >= .* must be installed"):
1327
- self._to_pandas()
1328
-
1329
- @unittest.skipIf(not have_pandas, pandas_requirement_message) # type: ignore
1330
- def test_to_pandas_avoid_astype(self):
1331
- import numpy as np
1332
-
1333
- schema = StructType().add("a", IntegerType()).add("b", StringType()).add("c", IntegerType())
1334
- data = [(1, "foo", 16777220), (None, "bar", None)]
1335
- df = self.spark.createDataFrame(data, schema)
1336
- types = df.toPandas().dtypes
1337
- self.assertEqual(types[0], np.float64) # doesn't convert to np.int32 due to NaN value.
1338
- self.assertEqual(types[1], object)
1339
- self.assertEqual(types[2], np.float64)
1340
-
1341
- @unittest.skipIf(not have_pandas, pandas_requirement_message) # type: ignore
1342
- def test_to_pandas_from_empty_dataframe(self):
1343
- is_arrow_enabled = [True, False]
1344
- for value in is_arrow_enabled:
1345
- with self.sql_conf({"spark.sql.execution.arrow.pyspark.enabled": value}):
1346
- self.check_to_pandas_from_empty_dataframe()
1347
-
1348
- def check_to_pandas_from_empty_dataframe(self):
1349
- # SPARK-29188 test that toPandas() on an empty dataframe has the correct dtypes
1350
- # SPARK-30537 test that toPandas() on an empty dataframe has the correct dtypes
1351
- # when arrow is enabled
1352
- import numpy as np
1353
-
1354
- sql = """
1355
- SELECT CAST(1 AS TINYINT) AS tinyint,
1356
- CAST(1 AS SMALLINT) AS smallint,
1357
- CAST(1 AS INT) AS int,
1358
- CAST(1 AS BIGINT) AS bigint,
1359
- CAST(0 AS FLOAT) AS float,
1360
- CAST(0 AS DOUBLE) AS double,
1361
- CAST(1 AS BOOLEAN) AS boolean,
1362
- CAST('foo' AS STRING) AS string,
1363
- CAST('2019-01-01' AS TIMESTAMP) AS timestamp,
1364
- CAST('2019-01-01' AS TIMESTAMP_NTZ) AS timestamp_ntz,
1365
- INTERVAL '1563:04' MINUTE TO SECOND AS day_time_interval
1366
- """
1367
- dtypes_when_nonempty_df = self.spark.sql(sql).toPandas().dtypes
1368
- dtypes_when_empty_df = self.spark.sql(sql).filter("False").toPandas().dtypes
1369
- self.assertTrue(np.all(dtypes_when_empty_df == dtypes_when_nonempty_df))
1370
-
1371
- @unittest.skipIf(not have_pandas, pandas_requirement_message) # type: ignore
1372
- def test_to_pandas_from_null_dataframe(self):
1373
- is_arrow_enabled = [True, False]
1374
- for value in is_arrow_enabled:
1375
- with self.sql_conf({"spark.sql.execution.arrow.pyspark.enabled": value}):
1376
- self.check_to_pandas_from_null_dataframe()
1377
-
1378
- def check_to_pandas_from_null_dataframe(self):
1379
- # SPARK-29188 test that toPandas() on a dataframe with only nulls has correct dtypes
1380
- # SPARK-30537 test that toPandas() on a dataframe with only nulls has correct dtypes
1381
- # using arrow
1382
- import numpy as np
1383
-
1384
- sql = """
1385
- SELECT CAST(NULL AS TINYINT) AS tinyint,
1386
- CAST(NULL AS SMALLINT) AS smallint,
1387
- CAST(NULL AS INT) AS int,
1388
- CAST(NULL AS BIGINT) AS bigint,
1389
- CAST(NULL AS FLOAT) AS float,
1390
- CAST(NULL AS DOUBLE) AS double,
1391
- CAST(NULL AS BOOLEAN) AS boolean,
1392
- CAST(NULL AS STRING) AS string,
1393
- CAST(NULL AS TIMESTAMP) AS timestamp,
1394
- CAST(NULL AS TIMESTAMP_NTZ) AS timestamp_ntz,
1395
- INTERVAL '1563:04' MINUTE TO SECOND AS day_time_interval
1396
- """
1397
- pdf = self.spark.sql(sql).toPandas()
1398
- types = pdf.dtypes
1399
- self.assertEqual(types[0], np.float64)
1400
- self.assertEqual(types[1], np.float64)
1401
- self.assertEqual(types[2], np.float64)
1402
- self.assertEqual(types[3], np.float64)
1403
- self.assertEqual(types[4], np.float32)
1404
- self.assertEqual(types[5], np.float64)
1405
- self.assertEqual(types[6], object)
1406
- self.assertEqual(types[7], object)
1407
- self.assertTrue(np.can_cast(np.datetime64, types[8]))
1408
- self.assertTrue(np.can_cast(np.datetime64, types[9]))
1409
- self.assertTrue(np.can_cast(np.timedelta64, types[10]))
1410
-
1411
- @unittest.skipIf(not have_pandas, pandas_requirement_message) # type: ignore
1412
- def test_to_pandas_from_mixed_dataframe(self):
1413
- is_arrow_enabled = [True, False]
1414
- for value in is_arrow_enabled:
1415
- with self.sql_conf({"spark.sql.execution.arrow.pyspark.enabled": value}):
1416
- self.check_to_pandas_from_mixed_dataframe()
1417
-
1418
- def check_to_pandas_from_mixed_dataframe(self):
1419
- # SPARK-29188 test that toPandas() on a dataframe with some nulls has correct dtypes
1420
- # SPARK-30537 test that toPandas() on a dataframe with some nulls has correct dtypes
1421
- # using arrow
1422
- import numpy as np
1423
-
1424
- sql = """
1425
- SELECT CAST(col1 AS TINYINT) AS tinyint,
1426
- CAST(col2 AS SMALLINT) AS smallint,
1427
- CAST(col3 AS INT) AS int,
1428
- CAST(col4 AS BIGINT) AS bigint,
1429
- CAST(col5 AS FLOAT) AS float,
1430
- CAST(col6 AS DOUBLE) AS double,
1431
- CAST(col7 AS BOOLEAN) AS boolean,
1432
- CAST(col8 AS STRING) AS string,
1433
- timestamp_seconds(col9) AS timestamp,
1434
- timestamp_seconds(col10) AS timestamp_ntz,
1435
- INTERVAL '1563:04' MINUTE TO SECOND AS day_time_interval
1436
- FROM VALUES (1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1),
1437
- (NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL)
1438
- """
1439
- pdf_with_some_nulls = self.spark.sql(sql).toPandas()
1440
- pdf_with_only_nulls = self.spark.sql(sql).filter("tinyint is null").toPandas()
1441
- self.assertTrue(np.all(pdf_with_only_nulls.dtypes == pdf_with_some_nulls.dtypes))
1442
-
1443
- @unittest.skipIf(
1444
- not have_pandas or not have_pyarrow or pyarrow_version_less_than_minimum("2.0.0"),
1445
- pandas_requirement_message
1446
- or pyarrow_requirement_message
1447
- or "Pyarrow version must be 2.0.0 or higher",
1448
- )
1449
- def test_to_pandas_for_array_of_struct(self):
1450
- for is_arrow_enabled in [True, False]:
1451
- with self.sql_conf({"spark.sql.execution.arrow.pyspark.enabled": is_arrow_enabled}):
1452
- self.check_to_pandas_for_array_of_struct(is_arrow_enabled)
1453
-
1454
- def check_to_pandas_for_array_of_struct(self, is_arrow_enabled):
1455
- # SPARK-38098: Support Array of Struct for Pandas UDFs and toPandas
1456
- import numpy as np
1457
- import pandas as pd
1458
-
1459
- df = self.spark.createDataFrame(
1460
- [[[("a", 2, 3.0), ("a", 2, 3.0)]], [[("b", 5, 6.0), ("b", 5, 6.0)]]],
1461
- "array_struct_col Array<struct<col1:string, col2:long, col3:double>>",
1462
- )
1463
-
1464
- pdf = df.toPandas()
1465
- self.assertEqual(type(pdf), pd.DataFrame)
1466
- self.assertEqual(type(pdf["array_struct_col"]), pd.Series)
1467
- if is_arrow_enabled:
1468
- self.assertEqual(type(pdf["array_struct_col"][0]), np.ndarray)
1469
- else:
1470
- self.assertEqual(type(pdf["array_struct_col"][0]), list)
1471
-
1472
- def test_create_dataframe_from_array_of_long(self):
1473
- import array
1474
-
1475
- data = [Row(longarray=array.array("l", [-9223372036854775808, 0, 9223372036854775807]))]
1476
- df = self.spark.createDataFrame(data)
1477
- self.assertEqual(df.first(), Row(longarray=[-9223372036854775808, 0, 9223372036854775807]))
1478
-
1479
- @unittest.skipIf(not have_pandas, pandas_requirement_message) # type: ignore
1480
- def test_create_dataframe_from_pandas_with_timestamp(self):
1481
- import pandas as pd
1482
- from datetime import datetime
1483
-
1484
- pdf = pd.DataFrame(
1485
- {"ts": [datetime(2017, 10, 31, 1, 1, 1)], "d": [pd.Timestamp.now().date()]},
1486
- columns=["d", "ts"],
1487
- )
1488
- # test types are inferred correctly without specifying schema
1489
- df = self.spark.createDataFrame(pdf)
1490
- self.assertIsInstance(df.schema["ts"].dataType, TimestampType)
1491
- self.assertIsInstance(df.schema["d"].dataType, DateType)
1492
- # test with schema will accept pdf as input
1493
- df = self.spark.createDataFrame(pdf, schema="d date, ts timestamp")
1494
- self.assertIsInstance(df.schema["ts"].dataType, TimestampType)
1495
- self.assertIsInstance(df.schema["d"].dataType, DateType)
1496
- df = self.spark.createDataFrame(pdf, schema="d date, ts timestamp_ntz")
1497
- self.assertIsInstance(df.schema["ts"].dataType, TimestampNTZType)
1498
- self.assertIsInstance(df.schema["d"].dataType, DateType)
1499
-
1500
- @unittest.skipIf(have_pandas, "Required Pandas was found.")
1501
- def test_create_dataframe_required_pandas_not_found(self):
1502
- with QuietTest(self.sc):
1503
- with self.assertRaisesRegex(
1504
- ImportError, "(Pandas >= .* must be installed|No module named '?pandas'?)"
1505
- ):
1506
- import pandas as pd
1507
- from datetime import datetime
1508
-
1509
- pdf = pd.DataFrame(
1510
- {"ts": [datetime(2017, 10, 31, 1, 1, 1)], "d": [pd.Timestamp.now().date()]}
1511
- )
1512
- self.spark.createDataFrame(pdf)
1513
-
1514
- # Regression test for SPARK-23360
1515
- @unittest.skipIf(not have_pandas, pandas_requirement_message) # type: ignore
1516
- def test_create_dataframe_from_pandas_with_dst(self):
1517
- import pandas as pd
1518
- from pandas.testing import assert_frame_equal
1519
- from datetime import datetime
1520
-
1521
- pdf = pd.DataFrame({"time": [datetime(2015, 10, 31, 22, 30)]})
1522
-
1523
- df = self.spark.createDataFrame(pdf)
1524
- assert_frame_equal(pdf, df.toPandas())
1525
-
1526
- orig_env_tz = os.environ.get("TZ", None)
1527
- try:
1528
- tz = "America/Los_Angeles"
1529
- os.environ["TZ"] = tz
1530
- time.tzset()
1531
- with self.sql_conf({"spark.sql.session.timeZone": tz}):
1532
- df = self.spark.createDataFrame(pdf)
1533
- assert_frame_equal(pdf, df.toPandas())
1534
- finally:
1535
- del os.environ["TZ"]
1536
- if orig_env_tz is not None:
1537
- os.environ["TZ"] = orig_env_tz
1538
- time.tzset()
1539
-
1540
- # TODO(SPARK-43354): Re-enable test_create_dataframe_from_pandas_with_day_time_interval
1541
- @unittest.skipIf(
1542
- "pypy" in platform.python_implementation().lower(),
1543
- "Fails in PyPy Python 3.8, should enable.",
1544
- )
1545
- def test_create_dataframe_from_pandas_with_day_time_interval(self):
1546
- # SPARK-37277: Test DayTimeIntervalType in createDataFrame without Arrow.
1547
- import pandas as pd
1548
- from datetime import timedelta
1549
-
1550
- df = self.spark.createDataFrame(pd.DataFrame({"a": [timedelta(microseconds=123)]}))
1551
- self.assertEqual(df.toPandas().a.iloc[0], timedelta(microseconds=123))
1552
-
1553
- @unittest.skipIf(
1554
- "SPARK_SKIP_CONNECT_COMPAT_TESTS" in os.environ, "Newline difference from the server"
1555
- )
1556
- def test_repr_behaviors(self):
1557
- import re
1558
-
1559
- pattern = re.compile(r"^ *\|", re.MULTILINE)
1560
- df = self.spark.createDataFrame([(1, "1"), (22222, "22222")], ("key", "value"))
1561
-
1562
- # test when eager evaluation is enabled and _repr_html_ will not be called
1563
- with self.sql_conf({"spark.sql.repl.eagerEval.enabled": True}):
1564
- expected1 = """+-----+-----+
1565
- || key|value|
1566
- |+-----+-----+
1567
- || 1| 1|
1568
- ||22222|22222|
1569
- |+-----+-----+
1570
- |"""
1571
- self.assertEqual(re.sub(pattern, "", expected1), df.__repr__())
1572
- with self.sql_conf({"spark.sql.repl.eagerEval.truncate": 3}):
1573
- expected2 = """+---+-----+
1574
- ||key|value|
1575
- |+---+-----+
1576
- || 1| 1|
1577
- ||222| 222|
1578
- |+---+-----+
1579
- |"""
1580
- self.assertEqual(re.sub(pattern, "", expected2), df.__repr__())
1581
- with self.sql_conf({"spark.sql.repl.eagerEval.maxNumRows": 1}):
1582
- expected3 = """+---+-----+
1583
- ||key|value|
1584
- |+---+-----+
1585
- || 1| 1|
1586
- |+---+-----+
1587
- |only showing top 1 row
1588
- |"""
1589
- self.assertEqual(re.sub(pattern, "", expected3), df.__repr__())
1590
-
1591
- # test when eager evaluation is enabled and _repr_html_ will be called
1592
- with self.sql_conf({"spark.sql.repl.eagerEval.enabled": True}):
1593
- expected1 = """<table border='1'>
1594
- |<tr><th>key</th><th>value</th></tr>
1595
- |<tr><td>1</td><td>1</td></tr>
1596
- |<tr><td>22222</td><td>22222</td></tr>
1597
- |</table>
1598
- |"""
1599
- self.assertEqual(re.sub(pattern, "", expected1), df._repr_html_())
1600
- with self.sql_conf({"spark.sql.repl.eagerEval.truncate": 3}):
1601
- expected2 = """<table border='1'>
1602
- |<tr><th>key</th><th>value</th></tr>
1603
- |<tr><td>1</td><td>1</td></tr>
1604
- |<tr><td>222</td><td>222</td></tr>
1605
- |</table>
1606
- |"""
1607
- self.assertEqual(re.sub(pattern, "", expected2), df._repr_html_())
1608
- with self.sql_conf({"spark.sql.repl.eagerEval.maxNumRows": 1}):
1609
- expected3 = """<table border='1'>
1610
- |<tr><th>key</th><th>value</th></tr>
1611
- |<tr><td>1</td><td>1</td></tr>
1612
- |</table>
1613
- |only showing top 1 row
1614
- |"""
1615
- self.assertEqual(re.sub(pattern, "", expected3), df._repr_html_())
1616
-
1617
- # test when eager evaluation is disabled and _repr_html_ will be called
1618
- with self.sql_conf({"spark.sql.repl.eagerEval.enabled": False}):
1619
- expected = "DataFrame[key: bigint, value: string]"
1620
- self.assertEqual(None, df._repr_html_())
1621
- self.assertEqual(expected, df.__repr__())
1622
- with self.sql_conf({"spark.sql.repl.eagerEval.truncate": 3}):
1623
- self.assertEqual(None, df._repr_html_())
1624
- self.assertEqual(expected, df.__repr__())
1625
- with self.sql_conf({"spark.sql.repl.eagerEval.maxNumRows": 1}):
1626
- self.assertEqual(None, df._repr_html_())
1627
- self.assertEqual(expected, df.__repr__())
1628
-
1629
-    def test_to_local_iterator(self):
-        df = self.spark.range(8, numPartitions=4)
-        expected = df.collect()
-        it = df.toLocalIterator()
-        self.assertEqual(expected, list(it))
-
-        # Test DataFrame with empty partition
-        df = self.spark.range(3, numPartitions=4)
-        it = df.toLocalIterator()
-        expected = df.collect()
-        self.assertEqual(expected, list(it))
-
-    def test_to_local_iterator_prefetch(self):
-        df = self.spark.range(8, numPartitions=4)
-        expected = df.collect()
-        it = df.toLocalIterator(prefetchPartitions=True)
-        self.assertEqual(expected, list(it))
-
-    def test_to_local_iterator_not_fully_consumed(self):
-        with QuietTest(self.sc):
-            self.check_to_local_iterator_not_fully_consumed()
-
-    def check_to_local_iterator_not_fully_consumed(self):
-        # SPARK-23961: toLocalIterator throws exception when not fully consumed
-        # Create a DataFrame large enough so that write to socket will eventually block
-        df = self.spark.range(1 << 20, numPartitions=2)
-        it = df.toLocalIterator()
-        self.assertEqual(df.take(1)[0], next(it))
-        it = None  # remove iterator from scope, socket is closed when cleaned up
-        # Make sure normal df operations still work
-        result = []
-        for i, row in enumerate(df.toLocalIterator()):
-            result.append(row)
-            if i == 7:
-                break
-        self.assertEqual(df.take(8), result)
-
-    def test_same_semantics_error(self):
-        with QuietTest(self.sc):
-            with self.assertRaises(PySparkTypeError) as pe:
-                self.spark.range(10).sameSemantics(1)
-
-            self.check_error(
-                exception=pe.exception,
-                error_class="NOT_STR",
-                message_parameters={"arg_name": "other", "arg_type": "int"},
-            )
-
-    def test_input_files(self):
-        tpath = tempfile.mkdtemp()
-        shutil.rmtree(tpath)
-        try:
-            self.spark.range(1, 100, 1, 10).write.parquet(tpath)
-            # read parquet file and get the input files list
-            input_files_list = self.spark.read.parquet(tpath).inputFiles()
-
-            # input files list should contain 10 entries
-            self.assertEqual(len(input_files_list), 10)
-            # all file paths in list must contain tpath
-            for file_path in input_files_list:
-                self.assertTrue(tpath in file_path)
-        finally:
-            shutil.rmtree(tpath)
-
-    def test_df_show(self):
-        # SPARK-35408: ensure better diagnostics if incorrect parameters are passed
-        # to DataFrame.show
-
-        df = self.spark.createDataFrame([("foo",)])
-        df.show(5)
-        df.show(5, True)
-        df.show(5, 1, True)
-        df.show(n=5, truncate="1", vertical=False)
-        df.show(n=5, truncate=1.5, vertical=False)
-
-        with self.assertRaises(PySparkTypeError) as pe:
-            df.show(True)
-
-        self.check_error(
-            exception=pe.exception,
-            error_class="NOT_INT",
-            message_parameters={"arg_name": "n", "arg_type": "bool"},
-        )
-
-        with self.assertRaises(PySparkTypeError) as pe:
-            df.show(vertical="foo")
-
-        self.check_error(
-            exception=pe.exception,
-            error_class="NOT_BOOL",
-            message_parameters={"arg_name": "vertical", "arg_type": "str"},
-        )
-
-        with self.assertRaises(PySparkTypeError) as pe:
-            df.show(truncate="foo")
-
-        self.check_error(
-            exception=pe.exception,
-            error_class="NOT_BOOL",
-            message_parameters={"arg_name": "truncate", "arg_type": "str"},
-        )
-
-    @unittest.skipIf(
-        not have_pandas or not have_pyarrow,
-        cast(str, pandas_requirement_message or pyarrow_requirement_message),
-    )
-    def test_pandas_api(self):
-        import pandas as pd
-        from pandas.testing import assert_frame_equal
-
-        sdf = self.spark.createDataFrame([("a", 1), ("b", 2), ("c", 3)], ["Col1", "Col2"])
-        psdf_from_sdf = sdf.pandas_api()
-        psdf_from_sdf_with_index = sdf.pandas_api(index_col="Col1")
-        pdf = pd.DataFrame({"Col1": ["a", "b", "c"], "Col2": [1, 2, 3]})
-        pdf_with_index = pdf.set_index("Col1")
-
-        assert_frame_equal(pdf, psdf_from_sdf.to_pandas())
-        assert_frame_equal(pdf_with_index, psdf_from_sdf_with_index.to_pandas())
-
-    # test for SPARK-36337
-    def test_create_nan_decimal_dataframe(self):
-        self.assertEqual(
-            self.spark.createDataFrame(data=[Decimal("NaN")], schema="decimal").collect(),
-            [Row(value=None)],
-        )
-
-    def test_to(self):
-        schema = StructType(
-            [StructField("i", StringType(), True), StructField("j", IntegerType(), True)]
-        )
-        df = self.spark.createDataFrame([("a", 1)], schema)
-
-        schema1 = StructType([StructField("j", StringType()), StructField("i", StringType())])
-        df1 = df.to(schema1)
-        self.assertEqual(schema1, df1.schema)
-        self.assertEqual(df.count(), df1.count())
-
-        schema2 = StructType([StructField("j", LongType())])
-        df2 = df.to(schema2)
-        self.assertEqual(schema2, df2.schema)
-        self.assertEqual(df.count(), df2.count())
-
-        schema3 = StructType([StructField("struct", schema1, False)])
-        df3 = df.select(struct("i", "j").alias("struct")).to(schema3)
-        self.assertEqual(schema3, df3.schema)
-        self.assertEqual(df.count(), df3.count())
-
-        # incompatible field nullability
-        schema4 = StructType([StructField("j", LongType(), False)])
-        self.assertRaisesRegex(
-            AnalysisException, "NULLABLE_COLUMN_OR_FIELD", lambda: df.to(schema4).count()
-        )
-
-        # field cannot upcast
-        schema5 = StructType([StructField("i", LongType())])
-        self.assertRaisesRegex(
-            AnalysisException, "INVALID_COLUMN_OR_FIELD_DATA_TYPE", lambda: df.to(schema5).count()
-        )
-
-    def test_repartition(self):
-        df = self.spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
-        with self.assertRaises(PySparkTypeError) as pe:
-            df.repartition([10], "name", "age").rdd.getNumPartitions()
-
-        self.check_error(
-            exception=pe.exception,
-            error_class="NOT_COLUMN_OR_STR",
-            message_parameters={"arg_name": "numPartitions", "arg_type": "list"},
-        )
-
-    def test_colregex(self):
-        with self.assertRaises(PySparkTypeError) as pe:
-            self.spark.range(10).colRegex(10)
-
-        self.check_error(
-            exception=pe.exception,
-            error_class="NOT_STR",
-            message_parameters={"arg_name": "colName", "arg_type": "int"},
-        )
-
-    def test_where(self):
-        with self.assertRaises(PySparkTypeError) as pe:
-            self.spark.range(10).where(10)
-
-        self.check_error(
-            exception=pe.exception,
-            error_class="NOT_COLUMN_OR_STR",
-            message_parameters={"arg_name": "condition", "arg_type": "int"},
-        )
-
-    def test_duplicate_field_names(self):
-        data = [
-            Row(Row("a", 1), Row(2, 3, "b", 4, "c", "d")),
-            Row(Row("w", 6), Row(7, 8, "x", 9, "y", "z")),
-        ]
-        schema = (
-            StructType()
-            .add("struct", StructType().add("x", StringType()).add("x", IntegerType()))
-            .add(
-                "struct",
-                StructType()
-                .add("a", IntegerType())
-                .add("x", IntegerType())
-                .add("x", StringType())
-                .add("y", IntegerType())
-                .add("y", StringType())
-                .add("x", StringType()),
-            )
-        )
-        df = self.spark.createDataFrame(data, schema=schema)
-
-        self.assertEqual(df.schema, schema)
-        self.assertEqual(df.collect(), data)
-
-
-class QueryExecutionListenerTests(unittest.TestCase, SQLTestUtils):
-    # These tests are separate because it uses 'spark.sql.queryExecutionListeners' which is
-    # static and immutable. This can't be set or unset, for example, via `spark.conf`.
-
-    @classmethod
-    def setUpClass(cls):
-        import glob
-        from pyspark.find_spark_home import _find_spark_home
-
-        SPARK_HOME = _find_spark_home()
-        filename_pattern = (
-            "sql/core/target/scala-*/test-classes/org/apache/spark/sql/"
-            "TestQueryExecutionListener.class"
-        )
-        cls.has_listener = bool(glob.glob(os.path.join(SPARK_HOME, filename_pattern)))
-
-        if cls.has_listener:
-            # Note that 'spark.sql.queryExecutionListeners' is a static immutable configuration.
-            cls.spark = (
-                SparkSession.builder.master("local[4]")
-                .appName(cls.__name__)
-                .config(
-                    "spark.sql.queryExecutionListeners",
-                    "org.apache.spark.sql.TestQueryExecutionListener",
-                )
-                .getOrCreate()
-            )
-
-    def setUp(self):
-        if not self.has_listener:
-            raise self.skipTest(
-                "'org.apache.spark.sql.TestQueryExecutionListener' is not "
-                "available. Will skip the related tests."
-            )
-
-    @classmethod
-    def tearDownClass(cls):
-        if hasattr(cls, "spark"):
-            cls.spark.stop()
-
-    def tearDown(self):
-        self.spark._jvm.OnSuccessCall.clear()
-
-    def test_query_execution_listener_on_collect(self):
-        self.assertFalse(
-            self.spark._jvm.OnSuccessCall.isCalled(),
-            "The callback from the query execution listener should not be called before 'collect'",
-        )
-        self.spark.sql("SELECT * FROM range(1)").collect()
-        self.spark.sparkContext._jsc.sc().listenerBus().waitUntilEmpty(10000)
-        self.assertTrue(
-            self.spark._jvm.OnSuccessCall.isCalled(),
-            "The callback from the query execution listener should be called after 'collect'",
-        )
-
-    @unittest.skipIf(
-        not have_pandas or not have_pyarrow,
-        cast(str, pandas_requirement_message or pyarrow_requirement_message),
-    )
-    def test_query_execution_listener_on_collect_with_arrow(self):
-        with self.sql_conf({"spark.sql.execution.arrow.pyspark.enabled": True}):
-            self.assertFalse(
-                self.spark._jvm.OnSuccessCall.isCalled(),
-                "The callback from the query execution listener should not be "
-                "called before 'toPandas'",
-            )
-            self.spark.sql("SELECT * FROM range(1)").toPandas()
-            self.spark.sparkContext._jsc.sc().listenerBus().waitUntilEmpty(10000)
-            self.assertTrue(
-                self.spark._jvm.OnSuccessCall.isCalled(),
-                "The callback from the query execution listener should be called after 'toPandas'",
-            )
-
-
-class DataFrameTests(DataFrameTestsMixin, ReusedSQLTestCase):
-    pass
-
-
-if __name__ == "__main__":
-    from pyspark.sql.tests.test_dataframe import *  # noqa: F401
-
-    try:
-        import xmlrunner  # type: ignore
-
-        testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
-    except ImportError:
-        testRunner = None
-    unittest.main(testRunner=testRunner, verbosity=2)