snowpark-connect 0.24.0__py3-none-any.whl → 0.26.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of snowpark-connect might be problematic. Click here for more details.

Files changed (484)
  1. snowflake/snowpark_connect/column_name_handler.py +116 -4
  2. snowflake/snowpark_connect/config.py +23 -0
  3. snowflake/snowpark_connect/constants.py +0 -29
  4. snowflake/snowpark_connect/dataframe_container.py +22 -0
  5. snowflake/snowpark_connect/execute_plan/map_execution_command.py +56 -1
  6. snowflake/snowpark_connect/expression/literal.py +13 -2
  7. snowflake/snowpark_connect/expression/map_cast.py +5 -8
  8. snowflake/snowpark_connect/expression/map_sql_expression.py +23 -1
  9. snowflake/snowpark_connect/expression/map_udf.py +88 -29
  10. snowflake/snowpark_connect/expression/map_unresolved_attribute.py +199 -15
  11. snowflake/snowpark_connect/expression/map_unresolved_extract_value.py +44 -16
  12. snowflake/snowpark_connect/expression/map_unresolved_function.py +840 -367
  13. snowflake/snowpark_connect/expression/map_unresolved_star.py +3 -2
  14. snowflake/snowpark_connect/hidden_column.py +39 -0
  15. snowflake/snowpark_connect/includes/jars/hadoop-client-api-trimmed-3.3.4.jar +0 -0
  16. snowflake/snowpark_connect/includes/jars/json4s-native_2.12-3.7.0-M11.jar +0 -0
  17. snowflake/snowpark_connect/includes/jars/paranamer-2.8.3.jar +0 -0
  18. snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.12-0.1.0.jar +0 -0
  19. snowflake/snowpark_connect/includes/jars/{hadoop-client-api-3.3.4.jar → spark-connect-client-jvm_2.12-3.5.6.jar} +0 -0
  20. snowflake/snowpark_connect/relation/map_column_ops.py +17 -4
  21. snowflake/snowpark_connect/relation/map_extension.py +52 -11
  22. snowflake/snowpark_connect/relation/map_join.py +258 -62
  23. snowflake/snowpark_connect/relation/map_map_partitions.py +9 -4
  24. snowflake/snowpark_connect/relation/map_relation.py +12 -1
  25. snowflake/snowpark_connect/relation/map_row_ops.py +8 -1
  26. snowflake/snowpark_connect/relation/map_sql.py +88 -11
  27. snowflake/snowpark_connect/relation/map_udtf.py +100 -46
  28. snowflake/snowpark_connect/relation/read/map_read.py +3 -3
  29. snowflake/snowpark_connect/relation/read/map_read_jdbc.py +1 -1
  30. snowflake/snowpark_connect/relation/read/map_read_json.py +8 -1
  31. snowflake/snowpark_connect/relation/read/map_read_table.py +1 -9
  32. snowflake/snowpark_connect/relation/read/reader_config.py +3 -1
  33. snowflake/snowpark_connect/relation/utils.py +44 -0
  34. snowflake/snowpark_connect/relation/write/map_write.py +175 -75
  35. snowflake/snowpark_connect/resources_initializer.py +47 -6
  36. snowflake/snowpark_connect/server.py +26 -4
  37. snowflake/snowpark_connect/type_mapping.py +29 -25
  38. snowflake/snowpark_connect/typed_column.py +14 -0
  39. snowflake/snowpark_connect/utils/artifacts.py +23 -0
  40. snowflake/snowpark_connect/utils/concurrent.py +4 -0
  41. snowflake/snowpark_connect/utils/context.py +6 -1
  42. snowflake/snowpark_connect/utils/external_udxf_cache.py +36 -0
  43. snowflake/snowpark_connect/utils/scala_udf_utils.py +596 -0
  44. snowflake/snowpark_connect/utils/session.py +4 -0
  45. snowflake/snowpark_connect/utils/telemetry.py +6 -17
  46. snowflake/snowpark_connect/utils/udf_helper.py +2 -0
  47. snowflake/snowpark_connect/utils/udf_utils.py +22 -1
  48. snowflake/snowpark_connect/utils/udtf_utils.py +1 -0
  49. snowflake/snowpark_connect/version.py +1 -1
  50. {snowpark_connect-0.24.0.dist-info → snowpark_connect-0.26.0.dist-info}/METADATA +1 -1
  51. snowpark_connect-0.26.0.dist-info/RECORD +481 -0
  52. snowflake/snowpark_connect/includes/jars/scala-compiler-2.12.18.jar +0 -0
  53. snowflake/snowpark_connect/includes/jars/spark-kubernetes_2.12-3.5.6.jar +0 -0
  54. snowflake/snowpark_connect/includes/jars/spark-mllib_2.12-3.5.6.jar +0 -0
  55. snowflake/snowpark_connect/includes/jars/spark-streaming_2.12-3.5.6.jar +0 -0
  56. snowflake/snowpark_connect/includes/python/pyspark/errors/tests/__init__.py +0 -16
  57. snowflake/snowpark_connect/includes/python/pyspark/errors/tests/test_errors.py +0 -60
  58. snowflake/snowpark_connect/includes/python/pyspark/ml/deepspeed/tests/test_deepspeed_distributor.py +0 -306
  59. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/__init__.py +0 -16
  60. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_classification.py +0 -53
  61. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_evaluation.py +0 -50
  62. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_feature.py +0 -43
  63. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_function.py +0 -114
  64. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_pipeline.py +0 -47
  65. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_summarizer.py +0 -43
  66. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_tuning.py +0 -46
  67. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_classification.py +0 -238
  68. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_evaluation.py +0 -194
  69. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_feature.py +0 -156
  70. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_pipeline.py +0 -184
  71. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_summarizer.py +0 -78
  72. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_tuning.py +0 -292
  73. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_parity_torch_data_loader.py +0 -50
  74. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_parity_torch_distributor.py +0 -152
  75. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_algorithms.py +0 -456
  76. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_base.py +0 -96
  77. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_dl_util.py +0 -186
  78. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_evaluation.py +0 -77
  79. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_feature.py +0 -401
  80. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_functions.py +0 -528
  81. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_image.py +0 -82
  82. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_linalg.py +0 -409
  83. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_model_cache.py +0 -55
  84. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_param.py +0 -441
  85. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_persistence.py +0 -546
  86. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_pipeline.py +0 -71
  87. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_stat.py +0 -52
  88. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_training_summary.py +0 -494
  89. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_util.py +0 -85
  90. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_wrapper.py +0 -138
  91. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/__init__.py +0 -16
  92. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_basic.py +0 -151
  93. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_nested.py +0 -97
  94. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_pipeline.py +0 -143
  95. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tuning.py +0 -551
  96. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_basic.py +0 -137
  97. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_nested.py +0 -96
  98. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_pipeline.py +0 -142
  99. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/__init__.py +0 -16
  100. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_data_loader.py +0 -137
  101. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_distributor.py +0 -561
  102. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_log_communication.py +0 -172
  103. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/__init__.py +0 -16
  104. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_algorithms.py +0 -353
  105. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_feature.py +0 -192
  106. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_linalg.py +0 -680
  107. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_stat.py +0 -206
  108. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_streaming_algorithms.py +0 -471
  109. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_util.py +0 -108
  110. snowflake/snowpark_connect/includes/python/pyspark/pandas/spark/__init__.py +0 -16
  111. snowflake/snowpark_connect/includes/python/pyspark/pandas/spark/accessors.py +0 -1281
  112. snowflake/snowpark_connect/includes/python/pyspark/pandas/spark/functions.py +0 -203
  113. snowflake/snowpark_connect/includes/python/pyspark/pandas/spark/utils.py +0 -202
  114. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/__init__.py +0 -16
  115. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/__init__.py +0 -16
  116. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_any_all.py +0 -177
  117. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_apply_func.py +0 -575
  118. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_binary_ops.py +0 -235
  119. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_combine.py +0 -653
  120. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_compute.py +0 -463
  121. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_corrwith.py +0 -86
  122. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_cov.py +0 -151
  123. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_cumulative.py +0 -139
  124. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_describe.py +0 -458
  125. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_eval.py +0 -86
  126. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_melt.py +0 -202
  127. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_missing_data.py +0 -520
  128. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_pivot.py +0 -361
  129. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/__init__.py +0 -16
  130. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/__init__.py +0 -16
  131. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_any_all.py +0 -40
  132. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_apply_func.py +0 -42
  133. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_binary_ops.py +0 -40
  134. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_combine.py +0 -37
  135. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_compute.py +0 -60
  136. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_corrwith.py +0 -40
  137. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_cov.py +0 -40
  138. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_cumulative.py +0 -90
  139. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_describe.py +0 -40
  140. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_eval.py +0 -40
  141. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_melt.py +0 -40
  142. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_missing_data.py +0 -42
  143. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_pivot.py +0 -37
  144. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/__init__.py +0 -16
  145. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_base.py +0 -36
  146. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_binary_ops.py +0 -42
  147. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_boolean_ops.py +0 -47
  148. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_categorical_ops.py +0 -55
  149. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_complex_ops.py +0 -40
  150. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_date_ops.py +0 -47
  151. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_datetime_ops.py +0 -47
  152. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_null_ops.py +0 -42
  153. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_arithmetic.py +0 -43
  154. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_ops.py +0 -47
  155. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_reverse.py +0 -43
  156. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_string_ops.py +0 -47
  157. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_timedelta_ops.py +0 -47
  158. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_udt_ops.py +0 -40
  159. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/testing_utils.py +0 -226
  160. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/__init__.py +0 -16
  161. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_align.py +0 -39
  162. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_basic_slow.py +0 -55
  163. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_cov_corrwith.py +0 -39
  164. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_dot_frame.py +0 -39
  165. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_dot_series.py +0 -39
  166. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_index.py +0 -39
  167. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_series.py +0 -39
  168. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_setitem_frame.py +0 -43
  169. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_setitem_series.py +0 -43
  170. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/__init__.py +0 -16
  171. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_attrs.py +0 -40
  172. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_constructor.py +0 -39
  173. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_conversion.py +0 -42
  174. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_reindexing.py +0 -42
  175. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_reshaping.py +0 -37
  176. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_spark.py +0 -40
  177. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_take.py +0 -42
  178. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_time_series.py +0 -48
  179. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_truncate.py +0 -40
  180. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/__init__.py +0 -16
  181. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_aggregate.py +0 -40
  182. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_apply_func.py +0 -41
  183. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_cumulative.py +0 -67
  184. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_describe.py +0 -40
  185. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_groupby.py +0 -55
  186. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_head_tail.py +0 -40
  187. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_index.py +0 -38
  188. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_missing_data.py +0 -55
  189. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py +0 -39
  190. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_stat.py +0 -38
  191. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/__init__.py +0 -16
  192. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_align.py +0 -40
  193. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_base.py +0 -50
  194. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_category.py +0 -73
  195. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_datetime.py +0 -39
  196. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_indexing.py +0 -40
  197. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_reindex.py +0 -40
  198. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_rename.py +0 -40
  199. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_reset_index.py +0 -48
  200. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_timedelta.py +0 -39
  201. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/io/__init__.py +0 -16
  202. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/io/test_parity_io.py +0 -40
  203. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/__init__.py +0 -16
  204. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot.py +0 -45
  205. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot_matplotlib.py +0 -45
  206. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot_plotly.py +0 -49
  207. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot.py +0 -37
  208. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot_matplotlib.py +0 -53
  209. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot_plotly.py +0 -45
  210. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/__init__.py +0 -16
  211. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_all_any.py +0 -38
  212. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_arg_ops.py +0 -37
  213. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_as_of.py +0 -37
  214. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_as_type.py +0 -38
  215. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_compute.py +0 -37
  216. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_conversion.py +0 -40
  217. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_cumulative.py +0 -40
  218. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_index.py +0 -38
  219. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_missing_data.py +0 -40
  220. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_series.py +0 -37
  221. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_sort.py +0 -38
  222. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_stat.py +0 -38
  223. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_categorical.py +0 -66
  224. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_config.py +0 -37
  225. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_csv.py +0 -37
  226. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_dataframe_conversion.py +0 -42
  227. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_dataframe_spark_io.py +0 -39
  228. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_default_index.py +0 -49
  229. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ewm.py +0 -37
  230. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_expanding.py +0 -39
  231. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_extension.py +0 -49
  232. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_frame_spark.py +0 -53
  233. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_generic_functions.py +0 -43
  234. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_indexing.py +0 -49
  235. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_indexops_spark.py +0 -39
  236. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_internal.py +0 -41
  237. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_namespace.py +0 -39
  238. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_numpy_compat.py +0 -60
  239. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames.py +0 -48
  240. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby.py +0 -39
  241. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_expanding.py +0 -44
  242. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_rolling.py +0 -84
  243. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_repr.py +0 -37
  244. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_resample.py +0 -45
  245. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_reshape.py +0 -39
  246. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_rolling.py +0 -39
  247. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_scalars.py +0 -37
  248. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_conversion.py +0 -39
  249. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_datetime.py +0 -39
  250. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_string.py +0 -39
  251. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_spark_functions.py +0 -39
  252. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_sql.py +0 -43
  253. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_stats.py +0 -37
  254. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_typedef.py +0 -36
  255. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_utils.py +0 -37
  256. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_window.py +0 -39
  257. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/__init__.py +0 -16
  258. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_base.py +0 -107
  259. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_binary_ops.py +0 -224
  260. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py +0 -825
  261. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_categorical_ops.py +0 -562
  262. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_complex_ops.py +0 -368
  263. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_date_ops.py +0 -257
  264. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_datetime_ops.py +0 -260
  265. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_null_ops.py +0 -178
  266. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_arithmetic.py +0 -184
  267. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py +0 -497
  268. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_reverse.py +0 -140
  269. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_string_ops.py +0 -354
  270. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_timedelta_ops.py +0 -219
  271. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_udt_ops.py +0 -192
  272. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/testing_utils.py +0 -228
  273. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/__init__.py +0 -16
  274. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_align.py +0 -118
  275. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_basic_slow.py +0 -198
  276. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_cov_corrwith.py +0 -181
  277. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_dot_frame.py +0 -103
  278. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_dot_series.py +0 -141
  279. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_index.py +0 -109
  280. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_series.py +0 -136
  281. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_setitem_frame.py +0 -125
  282. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_setitem_series.py +0 -217
  283. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/__init__.py +0 -16
  284. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_attrs.py +0 -384
  285. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_constructor.py +0 -598
  286. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_conversion.py +0 -73
  287. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_reindexing.py +0 -869
  288. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_reshaping.py +0 -487
  289. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_spark.py +0 -309
  290. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_take.py +0 -156
  291. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_time_series.py +0 -149
  292. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_truncate.py +0 -163
  293. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/__init__.py +0 -16
  294. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_aggregate.py +0 -311
  295. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_apply_func.py +0 -524
  296. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_cumulative.py +0 -419
  297. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_describe.py +0 -144
  298. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_groupby.py +0 -979
  299. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_head_tail.py +0 -234
  300. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_index.py +0 -206
  301. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_missing_data.py +0 -421
  302. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_split_apply.py +0 -187
  303. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_stat.py +0 -397
  304. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/__init__.py +0 -16
  305. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_align.py +0 -100
  306. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_base.py +0 -2743
  307. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_category.py +0 -484
  308. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_datetime.py +0 -276
  309. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_indexing.py +0 -432
  310. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_reindex.py +0 -310
  311. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_rename.py +0 -257
  312. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_reset_index.py +0 -160
  313. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_timedelta.py +0 -128
  314. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/io/__init__.py +0 -16
  315. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/io/test_io.py +0 -137
  316. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/__init__.py +0 -16
  317. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot.py +0 -170
  318. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot_matplotlib.py +0 -547
  319. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot_plotly.py +0 -285
  320. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot.py +0 -106
  321. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot_matplotlib.py +0 -409
  322. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot_plotly.py +0 -247
  323. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/__init__.py +0 -16
  324. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_all_any.py +0 -105
  325. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_arg_ops.py +0 -197
  326. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_as_of.py +0 -137
  327. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_as_type.py +0 -227
  328. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_compute.py +0 -634
  329. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_conversion.py +0 -88
  330. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_cumulative.py +0 -139
  331. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_index.py +0 -475
  332. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_missing_data.py +0 -265
  333. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_series.py +0 -818
  334. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_sort.py +0 -162
  335. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_stat.py +0 -780
  336. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_categorical.py +0 -741
  337. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_config.py +0 -160
  338. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_csv.py +0 -453
  339. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_dataframe_conversion.py +0 -281
  340. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_dataframe_spark_io.py +0 -487
  341. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_default_index.py +0 -109
  342. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ewm.py +0 -434
  343. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_expanding.py +0 -253
  344. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_extension.py +0 -152
  345. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_frame_spark.py +0 -162
  346. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_generic_functions.py +0 -234
  347. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_indexing.py +0 -1339
  348. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_indexops_spark.py +0 -82
  349. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_internal.py +0 -124
  350. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_namespace.py +0 -638
  351. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_numpy_compat.py +0 -200
  352. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames.py +0 -1355
  353. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py +0 -655
  354. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_expanding.py +0 -113
  355. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_rolling.py +0 -118
  356. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_repr.py +0 -192
  357. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_resample.py +0 -346
  358. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_reshape.py +0 -495
  359. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_rolling.py +0 -263
  360. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_scalars.py +0 -59
  361. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_conversion.py +0 -85
  362. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_datetime.py +0 -364
  363. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_string.py +0 -362
  364. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_spark_functions.py +0 -46
  365. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_sql.py +0 -123
  366. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_stats.py +0 -581
  367. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_typedef.py +0 -447
  368. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_utils.py +0 -301
  369. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_window.py +0 -465
  370. snowflake/snowpark_connect/includes/python/pyspark/resource/tests/__init__.py +0 -16
  371. snowflake/snowpark_connect/includes/python/pyspark/resource/tests/test_resources.py +0 -83
  372. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/__init__.py +0 -16
  373. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/__init__.py +0 -16
  374. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/__init__.py +0 -16
  375. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/test_artifact.py +0 -420
  376. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/test_client.py +0 -358
  377. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/__init__.py +0 -16
  378. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_foreach.py +0 -36
  379. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_foreach_batch.py +0 -44
  380. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_listener.py +0 -116
  381. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_streaming.py +0 -35
  382. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_basic.py +0 -3612
  383. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_column.py +0 -1042
  384. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_function.py +0 -2381
  385. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_plan.py +0 -1060
  386. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow.py +0 -163
  387. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow_map.py +0 -38
  388. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow_python_udf.py +0 -48
  389. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_catalog.py +0 -36
  390. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_column.py +0 -55
  391. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_conf.py +0 -36
  392. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_dataframe.py +0 -96
  393. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_datasources.py +0 -44
  394. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_errors.py +0 -36
  395. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_functions.py +0 -59
  396. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_group.py +0 -36
  397. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_cogrouped_map.py +0 -59
  398. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_grouped_map.py +0 -74
  399. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_grouped_map_with_state.py +0 -62
  400. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_map.py +0 -58
  401. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf.py +0 -70
  402. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_grouped_agg.py +0 -50
  403. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_scalar.py +0 -68
  404. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_window.py +0 -40
  405. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_readwriter.py +0 -46
  406. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_serde.py +0 -44
  407. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_types.py +0 -100
  408. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_udf.py +0 -100
  409. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_udtf.py +0 -163
  410. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_session.py +0 -181
  411. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_utils.py +0 -42
  412. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/__init__.py +0 -16
  413. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_cogrouped_map.py +0 -623
  414. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_grouped_map.py +0 -869
  415. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_grouped_map_with_state.py +0 -342
  416. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_map.py +0 -436
  417. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf.py +0 -363
  418. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_grouped_agg.py +0 -592
  419. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_scalar.py +0 -1503
  420. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_typehints.py +0 -392
  421. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_typehints_with_future_annotations.py +0 -375
  422. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_window.py +0 -411
  423. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/__init__.py +0 -16
  424. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming.py +0 -401
  425. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_foreach.py +0 -295
  426. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_foreach_batch.py +0 -106
  427. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_listener.py +0 -558
  428. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow.py +0 -1346
  429. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow_map.py +0 -182
  430. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow_python_udf.py +0 -202
  431. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_catalog.py +0 -503
  432. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_column.py +0 -225
  433. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_conf.py +0 -83
  434. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_context.py +0 -201
  435. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_dataframe.py +0 -1931
  436. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_datasources.py +0 -256
  437. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_errors.py +0 -69
  438. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_functions.py +0 -1349
  439. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_group.py +0 -53
  440. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_pandas_sqlmetrics.py +0 -68
  441. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_readwriter.py +0 -283
  442. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_serde.py +0 -155
  443. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_session.py +0 -412
  444. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_types.py +0 -1581
  445. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udf.py +0 -961
  446. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udf_profiler.py +0 -165
  447. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udtf.py +0 -1456
  448. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_utils.py +0 -1686
  449. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/__init__.py +0 -16
  450. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_context.py +0 -184
  451. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_dstream.py +0 -706
  452. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_kinesis.py +0 -118
  453. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_listener.py +0 -160
  454. snowflake/snowpark_connect/includes/python/pyspark/tests/__init__.py +0 -16
  455. snowflake/snowpark_connect/includes/python/pyspark/tests/test_appsubmit.py +0 -306
  456. snowflake/snowpark_connect/includes/python/pyspark/tests/test_broadcast.py +0 -196
  457. snowflake/snowpark_connect/includes/python/pyspark/tests/test_conf.py +0 -44
  458. snowflake/snowpark_connect/includes/python/pyspark/tests/test_context.py +0 -346
  459. snowflake/snowpark_connect/includes/python/pyspark/tests/test_daemon.py +0 -89
  460. snowflake/snowpark_connect/includes/python/pyspark/tests/test_install_spark.py +0 -124
  461. snowflake/snowpark_connect/includes/python/pyspark/tests/test_join.py +0 -69
  462. snowflake/snowpark_connect/includes/python/pyspark/tests/test_memory_profiler.py +0 -167
  463. snowflake/snowpark_connect/includes/python/pyspark/tests/test_pin_thread.py +0 -194
  464. snowflake/snowpark_connect/includes/python/pyspark/tests/test_profiler.py +0 -168
  465. snowflake/snowpark_connect/includes/python/pyspark/tests/test_rdd.py +0 -939
  466. snowflake/snowpark_connect/includes/python/pyspark/tests/test_rddbarrier.py +0 -52
  467. snowflake/snowpark_connect/includes/python/pyspark/tests/test_rddsampler.py +0 -66
  468. snowflake/snowpark_connect/includes/python/pyspark/tests/test_readwrite.py +0 -368
  469. snowflake/snowpark_connect/includes/python/pyspark/tests/test_serializers.py +0 -257
  470. snowflake/snowpark_connect/includes/python/pyspark/tests/test_shuffle.py +0 -267
  471. snowflake/snowpark_connect/includes/python/pyspark/tests/test_stage_sched.py +0 -153
  472. snowflake/snowpark_connect/includes/python/pyspark/tests/test_statcounter.py +0 -130
  473. snowflake/snowpark_connect/includes/python/pyspark/tests/test_taskcontext.py +0 -350
  474. snowflake/snowpark_connect/includes/python/pyspark/tests/test_util.py +0 -97
  475. snowflake/snowpark_connect/includes/python/pyspark/tests/test_worker.py +0 -271
  476. snowpark_connect-0.24.0.dist-info/RECORD +0 -898
  477. {snowpark_connect-0.24.0.data → snowpark_connect-0.26.0.data}/scripts/snowpark-connect +0 -0
  478. {snowpark_connect-0.24.0.data → snowpark_connect-0.26.0.data}/scripts/snowpark-session +0 -0
  479. {snowpark_connect-0.24.0.data → snowpark_connect-0.26.0.data}/scripts/snowpark-submit +0 -0
  480. {snowpark_connect-0.24.0.dist-info → snowpark_connect-0.26.0.dist-info}/WHEEL +0 -0
  481. {snowpark_connect-0.24.0.dist-info → snowpark_connect-0.26.0.dist-info}/licenses/LICENSE-binary +0 -0
  482. {snowpark_connect-0.24.0.dist-info → snowpark_connect-0.26.0.dist-info}/licenses/LICENSE.txt +0 -0
  483. {snowpark_connect-0.24.0.dist-info → snowpark_connect-0.26.0.dist-info}/licenses/NOTICE-binary +0 -0
  484. {snowpark_connect-0.24.0.dist-info → snowpark_connect-0.26.0.dist-info}/top_level.txt +0 -0
@@ -1,2743 +0,0 @@
1
- #
2
- # Licensed to the Apache Software Foundation (ASF) under one or more
3
- # contributor license agreements. See the NOTICE file distributed with
4
- # this work for additional information regarding copyright ownership.
5
- # The ASF licenses this file to You under the Apache License, Version 2.0
6
- # (the "License"); you may not use this file except in compliance with
7
- # the License. You may obtain a copy of the License at
8
- #
9
- # http://www.apache.org/licenses/LICENSE-2.0
10
- #
11
- # Unless required by applicable law or agreed to in writing, software
12
- # distributed under the License is distributed on an "AS IS" BASIS,
13
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
- # See the License for the specific language governing permissions and
15
- # limitations under the License.
16
- #
17
-
18
- import inspect
19
- import unittest
20
- from distutils.version import LooseVersion
21
- from datetime import datetime, timedelta
22
-
23
- import numpy as np
24
- import pandas as pd
25
-
26
- import pyspark.pandas as ps
27
- from pyspark.pandas.exceptions import PandasNotImplementedError
28
- from pyspark.pandas.missing.indexes import (
29
- MissingPandasLikeDatetimeIndex,
30
- MissingPandasLikeIndex,
31
- MissingPandasLikeMultiIndex,
32
- MissingPandasLikeTimedeltaIndex,
33
- )
34
- from pyspark.testing.pandasutils import ComparisonTestBase, TestUtils, SPARK_CONF_ARROW_ENABLED
35
-
36
-
37
- class IndexesTestsMixin:
38
- @property
39
- def pdf(self):
40
- return pd.DataFrame(
41
- {"a": [1, 2, 3, 4, 5, 6, 7, 8, 9], "b": [4, 5, 6, 3, 2, 1, 0, 0, 0]},
42
- index=[0, 1, 3, 5, 6, 8, 9, 9, 9],
43
- )
44
-
45
- @unittest.skipIf(
46
- LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
47
- "TODO(SPARK-43606): Enable IndexesTests.test_index_basic for pandas 2.0.0.",
48
- )
49
- def test_index_basic(self):
50
- for pdf in [
51
- pd.DataFrame(np.random.randn(10, 5), index=np.random.randint(100, size=10)),
52
- pd.DataFrame(
53
- np.random.randn(10, 5), index=np.random.randint(100, size=10).astype(np.int32)
54
- ),
55
- pd.DataFrame(np.random.randn(10, 5), index=np.random.randn(10)),
56
- pd.DataFrame(np.random.randn(10, 5), index=np.random.randn(10).astype(np.float32)),
57
- pd.DataFrame(np.random.randn(10, 5), index=list("abcdefghij")),
58
- pd.DataFrame(
59
- np.random.randn(10, 5), index=pd.date_range("2011-01-01", freq="D", periods=10)
60
- ),
61
- pd.DataFrame(np.random.randn(10, 5), index=pd.Categorical(list("abcdefghij"))),
62
- pd.DataFrame(np.random.randn(10, 5), columns=list("abcde")).set_index(["a", "b"]),
63
- ]:
64
- psdf = ps.from_pandas(pdf)
65
- self.assert_eq(psdf.index, pdf.index)
66
- # Int64Index is removed from pandas 2.0.0, so we should compare the dtype itself.
67
- if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
68
- self.assert_eq(psdf.index.dtype, pdf.index.dtype)
69
- else:
70
- self.assert_eq(type(psdf.index).__name__, type(pdf.index).__name__)
71
-
72
- self.assert_eq(ps.Index([])._summary(), "Index: 0 entries")
73
- if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
74
- with self.assertRaisesRegexp(ValueError, "The truth value of a Index is ambiguous."):
75
- bool(ps.Index([1]))
76
- with self.assertRaisesRegexp(TypeError, "Index.name must be a hashable type"):
77
- ps.Index([1, 2, 3], name=[(1, 2, 3)])
78
- with self.assertRaisesRegexp(TypeError, "Index.name must be a hashable type"):
79
- ps.Index([1.0, 2.0, 3.0], name=[(1, 2, 3)])
80
- else:
81
- with self.assertRaisesRegexp(
82
- ValueError, "The truth value of a Int64Index is ambiguous."
83
- ):
84
- bool(ps.Index([1]))
85
- with self.assertRaisesRegexp(TypeError, "Index.name must be a hashable type"):
86
- ps.Int64Index([1, 2, 3], name=[(1, 2, 3)])
87
- with self.assertRaisesRegexp(TypeError, "Index.name must be a hashable type"):
88
- ps.Float64Index([1.0, 2.0, 3.0], name=[(1, 2, 3)])
89
-
90
- def test_index_from_series(self):
91
- pser = pd.Series([1, 2, 3], name="a", index=[10, 20, 30])
92
- psser = ps.from_pandas(pser)
93
-
94
- self.assert_eq(ps.Index(psser), pd.Index(pser))
95
- self.assert_eq(ps.Index(psser, dtype="float"), pd.Index(pser, dtype="float"))
96
- self.assert_eq(ps.Index(psser, name="x"), pd.Index(pser, name="x"))
97
-
98
- if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
99
- self.assert_eq(ps.Index(psser, dtype="int64"), pd.Index(pser, dtype="int64"))
100
- self.assert_eq(ps.Index(psser, dtype="float64"), pd.Index(pser, dtype="float64"))
101
- elif LooseVersion(pd.__version__) >= LooseVersion("1.1"):
102
- self.assert_eq(ps.Int64Index(psser), pd.Int64Index(pser))
103
- self.assert_eq(ps.Float64Index(psser), pd.Float64Index(pser))
104
- else:
105
- self.assert_eq(ps.Int64Index(psser), pd.Int64Index(pser).rename("a"))
106
- self.assert_eq(ps.Float64Index(psser), pd.Float64Index(pser).rename("a"))
107
-
108
- pser = pd.Series([datetime(2021, 3, 1), datetime(2021, 3, 2)], name="x", index=[10, 20])
109
- psser = ps.from_pandas(pser)
110
-
111
- self.assert_eq(ps.Index(psser), pd.Index(pser))
112
- self.assert_eq(ps.DatetimeIndex(psser), pd.DatetimeIndex(pser))
113
-
114
- def test_index_from_index(self):
115
- pidx = pd.Index([1, 2, 3], name="a")
116
- psidx = ps.from_pandas(pidx)
117
-
118
- self.assert_eq(ps.Index(psidx), pd.Index(pidx))
119
- self.assert_eq(ps.Index(psidx, dtype="float"), pd.Index(pidx, dtype="float"))
120
- self.assert_eq(ps.Index(psidx, name="x"), pd.Index(pidx, name="x"))
121
- self.assert_eq(ps.Index(psidx, copy=True), pd.Index(pidx, copy=True))
122
-
123
- if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
124
- self.assert_eq(ps.Index(psidx, dtype="int64"), pd.Index(pidx, dtype="int64"))
125
- self.assert_eq(ps.Index(psidx, dtype="float64"), pd.Index(pidx, dtype="float64"))
126
- else:
127
- self.assert_eq(ps.Int64Index(psidx), pd.Int64Index(pidx))
128
- self.assert_eq(ps.Float64Index(psidx), pd.Float64Index(pidx))
129
-
130
- pidx = pd.DatetimeIndex(["2021-03-01", "2021-03-02"])
131
- psidx = ps.from_pandas(pidx)
132
-
133
- self.assert_eq(ps.Index(psidx), pd.Index(pidx))
134
- self.assert_eq(ps.DatetimeIndex(psidx), pd.DatetimeIndex(pidx))
135
-
136
- def test_index_getattr(self):
137
- psidx = self.psdf.index
138
- item = "databricks"
139
-
140
- expected_error_message = "'.*Index' object has no attribute '{}'".format(item)
141
- with self.assertRaisesRegex(AttributeError, expected_error_message):
142
- psidx.__getattr__(item)
143
- with self.assertRaisesRegex(AttributeError, expected_error_message):
144
- ps.from_pandas(pd.date_range("2011-01-01", freq="D", periods=10)).__getattr__(item)
145
-
146
- def test_multi_index_getattr(self):
147
- arrays = [[1, 1, 2, 2], ["red", "blue", "red", "blue"]]
148
- idx = pd.MultiIndex.from_arrays(arrays, names=("number", "color"))
149
- pdf = pd.DataFrame(np.random.randn(4, 5), idx)
150
- psdf = ps.from_pandas(pdf)
151
- psidx = psdf.index
152
- item = "databricks"
153
-
154
- expected_error_message = "'MultiIndex' object has no attribute '{}'".format(item)
155
- with self.assertRaisesRegex(AttributeError, expected_error_message):
156
- psidx.__getattr__(item)
157
-
158
- def test_to_series(self):
159
- pidx = self.pdf.index
160
- psidx = self.psdf.index
161
-
162
- self.assert_eq(psidx.to_series(), pidx.to_series())
163
- self.assert_eq(psidx.to_series(name="a"), pidx.to_series(name="a"))
164
-
165
- # With name
166
- pidx.name = "Koalas"
167
- psidx.name = "Koalas"
168
- self.assert_eq(psidx.to_series(), pidx.to_series())
169
- self.assert_eq(psidx.to_series(name=("x", "a")), pidx.to_series(name=("x", "a")))
170
-
171
- # With tupled name
172
- pidx.name = ("x", "a")
173
- psidx.name = ("x", "a")
174
- self.assert_eq(psidx.to_series(), pidx.to_series())
175
- self.assert_eq(psidx.to_series(name="a"), pidx.to_series(name="a"))
176
-
177
- self.assert_eq((psidx + 1).to_series(), (pidx + 1).to_series())
178
-
179
- pidx = self.pdf.set_index("b", append=True).index
180
- psidx = self.psdf.set_index("b", append=True).index
181
-
182
- with self.sql_conf({SPARK_CONF_ARROW_ENABLED: False}):
183
- self.assert_eq(psidx.to_series(), pidx.to_series())
184
- self.assert_eq(psidx.to_series(name="a"), pidx.to_series(name="a"))
185
-
186
- expected_error_message = "Series.name must be a hashable type"
187
- with self.assertRaisesRegex(TypeError, expected_error_message):
188
- psidx.to_series(name=["x", "a"])
189
-
190
- def test_to_frame(self):
191
- pidx = self.pdf.index
192
- psidx = self.psdf.index
193
-
194
- self.assert_eq(psidx.to_frame(), pidx.to_frame())
195
- self.assert_eq(psidx.to_frame(index=False), pidx.to_frame(index=False))
196
-
197
- pidx.name = "a"
198
- psidx.name = "a"
199
-
200
- self.assert_eq(psidx.to_frame(), pidx.to_frame())
201
- self.assert_eq(psidx.to_frame(index=False), pidx.to_frame(index=False))
202
-
203
- self.assert_eq(psidx.to_frame(name="x"), pidx.to_frame(name="x"))
204
- self.assert_eq(psidx.to_frame(index=False, name="x"), pidx.to_frame(index=False, name="x"))
205
-
206
- self.assertRaises(TypeError, lambda: psidx.to_frame(name=["x"]))
207
-
208
- # non-string name
209
- self.assert_eq(psidx.to_frame(name=10), pidx.to_frame(name=10))
210
- self.assert_eq(psidx.to_frame(name=("x", 10)), pidx.to_frame(name=("x", 10)))
211
-
212
- pidx = self.pdf.set_index("b", append=True).index
213
- psidx = self.psdf.set_index("b", append=True).index
214
-
215
- self.assert_eq(psidx.to_frame(), pidx.to_frame())
216
- self.assert_eq(psidx.to_frame(index=False), pidx.to_frame(index=False))
217
-
218
- self.assert_eq(psidx.to_frame(name=["x", "y"]), pidx.to_frame(name=["x", "y"]))
219
- self.assert_eq(psidx.to_frame(name=("x", "y")), pidx.to_frame(name=("x", "y")))
220
- self.assert_eq(
221
- psidx.to_frame(index=False, name=["x", "y"]),
222
- pidx.to_frame(index=False, name=["x", "y"]),
223
- )
224
-
225
- self.assertRaises(TypeError, lambda: psidx.to_frame(name="x"))
226
- self.assertRaises(ValueError, lambda: psidx.to_frame(name=["x"]))
227
-
228
- # non-string names
229
- self.assert_eq(psidx.to_frame(name=[10, 20]), pidx.to_frame(name=[10, 20]))
230
- self.assert_eq(psidx.to_frame(name=("x", 10)), pidx.to_frame(name=("x", 10)))
231
- if LooseVersion(pd.__version__) < LooseVersion("1.5.0"):
232
- self.assert_eq(
233
- psidx.to_frame(name=[("x", 10), ("y", 20)]),
234
- pidx.to_frame(name=[("x", 10), ("y", 20)]),
235
- )
236
- else:
237
- # Since pandas 1.5.0, the result is changed as below:
238
- # (x, 10) (y, 20)
239
- # b
240
- # 0 4 0 4
241
- # 1 5 1 5
242
- # 3 6 3 6
243
- # 5 3 5 3
244
- # 6 2 6 2
245
- # 8 1 8 1
246
- # 9 0 9 0
247
- # 0 9 0
248
- # 0 9 0
249
- #
250
- # The columns should be `Index([('x', 20), ('y', 20)], dtype='object')`,
251
- # but pandas API on Spark doesn't support such a way for creating Index.
252
- # So, we currently cannot follow the behavior of pandas.
253
- expected_result = ps.DataFrame(
254
- {("x", 10): [0, 1, 3, 5, 6, 8, 9, 9, 9], ("y", 20): [4, 5, 6, 3, 2, 1, 0, 0, 0]},
255
- index=ps.MultiIndex.from_tuples(
256
- [(0, 4), (1, 5), (3, 6), (5, 3), (6, 2), (8, 1), (9, 0), (9, 0), (9, 0)],
257
- names=[None, "b"],
258
- ),
259
- )
260
- self.assert_eq(psidx.to_frame(name=[("x", 10), ("y", 20)]), expected_result)
261
-
262
- def test_index_names(self):
263
- psdf = self.psdf
264
- self.assertIsNone(psdf.index.name)
265
-
266
- idx = pd.Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], name="x")
267
- pdf = pd.DataFrame(np.random.randn(10, 5), index=idx, columns=list("abcde"))
268
- psdf = ps.from_pandas(pdf)
269
-
270
- pser = pdf.a
271
- psser = psdf.a
272
-
273
- self.assertEqual(psdf.index.name, pdf.index.name)
274
- self.assertEqual(psdf.index.names, pdf.index.names)
275
-
276
- pidx = pdf.index
277
- psidx = psdf.index
278
- pidx.name = "renamed"
279
- psidx.name = "renamed"
280
- self.assertEqual(psidx.name, pidx.name)
281
- self.assertEqual(psidx.names, pidx.names)
282
- self.assert_eq(psidx, pidx)
283
- self.assertEqual(psdf.index.name, pdf.index.name)
284
- self.assertEqual(psdf.index.names, pdf.index.names)
285
- self.assertEqual(psser.index.names, pser.index.names)
286
-
287
- pidx.name = None
288
- psidx.name = None
289
- self.assertEqual(psidx.name, pidx.name)
290
- self.assertEqual(psidx.names, pidx.names)
291
- self.assert_eq(psidx, pidx)
292
- self.assertEqual(psdf.index.name, pdf.index.name)
293
- self.assertEqual(psdf.index.names, pdf.index.names)
294
- self.assertEqual(psser.index.names, pser.index.names)
295
-
296
- with self.assertRaisesRegex(ValueError, "Names must be a list-like"):
297
- psidx.names = "hi"
298
-
299
- expected_error_message = "Length of new names must be {}, got {}".format(
300
- psdf._internal.index_level, len(["0", "1"])
301
- )
302
- with self.assertRaisesRegex(ValueError, expected_error_message):
303
- psidx.names = ["0", "1"]
304
-
305
- expected_error_message = "Index.name must be a hashable type"
306
- with self.assertRaisesRegex(TypeError, expected_error_message):
307
- ps.Index([1, 2, 3], name=["0", "1"])
308
- with self.assertRaisesRegex(TypeError, expected_error_message):
309
- psidx.name = ["renamed"]
310
- with self.assertRaisesRegex(TypeError, expected_error_message):
311
- psidx.name = ["0", "1"]
312
- # Specifying `names` when creating Index is no longer supported from pandas 2.0.0.
313
- if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
314
- pass
315
- else:
316
- with self.assertRaisesRegex(TypeError, expected_error_message):
317
- ps.Index([(1, 2), (3, 4)], names=["a", ["b"]])
318
-
319
- def test_multi_index_names(self):
320
- arrays = [[1, 1, 2, 2], ["red", "blue", "red", "blue"]]
321
- idx = pd.MultiIndex.from_arrays(arrays, names=("number", "color"))
322
- pdf = pd.DataFrame(np.random.randn(4, 5), idx)
323
- psdf = ps.from_pandas(pdf)
324
-
325
- self.assertEqual(psdf.index.names, pdf.index.names)
326
-
327
- pidx = pdf.index
328
- psidx = psdf.index
329
- pidx.names = ["renamed_number", "renamed_color"]
330
- psidx.names = ["renamed_number", "renamed_color"]
331
- self.assertEqual(psidx.names, pidx.names)
332
-
333
- pidx.names = ["renamed_number", None]
334
- psidx.names = ["renamed_number", None]
335
- self.assertEqual(psidx.names, pidx.names)
336
- self.assert_eq(psidx, pidx)
337
-
338
- with self.assertRaises(PandasNotImplementedError):
339
- psidx.name
340
- with self.assertRaises(PandasNotImplementedError):
341
- psidx.name = "renamed"
342
-
343
- def test_index_rename(self):
344
- pdf = pd.DataFrame(
345
- np.random.randn(10, 5), index=pd.Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], name="x")
346
- )
347
- psdf = ps.from_pandas(pdf)
348
-
349
- pidx = pdf.index
350
- psidx = psdf.index
351
-
352
- self.assert_eq(psidx.rename("y"), pidx.rename("y"))
353
- self.assert_eq(psdf.index.names, pdf.index.names)
354
-
355
- # non-string names
356
- self.assert_eq(psidx.rename(0), pidx.rename(0))
357
- self.assert_eq(psidx.rename(("y", 0)), pidx.rename(("y", 0)))
358
-
359
- psidx.rename("z", inplace=True)
360
- pidx.rename("z", inplace=True)
361
-
362
- self.assert_eq(psidx, pidx)
363
- self.assert_eq(psdf.index.names, pdf.index.names)
364
-
365
- self.assert_eq(psidx.rename(None), pidx.rename(None))
366
- self.assert_eq(psdf.index.names, pdf.index.names)
367
-
368
- self.assertRaises(TypeError, lambda: psidx.rename(["x", "y"]))
369
-
370
- def test_multi_index_rename(self):
371
- arrays = [[1, 1, 2, 2], ["red", "blue", "red", "blue"]]
372
- idx = pd.MultiIndex.from_arrays(arrays, names=("number", "color"))
373
- pdf = pd.DataFrame(np.random.randn(4, 5), idx)
374
- psdf = ps.from_pandas(pdf)
375
-
376
- pmidx = pdf.index
377
- psmidx = psdf.index
378
-
379
- self.assert_eq(psmidx.rename(["n", "c"]), pmidx.rename(["n", "c"]))
380
- self.assert_eq(psdf.index.names, pdf.index.names)
381
-
382
- # non-string names
383
- self.assert_eq(psmidx.rename([0, 1]), pmidx.rename([0, 1]))
384
- self.assert_eq(
385
- psmidx.rename([("x", "a"), ("y", "b")]), pmidx.rename([("x", "a"), ("y", "b")])
386
- )
387
-
388
- psmidx.rename(["num", "col"], inplace=True)
389
- pmidx.rename(["num", "col"], inplace=True)
390
-
391
- self.assert_eq(psmidx, pmidx)
392
- self.assert_eq(psdf.index.names, pdf.index.names)
393
-
394
- self.assert_eq(psmidx.rename([None, None]), pmidx.rename([None, None]))
395
- self.assert_eq(psdf.index.names, pdf.index.names)
396
-
397
- self.assertRaises(TypeError, lambda: psmidx.rename("number"))
398
- self.assertRaises(TypeError, lambda: psmidx.rename(None))
399
- self.assertRaises(ValueError, lambda: psmidx.rename(["number"]))
400
-
401
- def test_multi_index_levshape(self):
402
- pidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2)])
403
- psidx = ps.from_pandas(pidx)
404
- self.assertEqual(pidx.levshape, psidx.levshape)
405
-
406
- def test_index_unique(self):
407
- psidx = self.psdf.index
408
-
409
- # here the output is different than pandas in terms of order
410
- expected = [0, 1, 3, 5, 6, 8, 9]
411
-
412
- self.assert_eq(expected, sorted(psidx.unique()._to_pandas()))
413
- self.assert_eq(expected, sorted(psidx.unique(level=0)._to_pandas()))
414
-
415
- expected = [1, 2, 4, 6, 7, 9, 10]
416
- self.assert_eq(expected, sorted((psidx + 1).unique()._to_pandas()))
417
-
418
- with self.assertRaisesRegex(IndexError, "Too many levels*"):
419
- psidx.unique(level=1)
420
-
421
- with self.assertRaisesRegex(KeyError, "Requested level (hi)*"):
422
- psidx.unique(level="hi")
423
-
424
- def test_multi_index_copy(self):
425
- arrays = [[1, 1, 2, 2], ["red", "blue", "red", "blue"]]
426
- idx = pd.MultiIndex.from_arrays(arrays, names=("number", "color"))
427
- pdf = pd.DataFrame(np.random.randn(4, 5), idx)
428
- psdf = ps.from_pandas(pdf)
429
-
430
- self.assert_eq(psdf.index.copy(), pdf.index.copy())
431
-
432
- def test_drop_duplicates(self):
433
- pidx = pd.Index([4, 2, 4, 1, 4, 3])
434
- psidx = ps.from_pandas(pidx)
435
-
436
- self.assert_eq(psidx.drop_duplicates(), pidx.drop_duplicates())
437
- self.assert_eq((psidx + 1).drop_duplicates(), (pidx + 1).drop_duplicates())
438
-
439
- self.assert_eq(psidx.drop_duplicates(keep="first"), pidx.drop_duplicates(keep="first"))
440
- self.assert_eq(psidx.drop_duplicates(keep="last"), pidx.drop_duplicates(keep="last"))
441
- self.assert_eq(psidx.drop_duplicates(keep=False), pidx.drop_duplicates(keep=False))
442
-
443
- arrays = [[1, 2, 3, 1, 2], ["red", "blue", "black", "red", "blue"]]
444
- pmidx = pd.MultiIndex.from_arrays(arrays, names=("number", "color"))
445
- psmidx = ps.from_pandas(pmidx)
446
- self.assert_eq(psmidx.drop_duplicates(), pmidx.drop_duplicates())
447
- self.assert_eq(psmidx.drop_duplicates(keep="first"), pmidx.drop_duplicates(keep="first"))
448
- self.assert_eq(psmidx.drop_duplicates(keep="last"), pmidx.drop_duplicates(keep="last"))
449
- self.assert_eq(psmidx.drop_duplicates(keep=False), pmidx.drop_duplicates(keep=False))
450
-
451
- def test_dropna(self):
452
- pidx = pd.Index([np.nan, 2, 4, 1, None, 3])
453
- psidx = ps.from_pandas(pidx)
454
-
455
- self.assert_eq(psidx.dropna(), pidx.dropna())
456
- self.assert_eq((psidx + 1).dropna(), (pidx + 1).dropna())
457
-
458
- self.assert_eq(psidx.dropna(how="any"), pidx.dropna(how="any"))
459
- self.assert_eq(psidx.dropna(how="all"), pidx.dropna(how="all"))
460
-
461
- pmidx = pd.MultiIndex.from_tuples(
462
- [(np.nan, 1.0), (2.0, 2.0), (np.nan, None), (3.0, np.nan)]
463
- )
464
- psmidx = ps.from_pandas(pmidx)
465
- self.assert_eq(psmidx.dropna(), pmidx.dropna())
466
- self.assert_eq(psmidx.dropna(how="any"), pmidx.dropna(how="any"))
467
- self.assert_eq(psmidx.dropna(how="all"), pmidx.dropna(how="all"))
468
-
469
- invalid_how = "none"
470
- with self.assertRaisesRegex(ValueError, "invalid how option: %s" % invalid_how):
471
- psmidx.dropna(invalid_how)
472
-
473
- def test_index_symmetric_difference(self):
474
- pidx1 = pd.Index([1, 2, 3, 4])
475
- pidx2 = pd.Index([2, 3, 4, 5])
476
- psidx1 = ps.from_pandas(pidx1)
477
- psidx2 = ps.from_pandas(pidx2)
478
-
479
- self.assert_eq(
480
- psidx1.symmetric_difference(psidx2).sort_values(),
481
- pidx1.symmetric_difference(pidx2).sort_values(),
482
- )
483
- self.assert_eq(
484
- (psidx1 + 1).symmetric_difference(psidx2).sort_values(),
485
- (pidx1 + 1).symmetric_difference(pidx2).sort_values(),
486
- )
487
- # No longer supported from pandas 2.0.0.
488
- if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
489
- self.assert_eq(
490
- (psidx1 ^ psidx2).sort_values(),
491
- ps.Index([1, 5], dtype="int64"),
492
- )
493
- else:
494
- self.assert_eq(
495
- (psidx1 ^ psidx2).sort_values(),
496
- (pidx1 ^ pidx2).sort_values(),
497
- )
498
- self.assert_eq(
499
- psidx1.symmetric_difference(psidx2, result_name="result").sort_values(),
500
- pidx1.symmetric_difference(pidx2, result_name="result").sort_values(),
501
- )
502
-
503
- pmidx1 = pd.MultiIndex(
504
- [["lama", "cow", "falcon"], ["speed", "weight", "length"]],
505
- [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 0, 0, 0, 1, 2, 0, 1, 2]],
506
- )
507
- pmidx2 = pd.MultiIndex(
508
- [["koalas", "cow", "falcon"], ["speed", "weight", "length"]],
509
- [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 0, 0, 0, 1, 2, 0, 1, 2]],
510
- )
511
- psmidx1 = ps.from_pandas(pmidx1)
512
- psmidx2 = ps.from_pandas(pmidx2)
513
-
514
- self.assert_eq(
515
- psmidx1.symmetric_difference(psmidx2).sort_values(),
516
- pmidx1.symmetric_difference(pmidx2).sort_values(),
517
- )
518
-
519
- # Pandas has a bug that raise TypeError when setting `result_name` for MultiIndex.
520
- pandas_result = pmidx1.symmetric_difference(pmidx2)
521
- pandas_result.names = ["a", "b"]
522
- self.assert_eq(
523
- psmidx1.symmetric_difference(psmidx2, result_name=["a", "b"]).sort_values(),
524
- pandas_result,
525
- )
526
-
527
- # Pandas sort the result by default, so doesn't provide the `True` for sort.
528
- self.assert_eq(
529
- psmidx1.symmetric_difference(psmidx2, sort=True),
530
- pmidx1.symmetric_difference(pmidx2),
531
- )
532
-
533
- idx = ps.Index(["a", "b", "c"])
534
- midx = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
535
-
536
- with self.assertRaisesRegex(NotImplementedError, "Doesn't support*"):
537
- idx.symmetric_difference(midx)
538
-
539
- def test_multi_index_symmetric_difference(self):
540
- idx = ps.Index(["a", "b", "c"])
541
- midx = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
542
- midx_ = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
543
-
544
- self.assert_eq(
545
- midx.symmetric_difference(midx_),
546
- midx._to_pandas().symmetric_difference(midx_._to_pandas()),
547
- )
548
-
549
- with self.assertRaisesRegex(NotImplementedError, "Doesn't support*"):
550
- midx.symmetric_difference(idx)
551
-
552
- def test_missing(self):
553
- psdf = ps.DataFrame(
554
- {
555
- "a": [1, 2, 3],
556
- "b": [4, 5, 6],
557
- "c": pd.date_range("2011-01-01", freq="D", periods=3),
558
- "d": pd.Categorical(["a", "b", "c"]),
559
- "e": [timedelta(1), timedelta(2), timedelta(3)],
560
- }
561
- )
562
-
563
- # Index functions
564
- missing_functions = inspect.getmembers(MissingPandasLikeIndex, inspect.isfunction)
565
- unsupported_functions = [
566
- name for (name, type_) in missing_functions if type_.__name__ == "unsupported_function"
567
- ]
568
- for name in unsupported_functions:
569
- with self.assertRaisesRegex(
570
- PandasNotImplementedError,
571
- "method.*Index.*{}.*not implemented( yet\\.|\\. .+)".format(name),
572
- ):
573
- getattr(psdf.set_index("a").index, name)()
574
-
575
- deprecated_functions = [
576
- name for (name, type_) in missing_functions if type_.__name__ == "deprecated_function"
577
- ]
578
- for name in deprecated_functions:
579
- with self.assertRaisesRegex(
580
- PandasNotImplementedError, "method.*Index.*{}.*is deprecated".format(name)
581
- ):
582
- getattr(psdf.set_index("a").index, name)()
583
-
584
- # MultiIndex functions
585
- missing_functions = inspect.getmembers(MissingPandasLikeMultiIndex, inspect.isfunction)
586
- unsupported_functions = [
587
- name for (name, type_) in missing_functions if type_.__name__ == "unsupported_function"
588
- ]
589
- for name in unsupported_functions:
590
- with self.assertRaisesRegex(
591
- PandasNotImplementedError,
592
- "method.*Index.*{}.*not implemented( yet\\.|\\. .+)".format(name),
593
- ):
594
- getattr(psdf.set_index(["a", "b"]).index, name)()
595
-
596
- deprecated_functions = [
597
- name for (name, type_) in missing_functions if type_.__name__ == "deprecated_function"
598
- ]
599
- for name in deprecated_functions:
600
- with self.assertRaisesRegex(
601
- PandasNotImplementedError, "method.*Index.*{}.*is deprecated".format(name)
602
- ):
603
- getattr(psdf.set_index(["a", "b"]).index, name)()
604
-
605
- # DatetimeIndex functions
606
- missing_functions = inspect.getmembers(MissingPandasLikeDatetimeIndex, inspect.isfunction)
607
- unsupported_functions = [
608
- name for (name, type_) in missing_functions if type_.__name__ == "unsupported_function"
609
- ]
610
- for name in unsupported_functions:
611
- with self.assertRaisesRegex(
612
- PandasNotImplementedError,
613
- "method.*Index.*{}.*not implemented( yet\\.|\\. .+)".format(name),
614
- ):
615
- getattr(psdf.set_index("c").index, name)()
616
-
617
- deprecated_functions = [
618
- name for (name, type_) in missing_functions if type_.__name__ == "deprecated_function"
619
- ]
620
- for name in deprecated_functions:
621
- with self.assertRaisesRegex(
622
- PandasNotImplementedError, "method.*Index.*{}.*is deprecated".format(name)
623
- ):
624
- getattr(psdf.set_index("c").index, name)()
625
-
626
- # TimedeltaIndex functions
627
- missing_functions = inspect.getmembers(MissingPandasLikeTimedeltaIndex, inspect.isfunction)
628
- unsupported_functions = [
629
- name for (name, type_) in missing_functions if type_.__name__ == "unsupported_function"
630
- ]
631
- for name in unsupported_functions:
632
- with self.assertRaisesRegex(
633
- PandasNotImplementedError,
634
- "method.*Index.*{}.*not implemented( yet\\.|\\. .+)".format(name),
635
- ):
636
- getattr(psdf.set_index("e").index, name)()
637
-
638
- deprecated_functions = [
639
- name for (name, type_) in missing_functions if type_.__name__ == "deprecated_function"
640
- ]
641
- for name in deprecated_functions:
642
- with self.assertRaisesRegex(
643
- PandasNotImplementedError, "method.*Index.*{}.*is deprecated".format(name)
644
- ):
645
- getattr(psdf.set_index("e").index, name)()
646
-
647
- # Index properties
648
- missing_properties = inspect.getmembers(
649
- MissingPandasLikeIndex, lambda o: isinstance(o, property)
650
- )
651
- unsupported_properties = [
652
- name
653
- for (name, type_) in missing_properties
654
- if type_.fget.__name__ == "unsupported_property"
655
- ]
656
- for name in unsupported_properties:
657
- with self.assertRaisesRegex(
658
- PandasNotImplementedError,
659
- "property.*Index.*{}.*not implemented( yet\\.|\\. .+)".format(name),
660
- ):
661
- getattr(psdf.set_index("a").index, name)
662
-
663
- deprecated_properties = [
664
- name
665
- for (name, type_) in missing_properties
666
- if type_.fget.__name__ == "deprecated_property"
667
- ]
668
- for name in deprecated_properties:
669
- with self.assertRaisesRegex(
670
- PandasNotImplementedError, "property.*Index.*{}.*is deprecated".format(name)
671
- ):
672
- getattr(psdf.set_index("a").index, name)
673
-
674
- # MultiIndex properties
675
- missing_properties = inspect.getmembers(
676
- MissingPandasLikeMultiIndex, lambda o: isinstance(o, property)
677
- )
678
- unsupported_properties = [
679
- name
680
- for (name, type_) in missing_properties
681
- if type_.fget.__name__ == "unsupported_property"
682
- ]
683
- for name in unsupported_properties:
684
- with self.assertRaisesRegex(
685
- PandasNotImplementedError,
686
- "property.*Index.*{}.*not implemented( yet\\.|\\. .+)".format(name),
687
- ):
688
- getattr(psdf.set_index(["a", "b"]).index, name)
689
-
690
- deprecated_properties = [
691
- name
692
- for (name, type_) in missing_properties
693
- if type_.fget.__name__ == "deprecated_property"
694
- ]
695
- for name in deprecated_properties:
696
- with self.assertRaisesRegex(
697
- PandasNotImplementedError, "property.*Index.*{}.*is deprecated".format(name)
698
- ):
699
- getattr(psdf.set_index(["a", "b"]).index, name)
700
-
701
- # DatetimeIndex properties
702
- missing_properties = inspect.getmembers(
703
- MissingPandasLikeDatetimeIndex, lambda o: isinstance(o, property)
704
- )
705
- unsupported_properties = [
706
- name
707
- for (name, type_) in missing_properties
708
- if type_.fget.__name__ == "unsupported_property"
709
- ]
710
- for name in unsupported_properties:
711
- with self.assertRaisesRegex(
712
- PandasNotImplementedError,
713
- "property.*Index.*{}.*not implemented( yet\\.|\\. .+)".format(name),
714
- ):
715
- getattr(psdf.set_index("c").index, name)
716
-
717
- # TimedeltaIndex properties
718
- missing_properties = inspect.getmembers(
719
- MissingPandasLikeDatetimeIndex, lambda o: isinstance(o, property)
720
- )
721
- unsupported_properties = [
722
- name
723
- for (name, type_) in missing_properties
724
- if type_.fget.__name__ == "unsupported_property"
725
- ]
726
- for name in unsupported_properties:
727
- with self.assertRaisesRegex(
728
- PandasNotImplementedError,
729
- "property.*Index.*{}.*not implemented( yet\\.|\\. .+)".format(name),
730
- ):
731
- getattr(psdf.set_index("c").index, name)
732
-
733
- def test_index_has_duplicates(self):
734
- indexes = [("a", "b", "c"), ("a", "a", "c"), (1, 3, 3), (1, 2, 3)]
735
- names = [None, "ks", "ks", None]
736
- has_dup = [False, True, True, False]
737
-
738
- for idx, name, expected in zip(indexes, names, has_dup):
739
- pdf = pd.DataFrame({"a": [1, 2, 3]}, index=pd.Index(idx, name=name))
740
- psdf = ps.from_pandas(pdf)
741
-
742
- self.assertEqual(psdf.index.has_duplicates, expected)
743
-
744
- def test_multiindex_has_duplicates(self):
745
- indexes = [
746
- [list("abc"), list("edf")],
747
- [list("aac"), list("edf")],
748
- [list("aac"), list("eef")],
749
- [[1, 4, 4], [4, 6, 6]],
750
- ]
751
- has_dup = [False, False, True, True]
752
-
753
- for idx, expected in zip(indexes, has_dup):
754
- pdf = pd.DataFrame({"a": [1, 2, 3]}, index=idx)
755
- psdf = ps.from_pandas(pdf)
756
-
757
- self.assertEqual(psdf.index.has_duplicates, expected)
758
-
759
- def test_multi_index_not_supported(self):
760
- psdf = ps.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]})
761
-
762
- with self.assertRaisesRegex(TypeError, "cannot perform any with this index type"):
763
- psdf.set_index(["a", "b"]).index.any()
764
-
765
- with self.assertRaisesRegex(TypeError, "cannot perform all with this index type"):
766
- psdf.set_index(["a", "b"]).index.all()
767
-
768
- def test_index_nlevels(self):
769
- pdf = pd.DataFrame({"a": [1, 2, 3]}, index=pd.Index(["a", "b", "c"]))
770
- psdf = ps.from_pandas(pdf)
771
-
772
- self.assertEqual(psdf.index.nlevels, 1)
773
-
774
- def test_multiindex_nlevel(self):
775
- pdf = pd.DataFrame({"a": [1, 2, 3]}, index=[list("abc"), list("def")])
776
- psdf = ps.from_pandas(pdf)
777
-
778
- self.assertEqual(psdf.index.nlevels, 2)
779
-
780
- def test_multiindex_from_arrays(self):
781
- arrays = [["a", "a", "b", "b"], ["red", "blue", "red", "blue"]]
782
- pidx = pd.MultiIndex.from_arrays(arrays)
783
- psidx = ps.MultiIndex.from_arrays(arrays)
784
-
785
- self.assert_eq(pidx, psidx)
786
-
787
- def test_multiindex_swaplevel(self):
788
- pidx = pd.MultiIndex.from_arrays([["a", "b"], [1, 2]])
789
- psidx = ps.from_pandas(pidx)
790
- self.assert_eq(pidx.swaplevel(0, 1), psidx.swaplevel(0, 1))
791
-
792
- pidx = pd.MultiIndex.from_arrays([["a", "b"], [1, 2]], names=["word", "number"])
793
- psidx = ps.from_pandas(pidx)
794
- self.assert_eq(pidx.swaplevel(0, 1), psidx.swaplevel(0, 1))
795
-
796
- pidx = pd.MultiIndex.from_arrays([["a", "b"], [1, 2]], names=["word", None])
797
- psidx = ps.from_pandas(pidx)
798
- self.assert_eq(pidx.swaplevel(-2, -1), psidx.swaplevel(-2, -1))
799
- self.assert_eq(pidx.swaplevel(0, 1), psidx.swaplevel(0, 1))
800
- self.assert_eq(pidx.swaplevel("word", 1), psidx.swaplevel("word", 1))
801
-
802
- with self.assertRaisesRegex(IndexError, "Too many levels: Index"):
803
- psidx.swaplevel(-3, "word")
804
- with self.assertRaisesRegex(IndexError, "Too many levels: Index"):
805
- psidx.swaplevel(0, 2)
806
- with self.assertRaisesRegex(IndexError, "Too many levels: Index"):
807
- psidx.swaplevel(0, -3)
808
- with self.assertRaisesRegex(KeyError, "Level work not found"):
809
- psidx.swaplevel(0, "work")
810
-
811
- def test_multiindex_droplevel(self):
812
- pidx = pd.MultiIndex.from_tuples(
813
- [("a", "x", 1), ("b", "y", 2)], names=["level1", "level2", "level3"]
814
- )
815
- psidx = ps.from_pandas(pidx)
816
- with self.assertRaisesRegex(IndexError, "Too many levels: Index has only 3 levels, not 5"):
817
- psidx.droplevel(4)
818
-
819
- with self.assertRaisesRegex(KeyError, "Level level4 not found"):
820
- psidx.droplevel("level4")
821
-
822
- with self.assertRaisesRegex(KeyError, "Level.*level3.*level4.*not found"):
823
- psidx.droplevel([("level3", "level4")])
824
-
825
- with self.assertRaisesRegex(
826
- ValueError,
827
- "Cannot remove 4 levels from an index with 3 levels: at least one "
828
- "level must be left.",
829
- ):
830
- psidx.droplevel([0, 0, 1, 2])
831
-
832
- with self.assertRaisesRegex(
833
- ValueError,
834
- "Cannot remove 3 levels from an index with 3 levels: at least one "
835
- "level must be left.",
836
- ):
837
- psidx.droplevel([0, 1, 2])
838
-
839
- self.assert_eq(pidx.droplevel(0), psidx.droplevel(0))
840
- self.assert_eq(pidx.droplevel([0, 1]), psidx.droplevel([0, 1]))
841
- self.assert_eq(pidx.droplevel((0, 1)), psidx.droplevel((0, 1)))
842
- self.assert_eq(pidx.droplevel([0, "level2"]), psidx.droplevel([0, "level2"]))
843
- self.assert_eq(pidx.droplevel((0, "level2")), psidx.droplevel((0, "level2")))
844
-
845
- # non-string names
846
- pidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2)], names=[1.0, 2.0, 3.0])
847
- psidx = ps.from_pandas(pidx)
848
- self.assert_eq(pidx.droplevel(1.0), psidx.droplevel(1.0))
849
- self.assert_eq(pidx.droplevel([0, 2.0]), psidx.droplevel([0, 2.0]))
850
-
851
- def test_index_fillna(self):
852
- pidx = pd.Index([1, 2, None])
853
- psidx = ps.from_pandas(pidx)
854
-
855
- self.assert_eq(pidx.fillna(0), psidx.fillna(0), almost=True)
856
- self.assert_eq(pidx.rename("name").fillna(0), psidx.rename("name").fillna(0), almost=True)
857
-
858
- with self.assertRaisesRegex(TypeError, "Unsupported type list"):
859
- psidx.fillna([1, 2])
860
-
861
- def test_index_drop(self):
862
- pidx = pd.Index([1, 2, 3])
863
- psidx = ps.from_pandas(pidx)
864
-
865
- self.assert_eq(pidx.drop(1), psidx.drop(1))
866
- self.assert_eq(pidx.drop([1, 2]), psidx.drop([1, 2]))
867
- self.assert_eq((pidx + 1).drop([2, 3]), (psidx + 1).drop([2, 3]))
868
-
869
- def test_multiindex_drop(self):
870
- pidx = pd.MultiIndex.from_tuples(
871
- [("a", "x"), ("b", "y"), ("c", "z")], names=["level1", "level2"]
872
- )
873
- psidx = ps.from_pandas(pidx)
874
- self.assert_eq(pidx.drop("a"), psidx.drop("a"))
875
- self.assert_eq(pidx.drop(["a", "b"]), psidx.drop(["a", "b"]))
876
- self.assert_eq(pidx.drop(["x", "y"], level=1), psidx.drop(["x", "y"], level=1))
877
- self.assert_eq(
878
- pidx.drop(["x", "y"], level="level2"), psidx.drop(["x", "y"], level="level2")
879
- )
880
-
881
- pidx.names = ["lv1", "lv2"]
882
- psidx.names = ["lv1", "lv2"]
883
- self.assert_eq(pidx.drop(["x", "y"], level="lv2"), psidx.drop(["x", "y"], level="lv2"))
884
-
885
- self.assertRaises(IndexError, lambda: psidx.drop(["a", "b"], level=2))
886
- self.assertRaises(KeyError, lambda: psidx.drop(["a", "b"], level="level"))
887
-
888
- psidx.names = ["lv", "lv"]
889
- self.assertRaises(ValueError, lambda: psidx.drop(["x", "y"], level="lv"))
890
-
891
- def _test_sort_values(self, pidx, psidx):
892
- self.assert_eq(pidx.sort_values(), psidx.sort_values())
893
- # Parameter ascending
894
- self.assert_eq(pidx.sort_values(ascending=False), psidx.sort_values(ascending=False))
895
- # Parameter return_indexer
896
- p_sorted, p_indexer = pidx.sort_values(return_indexer=True)
897
- ps_sorted, ps_indexer = psidx.sort_values(return_indexer=True)
898
- self.assert_eq(p_sorted, ps_sorted)
899
- self.assert_eq(p_indexer, ps_indexer.to_list())
900
- self.assert_eq(
901
- pidx.sort_values(return_indexer=False), psidx.sort_values(return_indexer=False)
902
- )
903
- # Parameter return_indexer and ascending
904
- p_sorted, p_indexer = pidx.sort_values(return_indexer=True, ascending=False)
905
- ps_sorted, ps_indexer = psidx.sort_values(return_indexer=True, ascending=False)
906
- self.assert_eq(p_sorted, ps_sorted)
907
- self.assert_eq(p_indexer, ps_indexer.to_list())
908
- self.assert_eq(
909
- pidx.sort_values(return_indexer=False, ascending=False),
910
- psidx.sort_values(return_indexer=False, ascending=False),
911
- )
912
-
913
- def test_sort_values(self):
914
- pidx = pd.Index([-10, -100, 200, 100])
915
- psidx = ps.from_pandas(pidx)
916
-
917
- self._test_sort_values(pidx, psidx)
918
-
919
- pidx.name = "koalas"
920
- psidx.name = "koalas"
921
-
922
- self._test_sort_values(pidx, psidx)
923
-
924
- pidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)])
925
- psidx = ps.from_pandas(pidx)
926
-
927
- pidx.names = ["hello", "koalas", "goodbye"]
928
- psidx.names = ["hello", "koalas", "goodbye"]
929
-
930
- self._test_sort_values(pidx, psidx)
931
-
932
- def test_index_drop_duplicates(self):
933
- pidx = pd.Index([1, 1, 2])
934
- psidx = ps.from_pandas(pidx)
935
- self.assert_eq(pidx.drop_duplicates().sort_values(), psidx.drop_duplicates().sort_values())
936
-
937
- pidx = pd.MultiIndex.from_tuples([(1, 1), (1, 1), (2, 2)], names=["level1", "level2"])
938
- psidx = ps.from_pandas(pidx)
939
- self.assert_eq(pidx.drop_duplicates().sort_values(), psidx.drop_duplicates().sort_values())
940
-
941
- def test_index_sort(self):
942
- idx = ps.Index([1, 2, 3, 4, 5])
943
- midx = ps.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2)])
944
-
945
- with self.assertRaisesRegex(
946
- TypeError, "cannot sort an Index object in-place, use sort_values instead"
947
- ):
948
- idx.sort()
949
- with self.assertRaisesRegex(
950
- TypeError, "cannot sort an Index object in-place, use sort_values instead"
951
- ):
952
- midx.sort()
953
-
954
- def test_multiindex_isna(self):
955
- psidx = ps.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)])
956
-
957
- with self.assertRaisesRegex(NotImplementedError, "isna is not defined for MultiIndex"):
958
- psidx.isna()
959
-
960
- with self.assertRaisesRegex(NotImplementedError, "isna is not defined for MultiIndex"):
961
- psidx.isnull()
962
-
963
- with self.assertRaisesRegex(NotImplementedError, "notna is not defined for MultiIndex"):
964
- psidx.notna()
965
-
966
- with self.assertRaisesRegex(NotImplementedError, "notna is not defined for MultiIndex"):
967
- psidx.notnull()
968
-
969
- def test_index_nunique(self):
970
- pidx = pd.Index([1, 1, 2, None])
971
- psidx = ps.from_pandas(pidx)
972
-
973
- self.assert_eq(pidx.nunique(), psidx.nunique())
974
- self.assert_eq(pidx.nunique(dropna=True), psidx.nunique(dropna=True))
975
-
976
- def test_multiindex_nunique(self):
977
- psidx = ps.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)])
978
- with self.assertRaisesRegex(NotImplementedError, "notna is not defined for MultiIndex"):
979
- psidx.notnull()
980
-
981
- def test_multiindex_rename(self):
982
- pidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)])
983
- psidx = ps.from_pandas(pidx)
984
-
985
- pidx = pidx.rename(list("ABC"))
986
- psidx = psidx.rename(list("ABC"))
987
- self.assert_eq(pidx, psidx)
988
-
989
- pidx = pidx.rename(["my", "name", "is"])
990
- psidx = psidx.rename(["my", "name", "is"])
991
- self.assert_eq(pidx, psidx)
992
-
993
- def test_multiindex_set_names(self):
994
- pidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)])
995
- psidx = ps.from_pandas(pidx)
996
-
997
- pidx = pidx.set_names(["set", "new", "names"])
998
- psidx = psidx.set_names(["set", "new", "names"])
999
- self.assert_eq(pidx, psidx)
1000
-
1001
- pidx.set_names(["set", "new", "names"], inplace=True)
1002
- psidx.set_names(["set", "new", "names"], inplace=True)
1003
- self.assert_eq(pidx, psidx)
1004
-
1005
- pidx = pidx.set_names("first", level=0)
1006
- psidx = psidx.set_names("first", level=0)
1007
- self.assert_eq(pidx, psidx)
1008
-
1009
- pidx = pidx.set_names("second", level=1)
1010
- psidx = psidx.set_names("second", level=1)
1011
- self.assert_eq(pidx, psidx)
1012
-
1013
- pidx = pidx.set_names("third", level=2)
1014
- psidx = psidx.set_names("third", level=2)
1015
- self.assert_eq(pidx, psidx)
1016
-
1017
- pidx.set_names("first", level=0, inplace=True)
1018
- psidx.set_names("first", level=0, inplace=True)
1019
- self.assert_eq(pidx, psidx)
1020
-
1021
- pidx.set_names("second", level=1, inplace=True)
1022
- psidx.set_names("second", level=1, inplace=True)
1023
- self.assert_eq(pidx, psidx)
1024
-
1025
- pidx.set_names("third", level=2, inplace=True)
1026
- psidx.set_names("third", level=2, inplace=True)
1027
- self.assert_eq(pidx, psidx)
1028
-
1029
- def test_multiindex_from_tuples(self):
1030
- tuples = [(1, "red"), (1, "blue"), (2, "red"), (2, "blue")]
1031
- pidx = pd.MultiIndex.from_tuples(tuples)
1032
- psidx = ps.MultiIndex.from_tuples(tuples)
1033
-
1034
- self.assert_eq(pidx, psidx)
1035
-
1036
- def test_multiindex_from_product(self):
1037
- iterables = [[0, 1, 2], ["green", "purple"]]
1038
- pidx = pd.MultiIndex.from_product(iterables)
1039
- psidx = ps.MultiIndex.from_product(iterables)
1040
-
1041
- self.assert_eq(pidx, psidx)
1042
-
1043
- def test_multiindex_tuple_column_name(self):
1044
- column_labels = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "z")])
1045
- pdf = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=column_labels)
1046
- pdf.set_index(("a", "x"), append=True, inplace=True)
1047
- psdf = ps.from_pandas(pdf)
1048
- self.assert_eq(pdf, psdf)
1049
-
1050
- def test_len(self):
1051
- pidx = pd.Index(range(10000))
1052
- psidx = ps.from_pandas(pidx)
1053
-
1054
- self.assert_eq(len(pidx), len(psidx))
1055
-
1056
- pidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)])
1057
- psidx = ps.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)])
1058
-
1059
- self.assert_eq(len(pidx), len(psidx))
1060
-
1061
- def test_delete(self):
1062
- pidx = pd.Index([10, 9, 8, 7, 6, 7, 8, 9, 10])
1063
- psidx = ps.from_pandas(pidx)
1064
-
1065
- self.assert_eq(pidx.delete(8).sort_values(), psidx.delete(8).sort_values())
1066
- self.assert_eq(pidx.delete(-9).sort_values(), psidx.delete(-9).sort_values())
1067
- self.assert_eq(
1068
- pidx.delete([-9, 0, 8]).sort_values(), psidx.delete([-9, 0, 8]).sort_values()
1069
- )
1070
-
1071
- with self.assertRaisesRegex(IndexError, "index 9 is out of bounds for axis 0 with size 9"):
1072
- psidx.delete([0, 9])
1073
- with self.assertRaisesRegex(
1074
- IndexError, "index -10 is out of bounds for axis 0 with size 9"
1075
- ):
1076
- psidx.delete([-10, 0])
1077
- with self.assertRaisesRegex(IndexError, "index 9 is out of bounds for axis 0 with size 9"):
1078
- psidx.delete(9)
1079
- with self.assertRaisesRegex(
1080
- IndexError, "index -10 is out of bounds for axis 0 with size 9"
1081
- ):
1082
- psidx.delete(-10)
1083
-
1084
- # MultiIndex
1085
- pidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)])
1086
- psidx = ps.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)])
1087
-
1088
- self.assert_eq(pidx.delete(2).sort_values(), psidx.delete(2).sort_values())
1089
- self.assert_eq(pidx.delete(-3).sort_values(), psidx.delete(-3).sort_values())
1090
- self.assert_eq(
1091
- pidx.delete([-3, 0, 2]).sort_values(), psidx.delete([-3, 0, 2]).sort_values()
1092
- )
1093
-
1094
- with self.assertRaisesRegex(IndexError, "index 3 is out of bounds for axis 0 with size 3"):
1095
- psidx.delete([0, 3])
1096
- with self.assertRaisesRegex(IndexError, "index -4 is out of bounds for axis 0 with size 3"):
1097
- psidx.delete([-4, 0])
1098
- with self.assertRaisesRegex(IndexError, "index 3 is out of bounds for axis 0 with size 3"):
1099
- psidx.delete(3)
1100
- with self.assertRaisesRegex(IndexError, "index -4 is out of bounds for axis 0 with size 3"):
1101
- psidx.delete(-4)
1102
-
1103
- def test_append(self):
1104
- # Index
1105
- pidx = pd.Index(range(10000))
1106
- psidx = ps.from_pandas(pidx)
1107
-
1108
- self.assert_eq(pidx.append(pidx), psidx.append(psidx))
1109
-
1110
- # Index with name
1111
- pidx1 = pd.Index(range(10000), name="a")
1112
- pidx2 = pd.Index(range(10000), name="b")
1113
- psidx1 = ps.from_pandas(pidx1)
1114
- psidx2 = ps.from_pandas(pidx2)
1115
-
1116
- self.assert_eq(pidx1.append(pidx2), psidx1.append(psidx2))
1117
-
1118
- self.assert_eq(pidx2.append(pidx1), psidx2.append(psidx1))
1119
-
1120
- # Index from DataFrame
1121
- pdf1 = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=["a", "b", "c"])
1122
- pdf2 = pd.DataFrame({"a": [7, 8, 9], "d": [10, 11, None]}, index=["x", "y", "z"])
1123
- psdf1 = ps.from_pandas(pdf1)
1124
- psdf2 = ps.from_pandas(pdf2)
1125
-
1126
- pidx1 = pdf1.set_index("a").index
1127
- pidx2 = pdf2.set_index("d").index
1128
- psidx1 = psdf1.set_index("a").index
1129
- psidx2 = psdf2.set_index("d").index
1130
-
1131
- self.assert_eq(pidx1.append(pidx2), psidx1.append(psidx2))
1132
-
1133
- self.assert_eq(pidx2.append(pidx1), psidx2.append(psidx1))
1134
-
1135
- # Index from DataFrame with MultiIndex columns
1136
- pdf1 = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
1137
- pdf2 = pd.DataFrame({"a": [7, 8, 9], "d": [10, 11, 12]})
1138
- pdf1.columns = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y")])
1139
- pdf2.columns = pd.MultiIndex.from_tuples([("a", "x"), ("d", "y")])
1140
- psdf1 = ps.from_pandas(pdf1)
1141
- psdf2 = ps.from_pandas(pdf2)
1142
-
1143
- pidx1 = pdf1.set_index(("a", "x")).index
1144
- pidx2 = pdf2.set_index(("d", "y")).index
1145
- psidx1 = psdf1.set_index(("a", "x")).index
1146
- psidx2 = psdf2.set_index(("d", "y")).index
1147
-
1148
- self.assert_eq(pidx1.append(pidx2), psidx1.append(psidx2))
1149
-
1150
- self.assert_eq(pidx2.append(pidx1), psidx2.append(psidx1))
1151
-
1152
- # MultiIndex
1153
- pmidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)])
1154
- psmidx = ps.from_pandas(pmidx)
1155
-
1156
- self.assert_eq(pmidx.append(pmidx), psmidx.append(psmidx))
1157
-
1158
- # MultiIndex with names
1159
- pmidx1 = pd.MultiIndex.from_tuples(
1160
- [("a", "x", 1), ("b", "y", 2), ("c", "z", 3)], names=["x", "y", "z"]
1161
- )
1162
- pmidx2 = pd.MultiIndex.from_tuples(
1163
- [("a", "x", 1), ("b", "y", 2), ("c", "z", 3)], names=["p", "q", "r"]
1164
- )
1165
- psmidx1 = ps.from_pandas(pmidx1)
1166
- psmidx2 = ps.from_pandas(pmidx2)
1167
-
1168
- # TODO(SPARK-43241): MultiIndex.append not checking names for equality.
1169
- # Also refer to https://github.com/pandas-dev/pandas/pull/48288.
1170
- if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
1171
- self.assert_eq(
1172
- pmidx1.append(pmidx2), psmidx1.append(psmidx2).rename([None, None, None])
1173
- )
1174
- else:
1175
- self.assert_eq(pmidx1.append(pmidx2), psmidx1.append(psmidx2))
1176
-
1177
- if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
1178
- self.assert_eq(
1179
- pmidx2.append(pmidx1), psmidx2.append(psmidx1).rename([None, None, None])
1180
- )
1181
- else:
1182
- self.assert_eq(pmidx2.append(pmidx1), psmidx2.append(psmidx1))
1183
-
1184
- if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
1185
- self.assert_eq(
1186
- pmidx1.append(pmidx2).names,
1187
- psmidx1.append(psmidx2).rename([None, None, None]).names,
1188
- )
1189
- else:
1190
- self.assert_eq(pmidx1.append(pmidx2).names, psmidx1.append(psmidx2).names)
1191
-
1192
- # Index & MultiIndex is currently not supported
1193
- expected_error_message = r"append\(\) between Index & MultiIndex is currently not supported"
1194
- with self.assertRaisesRegex(NotImplementedError, expected_error_message):
1195
- psidx.append(psmidx)
1196
- with self.assertRaisesRegex(NotImplementedError, expected_error_message):
1197
- psmidx.append(psidx)
1198
-
1199
- # MultiIndexs with different levels is currently not supported
1200
- psmidx3 = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
1201
- expected_error_message = (
1202
- r"append\(\) between MultiIndexs with different levels is currently not supported"
1203
- )
1204
- with self.assertRaisesRegex(NotImplementedError, expected_error_message):
1205
- psmidx.append(psmidx3)
1206
-
1207
- def test_argmin(self):
1208
- pidx = pd.Index([100, 50, 10, 20, 30, 60, 0, 50, 0, 100, 100, 100, 20, 0, 0])
1209
- psidx = ps.from_pandas(pidx)
1210
-
1211
- self.assert_eq(pidx.argmin(), psidx.argmin())
1212
-
1213
- # MultiIndex
1214
- psidx = ps.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)])
1215
- with self.assertRaisesRegex(
1216
- TypeError, "reduction operation 'argmin' not allowed for this dtype"
1217
- ):
1218
- psidx.argmin()
1219
-
1220
- def test_argmax(self):
1221
- pidx = pd.Index([100, 50, 10, 20, 30, 60, 0, 50, 0, 100, 100, 100, 20, 0, 0])
1222
- psidx = ps.from_pandas(pidx)
1223
-
1224
- self.assert_eq(pidx.argmax(), psidx.argmax())
1225
-
1226
- # MultiIndex
1227
- psidx = ps.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)])
1228
- with self.assertRaisesRegex(
1229
- TypeError, "reduction operation 'argmax' not allowed for this dtype"
1230
- ):
1231
- psidx.argmax()
1232
-
1233
- def test_min(self):
1234
- pidx = pd.Index([3, 2, 1])
1235
- psidx = ps.from_pandas(pidx)
1236
-
1237
- self.assert_eq(pidx.min(), psidx.min())
1238
-
1239
- # MultiIndex
1240
- pmidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2)])
1241
- psmidx = ps.from_pandas(pmidx)
1242
-
1243
- self.assert_eq(pmidx.min(), psmidx.min())
1244
-
1245
- pidx = pd.DatetimeIndex(["2021-02-01", "2021-01-01", "2021-04-01", "2021-03-01"])
1246
- psidx = ps.from_pandas(pidx)
1247
-
1248
- self.assert_eq(pidx.min(), psidx.min())
1249
-
1250
- def test_max(self):
1251
- pidx = pd.Index([3, 2, 1])
1252
- psidx = ps.from_pandas(pidx)
1253
-
1254
- self.assert_eq(pidx.max(), psidx.max())
1255
-
1256
- # MultiIndex
1257
- pmidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2)])
1258
- psmidx = ps.from_pandas(pmidx)
1259
-
1260
- self.assert_eq(pmidx.max(), psmidx.max())
1261
-
1262
- pidx = pd.DatetimeIndex(["2021-02-01", "2021-01-01", "2021-04-01", "2021-03-01"])
1263
- psidx = ps.from_pandas(pidx)
1264
-
1265
- self.assert_eq(pidx.max(), psidx.max())
1266
-
1267
- def test_monotonic(self):
1268
- # test monotonic_increasing & monotonic_decreasing for MultiIndex.
1269
- # Since the Behavior for null value was changed in pandas >= 1.0.0,
1270
- # several cases are tested differently.
1271
- datas = []
1272
-
1273
- # increasing / decreasing ordered each index level with string
1274
- datas.append([("w", "a"), ("x", "b"), ("y", "c"), ("z", "d")])
1275
- datas.append([("w", "d"), ("x", "c"), ("y", "b"), ("z", "a")])
1276
- datas.append([("z", "a"), ("y", "b"), ("x", "c"), ("w", "d")])
1277
- datas.append([("z", "d"), ("y", "c"), ("x", "b"), ("w", "a")])
1278
- # mixed order each index level with string
1279
- datas.append([("z", "a"), ("x", "b"), ("y", "c"), ("w", "d")])
1280
- datas.append([("z", "a"), ("y", "c"), ("x", "b"), ("w", "d")])
1281
-
1282
- # increasing / decreasing ordered each index level with integer
1283
- datas.append([(1, 100), (2, 200), (3, 300), (4, 400), (5, 500)])
1284
- datas.append([(1, 500), (2, 400), (3, 300), (4, 200), (5, 100)])
1285
- datas.append([(5, 100), (4, 200), (3, 300), (2, 400), (1, 500)])
1286
- datas.append([(5, 500), (4, 400), (3, 300), (2, 200), (1, 100)])
1287
- # mixed order each index level with integer
1288
- datas.append([(1, 500), (3, 400), (2, 300), (4, 200), (5, 100)])
1289
- datas.append([(1, 100), (2, 300), (3, 200), (4, 400), (5, 500)])
1290
-
1291
- # integer / negative mixed tests
1292
- datas.append([("a", -500), ("b", -400), ("c", -300), ("d", -200), ("e", -100)])
1293
- datas.append([("e", -500), ("d", -400), ("c", -300), ("b", -200), ("a", -100)])
1294
- datas.append([(-5, "a"), (-4, "b"), (-3, "c"), (-2, "d"), (-1, "e")])
1295
- datas.append([(-5, "e"), (-4, "d"), (-3, "c"), (-2, "b"), (-1, "a")])
1296
- datas.append([(-5, "e"), (-3, "d"), (-2, "c"), (-4, "b"), (-1, "a")])
1297
- datas.append([(-5, "e"), (-4, "c"), (-3, "b"), (-2, "d"), (-1, "a")])
1298
-
1299
- # boolean type tests
1300
- datas.append([(True, True), (True, True)])
1301
- datas.append([(True, True), (True, False)])
1302
- datas.append([(True, False), (True, True)])
1303
- datas.append([(False, True), (False, True)])
1304
- datas.append([(False, True), (False, False)])
1305
- datas.append([(False, False), (False, True)])
1306
- datas.append([(True, True), (False, True)])
1307
- datas.append([(True, True), (False, False)])
1308
- datas.append([(True, False), (False, True)])
1309
- datas.append([(False, True), (True, True)])
1310
- datas.append([(False, True), (True, False)])
1311
- datas.append([(False, False), (True, True)])
1312
-
1313
- # duplicated index value tests
1314
- datas.append([("x", "d"), ("y", "c"), ("y", "b"), ("z", "a")])
1315
- datas.append([("x", "d"), ("y", "b"), ("y", "c"), ("z", "a")])
1316
-
1317
- # more depth tests
1318
- datas.append([("x", "d", "o"), ("y", "c", "p"), ("y", "c", "q"), ("z", "a", "r")])
1319
- datas.append([("x", "d", "o"), ("y", "c", "q"), ("y", "c", "p"), ("z", "a", "r")])
1320
-
1321
- # None type tests (None type is treated as False from pandas >= 1.1.4)
1322
- # Refer https://github.com/pandas-dev/pandas/issues/37220
1323
- datas.append([(1, 100), (2, 200), (None, 300), (4, 400), (5, 500)])
1324
- datas.append([(1, 100), (2, 200), (None, None), (4, 400), (5, 500)])
1325
- datas.append([("x", "d"), ("y", "c"), ("y", None), ("z", "a")])
1326
- datas.append([("x", "d"), ("y", "c"), ("y", "b"), (None, "a")])
1327
- datas.append([("x", "d"), ("y", "b"), ("y", "c"), (None, "a")])
1328
- datas.append([("x", "d", "o"), ("y", "c", "p"), ("y", "c", None), ("z", "a", "r")])
1329
-
1330
- for data in datas:
1331
- with self.subTest(data=data):
1332
- pmidx = pd.MultiIndex.from_tuples(data)
1333
- psmidx = ps.from_pandas(pmidx)
1334
- self.assert_eq(psmidx.is_monotonic_increasing, pmidx.is_monotonic_increasing)
1335
- self.assert_eq(psmidx.is_monotonic_decreasing, pmidx.is_monotonic_decreasing)
1336
-
1337
- # datas below return different result depends on pandas version.
1338
- # Because the behavior of handling null values is changed in pandas >= 1.1.4.
1339
- # Since Koalas follows latest pandas, all of them should return `False`.
1340
- datas = []
1341
- datas.append([(1, 100), (2, 200), (3, None), (4, 400), (5, 500)])
1342
- datas.append([(1, None), (2, 200), (3, 300), (4, 400), (5, 500)])
1343
- datas.append([(1, 100), (2, 200), (3, 300), (4, 400), (5, None)])
1344
- datas.append([(False, None), (True, True)])
1345
- datas.append([(None, False), (True, True)])
1346
- datas.append([(False, False), (True, None)])
1347
- datas.append([(False, False), (None, True)])
1348
- datas.append([("x", "d"), ("y", None), ("y", None), ("z", "a")])
1349
- datas.append([("x", "d", "o"), ("y", "c", None), ("y", "c", None), ("z", "a", "r")])
1350
- datas.append([(1, 100), (2, 200), (3, 300), (4, 400), (None, 500)])
1351
- datas.append([(1, 100), (2, 200), (3, 300), (4, 400), (None, None)])
1352
- datas.append([(5, 100), (4, 200), (3, None), (2, 400), (1, 500)])
1353
- datas.append([(5, None), (4, 200), (3, 300), (2, 400), (1, 500)])
1354
- datas.append([(5, 100), (4, 200), (3, None), (2, 400), (1, 500)])
1355
- datas.append([(5, 100), (4, 200), (3, 300), (2, 400), (1, None)])
1356
- datas.append([(True, None), (True, True)])
1357
- datas.append([(None, True), (True, True)])
1358
- datas.append([(True, True), (None, True)])
1359
- datas.append([(True, True), (True, None)])
1360
- datas.append([(None, 100), (2, 200), (3, 300), (4, 400), (5, 500)])
1361
- datas.append([(None, None), (2, 200), (3, 300), (4, 400), (5, 500)])
1362
- datas.append([("x", "d"), ("y", None), ("y", "c"), ("z", "a")])
1363
- datas.append([("x", "d", "o"), ("y", "c", None), ("y", "c", "q"), ("z", "a", "r")])
1364
-
1365
- for data in datas:
1366
- with self.subTest(data=data):
1367
- pmidx = pd.MultiIndex.from_tuples(data)
1368
- psmidx = ps.from_pandas(pmidx)
1369
- if LooseVersion(pd.__version__) < LooseVersion("1.1.4"):
1370
- self.assert_eq(psmidx.is_monotonic_increasing, False)
1371
- self.assert_eq(psmidx.is_monotonic_decreasing, False)
1372
- else:
1373
- self.assert_eq(psmidx.is_monotonic_increasing, pmidx.is_monotonic_increasing)
1374
- self.assert_eq(psmidx.is_monotonic_decreasing, pmidx.is_monotonic_decreasing)
1375
-
1376
- # The datas below are tested another way since they cannot be an arguments for
1377
- # `MultiIndex.from_tuples` in pandas >= 1.1.0.
1378
- # Refer https://github.com/databricks/koalas/pull/1688#issuecomment-667156560 for detail.
1379
- if LooseVersion(pd.__version__) < LooseVersion("1.1.0"):
1380
- pmidx = pd.MultiIndex.from_tuples(
1381
- [(-5, None), (-4, None), (-3, None), (-2, None), (-1, None)]
1382
- )
1383
- psmidx = ps.from_pandas(pmidx)
1384
- self.assert_eq(psmidx.is_monotonic_increasing, False)
1385
- self.assert_eq(psmidx.is_monotonic_decreasing, False)
1386
-
1387
- pmidx = pd.MultiIndex.from_tuples(
1388
- [(None, "e"), (None, "c"), (None, "b"), (None, "d"), (None, "a")]
1389
- )
1390
- psmidx = ps.from_pandas(pmidx)
1391
- self.assert_eq(psmidx.is_monotonic_increasing, False)
1392
- self.assert_eq(psmidx.is_monotonic_decreasing, False)
1393
-
1394
- pmidx = pd.MultiIndex.from_tuples(
1395
- [(None, None), (None, None), (None, None), (None, None), (None, None)]
1396
- )
1397
- psmidx = ps.from_pandas(pmidx)
1398
- self.assert_eq(psmidx.is_monotonic_increasing, False)
1399
- self.assert_eq(psmidx.is_monotonic_decreasing, False)
1400
-
1401
- pmidx = pd.MultiIndex.from_tuples([(None, None)])
1402
- psmidx = ps.from_pandas(pmidx)
1403
- self.assert_eq(psmidx.is_monotonic_increasing, False)
1404
- self.assert_eq(psmidx.is_monotonic_decreasing, False)
1405
-
1406
- else:
1407
- # For [(-5, None), (-4, None), (-3, None), (-2, None), (-1, None)]
1408
- psdf = ps.DataFrame({"a": [-5, -4, -3, -2, -1], "b": [1, 1, 1, 1, 1]})
1409
- psdf["b"] = None
1410
- psmidx = psdf.set_index(["a", "b"]).index
1411
- pmidx = psmidx._to_pandas()
1412
- self.assert_eq(psmidx.is_monotonic_increasing, pmidx.is_monotonic_increasing)
1413
- self.assert_eq(psmidx.is_monotonic_decreasing, pmidx.is_monotonic_decreasing)
1414
-
1415
- # For [(None, "e"), (None, "c"), (None, "b"), (None, "d"), (None, "a")]
1416
- psdf = ps.DataFrame({"a": [1, 1, 1, 1, 1], "b": ["e", "c", "b", "d", "a"]})
1417
- psdf["a"] = None
1418
- psmidx = psdf.set_index(["a", "b"]).index
1419
- pmidx = psmidx._to_pandas()
1420
- self.assert_eq(psmidx.is_monotonic_increasing, pmidx.is_monotonic_increasing)
1421
- self.assert_eq(psmidx.is_monotonic_decreasing, pmidx.is_monotonic_decreasing)
1422
-
1423
- # For [(None, None), (None, None), (None, None), (None, None), (None, None)]
1424
- psdf = ps.DataFrame({"a": [1, 1, 1, 1, 1], "b": [1, 1, 1, 1, 1]})
1425
- psdf["a"] = None
1426
- psdf["b"] = None
1427
- psmidx = psdf.set_index(["a", "b"]).index
1428
- pmidx = psmidx._to_pandas()
1429
- self.assert_eq(psmidx.is_monotonic_increasing, pmidx.is_monotonic_increasing)
1430
- self.assert_eq(psmidx.is_monotonic_decreasing, pmidx.is_monotonic_decreasing)
1431
-
1432
- # For [(None, None)]
1433
- psdf = ps.DataFrame({"a": [1], "b": [1]})
1434
- psdf["a"] = None
1435
- psdf["b"] = None
1436
- psmidx = psdf.set_index(["a", "b"]).index
1437
- pmidx = psmidx._to_pandas()
1438
- self.assert_eq(psmidx.is_monotonic_increasing, pmidx.is_monotonic_increasing)
1439
- self.assert_eq(psmidx.is_monotonic_decreasing, pmidx.is_monotonic_decreasing)
1440
-
1441
- def test_difference(self):
1442
- # Index
1443
- pidx1 = pd.Index([1, 2, 3, 4], name="koalas")
1444
- pidx2 = pd.Index([3, 4, 5, 6], name="koalas")
1445
- psidx1 = ps.from_pandas(pidx1)
1446
- psidx2 = ps.from_pandas(pidx2)
1447
- # Series
1448
- pser = pd.Series([3, 4, 5, 6], name="koalas")
1449
- psser = ps.from_pandas(pser)
1450
-
1451
- self.assert_eq(
1452
- psidx1.difference(psidx2).sort_values(), pidx1.difference(pidx2).sort_values()
1453
- )
1454
- self.assert_eq(psidx1.difference(psser).sort_values(), pidx1.difference(pser).sort_values())
1455
- self.assert_eq(
1456
- psidx1.difference([3, 4, 5, 6]).sort_values(),
1457
- pidx1.difference([3, 4, 5, 6]).sort_values(),
1458
- )
1459
- self.assert_eq(
1460
- psidx1.difference((3, 4, 5, 6)).sort_values(),
1461
- pidx1.difference((3, 4, 5, 6)).sort_values(),
1462
- )
1463
- self.assert_eq(
1464
- psidx1.difference({3, 4, 5, 6}).sort_values(),
1465
- pidx1.difference({3, 4, 5, 6}).sort_values(),
1466
- )
1467
- self.assert_eq(
1468
- psidx1.difference({3: 1, 4: 2, 5: 3, 6: 4}).sort_values(),
1469
- pidx1.difference({3: 1, 4: 2, 5: 3, 6: 4}).sort_values(),
1470
- )
1471
-
1472
- # Exceptions for Index
1473
- with self.assertRaisesRegex(TypeError, "Input must be Index or array-like"):
1474
- psidx1.difference("1234")
1475
- with self.assertRaisesRegex(TypeError, "Input must be Index or array-like"):
1476
- psidx1.difference(1234)
1477
- with self.assertRaisesRegex(TypeError, "Input must be Index or array-like"):
1478
- psidx1.difference(12.34)
1479
- with self.assertRaisesRegex(TypeError, "Input must be Index or array-like"):
1480
- psidx1.difference(None)
1481
- with self.assertRaisesRegex(TypeError, "Input must be Index or array-like"):
1482
- psidx1.difference(np.nan)
1483
- with self.assertRaisesRegex(
1484
- ValueError, "The 'sort' keyword only takes the values of None or True; 1 was passed."
1485
- ):
1486
- psidx1.difference(psidx2, sort=1)
1487
-
1488
- # MultiIndex
1489
- pmidx1 = pd.MultiIndex.from_tuples(
1490
- [("a", "x", 1), ("b", "y", 2), ("c", "z", 3)], names=["hello", "koalas", "world"]
1491
- )
1492
- pmidx2 = pd.MultiIndex.from_tuples(
1493
- [("a", "x", 1), ("b", "z", 2), ("k", "z", 3)], names=["hello", "koalas", "world"]
1494
- )
1495
- psmidx1 = ps.from_pandas(pmidx1)
1496
- psmidx2 = ps.from_pandas(pmidx2)
1497
-
1498
- self.assert_eq(
1499
- psmidx1.difference(psmidx2).sort_values(), pmidx1.difference(pmidx2).sort_values()
1500
- )
1501
- self.assert_eq(
1502
- psmidx1.difference(psidx1).sort_values(), pmidx1.difference(pidx1).sort_values()
1503
- )
1504
- self.assert_eq(
1505
- psidx1.difference(psmidx1).sort_values(), pidx1.difference(pmidx1).sort_values()
1506
- )
1507
- self.assert_eq(psidx1.difference(psser).sort_values(), pidx1.difference(pser).sort_values())
1508
- self.assert_eq(
1509
- psmidx1.difference({("a", "x", 1)}).sort_values(),
1510
- pmidx1.difference({("a", "x", 1)}).sort_values(),
1511
- )
1512
- self.assert_eq(
1513
- psmidx1.difference({("a", "x", 1): [1, 2, 3]}).sort_values(),
1514
- pmidx1.difference({("a", "x", 1): [1, 2, 3]}).sort_values(),
1515
- )
1516
-
1517
- # Exceptions for MultiIndex
1518
- with self.assertRaisesRegex(TypeError, "other must be a MultiIndex or a list of tuples"):
1519
- psmidx1.difference(["b", "z", "2"])
1520
-
1521
- def test_repeat(self):
1522
- pidx = pd.Index(["a", "b", "c"])
1523
- psidx = ps.from_pandas(pidx)
1524
-
1525
- self.assert_eq(psidx.repeat(3).sort_values(), pidx.repeat(3).sort_values())
1526
- self.assert_eq(psidx.repeat(0).sort_values(), pidx.repeat(0).sort_values())
1527
- self.assert_eq((psidx + "x").repeat(3).sort_values(), (pidx + "x").repeat(3).sort_values())
1528
-
1529
- self.assertRaises(ValueError, lambda: psidx.repeat(-1))
1530
- self.assertRaises(TypeError, lambda: psidx.repeat("abc"))
1531
-
1532
- pmidx = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "c")])
1533
- psmidx = ps.from_pandas(pmidx)
1534
-
1535
- self.assert_eq(psmidx.repeat(3).sort_values(), pmidx.repeat(3).sort_values())
1536
- self.assert_eq(psmidx.repeat(0).sort_values(), pmidx.repeat(0).sort_values(), almost=True)
1537
-
1538
- self.assertRaises(ValueError, lambda: psmidx.repeat(-1))
1539
- self.assertRaises(TypeError, lambda: psmidx.repeat("abc"))
1540
-
1541
- def test_unique(self):
1542
- pidx = pd.Index(["a", "b", "a"])
1543
- psidx = ps.from_pandas(pidx)
1544
-
1545
- self.assert_eq(psidx.unique().sort_values(), pidx.unique().sort_values())
1546
- self.assert_eq(psidx.unique().sort_values(), pidx.unique().sort_values())
1547
-
1548
- pmidx = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("x", "a")])
1549
- psmidx = ps.from_pandas(pmidx)
1550
-
1551
- self.assert_eq(psmidx.unique().sort_values(), pmidx.unique().sort_values())
1552
- self.assert_eq(psmidx.unique().sort_values(), pmidx.unique().sort_values())
1553
-
1554
- with self.assertRaisesRegex(
1555
- IndexError, "Too many levels: Index has only 1 level, -2 is not a valid level number"
1556
- ):
1557
- psidx.unique(level=-2)
1558
-
1559
- def test_asof(self):
1560
- # Increasing values
1561
- pidx = pd.Index(["2013-12-31", "2014-01-02", "2014-01-03"])
1562
- psidx = ps.from_pandas(pidx)
1563
-
1564
- self.assert_eq(psidx.asof("2014-01-01"), pidx.asof("2014-01-01"))
1565
- self.assert_eq(psidx.asof("2014-01-02"), pidx.asof("2014-01-02"))
1566
- self.assert_eq(repr(psidx.asof("1999-01-02")), repr(pidx.asof("1999-01-02")))
1567
- self.assert_eq(psidx.asof("2014-01-04"), pidx.asof("2014-01-04"))
1568
-
1569
- pidx = pd.DatetimeIndex(["2013-12-31", "2014-01-02", "2014-01-03"])
1570
- psidx = ps.from_pandas(pidx)
1571
-
1572
- self.assert_eq(psidx.asof("2014-01-01"), pidx.asof("2014-01-01"))
1573
- self.assert_eq(psidx.asof("2014-01-02"), pidx.asof("2014-01-02"))
1574
- self.assert_eq(repr(psidx.asof("1999-01-02")), repr(pidx.asof("1999-01-02")))
1575
-
1576
- # Decreasing values
1577
- pidx = pd.Index(["2014-01-03", "2014-01-02", "2013-12-31"])
1578
- psidx = ps.from_pandas(pidx)
1579
-
1580
- self.assert_eq(psidx.asof("2014-01-01"), pidx.asof("2014-01-01"))
1581
- self.assert_eq(psidx.asof("2014-01-02"), pidx.asof("2014-01-02"))
1582
- self.assert_eq(psidx.asof("1999-01-02"), pidx.asof("1999-01-02"))
1583
- self.assert_eq(repr(psidx.asof("2015-01-02")), repr(pidx.asof("2015-01-02")))
1584
-
1585
- pidx = pd.DatetimeIndex(["2014-01-03", "2014-01-02", "2013-12-31"])
1586
- psidx = ps.from_pandas(pidx)
1587
-
1588
- # TODO: a pandas bug?
1589
- # self.assert_eq(psidx.asof("2014-01-01"), pidx.asof("2014-01-01"))
1590
- # self.assert_eq(psidx.asof("2014-01-02"), pidx.asof("2014-01-02"))
1591
- # self.assert_eq(psidx.asof("1999-01-02"), pidx.asof("1999-01-02"))
1592
- # self.assert_eq(repr(psidx.asof("2015-01-02")), repr(pidx.asof("2015-01-02")))
1593
- self.assert_eq(psidx.asof("2014-01-01"), pd.Timestamp("2014-01-02 00:00:00"))
1594
- self.assert_eq(psidx.asof("2014-01-02"), pd.Timestamp("2014-01-02 00:00:00"))
1595
- self.assert_eq(psidx.asof("1999-01-02"), pd.Timestamp("2013-12-31 00:00:00"))
1596
- self.assert_eq(repr(psidx.asof("2015-01-02")), repr(pd.NaT))
1597
-
1598
- # Not increasing, neither decreasing (ValueError)
1599
- psidx = ps.Index(["2013-12-31", "2015-01-02", "2014-01-03"])
1600
- self.assertRaises(ValueError, lambda: psidx.asof("2013-12-31"))
1601
-
1602
- psmidx = ps.MultiIndex.from_tuples([("a", "a"), ("a", "b"), ("a", "c")])
1603
- self.assertRaises(NotImplementedError, lambda: psmidx.asof(("a", "b")))
1604
-
1605
- @unittest.skipIf(
1606
- LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
1607
- "TODO(SPARK-43608): Enable IndexesTests.test_union for pandas 2.0.0.",
1608
- )
1609
- def test_union(self):
1610
- # Index
1611
- pidx1 = pd.Index([1, 2, 3, 4])
1612
- pidx2 = pd.Index([3, 4, 5, 6])
1613
- pidx3 = pd.Index([7.0, 8.0, 9.0, 10.0])
1614
- psidx1 = ps.from_pandas(pidx1)
1615
- psidx2 = ps.from_pandas(pidx2)
1616
- psidx3 = ps.from_pandas(pidx3)
1617
-
1618
- self.assert_eq(psidx1.union(psidx2), pidx1.union(pidx2))
1619
- self.assert_eq(psidx2.union(psidx1), pidx2.union(pidx1))
1620
- self.assert_eq(psidx1.union(psidx3), pidx1.union(pidx3))
1621
- # Deprecated case, but adding to track if pandas stop supporting union
1622
- # as a set operation. It should work fine until stop supporting anyway.
1623
- # No longer supported from pandas 2.0.0.
1624
- if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
1625
- self.assert_eq(psidx1 | psidx2, ps.Index([3, 4], dtype="int64"))
1626
- else:
1627
- self.assert_eq(pidx1 | pidx2, psidx1 | psidx2)
1628
-
1629
- self.assert_eq(psidx1.union([3, 4, 5, 6]), pidx1.union([3, 4, 5, 6]), almost=True)
1630
- self.assert_eq(psidx2.union([1, 2, 3, 4]), pidx2.union([1, 2, 3, 4]), almost=True)
1631
- self.assert_eq(
1632
- psidx1.union(ps.Series([3, 4, 5, 6])), pidx1.union(pd.Series([3, 4, 5, 6])), almost=True
1633
- )
1634
- self.assert_eq(
1635
- psidx2.union(ps.Series([1, 2, 3, 4])), pidx2.union(pd.Series([1, 2, 3, 4])), almost=True
1636
- )
1637
-
1638
- # Testing if the result is correct after sort=False.
1639
- self.assert_eq(
1640
- psidx1.union(psidx2, sort=False).sort_values(),
1641
- pidx1.union(pidx2, sort=False).sort_values(),
1642
- )
1643
- self.assert_eq(
1644
- psidx2.union(psidx1, sort=False).sort_values(),
1645
- pidx2.union(pidx1, sort=False).sort_values(),
1646
- )
1647
- self.assert_eq(
1648
- psidx1.union([3, 4, 5, 6], sort=False).sort_values(),
1649
- pidx1.union([3, 4, 5, 6], sort=False).sort_values(),
1650
- almost=True,
1651
- )
1652
- self.assert_eq(
1653
- psidx2.union([1, 2, 3, 4], sort=False).sort_values(),
1654
- pidx2.union([1, 2, 3, 4], sort=False).sort_values(),
1655
- almost=True,
1656
- )
1657
- self.assert_eq(
1658
- psidx1.union(ps.Series([3, 4, 5, 6]), sort=False).sort_values(),
1659
- pidx1.union(pd.Series([3, 4, 5, 6]), sort=False).sort_values(),
1660
- almost=True,
1661
- )
1662
- self.assert_eq(
1663
- psidx2.union(ps.Series([1, 2, 3, 4]), sort=False).sort_values(),
1664
- pidx2.union(pd.Series([1, 2, 3, 4]), sort=False).sort_values(),
1665
- almost=True,
1666
- )
1667
-
1668
- pidx1 = pd.Index([1, 2, 3, 4, 3, 4, 3, 4])
1669
- pidx2 = pd.Index([3, 4, 3, 4, 5, 6])
1670
- psidx1 = ps.from_pandas(pidx1)
1671
- psidx2 = ps.from_pandas(pidx2)
1672
-
1673
- self.assert_eq(psidx1.union(psidx2), pidx1.union(pidx2))
1674
- self.assert_eq(
1675
- psidx1.union([3, 4, 3, 3, 5, 6]), pidx1.union([3, 4, 3, 4, 5, 6]), almost=True
1676
- )
1677
- self.assert_eq(
1678
- psidx1.union(ps.Series([3, 4, 3, 3, 5, 6])),
1679
- pidx1.union(pd.Series([3, 4, 3, 4, 5, 6])),
1680
- almost=True,
1681
- )
1682
-
1683
- # Manually create the expected result here since there is a bug in Index.union
1684
- # dropping duplicated values in pandas < 1.3.
1685
- expected = pd.Index([1, 2, 3, 3, 3, 4, 4, 4, 5, 6])
1686
- self.assert_eq(psidx2.union(psidx1), expected)
1687
- self.assert_eq(
1688
- psidx2.union([1, 2, 3, 4, 3, 4, 3, 4]),
1689
- expected,
1690
- almost=True,
1691
- )
1692
- self.assert_eq(
1693
- psidx2.union(ps.Series([1, 2, 3, 4, 3, 4, 3, 4])),
1694
- expected,
1695
- almost=True,
1696
- )
1697
-
1698
- # MultiIndex
1699
- pmidx1 = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("x", "a"), ("x", "b")])
1700
- pmidx2 = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("x", "c"), ("x", "d")])
1701
- pmidx3 = pd.MultiIndex.from_tuples([(1, 1), (1, 2), (1, 3), (1, 4), (1, 3), (1, 4)])
1702
- pmidx4 = pd.MultiIndex.from_tuples([(1, 3), (1, 4), (1, 5), (1, 6)])
1703
- psmidx1 = ps.from_pandas(pmidx1)
1704
- psmidx2 = ps.from_pandas(pmidx2)
1705
- psmidx3 = ps.from_pandas(pmidx3)
1706
- psmidx4 = ps.from_pandas(pmidx4)
1707
-
1708
- # Manually create the expected result here since there is a bug in MultiIndex.union
1709
- # dropping duplicated values in pandas < 1.3.
1710
- expected = pd.MultiIndex.from_tuples(
1711
- [("x", "a"), ("x", "a"), ("x", "b"), ("x", "b"), ("x", "c"), ("x", "d")]
1712
- )
1713
- self.assert_eq(psmidx1.union(psmidx2), expected)
1714
- self.assert_eq(psmidx2.union(psmidx1), expected)
1715
- self.assert_eq(
1716
- psmidx1.union([("x", "a"), ("x", "b"), ("x", "c"), ("x", "d")]),
1717
- expected,
1718
- )
1719
- self.assert_eq(
1720
- psmidx2.union([("x", "a"), ("x", "b"), ("x", "a"), ("x", "b")]),
1721
- expected,
1722
- )
1723
-
1724
- expected = pd.MultiIndex.from_tuples(
1725
- [(1, 1), (1, 2), (1, 3), (1, 3), (1, 4), (1, 4), (1, 5), (1, 6)]
1726
- )
1727
- self.assert_eq(psmidx3.union(psmidx4), expected)
1728
- self.assert_eq(psmidx4.union(psmidx3), expected)
1729
- self.assert_eq(
1730
- psmidx3.union([(1, 3), (1, 4), (1, 5), (1, 6)]),
1731
- expected,
1732
- )
1733
- self.assert_eq(
1734
- psmidx4.union([(1, 1), (1, 2), (1, 3), (1, 4), (1, 3), (1, 4)]),
1735
- expected,
1736
- )
1737
-
1738
- # Testing if the result is correct after sort=False.
1739
- # Manually create the expected result here since there is a bug in MultiIndex.union
1740
- # dropping duplicated values in pandas < 1.3.
1741
- expected = pd.MultiIndex.from_tuples(
1742
- [("x", "a"), ("x", "a"), ("x", "b"), ("x", "b"), ("x", "c"), ("x", "d")]
1743
- )
1744
- self.assert_eq(psmidx1.union(psmidx2, sort=False).sort_values(), expected)
1745
- self.assert_eq(psmidx2.union(psmidx1, sort=False).sort_values(), expected)
1746
- self.assert_eq(
1747
- psmidx1.union(
1748
- [("x", "a"), ("x", "b"), ("x", "c"), ("x", "d")], sort=False
1749
- ).sort_values(),
1750
- expected,
1751
- )
1752
- self.assert_eq(
1753
- psmidx2.union(
1754
- [("x", "a"), ("x", "b"), ("x", "a"), ("x", "b")], sort=False
1755
- ).sort_values(),
1756
- expected,
1757
- )
1758
-
1759
- expected = pd.MultiIndex.from_tuples(
1760
- [(1, 1), (1, 2), (1, 3), (1, 3), (1, 4), (1, 4), (1, 5), (1, 6)]
1761
- )
1762
- self.assert_eq(psmidx3.union(psmidx4, sort=False).sort_values(), expected)
1763
- self.assert_eq(psmidx4.union(psmidx3, sort=False).sort_values(), expected)
1764
- self.assert_eq(
1765
- psmidx3.union([(1, 3), (1, 4), (1, 5), (1, 6)], sort=False).sort_values(), expected
1766
- )
1767
- self.assert_eq(
1768
- psmidx4.union(
1769
- [(1, 1), (1, 2), (1, 3), (1, 4), (1, 3), (1, 4)], sort=False
1770
- ).sort_values(),
1771
- expected,
1772
- )
1773
-
1774
- self.assertRaises(NotImplementedError, lambda: psidx1.union(psmidx1))
1775
- self.assertRaises(TypeError, lambda: psmidx1.union(psidx1))
1776
- self.assertRaises(TypeError, lambda: psmidx1.union(["x", "a"]))
1777
- self.assertRaises(ValueError, lambda: psidx1.union(ps.range(2)))
1778
-
1779
- def test_take(self):
1780
- # Index
1781
- pidx = pd.Index([100, 200, 300, 400, 500], name="Koalas")
1782
- psidx = ps.from_pandas(pidx)
1783
-
1784
- self.assert_eq(psidx.take([0, 2, 4]).sort_values(), pidx.take([0, 2, 4]).sort_values())
1785
- self.assert_eq(
1786
- psidx.take(range(0, 5, 2)).sort_values(), pidx.take(range(0, 5, 2)).sort_values()
1787
- )
1788
- self.assert_eq(psidx.take([-4, -2, 0]).sort_values(), pidx.take([-4, -2, 0]).sort_values())
1789
- self.assert_eq(
1790
- psidx.take(range(-4, 1, 2)).sort_values(), pidx.take(range(-4, 1, 2)).sort_values()
1791
- )
1792
-
1793
- # MultiIndex
1794
- pmidx = pd.MultiIndex.from_tuples(
1795
- [("x", "a"), ("x", "b"), ("x", "c")], names=["hello", "Koalas"]
1796
- )
1797
- psmidx = ps.from_pandas(pmidx)
1798
-
1799
- self.assert_eq(psmidx.take([0, 2]).sort_values(), pmidx.take([0, 2]).sort_values())
1800
- self.assert_eq(
1801
- psmidx.take(range(0, 4, 2)).sort_values(), pmidx.take(range(0, 4, 2)).sort_values()
1802
- )
1803
- self.assert_eq(psmidx.take([-2, 0]).sort_values(), pmidx.take([-2, 0]).sort_values())
1804
- self.assert_eq(
1805
- psmidx.take(range(-2, 1, 2)).sort_values(), pmidx.take(range(-2, 1, 2)).sort_values()
1806
- )
1807
-
1808
- # Checking the type of indices.
1809
- self.assertRaises(TypeError, lambda: psidx.take(1))
1810
- self.assertRaises(TypeError, lambda: psidx.take("1"))
1811
- self.assertRaises(TypeError, lambda: psidx.take({1, 2}))
1812
- self.assertRaises(TypeError, lambda: psidx.take({1: None, 2: None}))
1813
- self.assertRaises(TypeError, lambda: psmidx.take(1))
1814
- self.assertRaises(TypeError, lambda: psmidx.take("1"))
1815
- self.assertRaises(TypeError, lambda: psmidx.take({1, 2}))
1816
- self.assertRaises(TypeError, lambda: psmidx.take({1: None, 2: None}))
1817
-
1818
- def test_index_get_level_values(self):
1819
- pidx = pd.Index([1, 2, 3], name="ks")
1820
- psidx = ps.from_pandas(pidx)
1821
-
1822
- for level in [0, "ks"]:
1823
- self.assert_eq(psidx.get_level_values(level), pidx.get_level_values(level))
1824
-
1825
- def test_multiindex_get_level_values(self):
1826
- pmidx = pd.MultiIndex.from_tuples([("a", "d"), ("b", "e"), ("c", "f")])
1827
- pmidx.names = ["level_1", "level_2"]
1828
- psmidx = ps.from_pandas(pmidx)
1829
-
1830
- for level in [0, 1, "level_1", "level_2"]:
1831
- self.assert_eq(psmidx.get_level_values(level), pmidx.get_level_values(level))
1832
-
1833
- def test_index_get_level_number(self):
1834
- # name of two levels are the same, which is None
1835
- psdf = ps.DataFrame({"a": [1, 2, 3]}, index=[list("aac"), list("ddf")])
1836
- with self.assertRaisesRegex(
1837
- ValueError, "The name None occurs multiple times, use a level number"
1838
- ):
1839
- psdf.index._get_level_number(None)
1840
-
1841
- mi = pd.MultiIndex.from_arrays((list("abc"), list("def")))
1842
- mi.names = ["level_1", "level_2"]
1843
- psdf = ps.DataFrame({"a": [1, 2, 3]}, index=mi)
1844
-
1845
- # level is not int and not in the level name list
1846
- with self.assertRaisesRegex(KeyError, "Level lv_3 not found"):
1847
- psdf.index._get_level_number("lv_3")
1848
-
1849
- # level is int, but an invalid negative number
1850
- with self.assertRaisesRegex(IndexError, "Too many levels: Index has only"):
1851
- psdf.index._get_level_number(-3)
1852
-
1853
- # level is int, but an invalid positive number
1854
- with self.assertRaisesRegex(IndexError, "Too many levels: Index has only"):
1855
- psdf.index._get_level_number(3)
1856
-
1857
- # Correct and valid inputs in numbers
1858
- level_number = [-2, -1, 0, 1]
1859
- outputs = [0, 1, 0, 1]
1860
-
1861
- for lv, output in zip(level_number, outputs):
1862
- self.assertEqual(output, psdf.index._get_level_number(lv))
1863
-
1864
- # Valid inputs as level names
1865
- level_names = ["level_1", "level_2"]
1866
- outputs = [0, 1]
1867
-
1868
- for lv, output in zip(level_names, outputs):
1869
- self.assertEqual(output, psdf.index._get_level_number(lv))
1870
-
1871
- def test_holds_integer(self):
1872
- pidx = pd.Index([1, 2, 3, 4])
1873
- psidx = ps.from_pandas(pidx)
1874
- self.assert_eq(pidx.holds_integer(), psidx.holds_integer())
1875
-
1876
- pidx = pd.Index([1.1, 2.2, 3.3, 4.4])
1877
- psidx = ps.from_pandas(pidx)
1878
- self.assert_eq(pidx.holds_integer(), psidx.holds_integer())
1879
-
1880
- pidx = pd.Index(["A", "B", "C", "D"])
1881
- psidx = ps.from_pandas(pidx)
1882
- self.assert_eq(pidx.holds_integer(), psidx.holds_integer())
1883
-
1884
- # MultiIndex
1885
- pmidx = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "a")])
1886
- psmidx = ps.from_pandas(pmidx)
1887
- self.assert_eq(pmidx.holds_integer(), psmidx.holds_integer())
1888
-
1889
- pmidx = pd.MultiIndex.from_tuples([(10, 1), (10, 2), (20, 1)])
1890
- psmidx = ps.from_pandas(pmidx)
1891
- self.assert_eq(pmidx.holds_integer(), psmidx.holds_integer())
1892
-
1893
- def test_abs(self):
1894
- pidx = pd.Index([-2, -1, 0, 1])
1895
- psidx = ps.from_pandas(pidx)
1896
-
1897
- self.assert_eq(abs(pidx), abs(psidx))
1898
- self.assert_eq(np.abs(pidx), np.abs(psidx))
1899
-
1900
- psidx = ps.MultiIndex.from_tuples([(1, 2)], names=["level1", "level2"])
1901
- with self.assertRaisesRegex(TypeError, "perform __abs__ with this index"):
1902
- abs(psidx)
1903
-
1904
- def test_hasnans(self):
1905
- # BooleanType
1906
- pidx = pd.Index([True, False, True, True])
1907
- psidx = ps.from_pandas(pidx)
1908
- self.assert_eq(pidx.hasnans, psidx.hasnans)
1909
-
1910
- pidx = pd.Index([True, False, np.nan, True])
1911
- psidx = ps.from_pandas(pidx)
1912
- self.assert_eq(pidx.hasnans, psidx.hasnans)
1913
-
1914
- # TimestampType
1915
- pser = pd.Series([pd.Timestamp("2020-07-30") for _ in range(3)])
1916
- psser = ps.from_pandas(pser)
1917
- self.assert_eq(pser.hasnans, psser.hasnans)
1918
-
1919
- pser = pd.Series([pd.Timestamp("2020-07-30"), np.nan, pd.Timestamp("2020-07-30")])
1920
- psser = ps.from_pandas(pser)
1921
- self.assert_eq(pser.hasnans, psser.hasnans)
1922
-
1923
- # empty
1924
- pidx = pd.Index([])
1925
- psidx = ps.from_pandas(pidx)
1926
- self.assert_eq(pidx.hasnans, psidx.hasnans)
1927
-
1928
- # Not supported for MultiIndex
1929
- psmidx = ps.Index([("a", 1), ("b", 2)])
1930
- self.assertRaises(NotImplementedError, lambda: psmidx.hasnans())
1931
-
1932
- @unittest.skipIf(
1933
- LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
1934
- "TODO(SPARK-43607): Enable IndexesTests.test_intersection for pandas 2.0.0.",
1935
- )
1936
- def test_intersection(self):
1937
- pidx = pd.Index([1, 2, 3, 4], name="Koalas")
1938
- psidx = ps.from_pandas(pidx)
1939
-
1940
- # other = Index
1941
- pidx_other = pd.Index([3, 4, 5, 6], name="Koalas")
1942
- psidx_other = ps.from_pandas(pidx_other)
1943
- self.assert_eq(pidx.intersection(pidx_other), psidx.intersection(psidx_other).sort_values())
1944
- self.assert_eq(
1945
- (pidx + 1).intersection(pidx_other), (psidx + 1).intersection(psidx_other).sort_values()
1946
- )
1947
- # Deprecated case, but adding to track if pandas stop supporting intersection
1948
- # as a set operation. It should work fine until stop supporting anyway.
1949
- # No longer supported from pandas 2.0.0.
1950
- if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
1951
- self.assert_eq(
1952
- (psidx & psidx_other).sort_values(), ps.Index([3, 1, 7, 1], dtype="int64")
1953
- )
1954
- else:
1955
- self.assert_eq(pidx & pidx_other, (psidx & psidx_other).sort_values())
1956
-
1957
- pidx_other_different_name = pd.Index([3, 4, 5, 6], name="Databricks")
1958
- psidx_other_different_name = ps.from_pandas(pidx_other_different_name)
1959
- self.assert_eq(
1960
- pidx.intersection(pidx_other_different_name),
1961
- psidx.intersection(psidx_other_different_name).sort_values(),
1962
- )
1963
- self.assert_eq(
1964
- (pidx + 1).intersection(pidx_other_different_name),
1965
- (psidx + 1).intersection(psidx_other_different_name).sort_values(),
1966
- )
1967
-
1968
- pidx_other_from_frame = pd.DataFrame({"a": [3, 4, 5, 6]}).set_index("a").index
1969
- psidx_other_from_frame = ps.from_pandas(pidx_other_from_frame)
1970
- self.assert_eq(
1971
- pidx.intersection(pidx_other_from_frame),
1972
- psidx.intersection(psidx_other_from_frame).sort_values(),
1973
- )
1974
- self.assert_eq(
1975
- (pidx + 1).intersection(pidx_other_from_frame),
1976
- (psidx + 1).intersection(psidx_other_from_frame).sort_values(),
1977
- )
1978
-
1979
- # other = MultiIndex
1980
- pmidx = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
1981
- psmidx = ps.from_pandas(pmidx)
1982
- if LooseVersion(pd.__version__) < LooseVersion("1.2.0"):
1983
- self.assert_eq(
1984
- psidx.intersection(psmidx).sort_values(),
1985
- psidx._psdf.head(0).index.rename(None),
1986
- almost=True,
1987
- )
1988
- self.assert_eq(
1989
- (psidx + 1).intersection(psmidx).sort_values(),
1990
- psidx._psdf.head(0).index.rename(None),
1991
- almost=True,
1992
- )
1993
- else:
1994
- self.assert_eq(
1995
- pidx.intersection(pmidx), psidx.intersection(psmidx).sort_values(), almost=True
1996
- )
1997
- self.assert_eq(
1998
- (pidx + 1).intersection(pmidx),
1999
- (psidx + 1).intersection(psmidx).sort_values(),
2000
- almost=True,
2001
- )
2002
-
2003
- # other = Series
2004
- pser = pd.Series([3, 4, 5, 6])
2005
- psser = ps.from_pandas(pser)
2006
- if LooseVersion(pd.__version__) < LooseVersion("1.2.0"):
2007
- self.assert_eq(psidx.intersection(psser).sort_values(), ps.Index([3, 4], name="Koalas"))
2008
- self.assert_eq(
2009
- (psidx + 1).intersection(psser).sort_values(), ps.Index([3, 4, 5], name="Koalas")
2010
- )
2011
- else:
2012
- self.assert_eq(pidx.intersection(pser), psidx.intersection(psser).sort_values())
2013
- self.assert_eq(
2014
- (pidx + 1).intersection(pser), (psidx + 1).intersection(psser).sort_values()
2015
- )
2016
-
2017
- pser_different_name = pd.Series([3, 4, 5, 6], name="Databricks")
2018
- psser_different_name = ps.from_pandas(pser_different_name)
2019
- if LooseVersion(pd.__version__) < LooseVersion("1.2.0"):
2020
- self.assert_eq(
2021
- psidx.intersection(psser_different_name).sort_values(),
2022
- ps.Index([3, 4], name="Koalas"),
2023
- )
2024
- self.assert_eq(
2025
- (psidx + 1).intersection(psser_different_name).sort_values(),
2026
- ps.Index([3, 4, 5], name="Koalas"),
2027
- )
2028
- else:
2029
- self.assert_eq(
2030
- pidx.intersection(pser_different_name),
2031
- psidx.intersection(psser_different_name).sort_values(),
2032
- )
2033
- self.assert_eq(
2034
- (pidx + 1).intersection(pser_different_name),
2035
- (psidx + 1).intersection(psser_different_name).sort_values(),
2036
- )
2037
-
2038
- others = ([3, 4, 5, 6], (3, 4, 5, 6), {3: None, 4: None, 5: None, 6: None})
2039
- for other in others:
2040
- if LooseVersion(pd.__version__) < LooseVersion("1.2.0"):
2041
- self.assert_eq(
2042
- psidx.intersection(other).sort_values(), ps.Index([3, 4], name="Koalas")
2043
- )
2044
- self.assert_eq(
2045
- (psidx + 1).intersection(other).sort_values(),
2046
- ps.Index([3, 4, 5], name="Koalas"),
2047
- )
2048
- else:
2049
- self.assert_eq(pidx.intersection(other), psidx.intersection(other).sort_values())
2050
- self.assert_eq(
2051
- (pidx + 1).intersection(other), (psidx + 1).intersection(other).sort_values()
2052
- )
2053
-
2054
- # MultiIndex / other = Index
2055
- self.assert_eq(
2056
- pmidx.intersection(pidx), psmidx.intersection(psidx).sort_values(), almost=True
2057
- )
2058
- self.assert_eq(
2059
- pmidx.intersection(pidx_other_from_frame),
2060
- psmidx.intersection(psidx_other_from_frame).sort_values(),
2061
- almost=True,
2062
- )
2063
-
2064
- # MultiIndex / other = MultiIndex
2065
- pmidx_other = pd.MultiIndex.from_tuples([("c", "z"), ("d", "w")])
2066
- psmidx_other = ps.from_pandas(pmidx_other)
2067
- self.assert_eq(
2068
- pmidx.intersection(pmidx_other), psmidx.intersection(psmidx_other).sort_values()
2069
- )
2070
-
2071
- # MultiIndex / other = list
2072
- other = [("c", "z"), ("d", "w")]
2073
- self.assert_eq(pmidx.intersection(other), psmidx.intersection(other).sort_values())
2074
-
2075
- # MultiIndex / other = tuple
2076
- other = (("c", "z"), ("d", "w"))
2077
- self.assert_eq(pmidx.intersection(other), psmidx.intersection(other).sort_values())
2078
-
2079
- # MultiIndex / other = dict
2080
- other = {("c", "z"): None, ("d", "w"): None}
2081
- self.assert_eq(pmidx.intersection(other), psmidx.intersection(other).sort_values())
2082
-
2083
- # MultiIndex with different names.
2084
- pmidx1 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")], names=["X", "Y"])
2085
- pmidx2 = pd.MultiIndex.from_tuples([("c", "z"), ("d", "w")], names=["A", "B"])
2086
- psmidx1 = ps.from_pandas(pmidx1)
2087
- psmidx2 = ps.from_pandas(pmidx2)
2088
- self.assert_eq(pmidx1.intersection(pmidx2), psmidx1.intersection(psmidx2).sort_values())
2089
-
2090
- with self.assertRaisesRegex(TypeError, "Input must be Index or array-like"):
2091
- psidx.intersection(4)
2092
- with self.assertRaisesRegex(TypeError, "other must be a MultiIndex or a list of tuples"):
2093
- psmidx.intersection(4)
2094
- with self.assertRaisesRegex(TypeError, "other must be a MultiIndex or a list of tuples"):
2095
- psmidx.intersection(ps.Series([3, 4, 5, 6]))
2096
- with self.assertRaisesRegex(TypeError, "other must be a MultiIndex or a list of tuples"):
2097
- psmidx.intersection([("c", "z"), ["d", "w"]])
2098
- with self.assertRaisesRegex(ValueError, "Index data must be 1-dimensional"):
2099
- psidx.intersection(ps.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}))
2100
- with self.assertRaisesRegex(ValueError, "Index data must be 1-dimensional"):
2101
- psmidx.intersection(ps.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}))
2102
- # other = list of tuple
2103
- with self.assertRaisesRegex(ValueError, "Names should be list-like for a MultiIndex"):
2104
- psidx.intersection([(1, 2), (3, 4)])
2105
-
2106
- def test_item(self):
2107
- pidx = pd.Index([10])
2108
- psidx = ps.from_pandas(pidx)
2109
-
2110
- self.assert_eq(pidx.item(), psidx.item())
2111
-
2112
- # with timestamp
2113
- pidx = pd.Index([datetime(1990, 3, 9)])
2114
- psidx = ps.from_pandas(pidx)
2115
-
2116
- self.assert_eq(pidx.item(), psidx.item())
2117
-
2118
- # MultiIndex
2119
- pmidx = pd.MultiIndex.from_tuples([("a", "x")])
2120
- psmidx = ps.from_pandas(pmidx)
2121
-
2122
- self.assert_eq(pmidx.item(), psmidx.item())
2123
-
2124
- # MultiIndex with timestamp
2125
- pmidx = pd.MultiIndex.from_tuples([(datetime(1990, 3, 9), datetime(2019, 8, 15))])
2126
- psmidx = ps.from_pandas(pmidx)
2127
-
2128
- self.assert_eq(pidx.item(), psidx.item())
2129
-
2130
- err_msg = "can only convert an array of size 1 to a Python scalar"
2131
- with self.assertRaisesRegex(ValueError, err_msg):
2132
- ps.Index([10, 20]).item()
2133
- with self.assertRaisesRegex(ValueError, err_msg):
2134
- ps.MultiIndex.from_tuples([("a", "x"), ("b", "y")]).item()
2135
-
2136
- def test_inferred_type(self):
2137
- # Integer
2138
- pidx = pd.Index([1, 2, 3])
2139
- psidx = ps.from_pandas(pidx)
2140
- self.assert_eq(pidx.inferred_type, psidx.inferred_type)
2141
-
2142
- # Floating
2143
- pidx = pd.Index([1.0, 2.0, 3.0])
2144
- psidx = ps.from_pandas(pidx)
2145
- self.assert_eq(pidx.inferred_type, psidx.inferred_type)
2146
-
2147
- # String
2148
- pidx = pd.Index(["a", "b", "c"])
2149
- psidx = ps.from_pandas(pidx)
2150
- self.assert_eq(pidx.inferred_type, psidx.inferred_type)
2151
-
2152
- # Boolean
2153
- pidx = pd.Index([True, False, True, False])
2154
- psidx = ps.from_pandas(pidx)
2155
- self.assert_eq(pidx.inferred_type, psidx.inferred_type)
2156
-
2157
- # MultiIndex
2158
- pmidx = pd.MultiIndex.from_tuples([("a", "x")])
2159
- psmidx = ps.from_pandas(pmidx)
2160
- self.assert_eq(pmidx.inferred_type, psmidx.inferred_type)
2161
-
2162
- def test_multi_index_from_index(self):
2163
- tuples = [(1, "red"), (1, "blue"), (2, "red"), (2, "blue")]
2164
- pmidx = pd.Index(tuples)
2165
- psmidx = ps.Index(tuples)
2166
-
2167
- self.assertTrue(isinstance(psmidx, ps.MultiIndex))
2168
- self.assert_eq(pmidx, psmidx)
2169
-
2170
- # Specify the `names`
2171
- # Specify the `names` while Index creating is no longer supported from pandas 2.0.0.
2172
- if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
2173
- pmidx = pd.Index(tuples)
2174
- pmidx.names = ["Hello", "Koalas"]
2175
- psmidx = ps.Index(tuples)
2176
- psmidx.names = ["Hello", "Koalas"]
2177
- else:
2178
- pmidx = pd.Index(tuples, names=["Hello", "Koalas"])
2179
- psmidx = ps.Index(tuples, names=["Hello", "Koalas"])
2180
-
2181
- self.assertTrue(isinstance(psmidx, ps.MultiIndex))
2182
- self.assert_eq(pmidx, psmidx)
2183
-
2184
- def test_multiindex_from_frame(self):
2185
- pdf = pd.DataFrame(
2186
- [["HI", "Temp"], ["HI", "Precip"], ["NJ", "Temp"], ["NJ", "Precip"]], columns=["a", "b"]
2187
- )
2188
- psdf = ps.from_pandas(pdf)
2189
- pidx = pd.MultiIndex.from_frame(pdf)
2190
- psidx = ps.MultiIndex.from_frame(psdf)
2191
-
2192
- self.assert_eq(pidx, psidx)
2193
-
2194
- # Specify `names`
2195
- pidx = pd.MultiIndex.from_frame(pdf, names=["state", "observation"])
2196
- psidx = ps.MultiIndex.from_frame(psdf, names=["state", "observation"])
2197
- self.assert_eq(pidx, psidx)
2198
-
2199
- pidx = pd.MultiIndex.from_frame(pdf, names=("state", "observation"))
2200
- psidx = ps.MultiIndex.from_frame(psdf, names=("state", "observation"))
2201
- self.assert_eq(pidx, psidx)
2202
-
2203
- # MultiIndex columns
2204
- pidx = pd.MultiIndex.from_tuples([("a", "w"), ("b", "x")])
2205
- pdf.columns = pidx
2206
- psdf = ps.from_pandas(pdf)
2207
-
2208
- pidx = pd.MultiIndex.from_frame(pdf)
2209
- psidx = ps.MultiIndex.from_frame(psdf)
2210
-
2211
- self.assert_eq(pidx, psidx)
2212
-
2213
- # tuples for names
2214
- pidx = pd.MultiIndex.from_frame(pdf, names=[("a", "w"), ("b", "x")])
2215
- psidx = ps.MultiIndex.from_frame(psdf, names=[("a", "w"), ("b", "x")])
2216
-
2217
- self.assert_eq(pidx, psidx)
2218
-
2219
- err_msg = "Input must be a DataFrame"
2220
- with self.assertRaisesRegex(TypeError, err_msg):
2221
- ps.MultiIndex.from_frame({"a": [1, 2, 3], "b": [4, 5, 6]})
2222
-
2223
- self.assertRaises(TypeError, lambda: ps.MultiIndex.from_frame(psdf, names="ab"))
2224
-
2225
- # non-string names
2226
- self.assert_eq(
2227
- ps.MultiIndex.from_frame(psdf, names=[0, 1]),
2228
- pd.MultiIndex.from_frame(pdf, names=[0, 1]),
2229
- )
2230
- self.assert_eq(
2231
- ps.MultiIndex.from_frame(psdf, names=[("x", 0), ("y", 1)]),
2232
- pd.MultiIndex.from_frame(pdf, names=[("x", 0), ("y", 1)]),
2233
- )
2234
-
2235
- pdf = pd.DataFrame([["HI", "Temp"], ["HI", "Precip"], ["NJ", "Temp"], ["NJ", "Precip"]])
2236
- psdf = ps.from_pandas(pdf)
2237
- self.assert_eq(ps.MultiIndex.from_frame(psdf), pd.MultiIndex.from_frame(pdf))
2238
-
2239
- def test_is_type_compatible(self):
2240
- data_types = ["integer", "floating", "string", "boolean"]
2241
- # Integer
2242
- pidx = pd.Index([1, 2, 3])
2243
- psidx = ps.from_pandas(pidx)
2244
- # is_type_compatible is removed from pandas 2.0.0.
2245
- if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
2246
- expected_results = [True, False, False, False]
2247
- for data_type, expected_result in zip(data_types, expected_results):
2248
- self.assert_eq(psidx.is_type_compatible(data_type), expected_result)
2249
- else:
2250
- for data_type in data_types:
2251
- self.assert_eq(
2252
- pidx.is_type_compatible(data_type), psidx.is_type_compatible(data_type)
2253
- )
2254
-
2255
- # Floating
2256
- pidx = pd.Index([1.0, 2.0, 3.0])
2257
- psidx = ps.from_pandas(pidx)
2258
- # is_type_compatible is removed from pandas 2.0.0.
2259
- if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
2260
- expected_results = [False, True, False, False]
2261
- for data_type, expected_result in zip(data_types, expected_results):
2262
- self.assert_eq(psidx.is_type_compatible(data_type), expected_result)
2263
- else:
2264
- for data_type in data_types:
2265
- self.assert_eq(
2266
- pidx.is_type_compatible(data_type), psidx.is_type_compatible(data_type)
2267
- )
2268
-
2269
- # String
2270
- pidx = pd.Index(["a", "b", "c"])
2271
- psidx = ps.from_pandas(pidx)
2272
- # is_type_compatible is removed from pandas 2.0.0.
2273
- if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
2274
- expected_results = [False, False, True, False]
2275
- for data_type, expected_result in zip(data_types, expected_results):
2276
- self.assert_eq(psidx.is_type_compatible(data_type), expected_result)
2277
- else:
2278
- for data_type in data_types:
2279
- self.assert_eq(
2280
- pidx.is_type_compatible(data_type), psidx.is_type_compatible(data_type)
2281
- )
2282
-
2283
- # Boolean
2284
- pidx = pd.Index([True, False, True, False])
2285
- psidx = ps.from_pandas(pidx)
2286
- # is_type_compatible is removed from pandas 2.0.0.
2287
- if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
2288
- expected_results = [False, False, False, True]
2289
- for data_type, expected_result in zip(data_types, expected_results):
2290
- self.assert_eq(psidx.is_type_compatible(data_type), expected_result)
2291
- else:
2292
- for data_type in data_types:
2293
- self.assert_eq(
2294
- pidx.is_type_compatible(data_type), psidx.is_type_compatible(data_type)
2295
- )
2296
-
2297
- # MultiIndex
2298
- pmidx = pd.MultiIndex.from_tuples([("a", "x")])
2299
- psmidx = ps.from_pandas(pmidx)
2300
- # is_type_compatible is removed from pandas 2.0.0.
2301
- if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
2302
- expected_results = [False, False, False, False]
2303
- for data_type, expected_result in zip(data_types, expected_results):
2304
- self.assert_eq(psmidx.is_type_compatible(data_type), expected_result)
2305
- else:
2306
- for data_type in data_types:
2307
- self.assert_eq(
2308
- pmidx.is_type_compatible(data_type), psmidx.is_type_compatible(data_type)
2309
- )
2310
-
2311
- def test_asi8(self):
2312
- # Integer
2313
- pidx = pd.Index([1, 2, 3])
2314
- psidx = ps.from_pandas(pidx)
2315
- # asi8 is removed from pandas 2.0.0.
2316
- if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
2317
- self.assert_eq(np.array(pidx), psidx.asi8)
2318
- self.assert_eq(np.array(pidx.astype("int")), psidx.astype("int").asi8)
2319
- self.assert_eq(np.array(pidx.astype("int16")), psidx.astype("int16").asi8)
2320
- self.assert_eq(np.array(pidx.astype("int8")), psidx.astype("int8").asi8)
2321
- else:
2322
- self.assert_eq(pidx.asi8, psidx.asi8)
2323
- self.assert_eq(pidx.astype("int").asi8, psidx.astype("int").asi8)
2324
- self.assert_eq(pidx.astype("int16").asi8, psidx.astype("int16").asi8)
2325
- self.assert_eq(pidx.astype("int8").asi8, psidx.astype("int8").asi8)
2326
-
2327
- # Integer with missing value
2328
- pidx = pd.Index([1, 2, None, 4, 5])
2329
- psidx = ps.from_pandas(pidx)
2330
- if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
2331
- self.assert_eq(None, psidx.asi8)
2332
- else:
2333
- self.assert_eq(pidx.asi8, psidx.asi8)
2334
-
2335
- # Datetime
2336
- pidx = pd.date_range(end="1/1/2018", periods=3)
2337
- psidx = ps.from_pandas(pidx)
2338
- if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
2339
- self.assert_eq(
2340
- np.array([1514592000000000000, 1514678400000000000, 1514764800000000000]),
2341
- psidx.asi8,
2342
- )
2343
- else:
2344
- self.assert_eq(pidx.asi8, psidx.asi8)
2345
-
2346
- # Floating
2347
- pidx = pd.Index([1.0, 2.0, 3.0])
2348
- psidx = ps.from_pandas(pidx)
2349
- if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
2350
- self.assert_eq(None, psidx.asi8)
2351
- else:
2352
- self.assert_eq(pidx.asi8, psidx.asi8)
2353
-
2354
- # String
2355
- pidx = pd.Index(["a", "b", "c"])
2356
- psidx = ps.from_pandas(pidx)
2357
- if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
2358
- self.assert_eq(None, psidx.asi8)
2359
- else:
2360
- self.assert_eq(pidx.asi8, psidx.asi8)
2361
-
2362
- # Boolean
2363
- pidx = pd.Index([True, False, True, False])
2364
- psidx = ps.from_pandas(pidx)
2365
- if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
2366
- self.assert_eq(None, psidx.asi8)
2367
- else:
2368
- self.assert_eq(pidx.asi8, psidx.asi8)
2369
-
2370
- # MultiIndex
2371
- pmidx = pd.MultiIndex.from_tuples([(1, 2)])
2372
- psmidx = ps.from_pandas(pmidx)
2373
- if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
2374
- self.assert_eq(None, psmidx.asi8)
2375
- else:
2376
- self.assert_eq(pmidx.asi8, psmidx.asi8)
2377
-
2378
- def test_index_is_unique(self):
2379
- indexes = [("a", "b", "c"), ("a", "a", "c"), (1, 3, 3), (1, 2, 3)]
2380
- names = [None, "ks", "ks", None]
2381
- is_uniq = [True, False, False, True]
2382
-
2383
- for idx, name, expected in zip(indexes, names, is_uniq):
2384
- pdf = pd.DataFrame({"a": [1, 2, 3]}, index=pd.Index(idx, name=name))
2385
- psdf = ps.from_pandas(pdf)
2386
-
2387
- self.assertEqual(psdf.index.is_unique, expected)
2388
-
2389
- def test_multiindex_is_unique(self):
2390
- indexes = [
2391
- [list("abc"), list("edf")],
2392
- [list("aac"), list("edf")],
2393
- [list("aac"), list("eef")],
2394
- [[1, 4, 4], [4, 6, 6]],
2395
- ]
2396
- is_uniq = [True, True, False, False]
2397
-
2398
- for idx, expected in zip(indexes, is_uniq):
2399
- pdf = pd.DataFrame({"a": [1, 2, 3]}, index=idx)
2400
- psdf = ps.from_pandas(pdf)
2401
-
2402
- self.assertEqual(psdf.index.is_unique, expected)
2403
-
2404
- def test_view(self):
2405
- pidx = pd.Index([1, 2, 3, 4], name="Koalas")
2406
- psidx = ps.from_pandas(pidx)
2407
-
2408
- self.assert_eq(pidx.view(), psidx.view())
2409
-
2410
- # MultiIndex
2411
- pmidx = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
2412
- psmidx = ps.from_pandas(pmidx)
2413
-
2414
- self.assert_eq(pmidx.view(), psmidx.view())
2415
-
2416
- def test_insert(self):
2417
- # Integer
2418
- pidx = pd.Index([1, 2, 3], name="Koalas")
2419
- psidx = ps.from_pandas(pidx)
2420
- self.assert_eq(pidx.insert(1, 100), psidx.insert(1, 100))
2421
- self.assert_eq(pidx.insert(-1, 100), psidx.insert(-1, 100))
2422
- err_msg = "index 100 is out of bounds for axis 0 with size 3"
2423
- with self.assertRaisesRegex(IndexError, err_msg):
2424
- psidx.insert(100, 100)
2425
- err_msg = "index -100 is out of bounds for axis 0 with size 3"
2426
- with self.assertRaisesRegex(IndexError, err_msg):
2427
- psidx.insert(-100, 100)
2428
-
2429
- # Floating
2430
- pidx = pd.Index([1.0, 2.0, 3.0], name="Koalas")
2431
- psidx = ps.from_pandas(pidx)
2432
- self.assert_eq(pidx.insert(1, 100.0), psidx.insert(1, 100.0))
2433
- self.assert_eq(pidx.insert(-1, 100.0), psidx.insert(-1, 100.0))
2434
- err_msg = "index 100 is out of bounds for axis 0 with size 3"
2435
- with self.assertRaisesRegex(IndexError, err_msg):
2436
- psidx.insert(100, 100)
2437
- err_msg = "index -100 is out of bounds for axis 0 with size 3"
2438
- with self.assertRaisesRegex(IndexError, err_msg):
2439
- psidx.insert(-100, 100)
2440
-
2441
- # String
2442
- pidx = pd.Index(["a", "b", "c"], name="Koalas")
2443
- psidx = ps.from_pandas(pidx)
2444
- self.assert_eq(pidx.insert(1, "x"), psidx.insert(1, "x"))
2445
- self.assert_eq(pidx.insert(-1, "x"), psidx.insert(-1, "x"))
2446
- err_msg = "index 100 is out of bounds for axis 0 with size 3"
2447
- with self.assertRaisesRegex(IndexError, err_msg):
2448
- psidx.insert(100, "x")
2449
- err_msg = "index -100 is out of bounds for axis 0 with size 3"
2450
- with self.assertRaisesRegex(IndexError, err_msg):
2451
- psidx.insert(-100, "x")
2452
-
2453
- # Boolean
2454
- pidx = pd.Index([True, False, True, False], name="Koalas")
2455
- psidx = ps.from_pandas(pidx)
2456
- self.assert_eq(pidx.insert(1, True), psidx.insert(1, True))
2457
- self.assert_eq(pidx.insert(-1, True), psidx.insert(-1, True))
2458
- err_msg = "index 100 is out of bounds for axis 0 with size 4"
2459
- with self.assertRaisesRegex(IndexError, err_msg):
2460
- psidx.insert(100, True)
2461
- err_msg = "index -100 is out of bounds for axis 0 with size 4"
2462
- with self.assertRaisesRegex(IndexError, err_msg):
2463
- psidx.insert(-100, True)
2464
-
2465
- # MultiIndex
2466
- pmidx = pd.MultiIndex.from_tuples(
2467
- [("a", "x"), ("b", "y"), ("c", "z")], names=["Hello", "Koalas"]
2468
- )
2469
- psmidx = ps.from_pandas(pmidx)
2470
- self.assert_eq(pmidx.insert(2, ("h", "j")), psmidx.insert(2, ("h", "j")))
2471
- self.assert_eq(pmidx.insert(-1, ("h", "j")), psmidx.insert(-1, ("h", "j")))
2472
-
2473
- err_msg = "index 4 is out of bounds for axis 0 with size 3"
2474
- with self.assertRaisesRegex(IndexError, err_msg):
2475
- psmidx.insert(4, ("b", "y"))
2476
-
2477
- err_msg = "index -4 is out of bounds for axis 0 with size 3"
2478
- with self.assertRaisesRegex(IndexError, err_msg):
2479
- psmidx.insert(-4, ("b", "y"))
2480
-
2481
- def test_astype(self):
2482
- pidx = pd.Index([10, 20, 15, 30, 45], name="x")
2483
- psidx = ps.Index(pidx)
2484
-
2485
- self.assert_eq(psidx.astype(int), pidx.astype(int))
2486
- self.assert_eq(psidx.astype(np.int8), pidx.astype(np.int8))
2487
- self.assert_eq(psidx.astype(np.int16), pidx.astype(np.int16))
2488
- self.assert_eq(psidx.astype(np.int32), pidx.astype(np.int32))
2489
- self.assert_eq(psidx.astype(np.int64), pidx.astype(np.int64))
2490
- self.assert_eq(psidx.astype(np.byte), pidx.astype(np.byte))
2491
- self.assert_eq(psidx.astype("int"), pidx.astype("int"))
2492
- self.assert_eq(psidx.astype("int8"), pidx.astype("int8"))
2493
- self.assert_eq(psidx.astype("int16"), pidx.astype("int16"))
2494
- self.assert_eq(psidx.astype("int32"), pidx.astype("int32"))
2495
- self.assert_eq(psidx.astype("int64"), pidx.astype("int64"))
2496
- self.assert_eq(psidx.astype("b"), pidx.astype("b"))
2497
- self.assert_eq(psidx.astype("byte"), pidx.astype("byte"))
2498
- self.assert_eq(psidx.astype("i"), pidx.astype("i"))
2499
- self.assert_eq(psidx.astype("long"), pidx.astype("long"))
2500
- self.assert_eq(psidx.astype("short"), pidx.astype("short"))
2501
- self.assert_eq(psidx.astype(np.float32), pidx.astype(np.float32))
2502
- self.assert_eq(psidx.astype(np.float64), pidx.astype(np.float64))
2503
- self.assert_eq(psidx.astype("float"), pidx.astype("float"))
2504
- self.assert_eq(psidx.astype("float32"), pidx.astype("float32"))
2505
- self.assert_eq(psidx.astype("float64"), pidx.astype("float64"))
2506
- self.assert_eq(psidx.astype("double"), pidx.astype("double"))
2507
- self.assert_eq(psidx.astype("f"), pidx.astype("f"))
2508
- self.assert_eq(psidx.astype(bool), pidx.astype(bool))
2509
- self.assert_eq(psidx.astype("bool"), pidx.astype("bool"))
2510
- self.assert_eq(psidx.astype("?"), pidx.astype("?"))
2511
- self.assert_eq(psidx.astype(np.unicode_), pidx.astype(np.unicode_))
2512
- self.assert_eq(psidx.astype("str"), pidx.astype("str"))
2513
- self.assert_eq(psidx.astype("U"), pidx.astype("U"))
2514
-
2515
- pidx = pd.Index([10, 20, 15, 30, 45, None], name="x")
2516
- psidx = ps.Index(pidx)
2517
- if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
2518
- self.assert_eq(psidx.astype(bool), pidx.astype(bool))
2519
- self.assert_eq(psidx.astype(str), pidx.astype(str))
2520
- else:
2521
- self.assert_eq(
2522
- psidx.astype(bool), ps.Index([True, True, True, True, True, True], name="x")
2523
- )
2524
- self.assert_eq(
2525
- psidx.astype(str),
2526
- ps.Index(["10.0", "20.0", "15.0", "30.0", "45.0", "nan"], name="x"),
2527
- )
2528
-
2529
- pidx = pd.Index(["hi", "hi ", " ", " \t", "", None], name="x")
2530
- psidx = ps.Index(pidx)
2531
-
2532
- self.assert_eq(psidx.astype(bool), pidx.astype(bool))
2533
- self.assert_eq(psidx.astype(str), pidx.astype(str))
2534
-
2535
- pidx = pd.Index([True, False, None], name="x")
2536
- psidx = ps.Index(pidx)
2537
-
2538
- self.assert_eq(psidx.astype(bool), pidx.astype(bool))
2539
-
2540
- pidx = pd.Index(["2020-10-27"], name="x")
2541
- psidx = ps.Index(pidx)
2542
-
2543
- self.assert_eq(psidx.astype("datetime64[ns]"), pidx.astype("datetime64[ns]"))
2544
-
2545
- with self.assertRaisesRegex(TypeError, "not understood"):
2546
- psidx.astype("int63")
2547
-
2548
- def test_to_list(self):
2549
- # Index
2550
- pidx = pd.Index([1, 2, 3, 4, 5])
2551
- psidx = ps.from_pandas(pidx)
2552
- # MultiIndex
2553
- tuples = [(1, "red"), (1, "blue"), (2, "red"), (2, "green")]
2554
- pmidx = pd.MultiIndex.from_tuples(tuples)
2555
- psmidx = ps.from_pandas(pmidx)
2556
-
2557
- self.assert_eq(psidx.tolist(), pidx.tolist())
2558
- self.assert_eq(psmidx.tolist(), pmidx.tolist())
2559
-
2560
- def test_index_ops(self):
2561
- pidx = pd.Index([1, 2, 3, 4, 5])
2562
- psidx = ps.from_pandas(pidx)
2563
-
2564
- self.assert_eq(psidx * 100 + psidx * 10 + psidx, pidx * 100 + pidx * 10 + pidx)
2565
-
2566
- pidx = pd.Index([1, 2, 3, 4, 5], name="a")
2567
- psidx = ps.from_pandas(pidx)
2568
-
2569
- self.assert_eq(psidx * 100 + psidx * 10 + psidx, pidx * 100 + pidx * 10 + pidx)
2570
-
2571
- pdf = pd.DataFrame(
2572
- index=pd.MultiIndex.from_tuples([(1, 2), (3, 4), (5, 6)], names=["a", "b"])
2573
- )
2574
- psdf = ps.from_pandas(pdf)
2575
-
2576
- pidx1 = pdf.index.get_level_values(0)
2577
- pidx2 = pdf.index.get_level_values(1)
2578
- psidx1 = psdf.index.get_level_values(0)
2579
- psidx2 = psdf.index.get_level_values(1)
2580
-
2581
- self.assert_eq(psidx1 * 10 + psidx2, pidx1 * 10 + pidx2)
2582
-
2583
- def test_factorize(self):
2584
- pidx = pd.Index(["a", "b", "a", "b"])
2585
- psidx = ps.from_pandas(pidx)
2586
- pcodes, puniques = pidx.factorize(sort=True)
2587
- kcodes, kuniques = psidx.factorize()
2588
- self.assert_eq(pcodes.tolist(), kcodes.to_list())
2589
- self.assert_eq(puniques, kuniques)
2590
-
2591
- pmidx = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "c")])
2592
- psmidx = ps.from_pandas(pmidx)
2593
-
2594
- self.assertRaises(PandasNotImplementedError, lambda: psmidx.factorize())
2595
-
2596
- def test_map(self):
2597
- pidx = pd.Index([1, 2, 3])
2598
- psidx = ps.from_pandas(pidx)
2599
-
2600
- # Apply dict
2601
- self.assert_eq(
2602
- pidx.map({1: "one", 2: "two", 3: "three"}),
2603
- psidx.map({1: "one", 2: "two", 3: "three"}),
2604
- )
2605
- self.assert_eq(
2606
- pidx.map({1: "one", 2: "two"}),
2607
- psidx.map({1: "one", 2: "two"}),
2608
- )
2609
- self.assert_eq(
2610
- pidx.map({1: "one", 2: "two"}, na_action="ignore"),
2611
- psidx.map({1: "one", 2: "two"}, na_action="ignore"),
2612
- )
2613
- self.assert_eq(
2614
- pidx.map({1: 10, 2: 20}),
2615
- psidx.map({1: 10, 2: 20}),
2616
- )
2617
- self.assert_eq(
2618
- (pidx + 1).map({1: 10, 2: 20}),
2619
- (psidx + 1).map({1: 10, 2: 20}),
2620
- )
2621
-
2622
- # Apply lambda
2623
- self.assert_eq(
2624
- pidx.map(lambda id: id + 1),
2625
- psidx.map(lambda id: id + 1),
2626
- )
2627
- self.assert_eq(
2628
- pidx.map(lambda id: id + 1.1),
2629
- psidx.map(lambda id: id + 1.1),
2630
- )
2631
- self.assert_eq(
2632
- pidx.map(lambda id: "{id} + 1".format(id=id)),
2633
- psidx.map(lambda id: "{id} + 1".format(id=id)),
2634
- )
2635
- self.assert_eq(
2636
- (pidx + 1).map(lambda id: "{id} + 1".format(id=id)),
2637
- (psidx + 1).map(lambda id: "{id} + 1".format(id=id)),
2638
- )
2639
-
2640
- # Apply series
2641
- pser = pd.Series(["one", "two", "three"], index=[1, 2, 3])
2642
- self.assert_eq(
2643
- pidx.map(pser),
2644
- psidx.map(pser),
2645
- )
2646
- pser = pd.Series(["one", "two", "three"])
2647
- self.assert_eq(
2648
- pidx.map(pser),
2649
- psidx.map(pser),
2650
- )
2651
- self.assert_eq(
2652
- pidx.map(pser, na_action="ignore"),
2653
- psidx.map(pser, na_action="ignore"),
2654
- )
2655
- pser = pd.Series([1, 2, 3])
2656
- self.assert_eq(
2657
- pidx.map(pser),
2658
- psidx.map(pser),
2659
- )
2660
- self.assert_eq(
2661
- (pidx + 1).map(pser),
2662
- (psidx + 1).map(pser),
2663
- )
2664
-
2665
- self.assertRaises(
2666
- TypeError,
2667
- lambda: psidx.map({1: 1, 2: 2.0, 3: "three"}),
2668
- )
2669
-
2670
- def test_multiindex_equal_levels(self):
2671
- pmidx1 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
2672
- pmidx2 = pd.MultiIndex.from_tuples([("b", "y"), ("a", "x"), ("c", "z")])
2673
- psmidx1 = ps.from_pandas(pmidx1)
2674
- psmidx2 = ps.from_pandas(pmidx2)
2675
- self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2))
2676
-
2677
- pmidx2 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "j")])
2678
- psmidx2 = ps.from_pandas(pmidx2)
2679
- self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2))
2680
-
2681
- pmidx2 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("a", "x")])
2682
- psmidx2 = ps.from_pandas(pmidx2)
2683
- self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2))
2684
-
2685
- pmidx2 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y")])
2686
- psmidx2 = ps.from_pandas(pmidx2)
2687
- self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2))
2688
-
2689
- pmidx2 = pd.MultiIndex.from_tuples([("a", "y"), ("b", "x"), ("c", "z")])
2690
- psmidx2 = ps.from_pandas(pmidx2)
2691
- self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2))
2692
-
2693
- pmidx1 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z"), ("a", "y")])
2694
- pmidx2 = pd.MultiIndex.from_tuples([("a", "y"), ("b", "x"), ("c", "z"), ("c", "x")])
2695
- psmidx1 = ps.from_pandas(pmidx1)
2696
- psmidx2 = ps.from_pandas(pmidx2)
2697
- self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2))
2698
-
2699
- pmidx1 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
2700
- pmidx2 = pd.MultiIndex.from_tuples([("a", "x", "q"), ("b", "y", "w"), ("c", "z", "e")])
2701
- psmidx1 = ps.from_pandas(pmidx1)
2702
- psmidx2 = ps.from_pandas(pmidx2)
2703
- self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2))
2704
-
2705
- def test_to_numpy(self):
2706
- pidx = pd.Index([1, 2, 3, 4])
2707
- psidx = ps.from_pandas(pidx)
2708
-
2709
- self.assert_eq(pidx.to_numpy(copy=True), psidx.to_numpy(copy=True))
2710
-
2711
- def test_drop_level(self):
2712
- tuples = [(1, "red"), (1, "blue"), (2, "red"), (2, "green")]
2713
- pmidx = pd.MultiIndex.from_tuples(tuples)
2714
- psmidx = ps.from_pandas(pmidx)
2715
-
2716
- with self.assertRaisesRegex(
2717
- IndexError, "Too many levels: Index has only 2 levels, -3 is not a valid level number"
2718
- ):
2719
- psmidx.droplevel(-3)
2720
-
2721
- def test_multi_index_nunique(self):
2722
- tuples = [(1, "red"), (1, "blue"), (2, "red"), (2, "green")]
2723
- pmidx = pd.MultiIndex.from_tuples(tuples)
2724
- psmidx = ps.from_pandas(pmidx)
2725
-
2726
- with self.assertRaisesRegex(NotImplementedError, "nunique is not defined for MultiIndex"):
2727
- psmidx.nunique()
2728
-
2729
-
2730
- class IndexesTests(IndexesTestsMixin, ComparisonTestBase, TestUtils):
2731
- pass
2732
-
2733
-
2734
- if __name__ == "__main__":
2735
- from pyspark.pandas.tests.indexes.test_base import * # noqa: F401
2736
-
2737
- try:
2738
- import xmlrunner
2739
-
2740
- testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
2741
- except ImportError:
2742
- testRunner = None
2743
- unittest.main(testRunner=testRunner, verbosity=2)