snowpark_connect-0.20.2-py3-none-any.whl

This diff represents the content of a publicly available package version released to a supported registry. It is provided for informational purposes only and reflects changes between package versions as they appear in that registry.

This version of snowpark-connect has been marked as a potentially problematic release.
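
The file listing below suggests how the package is put together: a translation layer that maps Spark Connect plans and expressions onto Snowpark (expression/, execute_plan/, control_server.py), the Spark 3.5.6 jars it relies on, and a vendored copy of PySpark under includes/python/. As a rough illustration only, a client could reach such a Spark Connect-compatible server through the standard PySpark API; the endpoint, port, and session bootstrap shown here are assumptions for illustration, not documented behavior of snowpark-connect.

    # Minimal sketch, assuming a Spark Connect-compatible endpoint is already
    # running and reachable at sc://localhost:15002 (the address and port are
    # illustrative assumptions, not taken from this package).
    from pyspark.sql import SparkSession

    spark = (
        SparkSession.builder
        .remote("sc://localhost:15002")  # hypothetical endpoint
        .getOrCreate()
    )

    df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "label"])
    df.select("id").show()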

Files changed (879)
  1. snowflake/snowpark_connect/__init__.py +23 -0
  2. snowflake/snowpark_connect/analyze_plan/__init__.py +3 -0
  3. snowflake/snowpark_connect/analyze_plan/map_tree_string.py +38 -0
  4. snowflake/snowpark_connect/column_name_handler.py +735 -0
  5. snowflake/snowpark_connect/config.py +576 -0
  6. snowflake/snowpark_connect/constants.py +47 -0
  7. snowflake/snowpark_connect/control_server.py +52 -0
  8. snowflake/snowpark_connect/dataframe_name_handler.py +54 -0
  9. snowflake/snowpark_connect/date_time_format_mapping.py +399 -0
  10. snowflake/snowpark_connect/empty_dataframe.py +18 -0
  11. snowflake/snowpark_connect/error/__init__.py +11 -0
  12. snowflake/snowpark_connect/error/error_mapping.py +6174 -0
  13. snowflake/snowpark_connect/error/error_utils.py +321 -0
  14. snowflake/snowpark_connect/error/exceptions.py +24 -0
  15. snowflake/snowpark_connect/execute_plan/__init__.py +3 -0
  16. snowflake/snowpark_connect/execute_plan/map_execution_command.py +204 -0
  17. snowflake/snowpark_connect/execute_plan/map_execution_root.py +173 -0
  18. snowflake/snowpark_connect/execute_plan/utils.py +183 -0
  19. snowflake/snowpark_connect/expression/__init__.py +3 -0
  20. snowflake/snowpark_connect/expression/literal.py +90 -0
  21. snowflake/snowpark_connect/expression/map_cast.py +343 -0
  22. snowflake/snowpark_connect/expression/map_expression.py +293 -0
  23. snowflake/snowpark_connect/expression/map_extension.py +104 -0
  24. snowflake/snowpark_connect/expression/map_sql_expression.py +633 -0
  25. snowflake/snowpark_connect/expression/map_udf.py +142 -0
  26. snowflake/snowpark_connect/expression/map_unresolved_attribute.py +241 -0
  27. snowflake/snowpark_connect/expression/map_unresolved_extract_value.py +85 -0
  28. snowflake/snowpark_connect/expression/map_unresolved_function.py +9450 -0
  29. snowflake/snowpark_connect/expression/map_unresolved_star.py +218 -0
  30. snowflake/snowpark_connect/expression/map_update_fields.py +164 -0
  31. snowflake/snowpark_connect/expression/map_window_function.py +258 -0
  32. snowflake/snowpark_connect/expression/typer.py +125 -0
  33. snowflake/snowpark_connect/includes/__init__.py +0 -0
  34. snowflake/snowpark_connect/includes/jars/antlr4-runtime-4.9.3.jar +0 -0
  35. snowflake/snowpark_connect/includes/jars/commons-cli-1.5.0.jar +0 -0
  36. snowflake/snowpark_connect/includes/jars/commons-codec-1.16.1.jar +0 -0
  37. snowflake/snowpark_connect/includes/jars/commons-collections-3.2.2.jar +0 -0
  38. snowflake/snowpark_connect/includes/jars/commons-collections4-4.4.jar +0 -0
  39. snowflake/snowpark_connect/includes/jars/commons-compiler-3.1.9.jar +0 -0
  40. snowflake/snowpark_connect/includes/jars/commons-compress-1.26.0.jar +0 -0
  41. snowflake/snowpark_connect/includes/jars/commons-crypto-1.1.0.jar +0 -0
  42. snowflake/snowpark_connect/includes/jars/commons-dbcp-1.4.jar +0 -0
  43. snowflake/snowpark_connect/includes/jars/commons-io-2.16.1.jar +0 -0
  44. snowflake/snowpark_connect/includes/jars/commons-lang-2.6.jar +0 -0
  45. snowflake/snowpark_connect/includes/jars/commons-lang3-3.12.0.jar +0 -0
  46. snowflake/snowpark_connect/includes/jars/commons-logging-1.1.3.jar +0 -0
  47. snowflake/snowpark_connect/includes/jars/commons-math3-3.6.1.jar +0 -0
  48. snowflake/snowpark_connect/includes/jars/commons-pool-1.5.4.jar +0 -0
  49. snowflake/snowpark_connect/includes/jars/commons-text-1.10.0.jar +0 -0
  50. snowflake/snowpark_connect/includes/jars/hadoop-client-api-3.3.4.jar +0 -0
  51. snowflake/snowpark_connect/includes/jars/jackson-annotations-2.15.2.jar +0 -0
  52. snowflake/snowpark_connect/includes/jars/jackson-core-2.15.2.jar +0 -0
  53. snowflake/snowpark_connect/includes/jars/jackson-core-asl-1.9.13.jar +0 -0
  54. snowflake/snowpark_connect/includes/jars/jackson-databind-2.15.2.jar +0 -0
  55. snowflake/snowpark_connect/includes/jars/jackson-dataformat-yaml-2.15.2.jar +0 -0
  56. snowflake/snowpark_connect/includes/jars/jackson-datatype-jsr310-2.15.2.jar +0 -0
  57. snowflake/snowpark_connect/includes/jars/jackson-mapper-asl-1.9.13.jar +0 -0
  58. snowflake/snowpark_connect/includes/jars/jackson-module-scala_2.12-2.15.2.jar +0 -0
  59. snowflake/snowpark_connect/includes/jars/json4s-ast_2.12-3.7.0-M11.jar +0 -0
  60. snowflake/snowpark_connect/includes/jars/json4s-core_2.12-3.7.0-M11.jar +0 -0
  61. snowflake/snowpark_connect/includes/jars/json4s-jackson_2.12-3.7.0-M11.jar +0 -0
  62. snowflake/snowpark_connect/includes/jars/json4s-scalap_2.12-3.7.0-M11.jar +0 -0
  63. snowflake/snowpark_connect/includes/jars/kryo-shaded-4.0.2.jar +0 -0
  64. snowflake/snowpark_connect/includes/jars/log4j-1.2-api-2.20.0.jar +0 -0
  65. snowflake/snowpark_connect/includes/jars/log4j-api-2.20.0.jar +0 -0
  66. snowflake/snowpark_connect/includes/jars/log4j-core-2.20.0.jar +0 -0
  67. snowflake/snowpark_connect/includes/jars/log4j-slf4j2-impl-2.20.0.jar +0 -0
  68. snowflake/snowpark_connect/includes/jars/paranamer-2.8.jar +0 -0
  69. snowflake/snowpark_connect/includes/jars/scala-collection-compat_2.12-2.7.0.jar +0 -0
  70. snowflake/snowpark_connect/includes/jars/scala-compiler-2.12.18.jar +0 -0
  71. snowflake/snowpark_connect/includes/jars/scala-library-2.12.18.jar +0 -0
  72. snowflake/snowpark_connect/includes/jars/scala-parser-combinators_2.12-2.3.0.jar +0 -0
  73. snowflake/snowpark_connect/includes/jars/scala-reflect-2.12.18.jar +0 -0
  74. snowflake/snowpark_connect/includes/jars/scala-xml_2.12-2.1.0.jar +0 -0
  75. snowflake/snowpark_connect/includes/jars/slf4j-api-2.0.7.jar +0 -0
  76. snowflake/snowpark_connect/includes/jars/spark-catalyst_2.12-3.5.6.jar +0 -0
  77. snowflake/snowpark_connect/includes/jars/spark-common-utils_2.12-3.5.6.jar +0 -0
  78. snowflake/snowpark_connect/includes/jars/spark-core_2.12-3.5.6.jar +0 -0
  79. snowflake/snowpark_connect/includes/jars/spark-graphx_2.12-3.5.6.jar +0 -0
  80. snowflake/snowpark_connect/includes/jars/spark-hive-thriftserver_2.12-3.5.6.jar +0 -0
  81. snowflake/snowpark_connect/includes/jars/spark-hive_2.12-3.5.6.jar +0 -0
  82. snowflake/snowpark_connect/includes/jars/spark-kubernetes_2.12-3.5.6.jar +0 -0
  83. snowflake/snowpark_connect/includes/jars/spark-kvstore_2.12-3.5.6.jar +0 -0
  84. snowflake/snowpark_connect/includes/jars/spark-launcher_2.12-3.5.6.jar +0 -0
  85. snowflake/snowpark_connect/includes/jars/spark-mesos_2.12-3.5.6.jar +0 -0
  86. snowflake/snowpark_connect/includes/jars/spark-mllib-local_2.12-3.5.6.jar +0 -0
  87. snowflake/snowpark_connect/includes/jars/spark-mllib_2.12-3.5.6.jar +0 -0
  88. snowflake/snowpark_connect/includes/jars/spark-network-common_2.12-3.5.6.jar +0 -0
  89. snowflake/snowpark_connect/includes/jars/spark-network-shuffle_2.12-3.5.6.jar +0 -0
  90. snowflake/snowpark_connect/includes/jars/spark-repl_2.12-3.5.6.jar +0 -0
  91. snowflake/snowpark_connect/includes/jars/spark-sketch_2.12-3.5.6.jar +0 -0
  92. snowflake/snowpark_connect/includes/jars/spark-sql-api_2.12-3.5.6.jar +0 -0
  93. snowflake/snowpark_connect/includes/jars/spark-sql_2.12-3.5.6.jar +0 -0
  94. snowflake/snowpark_connect/includes/jars/spark-streaming_2.12-3.5.6.jar +0 -0
  95. snowflake/snowpark_connect/includes/jars/spark-tags_2.12-3.5.6.jar +0 -0
  96. snowflake/snowpark_connect/includes/jars/spark-unsafe_2.12-3.5.6.jar +0 -0
  97. snowflake/snowpark_connect/includes/jars/spark-yarn_2.12-3.5.6.jar +0 -0
  98. snowflake/snowpark_connect/includes/python/__init__.py +21 -0
  99. snowflake/snowpark_connect/includes/python/pyspark/__init__.py +173 -0
  100. snowflake/snowpark_connect/includes/python/pyspark/_globals.py +71 -0
  101. snowflake/snowpark_connect/includes/python/pyspark/_typing.pyi +43 -0
  102. snowflake/snowpark_connect/includes/python/pyspark/accumulators.py +341 -0
  103. snowflake/snowpark_connect/includes/python/pyspark/broadcast.py +383 -0
  104. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/__init__.py +8 -0
  105. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/cloudpickle.py +948 -0
  106. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/cloudpickle_fast.py +844 -0
  107. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/compat.py +18 -0
  108. snowflake/snowpark_connect/includes/python/pyspark/conf.py +276 -0
  109. snowflake/snowpark_connect/includes/python/pyspark/context.py +2601 -0
  110. snowflake/snowpark_connect/includes/python/pyspark/daemon.py +218 -0
  111. snowflake/snowpark_connect/includes/python/pyspark/errors/__init__.py +70 -0
  112. snowflake/snowpark_connect/includes/python/pyspark/errors/error_classes.py +889 -0
  113. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/__init__.py +16 -0
  114. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/base.py +228 -0
  115. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/captured.py +307 -0
  116. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/connect.py +190 -0
  117. snowflake/snowpark_connect/includes/python/pyspark/errors/tests/__init__.py +16 -0
  118. snowflake/snowpark_connect/includes/python/pyspark/errors/tests/test_errors.py +60 -0
  119. snowflake/snowpark_connect/includes/python/pyspark/errors/utils.py +116 -0
  120. snowflake/snowpark_connect/includes/python/pyspark/files.py +165 -0
  121. snowflake/snowpark_connect/includes/python/pyspark/find_spark_home.py +95 -0
  122. snowflake/snowpark_connect/includes/python/pyspark/install.py +203 -0
  123. snowflake/snowpark_connect/includes/python/pyspark/instrumentation_utils.py +190 -0
  124. snowflake/snowpark_connect/includes/python/pyspark/java_gateway.py +248 -0
  125. snowflake/snowpark_connect/includes/python/pyspark/join.py +118 -0
  126. snowflake/snowpark_connect/includes/python/pyspark/ml/__init__.py +71 -0
  127. snowflake/snowpark_connect/includes/python/pyspark/ml/_typing.pyi +84 -0
  128. snowflake/snowpark_connect/includes/python/pyspark/ml/base.py +414 -0
  129. snowflake/snowpark_connect/includes/python/pyspark/ml/classification.py +4332 -0
  130. snowflake/snowpark_connect/includes/python/pyspark/ml/clustering.py +2188 -0
  131. snowflake/snowpark_connect/includes/python/pyspark/ml/common.py +146 -0
  132. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/__init__.py +44 -0
  133. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/base.py +346 -0
  134. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/classification.py +382 -0
  135. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/evaluation.py +291 -0
  136. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/feature.py +258 -0
  137. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/functions.py +77 -0
  138. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/io_utils.py +335 -0
  139. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/pipeline.py +262 -0
  140. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/summarizer.py +120 -0
  141. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/tuning.py +579 -0
  142. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/util.py +173 -0
  143. snowflake/snowpark_connect/includes/python/pyspark/ml/deepspeed/__init__.py +16 -0
  144. snowflake/snowpark_connect/includes/python/pyspark/ml/deepspeed/deepspeed_distributor.py +165 -0
  145. snowflake/snowpark_connect/includes/python/pyspark/ml/deepspeed/tests/test_deepspeed_distributor.py +306 -0
  146. snowflake/snowpark_connect/includes/python/pyspark/ml/dl_util.py +150 -0
  147. snowflake/snowpark_connect/includes/python/pyspark/ml/evaluation.py +1166 -0
  148. snowflake/snowpark_connect/includes/python/pyspark/ml/feature.py +7474 -0
  149. snowflake/snowpark_connect/includes/python/pyspark/ml/fpm.py +543 -0
  150. snowflake/snowpark_connect/includes/python/pyspark/ml/functions.py +842 -0
  151. snowflake/snowpark_connect/includes/python/pyspark/ml/image.py +271 -0
  152. snowflake/snowpark_connect/includes/python/pyspark/ml/linalg/__init__.py +1382 -0
  153. snowflake/snowpark_connect/includes/python/pyspark/ml/model_cache.py +55 -0
  154. snowflake/snowpark_connect/includes/python/pyspark/ml/param/__init__.py +602 -0
  155. snowflake/snowpark_connect/includes/python/pyspark/ml/param/_shared_params_code_gen.py +368 -0
  156. snowflake/snowpark_connect/includes/python/pyspark/ml/param/shared.py +878 -0
  157. snowflake/snowpark_connect/includes/python/pyspark/ml/pipeline.py +451 -0
  158. snowflake/snowpark_connect/includes/python/pyspark/ml/recommendation.py +748 -0
  159. snowflake/snowpark_connect/includes/python/pyspark/ml/regression.py +3335 -0
  160. snowflake/snowpark_connect/includes/python/pyspark/ml/stat.py +523 -0
  161. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/__init__.py +16 -0
  162. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_classification.py +53 -0
  163. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_evaluation.py +50 -0
  164. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_feature.py +43 -0
  165. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_function.py +114 -0
  166. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_pipeline.py +47 -0
  167. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_summarizer.py +43 -0
  168. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_tuning.py +46 -0
  169. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_classification.py +238 -0
  170. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_evaluation.py +194 -0
  171. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_feature.py +156 -0
  172. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_pipeline.py +184 -0
  173. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_summarizer.py +78 -0
  174. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_tuning.py +292 -0
  175. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_parity_torch_data_loader.py +50 -0
  176. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_parity_torch_distributor.py +152 -0
  177. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_algorithms.py +456 -0
  178. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_base.py +96 -0
  179. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_dl_util.py +186 -0
  180. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_evaluation.py +77 -0
  181. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_feature.py +401 -0
  182. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_functions.py +528 -0
  183. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_image.py +82 -0
  184. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_linalg.py +409 -0
  185. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_model_cache.py +55 -0
  186. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_param.py +441 -0
  187. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_persistence.py +546 -0
  188. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_pipeline.py +71 -0
  189. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_stat.py +52 -0
  190. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_training_summary.py +494 -0
  191. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_util.py +85 -0
  192. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_wrapper.py +138 -0
  193. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/__init__.py +16 -0
  194. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_basic.py +151 -0
  195. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_nested.py +97 -0
  196. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_pipeline.py +143 -0
  197. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tuning.py +551 -0
  198. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_basic.py +137 -0
  199. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_nested.py +96 -0
  200. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_pipeline.py +142 -0
  201. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/__init__.py +16 -0
  202. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/data.py +100 -0
  203. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/distributor.py +1133 -0
  204. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/log_communication.py +198 -0
  205. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/__init__.py +16 -0
  206. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_data_loader.py +137 -0
  207. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_distributor.py +561 -0
  208. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_log_communication.py +172 -0
  209. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/torch_run_process_wrapper.py +83 -0
  210. snowflake/snowpark_connect/includes/python/pyspark/ml/tree.py +434 -0
  211. snowflake/snowpark_connect/includes/python/pyspark/ml/tuning.py +1741 -0
  212. snowflake/snowpark_connect/includes/python/pyspark/ml/util.py +749 -0
  213. snowflake/snowpark_connect/includes/python/pyspark/ml/wrapper.py +465 -0
  214. snowflake/snowpark_connect/includes/python/pyspark/mllib/__init__.py +44 -0
  215. snowflake/snowpark_connect/includes/python/pyspark/mllib/_typing.pyi +33 -0
  216. snowflake/snowpark_connect/includes/python/pyspark/mllib/classification.py +989 -0
  217. snowflake/snowpark_connect/includes/python/pyspark/mllib/clustering.py +1318 -0
  218. snowflake/snowpark_connect/includes/python/pyspark/mllib/common.py +174 -0
  219. snowflake/snowpark_connect/includes/python/pyspark/mllib/evaluation.py +691 -0
  220. snowflake/snowpark_connect/includes/python/pyspark/mllib/feature.py +1085 -0
  221. snowflake/snowpark_connect/includes/python/pyspark/mllib/fpm.py +233 -0
  222. snowflake/snowpark_connect/includes/python/pyspark/mllib/linalg/__init__.py +1653 -0
  223. snowflake/snowpark_connect/includes/python/pyspark/mllib/linalg/distributed.py +1662 -0
  224. snowflake/snowpark_connect/includes/python/pyspark/mllib/random.py +698 -0
  225. snowflake/snowpark_connect/includes/python/pyspark/mllib/recommendation.py +389 -0
  226. snowflake/snowpark_connect/includes/python/pyspark/mllib/regression.py +1067 -0
  227. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/KernelDensity.py +59 -0
  228. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/__init__.py +34 -0
  229. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/_statistics.py +409 -0
  230. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/distribution.py +39 -0
  231. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/test.py +86 -0
  232. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/__init__.py +16 -0
  233. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_algorithms.py +353 -0
  234. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_feature.py +192 -0
  235. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_linalg.py +680 -0
  236. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_stat.py +206 -0
  237. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_streaming_algorithms.py +471 -0
  238. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_util.py +108 -0
  239. snowflake/snowpark_connect/includes/python/pyspark/mllib/tree.py +888 -0
  240. snowflake/snowpark_connect/includes/python/pyspark/mllib/util.py +659 -0
  241. snowflake/snowpark_connect/includes/python/pyspark/pandas/__init__.py +165 -0
  242. snowflake/snowpark_connect/includes/python/pyspark/pandas/_typing.py +52 -0
  243. snowflake/snowpark_connect/includes/python/pyspark/pandas/accessors.py +989 -0
  244. snowflake/snowpark_connect/includes/python/pyspark/pandas/base.py +1804 -0
  245. snowflake/snowpark_connect/includes/python/pyspark/pandas/categorical.py +822 -0
  246. snowflake/snowpark_connect/includes/python/pyspark/pandas/config.py +539 -0
  247. snowflake/snowpark_connect/includes/python/pyspark/pandas/correlation.py +262 -0
  248. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/__init__.py +16 -0
  249. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/base.py +519 -0
  250. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/binary_ops.py +98 -0
  251. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/boolean_ops.py +426 -0
  252. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/categorical_ops.py +141 -0
  253. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/complex_ops.py +145 -0
  254. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/date_ops.py +127 -0
  255. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/datetime_ops.py +171 -0
  256. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/null_ops.py +83 -0
  257. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/num_ops.py +588 -0
  258. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/string_ops.py +154 -0
  259. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/timedelta_ops.py +101 -0
  260. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/udt_ops.py +29 -0
  261. snowflake/snowpark_connect/includes/python/pyspark/pandas/datetimes.py +891 -0
  262. snowflake/snowpark_connect/includes/python/pyspark/pandas/exceptions.py +150 -0
  263. snowflake/snowpark_connect/includes/python/pyspark/pandas/extensions.py +388 -0
  264. snowflake/snowpark_connect/includes/python/pyspark/pandas/frame.py +13738 -0
  265. snowflake/snowpark_connect/includes/python/pyspark/pandas/generic.py +3560 -0
  266. snowflake/snowpark_connect/includes/python/pyspark/pandas/groupby.py +4448 -0
  267. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/__init__.py +21 -0
  268. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/base.py +2783 -0
  269. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/category.py +773 -0
  270. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/datetimes.py +843 -0
  271. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/multi.py +1323 -0
  272. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/numeric.py +210 -0
  273. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/timedelta.py +197 -0
  274. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexing.py +1862 -0
  275. snowflake/snowpark_connect/includes/python/pyspark/pandas/internal.py +1680 -0
  276. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/__init__.py +48 -0
  277. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/common.py +76 -0
  278. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/frame.py +63 -0
  279. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/general_functions.py +43 -0
  280. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/groupby.py +93 -0
  281. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/indexes.py +184 -0
  282. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/resample.py +101 -0
  283. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/scalars.py +29 -0
  284. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/series.py +69 -0
  285. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/window.py +168 -0
  286. snowflake/snowpark_connect/includes/python/pyspark/pandas/mlflow.py +238 -0
  287. snowflake/snowpark_connect/includes/python/pyspark/pandas/namespace.py +3807 -0
  288. snowflake/snowpark_connect/includes/python/pyspark/pandas/numpy_compat.py +260 -0
  289. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/__init__.py +17 -0
  290. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/core.py +1213 -0
  291. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/matplotlib.py +928 -0
  292. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/plotly.py +261 -0
  293. snowflake/snowpark_connect/includes/python/pyspark/pandas/resample.py +816 -0
  294. snowflake/snowpark_connect/includes/python/pyspark/pandas/series.py +7440 -0
  295. snowflake/snowpark_connect/includes/python/pyspark/pandas/sql_formatter.py +308 -0
  296. snowflake/snowpark_connect/includes/python/pyspark/pandas/sql_processor.py +394 -0
  297. snowflake/snowpark_connect/includes/python/pyspark/pandas/strings.py +2371 -0
  298. snowflake/snowpark_connect/includes/python/pyspark/pandas/supported_api_gen.py +378 -0
  299. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/__init__.py +16 -0
  300. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/__init__.py +16 -0
  301. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_any_all.py +177 -0
  302. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_apply_func.py +575 -0
  303. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_binary_ops.py +235 -0
  304. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_combine.py +653 -0
  305. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_compute.py +463 -0
  306. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_corrwith.py +86 -0
  307. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_cov.py +151 -0
  308. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_cumulative.py +139 -0
  309. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_describe.py +458 -0
  310. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_eval.py +86 -0
  311. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_melt.py +202 -0
  312. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_missing_data.py +520 -0
  313. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_pivot.py +361 -0
  314. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/__init__.py +16 -0
  315. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/__init__.py +16 -0
  316. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_any_all.py +40 -0
  317. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_apply_func.py +42 -0
  318. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_binary_ops.py +40 -0
  319. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_combine.py +37 -0
  320. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_compute.py +60 -0
  321. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_corrwith.py +40 -0
  322. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_cov.py +40 -0
  323. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_cumulative.py +90 -0
  324. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_describe.py +40 -0
  325. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_eval.py +40 -0
  326. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_melt.py +40 -0
  327. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_missing_data.py +42 -0
  328. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_pivot.py +37 -0
  329. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/__init__.py +16 -0
  330. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_base.py +36 -0
  331. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_binary_ops.py +42 -0
  332. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_boolean_ops.py +47 -0
  333. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_categorical_ops.py +55 -0
  334. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_complex_ops.py +40 -0
  335. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_date_ops.py +47 -0
  336. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_datetime_ops.py +47 -0
  337. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_null_ops.py +42 -0
  338. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_arithmetic.py +43 -0
  339. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_ops.py +47 -0
  340. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_reverse.py +43 -0
  341. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_string_ops.py +47 -0
  342. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_timedelta_ops.py +47 -0
  343. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_udt_ops.py +40 -0
  344. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/testing_utils.py +226 -0
  345. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/__init__.py +16 -0
  346. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_align.py +39 -0
  347. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_basic_slow.py +55 -0
  348. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_cov_corrwith.py +39 -0
  349. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_dot_frame.py +39 -0
  350. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_dot_series.py +39 -0
  351. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_index.py +39 -0
  352. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_series.py +39 -0
  353. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_setitem_frame.py +43 -0
  354. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_setitem_series.py +43 -0
  355. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/__init__.py +16 -0
  356. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_attrs.py +40 -0
  357. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_constructor.py +39 -0
  358. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_conversion.py +42 -0
  359. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_reindexing.py +42 -0
  360. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_reshaping.py +37 -0
  361. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_spark.py +40 -0
  362. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_take.py +42 -0
  363. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_time_series.py +48 -0
  364. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_truncate.py +40 -0
  365. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/__init__.py +16 -0
  366. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_aggregate.py +40 -0
  367. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_apply_func.py +41 -0
  368. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_cumulative.py +67 -0
  369. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_describe.py +40 -0
  370. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_groupby.py +55 -0
  371. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_head_tail.py +40 -0
  372. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_index.py +38 -0
  373. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_missing_data.py +55 -0
  374. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py +39 -0
  375. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_stat.py +38 -0
  376. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/__init__.py +16 -0
  377. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_align.py +40 -0
  378. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_base.py +50 -0
  379. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_category.py +73 -0
  380. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_datetime.py +39 -0
  381. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_indexing.py +40 -0
  382. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_reindex.py +40 -0
  383. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_rename.py +40 -0
  384. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_reset_index.py +48 -0
  385. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_timedelta.py +39 -0
  386. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/io/__init__.py +16 -0
  387. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/io/test_parity_io.py +40 -0
  388. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/__init__.py +16 -0
  389. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot.py +45 -0
  390. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot_matplotlib.py +45 -0
  391. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot_plotly.py +49 -0
  392. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot.py +37 -0
  393. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot_matplotlib.py +53 -0
  394. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot_plotly.py +45 -0
  395. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/__init__.py +16 -0
  396. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_all_any.py +38 -0
  397. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_arg_ops.py +37 -0
  398. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_as_of.py +37 -0
  399. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_as_type.py +38 -0
  400. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_compute.py +37 -0
  401. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_conversion.py +40 -0
  402. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_cumulative.py +40 -0
  403. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_index.py +38 -0
  404. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_missing_data.py +40 -0
  405. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_series.py +37 -0
  406. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_sort.py +38 -0
  407. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_stat.py +38 -0
  408. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_categorical.py +66 -0
  409. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_config.py +37 -0
  410. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_csv.py +37 -0
  411. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_dataframe_conversion.py +42 -0
  412. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_dataframe_spark_io.py +39 -0
  413. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_default_index.py +49 -0
  414. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ewm.py +37 -0
  415. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_expanding.py +39 -0
  416. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_extension.py +49 -0
  417. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_frame_spark.py +53 -0
  418. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_generic_functions.py +43 -0
  419. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_indexing.py +49 -0
  420. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_indexops_spark.py +39 -0
  421. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_internal.py +41 -0
  422. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_namespace.py +39 -0
  423. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_numpy_compat.py +60 -0
  424. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames.py +48 -0
  425. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby.py +39 -0
  426. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_expanding.py +44 -0
  427. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_rolling.py +84 -0
  428. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_repr.py +37 -0
  429. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_resample.py +45 -0
  430. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_reshape.py +39 -0
  431. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_rolling.py +39 -0
  432. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_scalars.py +37 -0
  433. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_conversion.py +39 -0
  434. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_datetime.py +39 -0
  435. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_string.py +39 -0
  436. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_spark_functions.py +39 -0
  437. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_sql.py +43 -0
  438. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_stats.py +37 -0
  439. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_typedef.py +36 -0
  440. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_utils.py +37 -0
  441. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_window.py +39 -0
  442. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/__init__.py +16 -0
  443. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_base.py +107 -0
  444. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_binary_ops.py +224 -0
  445. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py +825 -0
  446. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_categorical_ops.py +562 -0
  447. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_complex_ops.py +368 -0
  448. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_date_ops.py +257 -0
  449. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_datetime_ops.py +260 -0
  450. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_null_ops.py +178 -0
  451. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_arithmetic.py +184 -0
  452. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py +497 -0
  453. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_reverse.py +140 -0
  454. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_string_ops.py +354 -0
  455. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_timedelta_ops.py +219 -0
  456. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_udt_ops.py +192 -0
  457. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/testing_utils.py +228 -0
  458. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/__init__.py +16 -0
  459. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_align.py +118 -0
  460. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_basic_slow.py +198 -0
  461. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_cov_corrwith.py +181 -0
  462. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_dot_frame.py +103 -0
  463. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_dot_series.py +141 -0
  464. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_index.py +109 -0
  465. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_series.py +136 -0
  466. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_setitem_frame.py +125 -0
  467. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_setitem_series.py +217 -0
  468. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/__init__.py +16 -0
  469. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_attrs.py +384 -0
  470. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_constructor.py +598 -0
  471. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_conversion.py +73 -0
  472. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_reindexing.py +869 -0
  473. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_reshaping.py +487 -0
  474. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_spark.py +309 -0
  475. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_take.py +156 -0
  476. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_time_series.py +149 -0
  477. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_truncate.py +163 -0
  478. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/__init__.py +16 -0
  479. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_aggregate.py +311 -0
  480. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_apply_func.py +524 -0
  481. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_cumulative.py +419 -0
  482. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_describe.py +144 -0
  483. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_groupby.py +979 -0
  484. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_head_tail.py +234 -0
  485. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_index.py +206 -0
  486. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_missing_data.py +421 -0
  487. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_split_apply.py +187 -0
  488. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_stat.py +397 -0
  489. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/__init__.py +16 -0
  490. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_align.py +100 -0
  491. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_base.py +2743 -0
  492. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_category.py +484 -0
  493. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_datetime.py +276 -0
  494. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_indexing.py +432 -0
  495. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_reindex.py +310 -0
  496. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_rename.py +257 -0
  497. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_reset_index.py +160 -0
  498. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_timedelta.py +128 -0
  499. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/io/__init__.py +16 -0
  500. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/io/test_io.py +137 -0
  501. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/__init__.py +16 -0
  502. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot.py +170 -0
  503. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot_matplotlib.py +547 -0
  504. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot_plotly.py +285 -0
  505. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot.py +106 -0
  506. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot_matplotlib.py +409 -0
  507. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot_plotly.py +247 -0
  508. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/__init__.py +16 -0
  509. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_all_any.py +105 -0
  510. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_arg_ops.py +197 -0
  511. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_as_of.py +137 -0
  512. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_as_type.py +227 -0
  513. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_compute.py +634 -0
  514. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_conversion.py +88 -0
  515. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_cumulative.py +139 -0
  516. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_index.py +475 -0
  517. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_missing_data.py +265 -0
  518. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_series.py +818 -0
  519. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_sort.py +162 -0
  520. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_stat.py +780 -0
  521. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_categorical.py +741 -0
  522. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_config.py +160 -0
  523. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_csv.py +453 -0
  524. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_dataframe_conversion.py +281 -0
  525. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_dataframe_spark_io.py +487 -0
  526. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_default_index.py +109 -0
  527. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ewm.py +434 -0
  528. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_expanding.py +253 -0
  529. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_extension.py +152 -0
  530. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_frame_spark.py +162 -0
  531. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_generic_functions.py +234 -0
  532. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_indexing.py +1339 -0
  533. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_indexops_spark.py +82 -0
  534. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_internal.py +124 -0
  535. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_namespace.py +638 -0
  536. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_numpy_compat.py +200 -0
  537. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames.py +1355 -0
  538. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py +655 -0
  539. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_expanding.py +113 -0
  540. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_rolling.py +118 -0
  541. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_repr.py +192 -0
  542. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_resample.py +346 -0
  543. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_reshape.py +495 -0
  544. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_rolling.py +263 -0
  545. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_scalars.py +59 -0
  546. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_conversion.py +85 -0
  547. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_datetime.py +364 -0
  548. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_string.py +362 -0
  549. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_spark_functions.py +46 -0
  550. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_sql.py +123 -0
  551. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_stats.py +581 -0
  552. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_typedef.py +447 -0
  553. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_utils.py +301 -0
  554. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_window.py +465 -0
  555. snowflake/snowpark_connect/includes/python/pyspark/pandas/typedef/__init__.py +18 -0
  556. snowflake/snowpark_connect/includes/python/pyspark/pandas/typedef/typehints.py +874 -0
  557. snowflake/snowpark_connect/includes/python/pyspark/pandas/usage_logging/__init__.py +143 -0
  558. snowflake/snowpark_connect/includes/python/pyspark/pandas/usage_logging/usage_logger.py +132 -0
  559. snowflake/snowpark_connect/includes/python/pyspark/pandas/utils.py +1063 -0
  560. snowflake/snowpark_connect/includes/python/pyspark/pandas/window.py +2702 -0
  561. snowflake/snowpark_connect/includes/python/pyspark/profiler.py +489 -0
  562. snowflake/snowpark_connect/includes/python/pyspark/py.typed +1 -0
  563. snowflake/snowpark_connect/includes/python/pyspark/python/pyspark/shell.py +123 -0
  564. snowflake/snowpark_connect/includes/python/pyspark/rdd.py +5518 -0
  565. snowflake/snowpark_connect/includes/python/pyspark/rddsampler.py +115 -0
  566. snowflake/snowpark_connect/includes/python/pyspark/resource/__init__.py +38 -0
  567. snowflake/snowpark_connect/includes/python/pyspark/resource/information.py +69 -0
  568. snowflake/snowpark_connect/includes/python/pyspark/resource/profile.py +317 -0
  569. snowflake/snowpark_connect/includes/python/pyspark/resource/requests.py +539 -0
  570. snowflake/snowpark_connect/includes/python/pyspark/resource/tests/__init__.py +16 -0
  571. snowflake/snowpark_connect/includes/python/pyspark/resource/tests/test_resources.py +83 -0
  572. snowflake/snowpark_connect/includes/python/pyspark/resultiterable.py +45 -0
  573. snowflake/snowpark_connect/includes/python/pyspark/serializers.py +681 -0
  574. snowflake/snowpark_connect/includes/python/pyspark/shell.py +123 -0
  575. snowflake/snowpark_connect/includes/python/pyspark/shuffle.py +854 -0
  576. snowflake/snowpark_connect/includes/python/pyspark/sql/__init__.py +75 -0
  577. snowflake/snowpark_connect/includes/python/pyspark/sql/_typing.pyi +80 -0
  578. snowflake/snowpark_connect/includes/python/pyspark/sql/avro/__init__.py +18 -0
  579. snowflake/snowpark_connect/includes/python/pyspark/sql/avro/functions.py +188 -0
  580. snowflake/snowpark_connect/includes/python/pyspark/sql/catalog.py +1270 -0
  581. snowflake/snowpark_connect/includes/python/pyspark/sql/column.py +1431 -0
  582. snowflake/snowpark_connect/includes/python/pyspark/sql/conf.py +99 -0
  583. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/__init__.py +18 -0
  584. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/_typing.py +90 -0
  585. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/avro/__init__.py +18 -0
  586. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/avro/functions.py +107 -0
  587. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/catalog.py +356 -0
  588. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/__init__.py +22 -0
  589. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/artifact.py +412 -0
  590. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/core.py +1689 -0
  591. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/reattach.py +340 -0
  592. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/column.py +514 -0
  593. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/conf.py +128 -0
  594. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/conversion.py +490 -0
  595. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/dataframe.py +2172 -0
  596. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/expressions.py +1056 -0
  597. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/functions.py +3937 -0
  598. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/group.py +418 -0
  599. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/plan.py +2289 -0
  600. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/__init__.py +25 -0
  601. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/base_pb2.py +203 -0
  602. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/base_pb2.pyi +2718 -0
  603. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/base_pb2_grpc.py +423 -0
  604. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/catalog_pb2.py +109 -0
  605. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/catalog_pb2.pyi +1130 -0
  606. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/commands_pb2.py +141 -0
  607. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/commands_pb2.pyi +1766 -0
  608. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/common_pb2.py +47 -0
  609. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/common_pb2.pyi +123 -0
  610. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/example_plugins_pb2.py +53 -0
  611. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/example_plugins_pb2.pyi +112 -0
  612. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/expressions_pb2.py +107 -0
  613. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/expressions_pb2.pyi +1507 -0
  614. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/relations_pb2.py +195 -0
  615. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/relations_pb2.pyi +3613 -0
  616. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/types_pb2.py +95 -0
  617. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/types_pb2.pyi +980 -0
  618. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/protobuf/__init__.py +18 -0
  619. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/protobuf/functions.py +166 -0
  620. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/readwriter.py +861 -0
  621. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/session.py +952 -0
  622. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/__init__.py +22 -0
  623. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/query.py +295 -0
  624. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/readwriter.py +618 -0
  625. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/__init__.py +18 -0
  626. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/foreach_batch_worker.py +87 -0
  627. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/listener_worker.py +100 -0
  628. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/types.py +301 -0
  629. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/udf.py +296 -0
  630. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/udtf.py +200 -0
  631. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/utils.py +58 -0
  632. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/window.py +266 -0
  633. snowflake/snowpark_connect/includes/python/pyspark/sql/context.py +818 -0
  634. snowflake/snowpark_connect/includes/python/pyspark/sql/dataframe.py +5973 -0
  635. snowflake/snowpark_connect/includes/python/pyspark/sql/functions.py +15889 -0
  636. snowflake/snowpark_connect/includes/python/pyspark/sql/group.py +547 -0
  637. snowflake/snowpark_connect/includes/python/pyspark/sql/observation.py +152 -0
  638. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/__init__.py +21 -0
  639. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/__init__.pyi +344 -0
  640. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/protocols/__init__.pyi +17 -0
  641. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/protocols/frame.pyi +20 -0
  642. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/protocols/series.pyi +20 -0
  643. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/conversion.py +671 -0
  644. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/functions.py +480 -0
  645. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/functions.pyi +132 -0
  646. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/group_ops.py +523 -0
  647. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/map_ops.py +216 -0
  648. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/serializers.py +1019 -0
  649. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/typehints.py +172 -0
  650. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/types.py +972 -0
  651. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/utils.py +86 -0
  652. snowflake/snowpark_connect/includes/python/pyspark/sql/protobuf/__init__.py +18 -0
  653. snowflake/snowpark_connect/includes/python/pyspark/sql/protobuf/functions.py +334 -0
  654. snowflake/snowpark_connect/includes/python/pyspark/sql/readwriter.py +2159 -0
  655. snowflake/snowpark_connect/includes/python/pyspark/sql/session.py +2088 -0
  656. snowflake/snowpark_connect/includes/python/pyspark/sql/sql_formatter.py +84 -0
  657. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/__init__.py +21 -0
  658. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/listener.py +1050 -0
  659. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/query.py +746 -0
  660. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/readwriter.py +1652 -0
  661. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/state.py +288 -0
  662. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/__init__.py +16 -0
  663. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/__init__.py +16 -0
  664. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/__init__.py +16 -0
  665. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/test_artifact.py +420 -0
  666. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/test_client.py +358 -0
  667. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/__init__.py +16 -0
  668. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_foreach.py +36 -0
  669. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_foreach_batch.py +44 -0
  670. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_listener.py +116 -0
  671. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_streaming.py +35 -0
  672. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_basic.py +3612 -0
  673. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_column.py +1042 -0
  674. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_function.py +2381 -0
  675. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_plan.py +1060 -0
  676. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow.py +163 -0
  677. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow_map.py +38 -0
  678. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow_python_udf.py +48 -0
  679. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_catalog.py +36 -0
  680. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_column.py +55 -0
  681. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_conf.py +36 -0
  682. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_dataframe.py +96 -0
  683. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_datasources.py +44 -0
  684. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_errors.py +36 -0
  685. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_functions.py +59 -0
  686. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_group.py +36 -0
  687. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_cogrouped_map.py +59 -0
  688. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_grouped_map.py +74 -0
  689. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_grouped_map_with_state.py +62 -0
  690. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_map.py +58 -0
  691. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf.py +70 -0
  692. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_grouped_agg.py +50 -0
  693. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_scalar.py +68 -0
  694. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_window.py +40 -0
  695. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_readwriter.py +46 -0
  696. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_serde.py +44 -0
  697. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_types.py +100 -0
  698. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_udf.py +100 -0
  699. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_udtf.py +163 -0
  700. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_session.py +181 -0
  701. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_utils.py +42 -0
  702. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/__init__.py +16 -0
  703. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_cogrouped_map.py +623 -0
  704. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_grouped_map.py +869 -0
  705. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_grouped_map_with_state.py +342 -0
  706. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_map.py +436 -0
  707. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf.py +363 -0
  708. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_grouped_agg.py +592 -0
  709. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_scalar.py +1503 -0
  710. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_typehints.py +392 -0
  711. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_typehints_with_future_annotations.py +375 -0
  712. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_window.py +411 -0
  713. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/__init__.py +16 -0
  714. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming.py +401 -0
  715. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_foreach.py +295 -0
  716. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_foreach_batch.py +106 -0
  717. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_listener.py +558 -0
  718. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow.py +1346 -0
  719. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow_map.py +182 -0
  720. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow_python_udf.py +202 -0
  721. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_catalog.py +503 -0
  722. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_column.py +225 -0
  723. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_conf.py +83 -0
  724. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_context.py +201 -0
  725. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_dataframe.py +1931 -0
  726. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_datasources.py +256 -0
  727. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_errors.py +69 -0
  728. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_functions.py +1349 -0
  729. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_group.py +53 -0
  730. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_pandas_sqlmetrics.py +68 -0
  731. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_readwriter.py +283 -0
  732. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_serde.py +155 -0
  733. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_session.py +412 -0
  734. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_types.py +1581 -0
  735. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udf.py +961 -0
  736. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udf_profiler.py +165 -0
  737. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udtf.py +1456 -0
  738. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_utils.py +1686 -0
  739. snowflake/snowpark_connect/includes/python/pyspark/sql/types.py +2558 -0
  740. snowflake/snowpark_connect/includes/python/pyspark/sql/udf.py +714 -0
  741. snowflake/snowpark_connect/includes/python/pyspark/sql/udtf.py +325 -0
  742. snowflake/snowpark_connect/includes/python/pyspark/sql/utils.py +339 -0
  743. snowflake/snowpark_connect/includes/python/pyspark/sql/window.py +492 -0
  744. snowflake/snowpark_connect/includes/python/pyspark/statcounter.py +165 -0
  745. snowflake/snowpark_connect/includes/python/pyspark/status.py +112 -0
  746. snowflake/snowpark_connect/includes/python/pyspark/storagelevel.py +97 -0
  747. snowflake/snowpark_connect/includes/python/pyspark/streaming/__init__.py +22 -0
  748. snowflake/snowpark_connect/includes/python/pyspark/streaming/context.py +471 -0
  749. snowflake/snowpark_connect/includes/python/pyspark/streaming/dstream.py +933 -0
  750. snowflake/snowpark_connect/includes/python/pyspark/streaming/kinesis.py +205 -0
  751. snowflake/snowpark_connect/includes/python/pyspark/streaming/listener.py +83 -0
  752. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/__init__.py +16 -0
  753. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_context.py +184 -0
  754. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_dstream.py +706 -0
  755. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_kinesis.py +118 -0
  756. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_listener.py +160 -0
  757. snowflake/snowpark_connect/includes/python/pyspark/streaming/util.py +168 -0
  758. snowflake/snowpark_connect/includes/python/pyspark/taskcontext.py +502 -0
  759. snowflake/snowpark_connect/includes/python/pyspark/testing/__init__.py +21 -0
  760. snowflake/snowpark_connect/includes/python/pyspark/testing/connectutils.py +199 -0
  761. snowflake/snowpark_connect/includes/python/pyspark/testing/mllibutils.py +30 -0
  762. snowflake/snowpark_connect/includes/python/pyspark/testing/mlutils.py +275 -0
  763. snowflake/snowpark_connect/includes/python/pyspark/testing/objects.py +121 -0
  764. snowflake/snowpark_connect/includes/python/pyspark/testing/pandasutils.py +714 -0
  765. snowflake/snowpark_connect/includes/python/pyspark/testing/sqlutils.py +168 -0
  766. snowflake/snowpark_connect/includes/python/pyspark/testing/streamingutils.py +178 -0
  767. snowflake/snowpark_connect/includes/python/pyspark/testing/utils.py +636 -0
  768. snowflake/snowpark_connect/includes/python/pyspark/tests/__init__.py +16 -0
  769. snowflake/snowpark_connect/includes/python/pyspark/tests/test_appsubmit.py +306 -0
  770. snowflake/snowpark_connect/includes/python/pyspark/tests/test_broadcast.py +196 -0
  771. snowflake/snowpark_connect/includes/python/pyspark/tests/test_conf.py +44 -0
  772. snowflake/snowpark_connect/includes/python/pyspark/tests/test_context.py +346 -0
  773. snowflake/snowpark_connect/includes/python/pyspark/tests/test_daemon.py +89 -0
  774. snowflake/snowpark_connect/includes/python/pyspark/tests/test_install_spark.py +124 -0
  775. snowflake/snowpark_connect/includes/python/pyspark/tests/test_join.py +69 -0
  776. snowflake/snowpark_connect/includes/python/pyspark/tests/test_memory_profiler.py +167 -0
  777. snowflake/snowpark_connect/includes/python/pyspark/tests/test_pin_thread.py +194 -0
  778. snowflake/snowpark_connect/includes/python/pyspark/tests/test_profiler.py +168 -0
  779. snowflake/snowpark_connect/includes/python/pyspark/tests/test_rdd.py +939 -0
  780. snowflake/snowpark_connect/includes/python/pyspark/tests/test_rddbarrier.py +52 -0
  781. snowflake/snowpark_connect/includes/python/pyspark/tests/test_rddsampler.py +66 -0
  782. snowflake/snowpark_connect/includes/python/pyspark/tests/test_readwrite.py +368 -0
  783. snowflake/snowpark_connect/includes/python/pyspark/tests/test_serializers.py +257 -0
  784. snowflake/snowpark_connect/includes/python/pyspark/tests/test_shuffle.py +267 -0
  785. snowflake/snowpark_connect/includes/python/pyspark/tests/test_stage_sched.py +153 -0
  786. snowflake/snowpark_connect/includes/python/pyspark/tests/test_statcounter.py +130 -0
  787. snowflake/snowpark_connect/includes/python/pyspark/tests/test_taskcontext.py +350 -0
  788. snowflake/snowpark_connect/includes/python/pyspark/tests/test_util.py +97 -0
  789. snowflake/snowpark_connect/includes/python/pyspark/tests/test_worker.py +271 -0
  790. snowflake/snowpark_connect/includes/python/pyspark/traceback_utils.py +81 -0
  791. snowflake/snowpark_connect/includes/python/pyspark/util.py +416 -0
  792. snowflake/snowpark_connect/includes/python/pyspark/version.py +19 -0
  793. snowflake/snowpark_connect/includes/python/pyspark/worker.py +1307 -0
  794. snowflake/snowpark_connect/includes/python/pyspark/worker_util.py +46 -0
  795. snowflake/snowpark_connect/proto/__init__.py +10 -0
  796. snowflake/snowpark_connect/proto/control_pb2.py +35 -0
  797. snowflake/snowpark_connect/proto/control_pb2.pyi +38 -0
  798. snowflake/snowpark_connect/proto/control_pb2_grpc.py +183 -0
  799. snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.py +35 -0
  800. snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.pyi +53 -0
  801. snowflake/snowpark_connect/proto/snowflake_rdd_pb2.pyi +39 -0
  802. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.py +47 -0
  803. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.pyi +111 -0
  804. snowflake/snowpark_connect/relation/__init__.py +3 -0
  805. snowflake/snowpark_connect/relation/catalogs/__init__.py +12 -0
  806. snowflake/snowpark_connect/relation/catalogs/abstract_spark_catalog.py +287 -0
  807. snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py +467 -0
  808. snowflake/snowpark_connect/relation/catalogs/utils.py +51 -0
  809. snowflake/snowpark_connect/relation/io_utils.py +76 -0
  810. snowflake/snowpark_connect/relation/map_aggregate.py +322 -0
  811. snowflake/snowpark_connect/relation/map_catalog.py +151 -0
  812. snowflake/snowpark_connect/relation/map_column_ops.py +1068 -0
  813. snowflake/snowpark_connect/relation/map_crosstab.py +48 -0
  814. snowflake/snowpark_connect/relation/map_extension.py +412 -0
  815. snowflake/snowpark_connect/relation/map_join.py +341 -0
  816. snowflake/snowpark_connect/relation/map_local_relation.py +326 -0
  817. snowflake/snowpark_connect/relation/map_map_partitions.py +146 -0
  818. snowflake/snowpark_connect/relation/map_relation.py +253 -0
  819. snowflake/snowpark_connect/relation/map_row_ops.py +716 -0
  820. snowflake/snowpark_connect/relation/map_sample_by.py +35 -0
  821. snowflake/snowpark_connect/relation/map_show_string.py +50 -0
  822. snowflake/snowpark_connect/relation/map_sql.py +1874 -0
  823. snowflake/snowpark_connect/relation/map_stats.py +324 -0
  824. snowflake/snowpark_connect/relation/map_subquery_alias.py +32 -0
  825. snowflake/snowpark_connect/relation/map_udtf.py +288 -0
  826. snowflake/snowpark_connect/relation/read/__init__.py +7 -0
  827. snowflake/snowpark_connect/relation/read/jdbc_read_dbapi.py +668 -0
  828. snowflake/snowpark_connect/relation/read/map_read.py +367 -0
  829. snowflake/snowpark_connect/relation/read/map_read_csv.py +142 -0
  830. snowflake/snowpark_connect/relation/read/map_read_jdbc.py +108 -0
  831. snowflake/snowpark_connect/relation/read/map_read_json.py +344 -0
  832. snowflake/snowpark_connect/relation/read/map_read_parquet.py +194 -0
  833. snowflake/snowpark_connect/relation/read/map_read_socket.py +59 -0
  834. snowflake/snowpark_connect/relation/read/map_read_table.py +109 -0
  835. snowflake/snowpark_connect/relation/read/map_read_text.py +106 -0
  836. snowflake/snowpark_connect/relation/read/reader_config.py +399 -0
  837. snowflake/snowpark_connect/relation/read/utils.py +155 -0
  838. snowflake/snowpark_connect/relation/stage_locator.py +161 -0
  839. snowflake/snowpark_connect/relation/utils.py +219 -0
  840. snowflake/snowpark_connect/relation/write/__init__.py +3 -0
  841. snowflake/snowpark_connect/relation/write/jdbc_write_dbapi.py +339 -0
  842. snowflake/snowpark_connect/relation/write/map_write.py +436 -0
  843. snowflake/snowpark_connect/relation/write/map_write_jdbc.py +48 -0
  844. snowflake/snowpark_connect/resources/java_udfs-1.0-SNAPSHOT.jar +0 -0
  845. snowflake/snowpark_connect/resources_initializer.py +75 -0
  846. snowflake/snowpark_connect/server.py +1136 -0
  847. snowflake/snowpark_connect/start_server.py +32 -0
  848. snowflake/snowpark_connect/tcm.py +8 -0
  849. snowflake/snowpark_connect/type_mapping.py +1003 -0
  850. snowflake/snowpark_connect/typed_column.py +94 -0
  851. snowflake/snowpark_connect/utils/__init__.py +3 -0
  852. snowflake/snowpark_connect/utils/artifacts.py +48 -0
  853. snowflake/snowpark_connect/utils/attribute_handling.py +72 -0
  854. snowflake/snowpark_connect/utils/cache.py +84 -0
  855. snowflake/snowpark_connect/utils/concurrent.py +124 -0
  856. snowflake/snowpark_connect/utils/context.py +390 -0
  857. snowflake/snowpark_connect/utils/describe_query_cache.py +231 -0
  858. snowflake/snowpark_connect/utils/interrupt.py +85 -0
  859. snowflake/snowpark_connect/utils/io_utils.py +35 -0
  860. snowflake/snowpark_connect/utils/pandas_udtf_utils.py +117 -0
  861. snowflake/snowpark_connect/utils/profiling.py +47 -0
  862. snowflake/snowpark_connect/utils/session.py +180 -0
  863. snowflake/snowpark_connect/utils/snowpark_connect_logging.py +38 -0
  864. snowflake/snowpark_connect/utils/telemetry.py +513 -0
  865. snowflake/snowpark_connect/utils/udf_cache.py +392 -0
  866. snowflake/snowpark_connect/utils/udf_helper.py +328 -0
  867. snowflake/snowpark_connect/utils/udf_utils.py +310 -0
  868. snowflake/snowpark_connect/utils/udtf_helper.py +420 -0
  869. snowflake/snowpark_connect/utils/udtf_utils.py +799 -0
  870. snowflake/snowpark_connect/utils/xxhash64.py +247 -0
  871. snowflake/snowpark_connect/version.py +6 -0
  872. snowpark_connect-0.20.2.data/scripts/snowpark-connect +71 -0
  873. snowpark_connect-0.20.2.data/scripts/snowpark-session +11 -0
  874. snowpark_connect-0.20.2.data/scripts/snowpark-submit +354 -0
  875. snowpark_connect-0.20.2.dist-info/METADATA +37 -0
  876. snowpark_connect-0.20.2.dist-info/RECORD +879 -0
  877. snowpark_connect-0.20.2.dist-info/WHEEL +5 -0
  878. snowpark_connect-0.20.2.dist-info/licenses/LICENSE.txt +202 -0
  879. snowpark_connect-0.20.2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,2783 @@
1
+ #
2
+ # Licensed to the Apache Software Foundation (ASF) under one or more
3
+ # contributor license agreements. See the NOTICE file distributed with
4
+ # this work for additional information regarding copyright ownership.
5
+ # The ASF licenses this file to You under the Apache License, Version 2.0
6
+ # (the "License"); you may not use this file except in compliance with
7
+ # the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ #
17
+
18
+ from functools import partial
19
+ from typing import (
20
+ Any,
21
+ Callable,
22
+ Iterator,
23
+ List,
24
+ Optional,
25
+ Tuple,
26
+ Union,
27
+ cast,
28
+ no_type_check,
29
+ TYPE_CHECKING,
30
+ )
31
+ import warnings
32
+
33
+ import pandas as pd
34
+ import numpy as np
35
+ from pandas.api.types import ( # type: ignore[attr-defined]
36
+ is_list_like,
37
+ is_interval_dtype,
38
+ is_bool_dtype,
39
+ is_categorical_dtype,
40
+ is_integer_dtype,
41
+ is_float_dtype,
42
+ is_numeric_dtype,
43
+ is_object_dtype,
44
+ )
45
+ from pandas.core.accessor import CachedAccessor
46
+ from pandas.io.formats.printing import pprint_thing
47
+ from pandas.api.types import CategoricalDtype, is_hashable # type: ignore[attr-defined]
48
+ from pandas._libs import lib
49
+
50
+ from pyspark.sql.column import Column
51
+ from pyspark.sql import functions as F
52
+ from pyspark.sql.types import (
53
+ DayTimeIntervalType,
54
+ FractionalType,
55
+ IntegralType,
56
+ TimestampType,
57
+ TimestampNTZType,
58
+ )
59
+
60
+ from pyspark import pandas as ps # For running doctests and reference resolution in PyCharm.
61
+ from pyspark.pandas._typing import Dtype, Label, Name, Scalar
62
+ from pyspark.pandas.config import get_option, option_context
63
+ from pyspark.pandas.base import IndexOpsMixin
64
+ from pyspark.pandas.frame import DataFrame
65
+ from pyspark.pandas.missing.indexes import MissingPandasLikeIndex
66
+ from pyspark.pandas.series import Series, first_series
67
+ from pyspark.pandas.spark.accessors import SparkIndexMethods
68
+ from pyspark.pandas.utils import (
69
+ is_name_like_tuple,
70
+ is_name_like_value,
71
+ name_like_string,
72
+ same_anchor,
73
+ scol_for,
74
+ verify_temp_column_name,
75
+ validate_bool_kwarg,
76
+ validate_index_loc,
77
+ ERROR_MESSAGE_CANNOT_COMBINE,
78
+ log_advice,
79
+ )
80
+ from pyspark.pandas.internal import (
81
+ InternalField,
82
+ InternalFrame,
83
+ DEFAULT_SERIES_NAME,
84
+ SPARK_DEFAULT_INDEX_NAME,
85
+ SPARK_INDEX_NAME_FORMAT,
86
+ )
87
+
88
+ if TYPE_CHECKING:
89
+ from pyspark.pandas.spark.accessors import SparkIndexOpsMethods
90
+
91
+
92
+ class Index(IndexOpsMixin):
93
+ """
94
+ pandas-on-Spark Index that corresponds to pandas Index logically. This might hold Spark Column
95
+ internally.
96
+
97
+ Parameters
98
+ ----------
99
+ data : array-like (1-dimensional)
100
+ dtype : dtype, default None
101
+ If dtype is None, we find the dtype that best fits the data.
102
+ If an actual dtype is provided, we coerce to that dtype if it's safe.
103
+ Otherwise, an error will be raised.
104
+ copy : bool
105
+ Make a copy of input ndarray.
106
+ name : object
107
+ Name to be stored in the index.
108
+ tupleize_cols : bool (default: True)
109
+ When True, attempt to create a MultiIndex if possible.
110
+
111
+ See Also
112
+ --------
113
+ MultiIndex : A multi-level, or hierarchical, Index.
114
+ DatetimeIndex : Index of datetime64 data.
115
+ Int64Index : A special case of :class:`Index` with purely integer labels.
116
+ Float64Index : A special case of :class:`Index` with purely float labels.
117
+
118
+ Examples
119
+ --------
120
+ >>> ps.DataFrame({'a': ['a', 'b', 'c']}, index=[1, 2, 3]).index # doctest: +SKIP
121
+ Int64Index([1, 2, 3], dtype='int64')
122
+
123
+ >>> ps.DataFrame({'a': [1, 2, 3]}, index=list('abc')).index # doctest: +SKIP
124
+ Index(['a', 'b', 'c'], dtype='object')
125
+
126
+ >>> ps.Index([1, 2, 3]) # doctest: +SKIP
127
+ Int64Index([1, 2, 3], dtype='int64')
128
+
129
+ >>> ps.Index(list('abc'))
130
+ Index(['a', 'b', 'c'], dtype='object')
131
+
132
+ From a Series:
133
+
134
+ >>> s = ps.Series([1, 2, 3], index=[10, 20, 30])
135
+ >>> ps.Index(s) # doctest: +SKIP
136
+ Int64Index([1, 2, 3], dtype='int64')
137
+
138
+ From an Index:
139
+
140
+ >>> idx = ps.Index([1, 2, 3])
141
+ >>> ps.Index(idx) # doctest: +SKIP
142
+ Int64Index([1, 2, 3], dtype='int64')
143
+ """
144
+
145
+ def __new__(
146
+ cls,
147
+ data: Optional[Any] = None,
148
+ dtype: Optional[Union[str, Dtype]] = None,
149
+ copy: bool = False,
150
+ name: Optional[Name] = None,
151
+ tupleize_cols: bool = True,
152
+ **kwargs: Any,
153
+ ) -> "Index":
154
+ if not is_hashable(name):
155
+ raise TypeError("Index.name must be a hashable type")
156
+
157
+ if isinstance(data, Series):
158
+ if dtype is not None:
159
+ data = data.astype(dtype)
160
+ if name is not None:
161
+ data = data.rename(name)
162
+
163
+ internal = InternalFrame(
164
+ spark_frame=data._internal.spark_frame,
165
+ index_spark_columns=data._internal.data_spark_columns,
166
+ index_names=data._internal.column_labels,
167
+ index_fields=data._internal.data_fields,
168
+ column_labels=[],
169
+ data_spark_columns=[],
170
+ data_fields=[],
171
+ )
172
+ return DataFrame(internal).index
173
+ elif isinstance(data, Index):
174
+ if copy:
175
+ data = data.copy()
176
+ if dtype is not None:
177
+ data = data.astype(dtype)
178
+ if name is not None:
179
+ data = data.rename(name)
180
+ return data
181
+
182
+ return cast(
183
+ Index,
184
+ ps.from_pandas(
185
+ pd.Index(
186
+ data=data,
187
+ dtype=dtype,
188
+ copy=copy,
189
+ name=name,
190
+ tupleize_cols=tupleize_cols,
191
+ **kwargs,
192
+ )
193
+ ),
194
+ )
195
+
196
+ @staticmethod
197
+ def _new_instance(anchor: DataFrame) -> "Index":
198
+ from pyspark.pandas.indexes.category import CategoricalIndex
199
+ from pyspark.pandas.indexes.datetimes import DatetimeIndex
200
+ from pyspark.pandas.indexes.multi import MultiIndex
201
+ from pyspark.pandas.indexes.numeric import Float64Index, Int64Index
202
+ from pyspark.pandas.indexes.timedelta import TimedeltaIndex
203
+
204
+ instance: Index
205
+ if anchor._internal.index_level > 1:
206
+ instance = object.__new__(MultiIndex)
207
+ elif isinstance(anchor._internal.index_fields[0].dtype, CategoricalDtype):
208
+ instance = object.__new__(CategoricalIndex)
209
+ elif isinstance(
210
+ anchor._internal.spark_type_for(anchor._internal.index_spark_columns[0]), IntegralType
211
+ ):
212
+ instance = object.__new__(Int64Index)
213
+ elif isinstance(
214
+ anchor._internal.spark_type_for(anchor._internal.index_spark_columns[0]), FractionalType
215
+ ):
216
+ instance = object.__new__(Float64Index)
217
+ elif isinstance(
218
+ anchor._internal.spark_type_for(anchor._internal.index_spark_columns[0]),
219
+ (TimestampType, TimestampNTZType),
220
+ ):
221
+ instance = object.__new__(DatetimeIndex)
222
+ elif isinstance(
223
+ anchor._internal.spark_type_for(anchor._internal.index_spark_columns[0]),
224
+ DayTimeIntervalType,
225
+ ):
226
+ instance = object.__new__(TimedeltaIndex)
227
+ else:
228
+ instance = object.__new__(Index)
229
+
230
+ instance._anchor = anchor # type: ignore[attr-defined]
231
+ return instance
232
+
233
+ @property
234
+ def _psdf(self) -> DataFrame:
235
+ return self._anchor
236
+
237
+ @property
238
+ def _internal(self) -> InternalFrame:
239
+ internal = self._psdf._internal
240
+ return internal.copy(
241
+ column_labels=internal.index_names,
242
+ data_spark_columns=internal.index_spark_columns,
243
+ data_fields=internal.index_fields,
244
+ column_label_names=None,
245
+ )
246
+
247
+ @property
248
+ def _column_label(self) -> Optional[Label]:
249
+ return self._psdf._internal.index_names[0]
250
+
251
+ def _with_new_scol(self, scol: Column, *, field: Optional[InternalField] = None) -> "Index":
252
+ """
253
+ Copy pandas-on-Spark Index with the new Spark Column.
254
+
255
+ :param scol: the new Spark Column
256
+ :return: the copied Index
257
+ """
258
+ internal = self._internal.copy(
259
+ index_spark_columns=[scol.alias(SPARK_DEFAULT_INDEX_NAME)],
260
+ index_fields=[
261
+ field
262
+ if field is None or field.struct_field is None
263
+ else field.copy(name=SPARK_DEFAULT_INDEX_NAME)
264
+ ],
265
+ column_labels=[],
266
+ data_spark_columns=[],
267
+ data_fields=[],
268
+ )
269
+ return DataFrame(internal).index
270
+
271
+ spark: "SparkIndexOpsMethods" = CachedAccessor( # type: ignore[assignment]
272
+ "spark", SparkIndexMethods
273
+ )
274
+
275
+ # This method is used via `DataFrame.info` API internally.
276
+ def _summary(self, name: Optional[str] = None) -> str:
277
+ """
278
+ Return a summarized representation.
279
+
280
+ Parameters
281
+ ----------
282
+ name : str
283
+ name to use in the summary representation
284
+
285
+ Returns
286
+ -------
287
+ String with a summarized representation of the index
288
+ """
289
+ head, tail, total_count = tuple(
290
+ self._internal.spark_frame.select(
291
+ F.first(self.spark.column), F.last(self.spark.column), F.count(F.expr("*"))
292
+ )
293
+ .toPandas()
294
+ .iloc[0]
295
+ )
296
+
297
+ if total_count > 0:
298
+ index_summary = ", %s to %s" % (pprint_thing(head), pprint_thing(tail))
299
+ else:
300
+ index_summary = ""
301
+
302
+ if name is None:
303
+ name = type(self).__name__
304
+ return "%s: %s entries%s" % (name, total_count, index_summary)
305
+
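# [Editor's note - illustrative sketch, not part of the packaged file.] `_summary` formats the
# first/last values and the row count computed above, so for a small index the string looks like
# the following (first/last come from an unordered Spark frame, so treat the output as indicative):
#
#   >>> ps.Index(['a', 'b', 'c'])._summary()  # doctest: +SKIP
#   'Index: 3 entries, a to c'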
306
+ @property
307
+ def size(self) -> int:
308
+ """
309
+ Return an int representing the number of elements in this object.
310
+
311
+ Examples
312
+ --------
313
+ >>> df = ps.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)],
314
+ ... columns=['dogs', 'cats'],
315
+ ... index=list('abcd'))
316
+ >>> df.index.size
317
+ 4
318
+
319
+ >>> df.set_index('dogs', append=True).index.size
320
+ 4
321
+ """
322
+ return len(self)
323
+
324
+ @property
325
+ def shape(self) -> tuple:
326
+ """
327
+ Return a tuple of the shape of the underlying data.
328
+
329
+ Examples
330
+ --------
331
+ >>> idx = ps.Index(['a', 'b', 'c'])
332
+ >>> idx
333
+ Index(['a', 'b', 'c'], dtype='object')
334
+ >>> idx.shape
335
+ (3,)
336
+
337
+ >>> midx = ps.MultiIndex.from_tuples([('a', 'x'), ('b', 'y'), ('c', 'z')])
338
+ >>> midx # doctest: +SKIP
339
+ MultiIndex([('a', 'x'),
340
+ ('b', 'y'),
341
+ ('c', 'z')],
342
+ )
343
+ >>> midx.shape
344
+ (3,)
345
+ """
346
+ return (len(self._psdf),)
347
+
348
+ def identical(self, other: "Index") -> bool:
349
+ """
350
+ Similar to equals, but check that other comparable attributes are
351
+ also equal.
352
+
353
+ Returns
354
+ -------
355
+ bool
356
+ True if the two Index objects have equal elements and the same type,
357
+ otherwise False.
358
+
359
+ Examples
360
+ --------
361
+
362
+ >>> from pyspark.pandas.config import option_context
363
+ >>> idx = ps.Index(['a', 'b', 'c'])
364
+ >>> midx = ps.MultiIndex.from_tuples([('a', 'x'), ('b', 'y'), ('c', 'z')])
365
+
366
+ For Index
367
+
368
+ >>> idx.identical(idx)
369
+ True
370
+ >>> with option_context('compute.ops_on_diff_frames', True):
371
+ ... idx.identical(ps.Index(['a', 'b', 'c']))
372
+ True
373
+ >>> with option_context('compute.ops_on_diff_frames', True):
374
+ ... idx.identical(ps.Index(['b', 'b', 'a']))
375
+ False
376
+ >>> idx.identical(midx)
377
+ False
378
+
379
+ For MultiIndex
380
+
381
+ >>> midx.identical(midx)
382
+ True
383
+ >>> with option_context('compute.ops_on_diff_frames', True):
384
+ ... midx.identical(ps.MultiIndex.from_tuples([('a', 'x'), ('b', 'y'), ('c', 'z')]))
385
+ True
386
+ >>> with option_context('compute.ops_on_diff_frames', True):
387
+ ... midx.identical(ps.MultiIndex.from_tuples([('c', 'z'), ('b', 'y'), ('a', 'x')]))
388
+ False
389
+ >>> midx.identical(idx)
390
+ False
391
+ """
392
+ from pyspark.pandas.indexes.multi import MultiIndex
393
+
394
+ self_name = self.names if isinstance(self, MultiIndex) else self.name
395
+ other_name = other.names if isinstance(other, MultiIndex) else other.name
396
+
397
+ return (
398
+ self_name == other_name # to support non-index comparison by short-circuiting.
399
+ and self.equals(other)
400
+ )
401
+
402
+ def equals(self, other: "Index") -> bool:
403
+ """
404
+ Determine if two Index objects contain the same elements.
405
+
406
+ Returns
407
+ -------
408
+ bool
409
+ True if "other" is an Index and it has the same elements as the calling
410
+ index; False otherwise.
411
+
412
+ Examples
413
+ --------
414
+
415
+ >>> from pyspark.pandas.config import option_context
416
+ >>> idx = ps.Index(['a', 'b', 'c'])
417
+ >>> idx.name = "name"
418
+ >>> midx = ps.MultiIndex.from_tuples([('a', 'x'), ('b', 'y'), ('c', 'z')])
419
+ >>> midx.names = ("nameA", "nameB")
420
+
421
+ For Index
422
+
423
+ >>> idx.equals(idx)
424
+ True
425
+ >>> with option_context('compute.ops_on_diff_frames', True):
426
+ ... idx.equals(ps.Index(['a', 'b', 'c']))
427
+ True
428
+ >>> with option_context('compute.ops_on_diff_frames', True):
429
+ ... idx.equals(ps.Index(['b', 'b', 'a']))
430
+ False
431
+ >>> idx.equals(midx)
432
+ False
433
+
434
+ For MultiIndex
435
+
436
+ >>> midx.equals(midx)
437
+ True
438
+ >>> with option_context('compute.ops_on_diff_frames', True):
439
+ ... midx.equals(ps.MultiIndex.from_tuples([('a', 'x'), ('b', 'y'), ('c', 'z')]))
440
+ True
441
+ >>> with option_context('compute.ops_on_diff_frames', True):
442
+ ... midx.equals(ps.MultiIndex.from_tuples([('c', 'z'), ('b', 'y'), ('a', 'x')]))
443
+ False
444
+ >>> midx.equals(idx)
445
+ False
446
+ """
447
+ if same_anchor(self, other):
448
+ return True
449
+ elif type(self) == type(other):
450
+ if get_option("compute.ops_on_diff_frames"):
451
+ # TODO: avoid using default index?
452
+ with option_context("compute.default_index_type", "distributed-sequence"):
453
+ # Directly using Series from both self and other seems causing
454
+ # some exceptions when 'compute.ops_on_diff_frames' is enabled.
455
+ # Working around for now via using frames.
456
+ return (
457
+ cast(Series, self.to_series("self").reset_index(drop=True))
458
+ == cast(Series, other.to_series("other").reset_index(drop=True))
459
+ ).all()
460
+ else:
461
+ raise ValueError(ERROR_MESSAGE_CANNOT_COMBINE)
462
+ else:
463
+ return False
464
+
465
+ def transpose(self) -> "Index":
466
+ """
467
+ Return the transpose. For an Index, this is the index itself.
468
+
469
+ Examples
470
+ --------
471
+ >>> idx = ps.Index(['a', 'b', 'c'])
472
+ >>> idx
473
+ Index(['a', 'b', 'c'], dtype='object')
474
+
475
+ >>> idx.transpose()
476
+ Index(['a', 'b', 'c'], dtype='object')
477
+
478
+ For MultiIndex
479
+
480
+ >>> midx = ps.MultiIndex.from_tuples([('a', 'x'), ('b', 'y'), ('c', 'z')])
481
+ >>> midx # doctest: +SKIP
482
+ MultiIndex([('a', 'x'),
483
+ ('b', 'y'),
484
+ ('c', 'z')],
485
+ )
486
+
487
+ >>> midx.transpose() # doctest: +SKIP
488
+ MultiIndex([('a', 'x'),
489
+ ('b', 'y'),
490
+ ('c', 'z')],
491
+ )
492
+ """
493
+ return self
494
+
495
+ T = property(transpose)
496
+
497
+ def _to_internal_pandas(self) -> pd.Index:
498
+ """
499
+ Return a pandas Index directly from _internal to avoid overhead of copy.
500
+
501
+ This method is for internal use only.
502
+ """
503
+ return self._psdf._internal.to_pandas_frame.index
504
+
505
+ def to_pandas(self) -> pd.Index:
506
+ """
507
+ Return a pandas Index.
508
+
509
+ .. note:: This method should only be used if the resulting pandas object is expected
510
+ to be small, as all the data is loaded into the driver's memory.
511
+
512
+ Examples
513
+ --------
514
+ >>> df = ps.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)],
515
+ ... columns=['dogs', 'cats'],
516
+ ... index=list('abcd'))
517
+ >>> df['dogs'].index.to_pandas()
518
+ Index(['a', 'b', 'c', 'd'], dtype='object')
519
+ """
520
+ log_advice(
521
+ "`to_pandas` loads all data into the driver's memory. "
522
+ "It should only be used if the resulting pandas Index is expected to be small."
523
+ )
524
+ return self._to_pandas()
525
+
526
+ def _to_pandas(self) -> pd.Index:
527
+ """
528
+ Same as `to_pandas()`, without issuing the advice log for internal usage.
529
+ """
530
+ return self._to_internal_pandas().copy()
531
+
532
+ def to_numpy(self, dtype: Optional[Union[str, Dtype]] = None, copy: bool = False) -> np.ndarray:
533
+ """
534
+ A NumPy ndarray representing the values in this Index or MultiIndex.
535
+
536
+ .. note:: This method should only be used if the resulting NumPy ndarray is expected
537
+ to be small, as all the data is loaded into the driver's memory.
538
+
539
+ Parameters
540
+ ----------
541
+ dtype : str or numpy.dtype, optional
542
+ The dtype to pass to :meth:`numpy.asarray`
543
+ copy : bool, default False
544
+ Whether to ensure that the returned value is not a view on
545
+ another array. Note that ``copy=False`` does not *ensure* that
546
+ ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
547
+ a copy is made, even if not strictly necessary.
548
+
549
+ Returns
550
+ -------
551
+ numpy.ndarray
552
+
553
+ Examples
554
+ --------
555
+ >>> ps.Series([1, 2, 3, 4]).index.to_numpy()
556
+ array([0, 1, 2, 3])
557
+ >>> ps.DataFrame({'a': ['a', 'b', 'c']}, index=[[1, 2, 3], [4, 5, 6]]).index.to_numpy()
558
+ array([(1, 4), (2, 5), (3, 6)], dtype=object)
559
+ """
560
+ log_advice(
561
+ "`to_numpy` loads all data into the driver's memory. "
562
+ "It should only be used if the resulting NumPy ndarray is expected to be small."
563
+ )
564
+ result = np.asarray(
565
+ self._to_internal_pandas()._values, dtype=dtype # type: ignore[arg-type,attr-defined]
566
+ )
567
+ if copy:
568
+ result = result.copy()
569
+ return result
570
+
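# [Editor's note - illustrative sketch, not part of the packaged file.] `dtype` is forwarded to
# `numpy.asarray` and `copy=True` forces a defensive copy of the collected values, e.g.:
#
#   >>> ps.Index([1, 2, 3]).to_numpy(dtype="float64")  # doctest: +SKIP
#   array([1., 2., 3.])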
571
+ def map(
572
+ self, mapper: Union[dict, Callable[[Any], Any], pd.Series], na_action: Optional[str] = None
573
+ ) -> "Index":
574
+ """
575
+ Map values using input correspondence (a dict, Series, or function).
576
+
577
+ Parameters
578
+ ----------
579
+ mapper : function, dict, or pd.Series
580
+ Mapping correspondence.
581
+ na_action : {None, 'ignore'}
582
+ If ‘ignore’, propagate NA values, without passing them to the mapping correspondence.
583
+
584
+ Returns
585
+ -------
586
+ applied : Index, inferred
587
+ The output of the mapping function applied to the index.
588
+
589
+ Examples
590
+ --------
591
+ >>> psidx = ps.Index([1, 2, 3])
592
+
593
+ >>> psidx.map({1: "one", 2: "two", 3: "three"})
594
+ Index(['one', 'two', 'three'], dtype='object')
595
+
596
+ >>> psidx.map(lambda id: "{id} + 1".format(id=id))
597
+ Index(['1 + 1', '2 + 1', '3 + 1'], dtype='object')
598
+
599
+ >>> pser = pd.Series(["one", "two", "three"], index=[1, 2, 3])
600
+ >>> psidx.map(pser)
601
+ Index(['one', 'two', 'three'], dtype='object')
602
+ """
603
+ if isinstance(mapper, dict):
604
+ if len(set(type(k) for k in mapper.values())) > 1:
605
+ raise TypeError(
606
+ "If the mapper is a dictionary, its values must be of the same type"
607
+ )
608
+
609
+ return Index(
610
+ self.to_series().pandas_on_spark.transform_batch(
611
+ lambda pser: pser.map(mapper, na_action)
612
+ )
613
+ ).rename(self.name)
614
+
615
+ @property
616
+ def values(self) -> np.ndarray:
617
+ """
618
+ Return an array representing the data in the Index.
619
+
620
+ .. warning:: We recommend using `Index.to_numpy()` instead.
621
+
622
+ .. note:: This method should only be used if the resulting NumPy ndarray is expected
623
+ to be small, as all the data is loaded into the driver's memory.
624
+
625
+ Returns
626
+ -------
627
+ numpy.ndarray
628
+
629
+ Examples
630
+ --------
631
+ >>> ps.Series([1, 2, 3, 4]).index.values
632
+ array([0, 1, 2, 3])
633
+ >>> ps.DataFrame({'a': ['a', 'b', 'c']}, index=[[1, 2, 3], [4, 5, 6]]).index.values
634
+ array([(1, 4), (2, 5), (3, 6)], dtype=object)
635
+ """
636
+ warnings.warn("We recommend using `{}.to_numpy()` instead.".format(type(self).__name__))
637
+ return self.to_numpy()
638
+
639
+ @property
640
+ def asi8(self) -> np.ndarray:
641
+ """
642
+ Integer representation of the values.
643
+
644
+ .. warning:: We recommend using `Index.to_numpy()` instead.
645
+
646
+ .. note:: This method should only be used if the resulting NumPy ndarray is expected
647
+ to be small, as all the data is loaded into the driver's memory.
648
+
649
+ .. deprecated:: 3.4.0
650
+
651
+ Returns
652
+ -------
653
+ numpy.ndarray
654
+ An ndarray with int64 dtype.
655
+
656
+ Examples
657
+ --------
658
+ >>> ps.Index([1, 2, 3]).asi8
659
+ array([1, 2, 3])
660
+
661
+ Returns None for non-int64 dtype
662
+
663
+ >>> ps.Index(['a', 'b', 'c']).asi8 is None
664
+ True
665
+ """
666
+ warnings.warn(
667
+ "Index.asi8 is deprecated and will be removed in 4.0.0. " "Use Index.astype instead.",
668
+ FutureWarning,
669
+ )
670
+ if isinstance(self.spark.data_type, IntegralType):
671
+ return self.to_numpy()
672
+ elif isinstance(self.spark.data_type, (TimestampType, TimestampNTZType)):
673
+ return np.array(list(map(lambda x: x.astype(np.int64), self.to_numpy())))
674
+ else:
675
+ return None
676
+
677
+ @property
678
+ def has_duplicates(self) -> bool:
679
+ """
680
+ If index has duplicates, return True, otherwise False.
681
+
682
+ Examples
683
+ --------
684
+ >>> idx = ps.Index([1, 5, 7, 7])
685
+ >>> idx.has_duplicates
686
+ True
687
+
688
+ >>> idx = ps.Index([1, 5, 7])
689
+ >>> idx.has_duplicates
690
+ False
691
+
692
+ >>> idx = ps.Index(["Watermelon", "Orange", "Apple",
693
+ ... "Watermelon"])
694
+ >>> idx.has_duplicates
695
+ True
696
+
697
+ >>> idx = ps.Index(["Orange", "Apple",
698
+ ... "Watermelon"])
699
+ >>> idx.has_duplicates
700
+ False
701
+ """
702
+ sdf = self._internal.spark_frame.select(self.spark.column)
703
+ scol = scol_for(sdf, sdf.columns[0])
704
+
705
+ return sdf.select(F.count(scol) != F.countDistinct(scol)).first()[0]
706
+
707
+ @property
708
+ def is_unique(self) -> bool:
709
+ """
710
+ Return if the index has unique values.
711
+
712
+ Examples
713
+ --------
714
+ >>> idx = ps.Index([1, 5, 7, 7])
715
+ >>> idx.is_unique
716
+ False
717
+
718
+ >>> idx = ps.Index([1, 5, 7])
719
+ >>> idx.is_unique
720
+ True
721
+
722
+ >>> idx = ps.Index(["Watermelon", "Orange", "Apple",
723
+ ... "Watermelon"])
724
+ >>> idx.is_unique
725
+ False
726
+
727
+ >>> idx = ps.Index(["Orange", "Apple",
728
+ ... "Watermelon"])
729
+ >>> idx.is_unique
730
+ True
731
+ """
732
+ return not self.has_duplicates
733
+
734
+ @property
735
+ def name(self) -> Name:
736
+ """Return name of the Index."""
737
+ return self.names[0]
738
+
739
+ @name.setter
740
+ def name(self, name: Name) -> None:
741
+ self.names = [name]
742
+
743
+ @property
744
+ def names(self) -> List[Name]:
745
+ """Return names of the Index."""
746
+ return [
747
+ name if name is None or len(name) > 1 else name[0]
748
+ for name in self._internal.index_names
749
+ ]
750
+
751
+ @names.setter
752
+ def names(self, names: List[Name]) -> None:
753
+ if not is_list_like(names):
754
+ raise ValueError("Names must be a list-like")
755
+ if self._internal.index_level != len(names):
756
+ raise ValueError(
757
+ "Length of new names must be {}, got {}".format(
758
+ self._internal.index_level, len(names)
759
+ )
760
+ )
761
+ if self._internal.index_level == 1:
762
+ self.rename(names[0], inplace=True)
763
+ else:
764
+ self.rename(names, inplace=True)
765
+
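# [Editor's note - illustrative sketch, not part of the packaged file.] The setter above checks the
# name count against the index level before delegating to `rename`, e.g.:
#
#   >>> idx = ps.Index([1, 2, 3], name="a")  # doctest: +SKIP
#   >>> idx.names = ["b"]       # single-level index: exactly one name accepted
#   >>> idx.names = ["b", "c"]  # raises ValueError: Length of new names must be 1, got 2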
766
+ @property
767
+ def nlevels(self) -> int:
768
+ """
769
+ Number of levels in Index & MultiIndex.
770
+
771
+ Examples
772
+ --------
773
+ >>> psdf = ps.DataFrame({"a": [1, 2, 3]}, index=pd.Index(['a', 'b', 'c'], name="idx"))
774
+ >>> psdf.index.nlevels
775
+ 1
776
+
777
+ >>> psdf = ps.DataFrame({'a': [1, 2, 3]}, index=[list('abc'), list('def')])
778
+ >>> psdf.index.nlevels
779
+ 2
780
+ """
781
+ return self._internal.index_level
782
+
783
+ def rename(self, name: Union[Name, List[Name]], inplace: bool = False) -> Optional["Index"]:
784
+ """
785
+ Alter Index or MultiIndex name.
786
+ Able to set new names without level. Defaults to returning a new index.
787
+
788
+ Parameters
789
+ ----------
790
+ name : label or list of labels
791
+ Name(s) to set.
792
+ inplace : boolean, default False
793
+ Modifies the object directly, instead of creating a new Index or MultiIndex.
794
+
795
+ Returns
796
+ -------
797
+ Index or MultiIndex
798
+ The same type as the caller or None if inplace is True.
799
+
800
+ Examples
801
+ --------
802
+ >>> df = ps.DataFrame({'a': ['A', 'C'], 'b': ['A', 'B']}, columns=['a', 'b'])
803
+ >>> df.index.rename("c") # doctest: +SKIP
804
+ Int64Index([0, 1], dtype='int64', name='c')
805
+
806
+ >>> df.set_index("a", inplace=True)
807
+ >>> df.index.rename("d")
808
+ Index(['A', 'C'], dtype='object', name='d')
809
+
810
+ You can also change the index name in place.
811
+
812
+ >>> df.index.rename("e", inplace=True)
813
+ >>> df.index
814
+ Index(['A', 'C'], dtype='object', name='e')
815
+
816
+ >>> df # doctest: +NORMALIZE_WHITESPACE
817
+ b
818
+ e
819
+ A A
820
+ C B
821
+
822
+ Support for MultiIndex
823
+
824
+ >>> psidx = ps.MultiIndex.from_tuples([('a', 'x'), ('b', 'y')])
825
+ >>> psidx.names = ['hello', 'pandas-on-Spark']
826
+ >>> psidx # doctest: +SKIP
827
+ MultiIndex([('a', 'x'),
828
+ ('b', 'y')],
829
+ names=['hello', 'pandas-on-Spark'])
830
+
831
+ >>> psidx.rename(['aloha', 'databricks']) # doctest: +SKIP
832
+ MultiIndex([('a', 'x'),
833
+ ('b', 'y')],
834
+ names=['aloha', 'databricks'])
835
+ """
836
+ names = self._verify_for_rename(name)
837
+
838
+ internal = self._psdf._internal.copy(index_names=names)
839
+
840
+ if inplace:
841
+ self._psdf._update_internal_frame(internal)
842
+ return None
843
+ else:
844
+ return DataFrame(internal).index
845
+
846
+ def _verify_for_rename(self, name: Name) -> List[Label]:
847
+ if is_hashable(name):
848
+ if is_name_like_tuple(name):
849
+ return [name]
850
+ elif is_name_like_value(name):
851
+ return [(name,)]
852
+ raise TypeError("Index.name must be a hashable type")
853
+
854
+ # TODO: add downcast parameter for fillna function
855
+ def fillna(self, value: Scalar) -> "Index":
856
+ """
857
+ Fill NA/NaN values with the specified value.
858
+
859
+ Parameters
860
+ ----------
861
+ value : scalar
862
+ Scalar value to use to fill holes (example: 0). This value cannot be a list-like.
863
+
864
+ Returns
865
+ -------
866
+ Index :
867
+ filled with value
868
+
869
+ Examples
870
+ --------
871
+ >>> idx = ps.Index([1, 2, None])
872
+ >>> idx # doctest: +SKIP
873
+ Float64Index([1.0, 2.0, nan], dtype='float64')
874
+
875
+ >>> idx.fillna(0) # doctest: +SKIP
876
+ Float64Index([1.0, 2.0, 0.0], dtype='float64')
877
+ """
878
+ if not isinstance(value, (float, int, str, bool)):
879
+ raise TypeError("Unsupported type %s" % type(value).__name__)
880
+ sdf = self._internal.spark_frame.fillna(value)
881
+
882
+ internal = InternalFrame( # TODO: dtypes?
883
+ spark_frame=sdf,
884
+ index_spark_columns=[
885
+ scol_for(sdf, col) for col in self._internal.index_spark_column_names
886
+ ],
887
+ index_names=self._internal.index_names,
888
+ )
889
+ return DataFrame(internal).index
890
+
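# [Editor's note - illustrative sketch, not part of the packaged file.] Only scalar fill values
# (float, int, str, bool) pass the type check above; list-likes are rejected, e.g.:
#
#   >>> ps.Index([1.0, 2.0, None]).fillna([0])  # doctest: +SKIP
#   Traceback (most recent call last):
#       ...
#   TypeError: Unsupported type list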
891
+ def drop_duplicates(self, keep: Union[bool, str] = "first") -> "Index":
892
+ """
893
+ Return Index with duplicate values removed.
894
+
895
+ Parameters
896
+ ----------
897
+ keep : {'first', 'last', ``False``}, default 'first'
898
+ Method to handle dropping duplicates:
899
+ - 'first' : Drop duplicates except for the first occurrence.
900
+ - 'last' : Drop duplicates except for the last occurrence.
901
+ - ``False`` : Drop all duplicates.
902
+
903
+ Returns
904
+ -------
905
+ deduplicated : Index
906
+
907
+ See Also
908
+ --------
909
+ Series.drop_duplicates : Equivalent method on Series.
910
+ DataFrame.drop_duplicates : Equivalent method on DataFrame.
911
+
912
+ Examples
913
+ --------
914
+ Generate an Index with duplicate values.
915
+
916
+ >>> idx = ps.Index(['lama', 'cow', 'lama', 'beetle', 'lama', 'hippo'])
917
+
918
+ >>> idx.drop_duplicates().sort_values()
919
+ Index(['beetle', 'cow', 'hippo', 'lama'], dtype='object')
920
+ """
921
+ with ps.option_context("compute.default_index_type", "distributed"):
922
+ # The attached index caused by `reset_index` below is used for sorting only,
923
+ # and it will be dropped soon,
924
+ # so we enforce “distributed” default index type
925
+ psser = self.to_series().reset_index(drop=True)
926
+ return Index(psser.drop_duplicates(keep=keep).sort_index())
927
+
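# [Editor's note - illustrative sketch, not part of the packaged file.] `keep` follows the pandas
# semantics documented above; `keep=False` drops every value that occurs more than once, e.g.:
#
#   >>> idx = ps.Index(['lama', 'cow', 'lama', 'beetle', 'lama', 'hippo'])  # doctest: +SKIP
#   >>> idx.drop_duplicates(keep=False).sort_values()  # doctest: +SKIP
#   Index(['beetle', 'cow', 'hippo'], dtype='object')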
928
+ def to_series(self, name: Optional[Name] = None) -> Series:
929
+ """
930
+ Create a Series with both index and values equal to the index keys,
931
+ useful with map for returning an indexer based on an index.
932
+
933
+ Parameters
934
+ ----------
935
+ name : string, optional
936
+ name of resulting Series. If None, defaults to name of original
937
+ index
938
+
939
+ Returns
940
+ -------
941
+ Series : dtype will be based on the type of the Index values.
942
+
943
+ Examples
944
+ --------
945
+ >>> df = ps.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)],
946
+ ... columns=['dogs', 'cats'],
947
+ ... index=list('abcd'))
948
+ >>> df['dogs'].index.to_series()
949
+ a a
950
+ b b
951
+ c c
952
+ d d
953
+ dtype: object
954
+ """
955
+ if not is_hashable(name):
956
+ raise TypeError("Series.name must be a hashable type")
957
+ scol = self.spark.column
958
+ field = self._internal.data_fields[0]
959
+ if name is not None:
960
+ scol = scol.alias(name_like_string(name))
961
+ field = field.copy(name=name_like_string(name))
962
+ elif self._internal.index_level == 1:
963
+ name = self.name
964
+ column_labels: List[Optional[Label]] = [name if is_name_like_tuple(name) else (name,)]
965
+ internal = self._internal.copy(
966
+ column_labels=column_labels,
967
+ data_spark_columns=[scol],
968
+ data_fields=[field],
969
+ column_label_names=None,
970
+ )
971
+ return first_series(DataFrame(internal))
972
+
973
+ def to_frame(self, index: bool = True, name: Optional[Name] = None) -> DataFrame:
974
+ """
975
+ Create a DataFrame with a column containing the Index.
976
+
977
+ Parameters
978
+ ----------
979
+ index : boolean, default True
980
+ Set the index of the returned DataFrame as the original Index.
981
+ name : object, default None
982
+ The passed name should substitute for the index name (if it has
983
+ one).
984
+
985
+ Returns
986
+ -------
987
+ DataFrame
988
+ DataFrame containing the original Index data.
989
+
990
+ See Also
991
+ --------
992
+ Index.to_series : Convert an Index to a Series.
993
+ Series.to_frame : Convert Series to DataFrame.
994
+
995
+ Examples
996
+ --------
997
+ >>> idx = ps.Index(['Ant', 'Bear', 'Cow'], name='animal')
998
+ >>> idx.to_frame() # doctest: +NORMALIZE_WHITESPACE
999
+ animal
1000
+ animal
1001
+ Ant Ant
1002
+ Bear Bear
1003
+ Cow Cow
1004
+
1005
+ By default, the original Index is reused. To enforce a new Index:
1006
+
1007
+ >>> idx.to_frame(index=False)
1008
+ animal
1009
+ 0 Ant
1010
+ 1 Bear
1011
+ 2 Cow
1012
+
1013
+ To override the name of the resulting column, specify `name`:
1014
+
1015
+ >>> idx.to_frame(name='zoo') # doctest: +NORMALIZE_WHITESPACE
1016
+ zoo
1017
+ animal
1018
+ Ant Ant
1019
+ Bear Bear
1020
+ Cow Cow
1021
+ """
1022
+ if name is None:
1023
+ if self._internal.index_names[0] is None:
1024
+ name = (DEFAULT_SERIES_NAME,)
1025
+ else:
1026
+ name = self._internal.index_names[0]
1027
+ elif not is_name_like_tuple(name):
1028
+ if is_name_like_value(name):
1029
+ name = (name,)
1030
+ else:
1031
+ raise TypeError("unhashable type: '{}'".format(type(name).__name__))
1032
+
1033
+ return self._to_frame(index=index, names=[name])
1034
+
1035
+ def _to_frame(self, index: bool, names: List[Label]) -> DataFrame:
1036
+ if index:
1037
+ index_spark_columns = self._internal.index_spark_columns
1038
+ index_names = self._internal.index_names
1039
+ index_fields = self._internal.index_fields
1040
+ else:
1041
+ index_spark_columns = []
1042
+ index_names = []
1043
+ index_fields = []
1044
+
1045
+ internal = InternalFrame(
1046
+ spark_frame=self._internal.spark_frame,
1047
+ index_spark_columns=index_spark_columns,
1048
+ index_names=index_names,
1049
+ index_fields=index_fields,
1050
+ column_labels=names,
1051
+ data_spark_columns=self._internal.index_spark_columns,
1052
+ data_fields=self._internal.index_fields,
1053
+ )
1054
+ return DataFrame(internal)
1055
+
1056
+ def is_boolean(self) -> bool:
1057
+ """
1058
+ Return if the current index type is a boolean type.
1059
+
1060
+ Examples
1061
+ --------
1062
+ >>> ps.DataFrame({'a': [1]}, index=[True]).index.is_boolean()
1063
+ True
1064
+ """
1065
+ return is_bool_dtype(self.dtype)
1066
+
1067
+ def is_categorical(self) -> bool:
1068
+ """
1069
+ Return if the current index type is a categorical type.
1070
+
1071
+ Examples
1072
+ --------
1073
+ >>> ps.DataFrame({'a': [1]}, index=[1]).index.is_categorical()
1074
+ False
1075
+ """
1076
+ return is_categorical_dtype(self.dtype)
1077
+
1078
+ def is_floating(self) -> bool:
1079
+ """
1080
+ Return if the current index type is a floating type.
1081
+
1082
+ Examples
1083
+ --------
1084
+ >>> ps.DataFrame({'a': [1]}, index=[1]).index.is_floating()
1085
+ False
1086
+ """
1087
+ return is_float_dtype(self.dtype)
1088
+
1089
+ def is_integer(self) -> bool:
1090
+ """
1091
+ Return if the current index type is an integer type.
1092
+
1093
+ Examples
1094
+ --------
1095
+ >>> ps.DataFrame({'a': [1]}, index=[1]).index.is_integer()
1096
+ True
1097
+ """
1098
+ return is_integer_dtype(self.dtype)
1099
+
1100
+ def is_interval(self) -> bool:
1101
+ """
1102
+ Return if the current index type is an interval type.
1103
+
1104
+ Examples
1105
+ --------
1106
+ >>> ps.DataFrame({'a': [1]}, index=[1]).index.is_interval()
1107
+ False
1108
+ """
1109
+ return is_interval_dtype(self.dtype)
1110
+
1111
+ def is_numeric(self) -> bool:
1112
+ """
1113
+ Return if the current index type is a numeric type.
1114
+
1115
+ Examples
1116
+ --------
1117
+ >>> ps.DataFrame({'a': [1]}, index=[1]).index.is_numeric()
1118
+ True
1119
+ """
1120
+ return is_numeric_dtype(self.dtype)
1121
+
1122
+ def is_object(self) -> bool:
1123
+ """
1124
+ Return if the current index type is an object type.
1125
+
1126
+ Examples
1127
+ --------
1128
+ >>> ps.DataFrame({'a': [1]}, index=["a"]).index.is_object()
1129
+ True
1130
+ """
1131
+ return is_object_dtype(self.dtype)
1132
+
1133
+ def is_type_compatible(self, kind: str) -> bool:
1134
+ """
1135
+ Whether the index type is compatible with the provided type.
1136
+
1137
+ .. deprecated:: 3.4.0
1138
+
1139
+ Examples
1140
+ --------
1141
+ >>> psidx = ps.Index([1, 2, 3])
1142
+ >>> psidx.is_type_compatible('integer')
1143
+ True
1144
+
1145
+ >>> psidx = ps.Index([1.0, 2.0, 3.0])
1146
+ >>> psidx.is_type_compatible('integer')
1147
+ False
1148
+ >>> psidx.is_type_compatible('floating')
1149
+ True
1150
+ """
1151
+ warnings.warn(
1152
+ "Index.is_type_compatible is deprecated and will be removed in 4.0.0. "
1153
+ "Use Index.isin instead.",
1154
+ FutureWarning,
1155
+ )
1156
+ return kind == self.inferred_type
1157
+
1158
+ def dropna(self, how: str = "any") -> "Index":
1159
+ """
1160
+ Return Index or MultiIndex without NA/NaN values
1161
+
1162
+ Parameters
1163
+ ----------
1164
+ how : {'any', 'all'}, default 'any'
1165
+ If the Index is a MultiIndex, drop the value when any or all levels
1166
+ are NaN.
1167
+
1168
+ Returns
1169
+ -------
1170
+ Index or MultiIndex
1171
+
1172
+ Examples
1173
+ --------
1174
+
1175
+ >>> df = ps.DataFrame([[1, 2], [4, 5], [7, 8]],
1176
+ ... index=['cobra', 'viper', None],
1177
+ ... columns=['max_speed', 'shield'])
1178
+ >>> df # doctest: +SKIP
1179
+ max_speed shield
1180
+ cobra 1 2
1181
+ viper 4 5
1182
+ None 7 8
1183
+
1184
+ >>> df.index.dropna()
1185
+ Index(['cobra', 'viper'], dtype='object')
1186
+
1187
+ Also supports MultiIndex
1188
+
1189
+
1190
+ >>> tuples = [(np.nan, 1.0), (2.0, 2.0), (np.nan, np.nan), (3.0, np.nan)]
1191
+ >>> midx = ps.MultiIndex.from_tuples(tuples)
1192
+ >>> midx # doctest: +SKIP
1193
+ MultiIndex([(nan, 1.0),
1194
+ (2.0, 2.0),
1195
+ (nan, nan),
1196
+ (3.0, nan)],
1197
+ )
1198
+
1199
+ >>> midx.dropna() # doctest: +SKIP
1200
+ MultiIndex([(2.0, 2.0)],
1201
+ )
1202
+
1203
+ >>> midx.dropna(how="all") # doctest: +SKIP
1204
+ MultiIndex([(nan, 1.0),
1205
+ (2.0, 2.0),
1206
+ (3.0, nan)],
1207
+ )
1208
+ """
1209
+ if how not in ("any", "all"):
1210
+ raise ValueError("invalid how option: %s" % how)
1211
+
1212
+ sdf = self._internal.spark_frame.select(self._internal.index_spark_columns).dropna(how=how)
1213
+ internal = InternalFrame(
1214
+ spark_frame=sdf,
1215
+ index_spark_columns=[
1216
+ scol_for(sdf, col) for col in self._internal.index_spark_column_names
1217
+ ],
1218
+ index_names=self._internal.index_names,
1219
+ index_fields=self._internal.index_fields,
1220
+ )
1221
+ return DataFrame(internal).index
1222
+
1223
+ def unique(self, level: Optional[Union[int, Name]] = None) -> "Index":
1224
+ """
1225
+ Return unique values in the index.
1226
+
1227
+ Be aware that the order of unique values might differ from pandas.Index.unique
1228
+
1229
+ Parameters
1230
+ ----------
1231
+ level : int or str, optional, default is None
1232
+
1233
+ Returns
1234
+ -------
1235
+ Index without duplicates
1236
+
1237
+ See Also
1238
+ --------
1239
+ Series.unique
1240
+ groupby.SeriesGroupBy.unique
1241
+
1242
+ Examples
1243
+ --------
1244
+ >>> ps.DataFrame({'a': ['a', 'b', 'c']}, index=[1, 1, 3]).index.unique().sort_values()
1245
+ ... # doctest: +SKIP
1246
+ Int64Index([1, 3], dtype='int64')
1247
+
1248
+ >>> ps.DataFrame({'a': ['a', 'b', 'c']}, index=['d', 'e', 'e']).index.unique().sort_values()
1249
+ Index(['d', 'e'], dtype='object')
1250
+
1251
+ MultiIndex
1252
+
1253
+ >>> ps.MultiIndex.from_tuples([("A", "X"), ("A", "Y"), ("A", "X")]).unique()
1254
+ ... # doctest: +SKIP
1255
+ MultiIndex([('A', 'X'),
1256
+ ('A', 'Y')],
1257
+ )
1258
+ """
1259
+ if level is not None:
1260
+ self._validate_index_level(level)
1261
+ scols = self._internal.index_spark_columns
1262
+ sdf = self._psdf._internal.spark_frame.select(scols).distinct()
1263
+ return DataFrame(
1264
+ InternalFrame(
1265
+ spark_frame=sdf,
1266
+ index_spark_columns=[
1267
+ scol_for(sdf, col) for col in self._internal.index_spark_column_names
1268
+ ],
1269
+ index_names=self._internal.index_names,
1270
+ index_fields=self._internal.index_fields,
1271
+ )
1272
+ ).index
1273
+
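A small sketch (not part of the package) of the `level` argument, which is only validated for a single-level Index: position 0, -1, or the index name are accepted, and anything else raises. Assumes pyspark.pandas and a local Spark session.

import pyspark.pandas as ps

idx = ps.Index([1, 1, 3], name="id")

print(idx.unique(level=0).sort_values())     # level given by position
print(idx.unique(level="id").sort_values())  # level given by name

try:
    idx.unique(level=1)  # out of range for a single-level Index
except IndexError as exc:
    print(exc)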
1274
+ # TODO: add error parameter
1275
+ def drop(self, labels: List[Any]) -> "Index":
1276
+ """
1277
+ Make new Index with passed list of labels deleted.
1278
+
1279
+ Parameters
1280
+ ----------
1281
+ labels : array-like
1282
+
1283
+ Returns
1284
+ -------
1285
+ dropped : Index
1286
+
1287
+ Examples
1288
+ --------
1289
+ >>> index = ps.Index([1, 2, 3])
1290
+ >>> index # doctest: +SKIP
1291
+ Int64Index([1, 2, 3], dtype='int64')
1292
+
1293
+ >>> index.drop([1]) # doctest: +SKIP
1294
+ Int64Index([2, 3], dtype='int64')
1295
+ """
1296
+ internal = self._internal.resolved_copy
1297
+ sdf = internal.spark_frame[~internal.index_spark_columns[0].isin(labels)]
1298
+
1299
+ internal = InternalFrame(
1300
+ spark_frame=sdf,
1301
+ index_spark_columns=[
1302
+ scol_for(sdf, col) for col in self._internal.index_spark_column_names
1303
+ ],
1304
+ index_names=self._internal.index_names,
1305
+ index_fields=self._internal.index_fields,
1306
+ column_labels=[],
1307
+ data_spark_columns=[],
1308
+ data_fields=[],
1309
+ )
1310
+ return DataFrame(internal).index
1311
+
1312
+ def _validate_index_level(self, level: Union[int, Name]) -> None:
1313
+ """
1314
+ Validate index level.
1315
+ For a single-level Index, getting the level number is a no-op, but some
1316
+ verification must be done, as in MultiIndex.
1317
+ """
1318
+ if isinstance(level, int):
1319
+ if level < 0 and level != -1:
1320
+ raise IndexError(
1321
+ "Too many levels: Index has only 1 level,"
1322
+ " %d is not a valid level number" % (level,)
1323
+ )
1324
+ elif level > 0:
1325
+ raise IndexError("Too many levels:" " Index has only 1 level, not %d" % (level + 1))
1326
+ elif level != self.name:
1327
+ raise KeyError(
1328
+ "Requested level ({}) does not match index name ({})".format(level, self.name)
1329
+ )
1330
+
1331
+ def get_level_values(self, level: Union[int, Name]) -> "Index":
1332
+ """
1333
+ Return Index if a valid level is given.
1334
+
1335
+ Examples
1336
+ --------
1337
+ >>> psidx = ps.Index(['a', 'b', 'c'], name='ks')
1338
+ >>> psidx.get_level_values(0)
1339
+ Index(['a', 'b', 'c'], dtype='object', name='ks')
1340
+
1341
+ >>> psidx.get_level_values('ks')
1342
+ Index(['a', 'b', 'c'], dtype='object', name='ks')
1343
+ """
1344
+ self._validate_index_level(level)
1345
+ return self
1346
+
1347
+ def copy(self, name: Optional[Name] = None, deep: Optional[bool] = None) -> "Index":
1348
+ """
1349
+ Make a copy of this object. ``name`` sets that attribute on the new object.
1350
+
1351
+ Parameters
1352
+ ----------
1353
+ name : string, optional
1354
+ to set name of index
1355
+ deep : None
1356
+ this parameter is not supported; it is only a dummy parameter to match pandas.
1357
+
1358
+ Examples
1359
+ --------
1360
+ >>> df = ps.DataFrame([[1, 2], [4, 5], [7, 8]],
1361
+ ... index=['cobra', 'viper', 'sidewinder'],
1362
+ ... columns=['max_speed', 'shield'])
1363
+ >>> df
1364
+ max_speed shield
1365
+ cobra 1 2
1366
+ viper 4 5
1367
+ sidewinder 7 8
1368
+ >>> df.index
1369
+ Index(['cobra', 'viper', 'sidewinder'], dtype='object')
1370
+
1371
+ Copy index
1372
+
1373
+ >>> df.index.copy()
1374
+ Index(['cobra', 'viper', 'sidewinder'], dtype='object')
1375
+
1376
+ Copy index with name
1377
+
1378
+ >>> df.index.copy(name='snake')
1379
+ Index(['cobra', 'viper', 'sidewinder'], dtype='object', name='snake')
1380
+ """
1381
+ result = self._psdf[[]].index
1382
+ if name:
1383
+ result.name = name
1384
+ return result
1385
+
1386
+ def droplevel(self, level: Union[int, Name, List[Union[int, Name]]]) -> "Index":
1387
+ """
1388
+ Return index with requested level(s) removed.
1389
+ If resulting index has only 1 level left, the result will be
1390
+ of Index type, not MultiIndex.
1391
+
1392
+ Parameters
1393
+ ----------
1394
+ level : int, str, tuple, or list-like, default 0
1395
+ If a string is given, it must be the name of a level.
1396
+ If list-like, elements must be names or indexes of levels.
1397
+
1398
+ Returns
1399
+ -------
1400
+ Index or MultiIndex
1401
+
1402
+ Examples
1403
+ --------
1404
+ >>> midx = ps.DataFrame({'a': ['a', 'b']}, index=[['a', 'x'], ['b', 'y'], [1, 2]]).index
1405
+ >>> midx # doctest: +SKIP
1406
+ MultiIndex([('a', 'b', 1),
1407
+ ('x', 'y', 2)],
1408
+ )
1409
+ >>> midx.droplevel([0, 1]) # doctest: +SKIP
1410
+ Int64Index([1, 2], dtype='int64')
1411
+ >>> midx.droplevel(0) # doctest: +SKIP
1412
+ MultiIndex([('b', 1),
1413
+ ('y', 2)],
1414
+ )
1415
+ >>> midx.names = [("a", "b"), "b", "c"]
1416
+ >>> midx.droplevel([('a', 'b')]) # doctest: +SKIP
1417
+ MultiIndex([('b', 1),
1418
+ ('y', 2)],
1419
+ names=['b', 'c'])
1420
+ """
1421
+ names = self.names
1422
+ nlevels = self.nlevels
1423
+ if not is_list_like(level):
1424
+ levels = [cast(Union[int, Name], level)]
1425
+ else:
1426
+ levels = cast(List[Union[int, Name]], level)
1427
+
1428
+ int_level = set()
1429
+ for n in levels:
1430
+ if isinstance(n, int):
1431
+ if n < 0:
1432
+ n = n + nlevels
1433
+ if n < 0:
1434
+ raise IndexError(
1435
+ "Too many levels: Index has only {} levels, "
1436
+ "{} is not a valid level number".format(nlevels, (n - nlevels))
1437
+ )
1438
+ if n >= nlevels:
1439
+ raise IndexError(
1440
+ "Too many levels: Index has only {} levels, not {}".format(nlevels, n + 1)
1441
+ )
1442
+ else:
1443
+ if n not in names:
1444
+ raise KeyError("Level {} not found".format(n))
1445
+ n = names.index(n)
1446
+ int_level.add(n)
1447
+
1448
+ if len(levels) >= nlevels:
1449
+ raise ValueError(
1450
+ "Cannot remove {} levels from an index with {} "
1451
+ "levels: at least one level must be "
1452
+ "left.".format(len(levels), nlevels)
1453
+ )
1454
+
1455
+ index_spark_columns, index_names, index_fields = zip(
1456
+ *[
1457
+ item
1458
+ for i, item in enumerate(
1459
+ zip(
1460
+ self._internal.index_spark_columns,
1461
+ self._internal.index_names,
1462
+ self._internal.index_fields,
1463
+ )
1464
+ )
1465
+ if i not in int_level
1466
+ ]
1467
+ )
1468
+
1469
+ internal = self._internal.copy(
1470
+ index_spark_columns=list(index_spark_columns),
1471
+ index_names=list(index_names),
1472
+ index_fields=list(index_fields),
1473
+ column_labels=[],
1474
+ data_spark_columns=[],
1475
+ data_fields=[],
1476
+ )
1477
+ return DataFrame(internal).index
1478
+
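A sketch (not part of the package) showing that `droplevel` also accepts negative positions and level names, both of which the loop above normalizes to integer levels. Assumes pyspark.pandas and a local Spark session.

import pyspark.pandas as ps

midx = ps.MultiIndex.from_tuples(
    [("a", "b", 1), ("x", "y", 2)], names=["lvl0", "lvl1", "lvl2"]
)

print(midx.droplevel(-1))        # negative position: drops the last level
print(midx.droplevel(["lvl0"]))  # by name: drops the first level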
1479
+ def symmetric_difference(
1480
+ self,
1481
+ other: "Index",
1482
+ result_name: Optional[Name] = None,
1483
+ sort: Optional[bool] = None,
1484
+ ) -> "Index":
1485
+ """
1486
+ Compute the symmetric difference of two Index objects.
1487
+
1488
+ Parameters
1489
+ ----------
1490
+ other : Index or array-like
1491
+ result_name : str
1492
+ sort : True or None, default None
1493
+ Whether to sort the resulting index.
1494
+ * True : Attempt to sort the result.
1495
+ * None : Do not sort the result.
1496
+
1497
+ Returns
1498
+ -------
1499
+ symmetric_difference : Index
1500
+
1501
+ Notes
1502
+ -----
1503
+ ``symmetric_difference`` contains elements that appear in either
1504
+ ``idx1`` or ``idx2`` but not both. Equivalent to the Index created by
1505
+ ``idx1.difference(idx2) | idx2.difference(idx1)`` with duplicates
1506
+ dropped.
1507
+
1508
+ Examples
1509
+ --------
1510
+ >>> s1 = ps.Series([1, 2, 3, 4], index=[1, 2, 3, 4])
1511
+ >>> s2 = ps.Series([1, 2, 3, 4], index=[2, 3, 4, 5])
1512
+
1513
+ >>> s1.index.symmetric_difference(s2.index) # doctest: +SKIP
1514
+ Int64Index([5, 1], dtype='int64')
1515
+
1516
+ You can set name of result Index.
1517
+
1518
+ >>> s1.index.symmetric_difference(s2.index, result_name='pandas-on-Spark') # doctest: +SKIP
1519
+ Int64Index([5, 1], dtype='int64', name='pandas-on-Spark')
1520
+
1521
+ You can set sort to `True`, if you want to sort the resulting index.
1522
+
1523
+ >>> s1.index.symmetric_difference(s2.index, sort=True) # doctest: +SKIP
1524
+ Int64Index([1, 5], dtype='int64')
1525
+
1526
+ You can also use the ``^`` operator:
1527
+
1528
+ >>> s1.index ^ s2.index # doctest: +SKIP
1529
+ Int64Index([5, 1], dtype='int64')
1530
+ """
1531
+ if type(self) != type(other):
1532
+ raise NotImplementedError(
1533
+ "Doesn't support symmetric_difference between Index & MultiIndex for now"
1534
+ )
1535
+
1536
+ sdf_self = self._psdf._internal.spark_frame.select(self._internal.index_spark_columns)
1537
+ sdf_other = other._psdf._internal.spark_frame.select(other._internal.index_spark_columns)
1538
+
1539
+ sdf_symdiff = sdf_self.union(sdf_other).subtract(sdf_self.intersect(sdf_other))
1540
+
1541
+ if sort:
1542
+ sdf_symdiff = sdf_symdiff.sort(*self._internal.index_spark_column_names)
1543
+
1544
+ internal = InternalFrame(
1545
+ spark_frame=sdf_symdiff,
1546
+ index_spark_columns=[
1547
+ scol_for(sdf_symdiff, col) for col in self._internal.index_spark_column_names
1548
+ ],
1549
+ index_names=self._internal.index_names,
1550
+ index_fields=self._internal.index_fields,
1551
+ )
1552
+ result = DataFrame(internal).index
1553
+
1554
+ if result_name:
1555
+ result.name = result_name
1556
+
1557
+ return result
1558
+
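The Notes above describe the result as the two one-sided differences combined. A sketch (not part of the package) that checks that reading on a small pair of indexes, using `union` rather than the deprecated `|` operator. Assumes pyspark.pandas.

import pyspark.pandas as ps

idx1 = ps.Index([1, 2, 3, 4])
idx2 = ps.Index([3, 4, 5, 6])

direct = idx1.symmetric_difference(idx2, sort=True)
via_differences = idx1.difference(idx2).union(idx2.difference(idx1), sort=True)

print(direct)           # 1, 2, 5, 6
print(via_differences)  # same values as the direct form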
1559
+ def sort_values(
1560
+ self, return_indexer: bool = False, ascending: bool = True
1561
+ ) -> Union["Index", Tuple["Index", "Index"]]:
1562
+ """
1563
+ Return a sorted copy of the index, and optionally return the indices that
1564
+ sorted the index itself.
1565
+
1566
+ .. note:: This method is not supported by pandas when the index has a NaN value;
1567
+ pandas raises an unexpected TypeError, but we support treating NaN
1568
+ as the smallest value.
1569
+ This method returns the indexer as a pandas-on-Spark index, while
1570
+ pandas returns it as a list. That's because indexer in pandas-on-Spark
1571
+ may not fit in memory.
1572
+
1573
+ Parameters
1574
+ ----------
1575
+ return_indexer : bool, default False
1576
+ Should the indices that would sort the index be returned.
1577
+ ascending : bool, default True
1578
+ Should the index values be sorted in an ascending order.
1579
+
1580
+ Returns
1581
+ -------
1582
+ sorted_index : ps.Index or ps.MultiIndex
1583
+ Sorted copy of the index.
1584
+ indexer : ps.Index
1585
+ The indices that the index itself was sorted by.
1586
+
1587
+ See Also
1588
+ --------
1589
+ Series.sort_values : Sort values of a Series.
1590
+ DataFrame.sort_values : Sort values in a DataFrame.
1591
+
1592
+ Examples
1593
+ --------
1594
+ >>> idx = ps.Index([10, 100, 1, 1000])
1595
+ >>> idx # doctest: +SKIP
1596
+ Int64Index([10, 100, 1, 1000], dtype='int64')
1597
+
1598
+ Sort values in ascending order (default behavior).
1599
+
1600
+ >>> idx.sort_values() # doctest: +SKIP
1601
+ Int64Index([1, 10, 100, 1000], dtype='int64')
1602
+
1603
+ Sort values in descending order.
1604
+
1605
+ >>> idx.sort_values(ascending=False) # doctest: +SKIP
1606
+ Int64Index([1000, 100, 10, 1], dtype='int64')
1607
+
1608
+ Sort values in descending order, and also get the indices idx was sorted by.
1609
+
1610
+ >>> idx.sort_values(ascending=False, return_indexer=True) # doctest: +SKIP
1611
+ (Int64Index([1000, 100, 10, 1], dtype='int64'), Int64Index([3, 1, 0, 2], dtype='int64'))
1612
+
1613
+ Support for MultiIndex.
1614
+
1615
+ >>> psidx = ps.MultiIndex.from_tuples([('a', 'x', 1), ('c', 'y', 2), ('b', 'z', 3)])
1616
+ >>> psidx # doctest: +SKIP
1617
+ MultiIndex([('a', 'x', 1),
1618
+ ('c', 'y', 2),
1619
+ ('b', 'z', 3)],
1620
+ )
1621
+
1622
+ >>> psidx.sort_values() # doctest: +SKIP
1623
+ MultiIndex([('a', 'x', 1),
1624
+ ('b', 'z', 3),
1625
+ ('c', 'y', 2)],
1626
+ )
1627
+
1628
+ >>> psidx.sort_values(ascending=False) # doctest: +SKIP
1629
+ MultiIndex([('c', 'y', 2),
1630
+ ('b', 'z', 3),
1631
+ ('a', 'x', 1)],
1632
+ )
1633
+
1634
+ >>> psidx.sort_values(ascending=False, return_indexer=True) # doctest: +SKIP
1635
+ (MultiIndex([('c', 'y', 2),
1636
+ ('b', 'z', 3),
1637
+ ('a', 'x', 1)],
1638
+ ), Int64Index([1, 2, 0], dtype='int64'))
1639
+ """
1640
+ sdf = self._internal.spark_frame
1641
+ if return_indexer:
1642
+ sequence_col = verify_temp_column_name(sdf, "__distributed_sequence_column__")
1643
+ sdf = InternalFrame.attach_distributed_sequence_column(sdf, column_name=sequence_col)
1644
+
1645
+ ordered_sdf = sdf.orderBy(*self._internal.index_spark_columns, ascending=ascending)
1646
+ sdf = ordered_sdf.select(self._internal.index_spark_columns)
1647
+
1648
+ internal = InternalFrame(
1649
+ spark_frame=sdf,
1650
+ index_spark_columns=[
1651
+ scol_for(sdf, col) for col in self._internal.index_spark_column_names
1652
+ ],
1653
+ index_names=self._internal.index_names,
1654
+ index_fields=self._internal.index_fields,
1655
+ )
1656
+ sorted_index = DataFrame(internal).index
1657
+
1658
+ if return_indexer:
1659
+ alias_sequence_scol = scol_for(ordered_sdf, sequence_col).alias(
1660
+ SPARK_DEFAULT_INDEX_NAME
1661
+ )
1662
+ indexer_sdf = ordered_sdf.select(alias_sequence_scol)
1663
+ indexer_internal = InternalFrame(
1664
+ spark_frame=indexer_sdf,
1665
+ index_spark_columns=[scol_for(indexer_sdf, SPARK_DEFAULT_INDEX_NAME)],
1666
+ )
1667
+ indexer = DataFrame(indexer_internal).index
1668
+ return sorted_index, indexer
1669
+ else:
1670
+ return sorted_index
1671
+
1672
+ @no_type_check
1673
+ def sort(self, *args, **kwargs) -> None:
1674
+ """
1675
+ Use sort_values instead.
1676
+ """
1677
+ raise TypeError("cannot sort an Index object in-place, use sort_values instead")
1678
+
1679
+ def min(self) -> Union[Scalar, Tuple[Scalar, ...]]:
1680
+ """
1681
+ Return the minimum value of the Index.
1682
+
1683
+ Returns
1684
+ -------
1685
+ scalar
1686
+ Minimum value.
1687
+
1688
+ See Also
1689
+ --------
1690
+ Index.max : Return the maximum value of the object.
1691
+ Series.min : Return the minimum value in a Series.
1692
+ DataFrame.min : Return the minimum values in a DataFrame.
1693
+
1694
+ Examples
1695
+ --------
1696
+ >>> idx = ps.Index([3, 2, 1])
1697
+ >>> idx.min()
1698
+ 1
1699
+
1700
+ >>> idx = ps.Index(['c', 'b', 'a'])
1701
+ >>> idx.min()
1702
+ 'a'
1703
+
1704
+ For a MultiIndex, the minimum is determined lexicographically.
1705
+
1706
+ >>> idx = ps.MultiIndex.from_tuples([('a', 'x', 1), ('b', 'y', 2)])
1707
+ >>> idx.min()
1708
+ ('a', 'x', 1)
1709
+ """
1710
+ sdf = self._internal.spark_frame
1711
+ min_row = (
1712
+ sdf.select(F.min(F.struct(*self._internal.index_spark_columns)).alias("min_row"))
1713
+ .select("min_row.*")
1714
+ .toPandas()
1715
+ )
1716
+ result = tuple(min_row.iloc[0])
1717
+
1718
+ return result if len(result) > 1 else result[0]
1719
+
1720
+ def max(self) -> Union[Scalar, Tuple[Scalar, ...]]:
1721
+ """
1722
+ Return the maximum value of the Index.
1723
+
1724
+ Returns
1725
+ -------
1726
+ scalar
1727
+ Maximum value.
1728
+
1729
+ See Also
1730
+ --------
1731
+ Index.min : Return the minimum value in an Index.
1732
+ Series.max : Return the maximum value in a Series.
1733
+ DataFrame.max : Return the maximum values in a DataFrame.
1734
+
1735
+ Examples
1736
+ --------
1737
+ >>> idx = ps.Index([3, 2, 1])
1738
+ >>> idx.max()
1739
+ 3
1740
+
1741
+ >>> idx = ps.Index(['c', 'b', 'a'])
1742
+ >>> idx.max()
1743
+ 'c'
1744
+
1745
+ For a MultiIndex, the maximum is determined lexicographically.
1746
+
1747
+ >>> idx = ps.MultiIndex.from_tuples([('a', 'x', 1), ('b', 'y', 2)])
1748
+ >>> idx.max()
1749
+ ('b', 'y', 2)
1750
+ """
1751
+ sdf = self._internal.spark_frame
1752
+ max_row = (
1753
+ sdf.select(F.max(F.struct(*self._internal.index_spark_columns)).alias("max_row"))
1754
+ .select("max_row.*")
1755
+ .toPandas()
1756
+ )
1757
+ result = tuple(max_row.iloc[0])
1758
+
1759
+ return result if len(result) > 1 else result[0]
1760
+
1761
+ def delete(self, loc: Union[int, List[int]]) -> "Index":
1762
+ """
1763
+ Make new Index with passed location(-s) deleted.
1764
+
1765
+ .. note:: this API can be pretty expensive since it is based on
1766
+ a global sequence internally.
1767
+
1768
+ Returns
1769
+ -------
1770
+ new_index : Index
1771
+
1772
+ Examples
1773
+ --------
1774
+ >>> psidx = ps.Index([10, 10, 9, 8, 4, 2, 4, 4, 2, 2, 10, 10])
1775
+ >>> psidx # doctest: +SKIP
1776
+ Int64Index([10, 10, 9, 8, 4, 2, 4, 4, 2, 2, 10, 10], dtype='int64')
1777
+
1778
+ >>> psidx.delete(0).sort_values() # doctest: +SKIP
1779
+ Int64Index([2, 2, 2, 4, 4, 4, 8, 9, 10, 10, 10], dtype='int64')
1780
+
1781
+ >>> psidx.delete([0, 1, 2, 3, 10, 11]).sort_values() # doctest: +SKIP
1782
+ Int64Index([2, 2, 2, 4, 4, 4], dtype='int64')
1783
+
1784
+ MultiIndex
1785
+
1786
+ >>> psidx = ps.MultiIndex.from_tuples([('a', 'x', 1), ('b', 'y', 2), ('c', 'z', 3)])
1787
+ >>> psidx # doctest: +SKIP
1788
+ MultiIndex([('a', 'x', 1),
1789
+ ('b', 'y', 2),
1790
+ ('c', 'z', 3)],
1791
+ )
1792
+
1793
+ >>> psidx.delete([0, 2]).sort_values() # doctest: +SKIP
1794
+ MultiIndex([('b', 'y', 2)],
1795
+ )
1796
+ """
1797
+ length = len(self)
1798
+
1799
+ def is_len_exceeded(index: int) -> bool:
1800
+ """Check if the given index is exceeded the length or not"""
1801
+ return index >= length if index >= 0 else abs(index) > length
1802
+
1803
+ if not is_list_like(loc):
1804
+ if is_len_exceeded(cast(int, loc)):
1805
+ raise IndexError(
1806
+ "index {} is out of bounds for axis 0 with size {}".format(loc, length)
1807
+ )
1808
+ locs = [cast(int, loc)]
1809
+ else:
1810
+ for index in cast(List[int], loc):
1811
+ if is_len_exceeded(index):
1812
+ raise IndexError(
1813
+ "index {} is out of bounds for axis 0 with size {}".format(index, length)
1814
+ )
1815
+ locs = cast(List[int], loc)
1816
+
1817
+ locs = [int(item) for item in locs]
1818
+ locs = [item if item >= 0 else length + item for item in locs]
1819
+
1820
+ # we need a temporary column such as '__index_value_0__'
1821
+ # since 'InternalFrame.attach_default_index' will fail
1822
+ # when self._scol has the name '__index_level_0__'
1823
+ index_value_column_format = "__index_value_{}__"
1824
+
1825
+ sdf = self._internal._sdf
1826
+ index_value_column_names = [
1827
+ verify_temp_column_name(sdf, index_value_column_format.format(i))
1828
+ for i in range(self._internal.index_level)
1829
+ ]
1830
+ index_value_columns = [
1831
+ index_scol.alias(index_vcol_name)
1832
+ for index_scol, index_vcol_name in zip(
1833
+ self._internal.index_spark_columns, index_value_column_names
1834
+ )
1835
+ ]
1836
+ sdf = sdf.select(index_value_columns)
1837
+
1838
+ sdf = InternalFrame.attach_default_index(sdf, default_index_type="distributed-sequence")
1839
+ # sdf here looks as below
1840
+ # +-----------------+-----------------+-----------------+-----------------+
1841
+ # |__index_level_0__|__index_value_0__|__index_value_1__|__index_value_2__|
1842
+ # +-----------------+-----------------+-----------------+-----------------+
1843
+ # | 0| a| x| 1|
1844
+ # | 1| b| y| 2|
1845
+ # | 2| c| z| 3|
1846
+ # +-----------------+-----------------+-----------------+-----------------+
1847
+
1848
+ # delete rows which are matched with given `loc`
1849
+ sdf = sdf.where(~F.col(SPARK_INDEX_NAME_FORMAT(0)).isin(locs))
1850
+ sdf = sdf.select(index_value_column_names)
1851
+ # sdf here looks as below; we should alias the columns back to the original Spark column names
1852
+ # +-----------------+-----------------+-----------------+
1853
+ # |__index_value_0__|__index_value_1__|__index_value_2__|
1854
+ # +-----------------+-----------------+-----------------+
1855
+ # | c| z| 3|
1856
+ # +-----------------+-----------------+-----------------+
1857
+ index_origin_columns = [
1858
+ F.col(index_vcol_name).alias(index_scol_name)
1859
+ for index_vcol_name, index_scol_name in zip(
1860
+ index_value_column_names, self._internal.index_spark_column_names
1861
+ )
1862
+ ]
1863
+ sdf = sdf.select(index_origin_columns)
1864
+
1865
+ internal = InternalFrame(
1866
+ spark_frame=sdf,
1867
+ index_spark_columns=[
1868
+ scol_for(sdf, col) for col in self._internal.index_spark_column_names
1869
+ ],
1870
+ index_names=self._internal.index_names,
1871
+ index_fields=self._internal.index_fields,
1872
+ )
1873
+
1874
+ return DataFrame(internal).index
1875
+
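A sketch (not part of the package) of the negative-location handling above, where locations are shifted by the length before the matching rows are filtered out. Assumes pyspark.pandas and a local Spark session.

import pyspark.pandas as ps

idx = ps.Index([10, 20, 30, 40])

print(idx.delete(-1).sort_values())       # drops the last element: 10, 20, 30
print(idx.delete([0, -1]).sort_values())  # drops first and last: 20, 30

try:
    idx.delete(10)  # out of bounds for an Index of length 4
except IndexError as exc:
    print(exc)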
1876
+ def append(self, other: "Index") -> "Index":
1877
+ """
1878
+ Append a collection of Index options together.
1879
+
1880
+ Parameters
1881
+ ----------
1882
+ other : Index
1883
+
1884
+ Returns
1885
+ -------
1886
+ appended : Index
1887
+
1888
+ Examples
1889
+ --------
1890
+ >>> psidx = ps.Index([10, 5, 0, 5, 10, 5, 0, 10])
1891
+ >>> psidx # doctest: +SKIP
1892
+ Int64Index([10, 5, 0, 5, 10, 5, 0, 10], dtype='int64')
1893
+
1894
+ >>> psidx.append(psidx) # doctest: +SKIP
1895
+ Int64Index([10, 5, 0, 5, 10, 5, 0, 10, 10, 5, 0, 5, 10, 5, 0, 10], dtype='int64')
1896
+
1897
+ Support for MultiIndex
1898
+
1899
+ >>> psidx = ps.MultiIndex.from_tuples([('a', 'x'), ('b', 'y')])
1900
+ >>> psidx # doctest: +SKIP
1901
+ MultiIndex([('a', 'x'),
1902
+ ('b', 'y')],
1903
+ )
1904
+
1905
+ >>> psidx.append(psidx) # doctest: +SKIP
1906
+ MultiIndex([('a', 'x'),
1907
+ ('b', 'y'),
1908
+ ('a', 'x'),
1909
+ ('b', 'y')],
1910
+ )
1911
+ """
1912
+ from pyspark.pandas.indexes.multi import MultiIndex
1913
+ from pyspark.pandas.indexes.category import CategoricalIndex
1914
+
1915
+ if isinstance(self, MultiIndex) != isinstance(other, MultiIndex):
1916
+ raise NotImplementedError(
1917
+ "append() between Index & MultiIndex is currently not supported"
1918
+ )
1919
+ if self._internal.index_level != other._internal.index_level:
1920
+ raise NotImplementedError(
1921
+ "append() between MultiIndexs with different levels is currently not supported"
1922
+ )
1923
+
1924
+ index_fields = self._index_fields_for_union_like(other, func_name="append")
1925
+ # Since pandas 1.5.0, the order of category matters.
1926
+ if isinstance(other, CategoricalIndex):
1927
+ other = other.reorder_categories(self.categories.to_list())
1928
+
1929
+ sdf_self = self._internal.spark_frame.select(self._internal.index_spark_columns)
1930
+ sdf_other = other._internal.spark_frame.select(other._internal.index_spark_columns)
1931
+ sdf_appended = sdf_self.union(sdf_other)
1932
+
1933
+ # Names should be kept for MultiIndex, but a plain Index does not keep its name.
1934
+ if isinstance(self, MultiIndex):
1935
+ index_names = self._internal.index_names
1936
+ else:
1937
+ index_names = None
1938
+
1939
+ internal = InternalFrame(
1940
+ spark_frame=sdf_appended,
1941
+ index_spark_columns=[
1942
+ scol_for(sdf_appended, col) for col in self._internal.index_spark_column_names
1943
+ ],
1944
+ index_names=index_names,
1945
+ index_fields=index_fields,
1946
+ )
1947
+
1948
+ return DataFrame(internal).index
1949
+
1950
+ def argmax(self) -> int:
1951
+ """
1952
+ Return a maximum argument indexer.
1953
+
1954
+ Parameters
1955
+ ----------
1956
+ skipna : bool, default True
1957
+
1958
+ Returns
1959
+ -------
1960
+ maximum argument indexer
1961
+
1962
+ Examples
1963
+ --------
1964
+ >>> psidx = ps.Index([10, 9, 8, 7, 100, 5, 4, 3, 100, 3])
1965
+ >>> psidx # doctest: +SKIP
1966
+ Int64Index([10, 9, 8, 7, 100, 5, 4, 3, 100, 3], dtype='int64')
1967
+
1968
+ >>> psidx.argmax()
1969
+ 4
1970
+ """
1971
+ sdf = self._internal.spark_frame.select(self.spark.column)
1972
+ sequence_col = verify_temp_column_name(sdf, "__distributed_sequence_column__")
1973
+ sdf = InternalFrame.attach_distributed_sequence_column(sdf, column_name=sequence_col)
1974
+ # spark_frame here looks like below
1975
+ # +-----------------+---------------+
1976
+ # |__index_level_0__|__index_value__|
1977
+ # +-----------------+---------------+
1978
+ # | 0| 10|
1979
+ # | 4| 100|
1980
+ # | 2| 8|
1981
+ # | 3| 7|
1982
+ # | 6| 4|
1983
+ # | 5| 5|
1984
+ # | 7| 3|
1985
+ # | 8| 100|
1986
+ # | 1| 9|
1987
+ # +-----------------+---------------+
1988
+
1989
+ return (
1990
+ sdf.orderBy(
1991
+ scol_for(sdf, self._internal.data_spark_column_names[0]).desc(),
1992
+ F.col(sequence_col).asc(),
1993
+ )
1994
+ .select(sequence_col)
1995
+ .first()[0]
1996
+ )
1997
+
1998
+ def argmin(self) -> int:
1999
+ """
2000
+ Return a minimum argument indexer.
2001
+
2002
+ Parameters
2003
+ ----------
2004
+ skipna : bool, default True
2005
+
2006
+ Returns
2007
+ -------
2008
+ minimum argument indexer
2009
+
2010
+ Examples
2011
+ --------
2012
+ >>> psidx = ps.Index([10, 9, 8, 7, 100, 5, 4, 3, 100, 3])
2013
+ >>> psidx # doctest: +SKIP
2014
+ Int64Index([10, 9, 8, 7, 100, 5, 4, 3, 100, 3], dtype='int64')
2015
+
2016
+ >>> psidx.argmin()
2017
+ 7
2018
+ """
2019
+ sdf = self._internal.spark_frame.select(self.spark.column)
2020
+ sequence_col = verify_temp_column_name(sdf, "__distributed_sequence_column__")
2021
+ sdf = InternalFrame.attach_distributed_sequence_column(sdf, column_name=sequence_col)
2022
+
2023
+ return (
2024
+ sdf.orderBy(
2025
+ scol_for(sdf, self._internal.data_spark_column_names[0]).asc(),
2026
+ F.col(sequence_col).asc(),
2027
+ )
2028
+ .select(sequence_col)
2029
+ .first()[0]
2030
+ )
2031
+
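Both `argmax` and `argmin` order by value and then by the attached sequence column, so ties resolve to the first occurrence. A sketch (not part of the package), assuming pyspark.pandas:

import pyspark.pandas as ps

idx = ps.Index([3, 1, 3, 1])

print(idx.argmax())  # 0: the first of the two 3s wins the tie
print(idx.argmin())  # 1: the first of the two 1s wins the tie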
2032
+ def set_names(
2033
+ self,
2034
+ names: Union[Name, List[Name]],
2035
+ level: Optional[Union[int, Name, List[Union[int, Name]]]] = None,
2036
+ inplace: bool = False,
2037
+ ) -> Optional["Index"]:
2038
+ """
2039
+ Set Index or MultiIndex name.
2040
+ Able to set new names partially and by level.
2041
+
2042
+ Parameters
2043
+ ----------
2044
+ names : label or list of label
2045
+ Name(s) to set.
2046
+ level : int, label or list of int or label, optional
2047
+ If the index is a MultiIndex, level(s) to set (None for all
2048
+ levels). Otherwise level must be None.
2049
+ inplace : bool, default False
2050
+ Modifies the object directly, instead of creating a new Index or
2051
+ MultiIndex.
2052
+
2053
+ Returns
2054
+ -------
2055
+ Index
2056
+ The same type as the caller or None if inplace is True.
2057
+
2058
+ See Also
2059
+ --------
2060
+ Index.rename : Able to set new names without level.
2061
+
2062
+ Examples
2063
+ --------
2064
+ >>> idx = ps.Index([1, 2, 3, 4])
2065
+ >>> idx # doctest: +SKIP
2066
+ Int64Index([1, 2, 3, 4], dtype='int64')
2067
+
2068
+ >>> idx.set_names('quarter') # doctest: +SKIP
2069
+ Int64Index([1, 2, 3, 4], dtype='int64', name='quarter')
2070
+
2071
+ For MultiIndex
2072
+
2073
+ >>> idx = ps.MultiIndex.from_tuples([('a', 'x'), ('b', 'y')])
2074
+ >>> idx # doctest: +SKIP
2075
+ MultiIndex([('a', 'x'),
2076
+ ('b', 'y')],
2077
+ )
2078
+
2079
+ >>> idx.set_names(['kind', 'year'], inplace=True)
2080
+ >>> idx # doctest: +SKIP
2081
+ MultiIndex([('a', 'x'),
2082
+ ('b', 'y')],
2083
+ names=['kind', 'year'])
2084
+
2085
+ >>> idx.set_names('species', level=0) # doctest: +SKIP
2086
+ MultiIndex([('a', 'x'),
2087
+ ('b', 'y')],
2088
+ names=['species', 'year'])
2089
+ """
2090
+ from pyspark.pandas.indexes.multi import MultiIndex
2091
+
2092
+ if isinstance(self, MultiIndex) and level is not None:
2093
+ self_names = self.names
2094
+ self_names[level] = names # type: ignore[index]
2095
+ names = self_names
2096
+ return self.rename(name=names, inplace=inplace)
2097
+
2098
+ def difference(self, other: "Index", sort: Optional[bool] = None) -> "Index":
2099
+ """
2100
+ Return a new Index with elements from the index that are not in
2101
+ `other`.
2102
+
2103
+ This is the set difference of two Index objects.
2104
+
2105
+ Parameters
2106
+ ----------
2107
+ other : Index or array-like
2108
+ sort : True or None, default None
2109
+ Whether to sort the resulting index.
2110
+ * True : Attempt to sort the result.
2111
+ * None : Do not sort the result.
2112
+
2113
+ Returns
2114
+ -------
2115
+ difference : Index
2116
+
2117
+ Examples
2118
+ --------
2119
+
2120
+ >>> idx1 = ps.Index([2, 1, 3, 4])
2121
+ >>> idx2 = ps.Index([3, 4, 5, 6])
2122
+ >>> idx1.difference(idx2, sort=True) # doctest: +SKIP
2123
+ Int64Index([1, 2], dtype='int64')
2124
+
2125
+ MultiIndex
2126
+
2127
+ >>> midx1 = ps.MultiIndex.from_tuples([('a', 'x', 1), ('b', 'y', 2), ('c', 'z', 3)])
2128
+ >>> midx2 = ps.MultiIndex.from_tuples([('a', 'x', 1), ('b', 'z', 2), ('k', 'z', 3)])
2129
+ >>> midx1.difference(midx2) # doctest: +SKIP
2130
+ MultiIndex([('b', 'y', 2),
2131
+ ('c', 'z', 3)],
2132
+ )
2133
+ """
2134
+ from pyspark.pandas.indexes.multi import MultiIndex
2135
+
2136
+ # Check if the `self` and `other` have different index types.
2137
+ # 1. `self` is Index, `other` is MultiIndex
2138
+ # 2. `self` is MultiIndex, `other` is Index
2139
+ is_index_types_different = isinstance(other, Index) and not isinstance(self, type(other))
2140
+ if is_index_types_different:
2141
+ if isinstance(self, MultiIndex):
2142
+ # In case `self` is MultiIndex and `other` is Index,
2143
+ # return MultiIndex without its names.
2144
+ return self.rename([None] * len(self))
2145
+ elif isinstance(self, Index):
2146
+ # In case `self` is Index and `other` is MultiIndex,
2147
+ # return Index without its name.
2148
+ return self.rename(None)
2149
+
2150
+ if not isinstance(other, (Index, Series, tuple, list, set, dict)):
2151
+ raise TypeError("Input must be Index or array-like")
2152
+ if not isinstance(sort, (type(None), type(True))):
2153
+ raise ValueError(
2154
+ "The 'sort' keyword only takes the values of None or True; {} was passed.".format(
2155
+ sort
2156
+ )
2157
+ )
2158
+ # Handling MultiIndex when `other` is not MultiIndex.
2159
+ if isinstance(self, MultiIndex) and not isinstance(other, MultiIndex):
2160
+ is_other_list_of_tuples = isinstance(other, (list, set, dict)) and all(
2161
+ [isinstance(item, tuple) for item in other]
2162
+ )
2163
+ if is_other_list_of_tuples:
2164
+ other = MultiIndex.from_tuples(other) # type: ignore[arg-type]
2165
+ else:
2166
+ raise TypeError("other must be a MultiIndex or a list of tuples")
2167
+
2168
+ if not isinstance(other, Index):
2169
+ other = Index(other)
2170
+
2171
+ sdf_self = self._internal.spark_frame
2172
+ sdf_other = other._internal.spark_frame
2173
+ idx_self = self._internal.index_spark_columns
2174
+ idx_other = other._internal.index_spark_columns
2175
+ sdf_diff = sdf_self.select(idx_self).subtract(sdf_other.select(idx_other))
2176
+ internal = InternalFrame(
2177
+ spark_frame=sdf_diff,
2178
+ index_spark_columns=[
2179
+ scol_for(sdf_diff, col) for col in self._internal.index_spark_column_names
2180
+ ],
2181
+ index_names=self._internal.index_names,
2182
+ index_fields=self._internal.index_fields,
2183
+ )
2184
+ result = DataFrame(internal).index
2185
+ # Name(s) will be kept when only name(s) of (Multi)Index are the same.
2186
+ if isinstance(self, type(other)) and isinstance(self, MultiIndex):
2187
+ if self.names == other.names:
2188
+ result.names = self.names
2189
+ elif isinstance(self, type(other)) and not isinstance(self, MultiIndex):
2190
+ if self.name == other.name:
2191
+ result.name = self.name
2192
+ return result if sort is None else cast(Index, result.sort_values())
2193
+
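When the two sides are of different index types, the early-return branch above skips the subtraction entirely and hands back the calling index with its name(s) removed. A sketch (not part of the package), assuming pyspark.pandas:

import pyspark.pandas as ps

idx = ps.Index([1, 2, 3], name="id")
midx = ps.MultiIndex.from_tuples([(1, "x"), (2, "y")])

result = idx.difference(midx)
print(result)       # the original values, untouched
print(result.name)  # None: the name is dropped in the mixed-type case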
2194
+ @property
2195
+ def is_all_dates(self) -> bool:
2196
+ """
2197
+ Return if all data types of the index are datetime.
2198
+ Remember that since pandas-on-Spark does not support multiple data types in an index,
2199
+ it returns True if any type of data is datetime.
2200
+
2201
+ .. deprecated:: 3.4.0
2202
+
2203
+ Examples
2204
+ --------
2205
+ >>> from datetime import datetime
2206
+
2207
+ >>> idx = ps.Index([datetime(2019, 1, 1, 0, 0, 0), datetime(2019, 2, 3, 0, 0, 0)])
2208
+ >>> idx
2209
+ DatetimeIndex(['2019-01-01', '2019-02-03'], dtype='datetime64[ns]', freq=None)
2210
+
2211
+ >>> idx.is_all_dates
2212
+ True
2213
+
2214
+ >>> idx = ps.Index([datetime(2019, 1, 1, 0, 0, 0), None])
2215
+ >>> idx
2216
+ DatetimeIndex(['2019-01-01', 'NaT'], dtype='datetime64[ns]', freq=None)
2217
+
2218
+ >>> idx.is_all_dates
2219
+ True
2220
+
2221
+ >>> idx = ps.Index([0, 1, 2])
2222
+ >>> idx # doctest: +SKIP
2223
+ Int64Index([0, 1, 2], dtype='int64')
2224
+
2225
+ >>> idx.is_all_dates
2226
+ False
2227
+ """
2228
+ warnings.warn(
2229
+ "Index.is_all_dates is deprecated, will be removed in a future version. "
2230
+ "check index.inferred_type instead",
2231
+ FutureWarning,
2232
+ )
2233
+ return isinstance(self.spark.data_type, (TimestampType, TimestampNTZType))
2234
+
2235
+ def repeat(self, repeats: int) -> "Index":
2236
+ """
2237
+ Repeat elements of a Index/MultiIndex.
2238
+
2239
+ Returns a new Index/MultiIndex where each element of the current Index/MultiIndex
2240
+ is repeated consecutively a given number of times.
2241
+
2242
+ Parameters
2243
+ ----------
2244
+ repeats : int
2245
+ The number of repetitions for each element. This should be a
2246
+ non-negative integer. Repeating 0 times will return an empty
2247
+ Index.
2248
+
2249
+ Returns
2250
+ -------
2251
+ repeated_index : Index/MultiIndex
2252
+ Newly created Index/MultiIndex with repeated elements.
2253
+
2254
+ See Also
2255
+ --------
2256
+ Series.repeat : Equivalent function for Series.
2257
+
2258
+ Examples
2259
+ --------
2260
+ >>> idx = ps.Index(['a', 'b', 'c'])
2261
+ >>> idx
2262
+ Index(['a', 'b', 'c'], dtype='object')
2263
+ >>> idx.repeat(2)
2264
+ Index(['a', 'b', 'c', 'a', 'b', 'c'], dtype='object')
2265
+
2266
+ For MultiIndex,
2267
+
2268
+ >>> midx = ps.MultiIndex.from_tuples([('x', 'a'), ('x', 'b'), ('y', 'c')])
2269
+ >>> midx # doctest: +SKIP
2270
+ MultiIndex([('x', 'a'),
2271
+ ('x', 'b'),
2272
+ ('y', 'c')],
2273
+ )
2274
+ >>> midx.repeat(2) # doctest: +SKIP
2275
+ MultiIndex([('x', 'a'),
2276
+ ('x', 'b'),
2277
+ ('y', 'c'),
2278
+ ('x', 'a'),
2279
+ ('x', 'b'),
2280
+ ('y', 'c')],
2281
+ )
2282
+ >>> midx.repeat(0) # doctest: +SKIP
2283
+ MultiIndex([], )
2284
+ """
2285
+ if not isinstance(repeats, int):
2286
+ raise TypeError(
2287
+ "`repeats` argument must be integer, but got {}".format(type(repeats).__name__)
2288
+ )
2289
+ elif repeats < 0:
2290
+ raise ValueError("negative dimensions are not allowed")
2291
+
2292
+ psdf: DataFrame = DataFrame(self._internal.resolved_copy)
2293
+ if repeats == 0:
2294
+ return DataFrame(psdf._internal.with_filter(F.lit(False))).index
2295
+ else:
2296
+ return ps.concat([psdf] * repeats).index
2297
+
2298
+ def asof(self, label: Any) -> Scalar:
2299
+ """
2300
+ Return the label from the index, or, if not present, the previous one.
2301
+
2302
+ Assuming that the index is sorted, return the passed index label if it
2303
+ is in the index, or return the previous index label if the passed one
2304
+ is not in the index.
2305
+
2306
+ .. note:: This API is dependent on :meth:`Index.is_monotonic_increasing`
2307
+ which can be expensive.
2308
+
2309
+ Parameters
2310
+ ----------
2311
+ label : object
2312
+ The label up to which the method returns the latest index label.
2313
+
2314
+ Returns
2315
+ -------
2316
+ object
2317
+ The passed label if it is in the index. The previous label if the
2318
+ passed label is not in the sorted index or `NaN` if there is no
2319
+ such label.
2320
+
2321
+ Examples
2322
+ --------
2323
+ `Index.asof` returns the latest index label up to the passed label.
2324
+
2325
+ >>> idx = ps.Index(['2013-12-31', '2014-01-02', '2014-01-03'])
2326
+ >>> idx.asof('2014-01-01')
2327
+ '2013-12-31'
2328
+
2329
+ If the label is in the index, the method returns the passed label.
2330
+
2331
+ >>> idx.asof('2014-01-02')
2332
+ '2014-01-02'
2333
+
2334
+ If all of the labels in the index are later than the passed label,
2335
+ NaN is returned.
2336
+
2337
+ >>> idx.asof('1999-01-02')
2338
+ nan
2339
+ """
2340
+ sdf = self._internal.spark_frame
2341
+ if self.is_monotonic_increasing:
2342
+ sdf = sdf.where(self.spark.column <= F.lit(label).cast(self.spark.data_type)).select(
2343
+ F.max(self.spark.column)
2344
+ )
2345
+ elif self.is_monotonic_decreasing:
2346
+ sdf = sdf.where(self.spark.column >= F.lit(label).cast(self.spark.data_type)).select(
2347
+ F.min(self.spark.column)
2348
+ )
2349
+ else:
2350
+ raise ValueError("index must be monotonic increasing or decreasing")
2351
+
2352
+ result = sdf.toPandas().iloc[0, 0]
2353
+ return result if result is not None else np.nan
2354
+
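A sketch (not part of the package) of the monotonicity requirement enforced in the final branch above: a shuffled index cannot answer `asof` and raises. Assumes pyspark.pandas and a local Spark session.

import pyspark.pandas as ps

idx = ps.Index(["2013-12-31", "2014-01-02", "2014-01-03"])
print(idx.asof("2014-01-01"))  # '2013-12-31': the latest label <= the argument

shuffled = ps.Index([3, 1, 2])
try:
    shuffled.asof(2)
except ValueError as exc:
    print(exc)  # index must be monotonic increasing or decreasing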
2355
+ def _index_fields_for_union_like(
2356
+ self: "Index", other: "Index", func_name: str
2357
+ ) -> Optional[List[InternalField]]:
2358
+ if self._internal.index_fields == other._internal.index_fields:
2359
+ return self._internal.index_fields
2360
+ elif all(
2361
+ left.dtype == right.dtype
2362
+ and (isinstance(left.dtype, CategoricalDtype) or left.spark_type == right.spark_type)
2363
+ for left, right in zip(self._internal.index_fields, other._internal.index_fields)
2364
+ ):
2365
+ return [
2366
+ left.copy(nullable=left.nullable or right.nullable)
2367
+ if left.spark_type == right.spark_type
2368
+ else InternalField(dtype=left.dtype)
2369
+ for left, right in zip(self._internal.index_fields, other._internal.index_fields)
2370
+ ]
2371
+ elif any(
2372
+ isinstance(field.dtype, CategoricalDtype)
2373
+ for field in self._internal.index_fields + other._internal.index_fields
2374
+ ):
2375
+ # TODO: non-categorical or categorical with different categories
2376
+ raise NotImplementedError(
2377
+ "{}() between CategoricalIndex and non-categorical or "
2378
+ "categorical with different categories is currently not supported".format(func_name)
2379
+ )
2380
+ else:
2381
+ return None
2382
+
2383
+ def union(
2384
+ self, other: Union[DataFrame, Series, "Index", List], sort: Optional[bool] = None
2385
+ ) -> "Index":
2386
+ """
2387
+ Form the union of two Index objects.
2388
+
2389
+ Parameters
2390
+ ----------
2391
+ other : Index or array-like
2392
+ sort : bool or None, default None
2393
+ Whether to sort the resulting Index.
2394
+
2395
+ Returns
2396
+ -------
2397
+ union : Index
2398
+
2399
+ Examples
2400
+ --------
2401
+
2402
+ Index
2403
+
2404
+ >>> idx1 = ps.Index([1, 2, 3, 4])
2405
+ >>> idx2 = ps.Index([3, 4, 5, 6])
2406
+ >>> idx1.union(idx2).sort_values() # doctest: +SKIP
2407
+ Int64Index([1, 2, 3, 4, 5, 6], dtype='int64')
2408
+
2409
+ MultiIndex
2410
+
2411
+ >>> midx1 = ps.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("x", "c"), ("x", "d")])
2412
+ >>> midx2 = ps.MultiIndex.from_tuples([("x", "c"), ("x", "d"), ("x", "e"), ("x", "f")])
2413
+ >>> midx1.union(midx2).sort_values() # doctest: +SKIP
2414
+ MultiIndex([('x', 'a'),
2415
+ ('x', 'b'),
2416
+ ('x', 'c'),
2417
+ ('x', 'd'),
2418
+ ('x', 'e'),
2419
+ ('x', 'f')],
2420
+ )
2421
+ """
2422
+ from pyspark.pandas.indexes.multi import MultiIndex
2423
+
2424
+ sort = True if sort is None else sort
2425
+ sort = validate_bool_kwarg(sort, "sort")
2426
+ other_idx: Index
2427
+ if isinstance(self, MultiIndex):
2428
+ if isinstance(other, MultiIndex):
2429
+ other_idx = other
2430
+ elif isinstance(other, list) and all(isinstance(item, tuple) for item in other):
2431
+ other_idx = MultiIndex.from_tuples(other)
2432
+ else:
2433
+ raise TypeError("other must be a MultiIndex or a list of tuples")
2434
+ else:
2435
+ if isinstance(other, MultiIndex):
2436
+ # TODO: We can't support different type of values in a single column for now.
2437
+ raise NotImplementedError("Union between Index and MultiIndex is not yet supported")
2438
+ elif isinstance(other, DataFrame):
2439
+ raise ValueError("Index data must be 1-dimensional")
2440
+ else:
2441
+ other_idx = Index(other)
2442
+
2443
+ index_fields = self._index_fields_for_union_like(other_idx, func_name="union")
2444
+
2445
+ sdf_self = self._internal.spark_frame.select(self._internal.index_spark_columns)
2446
+ sdf_other = other_idx._internal.spark_frame.select(other_idx._internal.index_spark_columns)
2447
+ sdf = sdf_self.unionAll(sdf_other).exceptAll(sdf_self.intersectAll(sdf_other))
2448
+ if sort:
2449
+ sdf = sdf.sort(*self._internal.index_spark_column_names)
2450
+
2451
+ internal = InternalFrame(
2452
+ spark_frame=sdf,
2453
+ index_spark_columns=[
2454
+ scol_for(sdf, col) for col in self._internal.index_spark_column_names
2455
+ ],
2456
+ index_names=self._internal.index_names,
2457
+ index_fields=index_fields,
2458
+ )
2459
+
2460
+ return DataFrame(internal).index
2461
+
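Because a `sort` of None is coerced to True at the top of the method, the result comes back sorted unless `sort=False` is passed explicitly. A sketch (not part of the package), assuming pyspark.pandas:

import pyspark.pandas as ps

idx1 = ps.Index([4, 3, 2, 1])
idx2 = ps.Index([6, 5, 4, 3])

print(idx1.union(idx2))              # sorted by default: 1, 2, 3, 4, 5, 6
print(idx1.union(idx2, sort=False))  # same values, but no ordering guarantee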
2462
+ def holds_integer(self) -> bool:
2463
+ """
2464
+ Whether the type is an integer type.
2465
+ Always return False for MultiIndex.
2466
+
2467
+ Notes
2468
+ -----
2469
+ When the Index contains null values, the result can differ from pandas,
2470
+ since pandas-on-Spark casts integers to floats when the Index contains null values.
2471
+
2472
+ >>> ps.Index([1, 2, 3, None]) # doctest: +SKIP
2473
+ Float64Index([1.0, 2.0, 3.0, nan], dtype='float64')
2474
+
2475
+ Examples
2476
+ --------
2477
+ >>> psidx = ps.Index([1, 2, 3, 4])
2478
+ >>> psidx.holds_integer()
2479
+ True
2480
+
2481
+ Returns False for string type.
2482
+
2483
+ >>> psidx = ps.Index(["A", "B", "C", "D"])
2484
+ >>> psidx.holds_integer()
2485
+ False
2486
+
2487
+ Returns False for float type.
2488
+
2489
+ >>> psidx = ps.Index([1.1, 2.2, 3.3, 4.4])
2490
+ >>> psidx.holds_integer()
2491
+ False
2492
+ """
2493
+ return isinstance(self.spark.data_type, IntegralType)
2494
+
2495
+ def intersection(self, other: Union[DataFrame, Series, "Index", List]) -> "Index":
2496
+ """
2497
+ Form the intersection of two Index objects.
2498
+
2499
+ This returns a new Index with elements common to the index and `other`.
2500
+
2501
+ Parameters
2502
+ ----------
2503
+ other : Index or array-like
2504
+
2505
+ Returns
2506
+ -------
2507
+ intersection : Index
2508
+
2509
+ Examples
2510
+ --------
2511
+ >>> idx1 = ps.Index([1, 2, 3, 4])
2512
+ >>> idx2 = ps.Index([3, 4, 5, 6])
2513
+ >>> idx1.intersection(idx2).sort_values() # doctest: +SKIP
2514
+ Int64Index([3, 4], dtype='int64')
2515
+ """
2516
+ from pyspark.pandas.indexes.multi import MultiIndex
2517
+
2518
+ other_idx: Index
2519
+ if isinstance(other, DataFrame):
2520
+ raise ValueError("Index data must be 1-dimensional")
2521
+ elif isinstance(other, MultiIndex):
2522
+ # Always returns an unnamed empty Index if `other` is a MultiIndex.
2523
+ return self._psdf.head(0).index.rename(None)
2524
+ elif isinstance(other, Index):
2525
+ other_idx = other
2526
+ spark_frame_other = other_idx.to_frame()._to_spark()
2527
+ keep_name = self.name == other_idx.name
2528
+ elif isinstance(other, Series):
2529
+ other_idx = Index(other)
2530
+ spark_frame_other = other_idx.to_frame()._to_spark()
2531
+ keep_name = True
2532
+ elif is_list_like(other):
2533
+ other_idx = Index(other)
2534
+ if isinstance(other_idx, MultiIndex):
2535
+ raise ValueError("Names should be list-like for a MultiIndex")
2536
+ spark_frame_other = other_idx.to_frame()._to_spark()
2537
+ keep_name = True
2538
+ else:
2539
+ raise TypeError("Input must be Index or array-like")
2540
+
2541
+ index_fields = self._index_fields_for_union_like(other_idx, func_name="intersection")
2542
+
2543
+ spark_frame_self = self.to_frame(name=SPARK_DEFAULT_INDEX_NAME)._to_spark()
2544
+ spark_frame_intersected = spark_frame_self.intersect(spark_frame_other)
2545
+ if keep_name:
2546
+ index_names = self._internal.index_names
2547
+ else:
2548
+ index_names = None
2549
+
2550
+ internal = InternalFrame(
2551
+ spark_frame=spark_frame_intersected,
2552
+ index_spark_columns=[scol_for(spark_frame_intersected, SPARK_DEFAULT_INDEX_NAME)],
2553
+ index_names=index_names,
2554
+ index_fields=index_fields,
2555
+ )
2556
+
2557
+ return DataFrame(internal).index
2558
+
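A sketch (not part of the package) of the `keep_name` logic above: with another Index the name survives only when both names match, while list-like input always keeps the caller's name. Assumes pyspark.pandas and a local Spark session.

import pyspark.pandas as ps

a = ps.Index([1, 2, 3, 4], name="id")
b = ps.Index([3, 4, 5], name="id")
c = ps.Index([3, 4, 5], name="other")

print(a.intersection(b).name)          # 'id': both names match
print(a.intersection(c).name)          # None: names differ, so it is dropped
print(a.intersection([3, 4, 5]).name)  # 'id': list-like keeps the caller's name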
2559
+ def item(self) -> Union[Scalar, Tuple[Scalar, ...]]:
2560
+ """
2561
+ Return the first element of the underlying data as a python scalar.
2562
+
2563
+ Returns
2564
+ -------
2565
+ scalar
2566
+ The first element of Index.
2567
+
2568
+ Raises
2569
+ ------
2570
+ ValueError
2571
+ If the data is not length-1.
2572
+
2573
+ Examples
2574
+ --------
2575
+ >>> psidx = ps.Index([10])
2576
+ >>> psidx.item()
2577
+ 10
2578
+ """
2579
+ return self.to_series().item()
2580
+
2581
+ def insert(self, loc: int, item: Any) -> "Index":
2582
+ """
2583
+ Make new Index inserting new item at location.
2584
+
2585
+ Follows Python list.append semantics for negative values.
2586
+
2587
+ .. versionchanged:: 3.4.0
2588
+ Raise IndexError when loc is out of bounds to follow Pandas 1.4+ behavior
2589
+
2590
+ Parameters
2591
+ ----------
2592
+ loc : int
2593
+ item : object
2594
+
2595
+ Returns
2596
+ -------
2597
+ new_index : Index
2598
+
2599
+ Examples
2600
+ --------
2601
+ >>> psidx = ps.Index([1, 2, 3, 4, 5])
2602
+ >>> psidx.insert(3, 100) # doctest: +SKIP
2603
+ Int64Index([1, 2, 3, 100, 4, 5], dtype='int64')
2604
+
2605
+ For negative values
2606
+
2607
+ >>> psidx = ps.Index([1, 2, 3, 4, 5])
2608
+ >>> psidx.insert(-3, 100) # doctest: +SKIP
2609
+ Int64Index([1, 2, 100, 3, 4, 5], dtype='int64')
2610
+ """
2611
+ validate_index_loc(self, loc)
2612
+ loc = loc + len(self) if loc < 0 else loc
2613
+
2614
+ index_name = self._internal.index_spark_column_names[0]
2615
+ sdf_before = self.to_frame(name=index_name)[:loc]._to_spark()
2616
+ sdf_middle = Index([item], dtype=self.dtype).to_frame(name=index_name)._to_spark()
2617
+ sdf_after = self.to_frame(name=index_name)[loc:]._to_spark()
2618
+ sdf = sdf_before.union(sdf_middle).union(sdf_after)
2619
+
2620
+ internal = InternalFrame(
2621
+ spark_frame=sdf,
2622
+ index_spark_columns=[
2623
+ scol_for(sdf, col) for col in self._internal.index_spark_column_names
2624
+ ],
2625
+ index_names=self._internal.index_names,
2626
+ index_fields=[InternalField(field.dtype) for field in self._internal.index_fields],
2627
+ )
2628
+ return DataFrame(internal).index
2629
+
2630
+ def view(self) -> "Index":
2631
+ """
2632
+ This is defined as a copy with the same identity.
2633
+ """
2634
+ return self.copy()
2635
+
2636
+ def to_list(self) -> List:
2637
+ """
2638
+ Return a list of the values.
2639
+
2640
+ These are each a scalar type, which is a Python scalar
2641
+ (for str, int, float) or a pandas scalar
2642
+ (for Timestamp/Timedelta/Interval/Period)
2643
+
2644
+ .. note:: This method should only be used if the resulting list is expected
2645
+ to be small, as all the data is loaded into the driver's memory.
2646
+
2647
+ Examples
2648
+ --------
2649
+ Index
2650
+
2651
+ >>> idx = ps.Index([1, 2, 3, 4, 5])
2652
+ >>> idx.to_list()
2653
+ [1, 2, 3, 4, 5]
2654
+
2655
+ MultiIndex
2656
+
2657
+ >>> tuples = [(1, 'red'), (1, 'blue'), (2, 'red'), (2, 'green')]
2658
+ >>> midx = ps.MultiIndex.from_tuples(tuples)
2659
+ >>> midx.to_list()
2660
+ [(1, 'red'), (1, 'blue'), (2, 'red'), (2, 'green')]
2661
+ """
2662
+ log_advice(
2663
+ "`to_list` loads all data into the driver's memory. "
2664
+ "It should only be used if the resulting list is expected to be small."
2665
+ )
2666
+ return self._to_internal_pandas().tolist()
2667
+
2668
+ tolist = to_list
2669
+
2670
+ @property
2671
+ def inferred_type(self) -> str:
2672
+ """
2673
+ Return a string of the type inferred from the values.
2674
+
2675
+ Examples
2676
+ --------
2677
+ >>> from datetime import datetime
2678
+ >>> ps.Index([1, 2, 3]).inferred_type
2679
+ 'integer'
2680
+
2681
+ >>> ps.Index([1.0, 2.0, 3.0]).inferred_type
2682
+ 'floating'
2683
+
2684
+ >>> ps.Index(['a', 'b', 'c']).inferred_type
2685
+ 'string'
2686
+
2687
+ >>> ps.Index([True, False, True, False]).inferred_type
2688
+ 'boolean'
2689
+ """
2690
+ return lib.infer_dtype([self.to_series().head(1).item()])
2691
+
2692
+ def __getattr__(self, item: str) -> Any:
2693
+ if hasattr(MissingPandasLikeIndex, item):
2694
+ property_or_func = getattr(MissingPandasLikeIndex, item)
2695
+ if isinstance(property_or_func, property):
2696
+ return property_or_func.fget(self)
2697
+ else:
2698
+ return partial(property_or_func, self)
2699
+ raise AttributeError("'{}' object has no attribute '{}'".format(type(self).__name__, item))
2700
+
2701
+ def __repr__(self) -> str:
2702
+ max_display_count = get_option("display.max_rows")
2703
+ if max_display_count is None:
2704
+ return repr(self._to_internal_pandas())
2705
+
2706
+ pindex = self._psdf._get_or_create_repr_pandas_cache(max_display_count).index
2707
+
2708
+ pindex_length = len(pindex)
2709
+ repr_string = repr(pindex[:max_display_count])
2710
+
2711
+ if pindex_length > max_display_count:
2712
+ footer = "\nShowing only the first {}".format(max_display_count)
2713
+ return repr_string + footer
2714
+ return repr_string
2715
+
2716
+ def __iter__(self) -> Iterator:
2717
+ return MissingPandasLikeIndex.__iter__(self)
2718
+
2719
+ def __and__(self, other: "Index") -> "Index":
2720
+ warnings.warn(
2721
+ "Index.__and__ operating as a set operation is deprecated, "
2722
+ "in the future this will be a logical operation matching Series.__and__. "
2723
+ "Use index.intersection(other) instead",
2724
+ FutureWarning,
2725
+ )
2726
+ return self.intersection(other)
2727
+
2728
+ def __or__(self, other: "Index") -> "Index":
2729
+ warnings.warn(
2730
+ "Index.__or__ operating as a set operation is deprecated, "
2731
+ "in the future this will be a logical operation matching Series.__or__. "
2732
+ "Use index.union(other) instead",
2733
+ FutureWarning,
2734
+ )
2735
+ return self.union(other)
2736
+
2737
+ def __xor__(self, other: "Index") -> "Index":
2738
+ warnings.warn(
2739
+ "Index.__xor__ operating as a set operation is deprecated, "
2740
+ "in the future this will be a logical operation matching Series.__xor__. "
2741
+ "Use index.symmetric_difference(other) instead",
2742
+ FutureWarning,
2743
+ )
2744
+ return self.symmetric_difference(other)
2745
+
2746
+ def __rxor__(self, other: Any) -> "Index":
2747
+ return NotImplemented
2748
+
2749
+ def __bool__(self) -> bool:
2750
+ raise ValueError(
2751
+ "The truth value of a {0} is ambiguous. "
2752
+ "Use a.empty, a.bool(), a.item(), a.any() or a.all().".format(self.__class__.__name__)
2753
+ )
2754
+
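As in pandas, truth-testing an Index is ambiguous, so `__bool__` above always raises. A sketch (not part of the package) of the failure and an explicit alternative, assuming pyspark.pandas:

import pyspark.pandas as ps

idx = ps.Index([1, 2, 3])

try:
    if idx:  # triggers Index.__bool__
        pass
except ValueError as exc:
    print(exc)

print(len(idx) > 0)  # an explicit emptiness check instead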
2755
+
2756
+ def _test() -> None:
2757
+ import os
2758
+ import doctest
2759
+ import sys
2760
+ from pyspark.sql import SparkSession
2761
+ import pyspark.pandas.indexes.base
2762
+
2763
+ os.chdir(os.environ["SPARK_HOME"])
2764
+
2765
+ globs = pyspark.pandas.indexes.base.__dict__.copy()
2766
+ globs["ps"] = pyspark.pandas
2767
+ spark = (
2768
+ SparkSession.builder.master("local[4]")
2769
+ .appName("pyspark.pandas.indexes.base tests")
2770
+ .getOrCreate()
2771
+ )
2772
+ (failure_count, test_count) = doctest.testmod(
2773
+ pyspark.pandas.indexes.base,
2774
+ globs=globs,
2775
+ optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
2776
+ )
2777
+ spark.stop()
2778
+ if failure_count:
2779
+ sys.exit(-1)
2780
+
2781
+
2782
+ if __name__ == "__main__":
2783
+ _test()