snowpark-connect 0.20.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of snowpark-connect might be problematic. Click here for more details.

Files changed (879)
  1. snowflake/snowpark_connect/__init__.py +23 -0
  2. snowflake/snowpark_connect/analyze_plan/__init__.py +3 -0
  3. snowflake/snowpark_connect/analyze_plan/map_tree_string.py +38 -0
  4. snowflake/snowpark_connect/column_name_handler.py +735 -0
  5. snowflake/snowpark_connect/config.py +576 -0
  6. snowflake/snowpark_connect/constants.py +47 -0
  7. snowflake/snowpark_connect/control_server.py +52 -0
  8. snowflake/snowpark_connect/dataframe_name_handler.py +54 -0
  9. snowflake/snowpark_connect/date_time_format_mapping.py +399 -0
  10. snowflake/snowpark_connect/empty_dataframe.py +18 -0
  11. snowflake/snowpark_connect/error/__init__.py +11 -0
  12. snowflake/snowpark_connect/error/error_mapping.py +6174 -0
  13. snowflake/snowpark_connect/error/error_utils.py +321 -0
  14. snowflake/snowpark_connect/error/exceptions.py +24 -0
  15. snowflake/snowpark_connect/execute_plan/__init__.py +3 -0
  16. snowflake/snowpark_connect/execute_plan/map_execution_command.py +204 -0
  17. snowflake/snowpark_connect/execute_plan/map_execution_root.py +173 -0
  18. snowflake/snowpark_connect/execute_plan/utils.py +183 -0
  19. snowflake/snowpark_connect/expression/__init__.py +3 -0
  20. snowflake/snowpark_connect/expression/literal.py +90 -0
  21. snowflake/snowpark_connect/expression/map_cast.py +343 -0
  22. snowflake/snowpark_connect/expression/map_expression.py +293 -0
  23. snowflake/snowpark_connect/expression/map_extension.py +104 -0
  24. snowflake/snowpark_connect/expression/map_sql_expression.py +633 -0
  25. snowflake/snowpark_connect/expression/map_udf.py +142 -0
  26. snowflake/snowpark_connect/expression/map_unresolved_attribute.py +241 -0
  27. snowflake/snowpark_connect/expression/map_unresolved_extract_value.py +85 -0
  28. snowflake/snowpark_connect/expression/map_unresolved_function.py +9450 -0
  29. snowflake/snowpark_connect/expression/map_unresolved_star.py +218 -0
  30. snowflake/snowpark_connect/expression/map_update_fields.py +164 -0
  31. snowflake/snowpark_connect/expression/map_window_function.py +258 -0
  32. snowflake/snowpark_connect/expression/typer.py +125 -0
  33. snowflake/snowpark_connect/includes/__init__.py +0 -0
  34. snowflake/snowpark_connect/includes/jars/antlr4-runtime-4.9.3.jar +0 -0
  35. snowflake/snowpark_connect/includes/jars/commons-cli-1.5.0.jar +0 -0
  36. snowflake/snowpark_connect/includes/jars/commons-codec-1.16.1.jar +0 -0
  37. snowflake/snowpark_connect/includes/jars/commons-collections-3.2.2.jar +0 -0
  38. snowflake/snowpark_connect/includes/jars/commons-collections4-4.4.jar +0 -0
  39. snowflake/snowpark_connect/includes/jars/commons-compiler-3.1.9.jar +0 -0
  40. snowflake/snowpark_connect/includes/jars/commons-compress-1.26.0.jar +0 -0
  41. snowflake/snowpark_connect/includes/jars/commons-crypto-1.1.0.jar +0 -0
  42. snowflake/snowpark_connect/includes/jars/commons-dbcp-1.4.jar +0 -0
  43. snowflake/snowpark_connect/includes/jars/commons-io-2.16.1.jar +0 -0
  44. snowflake/snowpark_connect/includes/jars/commons-lang-2.6.jar +0 -0
  45. snowflake/snowpark_connect/includes/jars/commons-lang3-3.12.0.jar +0 -0
  46. snowflake/snowpark_connect/includes/jars/commons-logging-1.1.3.jar +0 -0
  47. snowflake/snowpark_connect/includes/jars/commons-math3-3.6.1.jar +0 -0
  48. snowflake/snowpark_connect/includes/jars/commons-pool-1.5.4.jar +0 -0
  49. snowflake/snowpark_connect/includes/jars/commons-text-1.10.0.jar +0 -0
  50. snowflake/snowpark_connect/includes/jars/hadoop-client-api-3.3.4.jar +0 -0
  51. snowflake/snowpark_connect/includes/jars/jackson-annotations-2.15.2.jar +0 -0
  52. snowflake/snowpark_connect/includes/jars/jackson-core-2.15.2.jar +0 -0
  53. snowflake/snowpark_connect/includes/jars/jackson-core-asl-1.9.13.jar +0 -0
  54. snowflake/snowpark_connect/includes/jars/jackson-databind-2.15.2.jar +0 -0
  55. snowflake/snowpark_connect/includes/jars/jackson-dataformat-yaml-2.15.2.jar +0 -0
  56. snowflake/snowpark_connect/includes/jars/jackson-datatype-jsr310-2.15.2.jar +0 -0
  57. snowflake/snowpark_connect/includes/jars/jackson-mapper-asl-1.9.13.jar +0 -0
  58. snowflake/snowpark_connect/includes/jars/jackson-module-scala_2.12-2.15.2.jar +0 -0
  59. snowflake/snowpark_connect/includes/jars/json4s-ast_2.12-3.7.0-M11.jar +0 -0
  60. snowflake/snowpark_connect/includes/jars/json4s-core_2.12-3.7.0-M11.jar +0 -0
  61. snowflake/snowpark_connect/includes/jars/json4s-jackson_2.12-3.7.0-M11.jar +0 -0
  62. snowflake/snowpark_connect/includes/jars/json4s-scalap_2.12-3.7.0-M11.jar +0 -0
  63. snowflake/snowpark_connect/includes/jars/kryo-shaded-4.0.2.jar +0 -0
  64. snowflake/snowpark_connect/includes/jars/log4j-1.2-api-2.20.0.jar +0 -0
  65. snowflake/snowpark_connect/includes/jars/log4j-api-2.20.0.jar +0 -0
  66. snowflake/snowpark_connect/includes/jars/log4j-core-2.20.0.jar +0 -0
  67. snowflake/snowpark_connect/includes/jars/log4j-slf4j2-impl-2.20.0.jar +0 -0
  68. snowflake/snowpark_connect/includes/jars/paranamer-2.8.jar +0 -0
  69. snowflake/snowpark_connect/includes/jars/scala-collection-compat_2.12-2.7.0.jar +0 -0
  70. snowflake/snowpark_connect/includes/jars/scala-compiler-2.12.18.jar +0 -0
  71. snowflake/snowpark_connect/includes/jars/scala-library-2.12.18.jar +0 -0
  72. snowflake/snowpark_connect/includes/jars/scala-parser-combinators_2.12-2.3.0.jar +0 -0
  73. snowflake/snowpark_connect/includes/jars/scala-reflect-2.12.18.jar +0 -0
  74. snowflake/snowpark_connect/includes/jars/scala-xml_2.12-2.1.0.jar +0 -0
  75. snowflake/snowpark_connect/includes/jars/slf4j-api-2.0.7.jar +0 -0
  76. snowflake/snowpark_connect/includes/jars/spark-catalyst_2.12-3.5.6.jar +0 -0
  77. snowflake/snowpark_connect/includes/jars/spark-common-utils_2.12-3.5.6.jar +0 -0
  78. snowflake/snowpark_connect/includes/jars/spark-core_2.12-3.5.6.jar +0 -0
  79. snowflake/snowpark_connect/includes/jars/spark-graphx_2.12-3.5.6.jar +0 -0
  80. snowflake/snowpark_connect/includes/jars/spark-hive-thriftserver_2.12-3.5.6.jar +0 -0
  81. snowflake/snowpark_connect/includes/jars/spark-hive_2.12-3.5.6.jar +0 -0
  82. snowflake/snowpark_connect/includes/jars/spark-kubernetes_2.12-3.5.6.jar +0 -0
  83. snowflake/snowpark_connect/includes/jars/spark-kvstore_2.12-3.5.6.jar +0 -0
  84. snowflake/snowpark_connect/includes/jars/spark-launcher_2.12-3.5.6.jar +0 -0
  85. snowflake/snowpark_connect/includes/jars/spark-mesos_2.12-3.5.6.jar +0 -0
  86. snowflake/snowpark_connect/includes/jars/spark-mllib-local_2.12-3.5.6.jar +0 -0
  87. snowflake/snowpark_connect/includes/jars/spark-mllib_2.12-3.5.6.jar +0 -0
  88. snowflake/snowpark_connect/includes/jars/spark-network-common_2.12-3.5.6.jar +0 -0
  89. snowflake/snowpark_connect/includes/jars/spark-network-shuffle_2.12-3.5.6.jar +0 -0
  90. snowflake/snowpark_connect/includes/jars/spark-repl_2.12-3.5.6.jar +0 -0
  91. snowflake/snowpark_connect/includes/jars/spark-sketch_2.12-3.5.6.jar +0 -0
  92. snowflake/snowpark_connect/includes/jars/spark-sql-api_2.12-3.5.6.jar +0 -0
  93. snowflake/snowpark_connect/includes/jars/spark-sql_2.12-3.5.6.jar +0 -0
  94. snowflake/snowpark_connect/includes/jars/spark-streaming_2.12-3.5.6.jar +0 -0
  95. snowflake/snowpark_connect/includes/jars/spark-tags_2.12-3.5.6.jar +0 -0
  96. snowflake/snowpark_connect/includes/jars/spark-unsafe_2.12-3.5.6.jar +0 -0
  97. snowflake/snowpark_connect/includes/jars/spark-yarn_2.12-3.5.6.jar +0 -0
  98. snowflake/snowpark_connect/includes/python/__init__.py +21 -0
  99. snowflake/snowpark_connect/includes/python/pyspark/__init__.py +173 -0
  100. snowflake/snowpark_connect/includes/python/pyspark/_globals.py +71 -0
  101. snowflake/snowpark_connect/includes/python/pyspark/_typing.pyi +43 -0
  102. snowflake/snowpark_connect/includes/python/pyspark/accumulators.py +341 -0
  103. snowflake/snowpark_connect/includes/python/pyspark/broadcast.py +383 -0
  104. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/__init__.py +8 -0
  105. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/cloudpickle.py +948 -0
  106. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/cloudpickle_fast.py +844 -0
  107. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/compat.py +18 -0
  108. snowflake/snowpark_connect/includes/python/pyspark/conf.py +276 -0
  109. snowflake/snowpark_connect/includes/python/pyspark/context.py +2601 -0
  110. snowflake/snowpark_connect/includes/python/pyspark/daemon.py +218 -0
  111. snowflake/snowpark_connect/includes/python/pyspark/errors/__init__.py +70 -0
  112. snowflake/snowpark_connect/includes/python/pyspark/errors/error_classes.py +889 -0
  113. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/__init__.py +16 -0
  114. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/base.py +228 -0
  115. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/captured.py +307 -0
  116. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/connect.py +190 -0
  117. snowflake/snowpark_connect/includes/python/pyspark/errors/tests/__init__.py +16 -0
  118. snowflake/snowpark_connect/includes/python/pyspark/errors/tests/test_errors.py +60 -0
  119. snowflake/snowpark_connect/includes/python/pyspark/errors/utils.py +116 -0
  120. snowflake/snowpark_connect/includes/python/pyspark/files.py +165 -0
  121. snowflake/snowpark_connect/includes/python/pyspark/find_spark_home.py +95 -0
  122. snowflake/snowpark_connect/includes/python/pyspark/install.py +203 -0
  123. snowflake/snowpark_connect/includes/python/pyspark/instrumentation_utils.py +190 -0
  124. snowflake/snowpark_connect/includes/python/pyspark/java_gateway.py +248 -0
  125. snowflake/snowpark_connect/includes/python/pyspark/join.py +118 -0
  126. snowflake/snowpark_connect/includes/python/pyspark/ml/__init__.py +71 -0
  127. snowflake/snowpark_connect/includes/python/pyspark/ml/_typing.pyi +84 -0
  128. snowflake/snowpark_connect/includes/python/pyspark/ml/base.py +414 -0
  129. snowflake/snowpark_connect/includes/python/pyspark/ml/classification.py +4332 -0
  130. snowflake/snowpark_connect/includes/python/pyspark/ml/clustering.py +2188 -0
  131. snowflake/snowpark_connect/includes/python/pyspark/ml/common.py +146 -0
  132. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/__init__.py +44 -0
  133. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/base.py +346 -0
  134. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/classification.py +382 -0
  135. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/evaluation.py +291 -0
  136. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/feature.py +258 -0
  137. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/functions.py +77 -0
  138. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/io_utils.py +335 -0
  139. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/pipeline.py +262 -0
  140. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/summarizer.py +120 -0
  141. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/tuning.py +579 -0
  142. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/util.py +173 -0
  143. snowflake/snowpark_connect/includes/python/pyspark/ml/deepspeed/__init__.py +16 -0
  144. snowflake/snowpark_connect/includes/python/pyspark/ml/deepspeed/deepspeed_distributor.py +165 -0
  145. snowflake/snowpark_connect/includes/python/pyspark/ml/deepspeed/tests/test_deepspeed_distributor.py +306 -0
  146. snowflake/snowpark_connect/includes/python/pyspark/ml/dl_util.py +150 -0
  147. snowflake/snowpark_connect/includes/python/pyspark/ml/evaluation.py +1166 -0
  148. snowflake/snowpark_connect/includes/python/pyspark/ml/feature.py +7474 -0
  149. snowflake/snowpark_connect/includes/python/pyspark/ml/fpm.py +543 -0
  150. snowflake/snowpark_connect/includes/python/pyspark/ml/functions.py +842 -0
  151. snowflake/snowpark_connect/includes/python/pyspark/ml/image.py +271 -0
  152. snowflake/snowpark_connect/includes/python/pyspark/ml/linalg/__init__.py +1382 -0
  153. snowflake/snowpark_connect/includes/python/pyspark/ml/model_cache.py +55 -0
  154. snowflake/snowpark_connect/includes/python/pyspark/ml/param/__init__.py +602 -0
  155. snowflake/snowpark_connect/includes/python/pyspark/ml/param/_shared_params_code_gen.py +368 -0
  156. snowflake/snowpark_connect/includes/python/pyspark/ml/param/shared.py +878 -0
  157. snowflake/snowpark_connect/includes/python/pyspark/ml/pipeline.py +451 -0
  158. snowflake/snowpark_connect/includes/python/pyspark/ml/recommendation.py +748 -0
  159. snowflake/snowpark_connect/includes/python/pyspark/ml/regression.py +3335 -0
  160. snowflake/snowpark_connect/includes/python/pyspark/ml/stat.py +523 -0
  161. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/__init__.py +16 -0
  162. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_classification.py +53 -0
  163. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_evaluation.py +50 -0
  164. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_feature.py +43 -0
  165. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_function.py +114 -0
  166. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_pipeline.py +47 -0
  167. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_summarizer.py +43 -0
  168. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_tuning.py +46 -0
  169. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_classification.py +238 -0
  170. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_evaluation.py +194 -0
  171. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_feature.py +156 -0
  172. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_pipeline.py +184 -0
  173. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_summarizer.py +78 -0
  174. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_tuning.py +292 -0
  175. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_parity_torch_data_loader.py +50 -0
  176. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_parity_torch_distributor.py +152 -0
  177. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_algorithms.py +456 -0
  178. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_base.py +96 -0
  179. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_dl_util.py +186 -0
  180. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_evaluation.py +77 -0
  181. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_feature.py +401 -0
  182. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_functions.py +528 -0
  183. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_image.py +82 -0
  184. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_linalg.py +409 -0
  185. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_model_cache.py +55 -0
  186. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_param.py +441 -0
  187. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_persistence.py +546 -0
  188. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_pipeline.py +71 -0
  189. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_stat.py +52 -0
  190. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_training_summary.py +494 -0
  191. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_util.py +85 -0
  192. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_wrapper.py +138 -0
  193. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/__init__.py +16 -0
  194. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_basic.py +151 -0
  195. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_nested.py +97 -0
  196. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_pipeline.py +143 -0
  197. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tuning.py +551 -0
  198. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_basic.py +137 -0
  199. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_nested.py +96 -0
  200. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_pipeline.py +142 -0
  201. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/__init__.py +16 -0
  202. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/data.py +100 -0
  203. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/distributor.py +1133 -0
  204. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/log_communication.py +198 -0
  205. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/__init__.py +16 -0
  206. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_data_loader.py +137 -0
  207. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_distributor.py +561 -0
  208. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_log_communication.py +172 -0
  209. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/torch_run_process_wrapper.py +83 -0
  210. snowflake/snowpark_connect/includes/python/pyspark/ml/tree.py +434 -0
  211. snowflake/snowpark_connect/includes/python/pyspark/ml/tuning.py +1741 -0
  212. snowflake/snowpark_connect/includes/python/pyspark/ml/util.py +749 -0
  213. snowflake/snowpark_connect/includes/python/pyspark/ml/wrapper.py +465 -0
  214. snowflake/snowpark_connect/includes/python/pyspark/mllib/__init__.py +44 -0
  215. snowflake/snowpark_connect/includes/python/pyspark/mllib/_typing.pyi +33 -0
  216. snowflake/snowpark_connect/includes/python/pyspark/mllib/classification.py +989 -0
  217. snowflake/snowpark_connect/includes/python/pyspark/mllib/clustering.py +1318 -0
  218. snowflake/snowpark_connect/includes/python/pyspark/mllib/common.py +174 -0
  219. snowflake/snowpark_connect/includes/python/pyspark/mllib/evaluation.py +691 -0
  220. snowflake/snowpark_connect/includes/python/pyspark/mllib/feature.py +1085 -0
  221. snowflake/snowpark_connect/includes/python/pyspark/mllib/fpm.py +233 -0
  222. snowflake/snowpark_connect/includes/python/pyspark/mllib/linalg/__init__.py +1653 -0
  223. snowflake/snowpark_connect/includes/python/pyspark/mllib/linalg/distributed.py +1662 -0
  224. snowflake/snowpark_connect/includes/python/pyspark/mllib/random.py +698 -0
  225. snowflake/snowpark_connect/includes/python/pyspark/mllib/recommendation.py +389 -0
  226. snowflake/snowpark_connect/includes/python/pyspark/mllib/regression.py +1067 -0
  227. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/KernelDensity.py +59 -0
  228. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/__init__.py +34 -0
  229. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/_statistics.py +409 -0
  230. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/distribution.py +39 -0
  231. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/test.py +86 -0
  232. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/__init__.py +16 -0
  233. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_algorithms.py +353 -0
  234. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_feature.py +192 -0
  235. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_linalg.py +680 -0
  236. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_stat.py +206 -0
  237. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_streaming_algorithms.py +471 -0
  238. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_util.py +108 -0
  239. snowflake/snowpark_connect/includes/python/pyspark/mllib/tree.py +888 -0
  240. snowflake/snowpark_connect/includes/python/pyspark/mllib/util.py +659 -0
  241. snowflake/snowpark_connect/includes/python/pyspark/pandas/__init__.py +165 -0
  242. snowflake/snowpark_connect/includes/python/pyspark/pandas/_typing.py +52 -0
  243. snowflake/snowpark_connect/includes/python/pyspark/pandas/accessors.py +989 -0
  244. snowflake/snowpark_connect/includes/python/pyspark/pandas/base.py +1804 -0
  245. snowflake/snowpark_connect/includes/python/pyspark/pandas/categorical.py +822 -0
  246. snowflake/snowpark_connect/includes/python/pyspark/pandas/config.py +539 -0
  247. snowflake/snowpark_connect/includes/python/pyspark/pandas/correlation.py +262 -0
  248. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/__init__.py +16 -0
  249. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/base.py +519 -0
  250. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/binary_ops.py +98 -0
  251. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/boolean_ops.py +426 -0
  252. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/categorical_ops.py +141 -0
  253. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/complex_ops.py +145 -0
  254. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/date_ops.py +127 -0
  255. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/datetime_ops.py +171 -0
  256. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/null_ops.py +83 -0
  257. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/num_ops.py +588 -0
  258. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/string_ops.py +154 -0
  259. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/timedelta_ops.py +101 -0
  260. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/udt_ops.py +29 -0
  261. snowflake/snowpark_connect/includes/python/pyspark/pandas/datetimes.py +891 -0
  262. snowflake/snowpark_connect/includes/python/pyspark/pandas/exceptions.py +150 -0
  263. snowflake/snowpark_connect/includes/python/pyspark/pandas/extensions.py +388 -0
  264. snowflake/snowpark_connect/includes/python/pyspark/pandas/frame.py +13738 -0
  265. snowflake/snowpark_connect/includes/python/pyspark/pandas/generic.py +3560 -0
  266. snowflake/snowpark_connect/includes/python/pyspark/pandas/groupby.py +4448 -0
  267. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/__init__.py +21 -0
  268. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/base.py +2783 -0
  269. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/category.py +773 -0
  270. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/datetimes.py +843 -0
  271. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/multi.py +1323 -0
  272. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/numeric.py +210 -0
  273. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/timedelta.py +197 -0
  274. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexing.py +1862 -0
  275. snowflake/snowpark_connect/includes/python/pyspark/pandas/internal.py +1680 -0
  276. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/__init__.py +48 -0
  277. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/common.py +76 -0
  278. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/frame.py +63 -0
  279. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/general_functions.py +43 -0
  280. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/groupby.py +93 -0
  281. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/indexes.py +184 -0
  282. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/resample.py +101 -0
  283. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/scalars.py +29 -0
  284. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/series.py +69 -0
  285. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/window.py +168 -0
  286. snowflake/snowpark_connect/includes/python/pyspark/pandas/mlflow.py +238 -0
  287. snowflake/snowpark_connect/includes/python/pyspark/pandas/namespace.py +3807 -0
  288. snowflake/snowpark_connect/includes/python/pyspark/pandas/numpy_compat.py +260 -0
  289. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/__init__.py +17 -0
  290. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/core.py +1213 -0
  291. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/matplotlib.py +928 -0
  292. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/plotly.py +261 -0
  293. snowflake/snowpark_connect/includes/python/pyspark/pandas/resample.py +816 -0
  294. snowflake/snowpark_connect/includes/python/pyspark/pandas/series.py +7440 -0
  295. snowflake/snowpark_connect/includes/python/pyspark/pandas/sql_formatter.py +308 -0
  296. snowflake/snowpark_connect/includes/python/pyspark/pandas/sql_processor.py +394 -0
  297. snowflake/snowpark_connect/includes/python/pyspark/pandas/strings.py +2371 -0
  298. snowflake/snowpark_connect/includes/python/pyspark/pandas/supported_api_gen.py +378 -0
  299. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/__init__.py +16 -0
  300. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/__init__.py +16 -0
  301. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_any_all.py +177 -0
  302. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_apply_func.py +575 -0
  303. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_binary_ops.py +235 -0
  304. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_combine.py +653 -0
  305. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_compute.py +463 -0
  306. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_corrwith.py +86 -0
  307. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_cov.py +151 -0
  308. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_cumulative.py +139 -0
  309. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_describe.py +458 -0
  310. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_eval.py +86 -0
  311. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_melt.py +202 -0
  312. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_missing_data.py +520 -0
  313. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_pivot.py +361 -0
  314. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/__init__.py +16 -0
  315. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/__init__.py +16 -0
  316. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_any_all.py +40 -0
  317. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_apply_func.py +42 -0
  318. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_binary_ops.py +40 -0
  319. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_combine.py +37 -0
  320. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_compute.py +60 -0
  321. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_corrwith.py +40 -0
  322. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_cov.py +40 -0
  323. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_cumulative.py +90 -0
  324. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_describe.py +40 -0
  325. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_eval.py +40 -0
  326. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_melt.py +40 -0
  327. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_missing_data.py +42 -0
  328. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_pivot.py +37 -0
  329. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/__init__.py +16 -0
  330. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_base.py +36 -0
  331. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_binary_ops.py +42 -0
  332. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_boolean_ops.py +47 -0
  333. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_categorical_ops.py +55 -0
  334. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_complex_ops.py +40 -0
  335. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_date_ops.py +47 -0
  336. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_datetime_ops.py +47 -0
  337. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_null_ops.py +42 -0
  338. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_arithmetic.py +43 -0
  339. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_ops.py +47 -0
  340. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_reverse.py +43 -0
  341. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_string_ops.py +47 -0
  342. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_timedelta_ops.py +47 -0
  343. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_udt_ops.py +40 -0
  344. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/testing_utils.py +226 -0
  345. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/__init__.py +16 -0
  346. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_align.py +39 -0
  347. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_basic_slow.py +55 -0
  348. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_cov_corrwith.py +39 -0
  349. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_dot_frame.py +39 -0
  350. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_dot_series.py +39 -0
  351. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_index.py +39 -0
  352. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_series.py +39 -0
  353. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_setitem_frame.py +43 -0
  354. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_setitem_series.py +43 -0
  355. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/__init__.py +16 -0
  356. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_attrs.py +40 -0
  357. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_constructor.py +39 -0
  358. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_conversion.py +42 -0
  359. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_reindexing.py +42 -0
  360. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_reshaping.py +37 -0
  361. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_spark.py +40 -0
  362. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_take.py +42 -0
  363. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_time_series.py +48 -0
  364. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_truncate.py +40 -0
  365. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/__init__.py +16 -0
  366. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_aggregate.py +40 -0
  367. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_apply_func.py +41 -0
  368. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_cumulative.py +67 -0
  369. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_describe.py +40 -0
  370. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_groupby.py +55 -0
  371. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_head_tail.py +40 -0
  372. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_index.py +38 -0
  373. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_missing_data.py +55 -0
  374. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py +39 -0
  375. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_stat.py +38 -0
  376. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/__init__.py +16 -0
  377. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_align.py +40 -0
  378. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_base.py +50 -0
  379. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_category.py +73 -0
  380. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_datetime.py +39 -0
  381. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_indexing.py +40 -0
  382. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_reindex.py +40 -0
  383. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_rename.py +40 -0
  384. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_reset_index.py +48 -0
  385. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_timedelta.py +39 -0
  386. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/io/__init__.py +16 -0
  387. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/io/test_parity_io.py +40 -0
  388. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/__init__.py +16 -0
  389. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot.py +45 -0
  390. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot_matplotlib.py +45 -0
  391. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot_plotly.py +49 -0
  392. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot.py +37 -0
  393. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot_matplotlib.py +53 -0
  394. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot_plotly.py +45 -0
  395. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/__init__.py +16 -0
  396. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_all_any.py +38 -0
  397. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_arg_ops.py +37 -0
  398. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_as_of.py +37 -0
  399. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_as_type.py +38 -0
  400. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_compute.py +37 -0
  401. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_conversion.py +40 -0
  402. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_cumulative.py +40 -0
  403. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_index.py +38 -0
  404. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_missing_data.py +40 -0
  405. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_series.py +37 -0
  406. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_sort.py +38 -0
  407. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_stat.py +38 -0
  408. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_categorical.py +66 -0
  409. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_config.py +37 -0
  410. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_csv.py +37 -0
  411. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_dataframe_conversion.py +42 -0
  412. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_dataframe_spark_io.py +39 -0
  413. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_default_index.py +49 -0
  414. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ewm.py +37 -0
  415. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_expanding.py +39 -0
  416. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_extension.py +49 -0
  417. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_frame_spark.py +53 -0
  418. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_generic_functions.py +43 -0
  419. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_indexing.py +49 -0
  420. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_indexops_spark.py +39 -0
  421. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_internal.py +41 -0
  422. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_namespace.py +39 -0
  423. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_numpy_compat.py +60 -0
  424. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames.py +48 -0
  425. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby.py +39 -0
  426. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_expanding.py +44 -0
  427. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_rolling.py +84 -0
  428. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_repr.py +37 -0
  429. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_resample.py +45 -0
  430. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_reshape.py +39 -0
  431. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_rolling.py +39 -0
  432. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_scalars.py +37 -0
  433. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_conversion.py +39 -0
  434. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_datetime.py +39 -0
  435. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_string.py +39 -0
  436. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_spark_functions.py +39 -0
  437. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_sql.py +43 -0
  438. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_stats.py +37 -0
  439. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_typedef.py +36 -0
  440. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_utils.py +37 -0
  441. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_window.py +39 -0
  442. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/__init__.py +16 -0
  443. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_base.py +107 -0
  444. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_binary_ops.py +224 -0
  445. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py +825 -0
  446. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_categorical_ops.py +562 -0
  447. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_complex_ops.py +368 -0
  448. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_date_ops.py +257 -0
  449. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_datetime_ops.py +260 -0
  450. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_null_ops.py +178 -0
  451. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_arithmetic.py +184 -0
  452. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py +497 -0
  453. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_reverse.py +140 -0
  454. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_string_ops.py +354 -0
  455. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_timedelta_ops.py +219 -0
  456. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_udt_ops.py +192 -0
  457. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/testing_utils.py +228 -0
  458. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/__init__.py +16 -0
  459. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_align.py +118 -0
  460. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_basic_slow.py +198 -0
  461. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_cov_corrwith.py +181 -0
  462. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_dot_frame.py +103 -0
  463. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_dot_series.py +141 -0
  464. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_index.py +109 -0
  465. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_series.py +136 -0
  466. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_setitem_frame.py +125 -0
  467. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_setitem_series.py +217 -0
  468. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/__init__.py +16 -0
  469. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_attrs.py +384 -0
  470. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_constructor.py +598 -0
  471. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_conversion.py +73 -0
  472. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_reindexing.py +869 -0
  473. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_reshaping.py +487 -0
  474. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_spark.py +309 -0
  475. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_take.py +156 -0
  476. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_time_series.py +149 -0
  477. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_truncate.py +163 -0
  478. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/__init__.py +16 -0
  479. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_aggregate.py +311 -0
  480. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_apply_func.py +524 -0
  481. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_cumulative.py +419 -0
  482. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_describe.py +144 -0
  483. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_groupby.py +979 -0
  484. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_head_tail.py +234 -0
  485. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_index.py +206 -0
  486. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_missing_data.py +421 -0
  487. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_split_apply.py +187 -0
  488. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_stat.py +397 -0
  489. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/__init__.py +16 -0
  490. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_align.py +100 -0
  491. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_base.py +2743 -0
  492. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_category.py +484 -0
  493. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_datetime.py +276 -0
  494. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_indexing.py +432 -0
  495. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_reindex.py +310 -0
  496. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_rename.py +257 -0
  497. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_reset_index.py +160 -0
  498. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_timedelta.py +128 -0
  499. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/io/__init__.py +16 -0
  500. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/io/test_io.py +137 -0
  501. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/__init__.py +16 -0
  502. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot.py +170 -0
  503. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot_matplotlib.py +547 -0
  504. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot_plotly.py +285 -0
  505. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot.py +106 -0
  506. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot_matplotlib.py +409 -0
  507. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot_plotly.py +247 -0
  508. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/__init__.py +16 -0
  509. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_all_any.py +105 -0
  510. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_arg_ops.py +197 -0
  511. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_as_of.py +137 -0
  512. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_as_type.py +227 -0
  513. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_compute.py +634 -0
  514. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_conversion.py +88 -0
  515. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_cumulative.py +139 -0
  516. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_index.py +475 -0
  517. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_missing_data.py +265 -0
  518. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_series.py +818 -0
  519. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_sort.py +162 -0
  520. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_stat.py +780 -0
  521. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_categorical.py +741 -0
  522. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_config.py +160 -0
  523. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_csv.py +453 -0
  524. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_dataframe_conversion.py +281 -0
  525. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_dataframe_spark_io.py +487 -0
  526. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_default_index.py +109 -0
  527. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ewm.py +434 -0
  528. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_expanding.py +253 -0
  529. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_extension.py +152 -0
  530. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_frame_spark.py +162 -0
  531. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_generic_functions.py +234 -0
  532. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_indexing.py +1339 -0
  533. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_indexops_spark.py +82 -0
  534. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_internal.py +124 -0
  535. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_namespace.py +638 -0
  536. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_numpy_compat.py +200 -0
  537. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames.py +1355 -0
  538. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py +655 -0
  539. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_expanding.py +113 -0
  540. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_rolling.py +118 -0
  541. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_repr.py +192 -0
  542. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_resample.py +346 -0
  543. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_reshape.py +495 -0
  544. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_rolling.py +263 -0
  545. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_scalars.py +59 -0
  546. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_conversion.py +85 -0
  547. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_datetime.py +364 -0
  548. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_string.py +362 -0
  549. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_spark_functions.py +46 -0
  550. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_sql.py +123 -0
  551. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_stats.py +581 -0
  552. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_typedef.py +447 -0
  553. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_utils.py +301 -0
  554. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_window.py +465 -0
  555. snowflake/snowpark_connect/includes/python/pyspark/pandas/typedef/__init__.py +18 -0
  556. snowflake/snowpark_connect/includes/python/pyspark/pandas/typedef/typehints.py +874 -0
  557. snowflake/snowpark_connect/includes/python/pyspark/pandas/usage_logging/__init__.py +143 -0
  558. snowflake/snowpark_connect/includes/python/pyspark/pandas/usage_logging/usage_logger.py +132 -0
  559. snowflake/snowpark_connect/includes/python/pyspark/pandas/utils.py +1063 -0
  560. snowflake/snowpark_connect/includes/python/pyspark/pandas/window.py +2702 -0
  561. snowflake/snowpark_connect/includes/python/pyspark/profiler.py +489 -0
  562. snowflake/snowpark_connect/includes/python/pyspark/py.typed +1 -0
  563. snowflake/snowpark_connect/includes/python/pyspark/python/pyspark/shell.py +123 -0
  564. snowflake/snowpark_connect/includes/python/pyspark/rdd.py +5518 -0
  565. snowflake/snowpark_connect/includes/python/pyspark/rddsampler.py +115 -0
  566. snowflake/snowpark_connect/includes/python/pyspark/resource/__init__.py +38 -0
  567. snowflake/snowpark_connect/includes/python/pyspark/resource/information.py +69 -0
  568. snowflake/snowpark_connect/includes/python/pyspark/resource/profile.py +317 -0
  569. snowflake/snowpark_connect/includes/python/pyspark/resource/requests.py +539 -0
  570. snowflake/snowpark_connect/includes/python/pyspark/resource/tests/__init__.py +16 -0
  571. snowflake/snowpark_connect/includes/python/pyspark/resource/tests/test_resources.py +83 -0
  572. snowflake/snowpark_connect/includes/python/pyspark/resultiterable.py +45 -0
  573. snowflake/snowpark_connect/includes/python/pyspark/serializers.py +681 -0
  574. snowflake/snowpark_connect/includes/python/pyspark/shell.py +123 -0
  575. snowflake/snowpark_connect/includes/python/pyspark/shuffle.py +854 -0
  576. snowflake/snowpark_connect/includes/python/pyspark/sql/__init__.py +75 -0
  577. snowflake/snowpark_connect/includes/python/pyspark/sql/_typing.pyi +80 -0
  578. snowflake/snowpark_connect/includes/python/pyspark/sql/avro/__init__.py +18 -0
  579. snowflake/snowpark_connect/includes/python/pyspark/sql/avro/functions.py +188 -0
  580. snowflake/snowpark_connect/includes/python/pyspark/sql/catalog.py +1270 -0
  581. snowflake/snowpark_connect/includes/python/pyspark/sql/column.py +1431 -0
  582. snowflake/snowpark_connect/includes/python/pyspark/sql/conf.py +99 -0
  583. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/__init__.py +18 -0
  584. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/_typing.py +90 -0
  585. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/avro/__init__.py +18 -0
  586. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/avro/functions.py +107 -0
  587. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/catalog.py +356 -0
  588. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/__init__.py +22 -0
  589. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/artifact.py +412 -0
  590. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/core.py +1689 -0
  591. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/reattach.py +340 -0
  592. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/column.py +514 -0
  593. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/conf.py +128 -0
  594. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/conversion.py +490 -0
  595. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/dataframe.py +2172 -0
  596. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/expressions.py +1056 -0
  597. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/functions.py +3937 -0
  598. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/group.py +418 -0
  599. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/plan.py +2289 -0
  600. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/__init__.py +25 -0
  601. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/base_pb2.py +203 -0
  602. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/base_pb2.pyi +2718 -0
  603. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/base_pb2_grpc.py +423 -0
  604. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/catalog_pb2.py +109 -0
  605. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/catalog_pb2.pyi +1130 -0
  606. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/commands_pb2.py +141 -0
  607. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/commands_pb2.pyi +1766 -0
  608. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/common_pb2.py +47 -0
  609. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/common_pb2.pyi +123 -0
  610. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/example_plugins_pb2.py +53 -0
  611. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/example_plugins_pb2.pyi +112 -0
  612. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/expressions_pb2.py +107 -0
  613. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/expressions_pb2.pyi +1507 -0
  614. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/relations_pb2.py +195 -0
  615. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/relations_pb2.pyi +3613 -0
  616. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/types_pb2.py +95 -0
  617. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/types_pb2.pyi +980 -0
  618. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/protobuf/__init__.py +18 -0
  619. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/protobuf/functions.py +166 -0
  620. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/readwriter.py +861 -0
  621. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/session.py +952 -0
  622. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/__init__.py +22 -0
  623. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/query.py +295 -0
  624. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/readwriter.py +618 -0
  625. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/__init__.py +18 -0
  626. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/foreach_batch_worker.py +87 -0
  627. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/listener_worker.py +100 -0
  628. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/types.py +301 -0
  629. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/udf.py +296 -0
  630. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/udtf.py +200 -0
  631. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/utils.py +58 -0
  632. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/window.py +266 -0
  633. snowflake/snowpark_connect/includes/python/pyspark/sql/context.py +818 -0
  634. snowflake/snowpark_connect/includes/python/pyspark/sql/dataframe.py +5973 -0
  635. snowflake/snowpark_connect/includes/python/pyspark/sql/functions.py +15889 -0
  636. snowflake/snowpark_connect/includes/python/pyspark/sql/group.py +547 -0
  637. snowflake/snowpark_connect/includes/python/pyspark/sql/observation.py +152 -0
  638. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/__init__.py +21 -0
  639. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/__init__.pyi +344 -0
  640. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/protocols/__init__.pyi +17 -0
  641. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/protocols/frame.pyi +20 -0
  642. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/protocols/series.pyi +20 -0
  643. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/conversion.py +671 -0
  644. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/functions.py +480 -0
  645. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/functions.pyi +132 -0
  646. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/group_ops.py +523 -0
  647. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/map_ops.py +216 -0
  648. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/serializers.py +1019 -0
  649. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/typehints.py +172 -0
  650. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/types.py +972 -0
  651. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/utils.py +86 -0
  652. snowflake/snowpark_connect/includes/python/pyspark/sql/protobuf/__init__.py +18 -0
  653. snowflake/snowpark_connect/includes/python/pyspark/sql/protobuf/functions.py +334 -0
  654. snowflake/snowpark_connect/includes/python/pyspark/sql/readwriter.py +2159 -0
  655. snowflake/snowpark_connect/includes/python/pyspark/sql/session.py +2088 -0
  656. snowflake/snowpark_connect/includes/python/pyspark/sql/sql_formatter.py +84 -0
  657. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/__init__.py +21 -0
  658. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/listener.py +1050 -0
  659. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/query.py +746 -0
  660. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/readwriter.py +1652 -0
  661. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/state.py +288 -0
  662. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/__init__.py +16 -0
  663. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/__init__.py +16 -0
  664. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/__init__.py +16 -0
  665. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/test_artifact.py +420 -0
  666. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/test_client.py +358 -0
  667. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/__init__.py +16 -0
  668. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_foreach.py +36 -0
  669. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_foreach_batch.py +44 -0
  670. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_listener.py +116 -0
  671. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_streaming.py +35 -0
  672. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_basic.py +3612 -0
  673. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_column.py +1042 -0
  674. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_function.py +2381 -0
  675. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_plan.py +1060 -0
  676. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow.py +163 -0
  677. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow_map.py +38 -0
  678. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow_python_udf.py +48 -0
  679. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_catalog.py +36 -0
  680. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_column.py +55 -0
  681. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_conf.py +36 -0
  682. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_dataframe.py +96 -0
  683. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_datasources.py +44 -0
  684. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_errors.py +36 -0
  685. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_functions.py +59 -0
  686. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_group.py +36 -0
  687. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_cogrouped_map.py +59 -0
  688. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_grouped_map.py +74 -0
  689. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_grouped_map_with_state.py +62 -0
  690. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_map.py +58 -0
  691. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf.py +70 -0
  692. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_grouped_agg.py +50 -0
  693. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_scalar.py +68 -0
  694. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_window.py +40 -0
  695. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_readwriter.py +46 -0
  696. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_serde.py +44 -0
  697. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_types.py +100 -0
  698. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_udf.py +100 -0
  699. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_udtf.py +163 -0
  700. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_session.py +181 -0
  701. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_utils.py +42 -0
  702. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/__init__.py +16 -0
  703. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_cogrouped_map.py +623 -0
  704. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_grouped_map.py +869 -0
  705. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_grouped_map_with_state.py +342 -0
  706. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_map.py +436 -0
  707. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf.py +363 -0
  708. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_grouped_agg.py +592 -0
  709. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_scalar.py +1503 -0
  710. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_typehints.py +392 -0
  711. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_typehints_with_future_annotations.py +375 -0
  712. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_window.py +411 -0
  713. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/__init__.py +16 -0
  714. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming.py +401 -0
  715. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_foreach.py +295 -0
  716. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_foreach_batch.py +106 -0
  717. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_listener.py +558 -0
  718. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow.py +1346 -0
  719. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow_map.py +182 -0
  720. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow_python_udf.py +202 -0
  721. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_catalog.py +503 -0
  722. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_column.py +225 -0
  723. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_conf.py +83 -0
  724. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_context.py +201 -0
  725. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_dataframe.py +1931 -0
  726. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_datasources.py +256 -0
  727. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_errors.py +69 -0
  728. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_functions.py +1349 -0
  729. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_group.py +53 -0
  730. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_pandas_sqlmetrics.py +68 -0
  731. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_readwriter.py +283 -0
  732. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_serde.py +155 -0
  733. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_session.py +412 -0
  734. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_types.py +1581 -0
  735. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udf.py +961 -0
  736. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udf_profiler.py +165 -0
  737. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udtf.py +1456 -0
  738. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_utils.py +1686 -0
  739. snowflake/snowpark_connect/includes/python/pyspark/sql/types.py +2558 -0
  740. snowflake/snowpark_connect/includes/python/pyspark/sql/udf.py +714 -0
  741. snowflake/snowpark_connect/includes/python/pyspark/sql/udtf.py +325 -0
  742. snowflake/snowpark_connect/includes/python/pyspark/sql/utils.py +339 -0
  743. snowflake/snowpark_connect/includes/python/pyspark/sql/window.py +492 -0
  744. snowflake/snowpark_connect/includes/python/pyspark/statcounter.py +165 -0
  745. snowflake/snowpark_connect/includes/python/pyspark/status.py +112 -0
  746. snowflake/snowpark_connect/includes/python/pyspark/storagelevel.py +97 -0
  747. snowflake/snowpark_connect/includes/python/pyspark/streaming/__init__.py +22 -0
  748. snowflake/snowpark_connect/includes/python/pyspark/streaming/context.py +471 -0
  749. snowflake/snowpark_connect/includes/python/pyspark/streaming/dstream.py +933 -0
  750. snowflake/snowpark_connect/includes/python/pyspark/streaming/kinesis.py +205 -0
  751. snowflake/snowpark_connect/includes/python/pyspark/streaming/listener.py +83 -0
  752. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/__init__.py +16 -0
  753. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_context.py +184 -0
  754. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_dstream.py +706 -0
  755. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_kinesis.py +118 -0
  756. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_listener.py +160 -0
  757. snowflake/snowpark_connect/includes/python/pyspark/streaming/util.py +168 -0
  758. snowflake/snowpark_connect/includes/python/pyspark/taskcontext.py +502 -0
  759. snowflake/snowpark_connect/includes/python/pyspark/testing/__init__.py +21 -0
  760. snowflake/snowpark_connect/includes/python/pyspark/testing/connectutils.py +199 -0
  761. snowflake/snowpark_connect/includes/python/pyspark/testing/mllibutils.py +30 -0
  762. snowflake/snowpark_connect/includes/python/pyspark/testing/mlutils.py +275 -0
  763. snowflake/snowpark_connect/includes/python/pyspark/testing/objects.py +121 -0
  764. snowflake/snowpark_connect/includes/python/pyspark/testing/pandasutils.py +714 -0
  765. snowflake/snowpark_connect/includes/python/pyspark/testing/sqlutils.py +168 -0
  766. snowflake/snowpark_connect/includes/python/pyspark/testing/streamingutils.py +178 -0
  767. snowflake/snowpark_connect/includes/python/pyspark/testing/utils.py +636 -0
  768. snowflake/snowpark_connect/includes/python/pyspark/tests/__init__.py +16 -0
  769. snowflake/snowpark_connect/includes/python/pyspark/tests/test_appsubmit.py +306 -0
  770. snowflake/snowpark_connect/includes/python/pyspark/tests/test_broadcast.py +196 -0
  771. snowflake/snowpark_connect/includes/python/pyspark/tests/test_conf.py +44 -0
  772. snowflake/snowpark_connect/includes/python/pyspark/tests/test_context.py +346 -0
  773. snowflake/snowpark_connect/includes/python/pyspark/tests/test_daemon.py +89 -0
  774. snowflake/snowpark_connect/includes/python/pyspark/tests/test_install_spark.py +124 -0
  775. snowflake/snowpark_connect/includes/python/pyspark/tests/test_join.py +69 -0
  776. snowflake/snowpark_connect/includes/python/pyspark/tests/test_memory_profiler.py +167 -0
  777. snowflake/snowpark_connect/includes/python/pyspark/tests/test_pin_thread.py +194 -0
  778. snowflake/snowpark_connect/includes/python/pyspark/tests/test_profiler.py +168 -0
  779. snowflake/snowpark_connect/includes/python/pyspark/tests/test_rdd.py +939 -0
  780. snowflake/snowpark_connect/includes/python/pyspark/tests/test_rddbarrier.py +52 -0
  781. snowflake/snowpark_connect/includes/python/pyspark/tests/test_rddsampler.py +66 -0
  782. snowflake/snowpark_connect/includes/python/pyspark/tests/test_readwrite.py +368 -0
  783. snowflake/snowpark_connect/includes/python/pyspark/tests/test_serializers.py +257 -0
  784. snowflake/snowpark_connect/includes/python/pyspark/tests/test_shuffle.py +267 -0
  785. snowflake/snowpark_connect/includes/python/pyspark/tests/test_stage_sched.py +153 -0
  786. snowflake/snowpark_connect/includes/python/pyspark/tests/test_statcounter.py +130 -0
  787. snowflake/snowpark_connect/includes/python/pyspark/tests/test_taskcontext.py +350 -0
  788. snowflake/snowpark_connect/includes/python/pyspark/tests/test_util.py +97 -0
  789. snowflake/snowpark_connect/includes/python/pyspark/tests/test_worker.py +271 -0
  790. snowflake/snowpark_connect/includes/python/pyspark/traceback_utils.py +81 -0
  791. snowflake/snowpark_connect/includes/python/pyspark/util.py +416 -0
  792. snowflake/snowpark_connect/includes/python/pyspark/version.py +19 -0
  793. snowflake/snowpark_connect/includes/python/pyspark/worker.py +1307 -0
  794. snowflake/snowpark_connect/includes/python/pyspark/worker_util.py +46 -0
  795. snowflake/snowpark_connect/proto/__init__.py +10 -0
  796. snowflake/snowpark_connect/proto/control_pb2.py +35 -0
  797. snowflake/snowpark_connect/proto/control_pb2.pyi +38 -0
  798. snowflake/snowpark_connect/proto/control_pb2_grpc.py +183 -0
  799. snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.py +35 -0
  800. snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.pyi +53 -0
  801. snowflake/snowpark_connect/proto/snowflake_rdd_pb2.pyi +39 -0
  802. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.py +47 -0
  803. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.pyi +111 -0
  804. snowflake/snowpark_connect/relation/__init__.py +3 -0
  805. snowflake/snowpark_connect/relation/catalogs/__init__.py +12 -0
  806. snowflake/snowpark_connect/relation/catalogs/abstract_spark_catalog.py +287 -0
  807. snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py +467 -0
  808. snowflake/snowpark_connect/relation/catalogs/utils.py +51 -0
  809. snowflake/snowpark_connect/relation/io_utils.py +76 -0
  810. snowflake/snowpark_connect/relation/map_aggregate.py +322 -0
  811. snowflake/snowpark_connect/relation/map_catalog.py +151 -0
  812. snowflake/snowpark_connect/relation/map_column_ops.py +1068 -0
  813. snowflake/snowpark_connect/relation/map_crosstab.py +48 -0
  814. snowflake/snowpark_connect/relation/map_extension.py +412 -0
  815. snowflake/snowpark_connect/relation/map_join.py +341 -0
  816. snowflake/snowpark_connect/relation/map_local_relation.py +326 -0
  817. snowflake/snowpark_connect/relation/map_map_partitions.py +146 -0
  818. snowflake/snowpark_connect/relation/map_relation.py +253 -0
  819. snowflake/snowpark_connect/relation/map_row_ops.py +716 -0
  820. snowflake/snowpark_connect/relation/map_sample_by.py +35 -0
  821. snowflake/snowpark_connect/relation/map_show_string.py +50 -0
  822. snowflake/snowpark_connect/relation/map_sql.py +1874 -0
  823. snowflake/snowpark_connect/relation/map_stats.py +324 -0
  824. snowflake/snowpark_connect/relation/map_subquery_alias.py +32 -0
  825. snowflake/snowpark_connect/relation/map_udtf.py +288 -0
  826. snowflake/snowpark_connect/relation/read/__init__.py +7 -0
  827. snowflake/snowpark_connect/relation/read/jdbc_read_dbapi.py +668 -0
  828. snowflake/snowpark_connect/relation/read/map_read.py +367 -0
  829. snowflake/snowpark_connect/relation/read/map_read_csv.py +142 -0
  830. snowflake/snowpark_connect/relation/read/map_read_jdbc.py +108 -0
  831. snowflake/snowpark_connect/relation/read/map_read_json.py +344 -0
  832. snowflake/snowpark_connect/relation/read/map_read_parquet.py +194 -0
  833. snowflake/snowpark_connect/relation/read/map_read_socket.py +59 -0
  834. snowflake/snowpark_connect/relation/read/map_read_table.py +109 -0
  835. snowflake/snowpark_connect/relation/read/map_read_text.py +106 -0
  836. snowflake/snowpark_connect/relation/read/reader_config.py +399 -0
  837. snowflake/snowpark_connect/relation/read/utils.py +155 -0
  838. snowflake/snowpark_connect/relation/stage_locator.py +161 -0
  839. snowflake/snowpark_connect/relation/utils.py +219 -0
  840. snowflake/snowpark_connect/relation/write/__init__.py +3 -0
  841. snowflake/snowpark_connect/relation/write/jdbc_write_dbapi.py +339 -0
  842. snowflake/snowpark_connect/relation/write/map_write.py +436 -0
  843. snowflake/snowpark_connect/relation/write/map_write_jdbc.py +48 -0
  844. snowflake/snowpark_connect/resources/java_udfs-1.0-SNAPSHOT.jar +0 -0
  845. snowflake/snowpark_connect/resources_initializer.py +75 -0
  846. snowflake/snowpark_connect/server.py +1136 -0
  847. snowflake/snowpark_connect/start_server.py +32 -0
  848. snowflake/snowpark_connect/tcm.py +8 -0
  849. snowflake/snowpark_connect/type_mapping.py +1003 -0
  850. snowflake/snowpark_connect/typed_column.py +94 -0
  851. snowflake/snowpark_connect/utils/__init__.py +3 -0
  852. snowflake/snowpark_connect/utils/artifacts.py +48 -0
  853. snowflake/snowpark_connect/utils/attribute_handling.py +72 -0
  854. snowflake/snowpark_connect/utils/cache.py +84 -0
  855. snowflake/snowpark_connect/utils/concurrent.py +124 -0
  856. snowflake/snowpark_connect/utils/context.py +390 -0
  857. snowflake/snowpark_connect/utils/describe_query_cache.py +231 -0
  858. snowflake/snowpark_connect/utils/interrupt.py +85 -0
  859. snowflake/snowpark_connect/utils/io_utils.py +35 -0
  860. snowflake/snowpark_connect/utils/pandas_udtf_utils.py +117 -0
  861. snowflake/snowpark_connect/utils/profiling.py +47 -0
  862. snowflake/snowpark_connect/utils/session.py +180 -0
  863. snowflake/snowpark_connect/utils/snowpark_connect_logging.py +38 -0
  864. snowflake/snowpark_connect/utils/telemetry.py +513 -0
  865. snowflake/snowpark_connect/utils/udf_cache.py +392 -0
  866. snowflake/snowpark_connect/utils/udf_helper.py +328 -0
  867. snowflake/snowpark_connect/utils/udf_utils.py +310 -0
  868. snowflake/snowpark_connect/utils/udtf_helper.py +420 -0
  869. snowflake/snowpark_connect/utils/udtf_utils.py +799 -0
  870. snowflake/snowpark_connect/utils/xxhash64.py +247 -0
  871. snowflake/snowpark_connect/version.py +6 -0
  872. snowpark_connect-0.20.2.data/scripts/snowpark-connect +71 -0
  873. snowpark_connect-0.20.2.data/scripts/snowpark-session +11 -0
  874. snowpark_connect-0.20.2.data/scripts/snowpark-submit +354 -0
  875. snowpark_connect-0.20.2.dist-info/METADATA +37 -0
  876. snowpark_connect-0.20.2.dist-info/RECORD +879 -0
  877. snowpark_connect-0.20.2.dist-info/WHEEL +5 -0
  878. snowpark_connect-0.20.2.dist-info/licenses/LICENSE.txt +202 -0
  879. snowpark_connect-0.20.2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1862 @@
1
+ #
2
+ # Licensed to the Apache Software Foundation (ASF) under one or more
3
+ # contributor license agreements. See the NOTICE file distributed with
4
+ # this work for additional information regarding copyright ownership.
5
+ # The ASF licenses this file to You under the Apache License, Version 2.0
6
+ # (the "License"); you may not use this file except in compliance with
7
+ # the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ #
17
+
18
+ """
19
+ A loc indexer for pandas-on-Spark DataFrame/Series.
20
+ """
21
+ from abc import ABCMeta, abstractmethod
22
+ from collections.abc import Iterable
23
+ from functools import reduce
24
+ from typing import Any, Optional, List, Tuple, TYPE_CHECKING, Union, cast, Sized
25
+
26
+ import pandas as pd
27
+ from pandas.api.types import is_list_like # type: ignore[attr-defined]
28
+ from pyspark.sql import functions as F, Column as PySparkColumn
29
+ from pyspark.sql.types import BooleanType, LongType, DataType
30
+ from pyspark.errors import AnalysisException
31
+ import numpy as np
32
+
33
+ from pyspark import pandas as ps # noqa: F401
34
+ from pyspark.pandas._typing import Label, Name, Scalar
35
+ from pyspark.pandas.internal import (
36
+ DEFAULT_SERIES_NAME,
37
+ InternalField,
38
+ InternalFrame,
39
+ NATURAL_ORDER_COLUMN_NAME,
40
+ SPARK_DEFAULT_SERIES_NAME,
41
+ )
42
+ from pyspark.pandas.exceptions import SparkPandasIndexingError, SparkPandasNotImplementedError
43
+ from pyspark.pandas.utils import (
44
+ is_name_like_tuple,
45
+ is_name_like_value,
46
+ lazy_property,
47
+ name_like_string,
48
+ same_anchor,
49
+ scol_for,
50
+ spark_column_equals,
51
+ verify_temp_column_name,
52
+ )
53
+
54
+ # For Supporting Spark Connect
55
+ from pyspark.sql.utils import get_column_class
56
+
57
+ if TYPE_CHECKING:
58
+ from pyspark.pandas.frame import DataFrame
59
+ from pyspark.pandas.generic import Frame
60
+ from pyspark.pandas.series import Series
61
+
62
+
63
class IndexerLike:
    """
    Common base for the indexer helpers attached to a pandas-on-Spark object.

    It stores the wrapped DataFrame or Series and exposes a few private
    properties the concrete indexers rely on.
    """

    def __init__(self, psdf_or_psser: "Frame"):
        # Imported lazily, at call time rather than at module import time.
        from pyspark.pandas.frame import DataFrame
        from pyspark.pandas.series import Series

        supported = (DataFrame, Series)
        assert isinstance(psdf_or_psser, supported), "unexpected argument type: {}".format(
            type(psdf_or_psser)
        )
        self._psdf_or_psser = psdf_or_psser

    @property
    def _is_df(self) -> bool:
        """Whether the wrapped object is a pandas-on-Spark DataFrame."""
        from pyspark.pandas.frame import DataFrame

        wrapped = self._psdf_or_psser
        return isinstance(wrapped, DataFrame)

    @property
    def _is_series(self) -> bool:
        """Whether the wrapped object is a pandas-on-Spark Series."""
        from pyspark.pandas.series import Series

        wrapped = self._psdf_or_psser
        return isinstance(wrapped, Series)

    @property
    def _psdf(self) -> "DataFrame":
        """The wrapped DataFrame, or the ``_psdf`` of the wrapped Series."""
        if not self._is_df:
            assert self._is_series
            return self._psdf_or_psser._psdf
        return cast("DataFrame", self._psdf_or_psser)

    @property
    def _internal(self) -> InternalFrame:
        """The ``_internal`` frame of the DataFrame returned by ``_psdf``."""
        frame = self._psdf
        return frame._internal
96
+
97
+
98
class AtIndexer(IndexerLike):
    """
    Look up the value stored at one row label / column label pair.

    When the row label is not unique, every matching value is returned as an array.
    This is a label-based lookup in the same spirit as ``loc``; prefer ``at`` when
    only a single value of a DataFrame or Series is needed.

    .. note:: Unlike pandas, pandas-on-Spark only allows using ``at`` to get values but not to
        set them.

    .. note:: Warning: If ``row_index`` matches a lot of rows, large amounts of data will be
        fetched, potentially causing your machine to run out of memory.

    Raises
    ------
    KeyError
        When label does not exist in DataFrame

    Examples
    --------
    >>> psdf = ps.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]],
    ...                     index=[4, 5, 5], columns=['A', 'B', 'C'])
    >>> psdf
        A   B   C
    4   0   2   3
    5   0   4   1
    5  10  20  30

    Get value at specified row/column pair

    >>> psdf.at[4, 'B']
    2

    Get array if an index occurs multiple times

    >>> psdf.at[5, 'B']
    array([ 4, 20])
    """

    def __getitem__(self, key: Any) -> Union["Series", "DataFrame", Scalar]:
        # Split the key into a row selector and a column selector.
        if self._is_df:
            if not (isinstance(key, tuple) and len(key) == 2):
                raise TypeError("Use DataFrame.at like .at[row_index, column_name]")
            row_sel, col_sel = key
        else:
            assert self._is_series, type(self._psdf_or_psser)
            if isinstance(key, tuple) and len(key) != 1:
                raise TypeError("Use Series.at like .at[row_index]")
            row_sel = key
            col_sel = self._psdf_or_psser._column_label

        internal = self._internal

        # Normalise the row selector to a tuple with one entry per index level.
        if internal.index_level == 1:
            if not is_name_like_value(row_sel, allow_none=False, allow_tuple=False):
                raise ValueError("At based indexing on a single index can only have a single value")
            row_sel = (row_sel,)
        elif not is_name_like_tuple(row_sel, allow_none=False):
            raise ValueError("At based indexing on multi-index can only have tuple values")

        # Normalise the column selector to a label tuple.
        if col_sel is not None:
            if not is_name_like_value(col_sel, allow_none=False):
                raise ValueError("At based indexing on multi-index can only have tuple values")
            if not is_name_like_tuple(col_sel):
                col_sel = (col_sel,)

        # AND together one equality predicate per (index column, requested value) pair.
        level_matches = [
            index_scol == wanted
            for index_scol, wanted in zip(internal.index_spark_columns, row_sel)
        ]
        cond = reduce(lambda lhs, rhs: lhs & rhs, level_matches)

        # Filter the rows, keep only the requested column, and collect to pandas.
        sdf = internal.spark_frame.drop(NATURAL_ORDER_COLUMN_NAME)
        matched = sdf.filter(cond)
        pdf = matched.select(internal.spark_column_for(col_sel)).toPandas()

        if len(pdf) < 1:
            raise KeyError(name_like_string(row_sel))

        values = pdf.iloc[:, 0].values
        # Return the whole array for a partial index key or several matches;
        # otherwise unwrap the single value.
        if len(row_sel) < internal.index_level or len(values) > 1:
            return values
        return values[0]
181
+
182
+
183
class iAtIndexer(IndexerLike):
    """
    Access a single value for a row/column pair by integer position.

    Like ``iloc``, in that both provide integer-based lookups. Use
    ``iat`` if you only need to get a single value in a DataFrame
    or Series.

    .. note:: This class only implements ``__getitem__``; the lookup itself is
        delegated to ``iloc`` on the wrapped object.

    Raises
    ------
    TypeError
        When the key does not have the expected shape (a 2-tuple for a DataFrame,
        a single position for a Series).
    ValueError
        When a position is not an integer.

    Examples
    --------
    >>> df = ps.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]],
    ...                   columns=['A', 'B', 'C'])
    >>> df
        A   B   C
    0   0   2   3
    1   0   4   1
    2  10  20  30

    Get value at specified row/column pair

    >>> df.iat[1, 2]
    1

    Get value within a series

    >>> psser = ps.Series([1, 2, 3], index=[10, 20, 30])
    >>> psser
    10    1
    20    2
    30    3
    dtype: int64

    >>> psser.iat[1]
    2
    """

    def __getitem__(self, key: Any) -> Union["Series", "DataFrame", Scalar]:
        """
        Validate ``key`` and delegate the positional lookup to ``iloc``.

        Parameters
        ----------
        key : for a DataFrame, a ``(row_position, column_position)`` tuple of ints;
            for a Series, a single int.

        Raises
        ------
        TypeError
            If the DataFrame key is not a 2-tuple, or the Series key is a sized,
            non-integer object whose length is not 1.
        ValueError
            If any position is not an ``int``.
        """
        if self._is_df:
            if not isinstance(key, tuple) or len(key) != 2:
                raise TypeError(
                    "Use DataFrame.iat like .iat[row_integer_position, column_integer_position]"
                )
            row_sel, col_sel = key
            if not isinstance(row_sel, int) or not isinstance(col_sel, int):
                raise ValueError("iAt based indexing can only have integer indexers")
            return self._psdf_or_psser.iloc[row_sel, col_sel]
        else:
            assert self._is_series, type(self._psdf_or_psser)
            if not isinstance(key, int):
                # Only call len() on keys that actually have a length; a key such as
                # a float or None previously failed inside len() with an unrelated
                # TypeError instead of reaching the intended ValueError below.
                if isinstance(key, Sized) and len(key) != 1:
                    raise TypeError("Use Series.iat like .iat[row_integer_position]")
                raise ValueError("iAt based indexing can only have integer indexers")
            return self._psdf_or_psser.iloc[key]
241
+
242
+
243
+ class LocIndexerLike(IndexerLike, metaclass=ABCMeta):
244
+ def _select_rows(
245
+ self, rows_sel: Any
246
+ ) -> Tuple[Optional[PySparkColumn], Optional[int], Optional[int]]:
247
+ """
248
+ Dispatch the logic for select rows to more specific methods by `rows_sel` argument types.
249
+
250
+ Parameters
251
+ ----------
252
+ rows_sel : the key specified to select rows.
253
+
254
+ Returns
255
+ -------
256
+ Tuple of Spark column, int, int:
257
+
258
+ * The Spark column for the condition to filter the rows.
259
+ * The number of rows when the selection can be simplified by limit.
260
+ * The remaining index rows if the result index size is shrunk.
261
+ """
262
+ from pyspark.pandas.series import Series
263
+
264
+ Column = get_column_class()
265
+ if rows_sel is None:
266
+ return None, None, None
267
+ elif isinstance(rows_sel, Series):
268
+ return self._select_rows_by_series(rows_sel)
269
+ elif isinstance(rows_sel, Column):
270
+ return self._select_rows_by_spark_column(rows_sel)
271
+ elif isinstance(rows_sel, slice):
272
+ if rows_sel == slice(None):
273
+ # If slice is None - select everything, so nothing to do
274
+ return None, None, None
275
+ return self._select_rows_by_slice(rows_sel)
276
+ elif isinstance(rows_sel, tuple):
277
+ return self._select_rows_else(rows_sel)
278
+ elif is_list_like(rows_sel):
279
+ return self._select_rows_by_iterable(rows_sel)
280
+ else:
281
+ return self._select_rows_else(rows_sel)
282
+
283
+ def _select_cols(
284
+ self, cols_sel: Any, missing_keys: Optional[List[Name]] = None
285
+ ) -> Tuple[
286
+ List[Label],
287
+ Optional[List[PySparkColumn]],
288
+ Optional[List[InternalField]],
289
+ bool,
290
+ Optional[Name],
291
+ ]:
292
+ """
293
+ Dispatch the logic for select columns to more specific methods by `cols_sel` argument types.
294
+
295
+ Parameters
296
+ ----------
297
+ cols_sel : the key specified to select columns.
298
+
299
+ Returns
300
+ -------
301
+ Tuple of list of column label, list of Spark columns, list of dtypes, bool:
302
+
303
+ * The column labels selected.
304
+ * The Spark columns selected.
305
+ * The field metadata selected.
306
+ * The boolean value whether Series should be returned or not.
307
+ * The Series name if needed.
308
+ """
309
+ from pyspark.pandas.series import Series
310
+
311
+ Column = get_column_class()
312
+ if cols_sel is None:
313
+ column_labels = self._internal.column_labels
314
+ data_spark_columns = self._internal.data_spark_columns
315
+ data_fields = self._internal.data_fields
316
+ return column_labels, data_spark_columns, data_fields, False, None
317
+ elif isinstance(cols_sel, Series):
318
+ return self._select_cols_by_series(cols_sel, missing_keys)
319
+ elif isinstance(cols_sel, Column):
320
+ return self._select_cols_by_spark_column(cols_sel, missing_keys)
321
+ elif isinstance(cols_sel, slice):
322
+ if cols_sel == slice(None):
323
+ # If slice is None - select everything, so nothing to do
324
+ column_labels = self._internal.column_labels
325
+ data_spark_columns = self._internal.data_spark_columns
326
+ data_fields = self._internal.data_fields
327
+ return column_labels, data_spark_columns, data_fields, False, None
328
+ return self._select_cols_by_slice(cols_sel, missing_keys)
329
+ elif isinstance(cols_sel, tuple):
330
+ return self._select_cols_else(cols_sel, missing_keys)
331
+ elif is_list_like(cols_sel):
332
+ return self._select_cols_by_iterable(cols_sel, missing_keys)
333
+ else:
334
+ return self._select_cols_else(cols_sel, missing_keys)
335
+
336
+ # Methods for row selection
337
+
338
+ @abstractmethod
339
+ def _select_rows_by_series(
340
+ self, rows_sel: "Series"
341
+ ) -> Tuple[Optional[PySparkColumn], Optional[int], Optional[int]]:
342
+ """Select rows by `Series` type key."""
343
+ pass
344
+
345
+ @abstractmethod
346
+ def _select_rows_by_spark_column(
347
+ self, rows_sel: PySparkColumn
348
+ ) -> Tuple[Optional[PySparkColumn], Optional[int], Optional[int]]:
349
+ """Select rows by Spark `Column` type key."""
350
+ pass
351
+
352
+ @abstractmethod
353
+ def _select_rows_by_slice(
354
+ self, rows_sel: slice
355
+ ) -> Tuple[Optional[PySparkColumn], Optional[int], Optional[int]]:
356
+ """Select rows by `slice` type key."""
357
+ pass
358
+
359
+ @abstractmethod
360
+ def _select_rows_by_iterable(
361
+ self, rows_sel: Iterable
362
+ ) -> Tuple[Optional[PySparkColumn], Optional[int], Optional[int]]:
363
+ """Select rows by `Iterable` type key."""
364
+ pass
365
+
366
+ @abstractmethod
367
+ def _select_rows_else(
368
+ self, rows_sel: Any
369
+ ) -> Tuple[Optional[PySparkColumn], Optional[int], Optional[int]]:
370
+ """Select rows by other type key."""
371
+ pass
372
+
373
+ # Methods for col selection
374
+
375
+ @abstractmethod
376
+ def _select_cols_by_series(
377
+ self, cols_sel: "Series", missing_keys: Optional[List[Name]]
378
+ ) -> Tuple[
379
+ List[Label],
380
+ Optional[List[PySparkColumn]],
381
+ Optional[List[InternalField]],
382
+ bool,
383
+ Optional[Name],
384
+ ]:
385
+ """Select columns by `Series` type key."""
386
+ pass
387
+
388
@abstractmethod
def _select_cols_by_spark_column(
    self, cols_sel: PySparkColumn, missing_keys: Optional[List[Name]]
) -> Tuple[
    List[Label],
    Optional[List[PySparkColumn]],
    Optional[List[InternalField]],
    bool,
    Optional[Name],
]:
    """
    Resolve a column key given as a Spark `Column`.

    Abstract hook with no behaviour of its own. The return annotation is the
    5-tuple that ``__getitem__`` unpacks from ``_select_cols`` as
    ``(column_labels, data_spark_columns, data_fields, returns_series,
    series_name)``. `missing_keys` is an optional list supplied by the caller;
    how it is used is left to the implementation.
    """
400
+
401
@abstractmethod
def _select_cols_by_slice(
    self, cols_sel: slice, missing_keys: Optional[List[Name]]
) -> Tuple[
    List[Label],
    Optional[List[PySparkColumn]],
    Optional[List[InternalField]],
    bool,
    Optional[Name],
]:
    """
    Resolve a column key given as a `slice`.

    Abstract hook with no behaviour of its own. The return annotation is the
    5-tuple that ``__getitem__`` unpacks from ``_select_cols`` as
    ``(column_labels, data_spark_columns, data_fields, returns_series,
    series_name)``. `missing_keys` is an optional list supplied by the caller;
    how it is used is left to the implementation.
    """
413
+
414
@abstractmethod
def _select_cols_by_iterable(
    self, cols_sel: Iterable, missing_keys: Optional[List[Name]]
) -> Tuple[
    List[Label],
    Optional[List[PySparkColumn]],
    Optional[List[InternalField]],
    bool,
    Optional[Name],
]:
    """
    Resolve a column key given as an `Iterable`.

    Abstract hook with no behaviour of its own; ``_select_cols`` forwards
    list-like keys here. The return annotation is the 5-tuple that
    ``__getitem__`` unpacks as ``(column_labels, data_spark_columns,
    data_fields, returns_series, series_name)``. `missing_keys` is an
    optional list supplied by the caller; how it is used is left to the
    implementation.
    """
426
+
427
@abstractmethod
def _select_cols_else(
    self, cols_sel: Any, missing_keys: Optional[List[Name]]
) -> Tuple[
    List[Label],
    Optional[List[PySparkColumn]],
    Optional[List[InternalField]],
    bool,
    Optional[Name],
]:
    """
    Resolve a column key of any other type.

    Abstract hook with no behaviour of its own; ``_select_cols`` forwards
    tuple keys and keys matching none of its other branches here. The return
    annotation is the 5-tuple that ``__getitem__`` unpacks as
    ``(column_labels, data_spark_columns, data_fields, returns_series,
    series_name)``. `missing_keys` is an optional list supplied by the
    caller; how it is used is left to the implementation.
    """
439
+
440
def __getitem__(self, key: Any) -> Union["Series", "DataFrame"]:
    """
    Select rows (and, for a DataFrame, columns) described by `key`.

    Parameters
    ----------
    key : Any
        For a Series-backed indexer, a row selector. For a DataFrame-backed
        indexer, either a row selector or a ``(rows, cols)`` pair.

    Returns
    -------
    Series or DataFrame
        The selected data. When the row selector consumes every index level
        (``remaining_index == 0``) and exactly one row matches, the first
        element of the collected pandas object is returned instead.

    Raises
    ------
    SparkPandasIndexingError
        If a tuple key on a DataFrame does not have exactly two elements.
    KeyError
        If Spark raises ``AnalysisException`` while building the selection,
        or if ``remaining_index == 0`` and no row matches.
    """
    from pyspark.pandas.frame import DataFrame
    from pyspark.pandas.series import Series, first_series

    if self._is_series:
        if isinstance(key, Series) and not same_anchor(key, self._psdf_or_psser):
            # The key comes from a different frame: attach it as a temporary
            # column and retry the lookup with a key that shares the anchor.
            name = self._psdf_or_psser.name or DEFAULT_SERIES_NAME
            psdf = self._psdf_or_psser.to_frame(name)
            temp_col = verify_temp_column_name(psdf, "__temp_col__")

            psdf[temp_col] = key
            return type(self)(psdf[name].rename(self._psdf_or_psser.name))[psdf[temp_col]]

        cond, limit, remaining_index = self._select_rows(key)
        if cond is None and limit is None:
            # Nothing to filter: hand back the Series itself.
            return self._psdf_or_psser

        column_label = self._psdf_or_psser._column_label
        column_labels = [column_label]
        data_spark_columns = [self._internal.spark_column_for(column_label)]
        data_fields = [self._internal.field_for(column_label)]
        returns_series = True
        series_name = self._psdf_or_psser.name
    else:
        assert self._is_df
        if isinstance(key, tuple):
            if len(key) != 2:
                raise SparkPandasIndexingError("Only accepts pairs of candidates")
            rows_sel, cols_sel = key
        else:
            rows_sel = key
            cols_sel = None

        if isinstance(rows_sel, Series) and not same_anchor(rows_sel, self._psdf_or_psser):
            # Same alignment trick as the Series case; the trailing column
            # selection drops the temporary key column again.
            psdf = self._psdf_or_psser.copy()
            temp_col = verify_temp_column_name(cast("DataFrame", psdf), "__temp_col__")

            psdf[temp_col] = rows_sel
            return type(self)(psdf)[psdf[temp_col], cols_sel][list(self._psdf_or_psser.columns)]

        cond, limit, remaining_index = self._select_rows(rows_sel)
        (
            column_labels,
            data_spark_columns,
            data_fields,
            returns_series,
            series_name,
        ) = self._select_cols(cols_sel)

        if cond is None and limit is None and returns_series:
            # Pure column selection of a single column: no new frame needed.
            psser = self._psdf_or_psser._psser_for(column_labels[0])
            if series_name is not None and series_name != psser.name:
                psser = psser.rename(series_name)
            return psser

    if remaining_index is not None:
        # NOTE: for remaining_index == 0 the slice ``[-0:]`` keeps every level.
        index_spark_columns = self._internal.index_spark_columns[-remaining_index:]
        index_names = self._internal.index_names[-remaining_index:]
        index_fields = self._internal.index_fields[-remaining_index:]
    else:
        index_spark_columns = self._internal.index_spark_columns
        index_names = self._internal.index_names
        index_fields = self._internal.index_fields

    if len(column_labels) > 0:
        # Normalise the labels: number ``None`` labels and pad short labels
        # with "" so that every label has the same number of levels.
        column_labels = column_labels.copy()
        column_labels_level = max(
            len(label) if label is not None else 1 for label in column_labels
        )
        none_column = 0
        for i, label in enumerate(column_labels):
            if label is None:
                label = (none_column,)
                none_column += 1
            if len(label) < column_labels_level:
                label = tuple(list(label) + ([""]) * (column_labels_level - len(label)))
            column_labels[i] = label

        if i == 0 and none_column == 1:
            # A single unnamed column keeps its ``None`` label.
            column_labels = [None]

        column_label_names = self._internal.column_label_names[-column_labels_level:]
    else:
        column_label_names = self._internal.column_label_names

    try:
        sdf = self._internal.spark_frame

        if cond is not None:
            index_columns = sdf.select(index_spark_columns).columns
            data_columns = sdf.select(data_spark_columns).columns
            sdf = sdf.filter(cond).select(index_spark_columns + data_spark_columns)
            # Re-bind the column references to the filtered frame.
            index_spark_columns = [scol_for(sdf, col) for col in index_columns]
            data_spark_columns = [scol_for(sdf, col) for col in data_columns]

        if limit is not None:
            if limit >= 0:
                sdf = sdf.limit(limit)
            else:
                # A negative limit means "all but the last -limit rows". Clamp at
                # zero: when more rows are dropped than exist, a negative value
                # would make Spark fail and be misreported below as missing columns.
                sdf = sdf.limit(max(0, sdf.count() + limit))
        sdf = sdf.drop(NATURAL_ORDER_COLUMN_NAME)
    except AnalysisException:
        raise KeyError(
            "[{}] don't exist in columns".format(
                [col._jc.toString() for col in data_spark_columns]
            )
        )

    internal = InternalFrame(
        spark_frame=sdf,
        index_spark_columns=index_spark_columns,
        index_names=index_names,
        index_fields=index_fields,
        column_labels=column_labels,
        data_spark_columns=data_spark_columns,
        data_fields=data_fields,
        column_label_names=column_label_names,
    )
    psdf = DataFrame(internal)

    psdf_or_psser: Union[DataFrame, Series]
    if returns_series:
        psdf_or_psser = first_series(psdf)
        if series_name is not None and series_name != psdf_or_psser.name:
            psdf_or_psser = psdf_or_psser.rename(series_name)
    else:
        psdf_or_psser = psdf

    if remaining_index is not None and remaining_index == 0:
        # Collect at most two rows to tell "no match", "one match" and
        # "several matches" apart.
        pdf_or_pser = psdf_or_psser.head(2)._to_pandas()
        length = len(pdf_or_pser)
        if length == 0:
            raise KeyError(name_like_string(key))
        elif length == 1:
            return pdf_or_pser.iloc[0]
        else:
            return psdf_or_psser
    else:
        return psdf_or_psser
579
+
580
def __setitem__(self, key: Any, value: Any) -> None:
    """
    Assign `value` to the rows (and, for a DataFrame, columns) described by `key`.

    The underlying Series/DataFrame is updated in place through
    ``_update_internal_frame``; nothing is returned.

    Raises
    ------
    SparkPandasIndexingError
        If a tuple key on a DataFrame does not have exactly two elements.
    ValueError
        For a DataFrame value assigned to a Series, a multi-column DataFrame
        value, a Series/Column value combined with a key that consumes every
        index level, or a Series/Column value targeting more than one column.
    KeyError
        If a new column key has more levels than the column index.
    """
    from pyspark.pandas.frame import DataFrame
    from pyspark.pandas.series import Series, first_series

    Column = get_column_class()
    if self._is_series:
        if (
            isinstance(key, Series)
            and (isinstance(self, iLocIndexer) or not same_anchor(key, self._psdf_or_psser))
        ) or (
            isinstance(value, Series)
            and (isinstance(self, iLocIndexer) or not same_anchor(value, self._psdf_or_psser))
        ):
            # The key and/or value Series cannot be used directly: attach them
            # to a working frame as temporary columns, then redo the assignment
            # with plain Spark column references.
            if self._psdf_or_psser.name is None:
                psdf = self._psdf_or_psser.to_frame()
                column_label = psdf._internal.column_labels[0]
            else:
                psdf = self._psdf_or_psser._psdf.copy()
                column_label = self._psdf_or_psser._column_label
            temp_natural_order = verify_temp_column_name(psdf, "__temp_natural_order__")
            temp_key_col = verify_temp_column_name(psdf, "__temp_key_col__")
            temp_value_col = verify_temp_column_name(psdf, "__temp_value_col__")

            # Tag rows with an increasing id so the order can be restored by
            # the sort below, after the temporary columns have been attached.
            psdf[temp_natural_order] = F.monotonically_increasing_id()
            if isinstance(key, Series):
                psdf[temp_key_col] = key
            if isinstance(value, Series):
                psdf[temp_value_col] = value
            psdf = psdf.sort_values(temp_natural_order).drop(columns=temp_natural_order)

            psser = psdf._psser_for(column_label)
            # Replace the Series arguments by references to their temporary columns.
            if isinstance(key, Series):
                key = F.col(
                    "`{}`".format(psdf[temp_key_col]._internal.data_spark_column_names[0])
                )
            if isinstance(value, Series):
                value = F.col(
                    "`{}`".format(psdf[temp_value_col]._internal.data_spark_column_names[0])
                )

            # Recurse with an indexer of the same concrete type on the working Series.
            type(self)(psser)[key] = value

            if self._psdf_or_psser.name is None:
                psser = psser.rename()

            # Copy the result back, restricted to the original column labels
            # (this leaves the temporary columns out).
            self._psdf_or_psser._psdf._update_internal_frame(
                psser._psdf[
                    self._psdf_or_psser._psdf._internal.column_labels
                ]._internal.resolved_copy,
                check_same_anchor=False,
            )
            return

        if isinstance(value, DataFrame):
            raise ValueError("Incompatible indexer with DataFrame")

        cond, limit, remaining_index = self._select_rows(key)
        if cond is None:
            # No row condition means every row is a target.
            cond = F.lit(True)
        if limit is not None:
            # The limit is expressed as a bound on the indexer's sequence column;
            # NOTE(review): the cast suggests only iLocIndexer yields a limit - confirm.
            cond = cond & (
                self._internal.spark_frame[cast(iLocIndexer, self)._sequence_col] < F.lit(limit)
            )

        if isinstance(value, (Series, Column)):
            if remaining_index is not None and remaining_index == 0:
                raise ValueError(
                    "No axis named {} for object type {}".format(key, type(value).__name__)
                )
            if isinstance(value, Series):
                value = value.spark.column
        else:
            value = F.lit(value)
        # Rows matching `cond` take the new value, all others keep the current one.
        scol = (
            F.when(cond, value)
            .otherwise(self._internal.spark_column_for(self._psdf_or_psser._column_label))
            .alias(name_like_string(self._psdf_or_psser.name or SPARK_DEFAULT_SERIES_NAME))
        )

        internal = self._internal.with_new_spark_column(
            self._psdf_or_psser._column_label, scol  # TODO: dtype?
        )
        self._psdf_or_psser._psdf._update_internal_frame(internal, check_same_anchor=False)
    else:
        assert self._is_df

        if isinstance(key, tuple):
            if len(key) != 2:
                raise SparkPandasIndexingError("Only accepts pairs of candidates")
            rows_sel, cols_sel = key
        else:
            rows_sel = key
            cols_sel = None

        if isinstance(value, DataFrame):
            # A one-column DataFrame is treated as the Series it contains.
            if len(value.columns) == 1:
                value = first_series(value)
            else:
                raise ValueError("Only a dataframe with one column can be assigned")

        if (
            isinstance(rows_sel, Series)
            and (
                isinstance(self, iLocIndexer) or not same_anchor(rows_sel, self._psdf_or_psser)
            )
        ) or (
            isinstance(value, Series)
            and (isinstance(self, iLocIndexer) or not same_anchor(value, self._psdf_or_psser))
        ):
            # Same temporary-column approach as the Series branch above,
            # applied to a copy of the DataFrame.
            psdf = cast(DataFrame, self._psdf_or_psser.copy())
            temp_natural_order = verify_temp_column_name(psdf, "__temp_natural_order__")
            temp_key_col = verify_temp_column_name(psdf, "__temp_key_col__")
            temp_value_col = verify_temp_column_name(psdf, "__temp_value_col__")

            psdf[temp_natural_order] = F.monotonically_increasing_id()
            if isinstance(rows_sel, Series):
                psdf[temp_key_col] = rows_sel
            if isinstance(value, Series):
                psdf[temp_value_col] = value
            psdf = psdf.sort_values(temp_natural_order).drop(columns=temp_natural_order)

            if isinstance(rows_sel, Series):
                rows_sel = F.col(
                    "`{}`".format(psdf[temp_key_col]._internal.data_spark_column_names[0])
                )
            if isinstance(value, Series):
                value = F.col(
                    "`{}`".format(psdf[temp_value_col]._internal.data_spark_column_names[0])
                )

            type(self)(psdf)[rows_sel, cols_sel] = value

            # Copy the result back, restricted to the original columns
            # (this leaves the temporary columns out).
            self._psdf_or_psser._update_internal_frame(
                psdf[list(self._psdf_or_psser.columns)]._internal.resolved_copy,
                check_same_anchor=False,
            )
            return

        cond, limit, remaining_index = self._select_rows(rows_sel)
        # Passing a list makes the column selection collect unknown keys
        # instead of raising; they are appended as new columns further down.
        missing_keys: List[Name] = []
        _, data_spark_columns, _, _, _ = self._select_cols(cols_sel, missing_keys=missing_keys)

        if cond is None:
            cond = F.lit(True)
        if limit is not None:
            cond = cond & (
                self._internal.spark_frame[cast(iLocIndexer, self)._sequence_col] < F.lit(limit)
            )

        if isinstance(value, (Series, Column)):
            if remaining_index is not None and remaining_index == 0:
                raise ValueError("Incompatible indexer with Series")
            if len(data_spark_columns) > 1:
                raise ValueError("shape mismatch")
            if isinstance(value, Series):
                value = value.spark.column
        else:
            value = F.lit(value)

        new_data_spark_columns = []
        new_fields = []
        # Rebuild every existing data column; only the selected ones are
        # rewritten as "new value where cond holds, old value otherwise".
        for new_scol, spark_column_name, new_field in zip(
            self._internal.data_spark_columns,
            self._internal.data_spark_column_names,
            self._internal.data_fields,
        ):
            for scol in data_spark_columns:
                if spark_column_equals(new_scol, scol):
                    new_scol = F.when(cond, value).otherwise(scol).alias(spark_column_name)
                    # Re-derive the field from the schema of the rewritten column.
                    new_field = InternalField.from_struct_field(
                        self._internal.spark_frame.select(new_scol).schema[0],
                        use_extension_dtypes=new_field.is_extension_dtype,
                    )
                    break
            new_data_spark_columns.append(new_scol)
            new_fields.append(new_field)

        column_labels = self._internal.column_labels.copy()
        for missing in missing_keys:
            # Unknown keys become new columns; pad the label with "" up to the
            # column index depth and reject labels that are deeper than it.
            if is_name_like_tuple(missing):
                label = cast(Label, missing)
            else:
                label = cast(Label, (missing,))
            if len(label) < self._internal.column_labels_level:
                label = tuple(
                    list(label) + ([""] * (self._internal.column_labels_level - len(label)))
                )
            elif len(label) > self._internal.column_labels_level:
                raise KeyError(
                    "Key length ({}) exceeds index depth ({})".format(
                        len(label), self._internal.column_labels_level
                    )
                )
            column_labels.append(label)
            # No ``otherwise``: rows outside `cond` have no value in the new column.
            new_data_spark_columns.append(F.when(cond, value).alias(name_like_string(label)))
            new_fields.append(None)

        internal = self._internal.with_new_columns(
            new_data_spark_columns, column_labels=column_labels, data_fields=new_fields
        )
        self._psdf_or_psser._update_internal_frame(internal, check_same_anchor=False)
781
+
782
+
783
+ class LocIndexer(LocIndexerLike):
784
+ """
785
+ Access a group of rows and columns by label(s) or a boolean Series.
786
+
787
+ ``.loc[]`` is primarily label based, but may also be used with a
788
+ conditional boolean Series derived from the DataFrame or Series.
789
+
790
+ Allowed inputs are:
791
+
792
+ - A single label, e.g. ``5`` or ``'a'``, (note that ``5`` is
793
+ interpreted as a *label* of the index, and **never** as an
794
+ integer position along the index) for column selection.
795
+
796
+ - A list or array of labels, e.g. ``['a', 'b', 'c']``.
797
+
798
+ - A slice object with labels, e.g. ``'a':'f'``.
799
+
800
+ - A conditional boolean Series derived from the DataFrame or Series
801
+
802
+ - A boolean array of the same length as the column axis being sliced,
803
+ e.g. ``[True, False, True]``.
804
+
805
+ - An alignable boolean pandas Series to the column axis being sliced.
806
+ The index of the key will be aligned before masking.
807
+
808
+ Not allowed inputs which pandas allows are:
809
+
810
+ - A boolean array of the same length as the row axis being sliced,
811
+ e.g. ``[True, False, True]``.
812
+ - A ``callable`` function with one argument (the calling Series, DataFrame
813
+ or Panel) and that returns valid output for indexing (one of the above)
814
+
815
+ .. note:: MultiIndex is not supported yet.
816
+
817
+ .. note:: Note that contrary to usual python slices, **both** the
818
+ start and the stop are included, and the step of the slice is not allowed.
819
+
820
+ .. note:: With a list or array of labels for row selection,
821
+ pandas-on-Spark behaves as a filter without reordering by the labels.
822
+
823
+ See Also
824
+ --------
825
+ Series.loc : Access group of values using labels.
826
+
827
+ Examples
828
+ --------
829
+ **Getting values**
830
+
831
+ >>> df = ps.DataFrame([[1, 2], [4, 5], [7, 8]],
832
+ ... index=['cobra', 'viper', 'sidewinder'],
833
+ ... columns=['max_speed', 'shield'])
834
+ >>> df
835
+ max_speed shield
836
+ cobra 1 2
837
+ viper 4 5
838
+ sidewinder 7 8
839
+
840
+ Single label. Note this returns the row as a Series.
841
+
842
+ >>> df.loc['viper']
843
+ max_speed 4
844
+ shield 5
845
+ Name: viper, dtype: int64
846
+
847
+ List of labels. Note using ``[[]]`` returns a DataFrame.
848
+ Also note that pandas-on-Spark behaves just as a filter without reordering by the labels.
849
+
850
+ >>> df.loc[['viper', 'sidewinder']]
851
+ max_speed shield
852
+ viper 4 5
853
+ sidewinder 7 8
854
+
855
+ >>> df.loc[['sidewinder', 'viper']]
856
+ max_speed shield
857
+ viper 4 5
858
+ sidewinder 7 8
859
+
860
+ Single label for column.
861
+
862
+ >>> df.loc['cobra', 'shield']
863
+ 2
864
+
865
+ List of labels for row.
866
+
867
+ >>> df.loc[['cobra'], 'shield']
868
+ cobra 2
869
+ Name: shield, dtype: int64
870
+
871
+ List of labels for column.
872
+
873
+ >>> df.loc['cobra', ['shield']]
874
+ shield 2
875
+ Name: cobra, dtype: int64
876
+
877
+ List of labels for both row and column.
878
+
879
+ >>> df.loc[['cobra'], ['shield']]
880
+ shield
881
+ cobra 2
882
+
883
+ Slice with labels for row and single label for column.
884
+ Note that both the start and stop of the slice are included.
885
+
886
+ >>> df.loc['cobra':'viper', 'max_speed']
887
+ cobra 1
888
+ viper 4
889
+ Name: max_speed, dtype: int64
890
+
891
+ Conditional that returns a boolean Series
892
+
893
+ >>> df.loc[df['shield'] > 6]
894
+ max_speed shield
895
+ sidewinder 7 8
896
+
897
+ Conditional that returns a boolean Series with column labels specified
898
+
899
+ >>> df.loc[df['shield'] > 6, ['max_speed']]
900
+ max_speed
901
+ sidewinder 7
902
+
903
+ A boolean array of the same length as the column axis being sliced.
904
+
905
+ >>> df.loc[:, [False, True]]
906
+ shield
907
+ cobra 2
908
+ viper 5
909
+ sidewinder 8
910
+
911
+ An alignable boolean Series to the column axis being sliced.
912
+
913
+ >>> df.loc[:, pd.Series([False, True], index=['max_speed', 'shield'])]
914
+ shield
915
+ cobra 2
916
+ viper 5
917
+ sidewinder 8
918
+
919
+ **Setting values**
920
+
921
+ Setting value for all items matching the list of labels.
922
+
923
+ >>> df.loc[['viper', 'sidewinder'], ['shield']] = 50
924
+ >>> df
925
+ max_speed shield
926
+ cobra 1 2
927
+ viper 4 50
928
+ sidewinder 7 50
929
+
930
+ Setting value for an entire row
931
+
932
+ >>> df.loc['cobra'] = 10
933
+ >>> df
934
+ max_speed shield
935
+ cobra 10 10
936
+ viper 4 50
937
+ sidewinder 7 50
938
+
939
+ Set value for an entire column
940
+
941
+ >>> df.loc[:, 'max_speed'] = 30
942
+ >>> df
943
+ max_speed shield
944
+ cobra 30 10
945
+ viper 30 50
946
+ sidewinder 30 50
947
+
948
+ Set value for an entire list of columns
949
+
950
+ >>> df.loc[:, ['max_speed', 'shield']] = 100
951
+ >>> df
952
+ max_speed shield
953
+ cobra 100 100
954
+ viper 100 100
955
+ sidewinder 100 100
956
+
957
+ Set value with Series
958
+
959
+ >>> df.loc[:, 'shield'] = df['shield'] * 2
960
+ >>> df
961
+ max_speed shield
962
+ cobra 100 200
963
+ viper 100 200
964
+ sidewinder 100 200
965
+
966
+ **Getting values on a DataFrame with an index that has integer labels**
967
+
968
+ Another example using integers for the index
969
+
970
+ >>> df = ps.DataFrame([[1, 2], [4, 5], [7, 8]],
971
+ ... index=[7, 8, 9],
972
+ ... columns=['max_speed', 'shield'])
973
+ >>> df
974
+ max_speed shield
975
+ 7 1 2
976
+ 8 4 5
977
+ 9 7 8
978
+
979
+ Slice with integer labels for rows. Note that both
980
+ the start and stop of the slice are included.
981
+
982
+ >>> df.loc[7:9]
983
+ max_speed shield
984
+ 7 1 2
985
+ 8 4 5
986
+ 9 7 8
987
+ """
988
+
989
@staticmethod
def _NotImplemented(description: str) -> SparkPandasNotImplementedError:
    """
    Build the not-implemented error used for unsupported ``.loc`` selections.

    The error is returned, not raised; callers raise it themselves.
    """
    error_kwargs = dict(
        description=description,
        pandas_function=".loc[..., ...]",
        spark_target_function="select, where",
    )
    return SparkPandasNotImplementedError(**error_kwargs)
996
+
997
def _select_rows_by_series(
    self, rows_sel: "Series"
) -> Tuple[Optional[PySparkColumn], Optional[int], Optional[int]]:
    """
    Use a boolean `Series` as the row filter.

    Returns ``(condition, None, None)`` where the condition is the Spark
    column behind `rows_sel`. Asserts that the key has Spark boolean type.
    """
    spark_accessor = rows_sel.spark
    key_type = spark_accessor.data_type
    assert isinstance(key_type, BooleanType), key_type
    return spark_accessor.column, None, None
1002
+
1003
def _select_rows_by_spark_column(
    self, rows_sel: PySparkColumn
) -> Tuple[Optional[PySparkColumn], Optional[int], Optional[int]]:
    """
    Use a boolean Spark `Column` as the row filter.

    The column's type is read from the schema of a select on the underlying
    Spark frame and asserted to be boolean. Returns ``(rows_sel, None, None)``.
    """
    key_schema = self._internal.spark_frame.select(rows_sel).schema
    key_type = key_schema[0].dataType
    assert isinstance(key_type, BooleanType), key_type
    return rows_sel, None, None
1009
+
1010
def _select_rows_by_slice(
    self, rows_sel: slice
) -> Tuple[Optional[PySparkColumn], Optional[int], Optional[int]]:
    """
    Translate a label slice into a Spark filter condition.

    Single-level index: the natural-order values of the rows whose label
    equals the slice bounds are collected and the slice becomes a range on the
    natural-order column. A bound that is absent from the index falls back to
    a comparison on the index values, which requires a monotonic index.

    Multi-level index: each bound is turned into a level-by-level comparison
    built from the innermost supplied level outwards.

    Returns ``(condition, None, None)``, or ``(None, None, None)`` when a
    multi-level slice has no usable bound.

    Raises
    ------
    SparkPandasNotImplementedError
        If the slice has a step.
    KeyError
        If a bound is missing from a non-monotonic single-level index, or if a
        multi-level bound is deeper than the index or the index is not
        monotonic at the used depth.
    """
    from pyspark.pandas.indexes import MultiIndex

    if rows_sel.step is not None:
        raise LocIndexer._NotImplemented("Cannot use step with Spark.")

    if self._internal.index_level == 1:
        sdf = self._internal.spark_frame
        index_column = self._psdf_or_psser.index.to_series()
        index_data_type = index_column.spark.data_type
        wanted_start = rows_sel.start
        wanted_stop = rows_sel.stop

        # Fetch the natural-order values of the rows labelled with either
        # bound, so that the selection follows the frame's natural order.
        matches = (
            sdf.select(index_column.spark.column, NATURAL_ORDER_COLUMN_NAME)
            .where(
                (index_column.spark.column == F.lit(wanted_start).cast(index_data_type))
                | (index_column.spark.column == F.lit(wanted_stop).cast(index_data_type))
            )
            .collect()
        )

        # First occurrence of the start label, last occurrence of the stop label.
        start_orders = [row[1] for row in matches if row[0] == wanted_start]
        start_order = start_orders[0] if len(start_orders) > 0 else None

        stop_orders = [row[1] for row in matches if row[0] == wanted_stop]
        stop_order = stop_orders[-1] if len(stop_orders) > 0 else None

        conds: List[PySparkColumn] = []
        if start_order is not None:
            conds.append(
                F.col(NATURAL_ORDER_COLUMN_NAME) >= F.lit(start_order).cast(LongType())
            )
        if stop_order is not None:
            conds.append(
                F.col(NATURAL_ORDER_COLUMN_NAME) <= F.lit(stop_order).cast(LongType())
            )

        # A bound was requested but its label is not in the index: compare on
        # the index values if the index is monotonic, otherwise raise KeyError.
        start_unresolved = start_order is None and wanted_start is not None
        stop_unresolved = stop_order is None and wanted_stop is not None
        if start_unresolved or stop_unresolved:
            inc = index_column.is_monotonic_increasing
            if inc is False:
                # Only evaluated when the increasing check has failed.
                dec = index_column.is_monotonic_decreasing

            if start_unresolved:
                if inc is not False:
                    conds.append(
                        index_column.spark.column >= F.lit(wanted_start).cast(index_data_type)
                    )
                elif dec is not False:
                    conds.append(
                        index_column.spark.column <= F.lit(wanted_start).cast(index_data_type)
                    )
                else:
                    raise KeyError(rows_sel.start)
            if stop_unresolved:
                if inc is not False:
                    conds.append(
                        index_column.spark.column <= F.lit(wanted_stop).cast(index_data_type)
                    )
                elif dec is not False:
                    conds.append(
                        index_column.spark.column >= F.lit(wanted_stop).cast(index_data_type)
                    )
                else:
                    raise KeyError(rows_sel.stop)

        return reduce(lambda x, y: x & y, conds), None, None

    from pyspark.sql.types import StructType

    index = self._psdf_or_psser.index
    level_types = [f.dataType for f in cast(StructType, index.to_series().spark.data_type)]

    def _as_bound(raw: Any) -> Any:
        # Wrap scalars into 1-tuples; an empty tuple counts as "no bound".
        if raw is None:
            return None
        if not isinstance(raw, tuple):
            raw = (raw,)
        if len(raw) == 0:
            return None
        return raw

    start = _as_bound(rows_sel.start)
    stop = _as_bound(rows_sel.stop)

    depth = max(len(start) if start is not None else 0, len(stop) if stop is not None else 0)
    if depth == 0:
        return None, None, None
    elif (
        depth > self._internal.index_level
        or not index.droplevel(list(range(self._internal.index_level)[depth:])).is_monotonic
    ):
        raise KeyError("Key length ({}) was greater than MultiIndex sort depth".format(depth))

    Column = get_column_class()

    def _bound_condition(bound: Any, strict_op: Any) -> PySparkColumn:
        # Walk the supplied levels innermost-first: a level equal to the bound
        # defers to the condition of the deeper levels, any other value is
        # decided by the strict comparison on that level.
        cond = F.lit(True)
        levels = list(
            zip(
                self._internal.index_spark_columns,
                cast(Tuple[int, ...], bound),
                cast(List[DataType], level_types),
            )
        )
        for scol, value, dt in reversed(levels):
            compare = MultiIndex._comparator_for_monotonic_increasing(dt)
            cond = F.when(scol.eqNullSafe(F.lit(value).cast(dt)), cond).otherwise(
                compare(scol, F.lit(value).cast(dt), strict_op)
            )
        return cond

    conds = []
    if start is not None:
        conds.append(_bound_condition(start, Column.__gt__))
    if stop is not None:
        conds.append(_bound_condition(stop, Column.__lt__))

    return reduce(lambda x, y: x & y, conds), None, None
1147
+
1148
def _select_rows_by_iterable(
    self, rows_sel: Iterable
) -> Tuple[Optional[PySparkColumn], Optional[int], Optional[int]]:
    """
    Build a row filter from an iterable of index labels.

    An empty iterable yields an always-false condition. For a single-level
    index the labels are cast to the index's Spark type and matched with
    ``==`` (one label) or ``isin`` (several labels).

    Raises
    ------
    SparkPandasNotImplementedError
        If the index has more than one level.
    """
    labels = list(rows_sel)
    if len(labels) == 0:
        return F.lit(False), None, None

    if self._internal.index_level != 1:
        raise LocIndexer._NotImplemented("Cannot select with MultiIndex with Spark.")

    index_column = self._psdf_or_psser.index.to_series()
    index_data_type = index_column.spark.data_type

    if len(labels) == 1:
        row_filter = index_column.spark.column == F.lit(labels[0]).cast(index_data_type)
    else:
        row_filter = index_column.spark.column.isin(
            [F.lit(r).cast(index_data_type) for r in labels]
        )
    return row_filter, None, None
1173
+
1174
def _select_rows_else(
    self, rows_sel: Any
) -> Tuple[Optional[PySparkColumn], Optional[int], Optional[int]]:
    """
    Build a row filter from a single label or a tuple of per-level labels.

    Each supplied label is compared for equality with the index level at the
    same position and the comparisons are AND-ed together. The third element
    of the result is the number of index levels left unconsumed by the key.

    Raises
    ------
    SparkPandasIndexingError
        If more labels are given than the index has levels.
    """
    key = rows_sel if isinstance(rows_sel, tuple) else (rows_sel,)
    level_count = self._internal.index_level
    if len(key) > level_count:
        raise SparkPandasIndexingError("Too many indexers")

    per_level = [
        scol == value for scol, value in zip(self._internal.index_spark_columns, key)
    ]
    combined = reduce(lambda x, y: x & y, per_level)
    return combined, None, self._internal.index_level - len(key)
1188
+
1189
def _get_from_multiindex_column(
    self,
    key: Optional[Label],
    missing_keys: Optional[List[Name]],
    labels: Optional[List[Tuple[Label, Label]]] = None,
    recursed: int = 0,
) -> Tuple[
    List[Label], Optional[List[PySparkColumn]], List[InternalField], bool, Optional[Name]
]:
    """
    Select columns from multi-index columns.

    `labels` holds ``(full_label, remaining_label)`` pairs and defaults to
    every column label paired with itself. Each element of `key` keeps the
    pairs whose remaining label starts with it and strips that level. If all
    remaining labels then start with ``""`` the method recurses one level
    deeper; `recursed` counts those recursions.

    When no label matches: raises ``KeyError`` if `missing_keys` is None,
    otherwise appends `key` to `missing_keys` and returns an empty selection.
    """
    assert isinstance(key, tuple)
    candidates = labels
    if candidates is None:
        candidates = [(label, label) for label in self._internal.column_labels]

    for k in key:
        narrowed = []
        for label, lbl in candidates:
            if lbl is None:
                if k is None:
                    narrowed.append((label, None))
            elif lbl[0] == k:
                narrowed.append((label, lbl[1:]))
        candidates = narrowed
        if len(candidates) == 0:
            if missing_keys is None:
                raise KeyError(k)
            missing_keys.append(key)
            return [], [], [], False, None

    if all(lbl is not None and len(lbl) > 0 and lbl[0] == "" for _, lbl in candidates):
        # If the head is '', drill down recursively.
        key_text = str(key)
        candidates = [(label, tuple([key_text, *lbl[1:]])) for label, lbl in candidates]
        return self._get_from_multiindex_column(
            (key_text,), missing_keys, candidates, recursed + 1
        )

    returns_series = all(lbl is None or len(lbl) == 0 for _, lbl in candidates)
    series_name: Optional[Name] = None
    if returns_series:
        # Every candidate is fully consumed, so they must all be one column.
        distinct_labels = {label for label, _ in candidates}
        assert len(distinct_labels) == 1
        label = next(iter(distinct_labels))
        column_labels = [label]
        data_spark_columns = [self._internal.spark_column_for(label)]
        data_fields = [self._internal.field_for(label)]
        if label is not None:
            if recursed > 0:
                # Drop the trailing levels that were skipped by recursion.
                label = label[:-recursed]
            series_name = label if len(label) > 1 else label[0]
    else:
        column_labels = [
            None if lbl is None or lbl == (None,) else lbl for _, lbl in candidates
        ]
        data_spark_columns = [
            self._internal.spark_column_for(label) for label, _ in candidates
        ]
        data_fields = [self._internal.field_for(label) for label, _ in candidates]

    return column_labels, data_spark_columns, data_fields, returns_series, series_name
1244
+
1245
def _select_cols_by_series(
    self, cols_sel: "Series", missing_keys: Optional[List[Name]]
) -> Tuple[
    List[Label],
    Optional[List[PySparkColumn]],
    Optional[List[InternalField]],
    bool,
    Optional[Name],
]:
    """
    Select the column represented by the `Series` key itself.

    Labels, Spark columns and fields are taken from the key's own internal
    frame. The result is flagged as a Series with no name override.
    `missing_keys` is not used.
    """
    key_internal = cols_sel._internal
    return (
        key_internal.column_labels,
        key_internal.data_spark_columns,
        key_internal.data_fields,
        True,
        None,
    )
1258
+
1259
def _select_cols_by_spark_column(
    self, cols_sel: PySparkColumn, missing_keys: Optional[List[Name]]
) -> Tuple[
    List[Label],
    Optional[List[PySparkColumn]],
    Optional[List[InternalField]],
    bool,
    Optional[Name],
]:
    """
    Select a single Spark `Column`.

    The label is the column name Spark reports when the key is selected from
    the underlying frame. No field information is returned and the result is
    flagged as a Series. `missing_keys` is not used.
    """
    selected_name = self._internal.spark_frame.select(cols_sel).columns[0]
    column_labels: List[Label] = [(selected_name,)]
    return column_labels, [cols_sel], None, True, None
1271
+
1272
def _select_cols_by_slice(
    self, cols_sel: slice, missing_keys: Optional[List[Name]]
) -> Tuple[
    List[Label],
    Optional[List[PySparkColumn]],
    Optional[List[InternalField]],
    bool,
    Optional[Name],
]:
    """
    Select a contiguous run of columns described by a label slice.

    The label bounds are converted to positions with ``columns.slice_locs``
    and the same positional window is applied to the labels, Spark columns
    and fields. The result is never flagged as a Series. `missing_keys` is
    not used.
    """
    first, last = self._psdf_or_psser.columns.slice_locs(
        start=cols_sel.start, end=cols_sel.stop
    )
    window = slice(first, last)
    return (
        self._internal.column_labels[window],
        self._internal.data_spark_columns[window],
        self._internal.data_fields[window],
        False,
        None,
    )
1288
+
1289
def _select_cols_by_iterable(
    self, cols_sel: Iterable, missing_keys: Optional[List[Name]]
) -> Tuple[
    List[Label],
    Optional[List[PySparkColumn]],
    Optional[List[InternalField]],
    bool,
    Optional[Name],
]:
    """
    Select columns from an iterable key.

    The key is handled according to the type of its elements:

    - all `Series`: each Series' own label, Spark column and field;
    - all Spark `Column`: labels are the names Spark reports, no fields;
    - all ``bool`` or all ``np.bool_``: a mask over the columns, positional,
      or looked up by column label when the key is a pandas Series;
    - otherwise: labels, matched against the full column label or its first
      level.

    Parameters
    ----------
    cols_sel : Iterable
        The column key. One-shot iterables (generators, iterators) are
        copied into a list first, because the key is traversed several times.
    missing_keys : list, optional
        If None, an unknown label raises ``KeyError``; otherwise unknown
        labels are appended to this list.

    Raises
    ------
    IndexError
        If a boolean key's length differs from the number of columns.
    SparkPandasIndexingError
        If a boolean pandas Series key has an index different from the columns.
    TypeError
        If tuple keys are mixed with keys that are not name-like tuples.
    ValueError
        If `missing_keys` is None and tuple keys do not all match the column
        index depth.
    KeyError
        If a label is unknown and `missing_keys` is None.
    """
    from pyspark.pandas.series import Series

    if not isinstance(cols_sel, Sized):
        # The type probes below each iterate the key and one branch calls
        # len() on it; an unsized one-shot iterable would be exhausted by the
        # first probe, so materialise it once up front.
        cols_sel = list(cols_sel)

    Column = get_column_class()
    if all(isinstance(key, Series) for key in cols_sel):
        column_labels = [key._column_label for key in cols_sel]
        data_spark_columns = [key.spark.column for key in cols_sel]
        data_fields = [key._internal.data_fields[0] for key in cols_sel]
    elif all(isinstance(key, Column) for key in cols_sel):
        column_labels = [
            (self._internal.spark_frame.select(col).columns[0],) for col in cols_sel
        ]
        data_spark_columns = list(cols_sel)
        data_fields = None
    elif all(isinstance(key, bool) for key in cols_sel) or all(
        isinstance(key, np.bool_) for key in cols_sel
    ):
        if len(cast(Sized, cols_sel)) != len(self._internal.column_labels):
            raise IndexError(
                "Boolean index has wrong length: %s instead of %s"
                % (len(cast(Sized, cols_sel)), len(self._internal.column_labels))
            )
        if isinstance(cols_sel, pd.Series):
            # A pandas Series mask is applied by label, so its index must
            # contain exactly the column labels (in any order).
            if not cols_sel.index.sort_values().equals(self._psdf.columns.sort_values()):
                raise SparkPandasIndexingError(
                    "Unalignable boolean Series provided as indexer "
                    "(index of the boolean Series and of the indexed object do not match)"
                )
            else:
                column_labels = [
                    column_label
                    for column_label in self._internal.column_labels
                    if cols_sel[column_label if len(column_label) > 1 else column_label[0]]
                ]
                data_spark_columns = [
                    self._internal.spark_column_for(column_label)
                    for column_label in column_labels
                ]
                data_fields = [
                    self._internal.field_for(column_label) for column_label in column_labels
                ]
        else:
            # Any other boolean sequence is applied by position.
            column_labels = [
                self._internal.column_labels[i] for i, col in enumerate(cols_sel) if col
            ]
            data_spark_columns = [
                self._internal.data_spark_columns[i] for i, col in enumerate(cols_sel) if col
            ]
            data_fields = [
                self._internal.data_fields[i] for i, col in enumerate(cols_sel) if col
            ]
    elif any(isinstance(key, tuple) for key in cols_sel) and any(
        not is_name_like_tuple(key) for key in cols_sel
    ):
        raise TypeError(
            "Expected tuple, got {}".format(
                type(set(key for key in cols_sel if not is_name_like_tuple(key)).pop())
            )
        )
    else:
        if missing_keys is None and all(isinstance(key, tuple) for key in cols_sel):
            level = self._internal.column_labels_level
            if any(len(key) != level for key in cols_sel):
                raise ValueError("All the key level should be the same as column index level.")

        column_labels = []
        data_spark_columns = []
        data_fields = []
        for key in cols_sel:
            found = False
            for label in self._internal.column_labels:
                # A key matches either the full label or its first level, so
                # one key may select several columns.
                if label == key or label[0] == key:
                    column_labels.append(label)
                    data_spark_columns.append(self._internal.spark_column_for(label))
                    data_fields.append(self._internal.field_for(label))
                    found = True
            if not found:
                if missing_keys is None:
                    raise KeyError("['{}'] not in index".format(name_like_string(key)))
                else:
                    missing_keys.append(key)

    return column_labels, data_spark_columns, data_fields, False, None
1380
+
1381
def _select_cols_else(
    self, cols_sel: Any, missing_keys: Optional[List[Name]]
) -> Tuple[
    List[Label],
    Optional[List[PySparkColumn]],
    Optional[List[InternalField]],
    bool,
    Optional[Name],
]:
    """Resolve a single key by delegating to ``_get_from_multiindex_column``.

    A key that is not already a name-like tuple is wrapped in a 1-tuple
    before the lookup.
    """
    key = cols_sel if is_name_like_tuple(cols_sel) else (cols_sel,)
    return self._get_from_multiindex_column(key, missing_keys)
1393
+
1394
+
1395
+ class iLocIndexer(LocIndexerLike):
1396
+ """
1397
+ Purely integer-location based indexing for selection by position.
1398
+
1399
+ ``.iloc[]`` is primarily integer position based (from ``0`` to
1400
+ ``length-1`` of the axis), but may also be used with a conditional boolean Series.
1401
+
1402
+ Allowed inputs are:
1403
+
1404
+ - An integer for column selection, e.g. ``5``.
1405
+ - A list or array of integers for row selection with distinct index values,
1406
+ e.g. ``[3, 4, 0]``
1407
+ - A list or array of integers for column selection, e.g. ``[4, 3, 0]``.
1408
+ - A boolean array for column selection.
1409
+ - A slice object with ints for row and column selection, e.g. ``1:7``.
1410
+
1411
+ Not allowed inputs which pandas allows are:
1412
+
1413
+ - A list or array of integers for row selection with duplicated indexes,
1414
+ e.g. ``[4, 4, 0]``.
1415
+ - A boolean array for row selection.
1416
+ - A ``callable`` function with one argument (the calling Series, DataFrame
1417
+ or Panel) and that returns valid output for indexing (one of the above).
1418
+ This is useful in method chains when you don't have a reference to the
1419
+ calling object but would like to base your selection on some value.
1420
+
1421
+ ``.iloc`` will raise ``IndexError`` if a requested indexer is
1422
+ out-of-bounds, except *slice* indexers which allow out-of-bounds
1423
+ indexing (this conforms with python/numpy *slice* semantics).
1424
+
1425
+ See Also
1426
+ --------
1427
+ DataFrame.loc : Purely label-location based indexer for selection by label.
1428
+ Series.iloc : Purely integer-location based indexing for
1429
+ selection by position.
1430
+
1431
+ Examples
1432
+ --------
1433
+
1434
+ >>> mydict = [{'a': 1, 'b': 2, 'c': 3, 'd': 4},
1435
+ ... {'a': 100, 'b': 200, 'c': 300, 'd': 400},
1436
+ ... {'a': 1000, 'b': 2000, 'c': 3000, 'd': 4000 }]
1437
+ >>> df = ps.DataFrame(mydict, columns=['a', 'b', 'c', 'd'])
1438
+ >>> df
1439
+ a b c d
1440
+ 0 1 2 3 4
1441
+ 1 100 200 300 400
1442
+ 2 1000 2000 3000 4000
1443
+
1444
+ **Indexing just the rows**
1445
+
1446
+ A scalar integer for row selection.
1447
+
1448
+ >>> df.iloc[1]
1449
+ a 100
1450
+ b 200
1451
+ c 300
1452
+ d 400
1453
+ Name: 1, dtype: int64
1454
+
1455
+ >>> df.iloc[[0]]
1456
+ a b c d
1457
+ 0 1 2 3 4
1458
+
1459
+ With a `slice` object.
1460
+
1461
+ >>> df.iloc[:3]
1462
+ a b c d
1463
+ 0 1 2 3 4
1464
+ 1 100 200 300 400
1465
+ 2 1000 2000 3000 4000
1466
+
1467
+ **Indexing both axes**
1468
+
1469
+ You can mix the indexer types for the index and columns. Use ``:`` to
1470
+ select the entire axis.
1471
+
1472
+ With scalar integers.
1473
+
1474
+ >>> df.iloc[:1, 1]
1475
+ 0 2
1476
+ Name: b, dtype: int64
1477
+
1478
+ With lists of integers.
1479
+
1480
+ >>> df.iloc[:2, [1, 3]]
1481
+ b d
1482
+ 0 2 4
1483
+ 1 200 400
1484
+
1485
+ With `slice` objects.
1486
+
1487
+ >>> df.iloc[:2, 0:3]
1488
+ a b c
1489
+ 0 1 2 3
1490
+ 1 100 200 300
1491
+
1492
+ With a boolean array whose length matches the columns.
1493
+
1494
+ >>> df.iloc[:, [True, False, True, False]]
1495
+ a c
1496
+ 0 1 3
1497
+ 1 100 300
1498
+ 2 1000 3000
1499
+
1500
+ **Setting values**
1501
+
1502
+ Setting value for all items matching the list of labels.
1503
+
1504
+ >>> df.iloc[[1, 2], [1]] = 50
1505
+ >>> df
1506
+ a b c d
1507
+ 0 1 2 3 4
1508
+ 1 100 50 300 400
1509
+ 2 1000 50 3000 4000
1510
+
1511
+ Setting value for an entire row
1512
+
1513
+ >>> df.iloc[0] = 10
1514
+ >>> df
1515
+ a b c d
1516
+ 0 10 10 10 10
1517
+ 1 100 50 300 400
1518
+ 2 1000 50 3000 4000
1519
+
1520
+ Set value for an entire column
1521
+
1522
+ >>> df.iloc[:, 2] = 30
1523
+ >>> df
1524
+ a b c d
1525
+ 0 10 10 30 10
1526
+ 1 100 50 30 400
1527
+ 2 1000 50 30 4000
1528
+
1529
+ Set value for an entire list of columns
1530
+
1531
+ >>> df.iloc[:, [2, 3]] = 100
1532
+ >>> df
1533
+ a b c d
1534
+ 0 10 10 100 100
1535
+ 1 100 50 100 100
1536
+ 2 1000 50 100 100
1537
+
1538
+ Set value with Series
1539
+
1540
+ >>> df.iloc[:, 3] = df.iloc[:, 3] * 2
1541
+ >>> df
1542
+ a b c d
1543
+ 0 10 10 100 200
1544
+ 1 100 50 100 200
1545
+ 2 1000 50 100 200
1546
+ """
1547
+
1548
@staticmethod
def _NotImplemented(description: str) -> SparkPandasNotImplementedError:
    """Build (without raising) the not-implemented error used by ``.iloc``."""
    error_kwargs = {
        "description": description,
        "pandas_function": ".iloc[..., ...]",
        "spark_target_function": "select, where",
    }
    return SparkPandasNotImplementedError(**error_kwargs)
1555
+
1556
@lazy_property
def _internal(self) -> "InternalFrame":
    """Internal frame with a distributed sequence column attached.

    The Spark frame of the returned internal frame is ordered by
    ``NATURAL_ORDER_COLUMN_NAME``.
    """
    # Use resolved_copy to fix the natural order.
    resolved = super()._internal.resolved_copy
    with_sequence = InternalFrame.attach_distributed_sequence_column(
        resolved.spark_frame, column_name=self._sequence_col
    )
    ordered = with_sequence.orderBy(NATURAL_ORDER_COLUMN_NAME)
    return resolved.with_new_sdf(spark_frame=ordered)
1564
+
1565
@lazy_property
def _sequence_col(self) -> str:
    """Name of the temporary sequence column, checked against the Spark frame."""
    # Use resolved_copy to fix the natural order.
    resolved_sdf = super()._internal.resolved_copy.spark_frame
    return verify_temp_column_name(resolved_sdf, "__distributed_sequence_column__")
1570
+
1571
def _select_rows_by_series(
    self, rows_sel: "Series"
) -> Tuple[Optional[PySparkColumn], Optional[int], Optional[int]]:
    """Reject Series row selectors: always raises the ``.iloc`` not-implemented error."""
    message = (
        ".iloc requires numeric slice, conditional "
        "boolean Index or a sequence of positions as int, "
        "got {}".format(type(rows_sel))
    )
    raise iLocIndexer._NotImplemented(message)
1579
+
1580
def _select_rows_by_spark_column(
    self, rows_sel: PySparkColumn
) -> Tuple[Optional[PySparkColumn], Optional[int], Optional[int]]:
    """Reject Spark-column row selectors: always raises the ``.iloc`` not-implemented error."""
    message = (
        ".iloc requires numeric slice, conditional "
        "boolean Index or a sequence of positions as int, "
        "got {}".format(type(rows_sel))
    )
    raise iLocIndexer._NotImplemented(message)
1588
+
1589
def _select_rows_by_slice(
    self, rows_sel: slice
) -> Tuple[Optional[PySparkColumn], Optional[int], Optional[int]]:
    """Translate a positional row slice into a filter on the sequence column.

    Parameters
    ----------
    rows_sel : slice whose start/stop/step are each None or an ``int``.

    Returns
    -------
    A 3-tuple. For the plain ``[:stop]`` case it is ``(None, stop, None)``
    (presumably the caller applies ``stop`` as a row limit -- confirm against
    the caller). Otherwise it is ``(condition, None, None)`` where
    ``condition`` is the conjunction of the start, stop and step predicates
    on the sequence column.

    Raises
    ------
    TypeError : start, stop or step is not an ``int``.
    ValueError : step is zero.
    """

    def verify_type(i: int) -> None:
        if not isinstance(i, int):
            raise TypeError(
                "cannot do slice indexing with these indexers [{}] of {}".format(i, type(i))
            )

    start, stop, step = rows_sel.start, rows_sel.stop, rows_sel.step

    has_negative = False
    if start is not None:
        verify_type(start)
        has_negative = start < 0
    if stop is not None:
        verify_type(stop)
        has_negative = has_negative or stop < 0
    if step is None:
        step = 1
    else:
        verify_type(step)
        if step == 0:
            raise ValueError("slice step cannot be zero")

    # A start of 0 is only redundant when walking forward. With a negative
    # step, ``0::-k`` means "begin at row 0", whereas a missing start means
    # "begin at the last row", so the 0 must be kept.
    if start == 0 and step > 0:
        start = None

    if start is None and step == 1:
        return None, stop, None

    sdf = self._internal.spark_frame
    sequence_scol = sdf[self._sequence_col]

    # The row count is needed to resolve negative positions and to anchor
    # a backward walk that has no explicit start.
    cnt = None
    if has_negative or (step < 0 and start is None):
        cnt = sdf.count()

    cond = []
    if start is not None:
        if start < 0:
            start = start + cnt
            if step > 0:
                # Python slice semantics clamp an out-of-range negative start
                # to 0; without this the stride below would be anchored on a
                # negative position and select the wrong rows.
                start = max(start, 0)
        if step > 0:
            cond.append(sequence_scol >= F.lit(start).cast(LongType()))
        else:
            cond.append(sequence_scol <= F.lit(start).cast(LongType()))
    if stop is not None:
        if stop < 0:
            stop = stop + cnt
        if step > 0:
            cond.append(sequence_scol < F.lit(stop).cast(LongType()))
        else:
            cond.append(sequence_scol > F.lit(stop).cast(LongType()))
    if step != 1:
        # Anchor of the stride. Test for None explicitly: a resolved start of
        # 0 is a legitimate anchor and must not fall back to the default.
        if start is not None:
            anchor = start
        elif step > 0:
            anchor = 0
        else:
            anchor = cnt - 1
        cond.append(((sequence_scol - anchor) % F.lit(step).cast(LongType())) == F.lit(0))

    return reduce(lambda x, y: x & y, cond), None, None
1652
+
1653
+ def _select_rows_by_iterable(
1654
+ self, rows_sel: Iterable
1655
+ ) -> Tuple[Optional[PySparkColumn], Optional[int], Optional[int]]:
1656
+ sdf = self._internal.spark_frame
1657
+
1658
+ if any(isinstance(key, (int, np.int64, np.int32)) and key < 0 for key in rows_sel):
1659
+ offset = sdf.count()
1660
+ else:
1661
+ offset = 0
1662
+
1663
+ new_rows_sel = []
1664
+ for key in list(rows_sel):
1665
+ if not isinstance(key, (int, np.int64, np.int32)):
1666
+ raise TypeError(
1667
+ "cannot do positional indexing with these indexers [{}] of {}".format(
1668
+ key, type(key)
1669
+ )
1670
+ )
1671
+ if key < 0:
1672
+ key = key + offset
1673
+ new_rows_sel.append(key)
1674
+
1675
+ if len(new_rows_sel) != len(set(new_rows_sel)):
1676
+ raise NotImplementedError(
1677
+ "Duplicated row selection is not currently supported; "
1678
+ "however, normalized index was [%s]" % new_rows_sel
1679
+ )
1680
+
1681
+ if len(new_rows_sel) == 0:
1682
+ cond = F.lit(False)
1683
+ else:
1684
+ cond = sdf[self._sequence_col].isin(
1685
+ [F.lit(int(key)).cast(LongType()) for key in new_rows_sel]
1686
+ )
1687
+ return cond, None, None
1688
+
1689
+ def _select_rows_else(
1690
+ self, rows_sel: Any
1691
+ ) -> Tuple[Optional[PySparkColumn], Optional[int], Optional[int]]:
1692
+ if isinstance(rows_sel, int):
1693
+ sdf = self._internal.spark_frame
1694
+ return (sdf[self._sequence_col] == rows_sel), None, 0
1695
+ elif isinstance(rows_sel, tuple):
1696
+ raise SparkPandasIndexingError("Too many indexers")
1697
+ else:
1698
+ raise iLocIndexer._NotImplemented(
1699
+ ".iloc requires numeric slice, conditional "
1700
+ "boolean Index or a sequence of positions as int, "
1701
+ "got {}".format(type(rows_sel))
1702
+ )
1703
+
1704
+ def _select_cols_by_series(
1705
+ self, cols_sel: "Series", missing_keys: Optional[List[Name]]
1706
+ ) -> Tuple[
1707
+ List[Label],
1708
+ Optional[List[PySparkColumn]],
1709
+ Optional[List[InternalField]],
1710
+ bool,
1711
+ Optional[Name],
1712
+ ]:
1713
+ raise ValueError(
1714
+ "Location based indexing can only have [integer, integer slice, "
1715
+ "listlike of integers, boolean array] types, got {}".format(cols_sel)
1716
+ )
1717
+
1718
+ def _select_cols_by_spark_column(
1719
+ self, cols_sel: PySparkColumn, missing_keys: Optional[List[Name]]
1720
+ ) -> Tuple[
1721
+ List[Label],
1722
+ Optional[List[PySparkColumn]],
1723
+ Optional[List[InternalField]],
1724
+ bool,
1725
+ Optional[Name],
1726
+ ]:
1727
+ raise ValueError(
1728
+ "Location based indexing can only have [integer, integer slice, "
1729
+ "listlike of integers, boolean array] types, got {}".format(cols_sel)
1730
+ )
1731
+
1732
+ def _select_cols_by_slice(
1733
+ self, cols_sel: slice, missing_keys: Optional[List[Name]]
1734
+ ) -> Tuple[
1735
+ List[Label],
1736
+ Optional[List[PySparkColumn]],
1737
+ Optional[List[InternalField]],
1738
+ bool,
1739
+ Optional[Name],
1740
+ ]:
1741
+ if all(
1742
+ s is None or isinstance(s, int) for s in (cols_sel.start, cols_sel.stop, cols_sel.step)
1743
+ ):
1744
+ column_labels = self._internal.column_labels[cols_sel]
1745
+ data_spark_columns = self._internal.data_spark_columns[cols_sel]
1746
+ data_fields = self._internal.data_fields[cols_sel]
1747
+ return column_labels, data_spark_columns, data_fields, False, None
1748
+ else:
1749
+ not_none = (
1750
+ cols_sel.start
1751
+ if cols_sel.start is not None
1752
+ else cols_sel.stop
1753
+ if cols_sel.stop is not None
1754
+ else cols_sel.step
1755
+ )
1756
+ raise TypeError(
1757
+ "cannot do slice indexing with these indexers {} of {}".format(
1758
+ not_none, type(not_none)
1759
+ )
1760
+ )
1761
+
1762
+ def _select_cols_by_iterable(
1763
+ self, cols_sel: Iterable, missing_keys: Optional[List[Name]]
1764
+ ) -> Tuple[
1765
+ List[Label],
1766
+ Optional[List[PySparkColumn]],
1767
+ Optional[List[InternalField]],
1768
+ bool,
1769
+ Optional[Name],
1770
+ ]:
1771
+ if all(isinstance(s, bool) for s in cols_sel):
1772
+ cols_sel = [i for i, s in enumerate(cols_sel) if s]
1773
+ if all(isinstance(s, int) for s in cols_sel):
1774
+ column_labels = [self._internal.column_labels[s] for s in cols_sel]
1775
+ data_spark_columns = [self._internal.data_spark_columns[s] for s in cols_sel]
1776
+ data_fields = [self._internal.data_fields[s] for s in cols_sel]
1777
+ return column_labels, data_spark_columns, data_fields, False, None
1778
+ else:
1779
+ raise TypeError("cannot perform reduce with flexible type")
1780
+
1781
+ def _select_cols_else(
1782
+ self, cols_sel: Any, missing_keys: Optional[List[Name]]
1783
+ ) -> Tuple[
1784
+ List[Label],
1785
+ Optional[List[PySparkColumn]],
1786
+ Optional[List[InternalField]],
1787
+ bool,
1788
+ Optional[Name],
1789
+ ]:
1790
+ if isinstance(cols_sel, int):
1791
+ if cols_sel > len(self._internal.column_labels):
1792
+ raise KeyError(cols_sel)
1793
+ column_labels = [self._internal.column_labels[cols_sel]]
1794
+ data_spark_columns = [self._internal.data_spark_columns[cols_sel]]
1795
+ data_fields = [self._internal.data_fields[cols_sel]]
1796
+ return column_labels, data_spark_columns, data_fields, True, None
1797
+ else:
1798
+ raise ValueError(
1799
+ "Location based indexing can only have [integer, integer slice, "
1800
+ "listlike of integers, boolean array] types, got {}".format(cols_sel)
1801
+ )
1802
+
1803
def __setitem__(self, key: Any, value: Any) -> None:
    """Assign ``value`` at the positions selected by ``key``.

    A list-like value that is not a Spark column is first checked against
    the length of the current selection; a mismatch raises ``ValueError``.
    The assignment itself is delegated to the parent class.
    """
    Column = get_column_class()
    if not isinstance(value, Column) and is_list_like(value):
        selected = self[key]
        if not (is_list_like(key) and is_list_like(selected)):
            raise ValueError("setting an array element with a sequence.")

        selected_shape = selected.shape
        selected_length = selected_shape[0]
        value_length = len(value)
        if selected_length != value_length:
            if self._is_series:
                raise ValueError(
                    "cannot set using a list-like indexer with a different length than "
                    "the value"
                )
            raise ValueError(
                "shape mismatch: value array of shape ({},) could not be broadcast "
                "to indexing result of shape {}".format(value_length, selected_shape)
            )

    super().__setitem__(key, value)

    # Update again with resolved_copy to drop extra columns.
    psdf = self._psdf
    psdf._update_internal_frame(psdf._internal.resolved_copy, check_same_anchor=False)

    # Clean up implicitly cached properties to be able to reuse the indexer.
    del self._internal
    del self._sequence_col
1833
+
1834
+
1835
def _test() -> None:
    """Run this module's doctests against a local Spark session.

    Changes the working directory to ``$SPARK_HOME`` and exits the process
    with status -1 when any doctest fails.
    """
    import doctest
    import os
    import sys

    from pyspark.sql import SparkSession
    import pyspark.pandas.indexing

    os.chdir(os.environ["SPARK_HOME"])

    # Doctests see the module namespace plus the conventional ``ps`` alias.
    globs = pyspark.pandas.indexing.__dict__.copy()
    globs["ps"] = pyspark.pandas

    builder = SparkSession.builder.master("local[4]")
    spark = builder.appName("pyspark.pandas.indexing tests").getOrCreate()

    flags = doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE
    results = doctest.testmod(pyspark.pandas.indexing, globs=globs, optionflags=flags)
    failure_count = results[0]

    spark.stop()
    if failure_count:
        sys.exit(-1)


if __name__ == "__main__":
    _test()