snowpark-connect 0.20.2 (py3-none-any.whl)

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of snowpark-connect might be problematic.

Files changed (879)
  1. snowflake/snowpark_connect/__init__.py +23 -0
  2. snowflake/snowpark_connect/analyze_plan/__init__.py +3 -0
  3. snowflake/snowpark_connect/analyze_plan/map_tree_string.py +38 -0
  4. snowflake/snowpark_connect/column_name_handler.py +735 -0
  5. snowflake/snowpark_connect/config.py +576 -0
  6. snowflake/snowpark_connect/constants.py +47 -0
  7. snowflake/snowpark_connect/control_server.py +52 -0
  8. snowflake/snowpark_connect/dataframe_name_handler.py +54 -0
  9. snowflake/snowpark_connect/date_time_format_mapping.py +399 -0
  10. snowflake/snowpark_connect/empty_dataframe.py +18 -0
  11. snowflake/snowpark_connect/error/__init__.py +11 -0
  12. snowflake/snowpark_connect/error/error_mapping.py +6174 -0
  13. snowflake/snowpark_connect/error/error_utils.py +321 -0
  14. snowflake/snowpark_connect/error/exceptions.py +24 -0
  15. snowflake/snowpark_connect/execute_plan/__init__.py +3 -0
  16. snowflake/snowpark_connect/execute_plan/map_execution_command.py +204 -0
  17. snowflake/snowpark_connect/execute_plan/map_execution_root.py +173 -0
  18. snowflake/snowpark_connect/execute_plan/utils.py +183 -0
  19. snowflake/snowpark_connect/expression/__init__.py +3 -0
  20. snowflake/snowpark_connect/expression/literal.py +90 -0
  21. snowflake/snowpark_connect/expression/map_cast.py +343 -0
  22. snowflake/snowpark_connect/expression/map_expression.py +293 -0
  23. snowflake/snowpark_connect/expression/map_extension.py +104 -0
  24. snowflake/snowpark_connect/expression/map_sql_expression.py +633 -0
  25. snowflake/snowpark_connect/expression/map_udf.py +142 -0
  26. snowflake/snowpark_connect/expression/map_unresolved_attribute.py +241 -0
  27. snowflake/snowpark_connect/expression/map_unresolved_extract_value.py +85 -0
  28. snowflake/snowpark_connect/expression/map_unresolved_function.py +9450 -0
  29. snowflake/snowpark_connect/expression/map_unresolved_star.py +218 -0
  30. snowflake/snowpark_connect/expression/map_update_fields.py +164 -0
  31. snowflake/snowpark_connect/expression/map_window_function.py +258 -0
  32. snowflake/snowpark_connect/expression/typer.py +125 -0
  33. snowflake/snowpark_connect/includes/__init__.py +0 -0
  34. snowflake/snowpark_connect/includes/jars/antlr4-runtime-4.9.3.jar +0 -0
  35. snowflake/snowpark_connect/includes/jars/commons-cli-1.5.0.jar +0 -0
  36. snowflake/snowpark_connect/includes/jars/commons-codec-1.16.1.jar +0 -0
  37. snowflake/snowpark_connect/includes/jars/commons-collections-3.2.2.jar +0 -0
  38. snowflake/snowpark_connect/includes/jars/commons-collections4-4.4.jar +0 -0
  39. snowflake/snowpark_connect/includes/jars/commons-compiler-3.1.9.jar +0 -0
  40. snowflake/snowpark_connect/includes/jars/commons-compress-1.26.0.jar +0 -0
  41. snowflake/snowpark_connect/includes/jars/commons-crypto-1.1.0.jar +0 -0
  42. snowflake/snowpark_connect/includes/jars/commons-dbcp-1.4.jar +0 -0
  43. snowflake/snowpark_connect/includes/jars/commons-io-2.16.1.jar +0 -0
  44. snowflake/snowpark_connect/includes/jars/commons-lang-2.6.jar +0 -0
  45. snowflake/snowpark_connect/includes/jars/commons-lang3-3.12.0.jar +0 -0
  46. snowflake/snowpark_connect/includes/jars/commons-logging-1.1.3.jar +0 -0
  47. snowflake/snowpark_connect/includes/jars/commons-math3-3.6.1.jar +0 -0
  48. snowflake/snowpark_connect/includes/jars/commons-pool-1.5.4.jar +0 -0
  49. snowflake/snowpark_connect/includes/jars/commons-text-1.10.0.jar +0 -0
  50. snowflake/snowpark_connect/includes/jars/hadoop-client-api-3.3.4.jar +0 -0
  51. snowflake/snowpark_connect/includes/jars/jackson-annotations-2.15.2.jar +0 -0
  52. snowflake/snowpark_connect/includes/jars/jackson-core-2.15.2.jar +0 -0
  53. snowflake/snowpark_connect/includes/jars/jackson-core-asl-1.9.13.jar +0 -0
  54. snowflake/snowpark_connect/includes/jars/jackson-databind-2.15.2.jar +0 -0
  55. snowflake/snowpark_connect/includes/jars/jackson-dataformat-yaml-2.15.2.jar +0 -0
  56. snowflake/snowpark_connect/includes/jars/jackson-datatype-jsr310-2.15.2.jar +0 -0
  57. snowflake/snowpark_connect/includes/jars/jackson-mapper-asl-1.9.13.jar +0 -0
  58. snowflake/snowpark_connect/includes/jars/jackson-module-scala_2.12-2.15.2.jar +0 -0
  59. snowflake/snowpark_connect/includes/jars/json4s-ast_2.12-3.7.0-M11.jar +0 -0
  60. snowflake/snowpark_connect/includes/jars/json4s-core_2.12-3.7.0-M11.jar +0 -0
  61. snowflake/snowpark_connect/includes/jars/json4s-jackson_2.12-3.7.0-M11.jar +0 -0
  62. snowflake/snowpark_connect/includes/jars/json4s-scalap_2.12-3.7.0-M11.jar +0 -0
  63. snowflake/snowpark_connect/includes/jars/kryo-shaded-4.0.2.jar +0 -0
  64. snowflake/snowpark_connect/includes/jars/log4j-1.2-api-2.20.0.jar +0 -0
  65. snowflake/snowpark_connect/includes/jars/log4j-api-2.20.0.jar +0 -0
  66. snowflake/snowpark_connect/includes/jars/log4j-core-2.20.0.jar +0 -0
  67. snowflake/snowpark_connect/includes/jars/log4j-slf4j2-impl-2.20.0.jar +0 -0
  68. snowflake/snowpark_connect/includes/jars/paranamer-2.8.jar +0 -0
  69. snowflake/snowpark_connect/includes/jars/scala-collection-compat_2.12-2.7.0.jar +0 -0
  70. snowflake/snowpark_connect/includes/jars/scala-compiler-2.12.18.jar +0 -0
  71. snowflake/snowpark_connect/includes/jars/scala-library-2.12.18.jar +0 -0
  72. snowflake/snowpark_connect/includes/jars/scala-parser-combinators_2.12-2.3.0.jar +0 -0
  73. snowflake/snowpark_connect/includes/jars/scala-reflect-2.12.18.jar +0 -0
  74. snowflake/snowpark_connect/includes/jars/scala-xml_2.12-2.1.0.jar +0 -0
  75. snowflake/snowpark_connect/includes/jars/slf4j-api-2.0.7.jar +0 -0
  76. snowflake/snowpark_connect/includes/jars/spark-catalyst_2.12-3.5.6.jar +0 -0
  77. snowflake/snowpark_connect/includes/jars/spark-common-utils_2.12-3.5.6.jar +0 -0
  78. snowflake/snowpark_connect/includes/jars/spark-core_2.12-3.5.6.jar +0 -0
  79. snowflake/snowpark_connect/includes/jars/spark-graphx_2.12-3.5.6.jar +0 -0
  80. snowflake/snowpark_connect/includes/jars/spark-hive-thriftserver_2.12-3.5.6.jar +0 -0
  81. snowflake/snowpark_connect/includes/jars/spark-hive_2.12-3.5.6.jar +0 -0
  82. snowflake/snowpark_connect/includes/jars/spark-kubernetes_2.12-3.5.6.jar +0 -0
  83. snowflake/snowpark_connect/includes/jars/spark-kvstore_2.12-3.5.6.jar +0 -0
  84. snowflake/snowpark_connect/includes/jars/spark-launcher_2.12-3.5.6.jar +0 -0
  85. snowflake/snowpark_connect/includes/jars/spark-mesos_2.12-3.5.6.jar +0 -0
  86. snowflake/snowpark_connect/includes/jars/spark-mllib-local_2.12-3.5.6.jar +0 -0
  87. snowflake/snowpark_connect/includes/jars/spark-mllib_2.12-3.5.6.jar +0 -0
  88. snowflake/snowpark_connect/includes/jars/spark-network-common_2.12-3.5.6.jar +0 -0
  89. snowflake/snowpark_connect/includes/jars/spark-network-shuffle_2.12-3.5.6.jar +0 -0
  90. snowflake/snowpark_connect/includes/jars/spark-repl_2.12-3.5.6.jar +0 -0
  91. snowflake/snowpark_connect/includes/jars/spark-sketch_2.12-3.5.6.jar +0 -0
  92. snowflake/snowpark_connect/includes/jars/spark-sql-api_2.12-3.5.6.jar +0 -0
  93. snowflake/snowpark_connect/includes/jars/spark-sql_2.12-3.5.6.jar +0 -0
  94. snowflake/snowpark_connect/includes/jars/spark-streaming_2.12-3.5.6.jar +0 -0
  95. snowflake/snowpark_connect/includes/jars/spark-tags_2.12-3.5.6.jar +0 -0
  96. snowflake/snowpark_connect/includes/jars/spark-unsafe_2.12-3.5.6.jar +0 -0
  97. snowflake/snowpark_connect/includes/jars/spark-yarn_2.12-3.5.6.jar +0 -0
  98. snowflake/snowpark_connect/includes/python/__init__.py +21 -0
  99. snowflake/snowpark_connect/includes/python/pyspark/__init__.py +173 -0
  100. snowflake/snowpark_connect/includes/python/pyspark/_globals.py +71 -0
  101. snowflake/snowpark_connect/includes/python/pyspark/_typing.pyi +43 -0
  102. snowflake/snowpark_connect/includes/python/pyspark/accumulators.py +341 -0
  103. snowflake/snowpark_connect/includes/python/pyspark/broadcast.py +383 -0
  104. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/__init__.py +8 -0
  105. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/cloudpickle.py +948 -0
  106. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/cloudpickle_fast.py +844 -0
  107. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/compat.py +18 -0
  108. snowflake/snowpark_connect/includes/python/pyspark/conf.py +276 -0
  109. snowflake/snowpark_connect/includes/python/pyspark/context.py +2601 -0
  110. snowflake/snowpark_connect/includes/python/pyspark/daemon.py +218 -0
  111. snowflake/snowpark_connect/includes/python/pyspark/errors/__init__.py +70 -0
  112. snowflake/snowpark_connect/includes/python/pyspark/errors/error_classes.py +889 -0
  113. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/__init__.py +16 -0
  114. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/base.py +228 -0
  115. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/captured.py +307 -0
  116. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/connect.py +190 -0
  117. snowflake/snowpark_connect/includes/python/pyspark/errors/tests/__init__.py +16 -0
  118. snowflake/snowpark_connect/includes/python/pyspark/errors/tests/test_errors.py +60 -0
  119. snowflake/snowpark_connect/includes/python/pyspark/errors/utils.py +116 -0
  120. snowflake/snowpark_connect/includes/python/pyspark/files.py +165 -0
  121. snowflake/snowpark_connect/includes/python/pyspark/find_spark_home.py +95 -0
  122. snowflake/snowpark_connect/includes/python/pyspark/install.py +203 -0
  123. snowflake/snowpark_connect/includes/python/pyspark/instrumentation_utils.py +190 -0
  124. snowflake/snowpark_connect/includes/python/pyspark/java_gateway.py +248 -0
  125. snowflake/snowpark_connect/includes/python/pyspark/join.py +118 -0
  126. snowflake/snowpark_connect/includes/python/pyspark/ml/__init__.py +71 -0
  127. snowflake/snowpark_connect/includes/python/pyspark/ml/_typing.pyi +84 -0
  128. snowflake/snowpark_connect/includes/python/pyspark/ml/base.py +414 -0
  129. snowflake/snowpark_connect/includes/python/pyspark/ml/classification.py +4332 -0
  130. snowflake/snowpark_connect/includes/python/pyspark/ml/clustering.py +2188 -0
  131. snowflake/snowpark_connect/includes/python/pyspark/ml/common.py +146 -0
  132. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/__init__.py +44 -0
  133. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/base.py +346 -0
  134. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/classification.py +382 -0
  135. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/evaluation.py +291 -0
  136. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/feature.py +258 -0
  137. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/functions.py +77 -0
  138. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/io_utils.py +335 -0
  139. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/pipeline.py +262 -0
  140. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/summarizer.py +120 -0
  141. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/tuning.py +579 -0
  142. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/util.py +173 -0
  143. snowflake/snowpark_connect/includes/python/pyspark/ml/deepspeed/__init__.py +16 -0
  144. snowflake/snowpark_connect/includes/python/pyspark/ml/deepspeed/deepspeed_distributor.py +165 -0
  145. snowflake/snowpark_connect/includes/python/pyspark/ml/deepspeed/tests/test_deepspeed_distributor.py +306 -0
  146. snowflake/snowpark_connect/includes/python/pyspark/ml/dl_util.py +150 -0
  147. snowflake/snowpark_connect/includes/python/pyspark/ml/evaluation.py +1166 -0
  148. snowflake/snowpark_connect/includes/python/pyspark/ml/feature.py +7474 -0
  149. snowflake/snowpark_connect/includes/python/pyspark/ml/fpm.py +543 -0
  150. snowflake/snowpark_connect/includes/python/pyspark/ml/functions.py +842 -0
  151. snowflake/snowpark_connect/includes/python/pyspark/ml/image.py +271 -0
  152. snowflake/snowpark_connect/includes/python/pyspark/ml/linalg/__init__.py +1382 -0
  153. snowflake/snowpark_connect/includes/python/pyspark/ml/model_cache.py +55 -0
  154. snowflake/snowpark_connect/includes/python/pyspark/ml/param/__init__.py +602 -0
  155. snowflake/snowpark_connect/includes/python/pyspark/ml/param/_shared_params_code_gen.py +368 -0
  156. snowflake/snowpark_connect/includes/python/pyspark/ml/param/shared.py +878 -0
  157. snowflake/snowpark_connect/includes/python/pyspark/ml/pipeline.py +451 -0
  158. snowflake/snowpark_connect/includes/python/pyspark/ml/recommendation.py +748 -0
  159. snowflake/snowpark_connect/includes/python/pyspark/ml/regression.py +3335 -0
  160. snowflake/snowpark_connect/includes/python/pyspark/ml/stat.py +523 -0
  161. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/__init__.py +16 -0
  162. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_classification.py +53 -0
  163. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_evaluation.py +50 -0
  164. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_feature.py +43 -0
  165. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_function.py +114 -0
  166. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_pipeline.py +47 -0
  167. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_summarizer.py +43 -0
  168. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_tuning.py +46 -0
  169. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_classification.py +238 -0
  170. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_evaluation.py +194 -0
  171. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_feature.py +156 -0
  172. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_pipeline.py +184 -0
  173. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_summarizer.py +78 -0
  174. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_tuning.py +292 -0
  175. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_parity_torch_data_loader.py +50 -0
  176. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_parity_torch_distributor.py +152 -0
  177. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_algorithms.py +456 -0
  178. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_base.py +96 -0
  179. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_dl_util.py +186 -0
  180. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_evaluation.py +77 -0
  181. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_feature.py +401 -0
  182. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_functions.py +528 -0
  183. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_image.py +82 -0
  184. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_linalg.py +409 -0
  185. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_model_cache.py +55 -0
  186. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_param.py +441 -0
  187. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_persistence.py +546 -0
  188. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_pipeline.py +71 -0
  189. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_stat.py +52 -0
  190. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_training_summary.py +494 -0
  191. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_util.py +85 -0
  192. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_wrapper.py +138 -0
  193. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/__init__.py +16 -0
  194. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_basic.py +151 -0
  195. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_nested.py +97 -0
  196. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_pipeline.py +143 -0
  197. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tuning.py +551 -0
  198. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_basic.py +137 -0
  199. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_nested.py +96 -0
  200. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_pipeline.py +142 -0
  201. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/__init__.py +16 -0
  202. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/data.py +100 -0
  203. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/distributor.py +1133 -0
  204. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/log_communication.py +198 -0
  205. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/__init__.py +16 -0
  206. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_data_loader.py +137 -0
  207. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_distributor.py +561 -0
  208. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_log_communication.py +172 -0
  209. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/torch_run_process_wrapper.py +83 -0
  210. snowflake/snowpark_connect/includes/python/pyspark/ml/tree.py +434 -0
  211. snowflake/snowpark_connect/includes/python/pyspark/ml/tuning.py +1741 -0
  212. snowflake/snowpark_connect/includes/python/pyspark/ml/util.py +749 -0
  213. snowflake/snowpark_connect/includes/python/pyspark/ml/wrapper.py +465 -0
  214. snowflake/snowpark_connect/includes/python/pyspark/mllib/__init__.py +44 -0
  215. snowflake/snowpark_connect/includes/python/pyspark/mllib/_typing.pyi +33 -0
  216. snowflake/snowpark_connect/includes/python/pyspark/mllib/classification.py +989 -0
  217. snowflake/snowpark_connect/includes/python/pyspark/mllib/clustering.py +1318 -0
  218. snowflake/snowpark_connect/includes/python/pyspark/mllib/common.py +174 -0
  219. snowflake/snowpark_connect/includes/python/pyspark/mllib/evaluation.py +691 -0
  220. snowflake/snowpark_connect/includes/python/pyspark/mllib/feature.py +1085 -0
  221. snowflake/snowpark_connect/includes/python/pyspark/mllib/fpm.py +233 -0
  222. snowflake/snowpark_connect/includes/python/pyspark/mllib/linalg/__init__.py +1653 -0
  223. snowflake/snowpark_connect/includes/python/pyspark/mllib/linalg/distributed.py +1662 -0
  224. snowflake/snowpark_connect/includes/python/pyspark/mllib/random.py +698 -0
  225. snowflake/snowpark_connect/includes/python/pyspark/mllib/recommendation.py +389 -0
  226. snowflake/snowpark_connect/includes/python/pyspark/mllib/regression.py +1067 -0
  227. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/KernelDensity.py +59 -0
  228. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/__init__.py +34 -0
  229. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/_statistics.py +409 -0
  230. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/distribution.py +39 -0
  231. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/test.py +86 -0
  232. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/__init__.py +16 -0
  233. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_algorithms.py +353 -0
  234. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_feature.py +192 -0
  235. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_linalg.py +680 -0
  236. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_stat.py +206 -0
  237. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_streaming_algorithms.py +471 -0
  238. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_util.py +108 -0
  239. snowflake/snowpark_connect/includes/python/pyspark/mllib/tree.py +888 -0
  240. snowflake/snowpark_connect/includes/python/pyspark/mllib/util.py +659 -0
  241. snowflake/snowpark_connect/includes/python/pyspark/pandas/__init__.py +165 -0
  242. snowflake/snowpark_connect/includes/python/pyspark/pandas/_typing.py +52 -0
  243. snowflake/snowpark_connect/includes/python/pyspark/pandas/accessors.py +989 -0
  244. snowflake/snowpark_connect/includes/python/pyspark/pandas/base.py +1804 -0
  245. snowflake/snowpark_connect/includes/python/pyspark/pandas/categorical.py +822 -0
  246. snowflake/snowpark_connect/includes/python/pyspark/pandas/config.py +539 -0
  247. snowflake/snowpark_connect/includes/python/pyspark/pandas/correlation.py +262 -0
  248. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/__init__.py +16 -0
  249. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/base.py +519 -0
  250. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/binary_ops.py +98 -0
  251. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/boolean_ops.py +426 -0
  252. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/categorical_ops.py +141 -0
  253. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/complex_ops.py +145 -0
  254. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/date_ops.py +127 -0
  255. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/datetime_ops.py +171 -0
  256. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/null_ops.py +83 -0
  257. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/num_ops.py +588 -0
  258. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/string_ops.py +154 -0
  259. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/timedelta_ops.py +101 -0
  260. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/udt_ops.py +29 -0
  261. snowflake/snowpark_connect/includes/python/pyspark/pandas/datetimes.py +891 -0
  262. snowflake/snowpark_connect/includes/python/pyspark/pandas/exceptions.py +150 -0
  263. snowflake/snowpark_connect/includes/python/pyspark/pandas/extensions.py +388 -0
  264. snowflake/snowpark_connect/includes/python/pyspark/pandas/frame.py +13738 -0
  265. snowflake/snowpark_connect/includes/python/pyspark/pandas/generic.py +3560 -0
  266. snowflake/snowpark_connect/includes/python/pyspark/pandas/groupby.py +4448 -0
  267. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/__init__.py +21 -0
  268. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/base.py +2783 -0
  269. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/category.py +773 -0
  270. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/datetimes.py +843 -0
  271. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/multi.py +1323 -0
  272. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/numeric.py +210 -0
  273. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/timedelta.py +197 -0
  274. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexing.py +1862 -0
  275. snowflake/snowpark_connect/includes/python/pyspark/pandas/internal.py +1680 -0
  276. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/__init__.py +48 -0
  277. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/common.py +76 -0
  278. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/frame.py +63 -0
  279. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/general_functions.py +43 -0
  280. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/groupby.py +93 -0
  281. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/indexes.py +184 -0
  282. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/resample.py +101 -0
  283. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/scalars.py +29 -0
  284. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/series.py +69 -0
  285. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/window.py +168 -0
  286. snowflake/snowpark_connect/includes/python/pyspark/pandas/mlflow.py +238 -0
  287. snowflake/snowpark_connect/includes/python/pyspark/pandas/namespace.py +3807 -0
  288. snowflake/snowpark_connect/includes/python/pyspark/pandas/numpy_compat.py +260 -0
  289. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/__init__.py +17 -0
  290. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/core.py +1213 -0
  291. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/matplotlib.py +928 -0
  292. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/plotly.py +261 -0
  293. snowflake/snowpark_connect/includes/python/pyspark/pandas/resample.py +816 -0
  294. snowflake/snowpark_connect/includes/python/pyspark/pandas/series.py +7440 -0
  295. snowflake/snowpark_connect/includes/python/pyspark/pandas/sql_formatter.py +308 -0
  296. snowflake/snowpark_connect/includes/python/pyspark/pandas/sql_processor.py +394 -0
  297. snowflake/snowpark_connect/includes/python/pyspark/pandas/strings.py +2371 -0
  298. snowflake/snowpark_connect/includes/python/pyspark/pandas/supported_api_gen.py +378 -0
  299. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/__init__.py +16 -0
  300. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/__init__.py +16 -0
  301. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_any_all.py +177 -0
  302. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_apply_func.py +575 -0
  303. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_binary_ops.py +235 -0
  304. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_combine.py +653 -0
  305. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_compute.py +463 -0
  306. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_corrwith.py +86 -0
  307. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_cov.py +151 -0
  308. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_cumulative.py +139 -0
  309. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_describe.py +458 -0
  310. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_eval.py +86 -0
  311. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_melt.py +202 -0
  312. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_missing_data.py +520 -0
  313. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_pivot.py +361 -0
  314. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/__init__.py +16 -0
  315. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/__init__.py +16 -0
  316. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_any_all.py +40 -0
  317. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_apply_func.py +42 -0
  318. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_binary_ops.py +40 -0
  319. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_combine.py +37 -0
  320. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_compute.py +60 -0
  321. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_corrwith.py +40 -0
  322. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_cov.py +40 -0
  323. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_cumulative.py +90 -0
  324. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_describe.py +40 -0
  325. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_eval.py +40 -0
  326. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_melt.py +40 -0
  327. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_missing_data.py +42 -0
  328. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_pivot.py +37 -0
  329. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/__init__.py +16 -0
  330. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_base.py +36 -0
  331. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_binary_ops.py +42 -0
  332. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_boolean_ops.py +47 -0
  333. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_categorical_ops.py +55 -0
  334. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_complex_ops.py +40 -0
  335. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_date_ops.py +47 -0
  336. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_datetime_ops.py +47 -0
  337. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_null_ops.py +42 -0
  338. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_arithmetic.py +43 -0
  339. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_ops.py +47 -0
  340. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_reverse.py +43 -0
  341. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_string_ops.py +47 -0
  342. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_timedelta_ops.py +47 -0
  343. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_udt_ops.py +40 -0
  344. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/testing_utils.py +226 -0
  345. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/__init__.py +16 -0
  346. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_align.py +39 -0
  347. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_basic_slow.py +55 -0
  348. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_cov_corrwith.py +39 -0
  349. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_dot_frame.py +39 -0
  350. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_dot_series.py +39 -0
  351. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_index.py +39 -0
  352. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_series.py +39 -0
  353. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_setitem_frame.py +43 -0
  354. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_setitem_series.py +43 -0
  355. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/__init__.py +16 -0
  356. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_attrs.py +40 -0
  357. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_constructor.py +39 -0
  358. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_conversion.py +42 -0
  359. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_reindexing.py +42 -0
  360. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_reshaping.py +37 -0
  361. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_spark.py +40 -0
  362. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_take.py +42 -0
  363. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_time_series.py +48 -0
  364. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_truncate.py +40 -0
  365. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/__init__.py +16 -0
  366. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_aggregate.py +40 -0
  367. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_apply_func.py +41 -0
  368. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_cumulative.py +67 -0
  369. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_describe.py +40 -0
  370. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_groupby.py +55 -0
  371. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_head_tail.py +40 -0
  372. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_index.py +38 -0
  373. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_missing_data.py +55 -0
  374. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py +39 -0
  375. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_stat.py +38 -0
  376. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/__init__.py +16 -0
  377. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_align.py +40 -0
  378. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_base.py +50 -0
  379. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_category.py +73 -0
  380. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_datetime.py +39 -0
  381. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_indexing.py +40 -0
  382. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_reindex.py +40 -0
  383. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_rename.py +40 -0
  384. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_reset_index.py +48 -0
  385. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_timedelta.py +39 -0
  386. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/io/__init__.py +16 -0
  387. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/io/test_parity_io.py +40 -0
  388. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/__init__.py +16 -0
  389. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot.py +45 -0
  390. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot_matplotlib.py +45 -0
  391. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot_plotly.py +49 -0
  392. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot.py +37 -0
  393. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot_matplotlib.py +53 -0
  394. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot_plotly.py +45 -0
  395. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/__init__.py +16 -0
  396. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_all_any.py +38 -0
  397. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_arg_ops.py +37 -0
  398. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_as_of.py +37 -0
  399. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_as_type.py +38 -0
  400. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_compute.py +37 -0
  401. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_conversion.py +40 -0
  402. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_cumulative.py +40 -0
  403. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_index.py +38 -0
  404. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_missing_data.py +40 -0
  405. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_series.py +37 -0
  406. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_sort.py +38 -0
  407. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_stat.py +38 -0
  408. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_categorical.py +66 -0
  409. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_config.py +37 -0
  410. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_csv.py +37 -0
  411. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_dataframe_conversion.py +42 -0
  412. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_dataframe_spark_io.py +39 -0
  413. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_default_index.py +49 -0
  414. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ewm.py +37 -0
  415. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_expanding.py +39 -0
  416. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_extension.py +49 -0
  417. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_frame_spark.py +53 -0
  418. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_generic_functions.py +43 -0
  419. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_indexing.py +49 -0
  420. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_indexops_spark.py +39 -0
  421. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_internal.py +41 -0
  422. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_namespace.py +39 -0
  423. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_numpy_compat.py +60 -0
  424. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames.py +48 -0
  425. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby.py +39 -0
  426. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_expanding.py +44 -0
  427. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_rolling.py +84 -0
  428. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_repr.py +37 -0
  429. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_resample.py +45 -0
  430. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_reshape.py +39 -0
  431. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_rolling.py +39 -0
  432. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_scalars.py +37 -0
  433. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_conversion.py +39 -0
  434. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_datetime.py +39 -0
  435. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_string.py +39 -0
  436. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_spark_functions.py +39 -0
  437. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_sql.py +43 -0
  438. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_stats.py +37 -0
  439. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_typedef.py +36 -0
  440. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_utils.py +37 -0
  441. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_window.py +39 -0
  442. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/__init__.py +16 -0
  443. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_base.py +107 -0
  444. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_binary_ops.py +224 -0
  445. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py +825 -0
  446. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_categorical_ops.py +562 -0
  447. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_complex_ops.py +368 -0
  448. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_date_ops.py +257 -0
  449. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_datetime_ops.py +260 -0
  450. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_null_ops.py +178 -0
  451. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_arithmetic.py +184 -0
  452. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py +497 -0
  453. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_reverse.py +140 -0
  454. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_string_ops.py +354 -0
  455. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_timedelta_ops.py +219 -0
  456. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_udt_ops.py +192 -0
  457. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/testing_utils.py +228 -0
  458. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/__init__.py +16 -0
  459. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_align.py +118 -0
  460. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_basic_slow.py +198 -0
  461. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_cov_corrwith.py +181 -0
  462. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_dot_frame.py +103 -0
  463. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_dot_series.py +141 -0
  464. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_index.py +109 -0
  465. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_series.py +136 -0
  466. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_setitem_frame.py +125 -0
  467. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_setitem_series.py +217 -0
  468. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/__init__.py +16 -0
  469. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_attrs.py +384 -0
  470. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_constructor.py +598 -0
  471. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_conversion.py +73 -0
  472. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_reindexing.py +869 -0
  473. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_reshaping.py +487 -0
  474. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_spark.py +309 -0
  475. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_take.py +156 -0
  476. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_time_series.py +149 -0
  477. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_truncate.py +163 -0
  478. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/__init__.py +16 -0
  479. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_aggregate.py +311 -0
  480. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_apply_func.py +524 -0
  481. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_cumulative.py +419 -0
  482. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_describe.py +144 -0
  483. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_groupby.py +979 -0
  484. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_head_tail.py +234 -0
  485. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_index.py +206 -0
  486. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_missing_data.py +421 -0
  487. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_split_apply.py +187 -0
  488. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_stat.py +397 -0
  489. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/__init__.py +16 -0
  490. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_align.py +100 -0
  491. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_base.py +2743 -0
  492. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_category.py +484 -0
  493. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_datetime.py +276 -0
  494. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_indexing.py +432 -0
  495. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_reindex.py +310 -0
  496. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_rename.py +257 -0
  497. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_reset_index.py +160 -0
  498. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_timedelta.py +128 -0
  499. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/io/__init__.py +16 -0
  500. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/io/test_io.py +137 -0
  501. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/__init__.py +16 -0
  502. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot.py +170 -0
  503. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot_matplotlib.py +547 -0
  504. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot_plotly.py +285 -0
  505. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot.py +106 -0
  506. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot_matplotlib.py +409 -0
  507. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot_plotly.py +247 -0
  508. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/__init__.py +16 -0
  509. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_all_any.py +105 -0
  510. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_arg_ops.py +197 -0
  511. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_as_of.py +137 -0
  512. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_as_type.py +227 -0
  513. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_compute.py +634 -0
  514. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_conversion.py +88 -0
  515. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_cumulative.py +139 -0
  516. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_index.py +475 -0
  517. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_missing_data.py +265 -0
  518. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_series.py +818 -0
  519. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_sort.py +162 -0
  520. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_stat.py +780 -0
  521. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_categorical.py +741 -0
  522. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_config.py +160 -0
  523. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_csv.py +453 -0
  524. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_dataframe_conversion.py +281 -0
  525. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_dataframe_spark_io.py +487 -0
  526. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_default_index.py +109 -0
  527. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ewm.py +434 -0
  528. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_expanding.py +253 -0
  529. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_extension.py +152 -0
  530. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_frame_spark.py +162 -0
  531. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_generic_functions.py +234 -0
  532. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_indexing.py +1339 -0
  533. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_indexops_spark.py +82 -0
  534. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_internal.py +124 -0
  535. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_namespace.py +638 -0
  536. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_numpy_compat.py +200 -0
  537. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames.py +1355 -0
  538. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py +655 -0
  539. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_expanding.py +113 -0
  540. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_rolling.py +118 -0
  541. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_repr.py +192 -0
  542. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_resample.py +346 -0
  543. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_reshape.py +495 -0
  544. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_rolling.py +263 -0
  545. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_scalars.py +59 -0
  546. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_conversion.py +85 -0
  547. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_datetime.py +364 -0
  548. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_string.py +362 -0
  549. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_spark_functions.py +46 -0
  550. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_sql.py +123 -0
  551. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_stats.py +581 -0
  552. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_typedef.py +447 -0
  553. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_utils.py +301 -0
  554. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_window.py +465 -0
  555. snowflake/snowpark_connect/includes/python/pyspark/pandas/typedef/__init__.py +18 -0
  556. snowflake/snowpark_connect/includes/python/pyspark/pandas/typedef/typehints.py +874 -0
  557. snowflake/snowpark_connect/includes/python/pyspark/pandas/usage_logging/__init__.py +143 -0
  558. snowflake/snowpark_connect/includes/python/pyspark/pandas/usage_logging/usage_logger.py +132 -0
  559. snowflake/snowpark_connect/includes/python/pyspark/pandas/utils.py +1063 -0
  560. snowflake/snowpark_connect/includes/python/pyspark/pandas/window.py +2702 -0
  561. snowflake/snowpark_connect/includes/python/pyspark/profiler.py +489 -0
  562. snowflake/snowpark_connect/includes/python/pyspark/py.typed +1 -0
  563. snowflake/snowpark_connect/includes/python/pyspark/python/pyspark/shell.py +123 -0
  564. snowflake/snowpark_connect/includes/python/pyspark/rdd.py +5518 -0
  565. snowflake/snowpark_connect/includes/python/pyspark/rddsampler.py +115 -0
  566. snowflake/snowpark_connect/includes/python/pyspark/resource/__init__.py +38 -0
  567. snowflake/snowpark_connect/includes/python/pyspark/resource/information.py +69 -0
  568. snowflake/snowpark_connect/includes/python/pyspark/resource/profile.py +317 -0
  569. snowflake/snowpark_connect/includes/python/pyspark/resource/requests.py +539 -0
  570. snowflake/snowpark_connect/includes/python/pyspark/resource/tests/__init__.py +16 -0
  571. snowflake/snowpark_connect/includes/python/pyspark/resource/tests/test_resources.py +83 -0
  572. snowflake/snowpark_connect/includes/python/pyspark/resultiterable.py +45 -0
  573. snowflake/snowpark_connect/includes/python/pyspark/serializers.py +681 -0
  574. snowflake/snowpark_connect/includes/python/pyspark/shell.py +123 -0
  575. snowflake/snowpark_connect/includes/python/pyspark/shuffle.py +854 -0
  576. snowflake/snowpark_connect/includes/python/pyspark/sql/__init__.py +75 -0
  577. snowflake/snowpark_connect/includes/python/pyspark/sql/_typing.pyi +80 -0
  578. snowflake/snowpark_connect/includes/python/pyspark/sql/avro/__init__.py +18 -0
  579. snowflake/snowpark_connect/includes/python/pyspark/sql/avro/functions.py +188 -0
  580. snowflake/snowpark_connect/includes/python/pyspark/sql/catalog.py +1270 -0
  581. snowflake/snowpark_connect/includes/python/pyspark/sql/column.py +1431 -0
  582. snowflake/snowpark_connect/includes/python/pyspark/sql/conf.py +99 -0
  583. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/__init__.py +18 -0
  584. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/_typing.py +90 -0
  585. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/avro/__init__.py +18 -0
  586. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/avro/functions.py +107 -0
  587. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/catalog.py +356 -0
  588. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/__init__.py +22 -0
  589. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/artifact.py +412 -0
  590. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/core.py +1689 -0
  591. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/reattach.py +340 -0
  592. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/column.py +514 -0
  593. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/conf.py +128 -0
  594. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/conversion.py +490 -0
  595. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/dataframe.py +2172 -0
  596. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/expressions.py +1056 -0
  597. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/functions.py +3937 -0
  598. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/group.py +418 -0
  599. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/plan.py +2289 -0
  600. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/__init__.py +25 -0
  601. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/base_pb2.py +203 -0
  602. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/base_pb2.pyi +2718 -0
  603. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/base_pb2_grpc.py +423 -0
  604. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/catalog_pb2.py +109 -0
  605. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/catalog_pb2.pyi +1130 -0
  606. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/commands_pb2.py +141 -0
  607. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/commands_pb2.pyi +1766 -0
  608. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/common_pb2.py +47 -0
  609. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/common_pb2.pyi +123 -0
  610. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/example_plugins_pb2.py +53 -0
  611. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/example_plugins_pb2.pyi +112 -0
  612. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/expressions_pb2.py +107 -0
  613. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/expressions_pb2.pyi +1507 -0
  614. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/relations_pb2.py +195 -0
  615. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/relations_pb2.pyi +3613 -0
  616. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/types_pb2.py +95 -0
  617. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/types_pb2.pyi +980 -0
  618. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/protobuf/__init__.py +18 -0
  619. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/protobuf/functions.py +166 -0
  620. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/readwriter.py +861 -0
  621. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/session.py +952 -0
  622. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/__init__.py +22 -0
  623. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/query.py +295 -0
  624. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/readwriter.py +618 -0
  625. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/__init__.py +18 -0
  626. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/foreach_batch_worker.py +87 -0
  627. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/listener_worker.py +100 -0
  628. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/types.py +301 -0
  629. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/udf.py +296 -0
  630. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/udtf.py +200 -0
  631. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/utils.py +58 -0
  632. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/window.py +266 -0
  633. snowflake/snowpark_connect/includes/python/pyspark/sql/context.py +818 -0
  634. snowflake/snowpark_connect/includes/python/pyspark/sql/dataframe.py +5973 -0
  635. snowflake/snowpark_connect/includes/python/pyspark/sql/functions.py +15889 -0
  636. snowflake/snowpark_connect/includes/python/pyspark/sql/group.py +547 -0
  637. snowflake/snowpark_connect/includes/python/pyspark/sql/observation.py +152 -0
  638. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/__init__.py +21 -0
  639. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/__init__.pyi +344 -0
  640. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/protocols/__init__.pyi +17 -0
  641. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/protocols/frame.pyi +20 -0
  642. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/protocols/series.pyi +20 -0
  643. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/conversion.py +671 -0
  644. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/functions.py +480 -0
  645. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/functions.pyi +132 -0
  646. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/group_ops.py +523 -0
  647. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/map_ops.py +216 -0
  648. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/serializers.py +1019 -0
  649. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/typehints.py +172 -0
  650. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/types.py +972 -0
  651. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/utils.py +86 -0
  652. snowflake/snowpark_connect/includes/python/pyspark/sql/protobuf/__init__.py +18 -0
  653. snowflake/snowpark_connect/includes/python/pyspark/sql/protobuf/functions.py +334 -0
  654. snowflake/snowpark_connect/includes/python/pyspark/sql/readwriter.py +2159 -0
  655. snowflake/snowpark_connect/includes/python/pyspark/sql/session.py +2088 -0
  656. snowflake/snowpark_connect/includes/python/pyspark/sql/sql_formatter.py +84 -0
  657. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/__init__.py +21 -0
  658. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/listener.py +1050 -0
  659. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/query.py +746 -0
  660. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/readwriter.py +1652 -0
  661. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/state.py +288 -0
  662. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/__init__.py +16 -0
  663. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/__init__.py +16 -0
  664. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/__init__.py +16 -0
  665. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/test_artifact.py +420 -0
  666. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/test_client.py +358 -0
  667. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/__init__.py +16 -0
  668. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_foreach.py +36 -0
  669. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_foreach_batch.py +44 -0
  670. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_listener.py +116 -0
  671. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_streaming.py +35 -0
  672. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_basic.py +3612 -0
  673. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_column.py +1042 -0
  674. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_function.py +2381 -0
  675. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_plan.py +1060 -0
  676. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow.py +163 -0
  677. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow_map.py +38 -0
  678. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow_python_udf.py +48 -0
  679. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_catalog.py +36 -0
  680. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_column.py +55 -0
  681. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_conf.py +36 -0
  682. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_dataframe.py +96 -0
  683. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_datasources.py +44 -0
  684. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_errors.py +36 -0
  685. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_functions.py +59 -0
  686. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_group.py +36 -0
  687. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_cogrouped_map.py +59 -0
  688. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_grouped_map.py +74 -0
  689. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_grouped_map_with_state.py +62 -0
  690. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_map.py +58 -0
  691. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf.py +70 -0
  692. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_grouped_agg.py +50 -0
  693. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_scalar.py +68 -0
  694. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_window.py +40 -0
  695. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_readwriter.py +46 -0
  696. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_serde.py +44 -0
  697. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_types.py +100 -0
  698. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_udf.py +100 -0
  699. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_udtf.py +163 -0
  700. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_session.py +181 -0
  701. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_utils.py +42 -0
  702. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/__init__.py +16 -0
  703. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_cogrouped_map.py +623 -0
  704. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_grouped_map.py +869 -0
  705. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_grouped_map_with_state.py +342 -0
  706. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_map.py +436 -0
  707. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf.py +363 -0
  708. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_grouped_agg.py +592 -0
  709. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_scalar.py +1503 -0
  710. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_typehints.py +392 -0
  711. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_typehints_with_future_annotations.py +375 -0
  712. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_window.py +411 -0
  713. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/__init__.py +16 -0
  714. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming.py +401 -0
  715. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_foreach.py +295 -0
  716. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_foreach_batch.py +106 -0
  717. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_listener.py +558 -0
  718. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow.py +1346 -0
  719. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow_map.py +182 -0
  720. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow_python_udf.py +202 -0
  721. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_catalog.py +503 -0
  722. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_column.py +225 -0
  723. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_conf.py +83 -0
  724. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_context.py +201 -0
  725. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_dataframe.py +1931 -0
  726. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_datasources.py +256 -0
  727. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_errors.py +69 -0
  728. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_functions.py +1349 -0
  729. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_group.py +53 -0
  730. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_pandas_sqlmetrics.py +68 -0
  731. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_readwriter.py +283 -0
  732. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_serde.py +155 -0
  733. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_session.py +412 -0
  734. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_types.py +1581 -0
  735. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udf.py +961 -0
  736. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udf_profiler.py +165 -0
  737. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udtf.py +1456 -0
  738. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_utils.py +1686 -0
  739. snowflake/snowpark_connect/includes/python/pyspark/sql/types.py +2558 -0
  740. snowflake/snowpark_connect/includes/python/pyspark/sql/udf.py +714 -0
  741. snowflake/snowpark_connect/includes/python/pyspark/sql/udtf.py +325 -0
  742. snowflake/snowpark_connect/includes/python/pyspark/sql/utils.py +339 -0
  743. snowflake/snowpark_connect/includes/python/pyspark/sql/window.py +492 -0
  744. snowflake/snowpark_connect/includes/python/pyspark/statcounter.py +165 -0
  745. snowflake/snowpark_connect/includes/python/pyspark/status.py +112 -0
  746. snowflake/snowpark_connect/includes/python/pyspark/storagelevel.py +97 -0
  747. snowflake/snowpark_connect/includes/python/pyspark/streaming/__init__.py +22 -0
  748. snowflake/snowpark_connect/includes/python/pyspark/streaming/context.py +471 -0
  749. snowflake/snowpark_connect/includes/python/pyspark/streaming/dstream.py +933 -0
  750. snowflake/snowpark_connect/includes/python/pyspark/streaming/kinesis.py +205 -0
  751. snowflake/snowpark_connect/includes/python/pyspark/streaming/listener.py +83 -0
  752. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/__init__.py +16 -0
  753. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_context.py +184 -0
  754. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_dstream.py +706 -0
  755. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_kinesis.py +118 -0
  756. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_listener.py +160 -0
  757. snowflake/snowpark_connect/includes/python/pyspark/streaming/util.py +168 -0
  758. snowflake/snowpark_connect/includes/python/pyspark/taskcontext.py +502 -0
  759. snowflake/snowpark_connect/includes/python/pyspark/testing/__init__.py +21 -0
  760. snowflake/snowpark_connect/includes/python/pyspark/testing/connectutils.py +199 -0
  761. snowflake/snowpark_connect/includes/python/pyspark/testing/mllibutils.py +30 -0
  762. snowflake/snowpark_connect/includes/python/pyspark/testing/mlutils.py +275 -0
  763. snowflake/snowpark_connect/includes/python/pyspark/testing/objects.py +121 -0
  764. snowflake/snowpark_connect/includes/python/pyspark/testing/pandasutils.py +714 -0
  765. snowflake/snowpark_connect/includes/python/pyspark/testing/sqlutils.py +168 -0
  766. snowflake/snowpark_connect/includes/python/pyspark/testing/streamingutils.py +178 -0
  767. snowflake/snowpark_connect/includes/python/pyspark/testing/utils.py +636 -0
  768. snowflake/snowpark_connect/includes/python/pyspark/tests/__init__.py +16 -0
  769. snowflake/snowpark_connect/includes/python/pyspark/tests/test_appsubmit.py +306 -0
  770. snowflake/snowpark_connect/includes/python/pyspark/tests/test_broadcast.py +196 -0
  771. snowflake/snowpark_connect/includes/python/pyspark/tests/test_conf.py +44 -0
  772. snowflake/snowpark_connect/includes/python/pyspark/tests/test_context.py +346 -0
  773. snowflake/snowpark_connect/includes/python/pyspark/tests/test_daemon.py +89 -0
  774. snowflake/snowpark_connect/includes/python/pyspark/tests/test_install_spark.py +124 -0
  775. snowflake/snowpark_connect/includes/python/pyspark/tests/test_join.py +69 -0
  776. snowflake/snowpark_connect/includes/python/pyspark/tests/test_memory_profiler.py +167 -0
  777. snowflake/snowpark_connect/includes/python/pyspark/tests/test_pin_thread.py +194 -0
  778. snowflake/snowpark_connect/includes/python/pyspark/tests/test_profiler.py +168 -0
  779. snowflake/snowpark_connect/includes/python/pyspark/tests/test_rdd.py +939 -0
  780. snowflake/snowpark_connect/includes/python/pyspark/tests/test_rddbarrier.py +52 -0
  781. snowflake/snowpark_connect/includes/python/pyspark/tests/test_rddsampler.py +66 -0
  782. snowflake/snowpark_connect/includes/python/pyspark/tests/test_readwrite.py +368 -0
  783. snowflake/snowpark_connect/includes/python/pyspark/tests/test_serializers.py +257 -0
  784. snowflake/snowpark_connect/includes/python/pyspark/tests/test_shuffle.py +267 -0
  785. snowflake/snowpark_connect/includes/python/pyspark/tests/test_stage_sched.py +153 -0
  786. snowflake/snowpark_connect/includes/python/pyspark/tests/test_statcounter.py +130 -0
  787. snowflake/snowpark_connect/includes/python/pyspark/tests/test_taskcontext.py +350 -0
  788. snowflake/snowpark_connect/includes/python/pyspark/tests/test_util.py +97 -0
  789. snowflake/snowpark_connect/includes/python/pyspark/tests/test_worker.py +271 -0
  790. snowflake/snowpark_connect/includes/python/pyspark/traceback_utils.py +81 -0
  791. snowflake/snowpark_connect/includes/python/pyspark/util.py +416 -0
  792. snowflake/snowpark_connect/includes/python/pyspark/version.py +19 -0
  793. snowflake/snowpark_connect/includes/python/pyspark/worker.py +1307 -0
  794. snowflake/snowpark_connect/includes/python/pyspark/worker_util.py +46 -0
  795. snowflake/snowpark_connect/proto/__init__.py +10 -0
  796. snowflake/snowpark_connect/proto/control_pb2.py +35 -0
  797. snowflake/snowpark_connect/proto/control_pb2.pyi +38 -0
  798. snowflake/snowpark_connect/proto/control_pb2_grpc.py +183 -0
  799. snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.py +35 -0
  800. snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.pyi +53 -0
  801. snowflake/snowpark_connect/proto/snowflake_rdd_pb2.pyi +39 -0
  802. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.py +47 -0
  803. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.pyi +111 -0
  804. snowflake/snowpark_connect/relation/__init__.py +3 -0
  805. snowflake/snowpark_connect/relation/catalogs/__init__.py +12 -0
  806. snowflake/snowpark_connect/relation/catalogs/abstract_spark_catalog.py +287 -0
  807. snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py +467 -0
  808. snowflake/snowpark_connect/relation/catalogs/utils.py +51 -0
  809. snowflake/snowpark_connect/relation/io_utils.py +76 -0
  810. snowflake/snowpark_connect/relation/map_aggregate.py +322 -0
  811. snowflake/snowpark_connect/relation/map_catalog.py +151 -0
  812. snowflake/snowpark_connect/relation/map_column_ops.py +1068 -0
  813. snowflake/snowpark_connect/relation/map_crosstab.py +48 -0
  814. snowflake/snowpark_connect/relation/map_extension.py +412 -0
  815. snowflake/snowpark_connect/relation/map_join.py +341 -0
  816. snowflake/snowpark_connect/relation/map_local_relation.py +326 -0
  817. snowflake/snowpark_connect/relation/map_map_partitions.py +146 -0
  818. snowflake/snowpark_connect/relation/map_relation.py +253 -0
  819. snowflake/snowpark_connect/relation/map_row_ops.py +716 -0
  820. snowflake/snowpark_connect/relation/map_sample_by.py +35 -0
  821. snowflake/snowpark_connect/relation/map_show_string.py +50 -0
  822. snowflake/snowpark_connect/relation/map_sql.py +1874 -0
  823. snowflake/snowpark_connect/relation/map_stats.py +324 -0
  824. snowflake/snowpark_connect/relation/map_subquery_alias.py +32 -0
  825. snowflake/snowpark_connect/relation/map_udtf.py +288 -0
  826. snowflake/snowpark_connect/relation/read/__init__.py +7 -0
  827. snowflake/snowpark_connect/relation/read/jdbc_read_dbapi.py +668 -0
  828. snowflake/snowpark_connect/relation/read/map_read.py +367 -0
  829. snowflake/snowpark_connect/relation/read/map_read_csv.py +142 -0
  830. snowflake/snowpark_connect/relation/read/map_read_jdbc.py +108 -0
  831. snowflake/snowpark_connect/relation/read/map_read_json.py +344 -0
  832. snowflake/snowpark_connect/relation/read/map_read_parquet.py +194 -0
  833. snowflake/snowpark_connect/relation/read/map_read_socket.py +59 -0
  834. snowflake/snowpark_connect/relation/read/map_read_table.py +109 -0
  835. snowflake/snowpark_connect/relation/read/map_read_text.py +106 -0
  836. snowflake/snowpark_connect/relation/read/reader_config.py +399 -0
  837. snowflake/snowpark_connect/relation/read/utils.py +155 -0
  838. snowflake/snowpark_connect/relation/stage_locator.py +161 -0
  839. snowflake/snowpark_connect/relation/utils.py +219 -0
  840. snowflake/snowpark_connect/relation/write/__init__.py +3 -0
  841. snowflake/snowpark_connect/relation/write/jdbc_write_dbapi.py +339 -0
  842. snowflake/snowpark_connect/relation/write/map_write.py +436 -0
  843. snowflake/snowpark_connect/relation/write/map_write_jdbc.py +48 -0
  844. snowflake/snowpark_connect/resources/java_udfs-1.0-SNAPSHOT.jar +0 -0
  845. snowflake/snowpark_connect/resources_initializer.py +75 -0
  846. snowflake/snowpark_connect/server.py +1136 -0
  847. snowflake/snowpark_connect/start_server.py +32 -0
  848. snowflake/snowpark_connect/tcm.py +8 -0
  849. snowflake/snowpark_connect/type_mapping.py +1003 -0
  850. snowflake/snowpark_connect/typed_column.py +94 -0
  851. snowflake/snowpark_connect/utils/__init__.py +3 -0
  852. snowflake/snowpark_connect/utils/artifacts.py +48 -0
  853. snowflake/snowpark_connect/utils/attribute_handling.py +72 -0
  854. snowflake/snowpark_connect/utils/cache.py +84 -0
  855. snowflake/snowpark_connect/utils/concurrent.py +124 -0
  856. snowflake/snowpark_connect/utils/context.py +390 -0
  857. snowflake/snowpark_connect/utils/describe_query_cache.py +231 -0
  858. snowflake/snowpark_connect/utils/interrupt.py +85 -0
  859. snowflake/snowpark_connect/utils/io_utils.py +35 -0
  860. snowflake/snowpark_connect/utils/pandas_udtf_utils.py +117 -0
  861. snowflake/snowpark_connect/utils/profiling.py +47 -0
  862. snowflake/snowpark_connect/utils/session.py +180 -0
  863. snowflake/snowpark_connect/utils/snowpark_connect_logging.py +38 -0
  864. snowflake/snowpark_connect/utils/telemetry.py +513 -0
  865. snowflake/snowpark_connect/utils/udf_cache.py +392 -0
  866. snowflake/snowpark_connect/utils/udf_helper.py +328 -0
  867. snowflake/snowpark_connect/utils/udf_utils.py +310 -0
  868. snowflake/snowpark_connect/utils/udtf_helper.py +420 -0
  869. snowflake/snowpark_connect/utils/udtf_utils.py +799 -0
  870. snowflake/snowpark_connect/utils/xxhash64.py +247 -0
  871. snowflake/snowpark_connect/version.py +6 -0
  872. snowpark_connect-0.20.2.data/scripts/snowpark-connect +71 -0
  873. snowpark_connect-0.20.2.data/scripts/snowpark-session +11 -0
  874. snowpark_connect-0.20.2.data/scripts/snowpark-submit +354 -0
  875. snowpark_connect-0.20.2.dist-info/METADATA +37 -0
  876. snowpark_connect-0.20.2.dist-info/RECORD +879 -0
  877. snowpark_connect-0.20.2.dist-info/WHEEL +5 -0
  878. snowpark_connect-0.20.2.dist-info/licenses/LICENSE.txt +202 -0
  879. snowpark_connect-0.20.2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,3807 @@
1
+ #
2
+ # Licensed to the Apache Software Foundation (ASF) under one or more
3
+ # contributor license agreements. See the NOTICE file distributed with
4
+ # this work for additional information regarding copyright ownership.
5
+ # The ASF licenses this file to You under the Apache License, Version 2.0
6
+ # (the "License"); you may not use this file except in compliance with
7
+ # the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ #
17
+
18
+ """
19
+ Wrappers around spark that correspond to common pandas functions.
20
+ """
21
+ from typing import (
22
+ Any,
23
+ Callable,
24
+ Dict,
25
+ List,
26
+ Optional,
27
+ Set,
28
+ Sized,
29
+ Tuple,
30
+ Type,
31
+ Union,
32
+ cast,
33
+ no_type_check,
34
+ )
35
+ from collections.abc import Iterable
36
+ from datetime import tzinfo
37
+ from functools import reduce
38
+ from io import BytesIO
39
+ import json
40
+ import warnings
41
+
42
+ import numpy as np
43
+ import pandas as pd
44
+ from pandas.api.types import ( # type: ignore[attr-defined]
45
+ is_datetime64_dtype,
46
+ is_datetime64tz_dtype,
47
+ is_list_like,
48
+ )
49
+ from pandas.tseries.offsets import DateOffset
50
+ import pyarrow as pa
51
+ import pyarrow.parquet as pq
52
+ from pyspark.sql import functions as F, Column as PySparkColumn
53
+ from pyspark.sql.functions import pandas_udf
54
+ from pyspark.sql.types import (
55
+ ByteType,
56
+ ShortType,
57
+ IntegerType,
58
+ LongType,
59
+ FloatType,
60
+ DoubleType,
61
+ BooleanType,
62
+ TimestampType,
63
+ TimestampNTZType,
64
+ DecimalType,
65
+ StringType,
66
+ DateType,
67
+ StructType,
68
+ DataType,
69
+ )
70
+ from pyspark.sql.dataframe import DataFrame as PySparkDataFrame
71
+
72
+ from pyspark import pandas as ps
73
+ from pyspark.pandas._typing import Axis, Dtype, Label, Name
74
+ from pyspark.pandas.base import IndexOpsMixin
75
+ from pyspark.pandas.utils import (
76
+ align_diff_frames,
77
+ default_session,
78
+ is_name_like_tuple,
79
+ is_name_like_value,
80
+ name_like_string,
81
+ same_anchor,
82
+ scol_for,
83
+ validate_axis,
84
+ log_advice,
85
+ )
86
+ from pyspark.pandas.frame import DataFrame, _reduce_spark_multi
87
+ from pyspark.pandas.internal import (
88
+ InternalFrame,
89
+ DEFAULT_SERIES_NAME,
90
+ HIDDEN_COLUMNS,
91
+ SPARK_INDEX_NAME_FORMAT,
92
+ )
93
+ from pyspark.pandas.series import Series, first_series
94
+ from pyspark.pandas.spark.utils import as_nullable_spark_type, force_decimal_precision_scale
95
+ from pyspark.pandas.indexes import Index, DatetimeIndex, TimedeltaIndex
96
+ from pyspark.pandas.indexes.multi import MultiIndex
97
+
98
+ # For Supporting Spark Connect
99
+ from pyspark.sql.utils import get_column_class
100
+
101
+ __all__ = [
102
+ "from_pandas",
103
+ "range",
104
+ "read_csv",
105
+ "read_delta",
106
+ "read_table",
107
+ "read_spark_io",
108
+ "read_parquet",
109
+ "read_clipboard",
110
+ "read_excel",
111
+ "read_html",
112
+ "to_datetime",
113
+ "date_range",
114
+ "to_timedelta",
115
+ "timedelta_range",
116
+ "get_dummies",
117
+ "concat",
118
+ "melt",
119
+ "isna",
120
+ "isnull",
121
+ "notna",
122
+ "notnull",
123
+ "read_sql_table",
124
+ "read_sql_query",
125
+ "read_sql",
126
+ "read_json",
127
+ "merge",
128
+ "merge_asof",
129
+ "to_numeric",
130
+ "broadcast",
131
+ "read_orc",
132
+ ]
133
+
134
+
135
+ def from_pandas(pobj: Union[pd.DataFrame, pd.Series, pd.Index]) -> Union[Series, DataFrame, Index]:
136
+ """Create a pandas-on-Spark DataFrame, Series or Index from a pandas DataFrame, Series or Index.
137
+
138
+ This is similar to Spark's `SparkSession.createDataFrame()` with pandas DataFrame,
139
+ but this also works with pandas Series and picks the index.
140
+
141
+ Parameters
142
+ ----------
143
+ pobj : pandas.DataFrame or pandas.Series
144
+ pandas DataFrame or Series to read.
145
+
146
+ Returns
147
+ -------
148
+ Series or DataFrame
149
+ If a pandas Series is passed in, this function returns a pandas-on-Spark Series.
150
+ If a pandas DataFrame is passed in, this function returns a pandas-on-Spark DataFrame.
151
+ """
152
+ if isinstance(pobj, pd.Series):
153
+ return Series(pobj)
154
+ elif isinstance(pobj, pd.DataFrame):
155
+ return DataFrame(pobj)
156
+ elif isinstance(pobj, pd.Index):
157
+ return DataFrame(pd.DataFrame(index=pobj)).index
158
+ else:
159
+ raise TypeError("Unknown data type: {}".format(type(pobj).__name__))
160
+
161
+
162
+ _range = range # built-in range
163
+
164
+
165
+ def range(
166
+ start: int, end: Optional[int] = None, step: int = 1, num_partitions: Optional[int] = None
167
+ ) -> DataFrame:
168
+ """
169
+ Create a DataFrame with some range of numbers.
170
+
171
+ The resulting DataFrame has a single int64 column named `id`, containing elements in a range
172
+ from ``start`` to ``end`` (exclusive) with step value ``step``. If only the first parameter
173
+ (i.e. start) is specified, we treat it as the end value with the start value being 0.
174
+
175
+ This is like the range function in SparkSession and is used primarily for testing.
176
+
177
+ Parameters
178
+ ----------
179
+ start : int
180
+ the start value (inclusive)
181
+ end : int, optional
182
+ the end value (exclusive)
183
+ step : int, optional, default 1
184
+ the incremental step
185
+ num_partitions : int, optional
186
+ the number of partitions of the DataFrame
187
+
188
+ Returns
189
+ -------
190
+ DataFrame
191
+
192
+ Examples
193
+ --------
194
+ When the first parameter is specified, we generate a range of values up till that number.
195
+
196
+ >>> ps.range(5)
197
+ id
198
+ 0 0
199
+ 1 1
200
+ 2 2
201
+ 3 3
202
+ 4 4
203
+
204
+ When start, end, and step are specified:
205
+
206
+ >>> ps.range(start = 100, end = 200, step = 20)
207
+ id
208
+ 0 100
209
+ 1 120
210
+ 2 140
211
+ 3 160
212
+ 4 180
213
+ """
214
+ sdf = default_session().range(start=start, end=end, step=step, numPartitions=num_partitions)
215
+ return DataFrame(sdf)
216
+
217
+
218
+ def read_csv(
219
+ path: Union[str, List[str]],
220
+ sep: str = ",",
221
+ header: Union[str, int, None] = "infer",
222
+ names: Optional[Union[str, List[str]]] = None,
223
+ index_col: Optional[Union[str, List[str]]] = None,
224
+ usecols: Optional[Union[List[int], List[str], Callable[[str], bool]]] = None,
225
+ squeeze: bool = False,
226
+ mangle_dupe_cols: bool = True,
227
+ dtype: Optional[Union[str, Dtype, Dict[str, Union[str, Dtype]]]] = None,
228
+ nrows: Optional[int] = None,
229
+ parse_dates: bool = False,
230
+ quotechar: Optional[str] = None,
231
+ escapechar: Optional[str] = None,
232
+ comment: Optional[str] = None,
233
+ encoding: Optional[str] = None,
234
+ **options: Any,
235
+ ) -> Union[DataFrame, Series]:
236
+ """Read CSV (comma-separated) file into DataFrame or Series.
237
+
238
+ Parameters
239
+ ----------
240
+ path : str or list
241
+ Path(s) of the CSV file(s) to be read.
242
+ sep : str, default ‘,’
243
+ Delimiter to use. Non empty string.
244
+ header : int, default ‘infer’
245
+ Whether to use the column names, and the start of the data.
246
+ Default behavior is to infer the column names: if no names are passed
247
+ the behavior is identical to `header=0` and column names are inferred from
248
+ the first line of the file, if column names are passed explicitly then
249
+ the behavior is identical to `header=None`. Explicitly pass `header=0` to be
250
+ able to replace existing names
251
+ names : str or array-like, optional
252
+ List of column names to use. If file contains no header row, then you should
253
+ explicitly pass `header=None`. Duplicates in this list will cause an error to be issued.
254
+ If a string is given, it should be a DDL-formatted string in Spark SQL, which is
255
+ preferred to avoid schema inference for better performance.
256
+ index_col: str or list of str, optional, default: None
257
+ Index column of table in Spark.
258
+ usecols : list-like or callable, optional
259
+ Return a subset of the columns. If list-like, all elements must either be
260
+ positional (i.e. integer indices into the document columns) or strings that
261
+ correspond to column names provided either by the user in names or inferred
262
+ from the document header row(s).
263
+ If callable, the callable function will be evaluated against the column names,
264
+ returning names where the callable function evaluates to `True`.
265
+ squeeze : bool, default False
266
+ If the parsed data only contains one column then return a Series.
267
+
268
+ .. deprecated:: 3.4.0
269
+
270
+ mangle_dupe_cols : bool, default True
271
+ Duplicate columns will be specified as 'X0', 'X1', ... 'XN', rather
272
+ than 'X' ... 'X'. Passing in False will cause data to be overwritten if
273
+ there are duplicate names in the columns.
274
+ Currently only `True` is allowed.
275
+
276
+ .. deprecated:: 3.4.0
277
+
278
+ dtype : Type name or dict of column -> type, default None
279
+ Data type for data or columns. E.g. {‘a’: np.float64, ‘b’: np.int32} Use str or object
280
+ together with suitable na_values settings to preserve and not interpret dtype.
281
+ nrows : int, default None
282
+ Number of rows to read from the CSV file.
283
+ parse_dates : boolean or list of ints or names or list of lists or dict, default `False`.
284
+ Currently only `False` is allowed.
285
+ quotechar : str (length 1), optional
286
+ The character used to denote the start and end of a quoted item. Quoted items can include
287
+ the delimiter and it will be ignored.
288
+ escapechar : str (length 1), default None
289
+ One-character string used to escape other characters.
290
+ comment: str, optional
291
+ Indicates the line should not be parsed.
292
+ encoding: str, optional
293
+ Indicates the encoding to read file
294
+ options : dict
295
+ All other options passed directly into Spark's data source.
296
+
297
+ Returns
298
+ -------
299
+ DataFrame or Series
300
+
301
+ See Also
302
+ --------
303
+ DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.
304
+
305
+ Examples
306
+ --------
307
+ >>> ps.read_csv('data.csv') # doctest: +SKIP
308
+
309
+ Load multiple CSV files as a single DataFrame:
310
+
311
+ >>> ps.read_csv(['data-01.csv', 'data-02.csv']) # doctest: +SKIP
312
+ """
313
+ # For latin-1 encoding is same as iso-8859-1, that's why its mapped to iso-8859-1.
314
+ encoding_mapping = {"latin-1": "iso-8859-1"}
315
+
316
+ if "options" in options and isinstance(options.get("options"), dict) and len(options) == 1:
317
+ options = options.get("options")
318
+
319
+ if mangle_dupe_cols is not True:
320
+ raise ValueError("mangle_dupe_cols can only be `True`: %s" % mangle_dupe_cols)
321
+ if parse_dates is not False:
322
+ raise ValueError("parse_dates can only be `False`: %s" % parse_dates)
323
+
324
+ if usecols is not None and not callable(usecols):
325
+ usecols = list(usecols) # type: ignore[assignment]
326
+
327
+ if usecols is None or callable(usecols) or len(usecols) > 0:
328
+ reader = default_session().read
329
+ reader.option("inferSchema", True)
330
+ reader.option("sep", sep)
331
+
332
+ if header == "infer":
333
+ header = 0 if names is None else None
334
+ if header == 0:
335
+ reader.option("header", True)
336
+ elif header is None:
337
+ reader.option("header", False)
338
+ else:
339
+ raise ValueError("Unknown header argument {}".format(header))
340
+
341
+ if quotechar is not None:
342
+ reader.option("quote", quotechar)
343
+ if escapechar is not None:
344
+ reader.option("escape", escapechar)
345
+
346
+ if comment is not None:
347
+ if not isinstance(comment, str) or len(comment) != 1:
348
+ raise ValueError("Only length-1 comment characters supported")
349
+ reader.option("comment", comment)
350
+
351
+ reader.options(**options)
352
+
353
+ if encoding is not None:
354
+ reader.option("encoding", encoding_mapping.get(encoding, encoding))
355
+
356
+ column_labels: Dict[Any, str]
357
+ if isinstance(names, str):
358
+ sdf = reader.schema(names).csv(path)
359
+ column_labels = {col: col for col in sdf.columns}
360
+ else:
361
+ sdf = reader.csv(path)
362
+ if is_list_like(names):
363
+ names = list(names)
364
+ if len(set(names)) != len(names):
365
+ raise ValueError("Found non-unique column index")
366
+ if len(names) != len(sdf.columns):
367
+ raise ValueError(
368
+ "The number of names [%s] does not match the number "
369
+ "of columns [%d]. Try names by a Spark SQL DDL-formatted "
370
+ "string." % (len(sdf.schema), len(names))
371
+ )
372
+ column_labels = dict(zip(names, sdf.columns))
373
+ elif header is None:
374
+ column_labels = dict(enumerate(sdf.columns))
375
+ else:
376
+ column_labels = {col: col for col in sdf.columns}
377
+
378
+ if usecols is not None:
379
+ missing: List[Union[int, str]]
380
+ if callable(usecols):
381
+ column_labels = {
382
+ label: col for label, col in column_labels.items() if usecols(label)
383
+ }
384
+ missing = []
385
+ elif all(isinstance(col, int) for col in usecols):
386
+ usecols_ints = cast(List[int], usecols)
387
+ new_column_labels = {
388
+ label: col
389
+ for i, (label, col) in enumerate(column_labels.items())
390
+ if i in usecols_ints
391
+ }
392
+ missing = [
393
+ col
394
+ for col in usecols_ints
395
+ if (
396
+ col >= len(column_labels)
397
+ or list(column_labels)[col] not in new_column_labels
398
+ )
399
+ ]
400
+ column_labels = new_column_labels
401
+ elif all(isinstance(col, str) for col in usecols):
402
+ new_column_labels = {
403
+ label: col for label, col in column_labels.items() if label in usecols
404
+ }
405
+ missing = [col for col in usecols if col not in new_column_labels]
406
+ column_labels = new_column_labels
407
+ else:
408
+ raise ValueError(
409
+ "'usecols' must either be list-like of all strings, "
410
+ "all unicode, all integers or a callable."
411
+ )
412
+ if len(missing) > 0:
413
+ raise ValueError(
414
+ "Usecols do not match columns, columns expected but not " "found: %s" % missing
415
+ )
416
+
417
+ if len(column_labels) > 0:
418
+ sdf = sdf.select([scol_for(sdf, col) for col in column_labels.values()])
419
+ else:
420
+ sdf = default_session().createDataFrame([], schema=StructType())
421
+ else:
422
+ sdf = default_session().createDataFrame([], schema=StructType())
423
+ column_labels = {}
424
+
425
+ if nrows is not None:
426
+ sdf = sdf.limit(nrows)
427
+
428
+ index_spark_column_names: List[str]
429
+ index_names: List[Label]
430
+ if index_col is not None:
431
+ if isinstance(index_col, (str, int)):
432
+ index_col = [index_col]
433
+ for col in index_col:
434
+ if col not in column_labels:
435
+ raise KeyError(col)
436
+ index_spark_column_names = [column_labels[col] for col in index_col]
437
+ index_names = [(col,) for col in index_col]
438
+ column_labels = {
439
+ label: col for label, col in column_labels.items() if label not in index_col
440
+ }
441
+ else:
442
+ log_advice(
443
+ "If `index_col` is not specified for `read_csv`, "
444
+ "the default index is attached which can cause additional overhead."
445
+ )
446
+ index_spark_column_names = []
447
+ index_names = []
448
+
449
+ psdf: DataFrame = DataFrame(
450
+ InternalFrame(
451
+ spark_frame=sdf,
452
+ index_spark_columns=[scol_for(sdf, col) for col in index_spark_column_names],
453
+ index_names=index_names,
454
+ column_labels=[
455
+ label if is_name_like_tuple(label) else (label,) for label in column_labels
456
+ ],
457
+ data_spark_columns=[scol_for(sdf, col) for col in column_labels.values()],
458
+ )
459
+ )
460
+
461
+ if dtype is not None:
462
+ if isinstance(dtype, dict):
463
+ for col, tpe in dtype.items():
464
+ psdf[col] = psdf[col].astype(tpe)
465
+ else:
466
+ for col in psdf.columns:
467
+ psdf[col] = psdf[col].astype(dtype)
468
+
469
+ if squeeze and len(psdf.columns) == 1:
470
+ return first_series(psdf)
471
+ else:
472
+ return psdf
473
+
474
+
475
+ def read_json(
476
+ path: str, lines: bool = True, index_col: Optional[Union[str, List[str]]] = None, **options: Any
477
+ ) -> DataFrame:
478
+ """
479
+ Convert a JSON string to DataFrame.
480
+
481
+ Parameters
482
+ ----------
483
+ path : string
484
+ File path
485
+ lines : bool, default True
486
+ Read the file as a JSON object per line. It should be always True for now.
487
+ index_col : str or list of str, optional, default: None
488
+ Index column of table in Spark.
489
+ options : dict
490
+ All other options passed directly into Spark's data source.
491
+
492
+ Examples
493
+ --------
494
+ >>> df = ps.DataFrame([['a', 'b'], ['c', 'd']],
495
+ ... columns=['col 1', 'col 2'])
496
+
497
+ >>> df.to_json(path=r'%s/read_json/foo.json' % path, num_files=1)
498
+ >>> ps.read_json(
499
+ ... path=r'%s/read_json/foo.json' % path
500
+ ... ).sort_values(by="col 1")
501
+ col 1 col 2
502
+ 0 a b
503
+ 1 c d
504
+
505
+ >>> df.to_json(path=r'%s/read_json/foo.json' % path, num_files=1, lineSep='___')
506
+ >>> ps.read_json(
507
+ ... path=r'%s/read_json/foo.json' % path, lineSep='___'
508
+ ... ).sort_values(by="col 1")
509
+ col 1 col 2
510
+ 0 a b
511
+ 1 c d
512
+
513
+ You can preserve the index in the roundtrip as below.
514
+
515
+ >>> df.to_json(path=r'%s/read_json/bar.json' % path, num_files=1, index_col="index")
516
+ >>> ps.read_json(
517
+ ... path=r'%s/read_json/bar.json' % path, index_col="index"
518
+ ... ).sort_values(by="col 1") # doctest: +NORMALIZE_WHITESPACE
519
+ col 1 col 2
520
+ index
521
+ 0 a b
522
+ 1 c d
523
+ """
524
+ if index_col is None:
525
+ log_advice(
526
+ "If `index_col` is not specified for `read_json`, "
527
+ "the default index is attached which can cause additional overhead."
528
+ )
529
+ if "options" in options and isinstance(options.get("options"), dict) and len(options) == 1:
530
+ options = options.get("options")
531
+
532
+ if not lines:
533
+ raise NotImplementedError("lines=False is not implemented yet.")
534
+
535
+ return read_spark_io(path, format="json", index_col=index_col, **options)
536
+
537
+
538
+ def read_delta(
539
+ path: str,
540
+ version: Optional[str] = None,
541
+ timestamp: Optional[str] = None,
542
+ index_col: Optional[Union[str, List[str]]] = None,
543
+ **options: Any,
544
+ ) -> DataFrame:
545
+ """
546
+ Read a Delta Lake table on some file system and return a DataFrame.
547
+
548
+ If the Delta Lake table is already stored in the catalog (aka the metastore), use 'read_table'.
549
+
550
+ Parameters
551
+ ----------
552
+ path : string
553
+ Path to the Delta Lake table.
554
+ version : string, optional
555
+ Specifies the table version (based on Delta's internal transaction version) to read from,
556
+ using Delta's time travel feature. This sets Delta's 'versionAsOf' option. Note that
557
+ this parameter and `timestamp` parameter cannot be used together, otherwise it will raise a
558
+ `ValueError`.
559
+ timestamp : string, optional
560
+ Specifies the table version (based on timestamp) to read from,
561
+ using Delta's time travel feature. This must be a valid date or timestamp string in Spark,
562
+ and sets Delta's 'timestampAsOf' option. Note that this parameter and `version` parameter
563
+ cannot be used together, otherwise it will raise a `ValueError`.
564
+ index_col : str or list of str, optional, default: None
565
+ Index column of table in Spark.
566
+ options
567
+ Additional options that can be passed onto Delta.
568
+
569
+ Returns
570
+ -------
571
+ DataFrame
572
+
573
+ See Also
574
+ --------
575
+ DataFrame.to_delta
576
+ read_table
577
+ read_spark_io
578
+ read_parquet
579
+
580
+ Examples
581
+ --------
582
+ >>> ps.range(1).to_delta('%s/read_delta/foo' % path) # doctest: +SKIP
583
+ >>> ps.read_delta('%s/read_delta/foo' % path) # doctest: +SKIP
584
+ id
585
+ 0 0
586
+
587
+ >>> ps.range(10, 15, num_partitions=1).to_delta('%s/read_delta/foo' % path,
588
+ ... mode='overwrite') # doctest: +SKIP
589
+ >>> ps.read_delta('%s/read_delta/foo' % path) # doctest: +SKIP
590
+ id
591
+ 0 10
592
+ 1 11
593
+ 2 12
594
+ 3 13
595
+ 4 14
596
+
597
+ >>> ps.read_delta('%s/read_delta/foo' % path, version=0) # doctest: +SKIP
598
+ id
599
+ 0 0
600
+
601
+ You can preserve the index in the roundtrip as below.
602
+
603
+ >>> ps.range(10, 15, num_partitions=1).to_delta(
604
+ ... '%s/read_delta/bar' % path, index_col="index") # doctest: +SKIP
605
+ >>> ps.read_delta('%s/read_delta/bar' % path, index_col="index") # doctest: +SKIP
606
+ id
607
+ index
608
+ 0 10
609
+ 1 11
610
+ 2 12
611
+ 3 13
612
+ 4 14
613
+ """
614
+ if index_col is None:
615
+ log_advice(
616
+ "If `index_col` is not specified for `read_delta`, "
617
+ "the default index is attached which can cause additional overhead."
618
+ )
619
+ if version is not None and timestamp is not None:
620
+ raise ValueError("version and timestamp cannot be used together.")
621
+ if "options" in options and isinstance(options.get("options"), dict) and len(options) == 1:
622
+ options = options.get("options")
623
+
624
+ if version is not None:
625
+ options["versionAsOf"] = version
626
+ if timestamp is not None:
627
+ options["timestampAsOf"] = timestamp
628
+ return read_spark_io(path, format="delta", index_col=index_col, **options)
629
+
630
+
631
+ def read_table(name: str, index_col: Optional[Union[str, List[str]]] = None) -> DataFrame:
632
+ """
633
+ Read a Spark table and return a DataFrame.
634
+
635
+ Parameters
636
+ ----------
637
+ name : string
638
+ Table name in Spark.
639
+
640
+ index_col : str or list of str, optional, default: None
641
+ Index column of table in Spark.
642
+
643
+ Returns
644
+ -------
645
+ DataFrame
646
+
647
+ See Also
648
+ --------
649
+ DataFrame.to_table
650
+ read_delta
651
+ read_parquet
652
+ read_spark_io
653
+
654
+ Examples
655
+ --------
656
+ >>> ps.range(1).to_table('%s.my_table' % db)
657
+ >>> ps.read_table('%s.my_table' % db)
658
+ id
659
+ 0 0
660
+
661
+ >>> ps.range(1).to_table('%s.my_table' % db, index_col="index")
662
+ >>> ps.read_table('%s.my_table' % db, index_col="index") # doctest: +NORMALIZE_WHITESPACE
663
+ id
664
+ index
665
+ 0 0
666
+ """
667
+ if index_col is None:
668
+ log_advice(
669
+ "If `index_col` is not specified for `read_table`, "
670
+ "the default index is attached which can cause additional overhead."
671
+ )
672
+ sdf = default_session().read.table(name)
673
+ index_spark_columns, index_names = _get_index_map(sdf, index_col)
674
+
675
+ return DataFrame(
676
+ InternalFrame(
677
+ spark_frame=sdf, index_spark_columns=index_spark_columns, index_names=index_names
678
+ )
679
+ )
680
+
681
+
682
+ def read_spark_io(
683
+ path: Optional[str] = None,
684
+ format: Optional[str] = None,
685
+ schema: Union[str, "StructType"] = None,
686
+ index_col: Optional[Union[str, List[str]]] = None,
687
+ **options: Any,
688
+ ) -> DataFrame:
689
+ """Load a DataFrame from a Spark data source.
690
+
691
+ Parameters
692
+ ----------
693
+ path : string, optional
694
+ Path to the data source.
695
+ format : string, optional
696
+ Specifies the output data source format. Some common ones are:
697
+
698
+ - 'delta'
699
+ - 'parquet'
700
+ - 'orc'
701
+ - 'json'
702
+ - 'csv'
703
+ schema : string or StructType, optional
704
+ Input schema. If none, Spark tries to infer the schema automatically.
705
+ The schema can either be a Spark StructType, or a DDL-formatted string like
706
+ `col0 INT, col1 DOUBLE`.
707
+ index_col : str or list of str, optional, default: None
708
+ Index column of table in Spark.
709
+ options : dict
710
+ All other options passed directly into Spark's data source.
711
+
712
+ See Also
713
+ --------
714
+ DataFrame.to_spark_io
715
+ DataFrame.read_table
716
+ DataFrame.read_delta
717
+ DataFrame.read_parquet
718
+
719
+ Examples
720
+ --------
721
+ >>> ps.range(1).to_spark_io('%s/read_spark_io/data.parquet' % path)
722
+ >>> ps.read_spark_io(
723
+ ... '%s/read_spark_io/data.parquet' % path, format='parquet', schema='id long')
724
+ id
725
+ 0 0
726
+
727
+ >>> ps.range(10, 15, num_partitions=1).to_spark_io('%s/read_spark_io/data.json' % path,
728
+ ... format='json', lineSep='__')
729
+ >>> ps.read_spark_io(
730
+ ... '%s/read_spark_io/data.json' % path, format='json', schema='id long', lineSep='__')
731
+ id
732
+ 0 10
733
+ 1 11
734
+ 2 12
735
+ 3 13
736
+ 4 14
737
+
738
+ You can preserve the index in the roundtrip as below.
739
+
740
+ >>> ps.range(10, 15, num_partitions=1).to_spark_io('%s/read_spark_io/data.orc' % path,
741
+ ... format='orc', index_col="index")
742
+ >>> ps.read_spark_io(
743
+ ... path=r'%s/read_spark_io/data.orc' % path, format="orc", index_col="index")
744
+ ... # doctest: +NORMALIZE_WHITESPACE
745
+ id
746
+ index
747
+ 0 10
748
+ 1 11
749
+ 2 12
750
+ 3 13
751
+ 4 14
752
+ """
753
+ if "options" in options and isinstance(options.get("options"), dict) and len(options) == 1:
754
+ options = options.get("options")
755
+
756
+ sdf = default_session().read.load(path=path, format=format, schema=schema, **options)
757
+ index_spark_columns, index_names = _get_index_map(sdf, index_col)
758
+
759
+ return DataFrame(
760
+ InternalFrame(
761
+ spark_frame=sdf, index_spark_columns=index_spark_columns, index_names=index_names
762
+ )
763
+ )
764
+
765
+
766
+ def read_parquet(
767
+ path: str,
768
+ columns: Optional[List[str]] = None,
769
+ index_col: Optional[List[str]] = None,
770
+ pandas_metadata: bool = False,
771
+ **options: Any,
772
+ ) -> DataFrame:
773
+ """Load a parquet object from the file path, returning a DataFrame.
774
+
775
+ Parameters
776
+ ----------
777
+ path : string
778
+ File path
779
+ columns : list, default=None
780
+ If not None, only these columns will be read from the file.
781
+ index_col : str or list of str, optional, default: None
782
+ Index column of table in Spark.
783
+ pandas_metadata : bool, default: False
784
+ If True, try to respect the metadata if the Parquet file is written from pandas.
785
+ options : dict
786
+ All other options passed directly into Spark's data source.
787
+
788
+ Returns
789
+ -------
790
+ DataFrame
791
+
792
+ See Also
793
+ --------
794
+ DataFrame.to_parquet
795
+ DataFrame.read_table
796
+ DataFrame.read_delta
797
+ DataFrame.read_spark_io
798
+
799
+ Examples
800
+ --------
801
+ >>> ps.range(1).to_parquet('%s/read_spark_io/data.parquet' % path)
802
+ >>> ps.read_parquet('%s/read_spark_io/data.parquet' % path, columns=['id'])
803
+ id
804
+ 0 0
805
+
806
+ You can preserve the index in the roundtrip as below.
807
+
808
+ >>> ps.range(1).to_parquet('%s/read_spark_io/data.parquet' % path, index_col="index")
809
+ >>> ps.read_parquet('%s/read_spark_io/data.parquet' % path, columns=['id'], index_col="index")
810
+ ... # doctest: +NORMALIZE_WHITESPACE
811
+ id
812
+ index
813
+ 0 0
814
+ """
815
+ if index_col is None:
816
+ log_advice(
817
+ "If `index_col` is not specified for `read_parquet`, "
818
+ "the default index is attached which can cause additional overhead."
819
+ )
820
+ if "options" in options and isinstance(options.get("options"), dict) and len(options) == 1:
821
+ options = options.get("options")
822
+
823
+ if columns is not None:
824
+ columns = list(columns)
825
+
826
+ index_names = None
827
+
828
+ if index_col is None and pandas_metadata:
829
+ # Try to read pandas metadata
830
+
831
+ @pandas_udf( # type: ignore[call-overload]
832
+ "index_col array<string>, index_names array<string>"
833
+ )
834
+ def read_index_metadata(pser: pd.Series) -> pd.DataFrame:
835
+ binary = pser.iloc[0]
836
+ metadata = pq.ParquetFile(pa.BufferReader(binary)).metadata.metadata
837
+ if b"pandas" in metadata:
838
+ pandas_metadata = json.loads(metadata[b"pandas"].decode("utf8"))
839
+ if all(isinstance(col, str) for col in pandas_metadata["index_columns"]):
840
+ index_col = []
841
+ index_names = []
842
+ for col in pandas_metadata["index_columns"]:
843
+ index_col.append(col)
844
+ for column in pandas_metadata["columns"]:
845
+ if column["field_name"] == col:
846
+ index_names.append(column["name"])
847
+ break
848
+ else:
849
+ index_names.append(None)
850
+ return pd.DataFrame({"index_col": [index_col], "index_names": [index_names]})
851
+ return pd.DataFrame({"index_col": [None], "index_names": [None]})
852
+
853
+ index_col, index_names = (
854
+ default_session()
855
+ .read.format("binaryFile")
856
+ .load(path)
857
+ .limit(1)
858
+ .select(read_index_metadata("content").alias("index_metadata"))
859
+ .select("index_metadata.*")
860
+ .head()
861
+ )
862
+
863
+ psdf = read_spark_io(path=path, format="parquet", options=options, index_col=index_col)
864
+
865
+ if columns is not None:
866
+ new_columns = [c for c in columns if c in psdf.columns]
867
+ if len(new_columns) > 0:
868
+ psdf = psdf[new_columns]
869
+ else:
870
+ sdf = default_session().createDataFrame([], schema=StructType())
871
+ index_spark_columns, index_names = _get_index_map(sdf, index_col)
872
+ psdf = DataFrame(
873
+ InternalFrame(
874
+ spark_frame=sdf,
875
+ index_spark_columns=index_spark_columns,
876
+ index_names=index_names,
877
+ )
878
+ )
879
+
880
+ if index_names is not None:
881
+ psdf.index.names = index_names
882
+
883
+ return psdf
884
+
885
+
886
+ def read_clipboard(sep: str = r"\s+", **kwargs: Any) -> DataFrame:
887
+ r"""
888
+ Read text from clipboard and pass to read_csv. See read_csv for the
889
+ full argument list
890
+
891
+ Parameters
892
+ ----------
893
+ sep : str, default '\s+'
894
+ A string or regex delimiter. The default of '\s+' denotes
895
+ one or more whitespace characters.
896
+
897
+ See Also
898
+ --------
899
+ DataFrame.to_clipboard : Write text out to clipboard.
900
+
901
+ Returns
902
+ -------
903
+ parsed : DataFrame
904
+ """
905
+ return cast(DataFrame, from_pandas(pd.read_clipboard(sep, **kwargs)))
906
+
907
+
908
+ def read_excel(
909
+ io: Union[str, Any],
910
+ sheet_name: Union[str, int, List[Union[str, int]], None] = 0,
911
+ header: Union[int, List[int]] = 0,
912
+ names: Optional[List] = None,
913
+ index_col: Optional[List[int]] = None,
914
+ usecols: Optional[Union[int, str, List[Union[int, str]], Callable[[str], bool]]] = None,
915
+ squeeze: bool = False,
916
+ dtype: Optional[Dict[str, Union[str, Dtype]]] = None,
917
+ engine: Optional[str] = None,
918
+ converters: Optional[Dict] = None,
919
+ true_values: Optional[Any] = None,
920
+ false_values: Optional[Any] = None,
921
+ skiprows: Optional[Union[int, List[int]]] = None,
922
+ nrows: Optional[int] = None,
923
+ na_values: Optional[Any] = None,
924
+ keep_default_na: bool = True,
925
+ verbose: bool = False,
926
+ parse_dates: Union[bool, List, Dict] = False,
927
+ date_parser: Optional[Callable] = None,
928
+ thousands: Optional[str] = None,
929
+ comment: Optional[str] = None,
930
+ skipfooter: int = 0,
931
+ convert_float: bool = True,
932
+ mangle_dupe_cols: bool = True,
933
+ **kwds: Any,
934
+ ) -> Union[DataFrame, Series, Dict[str, Union[DataFrame, Series]]]:
935
+ """
936
+ Read an Excel file into a pandas-on-Spark DataFrame or Series.
937
+
938
+ Support both `xls` and `xlsx` file extensions from a local filesystem or URL.
939
+ Support an option to read a single sheet or a list of sheets.
940
+
941
+ Parameters
942
+ ----------
943
+ io : str, file descriptor, pathlib.Path, ExcelFile or xlrd.Book
944
+ The string could be a URL. The value URL must be available in Spark's DataFrameReader.
945
+
946
+ .. note::
947
+ If the underlying Spark is below 3.0, the parameter as a string is not supported.
948
+ You can use `ps.from_pandas(pd.read_excel(...))` as a workaround.
949
+
950
+ sheet_name : str, int, list, or None, default 0
951
+ Strings are used for sheet names. Integers are used in zero-indexed
952
+ sheet positions. Lists of strings/integers are used to request
953
+ multiple sheets. Specify None to get all sheets.
954
+
955
+ Available cases:
956
+
957
+ * Defaults to ``0``: 1st sheet as a `DataFrame`
958
+ * ``1``: 2nd sheet as a `DataFrame`
959
+ * ``"Sheet1"``: Load sheet with name "Sheet1"
960
+ * ``[0, 1, "Sheet5"]``: Load first, second and sheet named "Sheet5"
961
+ as a dict of `DataFrame`
962
+ * None: All sheets.
963
+
964
+ header : int, list of int, default 0
965
+ Row (0-indexed) to use for the column labels of the parsed
966
+ DataFrame. If a list of integers is passed those row positions will
967
+ be combined into a ``MultiIndex``. Use None if there is no header.
968
+ names : array-like, default None
969
+ List of column names to use. If file contains no header row,
970
+ then you should explicitly pass header=None.
971
+ index_col : int, list of int, default None
972
+ Column (0-indexed) to use as the row labels of the DataFrame.
973
+ Pass None if there is no such column. If a list is passed,
974
+ those columns will be combined into a ``MultiIndex``. If a
975
+ subset of data is selected with ``usecols``, index_col
976
+ is based on the subset.
977
+ usecols : int, str, list-like, or callable default None
978
+ Return a subset of the columns.
979
+
980
+ * If None, then parse all columns.
981
+ * If str, then indicates comma separated list of Excel column letters
982
+ and column ranges (e.g. "A:E" or "A,C,E:F"). Ranges are inclusive of
983
+ both sides.
984
+ * If list of int, then indicates list of column numbers to be parsed.
985
+ * If list of string, then indicates list of column names to be parsed.
986
+ * If callable, then evaluate each column name against it and parse the
987
+ column if the callable returns ``True``.
988
+ squeeze : bool, default False
989
+ If the parsed data only contains one column then return a Series.
990
+
991
+ .. deprecated:: 3.4.0
992
+
993
+ dtype : Type name or dict of column -> type, default None
994
+ Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32}
995
+ Use `object` to preserve data as stored in Excel and not interpret dtype.
996
+ If converters are specified, they will be applied INSTEAD
997
+ of dtype conversion.
998
+ engine : str, default None
999
+ If io is not a buffer or path, this must be set to identify io.
1000
+ Acceptable values are None or xlrd.
1001
+ converters : dict, default None
1002
+ Dict of functions for converting values in certain columns. Keys can
1003
+ either be integers or column labels, values are functions that take one
1004
+ input argument, the Excel cell content, and return the transformed
1005
+ content.
1006
+ true_values : list, default None
1007
+ Values to consider as True.
1008
+ false_values : list, default None
1009
+ Values to consider as False.
1010
+ skiprows : list-like
1011
+ Rows to skip at the beginning (0-indexed).
1012
+ nrows : int, default None
1013
+ Number of rows to parse.
1014
+ na_values : scalar, str, list-like, or dict, default None
1015
+ Additional strings to recognize as NA/NaN. If dict passed, specific
1016
+ per-column NA values. By default the following values are interpreted
1017
+ as NaN.
1018
+ keep_default_na : bool, default True
1019
+ If na_values are specified and keep_default_na is False the default NaN
1020
+ values are overridden, otherwise they're appended to.
1021
+ verbose : bool, default False
1022
+ Indicate number of NA values placed in non-numeric columns.
1023
+ parse_dates : bool, list-like, or dict, default False
1024
+ The behavior is as follows:
1025
+
1026
+ * bool. If True -> try parsing the index.
1027
+ * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3
1028
+ each as a separate date column.
1029
+ * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as
1030
+ a single date column.
1031
+ * dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call
1032
+ result 'foo'
1033
+
1034
+ If a column or index contains an unparseable date, the entire column or
1035
+ index will be returned unaltered as an object data type. For non-standard
1036
+ datetime parsing, use ``pd.to_datetime`` after ``pd.read_csv``
1037
+
1038
+ Note: A fast-path exists for iso8601-formatted dates.
1039
+ date_parser : function, optional
1040
+ Function to use for converting a sequence of string columns to an array of
1041
+ datetime instances. The default uses ``dateutil.parser.parser`` to do the
1042
+ conversion. pandas-on-Spark will try to call `date_parser` in three different ways,
1043
+ advancing to the next if an exception occurs: 1) Pass one or more arrays
1044
+ (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the
1045
+ string values from the columns defined by `parse_dates` into a single array
1046
+ and pass that; and 3) call `date_parser` once for each row using one or
1047
+ more strings (corresponding to the columns defined by `parse_dates`) as
1048
+ arguments.
1049
+ thousands : str, default None
1050
+ Thousands separator for parsing string columns to numeric. Note that
1051
+ this parameter is only necessary for columns stored as TEXT in Excel,
1052
+ any numeric columns will automatically be parsed, regardless of display
1053
+ format.
1054
+ comment : str, default None
1055
+ Comments out remainder of line. Pass a character or characters to this
1056
+ argument to indicate comments in the input file. Any data between the
1057
+ comment string and the end of the current line is ignored.
1058
+ skipfooter : int, default 0
1059
+ Rows at the end to skip (0-indexed).
1060
+ convert_float : bool, default True
1061
+ Convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric
1062
+ data will be read in as floats: Excel stores all numbers as floats
1063
+ internally.
1064
+
1065
+ .. deprecated:: 3.4.0
1066
+
1067
+ mangle_dupe_cols : bool, default True
1068
+ Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than
1069
+ 'X'...'X'. Passing in False will cause data to be overwritten if there
1070
+ are duplicate names in the columns.
1071
+
1072
+ .. deprecated:: 3.4.0
1073
+
1074
+ **kwds : optional
1075
+ Optional keyword arguments can be passed to ``TextFileReader``.
1076
+
1077
+ Returns
1078
+ -------
1079
+ DataFrame or dict of DataFrames
1080
+ DataFrame from the passed in Excel file. See notes in sheet_name
1081
+ argument for more information on when a dict of DataFrames is returned.
1082
+
1083
+ See Also
1084
+ --------
1085
+ DataFrame.to_excel : Write DataFrame to an Excel file.
1086
+ DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.
1087
+ read_csv : Read a comma-separated values (csv) file into DataFrame.
1088
+
1089
+ Examples
1090
+ --------
1091
+ The file can be read using the file name as string or an open file object:
1092
+
1093
+ >>> ps.read_excel('tmp.xlsx', index_col=0) # doctest: +SKIP
1094
+ Name Value
1095
+ 0 string1 1
1096
+ 1 string2 2
1097
+ 2 #Comment 3
1098
+
1099
+ >>> ps.read_excel(open('tmp.xlsx', 'rb'),
1100
+ ... sheet_name='Sheet3') # doctest: +SKIP
1101
+ Unnamed: 0 Name Value
1102
+ 0 0 string1 1
1103
+ 1 1 string2 2
1104
+ 2 2 #Comment 3
1105
+
1106
+ Index and header can be specified via the `index_col` and `header` arguments
1107
+
1108
+ >>> ps.read_excel('tmp.xlsx', index_col=None, header=None) # doctest: +SKIP
1109
+ 0 1 2
1110
+ 0 NaN Name Value
1111
+ 1 0.0 string1 1
1112
+ 2 1.0 string2 2
1113
+ 3 2.0 #Comment 3
1114
+
1115
+ Column types are inferred but can be explicitly specified
1116
+
1117
+ >>> ps.read_excel('tmp.xlsx', index_col=0,
1118
+ ... dtype={'Name': str, 'Value': float}) # doctest: +SKIP
1119
+ Name Value
1120
+ 0 string1 1.0
1121
+ 1 string2 2.0
1122
+ 2 #Comment 3.0
1123
+
1124
+ True, False, and NA values, and thousands separators have defaults,
1125
+ but can be explicitly specified, too. Supply the values you would like
1126
+ as strings or lists of strings!
1127
+
1128
+ >>> ps.read_excel('tmp.xlsx', index_col=0,
1129
+ ... na_values=['string1', 'string2']) # doctest: +SKIP
1130
+ Name Value
1131
+ 0 None 1
1132
+ 1 None 2
1133
+ 2 #Comment 3
1134
+
1135
+ Comment lines in the excel input file can be skipped using the `comment` kwarg
1136
+
1137
+ >>> ps.read_excel('tmp.xlsx', index_col=0, comment='#') # doctest: +SKIP
1138
+ Name Value
1139
+ 0 string1 1.0
1140
+ 1 string2 2.0
1141
+ 2 None NaN
1142
+ """
1143
+
1144
+ def pd_read_excel(
1145
+ io_or_bin: Any, sn: Union[str, int, List[Union[str, int]], None], sq: bool
1146
+ ) -> pd.DataFrame:
1147
+ return pd.read_excel(
1148
+ io=BytesIO(io_or_bin) if isinstance(io_or_bin, (bytes, bytearray)) else io_or_bin,
1149
+ sheet_name=sn,
1150
+ header=header,
1151
+ names=names,
1152
+ index_col=index_col,
1153
+ usecols=usecols,
1154
+ squeeze=sq,
1155
+ dtype=dtype,
1156
+ engine=engine,
1157
+ converters=converters,
1158
+ true_values=true_values,
1159
+ false_values=false_values,
1160
+ skiprows=skiprows,
1161
+ nrows=nrows,
1162
+ na_values=na_values,
1163
+ keep_default_na=keep_default_na,
1164
+ verbose=verbose,
1165
+ parse_dates=parse_dates, # type: ignore[arg-type]
1166
+ date_parser=date_parser,
1167
+ thousands=thousands,
1168
+ comment=comment,
1169
+ skipfooter=skipfooter,
1170
+ convert_float=convert_float,
1171
+ mangle_dupe_cols=mangle_dupe_cols,
1172
+ **kwds,
1173
+ )
1174
+
1175
+ if isinstance(io, str):
1176
+ # 'binaryFile' format is available since Spark 3.0.0.
1177
+ binaries = default_session().read.format("binaryFile").load(io).select("content").head(2)
1178
+ io_or_bin = binaries[0][0]
1179
+ single_file = len(binaries) == 1
1180
+ else:
1181
+ io_or_bin = io
1182
+ single_file = True
1183
+
1184
+ pdf_or_psers = pd_read_excel(io_or_bin, sn=sheet_name, sq=squeeze)
1185
+
1186
+ if single_file:
1187
+ if isinstance(pdf_or_psers, dict):
1188
+ return {
1189
+ sn: cast(Union[DataFrame, Series], from_pandas(pdf_or_pser))
1190
+ for sn, pdf_or_pser in pdf_or_psers.items()
1191
+ }
1192
+ else:
1193
+ return cast(Union[DataFrame, Series], from_pandas(pdf_or_psers))
1194
+ else:
1195
+
1196
+ def read_excel_on_spark(
1197
+ pdf_or_pser: Union[pd.DataFrame, pd.Series],
1198
+ sn: Union[str, int, List[Union[str, int]], None],
1199
+ ) -> Union[DataFrame, Series]:
1200
+ if isinstance(pdf_or_pser, pd.Series):
1201
+ pdf = pdf_or_pser.to_frame()
1202
+ else:
1203
+ pdf = pdf_or_pser
1204
+
1205
+ psdf = cast(DataFrame, from_pandas(pdf))
1206
+ return_schema = force_decimal_precision_scale(
1207
+ as_nullable_spark_type(psdf._internal.spark_frame.drop(*HIDDEN_COLUMNS).schema)
1208
+ )
1209
+
1210
+ def output_func(pdf: pd.DataFrame) -> pd.DataFrame:
1211
+ pdf = pd.concat(
1212
+ [pd_read_excel(bin, sn=sn, sq=False) for bin in pdf[pdf.columns[0]]]
1213
+ )
1214
+
1215
+ reset_index = pdf.reset_index()
1216
+ for name, col in reset_index.items():
1217
+ dt = col.dtype
1218
+ if is_datetime64_dtype(dt) or is_datetime64tz_dtype(dt):
1219
+ continue
1220
+ reset_index[name] = col.replace({np.nan: None})
1221
+ pdf = reset_index
1222
+
1223
+ # Just positionally map the column names to given schema's.
1224
+ return pdf.rename(columns=dict(zip(pdf.columns, return_schema.names)))
1225
+
1226
+ sdf = (
1227
+ default_session()
1228
+ .read.format("binaryFile")
1229
+ .load(io)
1230
+ .select("content")
1231
+ .mapInPandas(lambda iterator: map(output_func, iterator), schema=return_schema)
1232
+ )
1233
+
1234
+ psdf = DataFrame(psdf._internal.with_new_sdf(sdf))
1235
+ if squeeze and len(psdf.columns) == 1:
1236
+ return first_series(psdf)
1237
+ else:
1238
+ return psdf
1239
+
1240
+ if isinstance(pdf_or_psers, dict):
1241
+ return {
1242
+ sn: read_excel_on_spark(pdf_or_pser, sn) for sn, pdf_or_pser in pdf_or_psers.items()
1243
+ }
1244
+ else:
1245
+ return read_excel_on_spark(pdf_or_psers, sheet_name)
1246
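As a sketch of the multi-sheet case described under sheet_name, passing sheet_name=None yields a dict keyed by sheet name ("report.xlsx" is a placeholder path):

    import pyspark.pandas as ps

    sheets = ps.read_excel("report.xlsx", sheet_name=None, index_col=0)
    for name, psdf in sheets.items():
        print(name, list(psdf.columns))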
+
1247
+
1248
+ def read_html(
1249
+ io: Union[str, Any],
1250
+ match: str = ".+",
1251
+ flavor: Optional[str] = None,
1252
+ header: Optional[Union[int, List[int]]] = None,
1253
+ index_col: Optional[Union[int, List[int]]] = None,
1254
+ skiprows: Optional[Union[int, List[int], slice]] = None,
1255
+ attrs: Optional[Dict[str, str]] = None,
1256
+ parse_dates: bool = False,
1257
+ thousands: str = ",",
1258
+ encoding: Optional[str] = None,
1259
+ decimal: str = ".",
1260
+ converters: Optional[Dict] = None,
1261
+ na_values: Optional[Any] = None,
1262
+ keep_default_na: bool = True,
1263
+ displayed_only: bool = True,
1264
+ ) -> List[DataFrame]:
1265
+ r"""Read HTML tables into a ``list`` of ``DataFrame`` objects.
1266
+
1267
+ Parameters
1268
+ ----------
1269
+ io : str or file-like
1270
+ A URL, a file-like object, or a raw string containing HTML. Note that
1271
+ lxml only accepts the http, FTP and file URL protocols. If you have a
1272
+ URL that starts with ``'https'`` you might try removing the ``'s'``.
1273
+
1274
+ match : str or compiled regular expression, optional
1275
+ The set of tables containing text matching this regex or string will be
1276
+ returned. Unless the HTML is extremely simple you will probably need to
1277
+ pass a non-empty string here. Defaults to '.+' (match any non-empty
1278
+ string). The default value will return all tables contained on a page.
1279
+ This value is converted to a regular expression so that there is
1280
+ consistent behavior between Beautiful Soup and lxml.
1281
+
1282
+ flavor : str or None, container of strings
1283
+ The parsing engine to use. 'bs4' and 'html5lib' are synonymous with
1284
+ each other, they are both there for backwards compatibility. The
1285
+ default of ``None`` tries to use ``lxml`` to parse and if that fails it
1286
+ falls back on ``bs4`` + ``html5lib``.
1287
+
1288
+ header : int or list-like or None, optional
1289
+ The row (or list of rows for a :class:`~ps.MultiIndex`) to use to
1290
+ make the columns headers.
1291
+
1292
+ index_col : int or list-like or None, optional
1293
+ The column (or list of columns) to use to create the index.
1294
+
1295
+ skiprows : int or list-like or slice or None, optional
1296
+ 0-based. Number of rows to skip after parsing the column integer. If a
1297
+ sequence of integers or a slice is given, will skip the rows indexed by
1298
+ that sequence. Note that a single element sequence means 'skip the nth
1299
+ row' whereas an integer means 'skip n rows'.
1300
+
1301
+ attrs : dict or None, optional
1302
+ This is a dictionary of attributes that you can pass to use to identify
1303
+ the table in the HTML. These are not checked for validity before being
1304
+ passed to lxml or Beautiful Soup. However, these attributes must be
1305
+ valid HTML table attributes to work correctly. For example, ::
1306
+
1307
+ attrs = {'id': 'table'}
1308
+
1309
+ is a valid attribute dictionary because the 'id' HTML tag attribute is
1310
+ a valid HTML attribute for *any* HTML tag as per `this document
1311
+ <http://www.w3.org/TR/html-markup/global-attributes.html>`__. ::
1312
+
1313
+ attrs = {'asdf': 'table'}
1314
+
1315
+ is *not* a valid attribute dictionary because 'asdf' is not a valid
1316
+ HTML attribute even if it is a valid XML attribute. Valid HTML 4.01
1317
+ table attributes can be found `here
1318
+ <http://www.w3.org/TR/REC-html40/struct/tables.html#h-11.2>`__. A
1319
+ working draft of the HTML 5 spec can be found `here
1320
+ <http://www.w3.org/TR/html-markup/table.html>`__. It contains the
1321
+ latest information on table attributes for the modern web.
1322
+
1323
+ parse_dates : bool, optional
1324
+ See :func:`~ps.read_csv` for more details.
1325
+
1326
+ thousands : str, optional
1327
+ Separator to use to parse thousands. Defaults to ``','``.
1328
+
1329
+ encoding : str or None, optional
1330
+ The encoding used to decode the web page. Defaults to ``None``. ``None``
1331
+ preserves the previous encoding behavior, which depends on the
1332
+ underlying parser library (e.g., the parser library will try to use
1333
+ the encoding provided by the document).
1334
+
1335
+ decimal : str, default '.'
1336
+ Character to recognize as decimal point (example: use ',' for European
1337
+ data).
1338
+
1339
+ converters : dict, default None
1340
+ Dict of functions for converting values in certain columns. Keys can
1341
+ either be integers or column labels, values are functions that take one
1342
+ input argument, the cell (not column) content, and return the
1343
+ transformed content.
1344
+
1345
+ na_values : iterable, default None
1346
+ Custom NA values
1347
+
1348
+ keep_default_na : bool, default True
1349
+ If na_values are specified and keep_default_na is False the default NaN
1350
+ values are overridden, otherwise they're appended to
1351
+
1352
+ displayed_only : bool, default True
1353
+ Whether elements with "display: none" should be parsed
1354
+
1355
+ Returns
1356
+ -------
1357
+ dfs : list of DataFrames
1358
+
1359
+ See Also
1360
+ --------
1361
+ read_csv
1362
+ DataFrame.to_html
1363
+ """
1364
+ pdfs = pd.read_html(
1365
+ io=io,
1366
+ match=match,
1367
+ flavor=flavor,
1368
+ header=header,
1369
+ index_col=index_col,
1370
+ skiprows=skiprows,
1371
+ attrs=attrs,
1372
+ parse_dates=parse_dates,
1373
+ thousands=thousands,
1374
+ encoding=encoding,
1375
+ decimal=decimal,
1376
+ converters=converters,
1377
+ na_values=na_values,
1378
+ keep_default_na=keep_default_na,
1379
+ displayed_only=displayed_only,
1380
+ )
1381
+ return cast(List[DataFrame], [from_pandas(pdf) for pdf in pdfs])
1382
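A minimal sketch, assuming an HTML parser such as lxml or bs4 is installed; read_html always returns a list of DataFrames, even when only one table matches:

    import pyspark.pandas as ps

    html = "<table><tr><th>a</th><th>b</th></tr><tr><td>1</td><td>2</td></tr></table>"
    tables = ps.read_html(html)
    print(len(tables), list(tables[0].columns))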
+
1383
+
1384
+ # TODO: add `coerce_float` and 'parse_dates' parameters
1385
+ def read_sql_table(
1386
+ table_name: str,
1387
+ con: str,
1388
+ schema: Optional[str] = None,
1389
+ index_col: Optional[Union[str, List[str]]] = None,
1390
+ columns: Optional[Union[str, List[str]]] = None,
1391
+ **options: Any,
1392
+ ) -> DataFrame:
1393
+ """
1394
+ Read SQL database table into a DataFrame.
1395
+
1396
+ Given a table name and a JDBC URI, returns a DataFrame.
1397
+
1398
+ Parameters
1399
+ ----------
1400
+ table_name : str
1401
+ Name of SQL table in database.
1402
+ con : str
1403
+ A JDBC URI could be provided as str.
1404
+
1405
+ .. note:: The URI must be a JDBC URI, not a Python database URI.
1406
+
1407
+ schema : str, default None
1408
+ Name of SQL schema in database to query (if database flavor
1409
+ supports this). Uses default schema if None (default).
1410
+ index_col : str or list of str, optional, default: None
1411
+ Column(s) to set as index(MultiIndex).
1412
+ columns : list, default None
1413
+ List of column names to select from SQL table.
1414
+ options : dict
1415
+ All other options passed directly into Spark's JDBC data source.
1416
+
1417
+ Returns
1418
+ -------
1419
+ DataFrame
1420
+ A SQL table is returned as two-dimensional data structure with labeled
1421
+ axes.
1422
+
1423
+ See Also
1424
+ --------
1425
+ read_sql_query : Read SQL query into a DataFrame.
1426
+ read_sql : Read SQL query or database table into a DataFrame.
1427
+
1428
+ Examples
1429
+ --------
1430
+ >>> ps.read_sql_table('table_name', 'jdbc:postgresql:db_name') # doctest: +SKIP
1431
+ """
1432
+ if "options" in options and isinstance(options.get("options"), dict) and len(options) == 1:
1433
+ options = options.get("options")
1434
+
1435
+ reader = default_session().read
1436
+ reader.option("dbtable", table_name)
1437
+ reader.option("url", con)
1438
+ if schema is not None:
1439
+ reader.schema(schema)
1440
+ reader.options(**options)
1441
+ sdf = reader.format("jdbc").load()
1442
+ index_spark_columns, index_names = _get_index_map(sdf, index_col)
1443
+ psdf: DataFrame = DataFrame(
1444
+ InternalFrame(
1445
+ spark_frame=sdf, index_spark_columns=index_spark_columns, index_names=index_names
1446
+ )
1447
+ )
1448
+ if columns is not None:
1449
+ if isinstance(columns, str):
1450
+ columns = [columns]
1451
+ psdf = psdf[columns]
1452
+ return psdf
1453
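A usage sketch with a placeholder table name and JDBC URI (a matching JDBC driver must be available to Spark), using one column as the index and selecting a subset of the remaining columns:

    import pyspark.pandas as ps

    psdf = ps.read_sql_table(
        "employees",
        "jdbc:postgresql:db_name",
        index_col="id",
        columns=["name", "salary"],
    )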
+
1454
+
1455
+ # TODO: add `coerce_float`, `params`, and 'parse_dates' parameters
1456
+ def read_sql_query(
1457
+ sql: str, con: str, index_col: Optional[Union[str, List[str]]] = None, **options: Any
1458
+ ) -> DataFrame:
1459
+ """Read SQL query into a DataFrame.
1460
+
1461
+ Returns a DataFrame corresponding to the result set of the query
1462
+ string. Optionally provide an `index_col` parameter to use one of the
1463
+ columns as the index, otherwise default index will be used.
1464
+
1465
+ .. note:: Some databases might hit the Spark issue SPARK-27596.
1466
+
1467
+ Parameters
1468
+ ----------
1469
+ sql : string SQL query
1470
+ SQL query to be executed.
1471
+ con : str
1472
+ A JDBC URI could be provided as str.
1473
+
1474
+ .. note:: The URI must be a JDBC URI, not a Python database URI.
1475
+
1476
+ index_col : string or list of strings, optional, default: None
1477
+ Column(s) to set as index(MultiIndex).
1478
+ options : dict
1479
+ All other options passed directly into Spark's JDBC data source.
1480
+
1481
+ Returns
1482
+ -------
1483
+ DataFrame
1484
+
1485
+ See Also
1486
+ --------
1487
+ read_sql_table : Read SQL database table into a DataFrame.
1488
+ read_sql
1489
+
1490
+ Examples
1491
+ --------
1492
+ >>> ps.read_sql_query('SELECT * FROM table_name', 'jdbc:postgresql:db_name') # doctest: +SKIP
1493
+ """
1494
+ if "options" in options and isinstance(options.get("options"), dict) and len(options) == 1:
1495
+ options = options.get("options")
1496
+
1497
+ reader = default_session().read
1498
+ reader.option("query", sql)
1499
+ reader.option("url", con)
1500
+ reader.options(**options)
1501
+ sdf = reader.format("jdbc").load()
1502
+ index_spark_columns, index_names = _get_index_map(sdf, index_col)
1503
+ return DataFrame(
1504
+ InternalFrame(
1505
+ spark_frame=sdf, index_spark_columns=index_spark_columns, index_names=index_names
1506
+ )
1507
+ )
1508
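The extra keyword arguments are forwarded to Spark's JDBC reader, and the unwrapping logic above also accepts them as a single nested options dict; both calls below are equivalent (the URI and query are placeholders):

    import pyspark.pandas as ps

    url = "jdbc:postgresql:db_name"  # placeholder JDBC URI
    a = ps.read_sql_query("SELECT id, name FROM employees", url, fetchsize="1000")
    b = ps.read_sql_query("SELECT id, name FROM employees", url,
                          options={"fetchsize": "1000"})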
+
1509
+
1510
+ # TODO: add `coerce_float`, `params`, and 'parse_dates' parameters
1511
+ def read_sql(
1512
+ sql: str,
1513
+ con: str,
1514
+ index_col: Optional[Union[str, List[str]]] = None,
1515
+ columns: Optional[Union[str, List[str]]] = None,
1516
+ **options: Any,
1517
+ ) -> DataFrame:
1518
+ """
1519
+ Read SQL query or database table into a DataFrame.
1520
+
1521
+ This function is a convenience wrapper around ``read_sql_table`` and
1522
+ ``read_sql_query`` (for backward compatibility). It will delegate
1523
+ to the specific function depending on the provided input. A SQL query
1524
+ will be routed to ``read_sql_query``, while a database table name will
1525
+ be routed to ``read_sql_table``. Note that the delegated function might
1526
+ have more specific notes about their functionality not listed here.
1527
+
1528
+ .. note:: Some databases might hit the Spark issue SPARK-27596.
1529
+
1530
+ Parameters
1531
+ ----------
1532
+ sql : string
1533
+ SQL query to be executed or a table name.
1534
+ con : str
1535
+ A JDBC URI could be provided as str.
1536
+
1537
+ .. note:: The URI must be a JDBC URI, not a Python database URI.
1538
+
1539
+ index_col : string or list of strings, optional, default: None
1540
+ Column(s) to set as index(MultiIndex).
1541
+ columns : list, default: None
1542
+ List of column names to select from SQL table (only used when reading
1543
+ a table).
1544
+ options : dict
1545
+ All other options passed directly into Spark's JDBC data source.
1546
+
1547
+ Returns
1548
+ -------
1549
+ DataFrame
1550
+
1551
+ See Also
1552
+ --------
1553
+ read_sql_table : Read SQL database table into a DataFrame.
1554
+ read_sql_query : Read SQL query into a DataFrame.
1555
+
1556
+ Examples
1557
+ --------
1558
+ >>> ps.read_sql('table_name', 'jdbc:postgresql:db_name') # doctest: +SKIP
1559
+ >>> ps.read_sql('SELECT * FROM table_name', 'jdbc:postgresql:db_name') # doctest: +SKIP
1560
+ """
1561
+ if "options" in options and isinstance(options.get("options"), dict) and len(options) == 1:
1562
+ options = options.get("options")
1563
+
1564
+ stripped = sql.strip()
1565
+ if " " not in striped: # TODO: identify the table name or not more precisely.
1566
+ return read_sql_table(sql, con, index_col=index_col, columns=columns, **options)
1567
+ else:
1568
+ return read_sql_query(sql, con, index_col=index_col, **options)
1569
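The routing rule is simply whether the stripped string contains whitespace: a bare identifier is treated as a table name, anything else as a query. A sketch with a placeholder URI:

    import pyspark.pandas as ps

    url = "jdbc:postgresql:db_name"  # placeholder
    t = ps.read_sql("employees", url)                              # -> read_sql_table
    q = ps.read_sql("SELECT * FROM employees WHERE id > 10", url)  # -> read_sql_query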
+
1570
+
1571
+ @no_type_check
1572
+ def to_datetime(
1573
+ arg,
1574
+ errors: str = "raise",
1575
+ format: Optional[str] = None,
1576
+ unit: Optional[str] = None,
1577
+ infer_datetime_format: bool = False,
1578
+ origin: str = "unix",
1579
+ ):
1580
+ """
1581
+ Convert argument to datetime.
1582
+
1583
+ Parameters
1584
+ ----------
1585
+ arg : integer, float, string, datetime, list, tuple, 1-d array, Series
1586
+ or DataFrame/dict-like
1587
+
1588
+ errors : {'ignore', 'raise', 'coerce'}, default 'raise'
1589
+
1590
+ - If 'raise', then invalid parsing will raise an exception
1591
+ - If 'coerce', then invalid parsing will be set as NaT
1592
+ - If 'ignore', then invalid parsing will return the input
1593
+ format : string, default None
1594
+ strftime to parse time, eg "%d/%m/%Y", note that "%f" will parse
1595
+ all the way up to nanoseconds.
1596
+ unit : string, default None
1597
+ Denotes the unit of the arg (D, s, ms, us, ns) when arg is an
1598
+ integer or float number. This will be based on the origin.
1599
+ Example, with unit='ms' and origin='unix' (the default), this
1600
+ would calculate the number of milliseconds to the unix epoch start.
1601
+ infer_datetime_format : boolean, default False
1602
+ If True and no `format` is given, attempt to infer the format of the
1603
+ datetime strings, and if it can be inferred, switch to a faster
1604
+ method of parsing them. In some cases this can increase the parsing
1605
+ speed by ~5-10x.
1606
+ origin : scalar, default 'unix'
1607
+ Define the reference date. The numeric values would be parsed as number
1608
+ of units (defined by `unit`) since this reference date.
1609
+
1610
+ - If 'unix' (or POSIX) time; origin is set to 1970-01-01.
1611
+ - If 'julian', unit must be 'D', and origin is set to beginning of
1612
+ Julian Calendar. Julian day number 0 is assigned to the day starting
1613
+ at noon on January 1, 4713 BC.
1614
+ - If Timestamp convertible, origin is set to Timestamp identified by
1615
+ origin.
1616
+
1617
+ Returns
1618
+ -------
1619
+ ret : datetime if parsing succeeded.
1620
+ Return type depends on input:
1621
+
1622
+ - list-like: DatetimeIndex
1623
+ - Series: Series of datetime64 dtype
1624
+ - scalar: Timestamp
1625
+
1626
+ In case when it is not possible to return designated types (e.g. when
1627
+ any element of input is before Timestamp.min or after Timestamp.max)
1628
+ return will have datetime.datetime type (or corresponding
1629
+ array/Series).
1630
+
1631
+ Examples
1632
+ --------
1633
+ Assembling a datetime from multiple columns of a DataFrame. The keys can be
1634
+ common abbreviations like ['year', 'month', 'day', 'minute', 'second',
1635
+ 'ms', 'us', 'ns'] or plurals of the same.
1636
+
1637
+ >>> df = ps.DataFrame({'year': [2015, 2016],
1638
+ ... 'month': [2, 3],
1639
+ ... 'day': [4, 5]})
1640
+ >>> ps.to_datetime(df)
1641
+ 0 2015-02-04
1642
+ 1 2016-03-05
1643
+ dtype: datetime64[ns]
1644
+
1645
+ If a date does not meet the `timestamp limitations
1646
+ <http://pandas.pydata.org/pandas-docs/stable/timeseries.html
1647
+ #timeseries-timestamp-limits>`_, passing errors='ignore'
1648
+ will return the original input instead of raising any exception.
1649
+
1650
+ Passing errors='coerce' will force an out-of-bounds date to NaT,
1651
+ in addition to forcing non-dates (or non-parseable dates) to NaT.
1652
+
1653
+ >>> ps.to_datetime('13000101', format='%Y%m%d', errors='ignore') # doctest: +SKIP
1654
+ datetime.datetime(1300, 1, 1, 0, 0)
1655
+ >>> ps.to_datetime('13000101', format='%Y%m%d', errors='coerce')
1656
+ NaT
1657
+
1658
+ Passing infer_datetime_format=True can often speed up parsing
1659
+ if the format is not exactly ISO 8601 but is still regular.
1660
+
1661
+ >>> s = ps.Series(['3/11/2000', '3/12/2000', '3/13/2000'] * 1000)
1662
+ >>> s.head()
1663
+ 0 3/11/2000
1664
+ 1 3/12/2000
1665
+ 2 3/13/2000
1666
+ 3 3/11/2000
1667
+ 4 3/12/2000
1668
+ dtype: object
1669
+
1670
+ >>> import timeit
1671
+ >>> timeit.timeit(
1672
+ ... lambda: repr(ps.to_datetime(s, infer_datetime_format=True)),
1673
+ ... number = 1) # doctest: +SKIP
1674
+ 0.35832712500000063
1675
+
1676
+ >>> timeit.timeit(
1677
+ ... lambda: repr(ps.to_datetime(s, infer_datetime_format=False)),
1678
+ ... number = 1) # doctest: +SKIP
1679
+ 0.8895321660000004
1680
+
1681
+ Using a unix epoch time
1682
+
1683
+ >>> ps.to_datetime(1490195805, unit='s')
1684
+ Timestamp('2017-03-22 15:16:45')
1685
+ >>> ps.to_datetime(1490195805433502912, unit='ns')
1686
+ Timestamp('2017-03-22 15:16:45.433502912')
1687
+
1688
+ Using a non-unix epoch origin
1689
+
1690
+ >>> ps.to_datetime([1, 2, 3], unit='D', origin=pd.Timestamp('1960-01-01'))
1691
+ DatetimeIndex(['1960-01-02', '1960-01-03', '1960-01-04'], dtype='datetime64[ns]', freq=None)
1692
+ """
1693
+
1694
+ # mappings for assembling units
1695
+ # From pandas: pandas.core.tools.datetimes
1696
+ _unit_map = {
1697
+ "year": "year",
1698
+ "years": "year",
1699
+ "month": "month",
1700
+ "months": "month",
1701
+ "day": "day",
1702
+ "days": "day",
1703
+ "hour": "h",
1704
+ "hours": "h",
1705
+ "minute": "m",
1706
+ "minutes": "m",
1707
+ "second": "s",
1708
+ "seconds": "s",
1709
+ "ms": "ms",
1710
+ "millisecond": "ms",
1711
+ "milliseconds": "ms",
1712
+ "us": "us",
1713
+ "microsecond": "us",
1714
+ "microseconds": "us",
1715
+ }
1716
+
1717
+ def pandas_to_datetime(
1718
+ pser_or_pdf: Union[pd.DataFrame, pd.Series], cols: Optional[List[str]] = None
1719
+ ) -> Series[np.datetime64]:
1720
+ if isinstance(pser_or_pdf, pd.DataFrame):
1721
+ pser_or_pdf = pser_or_pdf[cols]
1722
+ return pd.to_datetime(
1723
+ pser_or_pdf,
1724
+ errors=errors,
1725
+ format=format,
1726
+ unit=unit,
1727
+ infer_datetime_format=infer_datetime_format,
1728
+ origin=origin,
1729
+ )
1730
+
1731
+ if isinstance(arg, Series):
1732
+ return arg.pandas_on_spark.transform_batch(pandas_to_datetime)
1733
+ if isinstance(arg, DataFrame):
1734
+ unit = {k: _unit_map[k.lower()] for k in arg.keys() if k.lower() in _unit_map}
1735
+ unit_rev = {v: k for k, v in unit.items()}
1736
+ list_cols = [unit_rev["year"], unit_rev["month"], unit_rev["day"]]
1737
+ for u in ["h", "m", "s", "ms", "us"]:
1738
+ value = unit_rev.get(u)
1739
+ if value is not None and value in arg:
1740
+ list_cols.append(value)
1741
+
1742
+ psdf = arg[list_cols]
1743
+ return psdf.pandas_on_spark.transform_batch(pandas_to_datetime, list_cols)
1744
+ return pd.to_datetime(
1745
+ arg,
1746
+ errors=errors,
1747
+ format=format,
1748
+ unit=unit,
1749
+ infer_datetime_format=infer_datetime_format,
1750
+ origin=origin,
1751
+ )
1752
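For DataFrame input, only columns whose lower-cased names appear in _unit_map are assembled into timestamps, as sketched below:

    import pyspark.pandas as ps

    psdf = ps.DataFrame({"year": [2015, 2016], "month": [2, 3],
                         "day": [4, 5], "hour": [10, 11]})
    print(ps.to_datetime(psdf))  # 2015-02-04 10:00:00 and 2016-03-05 11:00:00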
+
1753
+
1754
+ # TODO(SPARK-42621): Add `inclusive` parameter and replace `closed`.
1755
+ # See https://github.com/pandas-dev/pandas/issues/40245
1756
+ def date_range(
1757
+ start: Union[str, Any] = None,
1758
+ end: Union[str, Any] = None,
1759
+ periods: Optional[int] = None,
1760
+ freq: Optional[Union[str, DateOffset]] = None,
1761
+ tz: Optional[Union[str, tzinfo]] = None,
1762
+ normalize: bool = False,
1763
+ name: Optional[str] = None,
1764
+ closed: Optional[str] = None,
1765
+ **kwargs: Any,
1766
+ ) -> DatetimeIndex:
1767
+ """
1768
+ Return a fixed frequency DatetimeIndex.
1769
+
1770
+ Parameters
1771
+ ----------
1772
+ start : str or datetime-like, optional
1773
+ Left bound for generating dates.
1774
+ end : str or datetime-like, optional
1775
+ Right bound for generating dates.
1776
+ periods : int, optional
1777
+ Number of periods to generate.
1778
+ freq : str or DateOffset, default 'D'
1779
+ Frequency strings can have multiples, e.g. '5H'.
1780
+ tz : str or tzinfo, optional
1781
+ Time zone name for returning localized DatetimeIndex, for example
1782
+ 'Asia/Hong_Kong'. By default, the resulting DatetimeIndex is
1783
+ time zone naive.
1784
+ normalize : bool, default False
1785
+ Normalize start/end dates to midnight before generating date range.
1786
+ name : str, default None
1787
+ Name of the resulting DatetimeIndex.
1788
+ closed : {None, 'left', 'right'}, optional
1789
+ Make the interval closed with respect to the given frequency to
1790
+ the 'left', 'right', or both sides (None, the default).
1791
+
1792
+ .. deprecated:: 3.4.0
1793
+
1794
+ **kwargs
1795
+ For compatibility. Has no effect on the result.
1796
+
1797
+ Returns
1798
+ -------
1799
+ rng : DatetimeIndex
1800
+
1801
+ See Also
1802
+ --------
1803
+ DatetimeIndex : An immutable container for datetimes.
1804
+
1805
+ Notes
1806
+ -----
1807
+ Of the four parameters ``start``, ``end``, ``periods``, and ``freq``,
1808
+ exactly three must be specified. If ``freq`` is omitted, the resulting
1809
+ ``DatetimeIndex`` will have ``periods`` linearly spaced elements between
1810
+ ``start`` and ``end`` (closed on both sides).
1811
+
1812
+ To learn more about the frequency strings, please see `this link
1813
+ <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`__.
1814
+
1815
+ Examples
1816
+ --------
1817
+ **Specifying the values**
1818
+
1819
+ The next four examples generate the same `DatetimeIndex`, but vary
1820
+ the combination of `start`, `end` and `periods`.
1821
+
1822
+ Specify `start` and `end`, with the default daily frequency.
1823
+
1824
+ >>> ps.date_range(start='1/1/2018', end='1/08/2018') # doctest: +SKIP
1825
+ DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
1826
+ '2018-01-05', '2018-01-06', '2018-01-07', '2018-01-08'],
1827
+ dtype='datetime64[ns]', freq=None)
1828
+
1829
+ Specify `start` and `periods`, the number of periods (days).
1830
+
1831
+ >>> ps.date_range(start='1/1/2018', periods=8) # doctest: +SKIP
1832
+ DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
1833
+ '2018-01-05', '2018-01-06', '2018-01-07', '2018-01-08'],
1834
+ dtype='datetime64[ns]', freq=None)
1835
+
1836
+ Specify `end` and `periods`, the number of periods (days).
1837
+
1838
+ >>> ps.date_range(end='1/1/2018', periods=8) # doctest: +SKIP
1839
+ DatetimeIndex(['2017-12-25', '2017-12-26', '2017-12-27', '2017-12-28',
1840
+ '2017-12-29', '2017-12-30', '2017-12-31', '2018-01-01'],
1841
+ dtype='datetime64[ns]', freq=None)
1842
+
1843
+ Specify `start`, `end`, and `periods`; the frequency is generated
1844
+ automatically (linearly spaced).
1845
+
1846
+ >>> ps.date_range(
1847
+ ... start='2018-04-24', end='2018-04-27', periods=3
1848
+ ... ) # doctest: +SKIP
1849
+ DatetimeIndex(['2018-04-24 00:00:00', '2018-04-25 12:00:00',
1850
+ '2018-04-27 00:00:00'],
1851
+ dtype='datetime64[ns]', freq=None)
1852
+
1853
+ **Other Parameters**
1854
+
1855
+ Changed the `freq` (frequency) to ``'M'`` (month end frequency).
1856
+
1857
+ >>> ps.date_range(start='1/1/2018', periods=5, freq='M') # doctest: +SKIP
1858
+ DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31', '2018-04-30',
1859
+ '2018-05-31'],
1860
+ dtype='datetime64[ns]', freq=None)
1861
+
1862
+ Multiples are allowed
1863
+
1864
+ >>> ps.date_range(start='1/1/2018', periods=5, freq='3M') # doctest: +SKIP
1865
+ DatetimeIndex(['2018-01-31', '2018-04-30', '2018-07-31', '2018-10-31',
1866
+ '2019-01-31'],
1867
+ dtype='datetime64[ns]', freq=None)
1868
+
1869
+ `freq` can also be specified as an Offset object.
1870
+
1871
+ >>> ps.date_range(
1872
+ ... start='1/1/2018', periods=5, freq=pd.offsets.MonthEnd(3)
1873
+ ... ) # doctest: +SKIP
1874
+ DatetimeIndex(['2018-01-31', '2018-04-30', '2018-07-31', '2018-10-31',
1875
+ '2019-01-31'],
1876
+ dtype='datetime64[ns]', freq=None)
1877
+
1878
+ `closed` controls whether to include `start` and `end` that are on the
1879
+ boundary. The default includes boundary points on either end.
1880
+
1881
+ >>> ps.date_range(
1882
+ ... start='2017-01-01', end='2017-01-04', closed=None
1883
+ ... ) # doctest: +SKIP
1884
+ DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04'],
1885
+ dtype='datetime64[ns]', freq=None)
1886
+
1887
+ Use ``closed='left'`` to exclude `end` if it falls on the boundary.
1888
+
1889
+ >>> ps.date_range(
1890
+ ... start='2017-01-01', end='2017-01-04', closed='left'
1891
+ ... ) # doctest: +SKIP
1892
+ DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03'], dtype='datetime64[ns]', freq=None)
1893
+
1894
+ Use ``closed='right'`` to exclude `start` if it falls on the boundary.
1895
+
1896
+ >>> ps.date_range(
1897
+ ... start='2017-01-01', end='2017-01-04', closed='right'
1898
+ ... ) # doctest: +SKIP
1899
+ DatetimeIndex(['2017-01-02', '2017-01-03', '2017-01-04'], dtype='datetime64[ns]', freq=None)
1900
+ """
1901
+ assert freq not in ["N", "ns"], "nanoseconds is not supported"
1902
+ assert tz is None, "Localized DatetimeIndex is not supported"
1903
+ if closed is not None:
1904
+ warnings.warn(
1905
+ "Argument `closed` is deprecated in 3.4.0 and will be removed in 4.0.0.",
1906
+ FutureWarning,
1907
+ )
1908
+
1909
+ return cast(
1910
+ DatetimeIndex,
1911
+ ps.from_pandas(
1912
+ pd.date_range(
1913
+ start=start,
1914
+ end=end,
1915
+ periods=periods,
1916
+ freq=freq,
1917
+ tz=tz,
1918
+ normalize=normalize,
1919
+ name=name,
1920
+ closed=closed,
1921
+ **kwargs,
1922
+ )
1923
+ ),
1924
+ )
1925
+
1926
+
1927
+ @no_type_check
1928
+ def to_timedelta(
1929
+ arg,
1930
+ unit: Optional[str] = None,
1931
+ errors: str = "raise",
1932
+ ):
1933
+ """
1934
+ Convert argument to timedelta.
1935
+
1936
+ Parameters
1937
+ ----------
1938
+ arg : str, timedelta, list-like or Series
1939
+ The data to be converted to timedelta.
1940
+ unit : str, optional
1941
+ Denotes the unit of the arg for numeric `arg`. Defaults to ``"ns"``.
1942
+
1943
+ Possible values:
1944
+ * 'W'
1945
+ * 'D' / 'days' / 'day'
1946
+ * 'hours' / 'hour' / 'hr' / 'h'
1947
+ * 'm' / 'minute' / 'min' / 'minutes' / 'T'
1948
+ * 'S' / 'seconds' / 'sec' / 'second'
1949
+ * 'ms' / 'milliseconds' / 'millisecond' / 'milli' / 'millis' / 'L'
1950
+ * 'us' / 'microseconds' / 'microsecond' / 'micro' / 'micros' / 'U'
1951
+ * 'ns' / 'nanoseconds' / 'nano' / 'nanos' / 'nanosecond' / 'N'
1952
+
1953
+ Must not be specified when `arg` contains strings and ``errors="raise"``.
1954
+ errors : {'ignore', 'raise', 'coerce'}, default 'raise'
1955
+ - If 'raise', then invalid parsing will raise an exception.
1956
+ - If 'coerce', then invalid parsing will be set as NaT.
1957
+ - If 'ignore', then invalid parsing will return the input.
1958
+
1959
+ Returns
1960
+ -------
1961
+ ret : timedelta64, TimedeltaIndex or Series of timedelta64 if parsing succeeded.
1962
+
1963
+ See Also
1964
+ --------
1965
+ DataFrame.astype : Cast argument to a specified dtype.
1966
+ to_datetime : Convert argument to datetime.
1967
+
1968
+ Notes
1969
+ -----
1970
+ If the precision is higher than nanoseconds, the precision of the duration is
1971
+ truncated to nanoseconds for string inputs.
1972
+
1973
+ Examples
1974
+ --------
1975
+ Parsing a single string to a Timedelta:
1976
+
1977
+ >>> ps.to_timedelta('1 days 06:05:01.00003')
1978
+ Timedelta('1 days 06:05:01.000030')
1979
+ >>> ps.to_timedelta('15.5us') # doctest: +SKIP
1980
+ Timedelta('0 days 00:00:00.000015500')
1981
+
1982
+ Parsing a list or array of strings:
1983
+
1984
+ >>> ps.to_timedelta(['1 days 06:05:01.00003', '15.5us', 'nan']) # doctest: +SKIP
1985
+ TimedeltaIndex(['1 days 06:05:01.000030', '0 days 00:00:00.000015500', NaT],
1986
+ dtype='timedelta64[ns]', freq=None)
1987
+
1988
+ Converting numbers by specifying the `unit` keyword argument:
1989
+
1990
+ >>> ps.to_timedelta(np.arange(5), unit='s') # doctest: +SKIP
1991
+ TimedeltaIndex(['0 days 00:00:00', '0 days 00:00:01', '0 days 00:00:02',
1992
+ '0 days 00:00:03', '0 days 00:00:04'],
1993
+ dtype='timedelta64[ns]', freq=None)
1994
+ >>> ps.to_timedelta(np.arange(5), unit='d') # doctest: +NORMALIZE_WHITESPACE
1995
+ TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days'],
1996
+ dtype='timedelta64[ns]', freq=None)
1997
+ """
1998
+
1999
+ def pandas_to_timedelta(pser: pd.Series) -> np.timedelta64:
2000
+ return pd.to_timedelta(
2001
+ arg=pser,
2002
+ unit=unit,
2003
+ errors=errors,
2004
+ )
2005
+
2006
+ if isinstance(arg, Series):
2007
+ return arg.transform(pandas_to_timedelta)
2008
+
2009
+ else:
2010
+ return pd.to_timedelta(
2011
+ arg=arg,
2012
+ unit=unit,
2013
+ errors=errors,
2014
+ )
2015
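A short sketch of both input paths: a pandas-on-Spark Series stays distributed, while array-like input falls through to plain pandas:

    import numpy as np
    import pyspark.pandas as ps

    print(ps.to_timedelta(np.arange(3), unit="s"))   # plain pandas TimedeltaIndex
    s = ps.Series(["1 days", "2 days 06:00:00"])
    print(ps.to_timedelta(s))                        # distributed timedelta Series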
+
2016
+
2017
+ def timedelta_range(
2018
+ start: Union[str, Any] = None,
2019
+ end: Union[str, Any] = None,
2020
+ periods: Optional[int] = None,
2021
+ freq: Optional[Union[str, DateOffset]] = None,
2022
+ name: Optional[str] = None,
2023
+ closed: Optional[str] = None,
2024
+ ) -> TimedeltaIndex:
2025
+ """
2026
+ Return a fixed frequency TimedeltaIndex, with day as the default frequency.
2027
+
2028
+ Parameters
2029
+ ----------
2030
+ start : str or timedelta-like, optional
2031
+ Left bound for generating timedeltas.
2032
+ end : str or timedelta-like, optional
2033
+ Right bound for generating timedeltas.
2034
+ periods : int, optional
2035
+ Number of periods to generate.
2036
+ freq : str or DateOffset, default 'D'
2037
+ Frequency strings can have multiples, e.g. '5H'.
2038
+ name : str, default None
2039
+ Name of the resulting TimedeltaIndex.
2040
+ closed : {None, 'left', 'right'}, optional
2041
+ Make the interval closed with respect to the given frequency to
2042
+ the 'left', 'right', or both sides (None, the default).
2043
+
2044
+ Returns
2045
+ -------
2046
+ TimedeltaIndex
2047
+
2048
+ Notes
2049
+ -----
2050
+ Of the four parameters ``start``, ``end``, ``periods``, and ``freq``,
2051
+ exactly three must be specified. If ``freq`` is omitted, the resulting
2052
+ ``TimedeltaIndex`` will have ``periods`` linearly spaced elements between
2053
+ ``start`` and ``end`` (closed on both sides).
2054
+
2055
+ To learn more about the frequency strings, please see `this link
2056
+ <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`__.
2057
+
2058
+ Examples
2059
+ --------
2060
+ >>> ps.timedelta_range(start='1 day', periods=4) # doctest: +NORMALIZE_WHITESPACE
2061
+ TimedeltaIndex(['1 days', '2 days', '3 days', '4 days'], dtype='timedelta64[ns]', freq=None)
2062
+
2063
+ The closed parameter specifies which endpoint is included.
2064
+ The default behavior is to include both endpoints.
2065
+
2066
+ >>> ps.timedelta_range(start='1 day', periods=4, closed='right')
2067
+ ... # doctest: +NORMALIZE_WHITESPACE
2068
+ TimedeltaIndex(['2 days', '3 days', '4 days'], dtype='timedelta64[ns]', freq=None)
2069
+
2070
+ The freq parameter specifies the frequency of the TimedeltaIndex.
2071
+ Only fixed frequencies can be passed, non-fixed frequencies such as ‘M’ (month end) will raise.
2072
+
2073
+ >>> ps.timedelta_range(start='1 day', end='2 days', freq='6H')
2074
+ ... # doctest: +NORMALIZE_WHITESPACE
2075
+ TimedeltaIndex(['1 days 00:00:00', '1 days 06:00:00', '1 days 12:00:00',
2076
+ '1 days 18:00:00', '2 days 00:00:00'],
2077
+ dtype='timedelta64[ns]', freq=None)
2078
+
2079
+ Specify start, end, and periods; the frequency is generated automatically (linearly spaced).
2080
+
2081
+ >>> ps.timedelta_range(start='1 day', end='5 days', periods=4)
2082
+ ... # doctest: +NORMALIZE_WHITESPACE
2083
+ TimedeltaIndex(['1 days 00:00:00', '2 days 08:00:00', '3 days 16:00:00',
2084
+ '5 days 00:00:00'],
2085
+ dtype='timedelta64[ns]', freq=None)
2086
+ """
2087
+ assert freq not in ["N", "ns"], "nanoseconds is not supported"
2088
+
2089
+ return cast(
2090
+ TimedeltaIndex,
2091
+ ps.from_pandas(
2092
+ pd.timedelta_range(
2093
+ start=start,
2094
+ end=end,
2095
+ periods=periods,
2096
+ freq=freq,
2097
+ name=name,
2098
+ closed=closed,
2099
+ )
2100
+ ),
2101
+ )
2102
+
2103
+
2104
+ def get_dummies(
2105
+ data: Union[DataFrame, Series],
2106
+ prefix: Optional[Union[str, List[str], Dict[str, str]]] = None,
2107
+ prefix_sep: str = "_",
2108
+ dummy_na: bool = False,
2109
+ columns: Optional[Union[Name, List[Name]]] = None,
2110
+ sparse: bool = False,
2111
+ drop_first: bool = False,
2112
+ dtype: Optional[Union[str, Dtype]] = None,
2113
+ ) -> DataFrame:
2114
+ """
2115
+ Convert categorical variable into dummy/indicator variables, also
2116
+ known as one hot encoding.
2117
+
2118
+ Parameters
2119
+ ----------
2120
+ data : array-like, Series, or DataFrame
2121
+ prefix : string, list of strings, or dict of strings, default None
2122
+ String to append DataFrame column names.
2123
+ Pass a list with length equal to the number of columns
2124
+ when calling get_dummies on a DataFrame. Alternatively, `prefix`
2125
+ can be a dictionary mapping column names to prefixes.
2126
+ prefix_sep : string, default '_'
2127
+ If appending prefix, separator/delimiter to use. Or pass a
2128
+ list or dictionary as with `prefix`.
2129
+ dummy_na : bool, default False
2130
+ Add a column to indicate NaNs, if False NaNs are ignored.
2131
+ columns : list-like, default None
2132
+ Column names in the DataFrame to be encoded.
2133
+ If `columns` is None then all the columns with
2134
+ `object` or `category` dtype will be converted.
2135
+ sparse : bool, default False
2136
+ Whether the dummy-encoded columns should be backed by
2137
+ a :class:`SparseArray` (True) or a regular NumPy array (False).
2138
+ In pandas-on-Spark, this value must be "False".
2139
+ drop_first : bool, default False
2140
+ Whether to get k-1 dummies out of k categorical levels by removing the
2141
+ first level.
2142
+ dtype : dtype, default np.uint8
2143
+ Data type for new columns. Only a single dtype is allowed.
2144
+
2145
+ Returns
2146
+ -------
2147
+ dummies : DataFrame
2148
+
2149
+ See Also
2150
+ --------
2151
+ Series.str.get_dummies
2152
+
2153
+ Examples
2154
+ --------
2155
+ >>> s = ps.Series(list('abca'))
2156
+
2157
+ >>> ps.get_dummies(s)
2158
+ a b c
2159
+ 0 1 0 0
2160
+ 1 0 1 0
2161
+ 2 0 0 1
2162
+ 3 1 0 0
2163
+
2164
+ >>> df = ps.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'],
2165
+ ... 'C': [1, 2, 3]},
2166
+ ... columns=['A', 'B', 'C'])
2167
+
2168
+ >>> ps.get_dummies(df, prefix=['col1', 'col2'])
2169
+ C col1_a col1_b col2_a col2_b col2_c
2170
+ 0 1 1 0 0 1 0
2171
+ 1 2 0 1 1 0 0
2172
+ 2 3 1 0 0 0 1
2173
+
2174
+ >>> ps.get_dummies(ps.Series(list('abcaa')))
2175
+ a b c
2176
+ 0 1 0 0
2177
+ 1 0 1 0
2178
+ 2 0 0 1
2179
+ 3 1 0 0
2180
+ 4 1 0 0
2181
+
2182
+ >>> ps.get_dummies(ps.Series(list('abcaa')), drop_first=True)
2183
+ b c
2184
+ 0 0 0
2185
+ 1 1 0
2186
+ 2 0 1
2187
+ 3 0 0
2188
+ 4 0 0
2189
+
2190
+ >>> ps.get_dummies(ps.Series(list('abc')), dtype=float)
2191
+ a b c
2192
+ 0 1.0 0.0 0.0
2193
+ 1 0.0 1.0 0.0
2194
+ 2 0.0 0.0 1.0
2195
+ """
2196
+ if sparse is not False:
2197
+ raise NotImplementedError("get_dummies currently does not support sparse")
2198
+
2199
+ if columns is not None and not is_list_like(columns):
2200
+ raise TypeError("Input must be a list-like for parameter `columns`")
2201
+
2202
+ if dtype is None:
2203
+ dtype = "byte"
2204
+
2205
+ if isinstance(data, Series):
2206
+ if prefix is not None:
2207
+ prefix = [str(prefix)]
2208
+ psdf = data.to_frame()
2209
+ column_labels = psdf._internal.column_labels
2210
+ remaining_columns = []
2211
+ else:
2212
+ if isinstance(prefix, str):
2213
+ raise NotImplementedError(
2214
+ "get_dummies currently does not support prefix as string types"
2215
+ )
2216
+ psdf = data.copy()
2217
+
2218
+ if columns is None:
2219
+ column_labels = [
2220
+ label
2221
+ for label in psdf._internal.column_labels
2222
+ if isinstance(
2223
+ psdf._internal.spark_type_for(label), _get_dummies_default_accept_types
2224
+ )
2225
+ ]
2226
+ else:
2227
+ if is_name_like_tuple(columns):
2228
+ column_labels = [
2229
+ label
2230
+ for label in psdf._internal.column_labels
2231
+ if label[: len(columns)] == columns
2232
+ ]
2233
+ if len(column_labels) == 0:
2234
+ raise KeyError(name_like_string(columns))
2235
+ if prefix is None:
2236
+ prefix = [
2237
+ str(label[len(columns) :])
2238
+ if len(label) > len(columns) + 1
2239
+ else label[len(columns)]
2240
+ if len(label) == len(columns) + 1
2241
+ else ""
2242
+ for label in column_labels
2243
+ ]
2244
+ elif any(isinstance(col, tuple) for col in columns) and any(
2245
+ not is_name_like_tuple(col) for col in columns
2246
+ ):
2247
+ raise ValueError(
2248
+ "Expected tuple, got {}".format(
2249
+ type(set(col for col in columns if not is_name_like_tuple(col)).pop())
2250
+ )
2251
+ )
2252
+ else:
2253
+ column_labels = [
2254
+ label
2255
+ for key in columns
2256
+ for label in psdf._internal.column_labels
2257
+ if label == key or label[0] == key
2258
+ ]
2259
+ if len(column_labels) == 0:
2260
+ if columns is None:
2261
+ return psdf
2262
+ raise KeyError("{} not in index".format(columns))
2263
+
2264
+ if prefix is None:
2265
+ prefix = [str(label) if len(label) > 1 else label[0] for label in column_labels]
2266
+
2267
+ column_labels_set = set(column_labels)
2268
+ remaining_columns = [
2269
+ (
2270
+ psdf[label]
2271
+ if psdf._internal.column_labels_level == 1
2272
+ else psdf[label].rename(name_like_string(label))
2273
+ )
2274
+ for label in psdf._internal.column_labels
2275
+ if label not in column_labels_set
2276
+ ]
2277
+
2278
+ if any(
2279
+ not isinstance(psdf._internal.spark_type_for(label), _get_dummies_acceptable_types)
2280
+ for label in column_labels
2281
+ ):
2282
+ raise NotImplementedError(
2283
+ "get_dummies currently only accept {} values".format(
2284
+ ", ".join(
2285
+ [cast(Type[DataType], t).typeName() for t in _get_dummies_acceptable_types]
2286
+ )
2287
+ )
2288
+ )
2289
+
2290
+ if prefix is not None and len(column_labels) != len(prefix):
2291
+ raise ValueError(
2292
+ "Length of 'prefix' ({}) did not match the length of "
2293
+ "the columns being encoded ({}).".format(len(prefix), len(column_labels))
2294
+ )
2295
+ elif isinstance(prefix, dict):
2296
+ prefix = [prefix[column_label[0]] for column_label in column_labels]
2297
+
2298
+ all_values = _reduce_spark_multi(
2299
+ psdf._internal.spark_frame,
2300
+ [F.collect_set(psdf._internal.spark_column_for(label)) for label in column_labels],
2301
+ )
2302
+ for i, label in enumerate(column_labels):
2303
+ values = all_values[i]
2304
+ if isinstance(values, np.ndarray):
2305
+ values = values.tolist()
2306
+ values = sorted(values)
2307
+ if drop_first:
2308
+ values = values[1:]
2309
+
2310
+ def column_name(v: Any) -> Name:
2311
+ if prefix is None or cast(List[str], prefix)[i] == "":
2312
+ return v
2313
+ else:
2314
+ return "{}{}{}".format(cast(List[str], prefix)[i], prefix_sep, v)
2315
+
2316
+ for value in values:
2317
+ remaining_columns.append(
2318
+ (psdf[label].notnull() & (psdf[label] == value))
2319
+ .astype(dtype)
2320
+ .rename(column_name(value))
2321
+ )
2322
+ if dummy_na:
2323
+ remaining_columns.append(psdf[label].isnull().astype(dtype).rename(column_name(np.nan)))
2324
+
2325
+ return psdf[remaining_columns]
2326
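One behavior worth a sketch: dummy_na=True appends an extra indicator column for missing values, produced by the isnull() branch above:

    import pyspark.pandas as ps

    s = ps.Series(["a", "b", None, "a"])
    # Columns: 'a', 'b', plus one NaN indicator column.
    print(ps.get_dummies(s, dummy_na=True))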
+
2327
+
2328
+ # TODO: there are many parameters to implement and support. See pandas's pd.concat.
2329
+ def concat(
2330
+ objs: List[Union[DataFrame, Series]],
2331
+ axis: Axis = 0,
2332
+ join: str = "outer",
2333
+ ignore_index: bool = False,
2334
+ sort: bool = False,
2335
+ ) -> Union[Series, DataFrame]:
2336
+ """
2337
+ Concatenate pandas-on-Spark objects along a particular axis with optional set logic
2338
+ along the other axes.
2339
+
2340
+ Parameters
2341
+ ----------
2342
+ objs : a sequence of Series or DataFrame
2343
+ Any None objects will be dropped silently unless
2344
+ they are all None in which case a ValueError will be raised
2345
+ axis : {0/'index', 1/'columns'}, default 0
2346
+ The axis to concatenate along.
2347
+ join : {'inner', 'outer'}, default 'outer'
2348
+ How to handle indexes on other axis (or axes).
2349
+ ignore_index : bool, default False
2350
+ If True, do not use the index values along the concatenation axis. The
2351
+ resulting axis will be labeled 0, ..., n - 1. This is useful if you are
2352
+ concatenating objects where the concatenation axis does not have
2353
+ meaningful indexing information. Note the index values on the other
2354
+ axes are still respected in the join.
2355
+ sort : bool, default False
2356
+ Sort non-concatenation axis if it is not already aligned.
2357
+
2358
+ Returns
2359
+ -------
2360
+ object, type of objs
2361
+ When concatenating all ``Series`` along the index (axis=0), a
2362
+ ``Series`` is returned. When ``objs`` contains at least one
2363
+ ``DataFrame``, a ``DataFrame`` is returned. When concatenating along
2364
+ the columns (axis=1), a ``DataFrame`` is returned.
2365
+
2366
+ See Also
2367
+ --------
2368
+ Series.append : Concatenate Series.
2369
+ DataFrame.join : Join DataFrames using indexes.
2370
+ DataFrame.merge : Merge DataFrames by indexes or columns.
2371
+
2372
+ Examples
2373
+ --------
2374
+ >>> from pyspark.pandas.config import set_option, reset_option
2375
+ >>> set_option("compute.ops_on_diff_frames", True)
2376
+
2377
+ Combine two ``Series``.
2378
+
2379
+ >>> s1 = ps.Series(['a', 'b'])
2380
+ >>> s2 = ps.Series(['c', 'd'])
2381
+ >>> ps.concat([s1, s2])
2382
+ 0 a
2383
+ 1 b
2384
+ 0 c
2385
+ 1 d
2386
+ dtype: object
2387
+
2388
+ Clear the existing index and reset it in the result
2389
+ by setting the ``ignore_index`` option to ``True``.
2390
+
2391
+ >>> ps.concat([s1, s2], ignore_index=True)
2392
+ 0 a
2393
+ 1 b
2394
+ 2 c
2395
+ 3 d
2396
+ dtype: object
2397
+
2398
+ Combine two ``DataFrame`` objects with identical columns.
2399
+
2400
+ >>> df1 = ps.DataFrame([['a', 1], ['b', 2]],
2401
+ ... columns=['letter', 'number'])
2402
+ >>> df1
2403
+ letter number
2404
+ 0 a 1
2405
+ 1 b 2
2406
+ >>> df2 = ps.DataFrame([['c', 3], ['d', 4]],
2407
+ ... columns=['letter', 'number'])
2408
+ >>> df2
2409
+ letter number
2410
+ 0 c 3
2411
+ 1 d 4
2412
+
2413
+ >>> ps.concat([df1, df2])
2414
+ letter number
2415
+ 0 a 1
2416
+ 1 b 2
2417
+ 0 c 3
2418
+ 1 d 4
2419
+
2420
+ Combine ``DataFrame`` and ``Series`` objects with different columns.
2421
+
2422
+ >>> ps.concat([df2, s1])
2423
+ letter number 0
2424
+ 0 c 3.0 None
2425
+ 1 d 4.0 None
2426
+ 0 None NaN a
2427
+ 1 None NaN b
2428
+
2429
+ Combine ``DataFrame`` objects with overlapping columns
2430
+ and return everything. Columns outside the intersection will
2431
+ be filled with ``None`` values.
2432
+
2433
+ >>> df3 = ps.DataFrame([['c', 3, 'cat'], ['d', 4, 'dog']],
2434
+ ... columns=['letter', 'number', 'animal'])
2435
+ >>> df3
2436
+ letter number animal
2437
+ 0 c 3 cat
2438
+ 1 d 4 dog
2439
+
2440
+ >>> ps.concat([df1, df3])
2441
+ letter number animal
2442
+ 0 a 1 None
2443
+ 1 b 2 None
2444
+ 0 c 3 cat
2445
+ 1 d 4 dog
2446
+
2447
+ Sort the columns.
2448
+
2449
+ >>> ps.concat([df1, df3], sort=True)
2450
+ animal letter number
2451
+ 0 None a 1
2452
+ 1 None b 2
2453
+ 0 cat c 3
2454
+ 1 dog d 4
2455
+
2456
+ Combine ``DataFrame`` objects with overlapping columns
2457
+ and return only those that are shared by passing ``inner`` to
2458
+ the ``join`` keyword argument.
2459
+
2460
+ >>> ps.concat([df1, df3], join="inner")
2461
+ letter number
2462
+ 0 a 1
2463
+ 1 b 2
2464
+ 0 c 3
2465
+ 1 d 4
2466
+
2467
+ >>> df4 = ps.DataFrame([['bird', 'polly'], ['monkey', 'george']],
2468
+ ... columns=['animal', 'name'])
2469
+
2470
+ Combine with column axis.
2471
+
2472
+ >>> ps.concat([df1, df4], axis=1)
2473
+ letter number animal name
2474
+ 0 a 1 bird polly
2475
+ 1 b 2 monkey george
2476
+
2477
+ >>> reset_option("compute.ops_on_diff_frames")
2478
+ """
2479
+ if isinstance(objs, (DataFrame, IndexOpsMixin)) or not isinstance(
2480
+ objs, Iterable
2481
+ ): # TODO: support dict
2482
+ raise TypeError(
2483
+ "first argument must be an iterable of pandas-on-Spark "
2484
+ "objects, you passed an object of type "
2485
+ '"{name}"'.format(name=type(objs).__name__)
2486
+ )
2487
+
2488
+ if len(cast(Sized, objs)) == 0:
2489
+ raise ValueError("No objects to concatenate")
2490
+ objs = list(filter(lambda obj: obj is not None, objs))
2491
+ if len(objs) == 0:
2492
+ raise ValueError("All objects passed were None")
2493
+
2494
+ for obj in objs:
2495
+ if not isinstance(obj, (Series, DataFrame)):
2496
+ raise TypeError(
2497
+ "cannot concatenate object of type "
2498
+                 "'{name}'"
2499
+ "; only ps.Series "
2500
+                 "and ps.DataFrame are valid".format(name=type(obj).__name__)
2501
+ )
2502
+
2503
+ if join not in ["inner", "outer"]:
2504
+ raise ValueError("Only can inner (intersect) or outer (union) join the other axis.")
2505
+
2506
+ axis = validate_axis(axis)
2507
+ psdf: DataFrame
2508
+ if axis == 1:
2509
+ psdfs: List[DataFrame] = [
2510
+ obj.to_frame() if isinstance(obj, Series) else obj for obj in objs
2511
+ ]
2512
+
2513
+ level: int = min(psdf._internal.column_labels_level for psdf in psdfs)
2514
+ psdfs = [
2515
+ DataFrame._index_normalized_frame(level, psdf)
2516
+ if psdf._internal.column_labels_level > level
2517
+ else psdf
2518
+ for psdf in psdfs
2519
+ ]
2520
+
2521
+ concat_psdf = psdfs[0]
2522
+ column_labels: List[Label] = concat_psdf._internal.column_labels.copy()
2523
+
2524
+ psdfs_not_same_anchor = []
2525
+ for psdf in psdfs[1:]:
2526
+ duplicated = [label for label in psdf._internal.column_labels if label in column_labels]
2527
+ if len(duplicated) > 0:
2528
+ pretty_names = [name_like_string(label) for label in duplicated]
2529
+ raise ValueError(
2530
+ "Labels have to be unique; however, got duplicated labels %s." % pretty_names
2531
+ )
2532
+ column_labels.extend(psdf._internal.column_labels)
2533
+
2534
+ if same_anchor(concat_psdf, psdf):
2535
+ concat_psdf = DataFrame(
2536
+ concat_psdf._internal.with_new_columns(
2537
+ [
2538
+ concat_psdf._psser_for(label)
2539
+ for label in concat_psdf._internal.column_labels
2540
+ ]
2541
+ + [psdf._psser_for(label) for label in psdf._internal.column_labels]
2542
+ )
2543
+ )
2544
+ else:
2545
+ psdfs_not_same_anchor.append(psdf)
2546
+
2547
+ if len(psdfs_not_same_anchor) > 0:
2548
+
2549
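+             # Duplicate column labels were rejected above, so align_diff_frames never
+             # needs to resolve overlapping columns and this callback is never invoked.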
+ @no_type_check
2550
+ def resolve_func(psdf, this_column_labels, that_column_labels):
2551
+ raise AssertionError("This should not happen.")
2552
+
2553
+ for psdf in psdfs_not_same_anchor:
2554
+ if join == "inner":
2555
+ concat_psdf = align_diff_frames(
2556
+ resolve_func,
2557
+ concat_psdf,
2558
+ psdf,
2559
+ fillna=False,
2560
+ how="inner",
2561
+ )
2562
+ elif join == "outer":
2563
+ concat_psdf = align_diff_frames(
2564
+ resolve_func,
2565
+ concat_psdf,
2566
+ psdf,
2567
+ fillna=False,
2568
+ how="full",
2569
+ )
2570
+
2571
+ concat_psdf = concat_psdf[column_labels]
2572
+
2573
+ if ignore_index:
2574
+ concat_psdf.columns = list( # type: ignore[assignment]
2575
+ map(str, _range(len(concat_psdf.columns)))
2576
+ )
2577
+
2578
+ if sort:
2579
+ concat_psdf = concat_psdf.sort_index()
2580
+
2581
+ return concat_psdf
2582
+
2583
+ # Series, Series ...
2584
+ # We should return Series if objects are all Series.
2585
+ should_return_series = all(map(lambda obj: isinstance(obj, Series), objs))
2586
+
2587
+ # DataFrame, Series ... & Series, Series ...
2588
+ # In this case, we should return DataFrame.
2589
+ new_objs: List[DataFrame] = []
2590
+ num_series = 0
2591
+ series_names = set()
2592
+ for obj in objs:
2593
+ if isinstance(obj, Series):
2594
+ num_series += 1
2595
+ series_names.add(obj.name)
2596
+ new_objs.append(obj.to_frame(DEFAULT_SERIES_NAME))
2597
+ else:
2598
+ assert isinstance(obj, DataFrame)
2599
+ new_objs.append(obj)
2600
+
2601
+ column_labels_levels: Set[int] = set(obj._internal.column_labels_level for obj in new_objs)
2602
+ if len(column_labels_levels) != 1:
2603
+ raise ValueError("MultiIndex columns should have the same levels")
2604
+
2605
+ # DataFrame, DataFrame, ...
2606
+ # All Series are converted into DataFrame and then compute concat.
2607
+ if not ignore_index:
2608
+ indices_of_psdfs = [psdf.index for psdf in new_objs]
2609
+ index_of_first_psdf = indices_of_psdfs[0]
2610
+ for index_of_psdf in indices_of_psdfs:
2611
+ if index_of_first_psdf.names != index_of_psdf.names:
2612
+ raise ValueError(
2613
+ "Index type and names should be same in the objects to concatenate. "
2614
+ "You passed different indices "
2615
+ "{index_of_first_psdf} and {index_of_psdf}".format(
2616
+ index_of_first_psdf=index_of_first_psdf.names,
2617
+ index_of_psdf=index_of_psdf.names,
2618
+ )
2619
+ )
2620
+
2621
+ column_labels_of_psdfs = [psdf._internal.column_labels for psdf in new_objs]
2622
+ index_names_of_psdfs: List[List[Optional[Label]]]
2623
+ if ignore_index:
2624
+ index_names_of_psdfs = [[] for _ in new_objs]
2625
+ else:
2626
+ index_names_of_psdfs = [psdf._internal.index_names for psdf in new_objs]
2627
+
2628
+ if all(name == index_names_of_psdfs[0] for name in index_names_of_psdfs) and all(
2629
+ idx == column_labels_of_psdfs[0] for idx in column_labels_of_psdfs
2630
+ ):
2631
+ # If all columns are in the same order and values, use it.
2632
+ psdfs = new_objs
2633
+ else:
2634
+ if join == "inner":
2635
+ interested_columns = set.intersection(*map(lambda x: set(x), column_labels_of_psdfs))
2636
+             # Keep the column order of the first DataFrame.
2637
+ merged_columns = [
2638
+ label for label in column_labels_of_psdfs[0] if label in interested_columns
2639
+ ]
2640
+
2641
+ # If sort is True, sort to follow pandas 1.4+ behavior.
2642
+ if sort:
2643
+ # FIXME: better ordering
2644
+ merged_columns = sorted(merged_columns, key=name_like_string)
2645
+
2646
+ psdfs = [psdf[merged_columns] for psdf in new_objs]
2647
+ elif join == "outer":
2648
+ merged_columns = []
2649
+ for labels in column_labels_of_psdfs:
2650
+ merged_columns.extend(label for label in labels if label not in merged_columns)
2651
+
2652
+ assert len(merged_columns) > 0
2653
+
2654
+ # If sort is True, always sort
2655
+ if sort:
2656
+ # FIXME: better ordering
2657
+ merged_columns = sorted(merged_columns, key=name_like_string)
2658
+
2659
+ psdfs = []
2660
+ for psdf in new_objs:
2661
+ columns_to_add = list(set(merged_columns) - set(psdf._internal.column_labels))
2662
+
2663
+ # TODO: NaN and None difference for missing values. pandas seems to be filling NaN.
2664
+ sdf = psdf._internal.resolved_copy.spark_frame
2665
+ for label in columns_to_add:
2666
+ sdf = sdf.withColumn(name_like_string(label), F.lit(None))
2667
+
2668
+ data_columns = psdf._internal.data_spark_column_names + [
2669
+ name_like_string(label) for label in columns_to_add
2670
+ ]
2671
+ psdf = DataFrame(
2672
+ psdf._internal.copy(
2673
+ spark_frame=sdf,
2674
+ index_spark_columns=[
2675
+ scol_for(sdf, col) for col in psdf._internal.index_spark_column_names
2676
+ ],
2677
+ column_labels=(psdf._internal.column_labels + columns_to_add),
2678
+ data_spark_columns=[scol_for(sdf, col) for col in data_columns],
2679
+ data_fields=(psdf._internal.data_fields + ([None] * len(columns_to_add))),
2680
+ )
2681
+ )
2682
+
2683
+ psdfs.append(psdf[merged_columns])
2684
+
2685
+ if ignore_index:
2686
+ sdfs = [
2687
+ psdf._internal.spark_frame.select(psdf._internal.data_spark_columns) for psdf in psdfs
2688
+ ]
2689
+ else:
2690
+ sdfs = [
2691
+ psdf._internal.spark_frame.select(
2692
+ psdf._internal.index_spark_columns + psdf._internal.data_spark_columns
2693
+ )
2694
+ for psdf in psdfs
2695
+ ]
2696
+ concatenated = reduce(lambda x, y: x.union(y), sdfs)
2697
+
2698
+ if ignore_index:
2699
+ index_spark_column_names = []
2700
+ index_names = []
2701
+ index_fields = []
2702
+ else:
2703
+ index_spark_column_names = psdfs[0]._internal.index_spark_column_names
2704
+ index_names = psdfs[0]._internal.index_names
2705
+ index_fields = psdfs[0]._internal.index_fields
2706
+
2707
+ result_psdf: DataFrame = DataFrame(
2708
+ psdfs[0]._internal.copy(
2709
+ spark_frame=concatenated,
2710
+ index_spark_columns=[scol_for(concatenated, col) for col in index_spark_column_names],
2711
+ index_names=index_names,
2712
+ index_fields=index_fields,
2713
+ data_spark_columns=[
2714
+ scol_for(concatenated, col) for col in psdfs[0]._internal.data_spark_column_names
2715
+ ],
2716
+ data_fields=None, # TODO: dtypes?
2717
+ )
2718
+ )
2719
+
2720
+ if should_return_series:
2721
+ # If all input were Series, we should return Series.
2722
+ if len(series_names) == 1:
2723
+ name = series_names.pop()
2724
+ else:
2725
+ name = None
2726
+ return first_series(result_psdf).rename(name)
2727
+ else:
2728
+ return result_psdf
2729
+
2730
+
2731
+ def melt(
2732
+ frame: DataFrame,
2733
+ id_vars: Optional[Union[Name, List[Name]]] = None,
2734
+ value_vars: Optional[Union[Name, List[Name]]] = None,
2735
+ var_name: Optional[Union[str, List[str]]] = None,
2736
+ value_name: str = "value",
2737
+ ) -> DataFrame:
2738
+ return DataFrame.melt(frame, id_vars, value_vars, var_name, value_name)
2739
+
2740
+
2741
+ melt.__doc__ = DataFrame.melt.__doc__
2742
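+ # Editor's note (illustrative sketch, not part of the original module): since the
+ # namespace-level ``melt`` simply delegates to ``DataFrame.melt``, a call such as the
+ # following is assumed to behave exactly like the method form:
+ #
+ #     >>> df = ps.DataFrame({'A': ['a', 'b'], 'B': [1, 2], 'C': [3, 4]})
+ #     >>> ps.melt(df, id_vars=['A'], value_vars=['B'])
+ #        A variable  value
+ #     0  a        B      1
+ #     1  b        B      2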
+
2743
+
2744
+ @no_type_check
2745
+ def isna(obj):
2746
+ """
2747
+ Detect missing values for an array-like object.
2748
+
2749
+ This function takes a scalar or array-like object and indicates
2750
+ whether values are missing (``NaN`` in numeric arrays, ``None`` or ``NaN``
2751
+ in object arrays).
2752
+
2753
+ Parameters
2754
+ ----------
2755
+ obj : scalar or array-like
2756
+ Object to check for null or missing values.
2757
+
2758
+ Returns
2759
+ -------
2760
+ bool or array-like of bool
2761
+ For scalar input, returns a scalar boolean.
2762
+ For array input, returns an array of boolean indicating whether each
2763
+ corresponding element is missing.
2764
+
2765
+ See Also
2766
+ --------
2767
+ Series.isna : Detect missing values in a Series.
2768
+ Series.isnull : Detect missing values in a Series.
2769
+ DataFrame.isna : Detect missing values in a DataFrame.
2770
+ DataFrame.isnull : Detect missing values in a DataFrame.
2771
+ Index.isna : Detect missing values in an Index.
2772
+ Index.isnull : Detect missing values in an Index.
2773
+
2774
+ Examples
2775
+ --------
2776
+ Scalar arguments (including strings) result in a scalar boolean.
2777
+
2778
+ >>> ps.isna('dog')
2779
+ False
2780
+
2781
+ >>> ps.isna(np.nan)
2782
+ True
2783
+
2784
+ ndarrays result in an ndarray of booleans.
2785
+
2786
+ >>> array = np.array([[1, np.nan, 3], [4, 5, np.nan]])
2787
+ >>> array
2788
+ array([[ 1., nan, 3.],
2789
+ [ 4., 5., nan]])
2790
+ >>> ps.isna(array)
2791
+ array([[False, True, False],
2792
+ [False, False, True]])
2793
+
2794
+ For Series and DataFrame, the same type is returned, containing booleans.
2795
+
2796
+ >>> df = ps.DataFrame({'a': ['ant', 'bee', 'cat'], 'b': ['dog', None, 'fly']})
2797
+ >>> df
2798
+ a b
2799
+ 0 ant dog
2800
+ 1 bee None
2801
+ 2 cat fly
2802
+
2803
+ >>> ps.isna(df)
2804
+ a b
2805
+ 0 False False
2806
+ 1 False True
2807
+ 2 False False
2808
+
2809
+ >>> ps.isnull(df.b)
2810
+ 0 False
2811
+ 1 True
2812
+ 2 False
2813
+ Name: b, dtype: bool
2814
+ """
2815
+ # TODO: Add back:
2816
+ # notnull : Boolean inverse of pandas.isnull.
2817
+ # into the See Also in the docstring. It does not find the method in the latest numpydoc.
2818
+ if isinstance(obj, (DataFrame, Series)):
2819
+ return obj.isnull()
2820
+ else:
2821
+ return pd.isnull(obj)
2822
+
2823
+
2824
+ isnull = isna
2825
+
2826
+
2827
+ @no_type_check
2828
+ def notna(obj):
2829
+ """
2830
+ Detect existing (non-missing) values.
2831
+
2832
+ Return a boolean same-sized object indicating if the values are not NA.
2833
+ Non-missing values get mapped to True. NA values, such as None or
2834
+ :attr:`numpy.NaN`, get mapped to False values.
2835
+
2836
+ Returns
2837
+ -------
2838
+ bool or array-like of bool
2839
+ Mask of bool values for each element that
2840
+ indicates whether an element is not an NA value.
2841
+
2842
+ See Also
2843
+ --------
2844
+ isna : Detect missing values for an array-like object.
2845
+ Series.notna : Boolean inverse of Series.isna.
2846
+ DataFrame.notnull : Boolean inverse of DataFrame.isnull.
2847
+ Index.notna : Boolean inverse of Index.isna.
2848
+ Index.notnull : Boolean inverse of Index.isnull.
2849
+
2850
+ Examples
2851
+ --------
2852
+ Show which entries in a DataFrame are not NA.
2853
+
2854
+ >>> df = ps.DataFrame({'age': [5, 6, np.NaN],
2855
+ ... 'born': [pd.NaT, pd.Timestamp('1939-05-27'),
2856
+ ... pd.Timestamp('1940-04-25')],
2857
+ ... 'name': ['Alfred', 'Batman', ''],
2858
+ ... 'toy': [None, 'Batmobile', 'Joker']})
2859
+ >>> df
2860
+ age born name toy
2861
+ 0 5.0 NaT Alfred None
2862
+ 1 6.0 1939-05-27 Batman Batmobile
2863
+ 2 NaN 1940-04-25 Joker
2864
+
2865
+ >>> df.notnull()
2866
+ age born name toy
2867
+ 0 True False True False
2868
+ 1 True True True True
2869
+ 2 False True True True
2870
+
2871
+ Show which entries in a Series are not NA.
2872
+
2873
+ >>> ser = ps.Series([5, 6, np.NaN])
2874
+ >>> ser
2875
+ 0 5.0
2876
+ 1 6.0
2877
+ 2 NaN
2878
+ dtype: float64
2879
+
2880
+ >>> ps.notna(ser)
2881
+ 0 True
2882
+ 1 True
2883
+ 2 False
2884
+ dtype: bool
2885
+
2886
+ >>> ps.notna(ser.index)
2887
+ True
2888
+ """
2889
+ # TODO: Add back:
2890
+ # Series.notnull :Boolean inverse of Series.isnull.
2891
+ # DataFrame.notna :Boolean inverse of DataFrame.isna.
2892
+ # into the See Also in the docstring. It does not find the method in the latest numpydoc.
2893
+ if isinstance(obj, (DataFrame, Series)):
2894
+ return obj.notna()
2895
+ else:
2896
+ return pd.notna(obj)
2897
+
2898
+
2899
+ notnull = notna
2900
+
2901
+
2902
+ def merge(
2903
+ obj: DataFrame,
2904
+ right: DataFrame,
2905
+ how: str = "inner",
2906
+ on: Optional[Union[Name, List[Name]]] = None,
2907
+ left_on: Optional[Union[Name, List[Name]]] = None,
2908
+ right_on: Optional[Union[Name, List[Name]]] = None,
2909
+ left_index: bool = False,
2910
+ right_index: bool = False,
2911
+ suffixes: Tuple[str, str] = ("_x", "_y"),
2912
+ ) -> "DataFrame":
2913
+ """
2914
+ Merge DataFrame objects with a database-style join.
2915
+
2916
+ The index of the resulting DataFrame will be one of the following:
2917
+ - 0...n if no index is used for merging
2918
+ - Index of the left DataFrame if merged only on the index of the right DataFrame
2919
+ - Index of the right DataFrame if merged only on the index of the left DataFrame
2920
+ - All involved indices if merged using the indices of both DataFrames
2921
+ e.g. if `left` with indices (a, x) and `right` with indices (b, x), the result will
2922
+ be an index (x, a, b)
2923
+
2924
+ Parameters
2925
+ ----------
2926
+ right: Object to merge with.
2927
+ how: Type of merge to be performed.
2928
+ {'left', 'right', 'outer', 'inner'}, default 'inner'
2929
+
2930
+ left: use only keys from left frame, like a SQL left outer join; preserve key
2931
+ order.
2932
+ right: use only keys from right frame, like a SQL right outer join; preserve key
2933
+ order.
2934
+ outer: use union of keys from both frames, like a SQL full outer join; sort keys
2935
+ lexicographically.
2936
+ inner: use intersection of keys from both frames, like a SQL inner join;
2937
+ preserve the order of the left keys.
2938
+ on: Column or index level names to join on. These must be found in both DataFrames. If on
2939
+ is None and not merging on indexes then this defaults to the intersection of the
2940
+ columns in both DataFrames.
2941
+ left_on: Column or index level names to join on in the left DataFrame. Can also
2942
+ be an array or list of arrays of the length of the left DataFrame.
2943
+ These arrays are treated as if they are columns.
2944
+ right_on: Column or index level names to join on in the right DataFrame. Can also
2945
+ be an array or list of arrays of the length of the right DataFrame.
2946
+ These arrays are treated as if they are columns.
2947
+ left_index: Use the index from the left DataFrame as the join key(s). If it is a
2948
+ MultiIndex, the number of keys in the other DataFrame (either the index or a number of
2949
+ columns) must match the number of levels.
2950
+ right_index: Use the index from the right DataFrame as the join key. Same caveats as
2951
+ left_index.
2952
+ suffixes: Suffix to apply to overlapping column names in the left and right side,
2953
+ respectively.
2954
+
2955
+ Returns
2956
+ -------
2957
+ DataFrame
2958
+ A DataFrame of the two merged objects.
2959
+
2960
+ Examples
2961
+ --------
2962
+
2963
+ >>> df1 = ps.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],
2964
+ ... 'value': [1, 2, 3, 5]},
2965
+ ... columns=['lkey', 'value'])
2966
+ >>> df2 = ps.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'],
2967
+ ... 'value': [5, 6, 7, 8]},
2968
+ ... columns=['rkey', 'value'])
2969
+ >>> df1
2970
+ lkey value
2971
+ 0 foo 1
2972
+ 1 bar 2
2973
+ 2 baz 3
2974
+ 3 foo 5
2975
+ >>> df2
2976
+ rkey value
2977
+ 0 foo 5
2978
+ 1 bar 6
2979
+ 2 baz 7
2980
+ 3 foo 8
2981
+
2982
+ Merge df1 and df2 on the lkey and rkey columns. The value columns have
2983
+ the default suffixes, _x and _y, appended.
2984
+
2985
+ >>> merged = ps.merge(df1, df2, left_on='lkey', right_on='rkey')
2986
+ >>> merged.sort_values(by=['lkey', 'value_x', 'rkey', 'value_y']) # doctest: +ELLIPSIS
2987
+ lkey value_x rkey value_y
2988
+ ...bar 2 bar 6
2989
+ ...baz 3 baz 7
2990
+ ...foo 1 foo 5
2991
+ ...foo 1 foo 8
2992
+ ...foo 5 foo 5
2993
+ ...foo 5 foo 8
2994
+
2995
+ >>> left_psdf = ps.DataFrame({'A': [1, 2]})
2996
+ >>> right_psdf = ps.DataFrame({'B': ['x', 'y']}, index=[1, 2])
2997
+
2998
+ >>> ps.merge(left_psdf, right_psdf, left_index=True, right_index=True).sort_index()
2999
+ A B
3000
+ 1 2 x
3001
+
3002
+ >>> ps.merge(left_psdf, right_psdf, left_index=True, right_index=True, how='left').sort_index()
3003
+ A B
3004
+ 0 1 None
3005
+ 1 2 x
3006
+
3007
+ >>> ps.merge(left_psdf, right_psdf, left_index=True, right_index=True, how='right').sort_index()
3008
+ A B
3009
+ 1 2.0 x
3010
+ 2 NaN y
3011
+
3012
+ >>> ps.merge(left_psdf, right_psdf, left_index=True, right_index=True, how='outer').sort_index()
3013
+ A B
3014
+ 0 1.0 None
3015
+ 1 2.0 x
3016
+ 2 NaN y
3017
+
3018
+ Notes
3019
+ -----
3020
+ As described in #263, joining string columns currently returns None for missing values
3021
+ instead of NaN.
3022
+ """
3023
+ return obj.merge(
3024
+ right,
3025
+ how=how,
3026
+ on=on,
3027
+ left_on=left_on,
3028
+ right_on=right_on,
3029
+ left_index=left_index,
3030
+ right_index=right_index,
3031
+ suffixes=suffixes,
3032
+ )
3033
+
3034
+
3035
+ def merge_asof(
3036
+ left: Union[DataFrame, Series],
3037
+ right: Union[DataFrame, Series],
3038
+ on: Optional[Name] = None,
3039
+ left_on: Optional[Name] = None,
3040
+ right_on: Optional[Name] = None,
3041
+ left_index: bool = False,
3042
+ right_index: bool = False,
3043
+ by: Optional[Union[Name, List[Name]]] = None,
3044
+ left_by: Optional[Union[Name, List[Name]]] = None,
3045
+ right_by: Optional[Union[Name, List[Name]]] = None,
3046
+ suffixes: Tuple[str, str] = ("_x", "_y"),
3047
+ tolerance: Optional[Any] = None,
3048
+ allow_exact_matches: bool = True,
3049
+ direction: str = "backward",
3050
+ ) -> DataFrame:
3051
+ """
3052
+ Perform an asof merge.
3053
+
3054
+ This is like a left-join except that we match on nearest
3055
+ key rather than equal keys.
3056
+
3057
+ For each row in the left DataFrame:
3058
+
3059
+ - A "backward" search selects the last row in the right DataFrame whose
3060
+ 'on' key is less than or equal to the left's key.
3061
+
3062
+ - A "forward" search selects the first row in the right DataFrame whose
3063
+ 'on' key is greater than or equal to the left's key.
3064
+
3065
+     - A "nearest" search selects the row in the right DataFrame whose 'on'
3066
+ key is closest in absolute distance to the left's key.
3067
+
3068
+ Optionally match on equivalent keys with 'by' before searching with 'on'.
3069
+
3070
+ .. versionadded:: 3.3.0
3071
+
3072
+ Parameters
3073
+ ----------
3074
+ left : DataFrame or named Series
3075
+ right : DataFrame or named Series
3076
+ on : label
3077
+ Field name to join on. Must be found in both DataFrames.
3078
+ The data MUST be ordered. This must be a numeric column,
3079
+ such as datetimelike, integer, or float. On or left_on/right_on
3080
+ must be given.
3081
+ left_on : label
3082
+ Field name to join on in left DataFrame.
3083
+ right_on : label
3084
+ Field name to join on in right DataFrame.
3085
+ left_index : bool
3086
+ Use the index of the left DataFrame as the join key.
3087
+ right_index : bool
3088
+ Use the index of the right DataFrame as the join key.
3089
+ by : column name or list of column names
3090
+ Match on these columns before performing merge operation.
3091
+ left_by : column name
3092
+ Field names to match on in the left DataFrame.
3093
+ right_by : column name
3094
+ Field names to match on in the right DataFrame.
3095
+ suffixes : 2-length sequence (tuple, list, ...)
3096
+ Suffix to apply to overlapping column names in the left and right
3097
+ side, respectively.
3098
+ tolerance : int or Timedelta, optional, default None
3099
+ Select asof tolerance within this range; must be compatible
3100
+ with the merge index.
3101
+ allow_exact_matches : bool, default True
3102
+
3103
+ - If True, allow matching with the same 'on' value
3104
+ (i.e. less-than-or-equal-to / greater-than-or-equal-to)
3105
+ - If False, don't match the same 'on' value
3106
+ (i.e., strictly less-than / strictly greater-than).
3107
+
3108
+ direction : 'backward' (default), 'forward', or 'nearest'
3109
+ Whether to search for prior, subsequent, or closest matches.
3110
+
3111
+ Returns
3112
+ -------
3113
+ merged : DataFrame
3114
+
3115
+ See Also
3116
+ --------
3117
+ merge : Merge with a database-style join.
3118
+ merge_ordered : Merge with optional filling/interpolation.
3119
+
3120
+ Examples
3121
+ --------
3122
+ >>> left = ps.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]})
3123
+ >>> left
3124
+ a left_val
3125
+ 0 1 a
3126
+ 1 5 b
3127
+ 2 10 c
3128
+
3129
+ >>> right = ps.DataFrame({"a": [1, 2, 3, 6, 7], "right_val": [1, 2, 3, 6, 7]})
3130
+ >>> right
3131
+ a right_val
3132
+ 0 1 1
3133
+ 1 2 2
3134
+ 2 3 3
3135
+ 3 6 6
3136
+ 4 7 7
3137
+
3138
+ >>> ps.merge_asof(left, right, on="a").sort_values("a").reset_index(drop=True)
3139
+ a left_val right_val
3140
+ 0 1 a 1
3141
+ 1 5 b 3
3142
+ 2 10 c 7
3143
+
3144
+ >>> ps.merge_asof(
3145
+ ... left,
3146
+ ... right,
3147
+ ... on="a",
3148
+ ... allow_exact_matches=False
3149
+ ... ).sort_values("a").reset_index(drop=True)
3150
+ a left_val right_val
3151
+ 0 1 a NaN
3152
+ 1 5 b 3.0
3153
+ 2 10 c 7.0
3154
+
3155
+ >>> ps.merge_asof(
3156
+ ... left,
3157
+ ... right,
3158
+ ... on="a",
3159
+ ... direction="forward"
3160
+ ... ).sort_values("a").reset_index(drop=True)
3161
+ a left_val right_val
3162
+ 0 1 a 1.0
3163
+ 1 5 b 6.0
3164
+ 2 10 c NaN
3165
+
3166
+ >>> ps.merge_asof(
3167
+ ... left,
3168
+ ... right,
3169
+ ... on="a",
3170
+ ... direction="nearest"
3171
+ ... ).sort_values("a").reset_index(drop=True)
3172
+ a left_val right_val
3173
+ 0 1 a 1
3174
+ 1 5 b 6
3175
+ 2 10 c 7
3176
+
3177
+ We can use indexed DataFrames as well.
3178
+
3179
+ >>> left = ps.DataFrame({"left_val": ["a", "b", "c"]}, index=[1, 5, 10])
3180
+ >>> left
3181
+ left_val
3182
+ 1 a
3183
+ 5 b
3184
+ 10 c
3185
+
3186
+ >>> right = ps.DataFrame({"right_val": [1, 2, 3, 6, 7]}, index=[1, 2, 3, 6, 7])
3187
+ >>> right
3188
+ right_val
3189
+ 1 1
3190
+ 2 2
3191
+ 3 3
3192
+ 6 6
3193
+ 7 7
3194
+
3195
+ >>> ps.merge_asof(left, right, left_index=True, right_index=True).sort_index()
3196
+ left_val right_val
3197
+ 1 a 1
3198
+ 5 b 3
3199
+ 10 c 7
3200
+
3201
+     Here is a real-world time-series example.
3202
+
3203
+ >>> quotes = ps.DataFrame(
3204
+ ... {
3205
+ ... "time": [
3206
+ ... pd.Timestamp("2016-05-25 13:30:00.023"),
3207
+ ... pd.Timestamp("2016-05-25 13:30:00.023"),
3208
+ ... pd.Timestamp("2016-05-25 13:30:00.030"),
3209
+ ... pd.Timestamp("2016-05-25 13:30:00.041"),
3210
+ ... pd.Timestamp("2016-05-25 13:30:00.048"),
3211
+ ... pd.Timestamp("2016-05-25 13:30:00.049"),
3212
+ ... pd.Timestamp("2016-05-25 13:30:00.072"),
3213
+ ... pd.Timestamp("2016-05-25 13:30:00.075")
3214
+ ... ],
3215
+ ... "ticker": [
3216
+ ... "GOOG",
3217
+ ... "MSFT",
3218
+ ... "MSFT",
3219
+ ... "MSFT",
3220
+ ... "GOOG",
3221
+ ... "AAPL",
3222
+ ... "GOOG",
3223
+ ... "MSFT"
3224
+ ... ],
3225
+ ... "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],
3226
+ ... "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03]
3227
+ ... }
3228
+ ... )
3229
+ >>> quotes
3230
+ time ticker bid ask
3231
+ 0 2016-05-25 13:30:00.023 GOOG 720.50 720.93
3232
+ 1 2016-05-25 13:30:00.023 MSFT 51.95 51.96
3233
+ 2 2016-05-25 13:30:00.030 MSFT 51.97 51.98
3234
+ 3 2016-05-25 13:30:00.041 MSFT 51.99 52.00
3235
+ 4 2016-05-25 13:30:00.048 GOOG 720.50 720.93
3236
+ 5 2016-05-25 13:30:00.049 AAPL 97.99 98.01
3237
+ 6 2016-05-25 13:30:00.072 GOOG 720.50 720.88
3238
+ 7 2016-05-25 13:30:00.075 MSFT 52.01 52.03
3239
+
3240
+ >>> trades = ps.DataFrame(
3241
+ ... {
3242
+ ... "time": [
3243
+ ... pd.Timestamp("2016-05-25 13:30:00.023"),
3244
+ ... pd.Timestamp("2016-05-25 13:30:00.038"),
3245
+ ... pd.Timestamp("2016-05-25 13:30:00.048"),
3246
+ ... pd.Timestamp("2016-05-25 13:30:00.048"),
3247
+ ... pd.Timestamp("2016-05-25 13:30:00.048")
3248
+ ... ],
3249
+ ... "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"],
3250
+ ... "price": [51.95, 51.95, 720.77, 720.92, 98.0],
3251
+ ... "quantity": [75, 155, 100, 100, 100]
3252
+ ... }
3253
+ ... )
3254
+ >>> trades
3255
+ time ticker price quantity
3256
+ 0 2016-05-25 13:30:00.023 MSFT 51.95 75
3257
+ 1 2016-05-25 13:30:00.038 MSFT 51.95 155
3258
+ 2 2016-05-25 13:30:00.048 GOOG 720.77 100
3259
+ 3 2016-05-25 13:30:00.048 GOOG 720.92 100
3260
+ 4 2016-05-25 13:30:00.048 AAPL 98.00 100
3261
+
3262
+ By default we are taking the asof of the quotes
3263
+
3264
+ >>> ps.merge_asof(
3265
+ ... trades, quotes, on="time", by="ticker"
3266
+ ... ).sort_values(["time", "ticker", "price"]).reset_index(drop=True)
3267
+ time ticker price quantity bid ask
3268
+ 0 2016-05-25 13:30:00.023 MSFT 51.95 75 51.95 51.96
3269
+ 1 2016-05-25 13:30:00.038 MSFT 51.95 155 51.97 51.98
3270
+ 2 2016-05-25 13:30:00.048 AAPL 98.00 100 NaN NaN
3271
+ 3 2016-05-25 13:30:00.048 GOOG 720.77 100 720.50 720.93
3272
+ 4 2016-05-25 13:30:00.048 GOOG 720.92 100 720.50 720.93
3273
+
3274
+ We only asof within 2ms between the quote time and the trade time
3275
+
3276
+ >>> ps.merge_asof(
3277
+ ... trades,
3278
+ ... quotes,
3279
+ ... on="time",
3280
+ ... by="ticker",
3281
+ ... tolerance=sf.expr("INTERVAL 2 MILLISECONDS") # pd.Timedelta("2ms")
3282
+ ... ).sort_values(["time", "ticker", "price"]).reset_index(drop=True)
3283
+ time ticker price quantity bid ask
3284
+ 0 2016-05-25 13:30:00.023 MSFT 51.95 75 51.95 51.96
3285
+ 1 2016-05-25 13:30:00.038 MSFT 51.95 155 NaN NaN
3286
+ 2 2016-05-25 13:30:00.048 AAPL 98.00 100 NaN NaN
3287
+ 3 2016-05-25 13:30:00.048 GOOG 720.77 100 720.50 720.93
3288
+ 4 2016-05-25 13:30:00.048 GOOG 720.92 100 720.50 720.93
3289
+
3290
+ We only asof within 10ms between the quote time and the trade time
3291
+ and we exclude exact matches on time. However *prior* data will
3292
+ propagate forward
3293
+
3294
+ >>> ps.merge_asof(
3295
+ ... trades,
3296
+ ... quotes,
3297
+ ... on="time",
3298
+ ... by="ticker",
3299
+ ... tolerance=sf.expr("INTERVAL 10 MILLISECONDS"), # pd.Timedelta("10ms")
3300
+ ... allow_exact_matches=False
3301
+ ... ).sort_values(["time", "ticker", "price"]).reset_index(drop=True)
3302
+ time ticker price quantity bid ask
3303
+ 0 2016-05-25 13:30:00.023 MSFT 51.95 75 NaN NaN
3304
+ 1 2016-05-25 13:30:00.038 MSFT 51.95 155 51.97 51.98
3305
+ 2 2016-05-25 13:30:00.048 AAPL 98.00 100 NaN NaN
3306
+ 3 2016-05-25 13:30:00.048 GOOG 720.77 100 NaN NaN
3307
+ 4 2016-05-25 13:30:00.048 GOOG 720.92 100 NaN NaN
3308
+ """
3309
+
3310
+ def to_list(os: Optional[Union[Name, List[Name]]]) -> List[Label]:
3311
+ if os is None:
3312
+ return []
3313
+ elif is_name_like_tuple(os):
3314
+ return [cast(Label, os)]
3315
+ elif is_name_like_value(os):
3316
+ return [(os,)]
3317
+ else:
3318
+ return [o if is_name_like_tuple(o) else (o,) for o in os]
3319
+
3320
+ if isinstance(left, Series):
3321
+ left = left.to_frame()
3322
+ if isinstance(right, Series):
3323
+ right = right.to_frame()
3324
+
3325
+ if on:
3326
+ if left_on or right_on:
3327
+ raise ValueError(
3328
+ 'Can only pass argument "on" OR "left_on" and "right_on", '
3329
+ "not a combination of both."
3330
+ )
3331
+ left_as_of_names = list(map(left._internal.spark_column_name_for, to_list(on)))
3332
+ right_as_of_names = list(map(right._internal.spark_column_name_for, to_list(on)))
3333
+ else:
3334
+ if left_index:
3335
+ if isinstance(left.index, MultiIndex):
3336
+ raise ValueError("left can only have one index")
3337
+ left_as_of_names = left._internal.index_spark_column_names
3338
+ else:
3339
+ left_as_of_names = list(map(left._internal.spark_column_name_for, to_list(left_on)))
3340
+ if right_index:
3341
+ if isinstance(right.index, MultiIndex):
3342
+ raise ValueError("right can only have one index")
3343
+ right_as_of_names = right._internal.index_spark_column_names
3344
+ else:
3345
+ right_as_of_names = list(map(right._internal.spark_column_name_for, to_list(right_on)))
3346
+
3347
+ if left_as_of_names and not right_as_of_names:
3348
+ raise ValueError("Must pass right_on or right_index=True")
3349
+ if right_as_of_names and not left_as_of_names:
3350
+ raise ValueError("Must pass left_on or left_index=True")
3351
+ if not left_as_of_names and not right_as_of_names:
3352
+ common = list(left.columns.intersection(right.columns))
3353
+ if len(common) == 0:
3354
+ raise ValueError(
3355
+ "No common columns to perform merge on. Merge options: "
3356
+ "left_on=None, right_on=None, left_index=False, right_index=False"
3357
+ )
3358
+ left_as_of_names = list(map(left._internal.spark_column_name_for, to_list(common)))
3359
+ right_as_of_names = list(map(right._internal.spark_column_name_for, to_list(common)))
3360
+
3361
+ if len(left_as_of_names) != 1:
3362
+ raise ValueError("can only asof on a key for left")
3363
+ if len(right_as_of_names) != 1:
3364
+ raise ValueError("can only asof on a key for right")
3365
+
3366
+ if by:
3367
+ if left_by or right_by:
3368
+ raise ValueError('Can only pass argument "by" OR "left_by" and "right_by".')
3369
+ left_join_on_names = list(map(left._internal.spark_column_name_for, to_list(by)))
3370
+ right_join_on_names = list(map(right._internal.spark_column_name_for, to_list(by)))
3371
+ else:
3372
+ left_join_on_names = list(map(left._internal.spark_column_name_for, to_list(left_by)))
3373
+ right_join_on_names = list(map(right._internal.spark_column_name_for, to_list(right_by)))
3374
+
3375
+ if left_join_on_names and not right_join_on_names:
3376
+ raise ValueError("missing right_by")
3377
+ if right_join_on_names and not left_join_on_names:
3378
+ raise ValueError("missing left_by")
3379
+ if len(left_join_on_names) != len(right_join_on_names):
3380
+ raise ValueError("left_by and right_by must be same length")
3381
+
3382
+ # We should distinguish the name to avoid ambiguous column name after merging.
3383
+ right_prefix = "__right_"
3384
+ right_as_of_names = [right_prefix + right_as_of_name for right_as_of_name in right_as_of_names]
3385
+ right_join_on_names = [
3386
+ right_prefix + right_join_on_name for right_join_on_name in right_join_on_names
3387
+ ]
3388
+
3389
+ left_as_of_name = left_as_of_names[0]
3390
+ right_as_of_name = right_as_of_names[0]
3391
+
3392
+ def resolve(internal: InternalFrame, side: str) -> InternalFrame:
3393
+ def rename(col: str) -> str:
3394
+ return "__{}_{}".format(side, col)
3395
+
3396
+ internal = internal.resolved_copy
3397
+ sdf = internal.spark_frame
3398
+ sdf = sdf.select(
3399
+ *[
3400
+ scol_for(sdf, col).alias(rename(col))
3401
+ for col in sdf.columns
3402
+ if col not in HIDDEN_COLUMNS
3403
+ ],
3404
+ *HIDDEN_COLUMNS,
3405
+ )
3406
+ return internal.copy(
3407
+ spark_frame=sdf,
3408
+ index_spark_columns=[
3409
+ scol_for(sdf, rename(col)) for col in internal.index_spark_column_names
3410
+ ],
3411
+ index_fields=[field.copy(name=rename(field.name)) for field in internal.index_fields],
3412
+ data_spark_columns=[
3413
+ scol_for(sdf, rename(col)) for col in internal.data_spark_column_names
3414
+ ],
3415
+ data_fields=[field.copy(name=rename(field.name)) for field in internal.data_fields],
3416
+ )
3417
+
3418
+ left_internal = left._internal.resolved_copy
3419
+ right_internal = resolve(right._internal, "right")
3420
+
3421
+ left_table = left_internal.spark_frame.alias("left_table")
3422
+ right_table = right_internal.spark_frame.alias("right_table")
3423
+
3424
+ left_as_of_column = scol_for(left_table, left_as_of_name)
3425
+ right_as_of_column = scol_for(right_table, right_as_of_name)
3426
+
3427
+ if left_join_on_names:
3428
+ left_join_on_columns = [scol_for(left_table, label) for label in left_join_on_names]
3429
+ right_join_on_columns = [scol_for(right_table, label) for label in right_join_on_names]
3430
+ on = reduce(
3431
+ lambda lft, rgt: lft & rgt,
3432
+ [lft == rgt for lft, rgt in zip(left_join_on_columns, right_join_on_columns)],
3433
+ )
3434
+ else:
3435
+ on = None
3436
+
3437
+ Column = get_column_class()
3438
+ if tolerance is not None and not isinstance(tolerance, Column):
3439
+ tolerance = F.lit(tolerance)
3440
+
3441
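+     # DataFrame._joinAsOf is PySpark's internal as-of join: a left join that matches each
+     # left row to the nearest right row on the as-of column, honouring the direction,
+     # tolerance, and allowExactMatches options passed below.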
+ as_of_joined_table = left_table._joinAsOf(
3442
+ right_table,
3443
+ leftAsOfColumn=left_as_of_column,
3444
+ rightAsOfColumn=right_as_of_column,
3445
+ on=on,
3446
+ how="left",
3447
+ tolerance=tolerance,
3448
+ allowExactMatches=allow_exact_matches,
3449
+ direction=direction,
3450
+ )
3451
+
3452
+ # Unpack suffixes tuple for convenience
3453
+ left_suffix = suffixes[0]
3454
+ right_suffix = suffixes[1]
3455
+
3456
+ # Append suffixes to columns with the same name to avoid conflicts later
3457
+ duplicate_columns = set(left_internal.column_labels) & set(right_internal.column_labels)
3458
+
3459
+ exprs = []
3460
+ data_columns = []
3461
+ column_labels = []
3462
+
3463
+ def left_scol_for(label: Label) -> Column: # type: ignore[valid-type]
3464
+ return scol_for(as_of_joined_table, left_internal.spark_column_name_for(label))
3465
+
3466
+ def right_scol_for(label: Label) -> Column: # type: ignore[valid-type]
3467
+ return scol_for(as_of_joined_table, right_internal.spark_column_name_for(label))
3468
+
3469
+ for label in left_internal.column_labels:
3470
+ col = left_internal.spark_column_name_for(label)
3471
+ scol = left_scol_for(label)
3472
+ if label in duplicate_columns:
3473
+ spark_column_name = left_internal.spark_column_name_for(label)
3474
+ if spark_column_name in (left_as_of_names + left_join_on_names) and (
3475
+ (right_prefix + spark_column_name) in (right_as_of_names + right_join_on_names)
3476
+ ):
3477
+ pass
3478
+ else:
3479
+ col = col + left_suffix
3480
+ scol = scol.alias(col) # type: ignore[attr-defined]
3481
+ label = tuple([str(label[0]) + left_suffix] + list(label[1:]))
3482
+ exprs.append(scol)
3483
+ data_columns.append(col)
3484
+ column_labels.append(label)
3485
+ for label in right_internal.column_labels:
3486
+ # recover `right_prefix` here.
3487
+ col = right_internal.spark_column_name_for(label)[len(right_prefix) :]
3488
+ scol = right_scol_for(label).alias(col) # type: ignore[attr-defined]
3489
+ if label in duplicate_columns:
3490
+ spark_column_name = left_internal.spark_column_name_for(label)
3491
+ if spark_column_name in left_as_of_names + left_join_on_names and (
3492
+ (right_prefix + spark_column_name) in right_as_of_names + right_join_on_names
3493
+ ):
3494
+ continue
3495
+ else:
3496
+ col = col + right_suffix
3497
+ scol = scol.alias(col) # type: ignore[attr-defined]
3498
+ label = tuple([str(label[0]) + right_suffix] + list(label[1:]))
3499
+ exprs.append(scol)
3500
+ data_columns.append(col)
3501
+ column_labels.append(label)
3502
+
3503
+ # Retain indices if they are used for joining
3504
+ if left_index or right_index:
3505
+ index_spark_column_names = [
3506
+ SPARK_INDEX_NAME_FORMAT(i) for i in range(len(left_internal.index_spark_column_names))
3507
+ ]
3508
+ left_index_scols = [
3509
+ scol.alias(name)
3510
+ for scol, name in zip(left_internal.index_spark_columns, index_spark_column_names)
3511
+ ]
3512
+ exprs.extend(left_index_scols)
3513
+ index_names = left_internal.index_names
3514
+ else:
3515
+ index_spark_column_names = []
3516
+ index_names = []
3517
+
3518
+ selected_columns = as_of_joined_table.select(*exprs)
3519
+
3520
+ internal = InternalFrame(
3521
+ spark_frame=selected_columns,
3522
+ index_spark_columns=[scol_for(selected_columns, col) for col in index_spark_column_names],
3523
+ index_names=index_names,
3524
+ column_labels=column_labels,
3525
+ data_spark_columns=[scol_for(selected_columns, col) for col in data_columns],
3526
+ )
3527
+ return DataFrame(internal)
3528
+
3529
+
3530
+ @no_type_check
3531
+ def to_numeric(arg, errors="raise"):
3532
+ """
3533
+ Convert argument to a numeric type.
3534
+
3535
+ Parameters
3536
+ ----------
3537
+ arg : scalar, list, tuple, 1-d array, or Series
3538
+ Argument to be converted.
3539
+ errors : {'raise', 'coerce'}, default 'raise'
3540
+ * If 'coerce', then invalid parsing will be set as NaN.
3541
+ * If 'raise', then invalid parsing will raise an exception.
3542
+ * If 'ignore', then invalid parsing will return the input.
3543
+
3544
+ .. note:: 'ignore' doesn't work yet when `arg` is pandas-on-Spark Series.
3545
+
3546
+ Returns
3547
+ -------
3548
+ ret : numeric if parsing succeeded.
3549
+
3550
+ See Also
3551
+ --------
3552
+ DataFrame.astype : Cast argument to a specified dtype.
3553
+ to_datetime : Convert argument to datetime.
3554
+ to_timedelta : Convert argument to timedelta.
3555
+ numpy.ndarray.astype : Cast a numpy array to a specified type.
3556
+
3557
+ Examples
3558
+ --------
3559
+
3560
+ >>> psser = ps.Series(['1.0', '2', '-3'])
3561
+ >>> psser
3562
+ 0 1.0
3563
+ 1 2
3564
+ 2 -3
3565
+ dtype: object
3566
+
3567
+ >>> ps.to_numeric(psser)
3568
+ 0 1.0
3569
+ 1 2.0
3570
+ 2 -3.0
3571
+ dtype: float32
3572
+
3573
+     If the given Series contains values that cannot be cast to float, they are cast to `np.nan`
3574
+ when `errors` is set to "coerce".
3575
+
3576
+ >>> psser = ps.Series(['apple', '1.0', '2', '-3'])
3577
+ >>> psser
3578
+ 0 apple
3579
+ 1 1.0
3580
+ 2 2
3581
+ 3 -3
3582
+ dtype: object
3583
+
3584
+ >>> ps.to_numeric(psser, errors="coerce")
3585
+ 0 NaN
3586
+ 1 1.0
3587
+ 2 2.0
3588
+ 3 -3.0
3589
+ dtype: float32
3590
+
3591
+     Lists, tuples, NumPy arrays, and scalars are also supported.
3592
+
3593
+ >>> ps.to_numeric(['1.0', '2', '-3'])
3594
+ array([ 1., 2., -3.])
3595
+
3596
+ >>> ps.to_numeric(('1.0', '2', '-3'))
3597
+ array([ 1., 2., -3.])
3598
+
3599
+ >>> ps.to_numeric(np.array(['1.0', '2', '-3']))
3600
+ array([ 1., 2., -3.])
3601
+
3602
+ >>> ps.to_numeric('1.0')
3603
+ 1.0
3604
+ """
3605
+ if isinstance(arg, Series):
3606
+ if errors == "coerce":
3607
+ return arg._with_new_scol(arg.spark.column.cast("float"))
3608
+ elif errors == "raise":
3609
+ scol = arg.spark.column
3610
+ scol_casted = scol.cast("float")
3611
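+             # assert_true fails the query at execution time if any non-null value could
+             # not be cast to float; when every value is valid it returns null, so the
+             # when() condition holds and the casted column is returned.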
+ cond = F.when(
3612
+ F.assert_true(scol.isNull() | scol_casted.isNotNull()).isNull(), scol_casted
3613
+ )
3614
+ return arg._with_new_scol(cond)
3615
+ elif errors == "ignore":
3616
+ raise NotImplementedError("'ignore' is not implemented yet, when the `arg` is Series.")
3617
+ else:
3618
+ raise ValueError("invalid error value specified")
3619
+ else:
3620
+ return pd.to_numeric(arg, errors=errors)
3621
+
3622
+
3623
+ def broadcast(obj: DataFrame) -> DataFrame:
3624
+ """
3625
+ Marks a DataFrame as small enough for use in broadcast joins.
3626
+
3627
+ .. deprecated:: 3.2.0
3628
+ Use :func:`DataFrame.spark.hint` instead.
3629
+
3630
+ Parameters
3631
+ ----------
3632
+ obj : DataFrame
3633
+
3634
+ Returns
3635
+ -------
3636
+ ret : DataFrame with broadcast hint.
3637
+
3638
+ See Also
3639
+ --------
3640
+ DataFrame.merge : Merge DataFrame objects with a database-style join.
3641
+ DataFrame.join : Join columns of another DataFrame.
3642
+ DataFrame.update : Modify in place using non-NA values from another DataFrame.
3643
+ DataFrame.hint : Specifies some hint on the current DataFrame.
3644
+
3645
+ Examples
3646
+ --------
3647
+ >>> df1 = ps.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],
3648
+ ... 'value': [1, 2, 3, 5]},
3649
+ ... columns=['lkey', 'value']).set_index('lkey')
3650
+ >>> df2 = ps.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'],
3651
+ ... 'value': [5, 6, 7, 8]},
3652
+ ... columns=['rkey', 'value']).set_index('rkey')
3653
+ >>> merged = df1.merge(ps.broadcast(df2), left_index=True, right_index=True)
3654
+ >>> merged.spark.explain() # doctest: +ELLIPSIS
3655
+ == Physical Plan ==
3656
+ ...
3657
+ ...BroadcastHashJoin...
3658
+ ...
3659
+ """
3660
+ warnings.warn(
3661
+ "`broadcast` has been deprecated and might be removed in a future version. "
3662
+ "Use `DataFrame.spark.hint` with 'broadcast' for `name` parameter instead.",
3663
+ FutureWarning,
3664
+ )
3665
+ if not isinstance(obj, DataFrame):
3666
+ raise TypeError("Invalid type : expected DataFrame got {}".format(type(obj).__name__))
3667
+ return DataFrame(
3668
+ obj._internal.with_new_sdf(F.broadcast(obj._internal.resolved_copy.spark_frame))
3669
+ )
3670
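+ # Editor's note (illustrative sketch, not part of the original module): the deprecation
+ # warning above points to ``DataFrame.spark.hint``; the assumed replacement for the
+ # example in the docstring would be:
+ #
+ #     >>> merged = df1.merge(df2.spark.hint("broadcast"), left_index=True, right_index=True)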
+
3671
+
3672
+ def read_orc(
3673
+ path: str,
3674
+ columns: Optional[List[str]] = None,
3675
+ index_col: Optional[Union[str, List[str]]] = None,
3676
+ **options: Any,
3677
+ ) -> "DataFrame":
3678
+ """
3679
+ Load an ORC object from the file path, returning a DataFrame.
3680
+
3681
+ Parameters
3682
+ ----------
3683
+ path : str
3684
+ The path string storing the ORC file to be read.
3685
+ columns : list, default None
3686
+ If not None, only these columns will be read from the file.
3687
+ index_col : str or list of str, optional, default: None
3688
+ Index column of table in Spark.
3689
+ options : dict
3690
+ All other options passed directly into Spark's data source.
3691
+
3692
+ Returns
3693
+ -------
3694
+ DataFrame
3695
+
3696
+ Examples
3697
+ --------
3698
+ >>> ps.range(1).to_orc('%s/read_spark_io/data.orc' % path)
3699
+ >>> ps.read_orc('%s/read_spark_io/data.orc' % path, columns=['id'])
3700
+ id
3701
+ 0 0
3702
+
3703
+ You can preserve the index in the roundtrip as below.
3704
+
3705
+ >>> ps.range(1).to_orc('%s/read_spark_io/data.orc' % path, index_col="index")
3706
+ >>> ps.read_orc('%s/read_spark_io/data.orc' % path, columns=['id'], index_col="index")
3707
+ ... # doctest: +NORMALIZE_WHITESPACE
3708
+ id
3709
+ index
3710
+ 0 0
3711
+ """
3712
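+     # Callers may pass a single ``options={...}`` dict instead of keyword arguments;
+     # unwrap it so the nested dict is forwarded to the Spark reader directly.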
+ if "options" in options and isinstance(options.get("options"), dict) and len(options) == 1:
3713
+ options = options.get("options")
3714
+
3715
+ psdf = read_spark_io(path, format="orc", index_col=index_col, **options)
3716
+
3717
+ if columns is not None:
3718
+ psdf_columns = psdf.columns
3719
+ new_columns = list()
3720
+ for column in list(columns):
3721
+ if column in psdf_columns:
3722
+ new_columns.append(column)
3723
+ else:
3724
+ raise ValueError("Unknown column name '{}'".format(column))
3725
+ psdf = psdf[new_columns]
3726
+
3727
+ return psdf
3728
+
3729
+
3730
+ def _get_index_map(
3731
+ sdf: PySparkDataFrame, index_col: Optional[Union[str, List[str]]] = None
3732
+ ) -> Tuple[Optional[List[PySparkColumn]], Optional[List[Label]]]:
3733
+ index_spark_columns: Optional[List[PySparkColumn]]
3734
+ index_names: Optional[List[Label]]
3735
+ if index_col is not None:
3736
+ if isinstance(index_col, str):
3737
+ index_col = [index_col]
3738
+ sdf_columns = set(sdf.columns)
3739
+ for col in index_col:
3740
+ if col not in sdf_columns:
3741
+ raise KeyError(col)
3742
+ index_spark_columns = [scol_for(sdf, col) for col in index_col]
3743
+ index_names = [(col,) for col in index_col]
3744
+ else:
3745
+ index_spark_columns = None
3746
+ index_names = None
3747
+
3748
+ return index_spark_columns, index_names
3749
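+ # Editor's note (illustrative sketch, not part of the original module): assumed behaviour
+ # of ``_get_index_map`` based on the implementation above, with ``spark`` being an active
+ # SparkSession:
+ #
+ #     >>> sdf = spark.createDataFrame([(1, "a")], ["id", "val"])
+ #     >>> _, names = _get_index_map(sdf, index_col="id")
+ #     >>> names
+ #     [('id',)]
+ #     >>> _get_index_map(sdf)
+ #     (None, None)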
+
3750
+
3751
+ _get_dummies_default_accept_types = (DecimalType, StringType, DateType)
3752
+ _get_dummies_acceptable_types = _get_dummies_default_accept_types + (
3753
+ ByteType,
3754
+ ShortType,
3755
+ IntegerType,
3756
+ LongType,
3757
+ FloatType,
3758
+ DoubleType,
3759
+ BooleanType,
3760
+ TimestampType,
3761
+ TimestampNTZType,
3762
+ )
3763
+
3764
+
3765
+ def _test() -> None:
3766
+ import os
3767
+ import doctest
3768
+ import shutil
3769
+ import sys
3770
+ import tempfile
3771
+ import uuid
3772
+ from pyspark.sql import SparkSession
3773
+ import pyspark.pandas.namespace
3774
+
3775
+ os.chdir(os.environ["SPARK_HOME"])
3776
+
3777
+ globs = pyspark.pandas.namespace.__dict__.copy()
3778
+ globs["ps"] = pyspark.pandas
3779
+ globs["sf"] = F
3780
+ spark = (
3781
+ SparkSession.builder.master("local[4]")
3782
+ .appName("pyspark.pandas.namespace tests")
3783
+ .getOrCreate()
3784
+ )
3785
+
3786
+ db_name = "db%s" % str(uuid.uuid4()).replace("-", "")
3787
+ spark.sql("CREATE DATABASE %s" % db_name)
3788
+ globs["db"] = db_name
3789
+
3790
+ path = tempfile.mkdtemp()
3791
+ globs["path"] = path
3792
+
3793
+ (failure_count, test_count) = doctest.testmod(
3794
+ pyspark.pandas.namespace,
3795
+ globs=globs,
3796
+ optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
3797
+ )
3798
+
3799
+ shutil.rmtree(path, ignore_errors=True)
3800
+ spark.sql("DROP DATABASE IF EXISTS %s CASCADE" % db_name)
3801
+ spark.stop()
3802
+ if failure_count:
3803
+ sys.exit(-1)
3804
+
3805
+
3806
+ if __name__ == "__main__":
3807
+ _test()