snowpark_connect-0.20.2-py3-none-any.whl

This diff shows the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
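A wheel is an ordinary zip archive, so a file listing like the one below can be reproduced locally. Here is a minimal sketch, assuming the wheel has already been downloaded to the current directory (the exact filename is an assumption; adjust it to the file you fetched):

    # Minimal sketch: list the files bundled inside a downloaded wheel.
    # The filename below is an assumption based on the release shown here.
    import zipfile

    with zipfile.ZipFile("snowpark_connect-0.20.2-py3-none-any.whl") as whl:
        for info in whl.infolist():
            # Print each member's uncompressed size and path, similar to
            # the per-file listing on this page.
            print(f"{info.file_size:>10}  {info.filename}")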

Potentially problematic release: this version of snowpark-connect might be problematic.

Files changed (879)
  1. snowflake/snowpark_connect/__init__.py +23 -0
  2. snowflake/snowpark_connect/analyze_plan/__init__.py +3 -0
  3. snowflake/snowpark_connect/analyze_plan/map_tree_string.py +38 -0
  4. snowflake/snowpark_connect/column_name_handler.py +735 -0
  5. snowflake/snowpark_connect/config.py +576 -0
  6. snowflake/snowpark_connect/constants.py +47 -0
  7. snowflake/snowpark_connect/control_server.py +52 -0
  8. snowflake/snowpark_connect/dataframe_name_handler.py +54 -0
  9. snowflake/snowpark_connect/date_time_format_mapping.py +399 -0
  10. snowflake/snowpark_connect/empty_dataframe.py +18 -0
  11. snowflake/snowpark_connect/error/__init__.py +11 -0
  12. snowflake/snowpark_connect/error/error_mapping.py +6174 -0
  13. snowflake/snowpark_connect/error/error_utils.py +321 -0
  14. snowflake/snowpark_connect/error/exceptions.py +24 -0
  15. snowflake/snowpark_connect/execute_plan/__init__.py +3 -0
  16. snowflake/snowpark_connect/execute_plan/map_execution_command.py +204 -0
  17. snowflake/snowpark_connect/execute_plan/map_execution_root.py +173 -0
  18. snowflake/snowpark_connect/execute_plan/utils.py +183 -0
  19. snowflake/snowpark_connect/expression/__init__.py +3 -0
  20. snowflake/snowpark_connect/expression/literal.py +90 -0
  21. snowflake/snowpark_connect/expression/map_cast.py +343 -0
  22. snowflake/snowpark_connect/expression/map_expression.py +293 -0
  23. snowflake/snowpark_connect/expression/map_extension.py +104 -0
  24. snowflake/snowpark_connect/expression/map_sql_expression.py +633 -0
  25. snowflake/snowpark_connect/expression/map_udf.py +142 -0
  26. snowflake/snowpark_connect/expression/map_unresolved_attribute.py +241 -0
  27. snowflake/snowpark_connect/expression/map_unresolved_extract_value.py +85 -0
  28. snowflake/snowpark_connect/expression/map_unresolved_function.py +9450 -0
  29. snowflake/snowpark_connect/expression/map_unresolved_star.py +218 -0
  30. snowflake/snowpark_connect/expression/map_update_fields.py +164 -0
  31. snowflake/snowpark_connect/expression/map_window_function.py +258 -0
  32. snowflake/snowpark_connect/expression/typer.py +125 -0
  33. snowflake/snowpark_connect/includes/__init__.py +0 -0
  34. snowflake/snowpark_connect/includes/jars/antlr4-runtime-4.9.3.jar +0 -0
  35. snowflake/snowpark_connect/includes/jars/commons-cli-1.5.0.jar +0 -0
  36. snowflake/snowpark_connect/includes/jars/commons-codec-1.16.1.jar +0 -0
  37. snowflake/snowpark_connect/includes/jars/commons-collections-3.2.2.jar +0 -0
  38. snowflake/snowpark_connect/includes/jars/commons-collections4-4.4.jar +0 -0
  39. snowflake/snowpark_connect/includes/jars/commons-compiler-3.1.9.jar +0 -0
  40. snowflake/snowpark_connect/includes/jars/commons-compress-1.26.0.jar +0 -0
  41. snowflake/snowpark_connect/includes/jars/commons-crypto-1.1.0.jar +0 -0
  42. snowflake/snowpark_connect/includes/jars/commons-dbcp-1.4.jar +0 -0
  43. snowflake/snowpark_connect/includes/jars/commons-io-2.16.1.jar +0 -0
  44. snowflake/snowpark_connect/includes/jars/commons-lang-2.6.jar +0 -0
  45. snowflake/snowpark_connect/includes/jars/commons-lang3-3.12.0.jar +0 -0
  46. snowflake/snowpark_connect/includes/jars/commons-logging-1.1.3.jar +0 -0
  47. snowflake/snowpark_connect/includes/jars/commons-math3-3.6.1.jar +0 -0
  48. snowflake/snowpark_connect/includes/jars/commons-pool-1.5.4.jar +0 -0
  49. snowflake/snowpark_connect/includes/jars/commons-text-1.10.0.jar +0 -0
  50. snowflake/snowpark_connect/includes/jars/hadoop-client-api-3.3.4.jar +0 -0
  51. snowflake/snowpark_connect/includes/jars/jackson-annotations-2.15.2.jar +0 -0
  52. snowflake/snowpark_connect/includes/jars/jackson-core-2.15.2.jar +0 -0
  53. snowflake/snowpark_connect/includes/jars/jackson-core-asl-1.9.13.jar +0 -0
  54. snowflake/snowpark_connect/includes/jars/jackson-databind-2.15.2.jar +0 -0
  55. snowflake/snowpark_connect/includes/jars/jackson-dataformat-yaml-2.15.2.jar +0 -0
  56. snowflake/snowpark_connect/includes/jars/jackson-datatype-jsr310-2.15.2.jar +0 -0
  57. snowflake/snowpark_connect/includes/jars/jackson-mapper-asl-1.9.13.jar +0 -0
  58. snowflake/snowpark_connect/includes/jars/jackson-module-scala_2.12-2.15.2.jar +0 -0
  59. snowflake/snowpark_connect/includes/jars/json4s-ast_2.12-3.7.0-M11.jar +0 -0
  60. snowflake/snowpark_connect/includes/jars/json4s-core_2.12-3.7.0-M11.jar +0 -0
  61. snowflake/snowpark_connect/includes/jars/json4s-jackson_2.12-3.7.0-M11.jar +0 -0
  62. snowflake/snowpark_connect/includes/jars/json4s-scalap_2.12-3.7.0-M11.jar +0 -0
  63. snowflake/snowpark_connect/includes/jars/kryo-shaded-4.0.2.jar +0 -0
  64. snowflake/snowpark_connect/includes/jars/log4j-1.2-api-2.20.0.jar +0 -0
  65. snowflake/snowpark_connect/includes/jars/log4j-api-2.20.0.jar +0 -0
  66. snowflake/snowpark_connect/includes/jars/log4j-core-2.20.0.jar +0 -0
  67. snowflake/snowpark_connect/includes/jars/log4j-slf4j2-impl-2.20.0.jar +0 -0
  68. snowflake/snowpark_connect/includes/jars/paranamer-2.8.jar +0 -0
  69. snowflake/snowpark_connect/includes/jars/scala-collection-compat_2.12-2.7.0.jar +0 -0
  70. snowflake/snowpark_connect/includes/jars/scala-compiler-2.12.18.jar +0 -0
  71. snowflake/snowpark_connect/includes/jars/scala-library-2.12.18.jar +0 -0
  72. snowflake/snowpark_connect/includes/jars/scala-parser-combinators_2.12-2.3.0.jar +0 -0
  73. snowflake/snowpark_connect/includes/jars/scala-reflect-2.12.18.jar +0 -0
  74. snowflake/snowpark_connect/includes/jars/scala-xml_2.12-2.1.0.jar +0 -0
  75. snowflake/snowpark_connect/includes/jars/slf4j-api-2.0.7.jar +0 -0
  76. snowflake/snowpark_connect/includes/jars/spark-catalyst_2.12-3.5.6.jar +0 -0
  77. snowflake/snowpark_connect/includes/jars/spark-common-utils_2.12-3.5.6.jar +0 -0
  78. snowflake/snowpark_connect/includes/jars/spark-core_2.12-3.5.6.jar +0 -0
  79. snowflake/snowpark_connect/includes/jars/spark-graphx_2.12-3.5.6.jar +0 -0
  80. snowflake/snowpark_connect/includes/jars/spark-hive-thriftserver_2.12-3.5.6.jar +0 -0
  81. snowflake/snowpark_connect/includes/jars/spark-hive_2.12-3.5.6.jar +0 -0
  82. snowflake/snowpark_connect/includes/jars/spark-kubernetes_2.12-3.5.6.jar +0 -0
  83. snowflake/snowpark_connect/includes/jars/spark-kvstore_2.12-3.5.6.jar +0 -0
  84. snowflake/snowpark_connect/includes/jars/spark-launcher_2.12-3.5.6.jar +0 -0
  85. snowflake/snowpark_connect/includes/jars/spark-mesos_2.12-3.5.6.jar +0 -0
  86. snowflake/snowpark_connect/includes/jars/spark-mllib-local_2.12-3.5.6.jar +0 -0
  87. snowflake/snowpark_connect/includes/jars/spark-mllib_2.12-3.5.6.jar +0 -0
  88. snowflake/snowpark_connect/includes/jars/spark-network-common_2.12-3.5.6.jar +0 -0
  89. snowflake/snowpark_connect/includes/jars/spark-network-shuffle_2.12-3.5.6.jar +0 -0
  90. snowflake/snowpark_connect/includes/jars/spark-repl_2.12-3.5.6.jar +0 -0
  91. snowflake/snowpark_connect/includes/jars/spark-sketch_2.12-3.5.6.jar +0 -0
  92. snowflake/snowpark_connect/includes/jars/spark-sql-api_2.12-3.5.6.jar +0 -0
  93. snowflake/snowpark_connect/includes/jars/spark-sql_2.12-3.5.6.jar +0 -0
  94. snowflake/snowpark_connect/includes/jars/spark-streaming_2.12-3.5.6.jar +0 -0
  95. snowflake/snowpark_connect/includes/jars/spark-tags_2.12-3.5.6.jar +0 -0
  96. snowflake/snowpark_connect/includes/jars/spark-unsafe_2.12-3.5.6.jar +0 -0
  97. snowflake/snowpark_connect/includes/jars/spark-yarn_2.12-3.5.6.jar +0 -0
  98. snowflake/snowpark_connect/includes/python/__init__.py +21 -0
  99. snowflake/snowpark_connect/includes/python/pyspark/__init__.py +173 -0
  100. snowflake/snowpark_connect/includes/python/pyspark/_globals.py +71 -0
  101. snowflake/snowpark_connect/includes/python/pyspark/_typing.pyi +43 -0
  102. snowflake/snowpark_connect/includes/python/pyspark/accumulators.py +341 -0
  103. snowflake/snowpark_connect/includes/python/pyspark/broadcast.py +383 -0
  104. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/__init__.py +8 -0
  105. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/cloudpickle.py +948 -0
  106. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/cloudpickle_fast.py +844 -0
  107. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/compat.py +18 -0
  108. snowflake/snowpark_connect/includes/python/pyspark/conf.py +276 -0
  109. snowflake/snowpark_connect/includes/python/pyspark/context.py +2601 -0
  110. snowflake/snowpark_connect/includes/python/pyspark/daemon.py +218 -0
  111. snowflake/snowpark_connect/includes/python/pyspark/errors/__init__.py +70 -0
  112. snowflake/snowpark_connect/includes/python/pyspark/errors/error_classes.py +889 -0
  113. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/__init__.py +16 -0
  114. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/base.py +228 -0
  115. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/captured.py +307 -0
  116. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/connect.py +190 -0
  117. snowflake/snowpark_connect/includes/python/pyspark/errors/tests/__init__.py +16 -0
  118. snowflake/snowpark_connect/includes/python/pyspark/errors/tests/test_errors.py +60 -0
  119. snowflake/snowpark_connect/includes/python/pyspark/errors/utils.py +116 -0
  120. snowflake/snowpark_connect/includes/python/pyspark/files.py +165 -0
  121. snowflake/snowpark_connect/includes/python/pyspark/find_spark_home.py +95 -0
  122. snowflake/snowpark_connect/includes/python/pyspark/install.py +203 -0
  123. snowflake/snowpark_connect/includes/python/pyspark/instrumentation_utils.py +190 -0
  124. snowflake/snowpark_connect/includes/python/pyspark/java_gateway.py +248 -0
  125. snowflake/snowpark_connect/includes/python/pyspark/join.py +118 -0
  126. snowflake/snowpark_connect/includes/python/pyspark/ml/__init__.py +71 -0
  127. snowflake/snowpark_connect/includes/python/pyspark/ml/_typing.pyi +84 -0
  128. snowflake/snowpark_connect/includes/python/pyspark/ml/base.py +414 -0
  129. snowflake/snowpark_connect/includes/python/pyspark/ml/classification.py +4332 -0
  130. snowflake/snowpark_connect/includes/python/pyspark/ml/clustering.py +2188 -0
  131. snowflake/snowpark_connect/includes/python/pyspark/ml/common.py +146 -0
  132. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/__init__.py +44 -0
  133. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/base.py +346 -0
  134. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/classification.py +382 -0
  135. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/evaluation.py +291 -0
  136. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/feature.py +258 -0
  137. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/functions.py +77 -0
  138. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/io_utils.py +335 -0
  139. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/pipeline.py +262 -0
  140. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/summarizer.py +120 -0
  141. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/tuning.py +579 -0
  142. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/util.py +173 -0
  143. snowflake/snowpark_connect/includes/python/pyspark/ml/deepspeed/__init__.py +16 -0
  144. snowflake/snowpark_connect/includes/python/pyspark/ml/deepspeed/deepspeed_distributor.py +165 -0
  145. snowflake/snowpark_connect/includes/python/pyspark/ml/deepspeed/tests/test_deepspeed_distributor.py +306 -0
  146. snowflake/snowpark_connect/includes/python/pyspark/ml/dl_util.py +150 -0
  147. snowflake/snowpark_connect/includes/python/pyspark/ml/evaluation.py +1166 -0
  148. snowflake/snowpark_connect/includes/python/pyspark/ml/feature.py +7474 -0
  149. snowflake/snowpark_connect/includes/python/pyspark/ml/fpm.py +543 -0
  150. snowflake/snowpark_connect/includes/python/pyspark/ml/functions.py +842 -0
  151. snowflake/snowpark_connect/includes/python/pyspark/ml/image.py +271 -0
  152. snowflake/snowpark_connect/includes/python/pyspark/ml/linalg/__init__.py +1382 -0
  153. snowflake/snowpark_connect/includes/python/pyspark/ml/model_cache.py +55 -0
  154. snowflake/snowpark_connect/includes/python/pyspark/ml/param/__init__.py +602 -0
  155. snowflake/snowpark_connect/includes/python/pyspark/ml/param/_shared_params_code_gen.py +368 -0
  156. snowflake/snowpark_connect/includes/python/pyspark/ml/param/shared.py +878 -0
  157. snowflake/snowpark_connect/includes/python/pyspark/ml/pipeline.py +451 -0
  158. snowflake/snowpark_connect/includes/python/pyspark/ml/recommendation.py +748 -0
  159. snowflake/snowpark_connect/includes/python/pyspark/ml/regression.py +3335 -0
  160. snowflake/snowpark_connect/includes/python/pyspark/ml/stat.py +523 -0
  161. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/__init__.py +16 -0
  162. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_classification.py +53 -0
  163. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_evaluation.py +50 -0
  164. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_feature.py +43 -0
  165. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_function.py +114 -0
  166. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_pipeline.py +47 -0
  167. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_summarizer.py +43 -0
  168. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_tuning.py +46 -0
  169. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_classification.py +238 -0
  170. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_evaluation.py +194 -0
  171. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_feature.py +156 -0
  172. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_pipeline.py +184 -0
  173. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_summarizer.py +78 -0
  174. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_tuning.py +292 -0
  175. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_parity_torch_data_loader.py +50 -0
  176. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_parity_torch_distributor.py +152 -0
  177. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_algorithms.py +456 -0
  178. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_base.py +96 -0
  179. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_dl_util.py +186 -0
  180. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_evaluation.py +77 -0
  181. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_feature.py +401 -0
  182. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_functions.py +528 -0
  183. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_image.py +82 -0
  184. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_linalg.py +409 -0
  185. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_model_cache.py +55 -0
  186. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_param.py +441 -0
  187. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_persistence.py +546 -0
  188. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_pipeline.py +71 -0
  189. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_stat.py +52 -0
  190. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_training_summary.py +494 -0
  191. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_util.py +85 -0
  192. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_wrapper.py +138 -0
  193. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/__init__.py +16 -0
  194. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_basic.py +151 -0
  195. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_nested.py +97 -0
  196. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_pipeline.py +143 -0
  197. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tuning.py +551 -0
  198. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_basic.py +137 -0
  199. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_nested.py +96 -0
  200. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_pipeline.py +142 -0
  201. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/__init__.py +16 -0
  202. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/data.py +100 -0
  203. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/distributor.py +1133 -0
  204. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/log_communication.py +198 -0
  205. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/__init__.py +16 -0
  206. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_data_loader.py +137 -0
  207. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_distributor.py +561 -0
  208. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_log_communication.py +172 -0
  209. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/torch_run_process_wrapper.py +83 -0
  210. snowflake/snowpark_connect/includes/python/pyspark/ml/tree.py +434 -0
  211. snowflake/snowpark_connect/includes/python/pyspark/ml/tuning.py +1741 -0
  212. snowflake/snowpark_connect/includes/python/pyspark/ml/util.py +749 -0
  213. snowflake/snowpark_connect/includes/python/pyspark/ml/wrapper.py +465 -0
  214. snowflake/snowpark_connect/includes/python/pyspark/mllib/__init__.py +44 -0
  215. snowflake/snowpark_connect/includes/python/pyspark/mllib/_typing.pyi +33 -0
  216. snowflake/snowpark_connect/includes/python/pyspark/mllib/classification.py +989 -0
  217. snowflake/snowpark_connect/includes/python/pyspark/mllib/clustering.py +1318 -0
  218. snowflake/snowpark_connect/includes/python/pyspark/mllib/common.py +174 -0
  219. snowflake/snowpark_connect/includes/python/pyspark/mllib/evaluation.py +691 -0
  220. snowflake/snowpark_connect/includes/python/pyspark/mllib/feature.py +1085 -0
  221. snowflake/snowpark_connect/includes/python/pyspark/mllib/fpm.py +233 -0
  222. snowflake/snowpark_connect/includes/python/pyspark/mllib/linalg/__init__.py +1653 -0
  223. snowflake/snowpark_connect/includes/python/pyspark/mllib/linalg/distributed.py +1662 -0
  224. snowflake/snowpark_connect/includes/python/pyspark/mllib/random.py +698 -0
  225. snowflake/snowpark_connect/includes/python/pyspark/mllib/recommendation.py +389 -0
  226. snowflake/snowpark_connect/includes/python/pyspark/mllib/regression.py +1067 -0
  227. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/KernelDensity.py +59 -0
  228. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/__init__.py +34 -0
  229. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/_statistics.py +409 -0
  230. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/distribution.py +39 -0
  231. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/test.py +86 -0
  232. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/__init__.py +16 -0
  233. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_algorithms.py +353 -0
  234. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_feature.py +192 -0
  235. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_linalg.py +680 -0
  236. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_stat.py +206 -0
  237. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_streaming_algorithms.py +471 -0
  238. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_util.py +108 -0
  239. snowflake/snowpark_connect/includes/python/pyspark/mllib/tree.py +888 -0
  240. snowflake/snowpark_connect/includes/python/pyspark/mllib/util.py +659 -0
  241. snowflake/snowpark_connect/includes/python/pyspark/pandas/__init__.py +165 -0
  242. snowflake/snowpark_connect/includes/python/pyspark/pandas/_typing.py +52 -0
  243. snowflake/snowpark_connect/includes/python/pyspark/pandas/accessors.py +989 -0
  244. snowflake/snowpark_connect/includes/python/pyspark/pandas/base.py +1804 -0
  245. snowflake/snowpark_connect/includes/python/pyspark/pandas/categorical.py +822 -0
  246. snowflake/snowpark_connect/includes/python/pyspark/pandas/config.py +539 -0
  247. snowflake/snowpark_connect/includes/python/pyspark/pandas/correlation.py +262 -0
  248. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/__init__.py +16 -0
  249. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/base.py +519 -0
  250. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/binary_ops.py +98 -0
  251. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/boolean_ops.py +426 -0
  252. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/categorical_ops.py +141 -0
  253. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/complex_ops.py +145 -0
  254. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/date_ops.py +127 -0
  255. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/datetime_ops.py +171 -0
  256. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/null_ops.py +83 -0
  257. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/num_ops.py +588 -0
  258. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/string_ops.py +154 -0
  259. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/timedelta_ops.py +101 -0
  260. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/udt_ops.py +29 -0
  261. snowflake/snowpark_connect/includes/python/pyspark/pandas/datetimes.py +891 -0
  262. snowflake/snowpark_connect/includes/python/pyspark/pandas/exceptions.py +150 -0
  263. snowflake/snowpark_connect/includes/python/pyspark/pandas/extensions.py +388 -0
  264. snowflake/snowpark_connect/includes/python/pyspark/pandas/frame.py +13738 -0
  265. snowflake/snowpark_connect/includes/python/pyspark/pandas/generic.py +3560 -0
  266. snowflake/snowpark_connect/includes/python/pyspark/pandas/groupby.py +4448 -0
  267. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/__init__.py +21 -0
  268. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/base.py +2783 -0
  269. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/category.py +773 -0
  270. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/datetimes.py +843 -0
  271. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/multi.py +1323 -0
  272. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/numeric.py +210 -0
  273. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/timedelta.py +197 -0
  274. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexing.py +1862 -0
  275. snowflake/snowpark_connect/includes/python/pyspark/pandas/internal.py +1680 -0
  276. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/__init__.py +48 -0
  277. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/common.py +76 -0
  278. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/frame.py +63 -0
  279. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/general_functions.py +43 -0
  280. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/groupby.py +93 -0
  281. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/indexes.py +184 -0
  282. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/resample.py +101 -0
  283. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/scalars.py +29 -0
  284. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/series.py +69 -0
  285. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/window.py +168 -0
  286. snowflake/snowpark_connect/includes/python/pyspark/pandas/mlflow.py +238 -0
  287. snowflake/snowpark_connect/includes/python/pyspark/pandas/namespace.py +3807 -0
  288. snowflake/snowpark_connect/includes/python/pyspark/pandas/numpy_compat.py +260 -0
  289. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/__init__.py +17 -0
  290. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/core.py +1213 -0
  291. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/matplotlib.py +928 -0
  292. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/plotly.py +261 -0
  293. snowflake/snowpark_connect/includes/python/pyspark/pandas/resample.py +816 -0
  294. snowflake/snowpark_connect/includes/python/pyspark/pandas/series.py +7440 -0
  295. snowflake/snowpark_connect/includes/python/pyspark/pandas/sql_formatter.py +308 -0
  296. snowflake/snowpark_connect/includes/python/pyspark/pandas/sql_processor.py +394 -0
  297. snowflake/snowpark_connect/includes/python/pyspark/pandas/strings.py +2371 -0
  298. snowflake/snowpark_connect/includes/python/pyspark/pandas/supported_api_gen.py +378 -0
  299. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/__init__.py +16 -0
  300. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/__init__.py +16 -0
  301. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_any_all.py +177 -0
  302. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_apply_func.py +575 -0
  303. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_binary_ops.py +235 -0
  304. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_combine.py +653 -0
  305. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_compute.py +463 -0
  306. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_corrwith.py +86 -0
  307. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_cov.py +151 -0
  308. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_cumulative.py +139 -0
  309. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_describe.py +458 -0
  310. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_eval.py +86 -0
  311. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_melt.py +202 -0
  312. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_missing_data.py +520 -0
  313. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_pivot.py +361 -0
  314. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/__init__.py +16 -0
  315. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/__init__.py +16 -0
  316. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_any_all.py +40 -0
  317. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_apply_func.py +42 -0
  318. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_binary_ops.py +40 -0
  319. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_combine.py +37 -0
  320. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_compute.py +60 -0
  321. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_corrwith.py +40 -0
  322. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_cov.py +40 -0
  323. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_cumulative.py +90 -0
  324. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_describe.py +40 -0
  325. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_eval.py +40 -0
  326. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_melt.py +40 -0
  327. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_missing_data.py +42 -0
  328. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_pivot.py +37 -0
  329. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/__init__.py +16 -0
  330. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_base.py +36 -0
  331. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_binary_ops.py +42 -0
  332. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_boolean_ops.py +47 -0
  333. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_categorical_ops.py +55 -0
  334. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_complex_ops.py +40 -0
  335. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_date_ops.py +47 -0
  336. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_datetime_ops.py +47 -0
  337. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_null_ops.py +42 -0
  338. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_arithmetic.py +43 -0
  339. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_ops.py +47 -0
  340. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_reverse.py +43 -0
  341. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_string_ops.py +47 -0
  342. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_timedelta_ops.py +47 -0
  343. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_udt_ops.py +40 -0
  344. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/testing_utils.py +226 -0
  345. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/__init__.py +16 -0
  346. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_align.py +39 -0
  347. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_basic_slow.py +55 -0
  348. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_cov_corrwith.py +39 -0
  349. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_dot_frame.py +39 -0
  350. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_dot_series.py +39 -0
  351. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_index.py +39 -0
  352. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_series.py +39 -0
  353. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_setitem_frame.py +43 -0
  354. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_setitem_series.py +43 -0
  355. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/__init__.py +16 -0
  356. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_attrs.py +40 -0
  357. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_constructor.py +39 -0
  358. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_conversion.py +42 -0
  359. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_reindexing.py +42 -0
  360. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_reshaping.py +37 -0
  361. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_spark.py +40 -0
  362. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_take.py +42 -0
  363. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_time_series.py +48 -0
  364. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_truncate.py +40 -0
  365. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/__init__.py +16 -0
  366. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_aggregate.py +40 -0
  367. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_apply_func.py +41 -0
  368. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_cumulative.py +67 -0
  369. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_describe.py +40 -0
  370. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_groupby.py +55 -0
  371. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_head_tail.py +40 -0
  372. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_index.py +38 -0
  373. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_missing_data.py +55 -0
  374. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py +39 -0
  375. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_stat.py +38 -0
  376. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/__init__.py +16 -0
  377. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_align.py +40 -0
  378. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_base.py +50 -0
  379. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_category.py +73 -0
  380. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_datetime.py +39 -0
  381. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_indexing.py +40 -0
  382. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_reindex.py +40 -0
  383. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_rename.py +40 -0
  384. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_reset_index.py +48 -0
  385. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_timedelta.py +39 -0
  386. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/io/__init__.py +16 -0
  387. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/io/test_parity_io.py +40 -0
  388. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/__init__.py +16 -0
  389. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot.py +45 -0
  390. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot_matplotlib.py +45 -0
  391. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot_plotly.py +49 -0
  392. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot.py +37 -0
  393. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot_matplotlib.py +53 -0
  394. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot_plotly.py +45 -0
  395. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/__init__.py +16 -0
  396. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_all_any.py +38 -0
  397. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_arg_ops.py +37 -0
  398. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_as_of.py +37 -0
  399. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_as_type.py +38 -0
  400. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_compute.py +37 -0
  401. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_conversion.py +40 -0
  402. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_cumulative.py +40 -0
  403. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_index.py +38 -0
  404. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_missing_data.py +40 -0
  405. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_series.py +37 -0
  406. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_sort.py +38 -0
  407. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_stat.py +38 -0
  408. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_categorical.py +66 -0
  409. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_config.py +37 -0
  410. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_csv.py +37 -0
  411. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_dataframe_conversion.py +42 -0
  412. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_dataframe_spark_io.py +39 -0
  413. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_default_index.py +49 -0
  414. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ewm.py +37 -0
  415. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_expanding.py +39 -0
  416. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_extension.py +49 -0
  417. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_frame_spark.py +53 -0
  418. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_generic_functions.py +43 -0
  419. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_indexing.py +49 -0
  420. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_indexops_spark.py +39 -0
  421. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_internal.py +41 -0
  422. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_namespace.py +39 -0
  423. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_numpy_compat.py +60 -0
  424. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames.py +48 -0
  425. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby.py +39 -0
  426. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_expanding.py +44 -0
  427. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_rolling.py +84 -0
  428. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_repr.py +37 -0
  429. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_resample.py +45 -0
  430. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_reshape.py +39 -0
  431. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_rolling.py +39 -0
  432. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_scalars.py +37 -0
  433. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_conversion.py +39 -0
  434. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_datetime.py +39 -0
  435. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_string.py +39 -0
  436. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_spark_functions.py +39 -0
  437. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_sql.py +43 -0
  438. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_stats.py +37 -0
  439. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_typedef.py +36 -0
  440. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_utils.py +37 -0
  441. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_window.py +39 -0
  442. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/__init__.py +16 -0
  443. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_base.py +107 -0
  444. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_binary_ops.py +224 -0
  445. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py +825 -0
  446. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_categorical_ops.py +562 -0
  447. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_complex_ops.py +368 -0
  448. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_date_ops.py +257 -0
  449. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_datetime_ops.py +260 -0
  450. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_null_ops.py +178 -0
  451. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_arithmetic.py +184 -0
  452. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py +497 -0
  453. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_reverse.py +140 -0
  454. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_string_ops.py +354 -0
  455. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_timedelta_ops.py +219 -0
  456. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_udt_ops.py +192 -0
  457. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/testing_utils.py +228 -0
  458. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/__init__.py +16 -0
  459. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_align.py +118 -0
  460. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_basic_slow.py +198 -0
  461. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_cov_corrwith.py +181 -0
  462. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_dot_frame.py +103 -0
  463. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_dot_series.py +141 -0
  464. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_index.py +109 -0
  465. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_series.py +136 -0
  466. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_setitem_frame.py +125 -0
  467. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_setitem_series.py +217 -0
  468. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/__init__.py +16 -0
  469. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_attrs.py +384 -0
  470. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_constructor.py +598 -0
  471. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_conversion.py +73 -0
  472. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_reindexing.py +869 -0
  473. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_reshaping.py +487 -0
  474. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_spark.py +309 -0
  475. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_take.py +156 -0
  476. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_time_series.py +149 -0
  477. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_truncate.py +163 -0
  478. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/__init__.py +16 -0
  479. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_aggregate.py +311 -0
  480. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_apply_func.py +524 -0
  481. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_cumulative.py +419 -0
  482. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_describe.py +144 -0
  483. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_groupby.py +979 -0
  484. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_head_tail.py +234 -0
  485. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_index.py +206 -0
  486. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_missing_data.py +421 -0
  487. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_split_apply.py +187 -0
  488. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_stat.py +397 -0
  489. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/__init__.py +16 -0
  490. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_align.py +100 -0
  491. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_base.py +2743 -0
  492. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_category.py +484 -0
  493. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_datetime.py +276 -0
  494. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_indexing.py +432 -0
  495. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_reindex.py +310 -0
  496. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_rename.py +257 -0
  497. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_reset_index.py +160 -0
  498. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_timedelta.py +128 -0
  499. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/io/__init__.py +16 -0
  500. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/io/test_io.py +137 -0
  501. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/__init__.py +16 -0
  502. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot.py +170 -0
  503. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot_matplotlib.py +547 -0
  504. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot_plotly.py +285 -0
  505. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot.py +106 -0
  506. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot_matplotlib.py +409 -0
  507. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot_plotly.py +247 -0
  508. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/__init__.py +16 -0
  509. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_all_any.py +105 -0
  510. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_arg_ops.py +197 -0
  511. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_as_of.py +137 -0
  512. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_as_type.py +227 -0
  513. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_compute.py +634 -0
  514. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_conversion.py +88 -0
  515. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_cumulative.py +139 -0
  516. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_index.py +475 -0
  517. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_missing_data.py +265 -0
  518. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_series.py +818 -0
  519. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_sort.py +162 -0
  520. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_stat.py +780 -0
  521. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_categorical.py +741 -0
  522. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_config.py +160 -0
  523. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_csv.py +453 -0
  524. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_dataframe_conversion.py +281 -0
  525. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_dataframe_spark_io.py +487 -0
  526. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_default_index.py +109 -0
  527. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ewm.py +434 -0
  528. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_expanding.py +253 -0
  529. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_extension.py +152 -0
  530. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_frame_spark.py +162 -0
  531. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_generic_functions.py +234 -0
  532. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_indexing.py +1339 -0
  533. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_indexops_spark.py +82 -0
  534. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_internal.py +124 -0
  535. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_namespace.py +638 -0
  536. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_numpy_compat.py +200 -0
  537. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames.py +1355 -0
  538. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py +655 -0
  539. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_expanding.py +113 -0
  540. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_rolling.py +118 -0
  541. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_repr.py +192 -0
  542. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_resample.py +346 -0
  543. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_reshape.py +495 -0
  544. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_rolling.py +263 -0
  545. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_scalars.py +59 -0
  546. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_conversion.py +85 -0
  547. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_datetime.py +364 -0
  548. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_string.py +362 -0
  549. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_spark_functions.py +46 -0
  550. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_sql.py +123 -0
  551. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_stats.py +581 -0
  552. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_typedef.py +447 -0
  553. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_utils.py +301 -0
  554. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_window.py +465 -0
  555. snowflake/snowpark_connect/includes/python/pyspark/pandas/typedef/__init__.py +18 -0
  556. snowflake/snowpark_connect/includes/python/pyspark/pandas/typedef/typehints.py +874 -0
  557. snowflake/snowpark_connect/includes/python/pyspark/pandas/usage_logging/__init__.py +143 -0
  558. snowflake/snowpark_connect/includes/python/pyspark/pandas/usage_logging/usage_logger.py +132 -0
  559. snowflake/snowpark_connect/includes/python/pyspark/pandas/utils.py +1063 -0
  560. snowflake/snowpark_connect/includes/python/pyspark/pandas/window.py +2702 -0
  561. snowflake/snowpark_connect/includes/python/pyspark/profiler.py +489 -0
  562. snowflake/snowpark_connect/includes/python/pyspark/py.typed +1 -0
  563. snowflake/snowpark_connect/includes/python/pyspark/python/pyspark/shell.py +123 -0
  564. snowflake/snowpark_connect/includes/python/pyspark/rdd.py +5518 -0
  565. snowflake/snowpark_connect/includes/python/pyspark/rddsampler.py +115 -0
  566. snowflake/snowpark_connect/includes/python/pyspark/resource/__init__.py +38 -0
  567. snowflake/snowpark_connect/includes/python/pyspark/resource/information.py +69 -0
  568. snowflake/snowpark_connect/includes/python/pyspark/resource/profile.py +317 -0
  569. snowflake/snowpark_connect/includes/python/pyspark/resource/requests.py +539 -0
  570. snowflake/snowpark_connect/includes/python/pyspark/resource/tests/__init__.py +16 -0
  571. snowflake/snowpark_connect/includes/python/pyspark/resource/tests/test_resources.py +83 -0
  572. snowflake/snowpark_connect/includes/python/pyspark/resultiterable.py +45 -0
  573. snowflake/snowpark_connect/includes/python/pyspark/serializers.py +681 -0
  574. snowflake/snowpark_connect/includes/python/pyspark/shell.py +123 -0
  575. snowflake/snowpark_connect/includes/python/pyspark/shuffle.py +854 -0
  576. snowflake/snowpark_connect/includes/python/pyspark/sql/__init__.py +75 -0
  577. snowflake/snowpark_connect/includes/python/pyspark/sql/_typing.pyi +80 -0
  578. snowflake/snowpark_connect/includes/python/pyspark/sql/avro/__init__.py +18 -0
  579. snowflake/snowpark_connect/includes/python/pyspark/sql/avro/functions.py +188 -0
  580. snowflake/snowpark_connect/includes/python/pyspark/sql/catalog.py +1270 -0
  581. snowflake/snowpark_connect/includes/python/pyspark/sql/column.py +1431 -0
  582. snowflake/snowpark_connect/includes/python/pyspark/sql/conf.py +99 -0
  583. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/__init__.py +18 -0
  584. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/_typing.py +90 -0
  585. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/avro/__init__.py +18 -0
  586. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/avro/functions.py +107 -0
  587. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/catalog.py +356 -0
  588. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/__init__.py +22 -0
  589. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/artifact.py +412 -0
  590. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/core.py +1689 -0
  591. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/reattach.py +340 -0
  592. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/column.py +514 -0
  593. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/conf.py +128 -0
  594. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/conversion.py +490 -0
  595. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/dataframe.py +2172 -0
  596. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/expressions.py +1056 -0
  597. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/functions.py +3937 -0
  598. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/group.py +418 -0
  599. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/plan.py +2289 -0
  600. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/__init__.py +25 -0
  601. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/base_pb2.py +203 -0
  602. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/base_pb2.pyi +2718 -0
  603. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/base_pb2_grpc.py +423 -0
  604. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/catalog_pb2.py +109 -0
  605. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/catalog_pb2.pyi +1130 -0
  606. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/commands_pb2.py +141 -0
  607. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/commands_pb2.pyi +1766 -0
  608. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/common_pb2.py +47 -0
  609. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/common_pb2.pyi +123 -0
  610. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/example_plugins_pb2.py +53 -0
  611. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/example_plugins_pb2.pyi +112 -0
  612. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/expressions_pb2.py +107 -0
  613. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/expressions_pb2.pyi +1507 -0
  614. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/relations_pb2.py +195 -0
  615. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/relations_pb2.pyi +3613 -0
  616. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/types_pb2.py +95 -0
  617. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/types_pb2.pyi +980 -0
  618. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/protobuf/__init__.py +18 -0
  619. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/protobuf/functions.py +166 -0
  620. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/readwriter.py +861 -0
  621. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/session.py +952 -0
  622. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/__init__.py +22 -0
  623. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/query.py +295 -0
  624. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/readwriter.py +618 -0
  625. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/__init__.py +18 -0
  626. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/foreach_batch_worker.py +87 -0
  627. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/listener_worker.py +100 -0
  628. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/types.py +301 -0
  629. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/udf.py +296 -0
  630. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/udtf.py +200 -0
  631. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/utils.py +58 -0
  632. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/window.py +266 -0
  633. snowflake/snowpark_connect/includes/python/pyspark/sql/context.py +818 -0
  634. snowflake/snowpark_connect/includes/python/pyspark/sql/dataframe.py +5973 -0
  635. snowflake/snowpark_connect/includes/python/pyspark/sql/functions.py +15889 -0
  636. snowflake/snowpark_connect/includes/python/pyspark/sql/group.py +547 -0
  637. snowflake/snowpark_connect/includes/python/pyspark/sql/observation.py +152 -0
  638. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/__init__.py +21 -0
  639. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/__init__.pyi +344 -0
  640. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/protocols/__init__.pyi +17 -0
  641. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/protocols/frame.pyi +20 -0
  642. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/protocols/series.pyi +20 -0
  643. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/conversion.py +671 -0
  644. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/functions.py +480 -0
  645. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/functions.pyi +132 -0
  646. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/group_ops.py +523 -0
  647. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/map_ops.py +216 -0
  648. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/serializers.py +1019 -0
  649. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/typehints.py +172 -0
  650. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/types.py +972 -0
  651. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/utils.py +86 -0
  652. snowflake/snowpark_connect/includes/python/pyspark/sql/protobuf/__init__.py +18 -0
  653. snowflake/snowpark_connect/includes/python/pyspark/sql/protobuf/functions.py +334 -0
  654. snowflake/snowpark_connect/includes/python/pyspark/sql/readwriter.py +2159 -0
  655. snowflake/snowpark_connect/includes/python/pyspark/sql/session.py +2088 -0
  656. snowflake/snowpark_connect/includes/python/pyspark/sql/sql_formatter.py +84 -0
  657. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/__init__.py +21 -0
  658. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/listener.py +1050 -0
  659. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/query.py +746 -0
  660. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/readwriter.py +1652 -0
  661. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/state.py +288 -0
  662. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/__init__.py +16 -0
  663. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/__init__.py +16 -0
  664. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/__init__.py +16 -0
  665. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/test_artifact.py +420 -0
  666. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/test_client.py +358 -0
  667. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/__init__.py +16 -0
  668. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_foreach.py +36 -0
  669. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_foreach_batch.py +44 -0
  670. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_listener.py +116 -0
  671. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_streaming.py +35 -0
  672. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_basic.py +3612 -0
  673. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_column.py +1042 -0
  674. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_function.py +2381 -0
  675. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_plan.py +1060 -0
  676. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow.py +163 -0
  677. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow_map.py +38 -0
  678. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow_python_udf.py +48 -0
  679. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_catalog.py +36 -0
  680. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_column.py +55 -0
  681. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_conf.py +36 -0
  682. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_dataframe.py +96 -0
  683. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_datasources.py +44 -0
  684. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_errors.py +36 -0
  685. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_functions.py +59 -0
  686. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_group.py +36 -0
  687. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_cogrouped_map.py +59 -0
  688. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_grouped_map.py +74 -0
  689. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_grouped_map_with_state.py +62 -0
  690. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_map.py +58 -0
  691. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf.py +70 -0
  692. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_grouped_agg.py +50 -0
  693. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_scalar.py +68 -0
  694. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_window.py +40 -0
  695. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_readwriter.py +46 -0
  696. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_serde.py +44 -0
  697. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_types.py +100 -0
  698. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_udf.py +100 -0
  699. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_udtf.py +163 -0
  700. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_session.py +181 -0
  701. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_utils.py +42 -0
  702. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/__init__.py +16 -0
  703. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_cogrouped_map.py +623 -0
  704. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_grouped_map.py +869 -0
  705. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_grouped_map_with_state.py +342 -0
  706. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_map.py +436 -0
  707. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf.py +363 -0
  708. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_grouped_agg.py +592 -0
  709. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_scalar.py +1503 -0
  710. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_typehints.py +392 -0
  711. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_typehints_with_future_annotations.py +375 -0
  712. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_window.py +411 -0
  713. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/__init__.py +16 -0
  714. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming.py +401 -0
  715. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_foreach.py +295 -0
  716. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_foreach_batch.py +106 -0
  717. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_listener.py +558 -0
  718. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow.py +1346 -0
  719. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow_map.py +182 -0
  720. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow_python_udf.py +202 -0
  721. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_catalog.py +503 -0
  722. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_column.py +225 -0
  723. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_conf.py +83 -0
  724. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_context.py +201 -0
  725. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_dataframe.py +1931 -0
  726. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_datasources.py +256 -0
  727. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_errors.py +69 -0
  728. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_functions.py +1349 -0
  729. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_group.py +53 -0
  730. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_pandas_sqlmetrics.py +68 -0
  731. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_readwriter.py +283 -0
  732. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_serde.py +155 -0
  733. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_session.py +412 -0
  734. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_types.py +1581 -0
  735. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udf.py +961 -0
  736. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udf_profiler.py +165 -0
  737. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udtf.py +1456 -0
  738. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_utils.py +1686 -0
  739. snowflake/snowpark_connect/includes/python/pyspark/sql/types.py +2558 -0
  740. snowflake/snowpark_connect/includes/python/pyspark/sql/udf.py +714 -0
  741. snowflake/snowpark_connect/includes/python/pyspark/sql/udtf.py +325 -0
  742. snowflake/snowpark_connect/includes/python/pyspark/sql/utils.py +339 -0
  743. snowflake/snowpark_connect/includes/python/pyspark/sql/window.py +492 -0
  744. snowflake/snowpark_connect/includes/python/pyspark/statcounter.py +165 -0
  745. snowflake/snowpark_connect/includes/python/pyspark/status.py +112 -0
  746. snowflake/snowpark_connect/includes/python/pyspark/storagelevel.py +97 -0
  747. snowflake/snowpark_connect/includes/python/pyspark/streaming/__init__.py +22 -0
  748. snowflake/snowpark_connect/includes/python/pyspark/streaming/context.py +471 -0
  749. snowflake/snowpark_connect/includes/python/pyspark/streaming/dstream.py +933 -0
  750. snowflake/snowpark_connect/includes/python/pyspark/streaming/kinesis.py +205 -0
  751. snowflake/snowpark_connect/includes/python/pyspark/streaming/listener.py +83 -0
  752. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/__init__.py +16 -0
  753. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_context.py +184 -0
  754. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_dstream.py +706 -0
  755. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_kinesis.py +118 -0
  756. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_listener.py +160 -0
  757. snowflake/snowpark_connect/includes/python/pyspark/streaming/util.py +168 -0
  758. snowflake/snowpark_connect/includes/python/pyspark/taskcontext.py +502 -0
  759. snowflake/snowpark_connect/includes/python/pyspark/testing/__init__.py +21 -0
  760. snowflake/snowpark_connect/includes/python/pyspark/testing/connectutils.py +199 -0
  761. snowflake/snowpark_connect/includes/python/pyspark/testing/mllibutils.py +30 -0
  762. snowflake/snowpark_connect/includes/python/pyspark/testing/mlutils.py +275 -0
  763. snowflake/snowpark_connect/includes/python/pyspark/testing/objects.py +121 -0
  764. snowflake/snowpark_connect/includes/python/pyspark/testing/pandasutils.py +714 -0
  765. snowflake/snowpark_connect/includes/python/pyspark/testing/sqlutils.py +168 -0
  766. snowflake/snowpark_connect/includes/python/pyspark/testing/streamingutils.py +178 -0
  767. snowflake/snowpark_connect/includes/python/pyspark/testing/utils.py +636 -0
  768. snowflake/snowpark_connect/includes/python/pyspark/tests/__init__.py +16 -0
  769. snowflake/snowpark_connect/includes/python/pyspark/tests/test_appsubmit.py +306 -0
  770. snowflake/snowpark_connect/includes/python/pyspark/tests/test_broadcast.py +196 -0
  771. snowflake/snowpark_connect/includes/python/pyspark/tests/test_conf.py +44 -0
  772. snowflake/snowpark_connect/includes/python/pyspark/tests/test_context.py +346 -0
  773. snowflake/snowpark_connect/includes/python/pyspark/tests/test_daemon.py +89 -0
  774. snowflake/snowpark_connect/includes/python/pyspark/tests/test_install_spark.py +124 -0
  775. snowflake/snowpark_connect/includes/python/pyspark/tests/test_join.py +69 -0
  776. snowflake/snowpark_connect/includes/python/pyspark/tests/test_memory_profiler.py +167 -0
  777. snowflake/snowpark_connect/includes/python/pyspark/tests/test_pin_thread.py +194 -0
  778. snowflake/snowpark_connect/includes/python/pyspark/tests/test_profiler.py +168 -0
  779. snowflake/snowpark_connect/includes/python/pyspark/tests/test_rdd.py +939 -0
  780. snowflake/snowpark_connect/includes/python/pyspark/tests/test_rddbarrier.py +52 -0
  781. snowflake/snowpark_connect/includes/python/pyspark/tests/test_rddsampler.py +66 -0
  782. snowflake/snowpark_connect/includes/python/pyspark/tests/test_readwrite.py +368 -0
  783. snowflake/snowpark_connect/includes/python/pyspark/tests/test_serializers.py +257 -0
  784. snowflake/snowpark_connect/includes/python/pyspark/tests/test_shuffle.py +267 -0
  785. snowflake/snowpark_connect/includes/python/pyspark/tests/test_stage_sched.py +153 -0
  786. snowflake/snowpark_connect/includes/python/pyspark/tests/test_statcounter.py +130 -0
  787. snowflake/snowpark_connect/includes/python/pyspark/tests/test_taskcontext.py +350 -0
  788. snowflake/snowpark_connect/includes/python/pyspark/tests/test_util.py +97 -0
  789. snowflake/snowpark_connect/includes/python/pyspark/tests/test_worker.py +271 -0
  790. snowflake/snowpark_connect/includes/python/pyspark/traceback_utils.py +81 -0
  791. snowflake/snowpark_connect/includes/python/pyspark/util.py +416 -0
  792. snowflake/snowpark_connect/includes/python/pyspark/version.py +19 -0
  793. snowflake/snowpark_connect/includes/python/pyspark/worker.py +1307 -0
  794. snowflake/snowpark_connect/includes/python/pyspark/worker_util.py +46 -0
  795. snowflake/snowpark_connect/proto/__init__.py +10 -0
  796. snowflake/snowpark_connect/proto/control_pb2.py +35 -0
  797. snowflake/snowpark_connect/proto/control_pb2.pyi +38 -0
  798. snowflake/snowpark_connect/proto/control_pb2_grpc.py +183 -0
  799. snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.py +35 -0
  800. snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.pyi +53 -0
  801. snowflake/snowpark_connect/proto/snowflake_rdd_pb2.pyi +39 -0
  802. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.py +47 -0
  803. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.pyi +111 -0
  804. snowflake/snowpark_connect/relation/__init__.py +3 -0
  805. snowflake/snowpark_connect/relation/catalogs/__init__.py +12 -0
  806. snowflake/snowpark_connect/relation/catalogs/abstract_spark_catalog.py +287 -0
  807. snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py +467 -0
  808. snowflake/snowpark_connect/relation/catalogs/utils.py +51 -0
  809. snowflake/snowpark_connect/relation/io_utils.py +76 -0
  810. snowflake/snowpark_connect/relation/map_aggregate.py +322 -0
  811. snowflake/snowpark_connect/relation/map_catalog.py +151 -0
  812. snowflake/snowpark_connect/relation/map_column_ops.py +1068 -0
  813. snowflake/snowpark_connect/relation/map_crosstab.py +48 -0
  814. snowflake/snowpark_connect/relation/map_extension.py +412 -0
  815. snowflake/snowpark_connect/relation/map_join.py +341 -0
  816. snowflake/snowpark_connect/relation/map_local_relation.py +326 -0
  817. snowflake/snowpark_connect/relation/map_map_partitions.py +146 -0
  818. snowflake/snowpark_connect/relation/map_relation.py +253 -0
  819. snowflake/snowpark_connect/relation/map_row_ops.py +716 -0
  820. snowflake/snowpark_connect/relation/map_sample_by.py +35 -0
  821. snowflake/snowpark_connect/relation/map_show_string.py +50 -0
  822. snowflake/snowpark_connect/relation/map_sql.py +1874 -0
  823. snowflake/snowpark_connect/relation/map_stats.py +324 -0
  824. snowflake/snowpark_connect/relation/map_subquery_alias.py +32 -0
  825. snowflake/snowpark_connect/relation/map_udtf.py +288 -0
  826. snowflake/snowpark_connect/relation/read/__init__.py +7 -0
  827. snowflake/snowpark_connect/relation/read/jdbc_read_dbapi.py +668 -0
  828. snowflake/snowpark_connect/relation/read/map_read.py +367 -0
  829. snowflake/snowpark_connect/relation/read/map_read_csv.py +142 -0
  830. snowflake/snowpark_connect/relation/read/map_read_jdbc.py +108 -0
  831. snowflake/snowpark_connect/relation/read/map_read_json.py +344 -0
  832. snowflake/snowpark_connect/relation/read/map_read_parquet.py +194 -0
  833. snowflake/snowpark_connect/relation/read/map_read_socket.py +59 -0
  834. snowflake/snowpark_connect/relation/read/map_read_table.py +109 -0
  835. snowflake/snowpark_connect/relation/read/map_read_text.py +106 -0
  836. snowflake/snowpark_connect/relation/read/reader_config.py +399 -0
  837. snowflake/snowpark_connect/relation/read/utils.py +155 -0
  838. snowflake/snowpark_connect/relation/stage_locator.py +161 -0
  839. snowflake/snowpark_connect/relation/utils.py +219 -0
  840. snowflake/snowpark_connect/relation/write/__init__.py +3 -0
  841. snowflake/snowpark_connect/relation/write/jdbc_write_dbapi.py +339 -0
  842. snowflake/snowpark_connect/relation/write/map_write.py +436 -0
  843. snowflake/snowpark_connect/relation/write/map_write_jdbc.py +48 -0
  844. snowflake/snowpark_connect/resources/java_udfs-1.0-SNAPSHOT.jar +0 -0
  845. snowflake/snowpark_connect/resources_initializer.py +75 -0
  846. snowflake/snowpark_connect/server.py +1136 -0
  847. snowflake/snowpark_connect/start_server.py +32 -0
  848. snowflake/snowpark_connect/tcm.py +8 -0
  849. snowflake/snowpark_connect/type_mapping.py +1003 -0
  850. snowflake/snowpark_connect/typed_column.py +94 -0
  851. snowflake/snowpark_connect/utils/__init__.py +3 -0
  852. snowflake/snowpark_connect/utils/artifacts.py +48 -0
  853. snowflake/snowpark_connect/utils/attribute_handling.py +72 -0
  854. snowflake/snowpark_connect/utils/cache.py +84 -0
  855. snowflake/snowpark_connect/utils/concurrent.py +124 -0
  856. snowflake/snowpark_connect/utils/context.py +390 -0
  857. snowflake/snowpark_connect/utils/describe_query_cache.py +231 -0
  858. snowflake/snowpark_connect/utils/interrupt.py +85 -0
  859. snowflake/snowpark_connect/utils/io_utils.py +35 -0
  860. snowflake/snowpark_connect/utils/pandas_udtf_utils.py +117 -0
  861. snowflake/snowpark_connect/utils/profiling.py +47 -0
  862. snowflake/snowpark_connect/utils/session.py +180 -0
  863. snowflake/snowpark_connect/utils/snowpark_connect_logging.py +38 -0
  864. snowflake/snowpark_connect/utils/telemetry.py +513 -0
  865. snowflake/snowpark_connect/utils/udf_cache.py +392 -0
  866. snowflake/snowpark_connect/utils/udf_helper.py +328 -0
  867. snowflake/snowpark_connect/utils/udf_utils.py +310 -0
  868. snowflake/snowpark_connect/utils/udtf_helper.py +420 -0
  869. snowflake/snowpark_connect/utils/udtf_utils.py +799 -0
  870. snowflake/snowpark_connect/utils/xxhash64.py +247 -0
  871. snowflake/snowpark_connect/version.py +6 -0
  872. snowpark_connect-0.20.2.data/scripts/snowpark-connect +71 -0
  873. snowpark_connect-0.20.2.data/scripts/snowpark-session +11 -0
  874. snowpark_connect-0.20.2.data/scripts/snowpark-submit +354 -0
  875. snowpark_connect-0.20.2.dist-info/METADATA +37 -0
  876. snowpark_connect-0.20.2.dist-info/RECORD +879 -0
  877. snowpark_connect-0.20.2.dist-info/WHEEL +5 -0
  878. snowpark_connect-0.20.2.dist-info/licenses/LICENSE.txt +202 -0
  879. snowpark_connect-0.20.2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1652 @@
+ #
+ # Licensed to the Apache Software Foundation (ASF) under one or more
+ # contributor license agreements. See the NOTICE file distributed with
+ # this work for additional information regarding copyright ownership.
+ # The ASF licenses this file to You under the Apache License, Version 2.0
+ # (the "License"); you may not use this file except in compliance with
+ # the License. You may obtain a copy of the License at
+ #
+ #    http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ import sys
+ from collections.abc import Iterator
+ from typing import cast, overload, Any, Callable, List, Optional, TYPE_CHECKING, Union
+
+ from py4j.java_gateway import java_import, JavaObject
+
+ from pyspark.sql.column import _to_seq
+ from pyspark.sql.readwriter import OptionUtils, to_str
+ from pyspark.sql.streaming.query import StreamingQuery
+ from pyspark.sql.types import Row, StructType
+ from pyspark.sql.utils import ForeachBatchFunction
+ from pyspark.errors import (
+     PySparkTypeError,
+     PySparkValueError,
+     PySparkAttributeError,
+     PySparkRuntimeError,
+ )
+
+ if TYPE_CHECKING:
+     from pyspark.sql.session import SparkSession
+     from pyspark.sql._typing import SupportsProcess, OptionalPrimitiveType
+     from pyspark.sql.dataframe import DataFrame
+
+ __all__ = ["DataStreamReader", "DataStreamWriter"]
+
+
+ class DataStreamReader(OptionUtils):
+     """
+     Interface used to load a streaming :class:`DataFrame <pyspark.sql.DataFrame>` from external
+     storage systems (e.g. file systems, key-value stores, etc).
+     Use :attr:`SparkSession.readStream <pyspark.sql.SparkSession.readStream>` to access this.
+
+     .. versionadded:: 2.0.0
+
+     .. versionchanged:: 3.5.0
+         Supports Spark Connect.
+
+     Notes
+     -----
+     This API is evolving.
+
+     Examples
+     --------
+     >>> spark.readStream
+     <...streaming.readwriter.DataStreamReader object ...>
+
+     The example below uses the Rate source, which generates rows continuously.
+     After that, we apply a modulo by 3 and then write the stream out to the console.
+     The streaming query is stopped after 3 seconds.
+
+     >>> import time
+     >>> df = spark.readStream.format("rate").load()
+     >>> df = df.selectExpr("value % 3 as v")
+     >>> q = df.writeStream.format("console").start()
+     >>> time.sleep(3)
+     >>> q.stop()
+     """
+
+     def __init__(self, spark: "SparkSession") -> None:
+         self._jreader = spark._jsparkSession.readStream()
+         self._spark = spark
+
+     def _df(self, jdf: JavaObject) -> "DataFrame":
+         from pyspark.sql.dataframe import DataFrame
+
+         return DataFrame(jdf, self._spark)
+
+     def format(self, source: str) -> "DataStreamReader":
+         """Specifies the input data source format.
+
+         .. versionadded:: 2.0.0
+
+         .. versionchanged:: 3.5.0
+             Supports Spark Connect.
+
+         Parameters
+         ----------
+         source : str
+             name of the data source, e.g. 'json', 'parquet'.
+
+         Notes
+         -----
+         This API is evolving.
+
+         Examples
+         --------
+         >>> spark.readStream.format("text")
+         <...streaming.readwriter.DataStreamReader object ...>
+
+         This API allows configuring other sources to read. The example below writes a small text
+         file, and reads it back via the Text source.
+
+         >>> import tempfile
+         >>> import time
+         >>> with tempfile.TemporaryDirectory() as d:
+         ...     # Write a temporary text file to read it.
+         ...     spark.createDataFrame(
+         ...         [("hello",), ("this",)]).write.mode("overwrite").format("text").save(d)
+         ...
+         ...     # Start a streaming query to read the text file.
+         ...     q = spark.readStream.format("text").load(d).writeStream.format("console").start()
+         ...     time.sleep(3)
+         ...     q.stop()
+         """
+         self._jreader = self._jreader.format(source)
+         return self
+
+     def schema(self, schema: Union[StructType, str]) -> "DataStreamReader":
+         """Specifies the input schema.
+
+         Some data sources (e.g. JSON) can infer the input schema automatically from data.
+         By specifying the schema here, the underlying data source can skip the schema
+         inference step, and thus speed up data loading.
+
+         .. versionadded:: 2.0.0
+
+         .. versionchanged:: 3.5.0
+             Supports Spark Connect.
+
+         Parameters
+         ----------
+         schema : :class:`pyspark.sql.types.StructType` or str
+             a :class:`pyspark.sql.types.StructType` object or a DDL-formatted string
+             (For example ``col0 INT, col1 DOUBLE``).
+
+         Notes
+         -----
+         This API is evolving.
+
+         Examples
+         --------
+         >>> from pyspark.sql.types import StructField, StructType, StringType
+         >>> spark.readStream.schema(StructType([StructField("data", StringType(), True)]))
+         <...streaming.readwriter.DataStreamReader object ...>
+         >>> spark.readStream.schema("col0 INT, col1 DOUBLE")
+         <...streaming.readwriter.DataStreamReader object ...>
+
+         The example below specifies an explicit schema for the CSV files.
+
+         >>> import tempfile
+         >>> import time
+         >>> with tempfile.TemporaryDirectory() as d:
+         ...     # Start a streaming query to read the CSV file.
+         ...     spark.readStream.schema("col0 INT, col1 STRING").format("csv").load(d).printSchema()
+         root
+          |-- col0: integer (nullable = true)
+          |-- col1: string (nullable = true)
+         """
+         from pyspark.sql import SparkSession
+
+         spark = SparkSession._getActiveSessionOrCreate()
+         if isinstance(schema, StructType):
+             jschema = spark._jsparkSession.parseDataType(schema.json())
+             self._jreader = self._jreader.schema(jschema)
+         elif isinstance(schema, str):
+             self._jreader = self._jreader.schema(schema)
+         else:
+             raise PySparkTypeError(
+                 error_class="NOT_STR_OR_STRUCT",
+                 message_parameters={"arg_name": "schema", "arg_type": type(schema).__name__},
+             )
+         return self
+
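The ``StructType`` and DDL-string forms accepted above are interchangeable; a minimal sketch of the equivalence, assuming an active ``spark`` session as in the docstring examples:

>>> from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType
>>> by_ddl = spark.readStream.schema("col0 INT, col1 DOUBLE")
>>> by_struct = spark.readStream.schema(
...     StructType([StructField("col0", IntegerType()), StructField("col1", DoubleType())]))

Either reader skips schema inference when the stream is loaded.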
+     def option(self, key: str, value: "OptionalPrimitiveType") -> "DataStreamReader":
+         """Adds an input option for the underlying data source.
+
+         .. versionadded:: 2.0.0
+
+         .. versionchanged:: 3.5.0
+             Supports Spark Connect.
+
+         Notes
+         -----
+         This API is evolving.
+
+         Examples
+         --------
+         >>> spark.readStream.option("x", 1)
+         <...streaming.readwriter.DataStreamReader object ...>
+
+         The example below specifies 'rowsPerSecond' option to Rate source in order to generate
+         10 rows every second.
+
+         >>> import time
+         >>> q = spark.readStream.format(
+         ...     "rate").option("rowsPerSecond", 10).load().writeStream.format("console").start()
+         >>> time.sleep(3)
+         >>> q.stop()
+         """
+         self._jreader = self._jreader.option(key, to_str(value))
+         return self
+
+     def options(self, **options: "OptionalPrimitiveType") -> "DataStreamReader":
+         """Adds input options for the underlying data source.
+
+         .. versionadded:: 2.0.0
+
+         .. versionchanged:: 3.5.0
+             Supports Spark Connect.
+
+         Notes
+         -----
+         This API is evolving.
+
+         Examples
+         --------
+         >>> spark.readStream.options(x="1", y=2)
+         <...streaming.readwriter.DataStreamReader object ...>
+
+         The example below specifies 'rowsPerSecond' and 'numPartitions' options to
+         Rate source in order to generate 10 rows with 10 partitions every second.
+
+         >>> import time
+         >>> q = spark.readStream.format("rate").options(
+         ...     rowsPerSecond=10, numPartitions=10
+         ... ).load().writeStream.format("console").start()
+         >>> time.sleep(3)
+         >>> q.stop()
+         """
+         for k in options:
+             self._jreader = self._jreader.option(k, to_str(options[k]))
+         return self
+
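Because ``options()`` above is implemented as a loop over ``option()``, the two call styles below are equivalent; a minimal sketch assuming an active ``spark`` session:

>>> chained = spark.readStream.format("rate").option("rowsPerSecond", 10).option("numPartitions", 2)
>>> bundled = spark.readStream.format("rate").options(rowsPerSecond=10, numPartitions=2)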
+     def load(
+         self,
+         path: Optional[str] = None,
+         format: Optional[str] = None,
+         schema: Optional[Union[StructType, str]] = None,
+         **options: "OptionalPrimitiveType",
+     ) -> "DataFrame":
+         """Loads a data stream from a data source and returns it as a
+         :class:`DataFrame <pyspark.sql.DataFrame>`.
+
+         .. versionadded:: 2.0.0
+
+         .. versionchanged:: 3.5.0
+             Supports Spark Connect.
+
+         Parameters
+         ----------
+         path : str, optional
+             optional string for file-system backed data sources.
+         format : str, optional
+             optional string for format of the data source. Defaults to 'parquet'.
+         schema : :class:`pyspark.sql.types.StructType` or str, optional
+             optional :class:`pyspark.sql.types.StructType` for the input schema
+             or a DDL-formatted string (For example ``col0 INT, col1 DOUBLE``).
+         **options : dict
+             all other string options
+
+         Notes
+         -----
+         This API is evolving.
+
+         Examples
+         --------
+         Load a data stream from a temporary JSON file.
+
+         >>> import tempfile
+         >>> import time
+         >>> with tempfile.TemporaryDirectory() as d:
+         ...     # Write a temporary JSON file to read it.
+         ...     spark.createDataFrame(
+         ...         [(100, "Hyukjin Kwon"),], ["age", "name"]
+         ...     ).write.mode("overwrite").format("json").save(d)
+         ...
+         ...     # Start a streaming query to read the JSON file.
+         ...     q = spark.readStream.schema(
+         ...         "age INT, name STRING"
+         ...     ).format("json").load(d).writeStream.format("console").start()
+         ...     time.sleep(3)
+         ...     q.stop()
+         """
+         if format is not None:
+             self.format(format)
+         if schema is not None:
+             self.schema(schema)
+         self.options(**options)
+         if path is not None:
+             if type(path) != str or len(path.strip()) == 0:
+                 raise PySparkValueError(
+                     error_class="VALUE_NOT_NON_EMPTY_STR",
+                     message_parameters={"arg_name": "path", "arg_value": str(path)},
+                 )
+             return self._df(self._jreader.load(path))
+         else:
+             return self._df(self._jreader.load())
+
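As the body of ``load()`` shows, its keyword arguments are shorthand for the builder calls; a minimal sketch of the two equivalent forms, with ``/tmp/stream-in`` as a hypothetical input directory:

>>> df1 = spark.readStream.load("/tmp/stream-in", format="json", schema="age INT, name STRING")
>>> df2 = (spark.readStream.format("json")
...     .schema("age INT, name STRING").load("/tmp/stream-in"))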
+     def json(
+         self,
+         path: str,
+         schema: Optional[Union[StructType, str]] = None,
+         primitivesAsString: Optional[Union[bool, str]] = None,
+         prefersDecimal: Optional[Union[bool, str]] = None,
+         allowComments: Optional[Union[bool, str]] = None,
+         allowUnquotedFieldNames: Optional[Union[bool, str]] = None,
+         allowSingleQuotes: Optional[Union[bool, str]] = None,
+         allowNumericLeadingZero: Optional[Union[bool, str]] = None,
+         allowBackslashEscapingAnyCharacter: Optional[Union[bool, str]] = None,
+         mode: Optional[str] = None,
+         columnNameOfCorruptRecord: Optional[str] = None,
+         dateFormat: Optional[str] = None,
+         timestampFormat: Optional[str] = None,
+         multiLine: Optional[Union[bool, str]] = None,
+         allowUnquotedControlChars: Optional[Union[bool, str]] = None,
+         lineSep: Optional[str] = None,
+         locale: Optional[str] = None,
+         dropFieldIfAllNull: Optional[Union[bool, str]] = None,
+         encoding: Optional[str] = None,
+         pathGlobFilter: Optional[Union[bool, str]] = None,
+         recursiveFileLookup: Optional[Union[bool, str]] = None,
+         allowNonNumericNumbers: Optional[Union[bool, str]] = None,
+     ) -> "DataFrame":
+         """
+         Loads a JSON file stream and returns the results as a :class:`DataFrame`.
+
+         `JSON Lines <http://jsonlines.org/>`_ (newline-delimited JSON) is supported by default.
+         For JSON (one record per file), set the ``multiLine`` parameter to ``true``.
+
+         If the ``schema`` parameter is not specified, this function goes
+         through the input once to determine the input schema.
+
+         .. versionadded:: 2.0.0
+
+         .. versionchanged:: 3.5.0
+             Supports Spark Connect.
+
+         Parameters
+         ----------
+         path : str
+             string represents path to the JSON dataset,
+             or RDD of Strings storing JSON objects.
+         schema : :class:`pyspark.sql.types.StructType` or str, optional
+             an optional :class:`pyspark.sql.types.StructType` for the input schema
+             or a DDL-formatted string (For example ``col0 INT, col1 DOUBLE``).
+
+         Other Parameters
+         ----------------
+         Extra options
+             For the extra options, refer to
+             `Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-json.html#data-source-option>`_
+             in the version you use.
+
+             .. # noqa
+
+         Notes
+         -----
+         This API is evolving.
+
+         Examples
+         --------
+         Load a data stream from a temporary JSON file.
+
+         >>> import tempfile
+         >>> import time
+         >>> with tempfile.TemporaryDirectory() as d:
+         ...     # Write a temporary JSON file to read it.
+         ...     spark.createDataFrame(
+         ...         [(100, "Hyukjin Kwon"),], ["age", "name"]
+         ...     ).write.mode("overwrite").format("json").save(d)
+         ...
+         ...     # Start a streaming query to read the JSON file.
+         ...     q = spark.readStream.schema(
+         ...         "age INT, name STRING"
+         ...     ).json(d).writeStream.format("console").start()
+         ...     time.sleep(3)
+         ...     q.stop()
+         """
+         self._set_opts(
+             schema=schema,
+             primitivesAsString=primitivesAsString,
+             prefersDecimal=prefersDecimal,
+             allowComments=allowComments,
+             allowUnquotedFieldNames=allowUnquotedFieldNames,
+             allowSingleQuotes=allowSingleQuotes,
+             allowNumericLeadingZero=allowNumericLeadingZero,
+             allowBackslashEscapingAnyCharacter=allowBackslashEscapingAnyCharacter,
+             mode=mode,
+             columnNameOfCorruptRecord=columnNameOfCorruptRecord,
+             dateFormat=dateFormat,
+             timestampFormat=timestampFormat,
+             multiLine=multiLine,
+             allowUnquotedControlChars=allowUnquotedControlChars,
+             lineSep=lineSep,
+             locale=locale,
+             dropFieldIfAllNull=dropFieldIfAllNull,
+             encoding=encoding,
+             pathGlobFilter=pathGlobFilter,
+             recursiveFileLookup=recursiveFileLookup,
+             allowNonNumericNumbers=allowNonNumericNumbers,
+         )
+         if isinstance(path, str):
+             return self._df(self._jreader.json(path))
+         else:
+             raise PySparkTypeError(
+                 error_class="NOT_STR",
+                 message_parameters={"arg_name": "path", "arg_type": type(path).__name__},
+             )
+
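Each keyword of ``json()`` is simply forwarded through ``_set_opts``, so the ``multiLine`` behavior mentioned in the docstring needs no extra plumbing; a minimal sketch, with ``d`` as a hypothetical directory of multi-line JSON files:

>>> df = spark.readStream.schema("age INT, name STRING").json(d, multiLine=True)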
+     def orc(
+         self,
+         path: str,
+         mergeSchema: Optional[bool] = None,
+         pathGlobFilter: Optional[Union[bool, str]] = None,
+         recursiveFileLookup: Optional[Union[bool, str]] = None,
+     ) -> "DataFrame":
+         """Loads an ORC file stream, returning the result as a :class:`DataFrame`.
+
+         .. versionadded:: 2.3.0
+
+         .. versionchanged:: 3.5.0
+             Supports Spark Connect.
+
+         Other Parameters
+         ----------------
+         Extra options
+             For the extra options, refer to
+             `Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-orc.html#data-source-option>`_
+             in the version you use.
+
+             .. # noqa
+
+         Examples
+         --------
+         Load a data stream from a temporary ORC file.
+
+         >>> import tempfile
+         >>> import time
+         >>> with tempfile.TemporaryDirectory() as d:
+         ...     # Write a temporary ORC file to read it.
+         ...     spark.range(10).write.mode("overwrite").format("orc").save(d)
+         ...
+         ...     # Start a streaming query to read the ORC file.
+         ...     q = spark.readStream.schema("id LONG").orc(d).writeStream.format("console").start()
+         ...     time.sleep(3)
+         ...     q.stop()
+         """
+         self._set_opts(
+             mergeSchema=mergeSchema,
+             pathGlobFilter=pathGlobFilter,
+             recursiveFileLookup=recursiveFileLookup,
+         )
+         if isinstance(path, str):
+             return self._df(self._jreader.orc(path))
+         else:
+             raise PySparkTypeError(
+                 error_class="NOT_STR",
+                 message_parameters={"arg_name": "path", "arg_type": type(path).__name__},
+             )
+
+     def parquet(
+         self,
+         path: str,
+         mergeSchema: Optional[bool] = None,
+         pathGlobFilter: Optional[Union[bool, str]] = None,
+         recursiveFileLookup: Optional[Union[bool, str]] = None,
+         datetimeRebaseMode: Optional[Union[bool, str]] = None,
+         int96RebaseMode: Optional[Union[bool, str]] = None,
+     ) -> "DataFrame":
+         """
+         Loads a Parquet file stream, returning the result as a :class:`DataFrame`.
+
+         .. versionadded:: 2.0.0
+
+         .. versionchanged:: 3.5.0
+             Supports Spark Connect.
+
+         Parameters
+         ----------
+         path : str
+             the path in any Hadoop supported file system
+
+         Other Parameters
+         ----------------
+         Extra options
+             For the extra options, refer to
+             `Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-parquet.html#data-source-option>`_
+             in the version you use.
+
+             .. # noqa
+
+         Examples
+         --------
+         Load a data stream from a temporary Parquet file.
+
+         >>> import tempfile
+         >>> import time
+         >>> with tempfile.TemporaryDirectory() as d:
+         ...     # Write a temporary Parquet file to read it.
+         ...     spark.range(10).write.mode("overwrite").format("parquet").save(d)
+         ...
+         ...     # Start a streaming query to read the Parquet file.
+         ...     q = spark.readStream.schema(
+         ...         "id LONG").parquet(d).writeStream.format("console").start()
+         ...     time.sleep(3)
+         ...     q.stop()
+         """
+         self._set_opts(
+             mergeSchema=mergeSchema,
+             pathGlobFilter=pathGlobFilter,
+             recursiveFileLookup=recursiveFileLookup,
+             datetimeRebaseMode=datetimeRebaseMode,
+             int96RebaseMode=int96RebaseMode,
+         )
+         if isinstance(path, str):
+             return self._df(self._jreader.parquet(path))
+         else:
+             raise PySparkTypeError(
+                 error_class="NOT_STR",
+                 message_parameters={"arg_name": "path", "arg_type": type(path).__name__},
+             )
+
+     def text(
+         self,
+         path: str,
+         wholetext: bool = False,
+         lineSep: Optional[str] = None,
+         pathGlobFilter: Optional[Union[bool, str]] = None,
+         recursiveFileLookup: Optional[Union[bool, str]] = None,
+     ) -> "DataFrame":
+         """
+         Loads a text file stream and returns a :class:`DataFrame` whose schema starts with a
+         string column named "value", followed by partitioned columns if there
+         are any.
+         The text files must be encoded as UTF-8.
+
+         By default, each line in the text file is a new row in the resulting DataFrame.
+
+         .. versionadded:: 2.0.0
+
+         .. versionchanged:: 3.5.0
+             Supports Spark Connect.
+
+         Parameters
+         ----------
+         path : str or list
+             string, or list of strings, for input path(s).
+
+         Other Parameters
+         ----------------
+         Extra options
+             For the extra options, refer to
+             `Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-text.html#data-source-option>`_
+             in the version you use.
+
+             .. # noqa
+
+         Notes
+         -----
+         This API is evolving.
+
+         Examples
+         --------
+         Load a data stream from a temporary text file.
+
+         >>> import tempfile
+         >>> import time
+         >>> with tempfile.TemporaryDirectory() as d:
+         ...     # Write a temporary text file to read it.
+         ...     spark.createDataFrame(
+         ...         [("hello",), ("this",)]).write.mode("overwrite").format("text").save(d)
+         ...
+         ...     # Start a streaming query to read the text file.
+         ...     q = spark.readStream.text(d).writeStream.format("console").start()
+         ...     time.sleep(3)
+         ...     q.stop()
+         """
+         self._set_opts(
+             wholetext=wholetext,
+             lineSep=lineSep,
+             pathGlobFilter=pathGlobFilter,
+             recursiveFileLookup=recursiveFileLookup,
+         )
+         if isinstance(path, str):
+             return self._df(self._jreader.text(path))
+         else:
+             raise PySparkTypeError(
+                 error_class="NOT_STR",
+                 message_parameters={"arg_name": "path", "arg_type": type(path).__name__},
+             )
+
+     def csv(
+         self,
+         path: str,
+         schema: Optional[Union[StructType, str]] = None,
+         sep: Optional[str] = None,
+         encoding: Optional[str] = None,
+         quote: Optional[str] = None,
+         escape: Optional[str] = None,
+         comment: Optional[str] = None,
+         header: Optional[Union[bool, str]] = None,
+         inferSchema: Optional[Union[bool, str]] = None,
+         ignoreLeadingWhiteSpace: Optional[Union[bool, str]] = None,
+         ignoreTrailingWhiteSpace: Optional[Union[bool, str]] = None,
+         nullValue: Optional[str] = None,
+         nanValue: Optional[str] = None,
+         positiveInf: Optional[str] = None,
+         negativeInf: Optional[str] = None,
+         dateFormat: Optional[str] = None,
+         timestampFormat: Optional[str] = None,
+         maxColumns: Optional[Union[int, str]] = None,
+         maxCharsPerColumn: Optional[Union[int, str]] = None,
+         maxMalformedLogPerPartition: Optional[Union[int, str]] = None,
+         mode: Optional[str] = None,
+         columnNameOfCorruptRecord: Optional[str] = None,
+         multiLine: Optional[Union[bool, str]] = None,
+         charToEscapeQuoteEscaping: Optional[Union[bool, str]] = None,
+         enforceSchema: Optional[Union[bool, str]] = None,
+         emptyValue: Optional[str] = None,
+         locale: Optional[str] = None,
+         lineSep: Optional[str] = None,
+         pathGlobFilter: Optional[Union[bool, str]] = None,
+         recursiveFileLookup: Optional[Union[bool, str]] = None,
+         unescapedQuoteHandling: Optional[str] = None,
+     ) -> "DataFrame":
+         r"""Loads a CSV file stream and returns the result as a :class:`DataFrame`.
+
+         This function will go through the input once to determine the input schema if
+         ``inferSchema`` is enabled. To avoid going through the entire data once, disable the
+         ``inferSchema`` option or specify the schema explicitly using ``schema``.
+
+         Parameters
+         ----------
+         path : str or list
+             string, or list of strings, for input path(s).
+         schema : :class:`pyspark.sql.types.StructType` or str, optional
+             an optional :class:`pyspark.sql.types.StructType` for the input schema
+             or a DDL-formatted string (For example ``col0 INT, col1 DOUBLE``).
+
+         .. versionadded:: 2.0.0
+
+         .. versionchanged:: 3.5.0
+             Supports Spark Connect.
+
+         Other Parameters
+         ----------------
+         Extra options
+             For the extra options, refer to
+             `Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-csv.html#data-source-option>`_
+             in the version you use.
+
+             .. # noqa
+
+         Notes
+         -----
+         This API is evolving.
+
+         Examples
+         --------
+         Load a data stream from a temporary CSV file.
+
+         >>> import tempfile
+         >>> import time
+         >>> with tempfile.TemporaryDirectory() as d:
+         ...     # Write a temporary CSV file to read it.
+         ...     spark.createDataFrame([(1, "2"),]).write.mode("overwrite").format("csv").save(d)
+         ...
+         ...     # Start a streaming query to read the CSV file.
+         ...     q = spark.readStream.schema(
+         ...         "col0 INT, col1 STRING"
+         ...     ).format("csv").load(d).writeStream.format("console").start()
+         ...     time.sleep(3)
+         ...     q.stop()
+         """
+         self._set_opts(
+             schema=schema,
+             sep=sep,
+             encoding=encoding,
+             quote=quote,
+             escape=escape,
+             comment=comment,
+             header=header,
+             inferSchema=inferSchema,
+             ignoreLeadingWhiteSpace=ignoreLeadingWhiteSpace,
+             ignoreTrailingWhiteSpace=ignoreTrailingWhiteSpace,
+             nullValue=nullValue,
+             nanValue=nanValue,
+             positiveInf=positiveInf,
+             negativeInf=negativeInf,
+             dateFormat=dateFormat,
+             timestampFormat=timestampFormat,
+             maxColumns=maxColumns,
+             maxCharsPerColumn=maxCharsPerColumn,
+             maxMalformedLogPerPartition=maxMalformedLogPerPartition,
+             mode=mode,
+             columnNameOfCorruptRecord=columnNameOfCorruptRecord,
+             multiLine=multiLine,
+             charToEscapeQuoteEscaping=charToEscapeQuoteEscaping,
+             enforceSchema=enforceSchema,
+             emptyValue=emptyValue,
+             locale=locale,
+             lineSep=lineSep,
+             pathGlobFilter=pathGlobFilter,
+             recursiveFileLookup=recursiveFileLookup,
+             unescapedQuoteHandling=unescapedQuoteHandling,
+         )
+         if isinstance(path, str):
+             return self._df(self._jreader.csv(path))
+         else:
+             raise PySparkTypeError(
+                 error_class="NOT_STR",
+                 message_parameters={"arg_name": "path", "arg_type": type(path).__name__},
+             )
+
+     def table(self, tableName: str) -> "DataFrame":
+         """Define a Streaming DataFrame on a Table. The DataSource corresponding to the table should
+         support streaming mode.
+
+         .. versionadded:: 3.1.0
+
+         .. versionchanged:: 3.5.0
+             Supports Spark Connect.
+
+         Parameters
+         ----------
+         tableName : str
+             string, for the name of the table.
+
+         Returns
+         -------
+         :class:`DataFrame`
+
+         Notes
+         -----
+         This API is evolving.
+
+         Examples
+         --------
+         Load a data stream from a table.
+
+         >>> import tempfile
+         >>> import time
+         >>> _ = spark.sql("DROP TABLE IF EXISTS my_table")
+         >>> with tempfile.TemporaryDirectory() as d:
+         ...     # Create a table with Rate source.
+         ...     q1 = spark.readStream.format("rate").load().writeStream.toTable(
+         ...         "my_table", checkpointLocation=d)
+         ...
+         ...     # Read the table back and print out in the console.
+         ...     q2 = spark.readStream.table("my_table").writeStream.format("console").start()
+         ...     time.sleep(3)
+         ...     q1.stop()
+         ...     q2.stop()
+         ...     _ = spark.sql("DROP TABLE my_table")
+         """
+         if isinstance(tableName, str):
+             return self._df(self._jreader.table(tableName))
+         else:
+             raise PySparkTypeError(
+                 error_class="NOT_STR",
+                 message_parameters={"arg_name": "tableName", "arg_type": type(tableName).__name__},
+             )
+
+
+ class DataStreamWriter:
+     """
+     Interface used to write a streaming :class:`DataFrame <pyspark.sql.DataFrame>` to external
+     storage systems (e.g. file systems, key-value stores, etc).
+     Use :attr:`DataFrame.writeStream <pyspark.sql.DataFrame.writeStream>`
+     to access this.
+
+     .. versionadded:: 2.0.0
+
+     .. versionchanged:: 3.5.0
+         Supports Spark Connect.
+
+     Notes
+     -----
+     This API is evolving.
+
+     Examples
+     --------
+     The example below uses the Rate source, which generates rows continuously.
+     After that, we apply a modulo by 3 and then write the stream out to the console.
+     The streaming query is stopped after 3 seconds.
+
+     >>> import time
+     >>> df = spark.readStream.format("rate").load()
+     >>> df = df.selectExpr("value % 3 as v")
+     >>> q = df.writeStream.format("console").start()
+     >>> time.sleep(3)
+     >>> q.stop()
+     """
+
+     def __init__(self, df: "DataFrame") -> None:
+         self._df = df
+         self._spark = df.sparkSession
+         self._jwrite = df._jdf.writeStream()
+
+     def _sq(self, jsq: JavaObject) -> StreamingQuery:
+         return StreamingQuery(jsq)
+
+     def outputMode(self, outputMode: str) -> "DataStreamWriter":
+         """Specifies how data of a streaming DataFrame/Dataset is written to a streaming sink.
+
+         .. versionadded:: 2.0.0
+
+         .. versionchanged:: 3.5.0
+             Supports Spark Connect.
+
+         Options include:
+
+         * `append`: Only the new rows in the streaming DataFrame/Dataset will be written to
+           the sink
+         * `complete`: All the rows in the streaming DataFrame/Dataset will be written to the sink
+           every time there are some updates
+         * `update`: only the rows that were updated in the streaming DataFrame/Dataset will be
+           written to the sink every time there are some updates. If the query doesn't contain
+           aggregations, it will be equivalent to `append` mode.
+
+         Notes
+         -----
+         This API is evolving.
+
+         Examples
+         --------
+         >>> df = spark.readStream.format("rate").load()
+         >>> df.writeStream.outputMode('append')
+         <...streaming.readwriter.DataStreamWriter object ...>
+
+         The example below uses complete mode so that the entire aggregated counts are printed out.
+
+         >>> import time
+         >>> df = spark.readStream.format("rate").option("rowsPerSecond", 10).load()
+         >>> df = df.groupby().count()
+         >>> q = df.writeStream.outputMode("complete").format("console").start()
+         >>> time.sleep(3)
+         >>> q.stop()
+         """
+         if not outputMode or type(outputMode) != str or len(outputMode.strip()) == 0:
+             raise PySparkValueError(
+                 error_class="VALUE_NOT_NON_EMPTY_STR",
+                 message_parameters={"arg_name": "outputMode", "arg_value": str(outputMode)},
+             )
+         self._jwrite = self._jwrite.outputMode(outputMode)
+         return self
+
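The docstring above demonstrates ``complete`` mode; a corresponding sketch of ``update`` mode, which per the option list emits only the rows whose aggregate changed in each batch (assuming the same Rate source):

>>> import time
>>> df = spark.readStream.format("rate").option("rowsPerSecond", 10).load()
>>> q = df.groupBy("value").count().writeStream.outputMode("update").format("console").start()
>>> time.sleep(3)
>>> q.stop()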
855
+ def format(self, source: str) -> "DataStreamWriter":
856
+ """Specifies the underlying output data source.
857
+
858
+ .. versionadded:: 2.0.0
859
+
860
+ .. versionchanged:: 3.5.0
861
+ Supports Spark Connect.
862
+
863
+ Parameters
864
+ ----------
865
+ source : str
866
+ string, name of the data source, which for now can be 'parquet'.
867
+
868
+ Notes
869
+ -----
870
+ This API is evolving.
871
+
872
+ Examples
873
+ --------
874
+ >>> df = spark.readStream.format("rate").load()
875
+ >>> df.writeStream.format("text")
876
+ <...streaming.readwriter.DataStreamWriter object ...>
877
+
878
+ This API allows to configure the source to write. The example below writes a CSV
879
+ file from Rate source in a streaming manner.
880
+
881
+ >>> import tempfile
882
+ >>> import time
883
+ >>> with tempfile.TemporaryDirectory() as d, tempfile.TemporaryDirectory() as cp:
884
+ ... df = spark.readStream.format("rate").load()
885
+ ... q = df.writeStream.format("csv").option("checkpointLocation", cp).start(d)
886
+ ... time.sleep(5)
887
+ ... q.stop()
888
+ ... spark.read.schema("timestamp TIMESTAMP, value STRING").csv(d).show()
889
+ +...---------+-----+
890
+ |...timestamp|value|
891
+ +...---------+-----+
892
+ ...
893
+ """
894
+ self._jwrite = self._jwrite.format(source)
895
+ return self
896
+
897
+    def option(self, key: str, value: "OptionalPrimitiveType") -> "DataStreamWriter":
+        """Adds an output option for the underlying data source.
+
+        .. versionadded:: 2.0.0
+
+        .. versionchanged:: 3.5.0
+            Supports Spark Connect.
+
+        Notes
+        -----
+        This API is evolving.
+
+        Examples
+        --------
+        >>> df = spark.readStream.format("rate").load()
+        >>> df.writeStream.option("x", 1)
+        <...streaming.readwriter.DataStreamWriter object ...>
+
+        The example below sets the 'numRows' option on the Console sink so that it prints
+        3 rows for every batch.
+
+        >>> import time
+        >>> q = spark.readStream.format(
+        ...     "rate").option("rowsPerSecond", 10).load().writeStream.format(
+        ...     "console").option("numRows", 3).start()
+        >>> time.sleep(3)
+        >>> q.stop()
+        """
+        self._jwrite = self._jwrite.option(key, to_str(value))
+        return self
+
+    def options(self, **options: "OptionalPrimitiveType") -> "DataStreamWriter":
+        """Adds output options for the underlying data source.
+
+        .. versionadded:: 2.0.0
+
+        .. versionchanged:: 3.5.0
+            Supports Spark Connect.
+
+        Notes
+        -----
+        This API is evolving.
+
+        Examples
+        --------
+        >>> df = spark.readStream.format("rate").load()
+        >>> df.writeStream.options(x=1)
+        <...streaming.readwriter.DataStreamWriter object ...>
+
+        The example below sets the 'numRows' and 'truncate' options on the Console sink so
+        that it prints 3 rows for every batch without truncating the results.
+
+        >>> import time
+        >>> q = spark.readStream.format(
+        ...     "rate").option("rowsPerSecond", 10).load().writeStream.format(
+        ...     "console").options(numRows=3, truncate=False).start()
+        >>> time.sleep(3)
+        >>> q.stop()
+        """
+        for k in options:
+            self._jwrite = self._jwrite.option(k, to_str(options[k]))
+        return self
+
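Both entry points funnel values through `to_str`, so non-string primitives are stringified before reaching the JVM writer; a small sketch of the equivalence, with hypothetical option values:

    # These two writers end up with identical underlying options;
    # 3 and False are converted by to_str on the way in.
    w1 = df.writeStream.option("numRows", 3).option("truncate", False)
    w2 = df.writeStream.options(numRows=3, truncate=False)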
+    @overload
+    def partitionBy(self, *cols: str) -> "DataStreamWriter":
+        ...
+
+    @overload
+    def partitionBy(self, __cols: List[str]) -> "DataStreamWriter":
+        ...
+
+    def partitionBy(self, *cols: str) -> "DataStreamWriter":  # type: ignore[misc]
+        """Partitions the output by the given columns on the file system.
+
+        If specified, the output is laid out on the file system similar
+        to Hive's partitioning scheme.
+
+        .. versionadded:: 2.0.0
+
+        .. versionchanged:: 3.5.0
+            Supports Spark Connect.
+
+        Parameters
+        ----------
+        cols : str or list
+            name of columns
+
+        Notes
+        -----
+        This API is evolving.
+
+        Examples
+        --------
+        >>> df = spark.readStream.format("rate").load()
+        >>> df.writeStream.partitionBy("value")
+        <...streaming.readwriter.DataStreamWriter object ...>
+
+        Partition by the timestamp column from the Rate source.
+
+        >>> import tempfile
+        >>> import time
+        >>> with tempfile.TemporaryDirectory() as d, tempfile.TemporaryDirectory() as cp:
+        ...     df = spark.readStream.format("rate").option("rowsPerSecond", 10).load()
+        ...     q = df.writeStream.partitionBy(
+        ...         "timestamp").format("parquet").option("checkpointLocation", cp).start(d)
+        ...     time.sleep(5)
+        ...     q.stop()
+        ...     spark.read.schema(df.schema).parquet(d).show()
+        +...---------+-----+
+        |...timestamp|value|
+        +...---------+-----+
+        ...
+        """
+        if len(cols) == 1 and isinstance(cols[0], (list, tuple)):
+            cols = cols[0]
+        self._jwrite = self._jwrite.partitionBy(_to_seq(self._spark._sc, cols))
+        return self
+
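The Hive-style layout this produces is worth seeing once; a hypothetical sketch of the directory structure for `partitionBy("value")` with a parquet sink, and the partition-pruned read it enables:

    # After the query has written a few batches, the output directory d
    # contains one subdirectory per partition value, e.g.:
    #   d/value=0/part-00000-....parquet
    #   d/value=1/part-00000-....parquet
    # Reads that filter on the partition column can skip whole directories:
    spark.read.parquet(d).where("value = 1").show()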
+    def queryName(self, queryName: str) -> "DataStreamWriter":
+        """Specifies the name of the :class:`StreamingQuery` that can be started with
+        :func:`start`. This name must be unique among all the currently active queries
+        in the associated SparkSession.
+
+        .. versionadded:: 2.0.0
+
+        .. versionchanged:: 3.5.0
+            Supports Spark Connect.
+
+        Parameters
+        ----------
+        queryName : str
+            unique name for the query
+
+        Notes
+        -----
+        This API is evolving.
+
+        Examples
+        --------
+        >>> df = spark.readStream.format("rate").load()
+        >>> q = df.writeStream.queryName("streaming_query").format("console").start()
+        >>> q.stop()
+        >>> q.name
+        'streaming_query'
+        """
+        if not queryName or type(queryName) != str or len(queryName.strip()) == 0:
+            raise PySparkValueError(
+                error_class="VALUE_NOT_NON_EMPTY_STR",
+                message_parameters={"arg_name": "queryName", "arg_value": str(queryName)},
+            )
+        self._jwrite = self._jwrite.queryName(queryName)
+        return self
+
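Because the name is unique among active queries, it is a convenient lookup key; a short sketch, assuming a query named as above is still running:

    # Find a running query by the name given to queryName().
    named = [s for s in spark.streams.active if s.name == "streaming_query"]
    if named:
        print(named[0].id, named[0].isActive)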
+    @overload
+    def trigger(self, *, processingTime: str) -> "DataStreamWriter":
+        ...
+
+    @overload
+    def trigger(self, *, once: bool) -> "DataStreamWriter":
+        ...
+
+    @overload
+    def trigger(self, *, continuous: str) -> "DataStreamWriter":
+        ...
+
+    @overload
+    def trigger(self, *, availableNow: bool) -> "DataStreamWriter":
+        ...
+
+    def trigger(
+        self,
+        *,
+        processingTime: Optional[str] = None,
+        once: Optional[bool] = None,
+        continuous: Optional[str] = None,
+        availableNow: Optional[bool] = None,
+    ) -> "DataStreamWriter":
+        """Set the trigger for the stream query. If this is not set it will run the query as fast
+        as possible, which is equivalent to setting the trigger to ``processingTime='0 seconds'``.
+
+        .. versionadded:: 2.0.0
+
+        .. versionchanged:: 3.5.0
+            Supports Spark Connect.
+
+        Parameters
+        ----------
+        processingTime : str, optional
+            a processing time interval as a string, e.g. '5 seconds', '1 minute'.
+            Set a trigger that runs a microbatch query periodically based on the
+            processing time. Only one trigger can be set.
+        once : bool, optional
+            if set to True, set a trigger that processes only one batch of data in a
+            streaming query then terminates the query. Only one trigger can be set.
+        continuous : str, optional
+            a time interval as a string, e.g. '5 seconds', '1 minute'.
+            Set a trigger that runs a continuous query with a given checkpoint
+            interval. Only one trigger can be set.
+        availableNow : bool, optional
+            if set to True, set a trigger that processes all available data in multiple
+            batches then terminates the query. Only one trigger can be set.
+
+        Notes
+        -----
+        This API is evolving.
+
+        Examples
+        --------
+        >>> df = spark.readStream.format("rate").load()
+
+        Trigger the query for execution every 5 seconds
+
+        >>> df.writeStream.trigger(processingTime='5 seconds')
+        <...streaming.readwriter.DataStreamWriter object ...>
+
+        Trigger the query to run continuously with a 5-second checkpoint interval
+
+        >>> df.writeStream.trigger(continuous='5 seconds')
+        <...streaming.readwriter.DataStreamWriter object ...>
+
+        Trigger the query for reading all available data with multiple batches
+
+        >>> df.writeStream.trigger(availableNow=True)
+        <...streaming.readwriter.DataStreamWriter object ...>
+        """
+        params = [processingTime, once, continuous, availableNow]
+
+        if params.count(None) == 4:
+            raise PySparkValueError(
+                error_class="ONLY_ALLOW_SINGLE_TRIGGER",
+                message_parameters={},
+            )
+        elif params.count(None) < 3:
+            raise PySparkValueError(
+                error_class="ONLY_ALLOW_SINGLE_TRIGGER",
+                message_parameters={},
+            )
+
+        jTrigger = None
+        assert self._spark._sc._jvm is not None
+        if processingTime is not None:
+            if type(processingTime) != str or len(processingTime.strip()) == 0:
+                raise PySparkValueError(
+                    error_class="VALUE_NOT_NON_EMPTY_STR",
+                    message_parameters={
+                        "arg_name": "processingTime",
+                        "arg_value": str(processingTime),
+                    },
+                )
+            interval = processingTime.strip()
+            jTrigger = self._spark._sc._jvm.org.apache.spark.sql.streaming.Trigger.ProcessingTime(
+                interval
+            )
+
+        elif once is not None:
+            if once is not True:
+                raise PySparkValueError(
+                    error_class="VALUE_NOT_TRUE",
+                    message_parameters={"arg_name": "once", "arg_value": str(once)},
+                )
+
+            jTrigger = self._spark._sc._jvm.org.apache.spark.sql.streaming.Trigger.Once()
+
+        elif continuous is not None:
+            if type(continuous) != str or len(continuous.strip()) == 0:
+                raise PySparkValueError(
+                    error_class="VALUE_NOT_NON_EMPTY_STR",
+                    message_parameters={"arg_name": "continuous", "arg_value": str(continuous)},
+                )
+            interval = continuous.strip()
+            jTrigger = self._spark._sc._jvm.org.apache.spark.sql.streaming.Trigger.Continuous(
+                interval
+            )
+        else:
+            if availableNow is not True:
+                raise PySparkValueError(
+                    error_class="VALUE_NOT_TRUE",
+                    message_parameters={"arg_name": "availableNow", "arg_value": str(availableNow)},
+                )
+            jTrigger = self._spark._sc._jvm.org.apache.spark.sql.streaming.Trigger.AvailableNow()
+
+        self._jwrite = self._jwrite.trigger(jTrigger)
+        return self
+
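The validation above enforces exactly one trigger keyword; a quick sketch of what callers can expect, with a hypothetical writer `w` on any streaming DataFrame:

    w = df.writeStream.format("console")
    w.trigger(availableNow=True)  # OK: exactly one trigger
    try:
        w.trigger()  # zero triggers
    except Exception as e:
        print(type(e).__name__)  # PySparkValueError
    try:
        w.trigger(once=True, processingTime="5 seconds")  # two triggers
    except Exception as e:
        print(type(e).__name__)  # PySparkValueError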
+    @staticmethod
+    def _construct_foreach_function(
+        f: Union[Callable[[Row], None], "SupportsProcess"]
+    ) -> Callable[[Any, Iterator], Iterator]:
+        from pyspark.taskcontext import TaskContext
+
+        if callable(f):
+            # The provided object is a callable function that is supposed to be called on each row.
+            # Construct a function that takes an iterator and calls the provided function on each
+            # row.
+            def func_without_process(_: Any, iterator: Iterator) -> Iterator:
+                for x in iterator:
+                    f(x)  # type: ignore[operator]
+                return iter([])
+
+            return func_without_process
+
+        else:
+            # The provided object is not a callable function. Then it is expected to have a
+            # 'process(row)' method, and optional 'open(partition_id, epoch_id)' and
+            # 'close(error)' methods.
+
+            if not hasattr(f, "process"):
+                raise PySparkAttributeError(
+                    error_class="ATTRIBUTE_NOT_CALLABLE",
+                    message_parameters={"attr_name": "process", "obj_name": "f"},
+                )
+
+            if not callable(getattr(f, "process")):
+                raise PySparkAttributeError(
+                    error_class="ATTRIBUTE_NOT_CALLABLE",
+                    message_parameters={"attr_name": "process", "obj_name": "f"},
+                )
+
+            def doesMethodExist(method_name: str) -> bool:
+                exists = hasattr(f, method_name)
+                if exists and not callable(getattr(f, method_name)):
+                    raise PySparkAttributeError(
+                        error_class="ATTRIBUTE_NOT_CALLABLE",
+                        message_parameters={"attr_name": method_name, "obj_name": "f"},
+                    )
+                return exists
+
+            open_exists = doesMethodExist("open")
+            close_exists = doesMethodExist("close")
+
+            def func_with_open_process_close(partition_id: Any, iterator: Iterator) -> Iterator:
+                epoch_id = cast(TaskContext, TaskContext.get()).getLocalProperty(
+                    "streaming.sql.batchId"
+                )
+                if epoch_id:
+                    int_epoch_id = int(epoch_id)
+                else:
+                    raise PySparkRuntimeError(
+                        error_class="CANNOT_GET_BATCH_ID",
+                        message_parameters={"obj_name": "TaskContext"},
+                    )
+
+                # Check if the data should be processed
+                should_process = True
+                if open_exists:
+                    should_process = f.open(partition_id, int_epoch_id)  # type: ignore[union-attr]
+
+                error = None
+
+                try:
+                    if should_process:
+                        for x in iterator:
+                            cast("SupportsProcess", f).process(x)
+                except Exception as ex:
+                    error = ex
+                finally:
+                    if close_exists:
+                        f.close(error)  # type: ignore[union-attr]
+                    if error:
+                        raise error
+
+                return iter([])
+
+            return func_with_open_process_close
+
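For the callable case, the constructed wrapper simply drains the partition iterator through the user function; a sketch that exercises the helper directly (the helper is private, so this is illustration only, assuming the `DataStreamWriter` class above is importable):

    rows = iter([1, 2, 3])
    seen = []
    func = DataStreamWriter._construct_foreach_function(seen.append)
    list(func(0, rows))  # the wrapper yields nothing
    print(seen)          # [1, 2, 3]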
+    @overload
+    def foreach(self, f: Callable[[Row], None]) -> "DataStreamWriter":
+        ...
+
+    @overload
+    def foreach(self, f: "SupportsProcess") -> "DataStreamWriter":
+        ...
+
+    def foreach(self, f: Union[Callable[[Row], None], "SupportsProcess"]) -> "DataStreamWriter":
+        """
+        Sets the output of the streaming query to be processed using the provided writer ``f``.
+        This is often used to write the output of a streaming query to arbitrary storage systems.
+        The processing logic can be specified in two ways.
+
+        #. A **function** that takes a row as input.
+            This is a simple way to express your processing logic. Note that this does
+            not allow you to deduplicate generated data when failures cause reprocessing of
+            some input data. That would require you to specify the processing logic in the next
+            way.
+
+        #. An **object** with a ``process`` method and optional ``open`` and ``close`` methods.
+            The object can have the following methods.
+
+            * ``open(partition_id, epoch_id)``: *Optional* method that initializes the processing
+                (for example, open a connection, start a transaction, etc). Additionally, you can
+                use the `partition_id` and `epoch_id` to deduplicate regenerated data
+                (discussed later).
+
+            * ``process(row)``: *Non-optional* method that processes each :class:`Row`.
+
+            * ``close(error)``: *Optional* method that finalizes and cleans up (for example,
+                close connection, commit transaction, etc.) after all rows have been processed.
+
+            The object will be used by Spark in the following way.
+
+            * A single copy of this object is responsible for all the data generated by a
+                single task in a query. In other words, one instance is responsible for
+                processing one partition of the data generated in a distributed manner.
+
+            * This object must be serializable because each task will get a fresh
+                serialized-deserialized copy of the provided object. Hence, it is strongly
+                recommended that any initialization for writing data (e.g. opening a
+                connection or starting a transaction) is done after the `open(...)`
+                method has been called, which signifies that the task is ready to generate data.
+
+            * The lifecycle of the methods is as follows.
+
+                For each partition with ``partition_id``:
+
+                ... For each batch/epoch of streaming data with ``epoch_id``:
+
+                ....... Method ``open(partitionId, epochId)`` is called.
+
+                ....... If ``open(...)`` returns true, for each row in the partition and
+                        batch/epoch, method ``process(row)`` is called.
+
+                ....... Method ``close(errorOrNull)`` is called with error (if any) seen while
+                        processing rows.
+
+        Important points to note:
+
+        * The `partitionId` and `epochId` can be used to deduplicate generated data when
+            failures cause reprocessing of some input data. This depends on the execution
+            mode of the query. If the streaming query is being executed in the micro-batch
+            mode, then every partition represented by a unique tuple (partition_id, epoch_id)
+            is guaranteed to have the same data. Hence, (partition_id, epoch_id) can be used
+            to deduplicate and/or transactionally commit data and achieve exactly-once
+            guarantees. However, if the streaming query is being executed in the continuous
+            mode, then this guarantee does not hold and therefore should not be used for
+            deduplication.
+
+        * The ``close()`` method (if it exists) will be called if the ``open()`` method exists
+            and returns successfully (irrespective of the return value), except if the Python
+            process crashes in the middle.
+
+        .. versionadded:: 2.4.0
+
+        .. versionchanged:: 3.5.0
+            Supports Spark Connect.
+
+        Notes
+        -----
+        This API is evolving.
+
+        Examples
+        --------
+        >>> import time
+        >>> df = spark.readStream.format("rate").load()
+
+        Print every row using a function
+
+        >>> def print_row(row):
+        ...     print(row)
+        ...
+        >>> q = df.writeStream.foreach(print_row).start()
+        >>> time.sleep(3)
+        >>> q.stop()
+
+        Print every row using an object with a process() method
+
+        >>> class RowPrinter:
+        ...     def open(self, partition_id, epoch_id):
+        ...         print("Opened %d, %d" % (partition_id, epoch_id))
+        ...         return True
+        ...
+        ...     def process(self, row):
+        ...         print(row)
+        ...
+        ...     def close(self, error):
+        ...         print("Closed with error: %s" % str(error))
+        ...
+        >>> q = df.writeStream.foreach(RowPrinter()).start()
+        >>> time.sleep(3)
+        >>> q.stop()
+        """
+
+        from pyspark.rdd import _wrap_function
+        from pyspark.serializers import CPickleSerializer, AutoBatchedSerializer
+
+        func = self._construct_foreach_function(f)
+        serializer = AutoBatchedSerializer(CPickleSerializer())
+        wrapped_func = _wrap_function(self._spark._sc, func, serializer, serializer)
+        assert self._spark._sc._jvm is not None
+        jForeachWriter = (
+            self._spark._sc._jvm.org.apache.spark.sql.execution.python.PythonForeachWriter(
+                wrapped_func, self._df._jdf.schema()
+            )
+        )
+        self._jwrite.foreach(jForeachWriter)
+        return self
+
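A sketch of the deduplication idea from the notes above, keyed on (partition_id, epoch_id); the in-memory set is a stand-in for a durable commit marker in a real transactional sink (and, being per-process, it only illustrates the shape of the protocol):

    _committed = set()

    class IdempotentWriter:
        def open(self, partition_id, epoch_id):
            self.key = (partition_id, epoch_id)
            self.rows = []
            # Returning False skips a partition/epoch that was already
            # committed, e.g. when a failed task is retried.
            return self.key not in _committed

        def process(self, row):
            self.rows.append(row)

        def close(self, error):
            if error is None:
                # In a real sink, write self.rows and the marker atomically.
                _committed.add(self.key)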
+    def foreachBatch(self, func: Callable[["DataFrame", int], None]) -> "DataStreamWriter":
+        """
+        Sets the output of the streaming query to be processed using the provided
+        function. This is supported only in the micro-batch execution mode (that is, when the
+        trigger is not continuous). The provided function will be called in
+        every micro-batch with (i) the output rows as a DataFrame and (ii) the batch identifier.
+        The batchId can be used to deduplicate and transactionally write the output
+        (that is, the provided Dataset) to external systems. The output DataFrame is guaranteed
+        to be exactly the same for the same batchId (assuming all operations are deterministic
+        in the query).
+
+        .. versionadded:: 2.4.0
+
+        .. versionchanged:: 3.5.0
+            Supports Spark Connect.
+
+        Notes
+        -----
+        This API is evolving.
+        This function behaves differently in Spark Connect mode. See examples.
+        In Connect, the provided function doesn't have access to variables defined outside of it.
+
+        Examples
+        --------
+        >>> import time
+        >>> df = spark.readStream.format("rate").load()
+        >>> my_value = -1
+        >>> def func(batch_df, batch_id):
+        ...     global my_value
+        ...     my_value = 100
+        ...     batch_df.collect()
+        ...
+        >>> q = df.writeStream.foreachBatch(func).start()
+        >>> time.sleep(3)
+        >>> q.stop()
+        >>> # if in Spark Connect, my_value = -1, else my_value = 100
+        """
+
+        from pyspark.java_gateway import ensure_callback_server_started
+
+        gw = self._spark._sc._gateway
+        assert gw is not None
+        java_import(gw.jvm, "org.apache.spark.sql.execution.streaming.sources.*")
+
+        wrapped_func = ForeachBatchFunction(self._spark, func)
+        gw.jvm.PythonForeachBatchHelper.callForeachBatch(self._jwrite, wrapped_func)
+        ensure_callback_server_started(gw)
+        return self
+
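Since the batch identifier is stable across replays, it can key idempotent writes; a sketch with a hypothetical output path and an in-memory committed-set standing in for durable state:

    committed = set()

    def write_batch(batch_df, batch_id):
        if batch_id in committed:  # replay after a restart: skip
            return
        batch_df.write.mode("append").parquet("/tmp/out")  # hypothetical path
        committed.add(batch_id)

    q = (df.writeStream.foreachBatch(write_batch)
         .option("checkpointLocation", "/tmp/cp").start())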
+    def start(
+        self,
+        path: Optional[str] = None,
+        format: Optional[str] = None,
+        outputMode: Optional[str] = None,
+        partitionBy: Optional[Union[str, List[str]]] = None,
+        queryName: Optional[str] = None,
+        **options: "OptionalPrimitiveType",
+    ) -> StreamingQuery:
+        """Streams the contents of the :class:`DataFrame` to a data source.
+
+        The data source is specified by the ``format`` and a set of ``options``.
+        If ``format`` is not specified, the default data source configured by
+        ``spark.sql.sources.default`` will be used.
+
+        .. versionadded:: 2.0.0
+
+        .. versionchanged:: 3.5.0
+            Supports Spark Connect.
+
+        Parameters
+        ----------
+        path : str, optional
+            the path in a Hadoop supported file system
+        format : str, optional
+            the format used to save
+        outputMode : str, optional
+            specifies how data of a streaming DataFrame/Dataset is written to a
+            streaming sink.
+
+            * `append`: Only the new rows in the streaming DataFrame/Dataset will be written to the
+              sink
+            * `complete`: All the rows in the streaming DataFrame/Dataset will be written to the
+              sink every time there are some updates
+            * `update`: only the rows that were updated in the streaming DataFrame/Dataset will be
+              written to the sink every time there are some updates. If the query doesn't contain
+              aggregations, it will be equivalent to `append` mode.
+        partitionBy : str or list, optional
+            names of partitioning columns
+        queryName : str, optional
+            unique name for the query
+        **options : dict
+            All other string options. You may want to provide a `checkpointLocation`
+            for most streams, however it is not required for a `memory` stream.
+
+        Notes
+        -----
+        This API is evolving.
+
+        Examples
+        --------
+        >>> df = spark.readStream.format("rate").load()
+
+        Basic example.
+
+        >>> q = df.writeStream.format('memory').queryName('this_query').start()
+        >>> q.isActive
+        True
+        >>> q.name
+        'this_query'
+        >>> q.stop()
+        >>> q.isActive
+        False
+
+        Example with using other parameters with a trigger.
+
+        >>> q = df.writeStream.trigger(processingTime='5 seconds').start(
+        ...     queryName='that_query', outputMode="append", format='memory')
+        >>> q.name
+        'that_query'
+        >>> q.isActive
+        True
+        >>> q.stop()
+        """
+        self.options(**options)
+        if outputMode is not None:
+            self.outputMode(outputMode)
+        if partitionBy is not None:
+            self.partitionBy(partitionBy)
+        if format is not None:
+            self.format(format)
+        if queryName is not None:
+            self.queryName(queryName)
+        if path is None:
+            return self._sq(self._jwrite.start())
+        else:
+            return self._sq(self._jwrite.start(path))
+
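As the body shows, the keyword arguments of `start` are shorthand for the corresponding builder calls, so the following two invocations configure equivalent queries (hypothetical query names):

    q1 = df.writeStream.format("memory").outputMode("append").queryName("a").start()
    q2 = df.writeStream.start(format="memory", outputMode="append", queryName="b")
    q1.stop(); q2.stop()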
+    def toTable(
+        self,
+        tableName: str,
+        format: Optional[str] = None,
+        outputMode: Optional[str] = None,
+        partitionBy: Optional[Union[str, List[str]]] = None,
+        queryName: Optional[str] = None,
+        **options: "OptionalPrimitiveType",
+    ) -> StreamingQuery:
+        """
+        Starts the execution of the streaming query, which will continually output results to the
+        given table as new data arrives.
+
+        The returned :class:`StreamingQuery` object can be used to interact with the stream.
+
+        .. versionadded:: 3.1.0
+
+        .. versionchanged:: 3.5.0
+            Supports Spark Connect.
+
+        Parameters
+        ----------
+        tableName : str
+            string, for the name of the table.
+        format : str, optional
+            the format used to save.
+        outputMode : str, optional
+            specifies how data of a streaming DataFrame/Dataset is written to a
+            streaming sink.
+
+            * `append`: Only the new rows in the streaming DataFrame/Dataset will be written to the
+              sink
+            * `complete`: All the rows in the streaming DataFrame/Dataset will be written to the
+              sink every time there are some updates
+            * `update`: only the rows that were updated in the streaming DataFrame/Dataset will be
+              written to the sink every time there are some updates. If the query doesn't contain
+              aggregations, it will be equivalent to `append` mode.
+        partitionBy : str or list, optional
+            names of partitioning columns
+        queryName : str, optional
+            unique name for the query
+        **options : dict
+            All other string options. You may want to provide a `checkpointLocation`.
+
+        Notes
+        -----
+        This API is evolving.
+
+        For a v1 table, partitioning columns provided by `partitionBy` will be respected
+        whether or not the table exists. A new table will be created if the table does
+        not exist.
+
+        For a v2 table, `partitionBy` will be ignored if the table already exists. `partitionBy`
+        will be respected only if the v2 table does not exist. Besides, the v2 table created by
+        this API lacks some functionalities (e.g., customized properties, options, and serde
+        info). If you need them, please create the v2 table manually before the execution to
+        avoid creating a table with incomplete information.
+
+        Examples
+        --------
+        Save a data stream to a table.
+
+        >>> import tempfile
+        >>> import time
+        >>> _ = spark.sql("DROP TABLE IF EXISTS my_table2")
+        >>> with tempfile.TemporaryDirectory() as d:
+        ...     # Create a table with Rate source.
+        ...     q = spark.readStream.format("rate").option(
+        ...         "rowsPerSecond", 10).load().writeStream.toTable(
+        ...             "my_table2",
+        ...             queryName='that_query',
+        ...             outputMode="append",
+        ...             format='parquet',
+        ...             checkpointLocation=d)
+        ...     time.sleep(3)
+        ...     q.stop()
+        ...     spark.read.table("my_table2").show()
+        ...     _ = spark.sql("DROP TABLE my_table2")
+        +...---------+-----+
+        |...timestamp|value|
+        +...---------+-----+
+        ...
+        """
+        self.options(**options)
+        if outputMode is not None:
+            self.outputMode(outputMode)
+        if partitionBy is not None:
+            self.partitionBy(partitionBy)
+        if format is not None:
+            self.format(format)
+        if queryName is not None:
+            self.queryName(queryName)
+        return self._sq(self._jwrite.toTable(tableName))
+
+
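The usual pattern pairs `toTable` with a checkpoint and then reads the table back as an ordinary batch table; a minimal sketch with hypothetical table and checkpoint names:

    import time

    q = (spark.readStream.format("rate").load()
         .writeStream.toTable("events", checkpointLocation="/tmp/cp2"))
    time.sleep(3)  # let a batch or two commit before reading
    spark.read.table("events").groupBy().count().show()
    q.stop()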
+def _test() -> None:
+    import doctest
+    import os
+    import sys
+    from pyspark.sql import SparkSession
+    import pyspark.sql.streaming.readwriter
+
+    os.chdir(os.environ["SPARK_HOME"])
+
+    globs = pyspark.sql.streaming.readwriter.__dict__.copy()
+    globs["spark"] = (
+        SparkSession.builder.master("local[4]")
+        .appName("sql.streaming.readwriter tests")
+        .getOrCreate()
+    )
+
+    (failure_count, test_count) = doctest.testmod(
+        pyspark.sql.streaming.readwriter,
+        globs=globs,
+        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF,
+    )
+    globs["spark"].stop()
+
+    if failure_count:
+        sys.exit(-1)
+
+
+if __name__ == "__main__":
+    _test()