snowpark-connect 0.20.2 (py3-none-any.whl)

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of snowpark-connect might be problematic.

Files changed (879)
  1. snowflake/snowpark_connect/__init__.py +23 -0
  2. snowflake/snowpark_connect/analyze_plan/__init__.py +3 -0
  3. snowflake/snowpark_connect/analyze_plan/map_tree_string.py +38 -0
  4. snowflake/snowpark_connect/column_name_handler.py +735 -0
  5. snowflake/snowpark_connect/config.py +576 -0
  6. snowflake/snowpark_connect/constants.py +47 -0
  7. snowflake/snowpark_connect/control_server.py +52 -0
  8. snowflake/snowpark_connect/dataframe_name_handler.py +54 -0
  9. snowflake/snowpark_connect/date_time_format_mapping.py +399 -0
  10. snowflake/snowpark_connect/empty_dataframe.py +18 -0
  11. snowflake/snowpark_connect/error/__init__.py +11 -0
  12. snowflake/snowpark_connect/error/error_mapping.py +6174 -0
  13. snowflake/snowpark_connect/error/error_utils.py +321 -0
  14. snowflake/snowpark_connect/error/exceptions.py +24 -0
  15. snowflake/snowpark_connect/execute_plan/__init__.py +3 -0
  16. snowflake/snowpark_connect/execute_plan/map_execution_command.py +204 -0
  17. snowflake/snowpark_connect/execute_plan/map_execution_root.py +173 -0
  18. snowflake/snowpark_connect/execute_plan/utils.py +183 -0
  19. snowflake/snowpark_connect/expression/__init__.py +3 -0
  20. snowflake/snowpark_connect/expression/literal.py +90 -0
  21. snowflake/snowpark_connect/expression/map_cast.py +343 -0
  22. snowflake/snowpark_connect/expression/map_expression.py +293 -0
  23. snowflake/snowpark_connect/expression/map_extension.py +104 -0
  24. snowflake/snowpark_connect/expression/map_sql_expression.py +633 -0
  25. snowflake/snowpark_connect/expression/map_udf.py +142 -0
  26. snowflake/snowpark_connect/expression/map_unresolved_attribute.py +241 -0
  27. snowflake/snowpark_connect/expression/map_unresolved_extract_value.py +85 -0
  28. snowflake/snowpark_connect/expression/map_unresolved_function.py +9450 -0
  29. snowflake/snowpark_connect/expression/map_unresolved_star.py +218 -0
  30. snowflake/snowpark_connect/expression/map_update_fields.py +164 -0
  31. snowflake/snowpark_connect/expression/map_window_function.py +258 -0
  32. snowflake/snowpark_connect/expression/typer.py +125 -0
  33. snowflake/snowpark_connect/includes/__init__.py +0 -0
  34. snowflake/snowpark_connect/includes/jars/antlr4-runtime-4.9.3.jar +0 -0
  35. snowflake/snowpark_connect/includes/jars/commons-cli-1.5.0.jar +0 -0
  36. snowflake/snowpark_connect/includes/jars/commons-codec-1.16.1.jar +0 -0
  37. snowflake/snowpark_connect/includes/jars/commons-collections-3.2.2.jar +0 -0
  38. snowflake/snowpark_connect/includes/jars/commons-collections4-4.4.jar +0 -0
  39. snowflake/snowpark_connect/includes/jars/commons-compiler-3.1.9.jar +0 -0
  40. snowflake/snowpark_connect/includes/jars/commons-compress-1.26.0.jar +0 -0
  41. snowflake/snowpark_connect/includes/jars/commons-crypto-1.1.0.jar +0 -0
  42. snowflake/snowpark_connect/includes/jars/commons-dbcp-1.4.jar +0 -0
  43. snowflake/snowpark_connect/includes/jars/commons-io-2.16.1.jar +0 -0
  44. snowflake/snowpark_connect/includes/jars/commons-lang-2.6.jar +0 -0
  45. snowflake/snowpark_connect/includes/jars/commons-lang3-3.12.0.jar +0 -0
  46. snowflake/snowpark_connect/includes/jars/commons-logging-1.1.3.jar +0 -0
  47. snowflake/snowpark_connect/includes/jars/commons-math3-3.6.1.jar +0 -0
  48. snowflake/snowpark_connect/includes/jars/commons-pool-1.5.4.jar +0 -0
  49. snowflake/snowpark_connect/includes/jars/commons-text-1.10.0.jar +0 -0
  50. snowflake/snowpark_connect/includes/jars/hadoop-client-api-3.3.4.jar +0 -0
  51. snowflake/snowpark_connect/includes/jars/jackson-annotations-2.15.2.jar +0 -0
  52. snowflake/snowpark_connect/includes/jars/jackson-core-2.15.2.jar +0 -0
  53. snowflake/snowpark_connect/includes/jars/jackson-core-asl-1.9.13.jar +0 -0
  54. snowflake/snowpark_connect/includes/jars/jackson-databind-2.15.2.jar +0 -0
  55. snowflake/snowpark_connect/includes/jars/jackson-dataformat-yaml-2.15.2.jar +0 -0
  56. snowflake/snowpark_connect/includes/jars/jackson-datatype-jsr310-2.15.2.jar +0 -0
  57. snowflake/snowpark_connect/includes/jars/jackson-mapper-asl-1.9.13.jar +0 -0
  58. snowflake/snowpark_connect/includes/jars/jackson-module-scala_2.12-2.15.2.jar +0 -0
  59. snowflake/snowpark_connect/includes/jars/json4s-ast_2.12-3.7.0-M11.jar +0 -0
  60. snowflake/snowpark_connect/includes/jars/json4s-core_2.12-3.7.0-M11.jar +0 -0
  61. snowflake/snowpark_connect/includes/jars/json4s-jackson_2.12-3.7.0-M11.jar +0 -0
  62. snowflake/snowpark_connect/includes/jars/json4s-scalap_2.12-3.7.0-M11.jar +0 -0
  63. snowflake/snowpark_connect/includes/jars/kryo-shaded-4.0.2.jar +0 -0
  64. snowflake/snowpark_connect/includes/jars/log4j-1.2-api-2.20.0.jar +0 -0
  65. snowflake/snowpark_connect/includes/jars/log4j-api-2.20.0.jar +0 -0
  66. snowflake/snowpark_connect/includes/jars/log4j-core-2.20.0.jar +0 -0
  67. snowflake/snowpark_connect/includes/jars/log4j-slf4j2-impl-2.20.0.jar +0 -0
  68. snowflake/snowpark_connect/includes/jars/paranamer-2.8.jar +0 -0
  69. snowflake/snowpark_connect/includes/jars/scala-collection-compat_2.12-2.7.0.jar +0 -0
  70. snowflake/snowpark_connect/includes/jars/scala-compiler-2.12.18.jar +0 -0
  71. snowflake/snowpark_connect/includes/jars/scala-library-2.12.18.jar +0 -0
  72. snowflake/snowpark_connect/includes/jars/scala-parser-combinators_2.12-2.3.0.jar +0 -0
  73. snowflake/snowpark_connect/includes/jars/scala-reflect-2.12.18.jar +0 -0
  74. snowflake/snowpark_connect/includes/jars/scala-xml_2.12-2.1.0.jar +0 -0
  75. snowflake/snowpark_connect/includes/jars/slf4j-api-2.0.7.jar +0 -0
  76. snowflake/snowpark_connect/includes/jars/spark-catalyst_2.12-3.5.6.jar +0 -0
  77. snowflake/snowpark_connect/includes/jars/spark-common-utils_2.12-3.5.6.jar +0 -0
  78. snowflake/snowpark_connect/includes/jars/spark-core_2.12-3.5.6.jar +0 -0
  79. snowflake/snowpark_connect/includes/jars/spark-graphx_2.12-3.5.6.jar +0 -0
  80. snowflake/snowpark_connect/includes/jars/spark-hive-thriftserver_2.12-3.5.6.jar +0 -0
  81. snowflake/snowpark_connect/includes/jars/spark-hive_2.12-3.5.6.jar +0 -0
  82. snowflake/snowpark_connect/includes/jars/spark-kubernetes_2.12-3.5.6.jar +0 -0
  83. snowflake/snowpark_connect/includes/jars/spark-kvstore_2.12-3.5.6.jar +0 -0
  84. snowflake/snowpark_connect/includes/jars/spark-launcher_2.12-3.5.6.jar +0 -0
  85. snowflake/snowpark_connect/includes/jars/spark-mesos_2.12-3.5.6.jar +0 -0
  86. snowflake/snowpark_connect/includes/jars/spark-mllib-local_2.12-3.5.6.jar +0 -0
  87. snowflake/snowpark_connect/includes/jars/spark-mllib_2.12-3.5.6.jar +0 -0
  88. snowflake/snowpark_connect/includes/jars/spark-network-common_2.12-3.5.6.jar +0 -0
  89. snowflake/snowpark_connect/includes/jars/spark-network-shuffle_2.12-3.5.6.jar +0 -0
  90. snowflake/snowpark_connect/includes/jars/spark-repl_2.12-3.5.6.jar +0 -0
  91. snowflake/snowpark_connect/includes/jars/spark-sketch_2.12-3.5.6.jar +0 -0
  92. snowflake/snowpark_connect/includes/jars/spark-sql-api_2.12-3.5.6.jar +0 -0
  93. snowflake/snowpark_connect/includes/jars/spark-sql_2.12-3.5.6.jar +0 -0
  94. snowflake/snowpark_connect/includes/jars/spark-streaming_2.12-3.5.6.jar +0 -0
  95. snowflake/snowpark_connect/includes/jars/spark-tags_2.12-3.5.6.jar +0 -0
  96. snowflake/snowpark_connect/includes/jars/spark-unsafe_2.12-3.5.6.jar +0 -0
  97. snowflake/snowpark_connect/includes/jars/spark-yarn_2.12-3.5.6.jar +0 -0
  98. snowflake/snowpark_connect/includes/python/__init__.py +21 -0
  99. snowflake/snowpark_connect/includes/python/pyspark/__init__.py +173 -0
  100. snowflake/snowpark_connect/includes/python/pyspark/_globals.py +71 -0
  101. snowflake/snowpark_connect/includes/python/pyspark/_typing.pyi +43 -0
  102. snowflake/snowpark_connect/includes/python/pyspark/accumulators.py +341 -0
  103. snowflake/snowpark_connect/includes/python/pyspark/broadcast.py +383 -0
  104. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/__init__.py +8 -0
  105. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/cloudpickle.py +948 -0
  106. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/cloudpickle_fast.py +844 -0
  107. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/compat.py +18 -0
  108. snowflake/snowpark_connect/includes/python/pyspark/conf.py +276 -0
  109. snowflake/snowpark_connect/includes/python/pyspark/context.py +2601 -0
  110. snowflake/snowpark_connect/includes/python/pyspark/daemon.py +218 -0
  111. snowflake/snowpark_connect/includes/python/pyspark/errors/__init__.py +70 -0
  112. snowflake/snowpark_connect/includes/python/pyspark/errors/error_classes.py +889 -0
  113. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/__init__.py +16 -0
  114. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/base.py +228 -0
  115. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/captured.py +307 -0
  116. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/connect.py +190 -0
  117. snowflake/snowpark_connect/includes/python/pyspark/errors/tests/__init__.py +16 -0
  118. snowflake/snowpark_connect/includes/python/pyspark/errors/tests/test_errors.py +60 -0
  119. snowflake/snowpark_connect/includes/python/pyspark/errors/utils.py +116 -0
  120. snowflake/snowpark_connect/includes/python/pyspark/files.py +165 -0
  121. snowflake/snowpark_connect/includes/python/pyspark/find_spark_home.py +95 -0
  122. snowflake/snowpark_connect/includes/python/pyspark/install.py +203 -0
  123. snowflake/snowpark_connect/includes/python/pyspark/instrumentation_utils.py +190 -0
  124. snowflake/snowpark_connect/includes/python/pyspark/java_gateway.py +248 -0
  125. snowflake/snowpark_connect/includes/python/pyspark/join.py +118 -0
  126. snowflake/snowpark_connect/includes/python/pyspark/ml/__init__.py +71 -0
  127. snowflake/snowpark_connect/includes/python/pyspark/ml/_typing.pyi +84 -0
  128. snowflake/snowpark_connect/includes/python/pyspark/ml/base.py +414 -0
  129. snowflake/snowpark_connect/includes/python/pyspark/ml/classification.py +4332 -0
  130. snowflake/snowpark_connect/includes/python/pyspark/ml/clustering.py +2188 -0
  131. snowflake/snowpark_connect/includes/python/pyspark/ml/common.py +146 -0
  132. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/__init__.py +44 -0
  133. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/base.py +346 -0
  134. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/classification.py +382 -0
  135. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/evaluation.py +291 -0
  136. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/feature.py +258 -0
  137. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/functions.py +77 -0
  138. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/io_utils.py +335 -0
  139. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/pipeline.py +262 -0
  140. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/summarizer.py +120 -0
  141. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/tuning.py +579 -0
  142. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/util.py +173 -0
  143. snowflake/snowpark_connect/includes/python/pyspark/ml/deepspeed/__init__.py +16 -0
  144. snowflake/snowpark_connect/includes/python/pyspark/ml/deepspeed/deepspeed_distributor.py +165 -0
  145. snowflake/snowpark_connect/includes/python/pyspark/ml/deepspeed/tests/test_deepspeed_distributor.py +306 -0
  146. snowflake/snowpark_connect/includes/python/pyspark/ml/dl_util.py +150 -0
  147. snowflake/snowpark_connect/includes/python/pyspark/ml/evaluation.py +1166 -0
  148. snowflake/snowpark_connect/includes/python/pyspark/ml/feature.py +7474 -0
  149. snowflake/snowpark_connect/includes/python/pyspark/ml/fpm.py +543 -0
  150. snowflake/snowpark_connect/includes/python/pyspark/ml/functions.py +842 -0
  151. snowflake/snowpark_connect/includes/python/pyspark/ml/image.py +271 -0
  152. snowflake/snowpark_connect/includes/python/pyspark/ml/linalg/__init__.py +1382 -0
  153. snowflake/snowpark_connect/includes/python/pyspark/ml/model_cache.py +55 -0
  154. snowflake/snowpark_connect/includes/python/pyspark/ml/param/__init__.py +602 -0
  155. snowflake/snowpark_connect/includes/python/pyspark/ml/param/_shared_params_code_gen.py +368 -0
  156. snowflake/snowpark_connect/includes/python/pyspark/ml/param/shared.py +878 -0
  157. snowflake/snowpark_connect/includes/python/pyspark/ml/pipeline.py +451 -0
  158. snowflake/snowpark_connect/includes/python/pyspark/ml/recommendation.py +748 -0
  159. snowflake/snowpark_connect/includes/python/pyspark/ml/regression.py +3335 -0
  160. snowflake/snowpark_connect/includes/python/pyspark/ml/stat.py +523 -0
  161. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/__init__.py +16 -0
  162. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_classification.py +53 -0
  163. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_evaluation.py +50 -0
  164. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_feature.py +43 -0
  165. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_function.py +114 -0
  166. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_pipeline.py +47 -0
  167. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_summarizer.py +43 -0
  168. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_tuning.py +46 -0
  169. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_classification.py +238 -0
  170. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_evaluation.py +194 -0
  171. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_feature.py +156 -0
  172. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_pipeline.py +184 -0
  173. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_summarizer.py +78 -0
  174. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_tuning.py +292 -0
  175. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_parity_torch_data_loader.py +50 -0
  176. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_parity_torch_distributor.py +152 -0
  177. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_algorithms.py +456 -0
  178. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_base.py +96 -0
  179. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_dl_util.py +186 -0
  180. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_evaluation.py +77 -0
  181. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_feature.py +401 -0
  182. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_functions.py +528 -0
  183. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_image.py +82 -0
  184. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_linalg.py +409 -0
  185. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_model_cache.py +55 -0
  186. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_param.py +441 -0
  187. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_persistence.py +546 -0
  188. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_pipeline.py +71 -0
  189. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_stat.py +52 -0
  190. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_training_summary.py +494 -0
  191. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_util.py +85 -0
  192. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_wrapper.py +138 -0
  193. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/__init__.py +16 -0
  194. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_basic.py +151 -0
  195. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_nested.py +97 -0
  196. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_pipeline.py +143 -0
  197. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tuning.py +551 -0
  198. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_basic.py +137 -0
  199. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_nested.py +96 -0
  200. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_pipeline.py +142 -0
  201. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/__init__.py +16 -0
  202. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/data.py +100 -0
  203. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/distributor.py +1133 -0
  204. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/log_communication.py +198 -0
  205. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/__init__.py +16 -0
  206. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_data_loader.py +137 -0
  207. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_distributor.py +561 -0
  208. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_log_communication.py +172 -0
  209. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/torch_run_process_wrapper.py +83 -0
  210. snowflake/snowpark_connect/includes/python/pyspark/ml/tree.py +434 -0
  211. snowflake/snowpark_connect/includes/python/pyspark/ml/tuning.py +1741 -0
  212. snowflake/snowpark_connect/includes/python/pyspark/ml/util.py +749 -0
  213. snowflake/snowpark_connect/includes/python/pyspark/ml/wrapper.py +465 -0
  214. snowflake/snowpark_connect/includes/python/pyspark/mllib/__init__.py +44 -0
  215. snowflake/snowpark_connect/includes/python/pyspark/mllib/_typing.pyi +33 -0
  216. snowflake/snowpark_connect/includes/python/pyspark/mllib/classification.py +989 -0
  217. snowflake/snowpark_connect/includes/python/pyspark/mllib/clustering.py +1318 -0
  218. snowflake/snowpark_connect/includes/python/pyspark/mllib/common.py +174 -0
  219. snowflake/snowpark_connect/includes/python/pyspark/mllib/evaluation.py +691 -0
  220. snowflake/snowpark_connect/includes/python/pyspark/mllib/feature.py +1085 -0
  221. snowflake/snowpark_connect/includes/python/pyspark/mllib/fpm.py +233 -0
  222. snowflake/snowpark_connect/includes/python/pyspark/mllib/linalg/__init__.py +1653 -0
  223. snowflake/snowpark_connect/includes/python/pyspark/mllib/linalg/distributed.py +1662 -0
  224. snowflake/snowpark_connect/includes/python/pyspark/mllib/random.py +698 -0
  225. snowflake/snowpark_connect/includes/python/pyspark/mllib/recommendation.py +389 -0
  226. snowflake/snowpark_connect/includes/python/pyspark/mllib/regression.py +1067 -0
  227. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/KernelDensity.py +59 -0
  228. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/__init__.py +34 -0
  229. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/_statistics.py +409 -0
  230. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/distribution.py +39 -0
  231. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/test.py +86 -0
  232. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/__init__.py +16 -0
  233. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_algorithms.py +353 -0
  234. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_feature.py +192 -0
  235. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_linalg.py +680 -0
  236. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_stat.py +206 -0
  237. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_streaming_algorithms.py +471 -0
  238. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_util.py +108 -0
  239. snowflake/snowpark_connect/includes/python/pyspark/mllib/tree.py +888 -0
  240. snowflake/snowpark_connect/includes/python/pyspark/mllib/util.py +659 -0
  241. snowflake/snowpark_connect/includes/python/pyspark/pandas/__init__.py +165 -0
  242. snowflake/snowpark_connect/includes/python/pyspark/pandas/_typing.py +52 -0
  243. snowflake/snowpark_connect/includes/python/pyspark/pandas/accessors.py +989 -0
  244. snowflake/snowpark_connect/includes/python/pyspark/pandas/base.py +1804 -0
  245. snowflake/snowpark_connect/includes/python/pyspark/pandas/categorical.py +822 -0
  246. snowflake/snowpark_connect/includes/python/pyspark/pandas/config.py +539 -0
  247. snowflake/snowpark_connect/includes/python/pyspark/pandas/correlation.py +262 -0
  248. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/__init__.py +16 -0
  249. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/base.py +519 -0
  250. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/binary_ops.py +98 -0
  251. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/boolean_ops.py +426 -0
  252. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/categorical_ops.py +141 -0
  253. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/complex_ops.py +145 -0
  254. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/date_ops.py +127 -0
  255. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/datetime_ops.py +171 -0
  256. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/null_ops.py +83 -0
  257. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/num_ops.py +588 -0
  258. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/string_ops.py +154 -0
  259. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/timedelta_ops.py +101 -0
  260. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/udt_ops.py +29 -0
  261. snowflake/snowpark_connect/includes/python/pyspark/pandas/datetimes.py +891 -0
  262. snowflake/snowpark_connect/includes/python/pyspark/pandas/exceptions.py +150 -0
  263. snowflake/snowpark_connect/includes/python/pyspark/pandas/extensions.py +388 -0
  264. snowflake/snowpark_connect/includes/python/pyspark/pandas/frame.py +13738 -0
  265. snowflake/snowpark_connect/includes/python/pyspark/pandas/generic.py +3560 -0
  266. snowflake/snowpark_connect/includes/python/pyspark/pandas/groupby.py +4448 -0
  267. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/__init__.py +21 -0
  268. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/base.py +2783 -0
  269. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/category.py +773 -0
  270. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/datetimes.py +843 -0
  271. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/multi.py +1323 -0
  272. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/numeric.py +210 -0
  273. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/timedelta.py +197 -0
  274. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexing.py +1862 -0
  275. snowflake/snowpark_connect/includes/python/pyspark/pandas/internal.py +1680 -0
  276. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/__init__.py +48 -0
  277. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/common.py +76 -0
  278. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/frame.py +63 -0
  279. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/general_functions.py +43 -0
  280. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/groupby.py +93 -0
  281. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/indexes.py +184 -0
  282. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/resample.py +101 -0
  283. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/scalars.py +29 -0
  284. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/series.py +69 -0
  285. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/window.py +168 -0
  286. snowflake/snowpark_connect/includes/python/pyspark/pandas/mlflow.py +238 -0
  287. snowflake/snowpark_connect/includes/python/pyspark/pandas/namespace.py +3807 -0
  288. snowflake/snowpark_connect/includes/python/pyspark/pandas/numpy_compat.py +260 -0
  289. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/__init__.py +17 -0
  290. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/core.py +1213 -0
  291. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/matplotlib.py +928 -0
  292. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/plotly.py +261 -0
  293. snowflake/snowpark_connect/includes/python/pyspark/pandas/resample.py +816 -0
  294. snowflake/snowpark_connect/includes/python/pyspark/pandas/series.py +7440 -0
  295. snowflake/snowpark_connect/includes/python/pyspark/pandas/sql_formatter.py +308 -0
  296. snowflake/snowpark_connect/includes/python/pyspark/pandas/sql_processor.py +394 -0
  297. snowflake/snowpark_connect/includes/python/pyspark/pandas/strings.py +2371 -0
  298. snowflake/snowpark_connect/includes/python/pyspark/pandas/supported_api_gen.py +378 -0
  299. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/__init__.py +16 -0
  300. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/__init__.py +16 -0
  301. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_any_all.py +177 -0
  302. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_apply_func.py +575 -0
  303. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_binary_ops.py +235 -0
  304. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_combine.py +653 -0
  305. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_compute.py +463 -0
  306. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_corrwith.py +86 -0
  307. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_cov.py +151 -0
  308. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_cumulative.py +139 -0
  309. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_describe.py +458 -0
  310. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_eval.py +86 -0
  311. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_melt.py +202 -0
  312. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_missing_data.py +520 -0
  313. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_pivot.py +361 -0
  314. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/__init__.py +16 -0
  315. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/__init__.py +16 -0
  316. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_any_all.py +40 -0
  317. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_apply_func.py +42 -0
  318. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_binary_ops.py +40 -0
  319. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_combine.py +37 -0
  320. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_compute.py +60 -0
  321. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_corrwith.py +40 -0
  322. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_cov.py +40 -0
  323. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_cumulative.py +90 -0
  324. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_describe.py +40 -0
  325. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_eval.py +40 -0
  326. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_melt.py +40 -0
  327. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_missing_data.py +42 -0
  328. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_pivot.py +37 -0
  329. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/__init__.py +16 -0
  330. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_base.py +36 -0
  331. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_binary_ops.py +42 -0
  332. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_boolean_ops.py +47 -0
  333. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_categorical_ops.py +55 -0
  334. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_complex_ops.py +40 -0
  335. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_date_ops.py +47 -0
  336. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_datetime_ops.py +47 -0
  337. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_null_ops.py +42 -0
  338. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_arithmetic.py +43 -0
  339. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_ops.py +47 -0
  340. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_reverse.py +43 -0
  341. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_string_ops.py +47 -0
  342. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_timedelta_ops.py +47 -0
  343. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_udt_ops.py +40 -0
  344. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/testing_utils.py +226 -0
  345. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/__init__.py +16 -0
  346. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_align.py +39 -0
  347. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_basic_slow.py +55 -0
  348. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_cov_corrwith.py +39 -0
  349. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_dot_frame.py +39 -0
  350. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_dot_series.py +39 -0
  351. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_index.py +39 -0
  352. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_series.py +39 -0
  353. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_setitem_frame.py +43 -0
  354. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_setitem_series.py +43 -0
  355. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/__init__.py +16 -0
  356. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_attrs.py +40 -0
  357. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_constructor.py +39 -0
  358. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_conversion.py +42 -0
  359. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_reindexing.py +42 -0
  360. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_reshaping.py +37 -0
  361. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_spark.py +40 -0
  362. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_take.py +42 -0
  363. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_time_series.py +48 -0
  364. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_truncate.py +40 -0
  365. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/__init__.py +16 -0
  366. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_aggregate.py +40 -0
  367. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_apply_func.py +41 -0
  368. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_cumulative.py +67 -0
  369. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_describe.py +40 -0
  370. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_groupby.py +55 -0
  371. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_head_tail.py +40 -0
  372. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_index.py +38 -0
  373. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_missing_data.py +55 -0
  374. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py +39 -0
  375. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_stat.py +38 -0
  376. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/__init__.py +16 -0
  377. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_align.py +40 -0
  378. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_base.py +50 -0
  379. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_category.py +73 -0
  380. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_datetime.py +39 -0
  381. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_indexing.py +40 -0
  382. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_reindex.py +40 -0
  383. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_rename.py +40 -0
  384. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_reset_index.py +48 -0
  385. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_timedelta.py +39 -0
  386. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/io/__init__.py +16 -0
  387. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/io/test_parity_io.py +40 -0
  388. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/__init__.py +16 -0
  389. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot.py +45 -0
  390. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot_matplotlib.py +45 -0
  391. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot_plotly.py +49 -0
  392. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot.py +37 -0
  393. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot_matplotlib.py +53 -0
  394. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot_plotly.py +45 -0
  395. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/__init__.py +16 -0
  396. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_all_any.py +38 -0
  397. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_arg_ops.py +37 -0
  398. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_as_of.py +37 -0
  399. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_as_type.py +38 -0
  400. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_compute.py +37 -0
  401. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_conversion.py +40 -0
  402. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_cumulative.py +40 -0
  403. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_index.py +38 -0
  404. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_missing_data.py +40 -0
  405. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_series.py +37 -0
  406. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_sort.py +38 -0
  407. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_stat.py +38 -0
  408. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_categorical.py +66 -0
  409. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_config.py +37 -0
  410. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_csv.py +37 -0
  411. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_dataframe_conversion.py +42 -0
  412. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_dataframe_spark_io.py +39 -0
  413. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_default_index.py +49 -0
  414. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ewm.py +37 -0
  415. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_expanding.py +39 -0
  416. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_extension.py +49 -0
  417. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_frame_spark.py +53 -0
  418. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_generic_functions.py +43 -0
  419. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_indexing.py +49 -0
  420. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_indexops_spark.py +39 -0
  421. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_internal.py +41 -0
  422. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_namespace.py +39 -0
  423. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_numpy_compat.py +60 -0
  424. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames.py +48 -0
  425. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby.py +39 -0
  426. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_expanding.py +44 -0
  427. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_rolling.py +84 -0
  428. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_repr.py +37 -0
  429. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_resample.py +45 -0
  430. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_reshape.py +39 -0
  431. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_rolling.py +39 -0
  432. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_scalars.py +37 -0
  433. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_conversion.py +39 -0
  434. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_datetime.py +39 -0
  435. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_string.py +39 -0
  436. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_spark_functions.py +39 -0
  437. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_sql.py +43 -0
  438. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_stats.py +37 -0
  439. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_typedef.py +36 -0
  440. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_utils.py +37 -0
  441. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_window.py +39 -0
  442. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/__init__.py +16 -0
  443. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_base.py +107 -0
  444. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_binary_ops.py +224 -0
  445. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py +825 -0
  446. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_categorical_ops.py +562 -0
  447. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_complex_ops.py +368 -0
  448. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_date_ops.py +257 -0
  449. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_datetime_ops.py +260 -0
  450. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_null_ops.py +178 -0
  451. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_arithmetic.py +184 -0
  452. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py +497 -0
  453. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_reverse.py +140 -0
  454. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_string_ops.py +354 -0
  455. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_timedelta_ops.py +219 -0
  456. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_udt_ops.py +192 -0
  457. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/testing_utils.py +228 -0
  458. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/__init__.py +16 -0
  459. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_align.py +118 -0
  460. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_basic_slow.py +198 -0
  461. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_cov_corrwith.py +181 -0
  462. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_dot_frame.py +103 -0
  463. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_dot_series.py +141 -0
  464. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_index.py +109 -0
  465. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_series.py +136 -0
  466. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_setitem_frame.py +125 -0
  467. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_setitem_series.py +217 -0
  468. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/__init__.py +16 -0
  469. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_attrs.py +384 -0
  470. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_constructor.py +598 -0
  471. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_conversion.py +73 -0
  472. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_reindexing.py +869 -0
  473. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_reshaping.py +487 -0
  474. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_spark.py +309 -0
  475. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_take.py +156 -0
  476. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_time_series.py +149 -0
  477. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_truncate.py +163 -0
  478. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/__init__.py +16 -0
  479. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_aggregate.py +311 -0
  480. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_apply_func.py +524 -0
  481. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_cumulative.py +419 -0
  482. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_describe.py +144 -0
  483. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_groupby.py +979 -0
  484. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_head_tail.py +234 -0
  485. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_index.py +206 -0
  486. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_missing_data.py +421 -0
  487. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_split_apply.py +187 -0
  488. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_stat.py +397 -0
  489. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/__init__.py +16 -0
  490. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_align.py +100 -0
  491. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_base.py +2743 -0
  492. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_category.py +484 -0
  493. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_datetime.py +276 -0
  494. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_indexing.py +432 -0
  495. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_reindex.py +310 -0
  496. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_rename.py +257 -0
  497. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_reset_index.py +160 -0
  498. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_timedelta.py +128 -0
  499. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/io/__init__.py +16 -0
  500. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/io/test_io.py +137 -0
  501. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/__init__.py +16 -0
  502. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot.py +170 -0
  503. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot_matplotlib.py +547 -0
  504. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot_plotly.py +285 -0
  505. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot.py +106 -0
  506. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot_matplotlib.py +409 -0
  507. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot_plotly.py +247 -0
  508. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/__init__.py +16 -0
  509. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_all_any.py +105 -0
  510. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_arg_ops.py +197 -0
  511. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_as_of.py +137 -0
  512. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_as_type.py +227 -0
  513. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_compute.py +634 -0
  514. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_conversion.py +88 -0
  515. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_cumulative.py +139 -0
  516. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_index.py +475 -0
  517. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_missing_data.py +265 -0
  518. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_series.py +818 -0
  519. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_sort.py +162 -0
  520. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_stat.py +780 -0
  521. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_categorical.py +741 -0
  522. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_config.py +160 -0
  523. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_csv.py +453 -0
  524. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_dataframe_conversion.py +281 -0
  525. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_dataframe_spark_io.py +487 -0
  526. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_default_index.py +109 -0
  527. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ewm.py +434 -0
  528. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_expanding.py +253 -0
  529. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_extension.py +152 -0
  530. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_frame_spark.py +162 -0
  531. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_generic_functions.py +234 -0
  532. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_indexing.py +1339 -0
  533. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_indexops_spark.py +82 -0
  534. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_internal.py +124 -0
  535. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_namespace.py +638 -0
  536. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_numpy_compat.py +200 -0
  537. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames.py +1355 -0
  538. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py +655 -0
  539. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_expanding.py +113 -0
  540. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_rolling.py +118 -0
  541. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_repr.py +192 -0
  542. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_resample.py +346 -0
  543. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_reshape.py +495 -0
  544. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_rolling.py +263 -0
  545. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_scalars.py +59 -0
  546. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_conversion.py +85 -0
  547. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_datetime.py +364 -0
  548. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_string.py +362 -0
  549. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_spark_functions.py +46 -0
  550. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_sql.py +123 -0
  551. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_stats.py +581 -0
  552. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_typedef.py +447 -0
  553. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_utils.py +301 -0
  554. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_window.py +465 -0
  555. snowflake/snowpark_connect/includes/python/pyspark/pandas/typedef/__init__.py +18 -0
  556. snowflake/snowpark_connect/includes/python/pyspark/pandas/typedef/typehints.py +874 -0
  557. snowflake/snowpark_connect/includes/python/pyspark/pandas/usage_logging/__init__.py +143 -0
  558. snowflake/snowpark_connect/includes/python/pyspark/pandas/usage_logging/usage_logger.py +132 -0
  559. snowflake/snowpark_connect/includes/python/pyspark/pandas/utils.py +1063 -0
  560. snowflake/snowpark_connect/includes/python/pyspark/pandas/window.py +2702 -0
  561. snowflake/snowpark_connect/includes/python/pyspark/profiler.py +489 -0
  562. snowflake/snowpark_connect/includes/python/pyspark/py.typed +1 -0
  563. snowflake/snowpark_connect/includes/python/pyspark/python/pyspark/shell.py +123 -0
  564. snowflake/snowpark_connect/includes/python/pyspark/rdd.py +5518 -0
  565. snowflake/snowpark_connect/includes/python/pyspark/rddsampler.py +115 -0
  566. snowflake/snowpark_connect/includes/python/pyspark/resource/__init__.py +38 -0
  567. snowflake/snowpark_connect/includes/python/pyspark/resource/information.py +69 -0
  568. snowflake/snowpark_connect/includes/python/pyspark/resource/profile.py +317 -0
  569. snowflake/snowpark_connect/includes/python/pyspark/resource/requests.py +539 -0
  570. snowflake/snowpark_connect/includes/python/pyspark/resource/tests/__init__.py +16 -0
  571. snowflake/snowpark_connect/includes/python/pyspark/resource/tests/test_resources.py +83 -0
  572. snowflake/snowpark_connect/includes/python/pyspark/resultiterable.py +45 -0
  573. snowflake/snowpark_connect/includes/python/pyspark/serializers.py +681 -0
  574. snowflake/snowpark_connect/includes/python/pyspark/shell.py +123 -0
  575. snowflake/snowpark_connect/includes/python/pyspark/shuffle.py +854 -0
  576. snowflake/snowpark_connect/includes/python/pyspark/sql/__init__.py +75 -0
  577. snowflake/snowpark_connect/includes/python/pyspark/sql/_typing.pyi +80 -0
  578. snowflake/snowpark_connect/includes/python/pyspark/sql/avro/__init__.py +18 -0
  579. snowflake/snowpark_connect/includes/python/pyspark/sql/avro/functions.py +188 -0
  580. snowflake/snowpark_connect/includes/python/pyspark/sql/catalog.py +1270 -0
  581. snowflake/snowpark_connect/includes/python/pyspark/sql/column.py +1431 -0
  582. snowflake/snowpark_connect/includes/python/pyspark/sql/conf.py +99 -0
  583. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/__init__.py +18 -0
  584. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/_typing.py +90 -0
  585. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/avro/__init__.py +18 -0
  586. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/avro/functions.py +107 -0
  587. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/catalog.py +356 -0
  588. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/__init__.py +22 -0
  589. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/artifact.py +412 -0
  590. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/core.py +1689 -0
  591. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/reattach.py +340 -0
  592. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/column.py +514 -0
  593. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/conf.py +128 -0
  594. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/conversion.py +490 -0
  595. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/dataframe.py +2172 -0
  596. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/expressions.py +1056 -0
  597. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/functions.py +3937 -0
  598. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/group.py +418 -0
  599. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/plan.py +2289 -0
  600. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/__init__.py +25 -0
  601. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/base_pb2.py +203 -0
  602. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/base_pb2.pyi +2718 -0
  603. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/base_pb2_grpc.py +423 -0
  604. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/catalog_pb2.py +109 -0
  605. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/catalog_pb2.pyi +1130 -0
  606. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/commands_pb2.py +141 -0
  607. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/commands_pb2.pyi +1766 -0
  608. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/common_pb2.py +47 -0
  609. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/common_pb2.pyi +123 -0
  610. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/example_plugins_pb2.py +53 -0
  611. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/example_plugins_pb2.pyi +112 -0
  612. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/expressions_pb2.py +107 -0
  613. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/expressions_pb2.pyi +1507 -0
  614. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/relations_pb2.py +195 -0
  615. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/relations_pb2.pyi +3613 -0
  616. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/types_pb2.py +95 -0
  617. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/types_pb2.pyi +980 -0
  618. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/protobuf/__init__.py +18 -0
  619. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/protobuf/functions.py +166 -0
  620. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/readwriter.py +861 -0
  621. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/session.py +952 -0
  622. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/__init__.py +22 -0
  623. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/query.py +295 -0
  624. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/readwriter.py +618 -0
  625. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/__init__.py +18 -0
  626. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/foreach_batch_worker.py +87 -0
  627. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/listener_worker.py +100 -0
  628. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/types.py +301 -0
  629. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/udf.py +296 -0
  630. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/udtf.py +200 -0
  631. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/utils.py +58 -0
  632. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/window.py +266 -0
  633. snowflake/snowpark_connect/includes/python/pyspark/sql/context.py +818 -0
  634. snowflake/snowpark_connect/includes/python/pyspark/sql/dataframe.py +5973 -0
  635. snowflake/snowpark_connect/includes/python/pyspark/sql/functions.py +15889 -0
  636. snowflake/snowpark_connect/includes/python/pyspark/sql/group.py +547 -0
  637. snowflake/snowpark_connect/includes/python/pyspark/sql/observation.py +152 -0
  638. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/__init__.py +21 -0
  639. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/__init__.pyi +344 -0
  640. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/protocols/__init__.pyi +17 -0
  641. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/protocols/frame.pyi +20 -0
  642. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/protocols/series.pyi +20 -0
  643. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/conversion.py +671 -0
  644. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/functions.py +480 -0
  645. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/functions.pyi +132 -0
  646. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/group_ops.py +523 -0
  647. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/map_ops.py +216 -0
  648. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/serializers.py +1019 -0
  649. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/typehints.py +172 -0
  650. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/types.py +972 -0
  651. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/utils.py +86 -0
  652. snowflake/snowpark_connect/includes/python/pyspark/sql/protobuf/__init__.py +18 -0
  653. snowflake/snowpark_connect/includes/python/pyspark/sql/protobuf/functions.py +334 -0
  654. snowflake/snowpark_connect/includes/python/pyspark/sql/readwriter.py +2159 -0
  655. snowflake/snowpark_connect/includes/python/pyspark/sql/session.py +2088 -0
  656. snowflake/snowpark_connect/includes/python/pyspark/sql/sql_formatter.py +84 -0
  657. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/__init__.py +21 -0
  658. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/listener.py +1050 -0
  659. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/query.py +746 -0
  660. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/readwriter.py +1652 -0
  661. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/state.py +288 -0
  662. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/__init__.py +16 -0
  663. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/__init__.py +16 -0
  664. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/__init__.py +16 -0
  665. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/test_artifact.py +420 -0
  666. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/test_client.py +358 -0
  667. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/__init__.py +16 -0
  668. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_foreach.py +36 -0
  669. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_foreach_batch.py +44 -0
  670. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_listener.py +116 -0
  671. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_streaming.py +35 -0
  672. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_basic.py +3612 -0
  673. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_column.py +1042 -0
  674. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_function.py +2381 -0
  675. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_plan.py +1060 -0
  676. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow.py +163 -0
  677. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow_map.py +38 -0
  678. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow_python_udf.py +48 -0
  679. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_catalog.py +36 -0
  680. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_column.py +55 -0
  681. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_conf.py +36 -0
  682. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_dataframe.py +96 -0
  683. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_datasources.py +44 -0
  684. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_errors.py +36 -0
  685. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_functions.py +59 -0
  686. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_group.py +36 -0
  687. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_cogrouped_map.py +59 -0
  688. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_grouped_map.py +74 -0
  689. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_grouped_map_with_state.py +62 -0
  690. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_map.py +58 -0
  691. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf.py +70 -0
  692. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_grouped_agg.py +50 -0
  693. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_scalar.py +68 -0
  694. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_window.py +40 -0
  695. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_readwriter.py +46 -0
  696. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_serde.py +44 -0
  697. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_types.py +100 -0
  698. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_udf.py +100 -0
  699. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_udtf.py +163 -0
  700. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_session.py +181 -0
  701. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_utils.py +42 -0
  702. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/__init__.py +16 -0
  703. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_cogrouped_map.py +623 -0
  704. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_grouped_map.py +869 -0
  705. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_grouped_map_with_state.py +342 -0
  706. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_map.py +436 -0
  707. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf.py +363 -0
  708. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_grouped_agg.py +592 -0
  709. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_scalar.py +1503 -0
  710. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_typehints.py +392 -0
  711. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_typehints_with_future_annotations.py +375 -0
  712. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_window.py +411 -0
  713. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/__init__.py +16 -0
  714. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming.py +401 -0
  715. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_foreach.py +295 -0
  716. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_foreach_batch.py +106 -0
  717. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_listener.py +558 -0
  718. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow.py +1346 -0
  719. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow_map.py +182 -0
  720. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow_python_udf.py +202 -0
  721. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_catalog.py +503 -0
  722. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_column.py +225 -0
  723. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_conf.py +83 -0
  724. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_context.py +201 -0
  725. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_dataframe.py +1931 -0
  726. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_datasources.py +256 -0
  727. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_errors.py +69 -0
  728. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_functions.py +1349 -0
  729. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_group.py +53 -0
  730. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_pandas_sqlmetrics.py +68 -0
  731. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_readwriter.py +283 -0
  732. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_serde.py +155 -0
  733. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_session.py +412 -0
  734. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_types.py +1581 -0
  735. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udf.py +961 -0
  736. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udf_profiler.py +165 -0
  737. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udtf.py +1456 -0
  738. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_utils.py +1686 -0
  739. snowflake/snowpark_connect/includes/python/pyspark/sql/types.py +2558 -0
  740. snowflake/snowpark_connect/includes/python/pyspark/sql/udf.py +714 -0
  741. snowflake/snowpark_connect/includes/python/pyspark/sql/udtf.py +325 -0
  742. snowflake/snowpark_connect/includes/python/pyspark/sql/utils.py +339 -0
  743. snowflake/snowpark_connect/includes/python/pyspark/sql/window.py +492 -0
  744. snowflake/snowpark_connect/includes/python/pyspark/statcounter.py +165 -0
  745. snowflake/snowpark_connect/includes/python/pyspark/status.py +112 -0
  746. snowflake/snowpark_connect/includes/python/pyspark/storagelevel.py +97 -0
  747. snowflake/snowpark_connect/includes/python/pyspark/streaming/__init__.py +22 -0
  748. snowflake/snowpark_connect/includes/python/pyspark/streaming/context.py +471 -0
  749. snowflake/snowpark_connect/includes/python/pyspark/streaming/dstream.py +933 -0
  750. snowflake/snowpark_connect/includes/python/pyspark/streaming/kinesis.py +205 -0
  751. snowflake/snowpark_connect/includes/python/pyspark/streaming/listener.py +83 -0
  752. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/__init__.py +16 -0
  753. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_context.py +184 -0
  754. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_dstream.py +706 -0
  755. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_kinesis.py +118 -0
  756. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_listener.py +160 -0
  757. snowflake/snowpark_connect/includes/python/pyspark/streaming/util.py +168 -0
  758. snowflake/snowpark_connect/includes/python/pyspark/taskcontext.py +502 -0
  759. snowflake/snowpark_connect/includes/python/pyspark/testing/__init__.py +21 -0
  760. snowflake/snowpark_connect/includes/python/pyspark/testing/connectutils.py +199 -0
  761. snowflake/snowpark_connect/includes/python/pyspark/testing/mllibutils.py +30 -0
  762. snowflake/snowpark_connect/includes/python/pyspark/testing/mlutils.py +275 -0
  763. snowflake/snowpark_connect/includes/python/pyspark/testing/objects.py +121 -0
  764. snowflake/snowpark_connect/includes/python/pyspark/testing/pandasutils.py +714 -0
  765. snowflake/snowpark_connect/includes/python/pyspark/testing/sqlutils.py +168 -0
  766. snowflake/snowpark_connect/includes/python/pyspark/testing/streamingutils.py +178 -0
  767. snowflake/snowpark_connect/includes/python/pyspark/testing/utils.py +636 -0
  768. snowflake/snowpark_connect/includes/python/pyspark/tests/__init__.py +16 -0
  769. snowflake/snowpark_connect/includes/python/pyspark/tests/test_appsubmit.py +306 -0
  770. snowflake/snowpark_connect/includes/python/pyspark/tests/test_broadcast.py +196 -0
  771. snowflake/snowpark_connect/includes/python/pyspark/tests/test_conf.py +44 -0
  772. snowflake/snowpark_connect/includes/python/pyspark/tests/test_context.py +346 -0
  773. snowflake/snowpark_connect/includes/python/pyspark/tests/test_daemon.py +89 -0
  774. snowflake/snowpark_connect/includes/python/pyspark/tests/test_install_spark.py +124 -0
  775. snowflake/snowpark_connect/includes/python/pyspark/tests/test_join.py +69 -0
  776. snowflake/snowpark_connect/includes/python/pyspark/tests/test_memory_profiler.py +167 -0
  777. snowflake/snowpark_connect/includes/python/pyspark/tests/test_pin_thread.py +194 -0
  778. snowflake/snowpark_connect/includes/python/pyspark/tests/test_profiler.py +168 -0
  779. snowflake/snowpark_connect/includes/python/pyspark/tests/test_rdd.py +939 -0
  780. snowflake/snowpark_connect/includes/python/pyspark/tests/test_rddbarrier.py +52 -0
  781. snowflake/snowpark_connect/includes/python/pyspark/tests/test_rddsampler.py +66 -0
  782. snowflake/snowpark_connect/includes/python/pyspark/tests/test_readwrite.py +368 -0
  783. snowflake/snowpark_connect/includes/python/pyspark/tests/test_serializers.py +257 -0
  784. snowflake/snowpark_connect/includes/python/pyspark/tests/test_shuffle.py +267 -0
  785. snowflake/snowpark_connect/includes/python/pyspark/tests/test_stage_sched.py +153 -0
  786. snowflake/snowpark_connect/includes/python/pyspark/tests/test_statcounter.py +130 -0
  787. snowflake/snowpark_connect/includes/python/pyspark/tests/test_taskcontext.py +350 -0
  788. snowflake/snowpark_connect/includes/python/pyspark/tests/test_util.py +97 -0
  789. snowflake/snowpark_connect/includes/python/pyspark/tests/test_worker.py +271 -0
  790. snowflake/snowpark_connect/includes/python/pyspark/traceback_utils.py +81 -0
  791. snowflake/snowpark_connect/includes/python/pyspark/util.py +416 -0
  792. snowflake/snowpark_connect/includes/python/pyspark/version.py +19 -0
  793. snowflake/snowpark_connect/includes/python/pyspark/worker.py +1307 -0
  794. snowflake/snowpark_connect/includes/python/pyspark/worker_util.py +46 -0
  795. snowflake/snowpark_connect/proto/__init__.py +10 -0
  796. snowflake/snowpark_connect/proto/control_pb2.py +35 -0
  797. snowflake/snowpark_connect/proto/control_pb2.pyi +38 -0
  798. snowflake/snowpark_connect/proto/control_pb2_grpc.py +183 -0
  799. snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.py +35 -0
  800. snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.pyi +53 -0
  801. snowflake/snowpark_connect/proto/snowflake_rdd_pb2.pyi +39 -0
  802. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.py +47 -0
  803. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.pyi +111 -0
  804. snowflake/snowpark_connect/relation/__init__.py +3 -0
  805. snowflake/snowpark_connect/relation/catalogs/__init__.py +12 -0
  806. snowflake/snowpark_connect/relation/catalogs/abstract_spark_catalog.py +287 -0
  807. snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py +467 -0
  808. snowflake/snowpark_connect/relation/catalogs/utils.py +51 -0
  809. snowflake/snowpark_connect/relation/io_utils.py +76 -0
  810. snowflake/snowpark_connect/relation/map_aggregate.py +322 -0
  811. snowflake/snowpark_connect/relation/map_catalog.py +151 -0
  812. snowflake/snowpark_connect/relation/map_column_ops.py +1068 -0
  813. snowflake/snowpark_connect/relation/map_crosstab.py +48 -0
  814. snowflake/snowpark_connect/relation/map_extension.py +412 -0
  815. snowflake/snowpark_connect/relation/map_join.py +341 -0
  816. snowflake/snowpark_connect/relation/map_local_relation.py +326 -0
  817. snowflake/snowpark_connect/relation/map_map_partitions.py +146 -0
  818. snowflake/snowpark_connect/relation/map_relation.py +253 -0
  819. snowflake/snowpark_connect/relation/map_row_ops.py +716 -0
  820. snowflake/snowpark_connect/relation/map_sample_by.py +35 -0
  821. snowflake/snowpark_connect/relation/map_show_string.py +50 -0
  822. snowflake/snowpark_connect/relation/map_sql.py +1874 -0
  823. snowflake/snowpark_connect/relation/map_stats.py +324 -0
  824. snowflake/snowpark_connect/relation/map_subquery_alias.py +32 -0
  825. snowflake/snowpark_connect/relation/map_udtf.py +288 -0
  826. snowflake/snowpark_connect/relation/read/__init__.py +7 -0
  827. snowflake/snowpark_connect/relation/read/jdbc_read_dbapi.py +668 -0
  828. snowflake/snowpark_connect/relation/read/map_read.py +367 -0
  829. snowflake/snowpark_connect/relation/read/map_read_csv.py +142 -0
  830. snowflake/snowpark_connect/relation/read/map_read_jdbc.py +108 -0
  831. snowflake/snowpark_connect/relation/read/map_read_json.py +344 -0
  832. snowflake/snowpark_connect/relation/read/map_read_parquet.py +194 -0
  833. snowflake/snowpark_connect/relation/read/map_read_socket.py +59 -0
  834. snowflake/snowpark_connect/relation/read/map_read_table.py +109 -0
  835. snowflake/snowpark_connect/relation/read/map_read_text.py +106 -0
  836. snowflake/snowpark_connect/relation/read/reader_config.py +399 -0
  837. snowflake/snowpark_connect/relation/read/utils.py +155 -0
  838. snowflake/snowpark_connect/relation/stage_locator.py +161 -0
  839. snowflake/snowpark_connect/relation/utils.py +219 -0
  840. snowflake/snowpark_connect/relation/write/__init__.py +3 -0
  841. snowflake/snowpark_connect/relation/write/jdbc_write_dbapi.py +339 -0
  842. snowflake/snowpark_connect/relation/write/map_write.py +436 -0
  843. snowflake/snowpark_connect/relation/write/map_write_jdbc.py +48 -0
  844. snowflake/snowpark_connect/resources/java_udfs-1.0-SNAPSHOT.jar +0 -0
  845. snowflake/snowpark_connect/resources_initializer.py +75 -0
  846. snowflake/snowpark_connect/server.py +1136 -0
  847. snowflake/snowpark_connect/start_server.py +32 -0
  848. snowflake/snowpark_connect/tcm.py +8 -0
  849. snowflake/snowpark_connect/type_mapping.py +1003 -0
  850. snowflake/snowpark_connect/typed_column.py +94 -0
  851. snowflake/snowpark_connect/utils/__init__.py +3 -0
  852. snowflake/snowpark_connect/utils/artifacts.py +48 -0
  853. snowflake/snowpark_connect/utils/attribute_handling.py +72 -0
  854. snowflake/snowpark_connect/utils/cache.py +84 -0
  855. snowflake/snowpark_connect/utils/concurrent.py +124 -0
  856. snowflake/snowpark_connect/utils/context.py +390 -0
  857. snowflake/snowpark_connect/utils/describe_query_cache.py +231 -0
  858. snowflake/snowpark_connect/utils/interrupt.py +85 -0
  859. snowflake/snowpark_connect/utils/io_utils.py +35 -0
  860. snowflake/snowpark_connect/utils/pandas_udtf_utils.py +117 -0
  861. snowflake/snowpark_connect/utils/profiling.py +47 -0
  862. snowflake/snowpark_connect/utils/session.py +180 -0
  863. snowflake/snowpark_connect/utils/snowpark_connect_logging.py +38 -0
  864. snowflake/snowpark_connect/utils/telemetry.py +513 -0
  865. snowflake/snowpark_connect/utils/udf_cache.py +392 -0
  866. snowflake/snowpark_connect/utils/udf_helper.py +328 -0
  867. snowflake/snowpark_connect/utils/udf_utils.py +310 -0
  868. snowflake/snowpark_connect/utils/udtf_helper.py +420 -0
  869. snowflake/snowpark_connect/utils/udtf_utils.py +799 -0
  870. snowflake/snowpark_connect/utils/xxhash64.py +247 -0
  871. snowflake/snowpark_connect/version.py +6 -0
  872. snowpark_connect-0.20.2.data/scripts/snowpark-connect +71 -0
  873. snowpark_connect-0.20.2.data/scripts/snowpark-session +11 -0
  874. snowpark_connect-0.20.2.data/scripts/snowpark-submit +354 -0
  875. snowpark_connect-0.20.2.dist-info/METADATA +37 -0
  876. snowpark_connect-0.20.2.dist-info/RECORD +879 -0
  877. snowpark_connect-0.20.2.dist-info/WHEEL +5 -0
  878. snowpark_connect-0.20.2.dist-info/licenses/LICENSE.txt +202 -0
  879. snowpark_connect-0.20.2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,2601 @@
1
+ #
2
+ # Licensed to the Apache Software Foundation (ASF) under one or more
3
+ # contributor license agreements. See the NOTICE file distributed with
4
+ # this work for additional information regarding copyright ownership.
5
+ # The ASF licenses this file to You under the Apache License, Version 2.0
6
+ # (the "License"); you may not use this file except in compliance with
7
+ # the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ #
17
+
18
+ import os
19
+ import shutil
20
+ import signal
21
+ import sys
22
+ import threading
23
+ import warnings
24
+ import importlib
25
+ from threading import RLock
26
+ from tempfile import NamedTemporaryFile
27
+ from types import TracebackType
28
+ from typing import (
29
+ Any,
30
+ Callable,
31
+ cast,
32
+ ClassVar,
33
+ Dict,
34
+ Iterable,
35
+ List,
36
+ NoReturn,
37
+ Optional,
38
+ Sequence,
39
+ Tuple,
40
+ Type,
41
+ TYPE_CHECKING,
42
+ TypeVar,
43
+ Set,
44
+ )
45
+
46
+ from py4j.java_collections import JavaMap
47
+ from py4j.protocol import Py4JError
48
+
49
+ from pyspark import accumulators
50
+ from pyspark.accumulators import Accumulator
51
+ from pyspark.broadcast import Broadcast, BroadcastPickleRegistry
52
+ from pyspark.conf import SparkConf
53
+ from pyspark.files import SparkFiles
54
+ from pyspark.java_gateway import launch_gateway, local_connect_and_auth
55
+ from pyspark.serializers import (
56
+ CPickleSerializer,
57
+ BatchedSerializer,
58
+ Serializer,
59
+ UTF8Deserializer,
60
+ PairDeserializer,
61
+ AutoBatchedSerializer,
62
+ NoOpSerializer,
63
+ ChunkedStream,
64
+ )
65
+ from pyspark.storagelevel import StorageLevel
66
+ from pyspark.resource.information import ResourceInformation
67
+ from pyspark.rdd import RDD, _load_from_socket
68
+ from pyspark.taskcontext import TaskContext
69
+ from pyspark.traceback_utils import CallSite, first_spark_call
70
+ from pyspark.status import StatusTracker
71
+ from pyspark.profiler import ProfilerCollector, BasicProfiler, UDFBasicProfiler, MemoryProfiler
72
+ from pyspark.errors import PySparkRuntimeError
73
+ from py4j.java_gateway import is_instance_of, JavaGateway, JavaObject, JVMView
74
+
75
+ if TYPE_CHECKING:
76
+ from pyspark.accumulators import AccumulatorParam
77
+
78
+ __all__ = ["SparkContext"]
79
+
80
+
81
+ # These are special default configs for PySpark; they will overwrite
82
+ # the default ones for Spark if they are not configured by the user.
83
+ DEFAULT_CONFIGS: Dict[str, Any] = {
84
+ "spark.serializer.objectStreamReset": 100,
85
+ "spark.rdd.compress": True,
86
+ }
87
+
88
+ T = TypeVar("T")
89
+ U = TypeVar("U")
90
+
91
+
92
+ class SparkContext:
93
+
94
+ """
95
+ Main entry point for Spark functionality. A SparkContext represents the
96
+ connection to a Spark cluster, and can be used to create :class:`RDD` and
97
+ broadcast variables on that cluster.
98
+
99
+ When you create a new SparkContext, at least the master and app name should
100
+ be set, either through the named parameters here or through `conf`.
101
+
102
+ Parameters
103
+ ----------
104
+ master : str, optional
105
+ Cluster URL to connect to (e.g. mesos://host:port, spark://host:port, local[4]).
106
+ appName : str, optional
107
+ A name for your job, to display on the cluster web UI.
108
+ sparkHome : str, optional
109
+ Location where Spark is installed on cluster nodes.
110
+ pyFiles : list, optional
111
+ Collection of .zip or .py files to send to the cluster
112
+ and add to PYTHONPATH. These can be paths on the local file
113
+ system or HDFS, HTTP, HTTPS, or FTP URLs.
114
+ environment : dict, optional
115
+ A dictionary of environment variables to set on
116
+ worker nodes.
117
+ batchSize : int, optional, default 0
118
+ The number of Python objects represented as a single
119
+ Java object. Set 1 to disable batching, 0 to automatically choose
120
+ the batch size based on object sizes, or -1 to use an unlimited
121
+ batch size
122
+ serializer : :class:`Serializer`, optional, default :class:`CPickleSerializer`
123
+ The serializer for RDDs.
124
+ conf : :class:`SparkConf`, optional
125
+ An object setting Spark properties.
126
+ gateway : :class:`py4j.java_gateway.JavaGateway`, optional
127
+ Use an existing gateway and JVM, otherwise a new JVM
128
+ will be instantiated. This is only used internally.
129
+ jsc : :class:`py4j.java_gateway.JavaObject`, optional
130
+ The JavaSparkContext instance. This is only used internally.
131
+ profiler_cls : type, optional, default :class:`BasicProfiler`
132
+ A class of custom Profiler used to do profiling
133
+ udf_profiler_cls : type, optional, default :class:`UDFBasicProfiler`
134
+ A class of custom Profiler used to do udf profiling
135
+
136
+ Notes
137
+ -----
138
+ Only one :class:`SparkContext` should be active per JVM. You must `stop()`
139
+ the active :class:`SparkContext` before creating a new one.
140
+
141
+ A :class:`SparkContext` instance cannot be shared across multiple
142
+ processes out of the box, and PySpark does not guarantee multi-processing execution.
143
+ Use threads instead for concurrent processing.
144
+
145
+ Examples
146
+ --------
147
+ >>> from pyspark.context import SparkContext
148
+ >>> sc = SparkContext('local', 'test')
149
+ >>> sc2 = SparkContext('local', 'test2') # doctest: +IGNORE_EXCEPTION_DETAIL
150
+ Traceback (most recent call last):
151
+ ...
152
+ ValueError: ...
153
+ """
154
+
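For reference, a minimal sketch of the conf-based construction described in the Parameters section above; it assumes a local Spark installation and that no other SparkContext is active:

from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster("local[2]").setAppName("conf-example")
sc = SparkContext(conf=conf)  # master and appName come from conf rather than named parameters
print(sc.master, sc.appName)  # local[2] conf-example
sc.stop()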
155
+ _gateway: ClassVar[Optional[JavaGateway]] = None
156
+ _jvm: ClassVar[Optional[JVMView]] = None
157
+ _next_accum_id = 0
158
+ _active_spark_context: ClassVar[Optional["SparkContext"]] = None
159
+ _lock = RLock()
160
+ _python_includes: Optional[
161
+ List[str]
162
+ ] = None # zip and egg files that need to be added to PYTHONPATH
163
+ serializer: Serializer
164
+ profiler_collector: ProfilerCollector
165
+
166
+ PACKAGE_EXTENSIONS: Iterable[str] = (".zip", ".egg", ".jar")
167
+
168
+ def __init__(
169
+ self,
170
+ master: Optional[str] = None,
171
+ appName: Optional[str] = None,
172
+ sparkHome: Optional[str] = None,
173
+ pyFiles: Optional[List[str]] = None,
174
+ environment: Optional[Dict[str, Any]] = None,
175
+ batchSize: int = 0,
176
+ serializer: "Serializer" = CPickleSerializer(),
177
+ conf: Optional[SparkConf] = None,
178
+ gateway: Optional[JavaGateway] = None,
179
+ jsc: Optional[JavaObject] = None,
180
+ profiler_cls: Type[BasicProfiler] = BasicProfiler,
181
+ udf_profiler_cls: Type[UDFBasicProfiler] = UDFBasicProfiler,
182
+ memory_profiler_cls: Type[MemoryProfiler] = MemoryProfiler,
183
+ ):
184
+ if "SPARK_CONNECT_MODE_ENABLED" in os.environ and "SPARK_LOCAL_REMOTE" not in os.environ:
185
+ raise PySparkRuntimeError(
186
+ error_class="CONTEXT_UNAVAILABLE_FOR_REMOTE_CLIENT",
187
+ message_parameters={},
188
+ )
189
+
190
+ if conf is None or conf.get("spark.executor.allowSparkContext", "false").lower() != "true":
191
+ # In order to prevent SparkContext from being created in executors.
192
+ SparkContext._assert_on_driver()
193
+
194
+ self._callsite = first_spark_call() or CallSite(None, None, None)
195
+ if gateway is not None and gateway.gateway_parameters.auth_token is None:
196
+ raise ValueError(
197
+ "You are trying to pass an insecure Py4j gateway to Spark. This"
198
+ " is not allowed as it is a security risk."
199
+ )
200
+
201
+ SparkContext._ensure_initialized(self, gateway=gateway, conf=conf)
202
+ try:
203
+ self._do_init(
204
+ master,
205
+ appName,
206
+ sparkHome,
207
+ pyFiles,
208
+ environment,
209
+ batchSize,
210
+ serializer,
211
+ conf,
212
+ jsc,
213
+ profiler_cls,
214
+ udf_profiler_cls,
215
+ memory_profiler_cls,
216
+ )
217
+ except BaseException:
218
+ # If an error occurs, clean up in order to allow future SparkContext creation:
219
+ self.stop()
220
+ raise
221
+
222
+ def _do_init(
223
+ self,
224
+ master: Optional[str],
225
+ appName: Optional[str],
226
+ sparkHome: Optional[str],
227
+ pyFiles: Optional[List[str]],
228
+ environment: Optional[Dict[str, Any]],
229
+ batchSize: int,
230
+ serializer: Serializer,
231
+ conf: Optional[SparkConf],
232
+ jsc: JavaObject,
233
+ profiler_cls: Type[BasicProfiler] = BasicProfiler,
234
+ udf_profiler_cls: Type[UDFBasicProfiler] = UDFBasicProfiler,
235
+ memory_profiler_cls: Type[MemoryProfiler] = MemoryProfiler,
236
+ ) -> None:
237
+ self.environment = environment or {}
238
+ # java gateway must have been launched at this point.
239
+ if conf is not None and conf._jconf is not None:
240
+ # conf has been initialized in JVM properly, so use conf directly. This represents the
241
+ # scenario that JVM has been launched before SparkConf is created (e.g. SparkContext is
242
+ # created and then stopped, and we create a new SparkConf and new SparkContext again)
243
+ self._conf = conf
244
+ else:
245
+ self._conf = SparkConf(_jvm=SparkContext._jvm)
246
+ if conf is not None:
247
+ for k, v in conf.getAll():
248
+ self._conf.set(k, v)
249
+
250
+ self._batchSize = batchSize # -1 represents an unlimited batch size
251
+ self._unbatched_serializer = serializer
252
+ if batchSize == 0:
253
+ self.serializer = AutoBatchedSerializer(self._unbatched_serializer)
254
+ else:
255
+ self.serializer = BatchedSerializer(self._unbatched_serializer, batchSize)
256
+
257
+ # Set any parameters passed directly to us on the conf
258
+ if master:
259
+ self._conf.setMaster(master)
260
+ if appName:
261
+ self._conf.setAppName(appName)
262
+ if sparkHome:
263
+ self._conf.setSparkHome(sparkHome)
264
+ if environment:
265
+ for key, value in environment.items():
266
+ self._conf.setExecutorEnv(key, value)
267
+ for key, value in DEFAULT_CONFIGS.items():
268
+ self._conf.setIfMissing(key, value)
269
+
270
+ # Check that we have at least the required parameters
271
+ if not self._conf.contains("spark.master"):
272
+ raise PySparkRuntimeError(
273
+ error_class="MASTER_URL_NOT_SET",
274
+ message_parameters={},
275
+ )
276
+ if not self._conf.contains("spark.app.name"):
277
+ raise PySparkRuntimeError(
278
+ error_class="APPLICATION_NAME_NOT_SET",
279
+ message_parameters={},
280
+ )
281
+
282
+ # Read back our properties from the conf in case we loaded some of them from
283
+ # the classpath or an external config file
284
+ self.master = self._conf.get("spark.master")
285
+ self.appName = self._conf.get("spark.app.name")
286
+ self.sparkHome = self._conf.get("spark.home", None)
287
+
288
+ for (k, v) in self._conf.getAll():
289
+ if k.startswith("spark.executorEnv."):
290
+ varName = k[len("spark.executorEnv.") :]
291
+ self.environment[varName] = v
292
+
293
+ self.environment["PYTHONHASHSEED"] = os.environ.get("PYTHONHASHSEED", "0")
294
+
295
+ # Create the Java SparkContext through Py4J
296
+ self._jsc = jsc or self._initialize_context(self._conf._jconf)
297
+ # Reset the SparkConf to the one actually used by the SparkContext in JVM.
298
+ self._conf = SparkConf(_jconf=self._jsc.sc().conf())
299
+
300
+ # Create a single Accumulator in Java that we'll send all our updates through;
301
+ # they will be passed back to us through a TCP server
302
+ assert self._gateway is not None
303
+ auth_token = self._gateway.gateway_parameters.auth_token
304
+ start_update_server = accumulators._start_update_server
305
+ self._accumulatorServer = start_update_server(auth_token)
306
+ (host, port) = self._accumulatorServer.server_address
307
+ assert self._jvm is not None
308
+ self._javaAccumulator = self._jvm.PythonAccumulatorV2(host, port, auth_token)
309
+ self._jsc.sc().register(self._javaAccumulator)
310
+
311
+ # If encryption is enabled, we need to setup a server in the jvm to read broadcast
312
+ # data via a socket.
313
+ # scala's mangled names w/ $ in them require special treatment.
314
+ self._encryption_enabled = self._jvm.PythonUtils.isEncryptionEnabled(self._jsc)
315
+ os.environ["SPARK_AUTH_SOCKET_TIMEOUT"] = str(
316
+ self._jvm.PythonUtils.getPythonAuthSocketTimeout(self._jsc)
317
+ )
318
+ os.environ["SPARK_BUFFER_SIZE"] = str(self._jvm.PythonUtils.getSparkBufferSize(self._jsc))
319
+
320
+ self.pythonExec = os.environ.get("PYSPARK_PYTHON", "python3")
321
+ self.pythonVer = "%d.%d" % sys.version_info[:2]
322
+
323
+ # Broadcast's __reduce__ method stores Broadcast instances here.
324
+ # This allows other code to determine which Broadcast instances have
325
+ # been pickled, so it can determine which Java broadcast objects to
326
+ # send.
327
+ self._pickled_broadcast_vars = BroadcastPickleRegistry()
328
+
329
+ SparkFiles._sc = self
330
+ root_dir = SparkFiles.getRootDirectory()
331
+ sys.path.insert(1, root_dir)
332
+
333
+ # Deploy any code dependencies specified in the constructor
334
+ self._python_includes = list()
335
+ for path in pyFiles or []:
336
+ self.addPyFile(path)
337
+
338
+ # Deploy code dependencies set by spark-submit; these will already have been added
339
+ # with SparkContext.addFile, so we just need to add them to the PYTHONPATH
340
+ for path in self._conf.get("spark.submit.pyFiles", "").split(","):
341
+ if path != "":
342
+ (dirname, filename) = os.path.split(path)
343
+ try:
344
+ filepath = os.path.join(SparkFiles.getRootDirectory(), filename)
345
+ if not os.path.exists(filepath):
346
+ # In case of YARN with shell mode, 'spark.submit.pyFiles' files are
347
+ # not added via SparkContext.addFile. Here we check if the file exists,
348
+ # try to copy and then add it to the path. See SPARK-21945.
349
+ shutil.copyfile(path, filepath)
350
+ if filename[-4:].lower() in self.PACKAGE_EXTENSIONS:
351
+ self._python_includes.append(filename)
352
+ sys.path.insert(1, filepath)
353
+ except Exception:
354
+ warnings.warn(
355
+ "Failed to add file [%s] specified in 'spark.submit.pyFiles' to "
356
+ "Python path:\n %s" % (path, "\n ".join(sys.path)),
357
+ RuntimeWarning,
358
+ )
359
+
360
+ # Create a temporary directory inside spark.local.dir:
361
+ assert self._jvm is not None
362
+ local_dir = self._jvm.org.apache.spark.util.Utils.getLocalDir(self._jsc.sc().conf())
363
+ self._temp_dir = self._jvm.org.apache.spark.util.Utils.createTempDir(
364
+ local_dir, "pyspark"
365
+ ).getAbsolutePath()
366
+
367
+ # profiling stats collected for each PythonRDD
368
+ if (
369
+ self._conf.get("spark.python.profile", "false") == "true"
370
+ or self._conf.get("spark.python.profile.memory", "false") == "true"
371
+ ):
372
+ dump_path = self._conf.get("spark.python.profile.dump", None)
373
+ self.profiler_collector = ProfilerCollector(
374
+ profiler_cls, udf_profiler_cls, memory_profiler_cls, dump_path
375
+ )
376
+ else:
377
+ self.profiler_collector = None # type: ignore[assignment]
378
+
379
+ # create a signal handler which would be invoked on receiving SIGINT
380
+ def signal_handler(signal: Any, frame: Any) -> NoReturn:
381
+ self.cancelAllJobs()
382
+ raise KeyboardInterrupt()
383
+
384
+ # see http://stackoverflow.com/questions/23206787/
385
+ if isinstance(
386
+ threading.current_thread(), threading._MainThread # type: ignore[attr-defined]
387
+ ):
388
+ signal.signal(signal.SIGINT, signal_handler)
389
+
390
+ def __repr__(self) -> str:
391
+ return "<SparkContext master={master} appName={appName}>".format(
392
+ master=self.master,
393
+ appName=self.appName,
394
+ )
395
+
396
+ def _repr_html_(self) -> str:
397
+ return """
398
+ <div>
399
+ <p><b>SparkContext</b></p>
400
+
401
+ <p><a href="{sc.uiWebUrl}">Spark UI</a></p>
402
+
403
+ <dl>
404
+ <dt>Version</dt>
405
+ <dd><code>v{sc.version}</code></dd>
406
+ <dt>Master</dt>
407
+ <dd><code>{sc.master}</code></dd>
408
+ <dt>AppName</dt>
409
+ <dd><code>{sc.appName}</code></dd>
410
+ </dl>
411
+ </div>
412
+ """.format(
413
+ sc=self
414
+ )
415
+
416
+ def _initialize_context(self, jconf: JavaObject) -> JavaObject:
417
+ """
418
+ Initialize SparkContext in function to allow subclass specific initialization
419
+ """
420
+ assert self._jvm is not None
421
+ return self._jvm.JavaSparkContext(jconf)
422
+
423
+ @classmethod
424
+ def _ensure_initialized(
425
+ cls,
426
+ instance: Optional["SparkContext"] = None,
427
+ gateway: Optional[JavaGateway] = None,
428
+ conf: Optional[SparkConf] = None,
429
+ ) -> None:
430
+ """
431
+ Checks whether a SparkContext is initialized or not.
432
+ Throws an error if a SparkContext is already running.
433
+ """
434
+ with SparkContext._lock:
435
+ if not SparkContext._gateway:
436
+ SparkContext._gateway = gateway or launch_gateway(conf)
437
+ SparkContext._jvm = SparkContext._gateway.jvm
438
+
439
+ if instance:
440
+ if (
441
+ SparkContext._active_spark_context
442
+ and SparkContext._active_spark_context != instance
443
+ ):
444
+ currentMaster = SparkContext._active_spark_context.master
445
+ currentAppName = SparkContext._active_spark_context.appName
446
+ callsite = SparkContext._active_spark_context._callsite
447
+
448
+ # Raise error if there is already a running Spark context
449
+ raise ValueError(
450
+ "Cannot run multiple SparkContexts at once; "
451
+ "existing SparkContext(app=%s, master=%s)"
452
+ " created by %s at %s:%s "
453
+ % (
454
+ currentAppName,
455
+ currentMaster,
456
+ callsite.function,
457
+ callsite.file,
458
+ callsite.linenum,
459
+ )
460
+ )
461
+ else:
462
+ SparkContext._active_spark_context = instance
463
+
464
+ def __getnewargs__(self) -> NoReturn:
465
+ # This method is called when attempting to pickle SparkContext, which is always an error:
466
+ raise PySparkRuntimeError(
467
+ error_class="CONTEXT_ONLY_VALID_ON_DRIVER",
468
+ message_parameters={},
469
+ )
470
+
471
+ def __enter__(self) -> "SparkContext":
472
+ """
473
+ Enable 'with SparkContext(...) as sc: app(sc)' syntax.
474
+ """
475
+ return self
476
+
477
+ def __exit__(
478
+ self,
479
+ type: Optional[Type[BaseException]],
480
+ value: Optional[BaseException],
481
+ trace: Optional[TracebackType],
482
+ ) -> None:
483
+ """
484
+ Enable 'with SparkContext(...) as sc: app' syntax.
485
+
486
+ Specifically stop the context on exit of the with block.
487
+ """
488
+ self.stop()
489
+
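A short sketch of the with-statement pattern that __enter__ and __exit__ enable, assuming a local Spark installation:

from pyspark import SparkContext

with SparkContext("local", "ctx-example") as sc:  # __enter__ returns the context
    print(sc.range(3).collect())                  # [0, 1, 2]
# __exit__ has called sc.stop() here, so a new SparkContext may be created afterwards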
490
+ @classmethod
491
+ def getOrCreate(cls, conf: Optional[SparkConf] = None) -> "SparkContext":
492
+ """
493
+ Get or instantiate a :class:`SparkContext` and register it as a singleton object.
494
+
495
+ .. versionadded:: 1.4.0
496
+
497
+ Parameters
498
+ ----------
499
+ conf : :class:`SparkConf`, optional
500
+ :class:`SparkConf` that will be used for initialization of the :class:`SparkContext`.
501
+
502
+ Returns
503
+ -------
504
+ :class:`SparkContext`
505
+ current :class:`SparkContext`, or a new one if it wasn't created before the function
506
+ call.
507
+
508
+ Examples
509
+ --------
510
+ >>> SparkContext.getOrCreate()
511
+ <SparkContext ...>
512
+ """
513
+ with SparkContext._lock:
514
+ if SparkContext._active_spark_context is None:
515
+ SparkContext(conf=conf or SparkConf())
516
+ assert SparkContext._active_spark_context is not None
517
+ return SparkContext._active_spark_context
518
+
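An illustrative sketch of the singleton behaviour of getOrCreate, assuming a local installation and no other active context:

from pyspark import SparkConf, SparkContext

sc1 = SparkContext.getOrCreate(SparkConf().setMaster("local").setAppName("singleton-example"))
sc2 = SparkContext.getOrCreate()  # no new context is created; the active one is returned
assert sc1 is sc2
sc1.stop()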
519
+ def setLogLevel(self, logLevel: str) -> None:
520
+ """
521
+ Control our logLevel. This overrides any user-defined log settings.
522
+ Valid log levels include: ALL, DEBUG, ERROR, FATAL, INFO, OFF, TRACE, WARN
523
+
524
+ .. versionadded:: 1.4.0
525
+
526
+ Parameters
527
+ ----------
528
+ logLevel : str
529
+ The desired log level as a string.
530
+
531
+ Examples
532
+ --------
533
+ >>> sc.setLogLevel("WARN") # doctest :+SKIP
534
+ """
535
+ self._jsc.setLogLevel(logLevel)
536
+
537
+ @classmethod
538
+ def setSystemProperty(cls, key: str, value: str) -> None:
539
+ """
540
+ Set a Java system property, such as `spark.executor.memory`. This must
541
+ be invoked before instantiating :class:`SparkContext`.
542
+
543
+ .. versionadded:: 0.9.0
544
+
545
+ Parameters
546
+ ----------
547
+ key : str
548
+ The key of a new Java system property.
549
+ value : str
550
+ The value of a new Java system property.
551
+ """
552
+ SparkContext._ensure_initialized()
553
+ assert SparkContext._jvm is not None
554
+ SparkContext._jvm.java.lang.System.setProperty(key, value)
555
+
556
+ @property
557
+ def version(self) -> str:
558
+ """
559
+ The version of Spark on which this application is running.
560
+
561
+ .. versionadded:: 1.1.0
562
+
563
+ Examples
564
+ --------
565
+ >>> _ = sc.version
566
+ """
567
+ return self._jsc.version()
568
+
569
+ @property
570
+ def applicationId(self) -> str:
571
+ """
572
+ A unique identifier for the Spark application.
573
+ Its format depends on the scheduler implementation.
574
+
575
+ * in case of local spark app something like 'local-1433865536131'
576
+ * in case of YARN something like 'application_1433865536131_34483'
577
+
578
+ .. versionadded:: 1.5.0
579
+
580
+ Examples
581
+ --------
582
+ >>> sc.applicationId # doctest: +ELLIPSIS
583
+ 'local-...'
584
+ """
585
+ return self._jsc.sc().applicationId()
586
+
587
+ @property
588
+ def uiWebUrl(self) -> Optional[str]:
589
+ """Return the URL of the SparkUI instance started by this :class:`SparkContext`
590
+
591
+ .. versionadded:: 2.1.0
592
+
593
+ Notes
594
+ -----
595
+ When the web ui is disabled, e.g., by ``spark.ui.enabled`` set to ``False``,
596
+ it returns ``None``.
597
+
598
+ Examples
599
+ --------
600
+ >>> sc.uiWebUrl
601
+ 'http://...'
602
+ """
603
+ jurl = self._jsc.sc().uiWebUrl()
604
+ return jurl.get() if jurl.nonEmpty() else None
605
+
606
+ @property
607
+ def startTime(self) -> int:
608
+ """Return the epoch time when the :class:`SparkContext` was started.
609
+
610
+ .. versionadded:: 1.5.0
611
+
612
+ Examples
613
+ --------
614
+ >>> _ = sc.startTime
615
+ """
616
+ return self._jsc.startTime()
617
+
618
+ @property
619
+ def defaultParallelism(self) -> int:
620
+ """
621
+ Default level of parallelism to use when not given by user (e.g. for reduce tasks)
622
+
623
+ .. versionadded:: 0.7.0
624
+
625
+ Examples
626
+ --------
627
+ >>> sc.defaultParallelism > 0
628
+ True
629
+ """
630
+ return self._jsc.sc().defaultParallelism()
631
+
632
+ @property
633
+ def defaultMinPartitions(self) -> int:
634
+ """
635
+ Default min number of partitions for Hadoop RDDs when not given by user
636
+
637
+ .. versionadded:: 1.1.0
638
+
639
+ Examples
640
+ --------
641
+ >>> sc.defaultMinPartitions > 0
642
+ True
643
+ """
644
+ return self._jsc.sc().defaultMinPartitions()
645
+
646
+ def stop(self) -> None:
647
+ """
648
+ Shut down the :class:`SparkContext`.
649
+
650
+ .. versionadded:: 0.7.0
651
+ """
652
+ if getattr(self, "_jsc", None):
653
+ try:
654
+ self._jsc.stop()
655
+ except Py4JError:
656
+ # Case: SPARK-18523
657
+ warnings.warn(
658
+ "Unable to cleanly shutdown Spark JVM process."
659
+ " It is possible that the process has crashed,"
660
+ " been killed or may also be in a zombie state.",
661
+ RuntimeWarning,
662
+ )
663
+ finally:
664
+ self._jsc = None
665
+ if getattr(self, "_accumulatorServer", None):
666
+ self._accumulatorServer.shutdown()
667
+ self._accumulatorServer = None # type: ignore[assignment]
668
+ with SparkContext._lock:
669
+ SparkContext._active_spark_context = None
670
+
671
+ def emptyRDD(self) -> RDD[Any]:
672
+ """
673
+ Create an :class:`RDD` that has no partitions or elements.
674
+
675
+ .. versionadded:: 1.5.0
676
+
677
+ Returns
678
+ -------
679
+ :class:`RDD`
680
+ An empty RDD
681
+
682
+ Examples
683
+ --------
684
+ >>> sc.emptyRDD()
685
+ EmptyRDD...
686
+ >>> sc.emptyRDD().count()
687
+ 0
688
+ """
689
+ return RDD(self._jsc.emptyRDD(), self, NoOpSerializer())
690
+
691
+ def range(
692
+ self, start: int, end: Optional[int] = None, step: int = 1, numSlices: Optional[int] = None
693
+ ) -> RDD[int]:
694
+ """
695
+ Create a new RDD of int containing elements from `start` to `end`
696
+ (exclusive), incremented by `step` for each element. It can be called the same
697
+ way as Python's built-in range() function. If called with a single argument,
698
+ the argument is interpreted as `end`, and `start` is set to 0.
699
+
700
+ .. versionadded:: 1.5.0
701
+
702
+ Parameters
703
+ ----------
704
+ start : int
705
+ the start value
706
+ end : int, optional
707
+ the end value (exclusive)
708
+ step : int, optional, default 1
709
+ the incremental step
710
+ numSlices : int, optional
711
+ the number of partitions of the new RDD
712
+
713
+ Returns
714
+ -------
715
+ :class:`RDD`
716
+ An RDD of int
717
+
718
+ See Also
719
+ --------
720
+ :meth:`pyspark.sql.SparkSession.range`
721
+
722
+ Examples
723
+ --------
724
+ >>> sc.range(5).collect()
725
+ [0, 1, 2, 3, 4]
726
+ >>> sc.range(2, 4).collect()
727
+ [2, 3]
728
+ >>> sc.range(1, 7, 2).collect()
729
+ [1, 3, 5]
730
+
731
+ Generate RDD with a negative step
732
+
733
+ >>> sc.range(5, 0, -1).collect()
734
+ [5, 4, 3, 2, 1]
735
+ >>> sc.range(0, 5, -1).collect()
736
+ []
737
+
738
+ Control the number of partitions
739
+
740
+ >>> sc.range(5, numSlices=1).getNumPartitions()
741
+ 1
742
+ >>> sc.range(5, numSlices=10).getNumPartitions()
743
+ 10
744
+ """
745
+ if end is None:
746
+ end = start
747
+ start = 0
748
+
749
+ return self.parallelize(range(start, end, step), numSlices)
750
+
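Since range delegates to parallelize over a Python range, the two calls below build equivalent RDDs (sc is assumed to be an active SparkContext):

rdd_a = sc.range(1, 7, 2, numSlices=2)
rdd_b = sc.parallelize(range(1, 7, 2), 2)
assert rdd_a.collect() == rdd_b.collect() == [1, 3, 5]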
751
+ def parallelize(self, c: Iterable[T], numSlices: Optional[int] = None) -> RDD[T]:
752
+ """
753
+ Distribute a local Python collection to form an RDD. For performance, using
754
+ range is recommended if the input represents a range.
755
+
756
+ .. versionadded:: 0.7.0
757
+
758
+ Parameters
759
+ ----------
760
+ c : :class:`collections.abc.Iterable`
761
+ iterable collection to distribute
762
+ numSlices : int, optional
763
+ the number of partitions of the new RDD
764
+
765
+ Returns
766
+ -------
767
+ :class:`RDD`
768
+ RDD representing distributed collection.
769
+
770
+ Examples
771
+ --------
772
+ >>> sc.parallelize([0, 2, 3, 4, 6], 5).glom().collect()
773
+ [[0], [2], [3], [4], [6]]
774
+ >>> sc.parallelize(range(0, 6, 2), 5).glom().collect()
775
+ [[], [0], [], [2], [4]]
776
+
777
+ Deal with a list of strings.
778
+
779
+ >>> strings = ["a", "b", "c"]
780
+ >>> sc.parallelize(strings, 2).glom().collect()
781
+ [['a'], ['b', 'c']]
782
+ """
783
+ numSlices = int(numSlices) if numSlices is not None else self.defaultParallelism
784
+ if isinstance(c, range):
785
+ size = len(c)
786
+ if size == 0:
787
+ return self.parallelize([], numSlices)
788
+ step = c[1] - c[0] if size > 1 else 1 # type: ignore[index]
789
+ start0 = c[0] # type: ignore[index]
790
+
791
+ def getStart(split: int) -> int:
792
+ assert numSlices is not None
793
+ return start0 + int((split * size / numSlices)) * step
794
+
795
+ def f(split: int, iterator: Iterable[T]) -> Iterable:
796
+ # it's an empty iterator here but we need this line for triggering the
797
+ # logic of signal handling in FramedSerializer.load_stream, for instance,
798
+ # SpecialLengths.END_OF_DATA_SECTION in _read_with_length. Since
799
+ # FramedSerializer.load_stream produces a generator, the control should
800
+ # at least be in that function once. Here we do it by explicitly converting
801
+ # the empty iterator to a list, thus making sure worker reuse takes effect.
802
+ # See more details in SPARK-26549.
803
+ assert len(list(iterator)) == 0
804
+ return range(getStart(split), getStart(split + 1), step)
805
+
806
+ return self.parallelize([], numSlices).mapPartitionsWithIndex(f)
807
+
808
+ # Make sure we distribute data evenly if it's smaller than self.batchSize
809
+ if "__len__" not in dir(c):
810
+ c = list(c) # Make it a list so we can compute its length
811
+ batchSize = max(
812
+ 1, min(len(c) // numSlices, self._batchSize or 1024) # type: ignore[arg-type]
813
+ )
814
+ serializer = BatchedSerializer(self._unbatched_serializer, batchSize)
815
+
816
+ def reader_func(temp_filename: str) -> JavaObject:
817
+ assert self._jvm is not None
818
+ return self._jvm.PythonRDD.readRDDFromFile(self._jsc, temp_filename, numSlices)
819
+
820
+ def createRDDServer() -> JavaObject:
821
+ assert self._jvm is not None
822
+ return self._jvm.PythonParallelizeServer(self._jsc.sc(), numSlices)
823
+
824
+ jrdd = self._serialize_to_jvm(c, serializer, reader_func, createRDDServer)
825
+ return RDD(jrdd, self, serializer)
826
+
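The range fast path above splits the input with the formula start0 + int(split * size / numSlices) * step; a standalone sketch (no Spark required) that mirrors that arithmetic:

def partition_ranges(r: range, num_slices: int):
    # Mirror the getStart() computation used in the range fast path.
    size = len(r)
    step = r[1] - r[0] if size > 1 else 1
    start0 = r[0] if size > 0 else 0

    def get_start(split: int) -> int:
        return start0 + int(split * size / num_slices) * step

    return [list(range(get_start(i), get_start(i + 1), step)) for i in range(num_slices)]

print(partition_ranges(range(0, 10, 2), 3))  # [[0], [2, 4], [6, 8]]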
827
+ def _serialize_to_jvm(
828
+ self,
829
+ data: Iterable[T],
830
+ serializer: Serializer,
831
+ reader_func: Callable,
832
+ server_func: Callable,
833
+ ) -> JavaObject:
834
+ """
835
+ Using Py4J to send a large dataset to the JVM is slow, so we use a file,
836
+ or a socket when encryption is enabled.
837
+
838
+ Parameters
839
+ ----------
840
+ data
841
+ object to be serialized
842
+ serializer : :class:`pyspark.serializers.Serializer`
843
+ reader_func : function
844
+ A function which takes a filename and reads in the data in the jvm and
845
+ returns a JavaRDD. Only used when encryption is disabled.
846
+ server_func : function
847
+ A function which creates a SocketAuthServer in the JVM to
848
+ accept the serialized data, for use when encryption is enabled.
849
+ """
850
+ if self._encryption_enabled:
851
+ # with encryption, we open a server in java and send the data directly
852
+ server = server_func()
853
+ (sock_file, _) = local_connect_and_auth(server.port(), server.secret())
854
+ chunked_out = ChunkedStream(sock_file, 8192)
855
+ serializer.dump_stream(data, chunked_out)
856
+ chunked_out.close()
857
+ # this call will block until the server has read all the data and processed it (or
858
+ # throws an exception)
859
+ r = server.getResult()
860
+ return r
861
+ else:
862
+ # without encryption, we serialize to a file, and we read the file in java and
863
+ # parallelize from there.
864
+ tempFile = NamedTemporaryFile(delete=False, dir=self._temp_dir)
865
+ try:
866
+ try:
867
+ serializer.dump_stream(data, tempFile)
868
+ finally:
869
+ tempFile.close()
870
+ return reader_func(tempFile.name)
871
+ finally:
872
+ # we eagerly read the file so we can delete it right after.
873
+ os.unlink(tempFile.name)
874
+
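The unencrypted branch above reduces to "dump to a temporary file, hand the filename to a JVM-side reader, then delete the file"; a simplified sketch of that pattern, where read_back is a hypothetical stand-in for reader_func:

import os
from tempfile import NamedTemporaryFile
from pyspark.serializers import BatchedSerializer, CPickleSerializer

def serialize_via_file(data, read_back):
    tmp = NamedTemporaryFile(delete=False)
    try:
        try:
            BatchedSerializer(CPickleSerializer(), 10).dump_stream(data, tmp)
        finally:
            tmp.close()
        return read_back(tmp.name)  # e.g. the JVM reading the file into a JavaRDD
    finally:
        os.unlink(tmp.name)         # delete eagerly, as in the original

print(serialize_via_file(range(100), lambda path: os.path.getsize(path)))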
875
+ def pickleFile(self, name: str, minPartitions: Optional[int] = None) -> RDD[Any]:
876
+ """
877
+ Load an RDD previously saved using :meth:`RDD.saveAsPickleFile` method.
878
+
879
+ .. versionadded:: 1.1.0
880
+
881
+ Parameters
882
+ ----------
883
+ name : str
884
+ directory of the input data files; the path can be a comma-separated
885
+ list of paths to use as inputs
886
+ minPartitions : int, optional
887
+ suggested minimum number of partitions for the resulting RDD
888
+
889
+ Returns
890
+ -------
891
+ :class:`RDD`
892
+ RDD representing unpickled data from the file(s).
893
+
894
+ See Also
895
+ --------
896
+ :meth:`RDD.saveAsPickleFile`
897
+
898
+ Examples
899
+ --------
900
+ >>> import os
901
+ >>> import tempfile
902
+ >>> with tempfile.TemporaryDirectory() as d:
903
+ ... # Write a temporary pickled file
904
+ ... path1 = os.path.join(d, "pickled1")
905
+ ... sc.parallelize(range(10)).saveAsPickleFile(path1, 3)
906
+ ...
907
+ ... # Write another temporary pickled file
908
+ ... path2 = os.path.join(d, "pickled2")
909
+ ... sc.parallelize(range(-10, -5)).saveAsPickleFile(path2, 3)
910
+ ...
911
+ ... # Load pickled file
912
+ ... collected1 = sorted(sc.pickleFile(path1, 3).collect())
913
+ ... collected2 = sorted(sc.pickleFile(path2, 4).collect())
914
+ ...
915
+ ... # Load two pickled files together
916
+ ... collected3 = sorted(sc.pickleFile('{},{}'.format(path1, path2), 5).collect())
917
+
918
+ >>> collected1
919
+ [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
920
+ >>> collected2
921
+ [-10, -9, -8, -7, -6]
922
+ >>> collected3
923
+ [-10, -9, -8, -7, -6, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
924
+ """
925
+ minPartitions = minPartitions or self.defaultMinPartitions
926
+ return RDD(self._jsc.objectFile(name, minPartitions), self)
927
+
928
+ def textFile(
929
+ self, name: str, minPartitions: Optional[int] = None, use_unicode: bool = True
930
+ ) -> RDD[str]:
931
+ """
932
+ Read a text file from HDFS, a local file system (available on all
933
+ nodes), or any Hadoop-supported file system URI, and return it as an
934
+ RDD of Strings. The text files must be encoded as UTF-8.
935
+
936
+ .. versionadded:: 0.7.0
937
+
938
+ Parameters
939
+ ----------
940
+ name : str
941
+ directory to the input data files, the path can be comma separated
942
+ paths as a list of inputs
943
+ minPartitions : int, optional
944
+ suggested minimum number of partitions for the resulting RDD
945
+ use_unicode : bool, default True
946
+ If `use_unicode` is False, the strings will be kept as `bytes` (encoded
947
+ as `utf-8`), which is faster and smaller than unicode strings.
948
+
949
+ .. versionadded:: 1.2.0
950
+
951
+ Returns
952
+ -------
953
+ :class:`RDD`
954
+ RDD representing text data from the file(s).
955
+
956
+ See Also
957
+ --------
958
+ :meth:`RDD.saveAsTextFile`
959
+ :meth:`SparkContext.wholeTextFiles`
960
+
961
+ Examples
962
+ --------
963
+ >>> import os
964
+ >>> import tempfile
965
+ >>> with tempfile.TemporaryDirectory() as d:
966
+ ... path1 = os.path.join(d, "text1")
967
+ ... path2 = os.path.join(d, "text2")
968
+ ...
969
+ ... # Write a temporary text file
970
+ ... sc.parallelize(["x", "y", "z"]).saveAsTextFile(path1)
971
+ ...
972
+ ... # Write another temporary text file
973
+ ... sc.parallelize(["aa", "bb", "cc"]).saveAsTextFile(path2)
974
+ ...
975
+ ... # Load text file
976
+ ... collected1 = sorted(sc.textFile(path1, 3).collect())
977
+ ... collected2 = sorted(sc.textFile(path2, 4).collect())
978
+ ...
979
+ ... # Load two text files together
980
+ ... collected3 = sorted(sc.textFile('{},{}'.format(path1, path2), 5).collect())
981
+
982
+ >>> collected1
983
+ ['x', 'y', 'z']
984
+ >>> collected2
985
+ ['aa', 'bb', 'cc']
986
+ >>> collected3
987
+ ['aa', 'bb', 'cc', 'x', 'y', 'z']
988
+ """
989
+ minPartitions = minPartitions or min(self.defaultParallelism, 2)
990
+ return RDD(self._jsc.textFile(name, minPartitions), self, UTF8Deserializer(use_unicode))
991
+
992
+ def wholeTextFiles(
993
+ self, path: str, minPartitions: Optional[int] = None, use_unicode: bool = True
994
+ ) -> RDD[Tuple[str, str]]:
995
+ """
996
+ Read a directory of text files from HDFS, a local file system
997
+ (available on all nodes), or any Hadoop-supported file system
998
+ URI. Each file is read as a single record and returned in a
999
+ key-value pair, where the key is the path of each file, the
1000
+ value is the content of each file.
1001
+ The text files must be encoded as UTF-8.
1002
+
1003
+ .. versionadded:: 1.0.0
1004
+
1005
+ For example, if you have the following files:
1006
+
1007
+ .. code-block:: text
1008
+
1009
+ hdfs://a-hdfs-path/part-00000
1010
+ hdfs://a-hdfs-path/part-00001
1011
+ ...
1012
+ hdfs://a-hdfs-path/part-nnnnn
1013
+
1014
+ Do ``rdd = sparkContext.wholeTextFiles("hdfs://a-hdfs-path")``,
1015
+ then ``rdd`` contains:
1016
+
1017
+ .. code-block:: text
1018
+
1019
+ (a-hdfs-path/part-00000, its content)
1020
+ (a-hdfs-path/part-00001, its content)
1021
+ ...
1022
+ (a-hdfs-path/part-nnnnn, its content)
1023
+
1024
+ Parameters
1025
+ ----------
1026
+ path : str
1027
+ directory to the input data files, the path can be comma separated
1028
+ paths as a list of inputs
1029
+ minPartitions : int, optional
1030
+ suggested minimum number of partitions for the resulting RDD
1031
+ use_unicode : bool, default True
1032
+ If `use_unicode` is False, the strings will be kept as `bytes` (encoded
1033
+ as `utf-8`), which is faster and smaller than unicode strings.
1034
+
1035
+ .. versionadded:: 1.2.0
1036
+
1037
+ Returns
1038
+ -------
1039
+ :class:`RDD`
1040
+ RDD representing path-content pairs from the file(s).
1041
+
1042
+ Notes
1043
+ -----
1044
+ Small files are preferred, as each file will be loaded fully in memory.
1045
+
1046
+ See Also
1047
+ --------
1048
+ :meth:`RDD.saveAsTextFile`
1049
+ :meth:`SparkContext.textFile`
1050
+
1051
+ Examples
1052
+ --------
1053
+ >>> import os
1054
+ >>> import tempfile
1055
+ >>> with tempfile.TemporaryDirectory() as d:
1056
+ ... # Write a temporary text file
1057
+ ... with open(os.path.join(d, "1.txt"), "w") as f:
1058
+ ... _ = f.write("123")
1059
+ ...
1060
+ ... # Write another temporary text file
1061
+ ... with open(os.path.join(d, "2.txt"), "w") as f:
1062
+ ... _ = f.write("xyz")
1063
+ ...
1064
+ ... collected = sorted(sc.wholeTextFiles(d).collect())
1065
+ >>> collected
1066
+ [('.../1.txt', '123'), ('.../2.txt', 'xyz')]
1067
+ """
1068
+ minPartitions = minPartitions or self.defaultMinPartitions
1069
+ return RDD(
1070
+ self._jsc.wholeTextFiles(path, minPartitions),
1071
+ self,
1072
+ PairDeserializer(UTF8Deserializer(use_unicode), UTF8Deserializer(use_unicode)),
1073
+ )
1074
+
1075
+ def binaryFiles(self, path: str, minPartitions: Optional[int] = None) -> RDD[Tuple[str, bytes]]:
1076
+ """
1077
+ Read a directory of binary files from HDFS, a local file system
1078
+ (available on all nodes), or any Hadoop-supported file system URI
1079
+ as a byte array. Each file is read as a single record and returned
1080
+ in a key-value pair, where the key is the path of each file, the
1081
+ value is the content of each file.
1082
+
1083
+ .. versionadded:: 1.3.0
1084
+
1085
+ Parameters
1086
+ ----------
1087
+ path : str
1088
+ directory to the input data files, the path can be comma separated
1089
+ paths as a list of inputs
1090
+ minPartitions : int, optional
1091
+ suggested minimum number of partitions for the resulting RDD
1092
+
1093
+ Returns
1094
+ -------
1095
+ :class:`RDD`
1096
+ RDD representing path-content pairs from the file(s).
1097
+
1098
+ Notes
1099
+ -----
1100
+ Small files are preferred; large files are also allowed, but may cause bad performance.
1101
+
1102
+ See Also
1103
+ --------
1104
+ :meth:`SparkContext.binaryRecords`
1105
+
1106
+ Examples
1107
+ --------
1108
+ >>> import os
1109
+ >>> import tempfile
1110
+ >>> with tempfile.TemporaryDirectory() as d:
1111
+ ... # Write a temporary binary file
1112
+ ... with open(os.path.join(d, "1.bin"), "wb") as f1:
1113
+ ... _ = f1.write(b"binary data I")
1114
+ ...
1115
+ ... # Write another temporary binary file
1116
+ ... with open(os.path.join(d, "2.bin"), "wb") as f2:
1117
+ ... _ = f2.write(b"binary data II")
1118
+ ...
1119
+ ... collected = sorted(sc.binaryFiles(d).collect())
1120
+
1121
+ >>> collected
1122
+ [('.../1.bin', b'binary data I'), ('.../2.bin', b'binary data II')]
1123
+ """
1124
+ minPartitions = minPartitions or self.defaultMinPartitions
1125
+ return RDD(
1126
+ self._jsc.binaryFiles(path, minPartitions),
1127
+ self,
1128
+ PairDeserializer(UTF8Deserializer(), NoOpSerializer()),
1129
+ )
1130
+
1131
+ def binaryRecords(self, path: str, recordLength: int) -> RDD[bytes]:
1132
+ """
1133
+ Load data from a flat binary file, assuming each record is a set of numbers
1134
+ with the specified numerical format (see ByteBuffer), and the number of
1135
+ bytes per record is constant.
1136
+
1137
+ .. versionadded:: 1.3.0
1138
+
1139
+ Parameters
1140
+ ----------
1141
+ path : str
1142
+ Directory to the input data files
1143
+ recordLength : int
1144
+ The length at which to split the records
1145
+
1146
+ Returns
1147
+ -------
1148
+ :class:`RDD`
1149
+ RDD of data with values, represented as byte arrays
1150
+
1151
+ See Also
1152
+ --------
1153
+ :meth:`SparkContext.binaryFiles`
1154
+
1155
+ Examples
1156
+ --------
1157
+ >>> import os
1158
+ >>> import tempfile
1159
+ >>> with tempfile.TemporaryDirectory() as d:
1160
+ ... # Write a temporary file
1161
+ ... with open(os.path.join(d, "1.bin"), "w") as f:
1162
+ ... for i in range(3):
1163
+ ... _ = f.write("%04d" % i)
1164
+ ...
1165
+ ... # Write another file
1166
+ ... with open(os.path.join(d, "2.bin"), "w") as f:
1167
+ ... for i in [-1, -2, -10]:
1168
+ ... _ = f.write("%04d" % i)
1169
+ ...
1170
+ ... collected = sorted(sc.binaryRecords(d, 4).collect())
1171
+
1172
+ >>> collected
1173
+ [b'-001', b'-002', b'-010', b'0000', b'0001', b'0002']
1174
+ """
1175
+ return RDD(self._jsc.binaryRecords(path, recordLength), self, NoOpSerializer())
1176
+
1177
+ def _dictToJavaMap(self, d: Optional[Dict[str, str]]) -> JavaMap:
1178
+ assert self._jvm is not None
1179
+ jm = self._jvm.java.util.HashMap()
1180
+ if not d:
1181
+ d = {}
1182
+ for k, v in d.items():
1183
+ jm[k] = v
1184
+ return jm
1185
+
1186
+ def sequenceFile(
1187
+ self,
1188
+ path: str,
1189
+ keyClass: Optional[str] = None,
1190
+ valueClass: Optional[str] = None,
1191
+ keyConverter: Optional[str] = None,
1192
+ valueConverter: Optional[str] = None,
1193
+ minSplits: Optional[int] = None,
1194
+ batchSize: int = 0,
1195
+ ) -> RDD[Tuple[T, U]]:
1196
+ """
1197
+ Read a Hadoop SequenceFile with arbitrary key and value Writable class from HDFS,
1198
+ a local file system (available on all nodes), or any Hadoop-supported file system URI.
1199
+ The mechanism is as follows:
1200
+
1201
+ 1. A Java RDD is created from the SequenceFile or other InputFormat, and the key
1202
+ and value Writable classes
1203
+ 2. Serialization is attempted via Pickle pickling
1204
+ 3. If this fails, the fallback is to call 'toString' on each key and value
1205
+ 4. :class:`CPickleSerializer` is used to deserialize pickled objects on the Python side
1206
+
1207
+ .. versionadded:: 1.3.0
1208
+
1209
+ Parameters
1210
+ ----------
1211
+ path : str
1212
+ path to sequencefile
1213
+ keyClass: str, optional
1214
+ fully qualified classname of key Writable class (e.g. "org.apache.hadoop.io.Text")
1215
+ valueClass : str, optional
1216
+ fully qualified classname of value Writable class
1217
+ (e.g. "org.apache.hadoop.io.LongWritable")
1218
+ keyConverter : str, optional
1219
+ fully qualified name of a function returning key WritableConverter
1220
+ valueConverter : str, optional
1221
+ fully qualified name of a function returning value WritableConverter
1222
+ minSplits : int, optional
1223
+ minimum splits in dataset (default min(2, sc.defaultParallelism))
1224
+ batchSize : int, optional, default 0
1225
+ The number of Python objects represented as a single
1226
+ Java object. (default 0, choose batchSize automatically)
1227
+
1228
+ Returns
1229
+ -------
1230
+ :class:`RDD`
1231
+ RDD of tuples of key and corresponding value
1232
+
1233
+ See Also
1234
+ --------
1235
+ :meth:`RDD.saveAsSequenceFile`
1236
+ :meth:`RDD.saveAsNewAPIHadoopFile`
1237
+ :meth:`RDD.saveAsHadoopFile`
1238
+ :meth:`SparkContext.newAPIHadoopFile`
1239
+ :meth:`SparkContext.hadoopFile`
1240
+
1241
+ Examples
1242
+ --------
1243
+ >>> import os
1244
+ >>> import tempfile
1245
+
1246
+ Set the class of output format
1247
+
1248
+ >>> output_format_class = "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat"
1249
+
1250
+ >>> with tempfile.TemporaryDirectory() as d:
1251
+ ... path = os.path.join(d, "hadoop_file")
1252
+ ...
1253
+ ... # Write a temporary Hadoop file
1254
+ ... rdd = sc.parallelize([(1, {3.0: "bb"}), (2, {1.0: "aa"}), (3, {2.0: "dd"})])
1255
+ ... rdd.saveAsNewAPIHadoopFile(path, output_format_class)
1256
+ ...
1257
+ ... collected = sorted(sc.sequenceFile(path).collect())
1258
+
1259
+ >>> collected
1260
+ [(1, {3.0: 'bb'}), (2, {1.0: 'aa'}), (3, {2.0: 'dd'})]
1261
+ """
1262
+ minSplits = minSplits or min(self.defaultParallelism, 2)
1263
+ assert self._jvm is not None
1264
+ jrdd = self._jvm.PythonRDD.sequenceFile(
1265
+ self._jsc,
1266
+ path,
1267
+ keyClass,
1268
+ valueClass,
1269
+ keyConverter,
1270
+ valueConverter,
1271
+ minSplits,
1272
+ batchSize,
1273
+ )
1274
+ return RDD(jrdd, self)
1275
+
1276
+ def newAPIHadoopFile(
1277
+ self,
1278
+ path: str,
1279
+ inputFormatClass: str,
1280
+ keyClass: str,
1281
+ valueClass: str,
1282
+ keyConverter: Optional[str] = None,
1283
+ valueConverter: Optional[str] = None,
1284
+ conf: Optional[Dict[str, str]] = None,
1285
+ batchSize: int = 0,
1286
+ ) -> RDD[Tuple[T, U]]:
1287
+ """
1288
+ Read a 'new API' Hadoop InputFormat with arbitrary key and value class from HDFS,
1289
+ a local file system (available on all nodes), or any Hadoop-supported file system URI.
1290
+ The mechanism is the same as for :meth:`SparkContext.sequenceFile`.
1291
+
1292
+ A Hadoop configuration can be passed in as a Python dict. This will be converted into a
1293
+ Configuration in Java
1294
+
1295
+ .. versionadded:: 1.1.0
1296
+
1297
+ Parameters
1298
+ ----------
1299
+ path : str
1300
+ path to Hadoop file
1301
+ inputFormatClass : str
1302
+ fully qualified classname of Hadoop InputFormat
1303
+ (e.g. "org.apache.hadoop.mapreduce.lib.input.TextInputFormat")
1304
+ keyClass : str
1305
+ fully qualified classname of key Writable class
1306
+ (e.g. "org.apache.hadoop.io.Text")
1307
+ valueClass : str
1308
+ fully qualified classname of value Writable class
1309
+ (e.g. "org.apache.hadoop.io.LongWritable")
1310
+ keyConverter : str, optional
1311
+ fully qualified name of a function returning key WritableConverter
1312
+ None by default
1313
+ valueConverter : str, optional
1314
+ fully qualified name of a function returning value WritableConverter
1315
+ None by default
1316
+ conf : dict, optional
1317
+ Hadoop configuration, passed in as a dict
1318
+ None by default
1319
+ batchSize : int, optional, default 0
1320
+ The number of Python objects represented as a single
1321
+ Java object. (default 0, choose batchSize automatically)
1322
+
1323
+ Returns
1324
+ -------
1325
+ :class:`RDD`
1326
+ RDD of tuples of key and corresponding value
1327
+
1328
+ See Also
1329
+ --------
1330
+ :meth:`RDD.saveAsSequenceFile`
1331
+ :meth:`RDD.saveAsNewAPIHadoopFile`
1332
+ :meth:`RDD.saveAsHadoopFile`
1333
+ :meth:`SparkContext.sequenceFile`
1334
+ :meth:`SparkContext.hadoopFile`
1335
+
1336
+ Examples
1337
+ --------
1338
+ >>> import os
1339
+ >>> import tempfile
1340
+
1341
+ Set the related classes
1342
+
1343
+ >>> output_format_class = "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat"
1344
+ >>> input_format_class = "org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat"
1345
+ >>> key_class = "org.apache.hadoop.io.IntWritable"
1346
+ >>> value_class = "org.apache.hadoop.io.Text"
1347
+
1348
+ >>> with tempfile.TemporaryDirectory() as d:
1349
+ ... path = os.path.join(d, "new_hadoop_file")
1350
+ ...
1351
+ ... # Write a temporary Hadoop file
1352
+ ... rdd = sc.parallelize([(1, ""), (1, "a"), (3, "x")])
1353
+ ... rdd.saveAsNewAPIHadoopFile(path, output_format_class, key_class, value_class)
1354
+ ...
1355
+ ... loaded = sc.newAPIHadoopFile(path, input_format_class, key_class, value_class)
1356
+ ... collected = sorted(loaded.collect())
1357
+
1358
+ >>> collected
1359
+ [(1, ''), (1, 'a'), (3, 'x')]
1360
+ """
1361
+ jconf = self._dictToJavaMap(conf)
1362
+ assert self._jvm is not None
1363
+ jrdd = self._jvm.PythonRDD.newAPIHadoopFile(
1364
+ self._jsc,
1365
+ path,
1366
+ inputFormatClass,
1367
+ keyClass,
1368
+ valueClass,
1369
+ keyConverter,
1370
+ valueConverter,
1371
+ jconf,
1372
+ batchSize,
1373
+ )
1374
+ return RDD(jrdd, self)
1375
+
1376
+ def newAPIHadoopRDD(
1377
+ self,
1378
+ inputFormatClass: str,
1379
+ keyClass: str,
1380
+ valueClass: str,
1381
+ keyConverter: Optional[str] = None,
1382
+ valueConverter: Optional[str] = None,
1383
+ conf: Optional[Dict[str, str]] = None,
1384
+ batchSize: int = 0,
1385
+ ) -> RDD[Tuple[T, U]]:
1386
+ """
1387
+ Read a 'new API' Hadoop InputFormat with arbitrary key and value class, from an arbitrary
1388
+ Hadoop configuration, which is passed in as a Python dict.
1389
+ This will be converted into a Configuration in Java.
1390
+ The mechanism is the same as for :meth:`SparkContext.sequenceFile`.
1391
+
1392
+ .. versionadded:: 1.1.0
1393
+
1394
+ Parameters
1395
+ ----------
1396
+ inputFormatClass : str
1397
+ fully qualified classname of Hadoop InputFormat
1398
+ (e.g. "org.apache.hadoop.mapreduce.lib.input.TextInputFormat")
1399
+ keyClass : str
1400
+ fully qualified classname of key Writable class (e.g. "org.apache.hadoop.io.Text")
1401
+ valueClass : str
1402
+ fully qualified classname of value Writable class
1403
+ (e.g. "org.apache.hadoop.io.LongWritable")
1404
+ keyConverter : str, optional
1405
+ fully qualified name of a function returning key WritableConverter
1406
+ (None by default)
1407
+ valueConverter : str, optional
1408
+ fully qualified name of a function returning value WritableConverter
1409
+ (None by default)
1410
+ conf : dict, optional
1411
+ Hadoop configuration, passed in as a dict (None by default)
1412
+ batchSize : int, optional, default 0
1413
+ The number of Python objects represented as a single
1414
+ Java object. (default 0, choose batchSize automatically)
1415
+
1416
+ Returns
1417
+ -------
1418
+ :class:`RDD`
1419
+ RDD of tuples of key and corresponding value
1420
+
1421
+ See Also
1422
+ --------
1423
+ :meth:`RDD.saveAsNewAPIHadoopDataset`
1424
+ :meth:`RDD.saveAsHadoopDataset`
1425
+ :meth:`SparkContext.hadoopRDD`
1426
+ :meth:`SparkContext.hadoopFile`
1427
+
1428
+ Examples
1429
+ --------
1430
+ >>> import os
1431
+ >>> import tempfile
1432
+
1433
+ Set the related classes
1434
+
1435
+ >>> output_format_class = "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat"
1436
+ >>> input_format_class = "org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat"
1437
+ >>> key_class = "org.apache.hadoop.io.IntWritable"
1438
+ >>> value_class = "org.apache.hadoop.io.Text"
1439
+
1440
+ >>> with tempfile.TemporaryDirectory() as d:
1441
+ ... path = os.path.join(d, "new_hadoop_file")
1442
+ ...
1443
+ ... # Create the conf for writing
1444
+ ... write_conf = {
1445
+ ... "mapreduce.job.outputformat.class": (output_format_class),
1446
+ ... "mapreduce.job.output.key.class": key_class,
1447
+ ... "mapreduce.job.output.value.class": value_class,
1448
+ ... "mapreduce.output.fileoutputformat.outputdir": path,
1449
+ ... }
1450
+ ...
1451
+ ... # Write a temporary Hadoop file
1452
+ ... rdd = sc.parallelize([(1, ""), (1, "a"), (3, "x")])
1453
+ ... rdd.saveAsNewAPIHadoopDataset(conf=write_conf)
1454
+ ...
1455
+ ... # Create the conf for reading
1456
+ ... read_conf = {"mapreduce.input.fileinputformat.inputdir": path}
1457
+ ...
1458
+ ... loaded = sc.newAPIHadoopRDD(input_format_class,
1459
+ ... key_class, value_class, conf=read_conf)
1460
+ ... collected = sorted(loaded.collect())
1461
+
1462
+ >>> collected
1463
+ [(1, ''), (1, 'a'), (3, 'x')]
1464
+ """
1465
+ jconf = self._dictToJavaMap(conf)
1466
+ assert self._jvm is not None
1467
+ jrdd = self._jvm.PythonRDD.newAPIHadoopRDD(
1468
+ self._jsc,
1469
+ inputFormatClass,
1470
+ keyClass,
1471
+ valueClass,
1472
+ keyConverter,
1473
+ valueConverter,
1474
+ jconf,
1475
+ batchSize,
1476
+ )
1477
+ return RDD(jrdd, self)
1478
+
1479
+ def hadoopFile(
1480
+ self,
1481
+ path: str,
1482
+ inputFormatClass: str,
1483
+ keyClass: str,
1484
+ valueClass: str,
1485
+ keyConverter: Optional[str] = None,
1486
+ valueConverter: Optional[str] = None,
1487
+ conf: Optional[Dict[str, str]] = None,
1488
+ batchSize: int = 0,
1489
+ ) -> RDD[Tuple[T, U]]:
1490
+ """
1491
+ Read an 'old' Hadoop InputFormat with arbitrary key and value class from HDFS,
1492
+ a local file system (available on all nodes), or any Hadoop-supported file system URI.
1493
+ The mechanism is the same as for :meth:`SparkContext.sequenceFile`.
1494
+
1495
+ .. versionadded:: 1.1.0
1496
+
1497
+ A Hadoop configuration can be passed in as a Python dict. This will be converted into a
1498
+ Configuration in Java.
1499
+
1500
+ Parameters
1501
+ ----------
1502
+ path : str
1503
+ path to Hadoop file
1504
+ inputFormatClass : str
1505
+ fully qualified classname of Hadoop InputFormat
1506
+ (e.g. "org.apache.hadoop.mapreduce.lib.input.TextInputFormat")
1507
+ keyClass : str
1508
+ fully qualified classname of key Writable class (e.g. "org.apache.hadoop.io.Text")
1509
+ valueClass : str
1510
+ fully qualified classname of value Writable class
1511
+ (e.g. "org.apache.hadoop.io.LongWritable")
1512
+ keyConverter : str, optional
1513
+ fully qualified name of a function returning key WritableConverter
1514
+ valueConverter : str, optional
1515
+ fully qualified name of a function returning value WritableConverter
1516
+ conf : dict, optional
1517
+ Hadoop configuration, passed in as a dict
1518
+ batchSize : int, optional, default 0
1519
+ The number of Python objects represented as a single
1520
+ Java object. (default 0, choose batchSize automatically)
1521
+
1522
+ Returns
1523
+ -------
1524
+ :class:`RDD`
1525
+ RDD of tuples of key and corresponding value
1526
+
1527
+ See Also
1528
+ --------
1529
+ :meth:`RDD.saveAsSequenceFile`
1530
+ :meth:`RDD.saveAsNewAPIHadoopFile`
1531
+ :meth:`RDD.saveAsHadoopFile`
1532
+ :meth:`SparkContext.newAPIHadoopFile`
1533
+ :meth:`SparkContext.hadoopRDD`
1534
+
1535
+ Examples
1536
+ --------
1537
+ >>> import os
1538
+ >>> import tempfile
1539
+
1540
+ Set the related classes
1541
+
1542
+ >>> output_format_class = "org.apache.hadoop.mapred.TextOutputFormat"
1543
+ >>> input_format_class = "org.apache.hadoop.mapred.TextInputFormat"
1544
+ >>> key_class = "org.apache.hadoop.io.IntWritable"
1545
+ >>> value_class = "org.apache.hadoop.io.Text"
1546
+
1547
+ >>> with tempfile.TemporaryDirectory() as d:
1548
+ ... path = os.path.join(d, "old_hadoop_file")
1549
+ ...
1550
+ ... # Write a temporary Hadoop file
1551
+ ... rdd = sc.parallelize([(1, ""), (1, "a"), (3, "x")])
1552
+ ... rdd.saveAsHadoopFile(path, output_format_class, key_class, value_class)
1553
+ ...
1554
+ ... loaded = sc.hadoopFile(path, input_format_class, key_class, value_class)
1555
+ ... collected = sorted(loaded.collect())
1556
+
1557
+ >>> collected
1558
+ [(0, '1\\t'), (0, '1\\ta'), (0, '3\\tx')]
1559
+ """
1560
+ jconf = self._dictToJavaMap(conf)
1561
+ assert self._jvm is not None
1562
+ jrdd = self._jvm.PythonRDD.hadoopFile(
1563
+ self._jsc,
1564
+ path,
1565
+ inputFormatClass,
1566
+ keyClass,
1567
+ valueClass,
1568
+ keyConverter,
1569
+ valueConverter,
1570
+ jconf,
1571
+ batchSize,
1572
+ )
1573
+ return RDD(jrdd, self)
1574
+
1575
+ def hadoopRDD(
1576
+ self,
1577
+ inputFormatClass: str,
1578
+ keyClass: str,
1579
+ valueClass: str,
1580
+ keyConverter: Optional[str] = None,
1581
+ valueConverter: Optional[str] = None,
1582
+ conf: Optional[Dict[str, str]] = None,
1583
+ batchSize: int = 0,
1584
+ ) -> RDD[Tuple[T, U]]:
1585
+ """
1586
+ Read an 'old' Hadoop InputFormat with arbitrary key and value class, from an arbitrary
1587
+ Hadoop configuration, which is passed in as a Python dict.
1588
+ This will be converted into a Configuration in Java.
1589
+ The mechanism is the same as for :meth:`SparkContext.sequenceFile`.
1590
+
1591
+ .. versionadded:: 1.1.0
1592
+
1593
+ Parameters
1594
+ ----------
1595
+ inputFormatClass : str
1596
+ fully qualified classname of Hadoop InputFormat
1597
+ (e.g. "org.apache.hadoop.mapreduce.lib.input.TextInputFormat")
1598
+ keyClass : str
1599
+ fully qualified classname of key Writable class (e.g. "org.apache.hadoop.io.Text")
1600
+ valueClass : str
1601
+ fully qualified classname of value Writable class
1602
+ (e.g. "org.apache.hadoop.io.LongWritable")
1603
+ keyConverter : str, optional
1604
+ fully qualified name of a function returning key WritableConverter
1605
+ valueConverter : str, optional
1606
+ fully qualified name of a function returning value WritableConverter
1607
+ conf : dict, optional
1608
+ Hadoop configuration, passed in as a dict
1609
+ batchSize : int, optional, default 0
1610
+ The number of Python objects represented as a single
1611
+ Java object. (default 0, choose batchSize automatically)
1612
+
1613
+ Returns
1614
+ -------
1615
+ :class:`RDD`
1616
+ RDD of tuples of key and corresponding value
1617
+
1618
+ See Also
1619
+ --------
1620
+ :meth:`RDD.saveAsNewAPIHadoopDataset`
1621
+ :meth:`RDD.saveAsHadoopDataset`
1622
+ :meth:`SparkContext.newAPIHadoopRDD`
1623
+ :meth:`SparkContext.hadoopFile`
1624
+
1625
+ Examples
1626
+ --------
1627
+ >>> import os
1628
+ >>> import tempfile
1629
+
1630
+ Set the related classes
1631
+
1632
+ >>> output_format_class = "org.apache.hadoop.mapred.TextOutputFormat"
1633
+ >>> input_format_class = "org.apache.hadoop.mapred.TextInputFormat"
1634
+ >>> key_class = "org.apache.hadoop.io.IntWritable"
1635
+ >>> value_class = "org.apache.hadoop.io.Text"
1636
+
1637
+ >>> with tempfile.TemporaryDirectory() as d:
1638
+ ... path = os.path.join(d, "old_hadoop_file")
1639
+ ...
1640
+ ... # Create the conf for writing
1641
+ ... write_conf = {
1642
+ ... "mapred.output.format.class": output_format_class,
1643
+ ... "mapreduce.job.output.key.class": key_class,
1644
+ ... "mapreduce.job.output.value.class": value_class,
1645
+ ... "mapreduce.output.fileoutputformat.outputdir": path,
1646
+ ... }
1647
+ ...
1648
+ ... # Write a temporary Hadoop file
1649
+ ... rdd = sc.parallelize([(1, ""), (1, "a"), (3, "x")])
1650
+ ... rdd.saveAsHadoopDataset(conf=write_conf)
1651
+ ...
1652
+ ... # Create the conf for reading
1653
+ ... read_conf = {"mapreduce.input.fileinputformat.inputdir": path}
1654
+ ...
1655
+ ... loaded = sc.hadoopRDD(input_format_class, key_class, value_class, conf=read_conf)
1656
+ ... collected = sorted(loaded.collect())
1657
+
1658
+ >>> collected
1659
+ [(0, '1\\t'), (0, '1\\ta'), (0, '3\\tx')]
1660
+ """
1661
+ jconf = self._dictToJavaMap(conf)
1662
+ assert self._jvm is not None
1663
+ jrdd = self._jvm.PythonRDD.hadoopRDD(
1664
+ self._jsc,
1665
+ inputFormatClass,
1666
+ keyClass,
1667
+ valueClass,
1668
+ keyConverter,
1669
+ valueConverter,
1670
+ jconf,
1671
+ batchSize,
1672
+ )
1673
+ return RDD(jrdd, self)
1674
+
1675
+ def _checkpointFile(self, name: str, input_deserializer: PairDeserializer) -> RDD:
1676
+ jrdd = self._jsc.checkpointFile(name)
1677
+ return RDD(jrdd, self, input_deserializer)
1678
+
1679
+ def union(self, rdds: List[RDD[T]]) -> RDD[T]:
1680
+ """
1681
+ Build the union of a list of RDDs.
1682
+
1683
+ This supports unions of RDDs with different serialized formats,
1684
+ although this forces them to be reserialized using the default
1685
+ serializer.
1686
+
1687
+ .. versionadded:: 0.7.0
1688
+
1689
+ See Also
1690
+ --------
1691
+ :meth:`RDD.union`
1692
+
1693
+ Examples
1694
+ --------
1695
+ >>> import os
1696
+ >>> import tempfile
1697
+ >>> with tempfile.TemporaryDirectory() as d:
1698
+ ... # generate a text RDD
1699
+ ... with open(os.path.join(d, "union-text.txt"), "w") as f:
1700
+ ... _ = f.write("Hello")
1701
+ ... text_rdd = sc.textFile(d)
1702
+ ...
1703
+ ... # generate another RDD
1704
+ ... parallelized = sc.parallelize(["World!"])
1705
+ ...
1706
+ ... unioned = sorted(sc.union([text_rdd, parallelized]).collect())
1707
+
1708
+ >>> unioned
1709
+ ['Hello', 'World!']
1710
+ """
1711
+ first_jrdd_deserializer = rdds[0]._jrdd_deserializer
1712
+ if any(x._jrdd_deserializer != first_jrdd_deserializer for x in rdds):
1713
+ rdds = [x._reserialize() for x in rdds]
1714
+ gw = SparkContext._gateway
1715
+ assert gw is not None
1716
+ jvm = SparkContext._jvm
1717
+ assert jvm is not None
1718
+ jrdd_cls = jvm.org.apache.spark.api.java.JavaRDD
1719
+ jpair_rdd_cls = jvm.org.apache.spark.api.java.JavaPairRDD
1720
+ jdouble_rdd_cls = jvm.org.apache.spark.api.java.JavaDoubleRDD
1721
+ if is_instance_of(gw, rdds[0]._jrdd, jrdd_cls):
1722
+ cls = jrdd_cls
1723
+ elif is_instance_of(gw, rdds[0]._jrdd, jpair_rdd_cls):
1724
+ cls = jpair_rdd_cls
1725
+ elif is_instance_of(gw, rdds[0]._jrdd, jdouble_rdd_cls):
1726
+ cls = jdouble_rdd_cls
1727
+ else:
1728
+ cls_name = rdds[0]._jrdd.getClass().getCanonicalName()
1729
+ raise TypeError("Unsupported Java RDD class %s" % cls_name)
1730
+ jrdds = gw.new_array(cls, len(rdds))
1731
+ for i in range(0, len(rdds)):
1732
+ jrdds[i] = rdds[i]._jrdd
1733
+ return RDD(self._jsc.union(jrdds), self, rdds[0]._jrdd_deserializer)
1734
+
1735
+ def broadcast(self, value: T) -> "Broadcast[T]":
1736
+ """
1737
+ Broadcast a read-only variable to the cluster, returning a :class:`Broadcast`
1738
+ object for reading it in distributed functions. The variable will
1739
+ be sent to each cluster only once.
1740
+
1741
+ .. versionadded:: 0.7.0
1742
+
1743
+ Parameters
1744
+ ----------
1745
+ value : T
1746
+ value to broadcast to the Spark nodes
1747
+
1748
+ Returns
1749
+ -------
1750
+ :class:`Broadcast`
1751
+ :class:`Broadcast` object, a read-only variable cached on each machine
1752
+
1753
+ Examples
1754
+ --------
1755
+ >>> mapping = {1: 10001, 2: 10002}
1756
+ >>> bc = sc.broadcast(mapping)
1757
+
1758
+ >>> rdd = sc.range(5)
1759
+ >>> rdd2 = rdd.map(lambda i: bc.value[i] if i in bc.value else -1)
1760
+ >>> rdd2.collect()
1761
+ [-1, 10001, 10002, -1, -1]
1762
+
1763
+ >>> bc.destroy()
1764
+ """
1765
+ return Broadcast(self, value, self._pickled_broadcast_vars)
1766
+
1767
+ def accumulator(
1768
+ self, value: T, accum_param: Optional["AccumulatorParam[T]"] = None
1769
+ ) -> "Accumulator[T]":
1770
+ """
1771
+ Create an :class:`Accumulator` with the given initial value, using a given
1772
+ :class:`AccumulatorParam` helper object to define how to add values of the
1773
+ data type if provided. Default AccumulatorParams are used for integers
1774
+ and floating-point numbers if you do not provide one. For other types,
1775
+ a custom AccumulatorParam can be used.
1776
+
1777
+ .. versionadded:: 0.7.0
1778
+
1779
+ Parameters
1780
+ ----------
1781
+ value : T
1782
+ initialized value
1783
+ accum_param : :class:`pyspark.AccumulatorParam`, optional
1784
+ helper object to define how to add values
1785
+
1786
+ Returns
1787
+ -------
1788
+ :class:`Accumulator`
1789
+ `Accumulator` object, a shared variable that can be accumulated
1790
+
1791
+ Examples
1792
+ --------
1793
+ >>> acc = sc.accumulator(9)
1794
+ >>> acc.value
1795
+ 9
1796
+ >>> acc += 1
1797
+ >>> acc.value
1798
+ 10
1799
+
1800
+ Accumulator object can be accumulated in RDD operations:
1801
+
1802
+ >>> rdd = sc.range(5)
1803
+ >>> def f(x):
1804
+ ... global acc
1805
+ ... acc += 1
1806
+ ...
1807
+ >>> rdd.foreach(f)
1808
+ >>> acc.value
1809
+ 15
1810
+ """
1811
+ if accum_param is None:
1812
+ if isinstance(value, int):
1813
+ accum_param = cast("AccumulatorParam[T]", accumulators.INT_ACCUMULATOR_PARAM)
1814
+ elif isinstance(value, float):
1815
+ accum_param = cast("AccumulatorParam[T]", accumulators.FLOAT_ACCUMULATOR_PARAM)
1816
+ elif isinstance(value, complex):
1817
+ accum_param = cast("AccumulatorParam[T]", accumulators.COMPLEX_ACCUMULATOR_PARAM)
1818
+ else:
1819
+ raise TypeError("No default accumulator param for type %s" % type(value))
1820
+ SparkContext._next_accum_id += 1
1821
+ return Accumulator(SparkContext._next_accum_id - 1, value, accum_param)
1822
+
1823
+ def addFile(self, path: str, recursive: bool = False) -> None:
1824
+ """
1825
+ Add a file to be downloaded with this Spark job on every node.
1826
+ The `path` passed can be either a local file, a file in HDFS
1827
+ (or other Hadoop-supported filesystems), or an HTTP, HTTPS or
1828
+ FTP URI.
1829
+
1830
+ To access the file in Spark jobs, use :meth:`SparkFiles.get` with the
1831
+ filename to find its download location.
1832
+
1833
+ A directory can be given if the recursive option is set to True.
1834
+ Currently directories are only supported for Hadoop-supported filesystems.
1835
+
1836
+ .. versionadded:: 0.7.0
1837
+
1838
+ Parameters
1839
+ ----------
1840
+ path : str
1841
+ can be either a local file, a file in HDFS (or other Hadoop-supported
1842
+ filesystems), or an HTTP, HTTPS or FTP URI. To access the file in Spark jobs,
1843
+ use :meth:`SparkFiles.get` to find its download location.
1844
+ recursive : bool, default False
1845
+ whether to recursively add files in the input directory
1846
+
1847
+ See Also
1848
+ --------
1849
+ :meth:`SparkContext.listFiles`
1850
+ :meth:`SparkContext.addPyFile`
1851
+ :meth:`SparkFiles.get`
1852
+
1853
+ Notes
1854
+ -----
1855
+ A path can be added only once. Subsequent additions of the same path are ignored.
1856
+
1857
+ Examples
1858
+ --------
1859
+ >>> import os
1860
+ >>> import tempfile
1861
+ >>> from pyspark import SparkFiles
1862
+
1863
+ >>> with tempfile.TemporaryDirectory() as d:
1864
+ ... path1 = os.path.join(d, "test1.txt")
1865
+ ... with open(path1, "w") as f:
1866
+ ... _ = f.write("100")
1867
+ ...
1868
+ ... path2 = os.path.join(d, "test2.txt")
1869
+ ... with open(path2, "w") as f:
1870
+ ... _ = f.write("200")
1871
+ ...
1872
+ ... sc.addFile(path1)
1873
+ ... file_list1 = sorted(sc.listFiles)
1874
+ ...
1875
+ ... sc.addFile(path2)
1876
+ ... file_list2 = sorted(sc.listFiles)
1877
+ ...
1878
+ ... # add path2 twice, this addition will be ignored
1879
+ ... sc.addFile(path2)
1880
+ ... file_list3 = sorted(sc.listFiles)
1881
+ ...
1882
+ ... def func(iterator):
1883
+ ... with open(SparkFiles.get("test1.txt")) as f:
1884
+ ... mul = int(f.readline())
1885
+ ... return [x * mul for x in iterator]
1886
+ ...
1887
+ ... collected = sc.parallelize([1, 2, 3, 4]).mapPartitions(func).collect()
1888
+
1889
+ >>> file_list1
1890
+ ['file:/.../test1.txt']
1891
+ >>> file_list2
1892
+ ['file:/.../test1.txt', 'file:/.../test2.txt']
1893
+ >>> file_list3
1894
+ ['file:/.../test1.txt', 'file:/.../test2.txt']
1895
+ >>> collected
1896
+ [100, 200, 300, 400]
1897
+ """
1898
+ self._jsc.sc().addFile(path, recursive)
1899
+
1900
+ @property
1901
+ def listFiles(self) -> List[str]:
1902
+ """Returns a list of file paths that are added to resources.
1903
+
1904
+ .. versionadded:: 3.4.0
1905
+
1906
+ See Also
1907
+ --------
1908
+ :meth:`SparkContext.addFile`
1909
+ """
1910
+ return list(
1911
+ self._jvm.scala.collection.JavaConverters.seqAsJavaList( # type: ignore[union-attr]
1912
+ self._jsc.sc().listFiles()
1913
+ )
1914
+ )
1915
+
1916
+ def addPyFile(self, path: str) -> None:
1917
+ """
1918
+ Add a .py or .zip dependency for all tasks to be executed on this
1919
+ SparkContext in the future. The `path` passed can be either a local
1920
+ file, a file in HDFS (or other Hadoop-supported filesystems), or an
1921
+ HTTP, HTTPS or FTP URI.
1922
+
1923
+ .. versionadded:: 0.7.0
1924
+
1925
+ Parameters
1926
+ ----------
1927
+ path : str
1928
+ can be either a .py file or .zip dependency.
1929
+
1930
+ See Also
1931
+ --------
1932
+ :meth:`SparkContext.addFile`
1933
+
1934
+ Notes
1935
+ -----
1936
+ A path can be added only once. Subsequent additions of the same path are ignored.
1937
+ """
1938
+ self.addFile(path)
1939
+ (dirname, filename) = os.path.split(path) # dirname may be directory or HDFS/S3 prefix
1940
+ if filename[-4:].lower() in self.PACKAGE_EXTENSIONS:
1941
+ assert self._python_includes is not None
1942
+ self._python_includes.append(filename)
1943
+ # for tests in local mode
1944
+ sys.path.insert(1, os.path.join(SparkFiles.getRootDirectory(), filename))
1945
+
1946
+ importlib.invalidate_caches()
1947
+
1948
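A usage sketch for ``addPyFile``, assuming an active ``SparkContext`` named ``sc`` as in the surrounding doctests; ``mymodule.py`` and its ``triple`` function are hypothetical names used only for illustration:

    import os
    import tempfile

    # Write a hypothetical helper module to a temporary directory.
    d = tempfile.mkdtemp()
    module_path = os.path.join(d, "mymodule.py")
    with open(module_path, "w") as f:
        _ = f.write("def triple(x):\n    return 3 * x\n")

    # Ship the module to every node; tasks can then import it by name.
    sc.addPyFile(module_path)

    def use_module(iterator):
        from mymodule import triple  # resolved on the executor
        return [triple(x) for x in iterator]

    print(sc.parallelize([1, 2, 3]).mapPartitions(use_module).collect())  # [3, 6, 9]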
+ def addArchive(self, path: str) -> None:
1949
+ """
1950
+ Add an archive to be downloaded with this Spark job on every node.
1951
+ The `path` passed can be either a local file, a file in HDFS
1952
+ (or other Hadoop-supported filesystems), or an HTTP, HTTPS or
1953
+ FTP URI.
1954
+
1955
+ To access the file in Spark jobs, use :meth:`SparkFiles.get` with the
1956
+ filename to find its download/unpacked location. The given path should
1957
+ be one of .zip, .tar, .tar.gz, .tgz and .jar.
1958
+
1959
+ .. versionadded:: 3.3.0
1960
+
1961
+ Parameters
1962
+ ----------
1963
+ path : str
1964
+ can be either a local file, a file in HDFS (or other Hadoop-supported
1965
+ filesystems), or an HTTP, HTTPS or FTP URI. To access the file in Spark jobs,
1966
+ use :meth:`SparkFiles.get` to find its download location.
1967
+
1968
+ See Also
1969
+ --------
1970
+ :meth:`SparkContext.listArchives`
1971
+ :meth:`SparkFiles.get`
1972
+
1973
+ Notes
1974
+ -----
1975
+ A path can be added only once. Subsequent additions of the same path are ignored.
1976
+ This API is experimental.
1977
+
1978
+ Examples
1979
+ --------
1980
+ Creates a zipped file that contains a text file written '100'.
1981
+
1982
+ >>> import os
1983
+ >>> import tempfile
1984
+ >>> import zipfile
1985
+ >>> from pyspark import SparkFiles
1986
+
1987
+ >>> with tempfile.TemporaryDirectory() as d:
1988
+ ... path = os.path.join(d, "test.txt")
1989
+ ... with open(path, "w") as f:
1990
+ ... _ = f.write("100")
1991
+ ...
1992
+ ... zip_path1 = os.path.join(d, "test1.zip")
1993
+ ... with zipfile.ZipFile(zip_path1, "w", zipfile.ZIP_DEFLATED) as z:
1994
+ ... z.write(path, os.path.basename(path))
1995
+ ...
1996
+ ... zip_path2 = os.path.join(d, "test2.zip")
1997
+ ... with zipfile.ZipFile(zip_path2, "w", zipfile.ZIP_DEFLATED) as z:
1998
+ ... z.write(path, os.path.basename(path))
1999
+ ...
2000
+ ... sc.addArchive(zip_path1)
2001
+ ... arch_list1 = sorted(sc.listArchives)
2002
+ ...
2003
+ ... sc.addArchive(zip_path2)
2004
+ ... arch_list2 = sorted(sc.listArchives)
2005
+ ...
2006
+ ... # add zip_path2 twice, this addition will be ignored
2007
+ ... sc.addArchive(zip_path2)
2008
+ ... arch_list3 = sorted(sc.listArchives)
2009
+ ...
2010
+ ... def func(iterator):
2011
+ ... with open("%s/test.txt" % SparkFiles.get("test1.zip")) as f:
2012
+ ... mul = int(f.readline())
2013
+ ... return [x * mul for x in iterator]
2014
+ ...
2015
+ ... collected = sc.parallelize([1, 2, 3, 4]).mapPartitions(func).collect()
2016
+
2017
+ >>> arch_list1
2018
+ ['file:/.../test1.zip']
2019
+ >>> arch_list2
2020
+ ['file:/.../test1.zip', 'file:/.../test2.zip']
2021
+ >>> arch_list3
2022
+ ['file:/.../test1.zip', 'file:/.../test2.zip']
2023
+ >>> collected
2024
+ [100, 200, 300, 400]
2025
+ """
2026
+ self._jsc.sc().addArchive(path)
2027
+
2028
+ @property
2029
+ def listArchives(self) -> List[str]:
2030
+ """Returns a list of archive paths that are added to resources.
2031
+
2032
+ .. versionadded:: 3.4.0
2033
+
2034
+ See Also
2035
+ --------
2036
+ :meth:`SparkContext.addArchive`
2037
+ """
2038
+ return list(
2039
+ self._jvm.scala.collection.JavaConverters.seqAsJavaList( # type: ignore[union-attr]
2040
+ self._jsc.sc().listArchives()
2041
+ )
2042
+ )
2043
+
2044
+ def setCheckpointDir(self, dirName: str) -> None:
2045
+ """
2046
+ Set the directory under which RDDs are going to be checkpointed. The
2047
+ directory must be an HDFS path if running on a cluster.
2048
+
2049
+ .. versionadded:: 0.7.0
2050
+
2051
+ Parameters
2052
+ ----------
2053
+ dirName : str
2054
+ path to the directory where checkpoint files will be stored
2055
+ (must be HDFS path if running in cluster)
2056
+
2057
+ See Also
2058
+ --------
2059
+ :meth:`SparkContext.getCheckpointDir`
2060
+ :meth:`RDD.checkpoint`
2061
+ :meth:`RDD.getCheckpointFile`
2062
+ """
2063
+ self._jsc.sc().setCheckpointDir(dirName)
2064
+
2065
+ def getCheckpointDir(self) -> Optional[str]:
2066
+ """
2067
+ Return the directory where RDDs are checkpointed. Returns None if no
2068
+ checkpoint directory has been set.
2069
+
2070
+ .. versionadded:: 3.1.0
2071
+
2072
+ See Also
2073
+ --------
2074
+ :meth:`SparkContext.setCheckpointDir`
2075
+ :meth:`RDD.checkpoint`
2076
+ :meth:`RDD.getCheckpointFile`
2077
+ """
2078
+ if not self._jsc.sc().getCheckpointDir().isEmpty():
2079
+ return self._jsc.sc().getCheckpointDir().get()
2080
+ return None
2081
+
2082
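A short sketch tying ``setCheckpointDir`` and ``getCheckpointDir`` together, assuming an active ``sc`` running in local mode (where a local directory is acceptable; on a cluster the path must be on HDFS):

    import tempfile

    checkpoint_dir = tempfile.mkdtemp()
    sc.setCheckpointDir(checkpoint_dir)
    print(sc.getCheckpointDir())  # a generated subdirectory under checkpoint_dir

    rdd = sc.parallelize(range(5)).map(lambda x: x * x)
    rdd.checkpoint()              # mark for checkpointing
    print(rdd.collect())          # [0, 1, 4, 9, 16]; the action materializes the checkpoint
    print(rdd.isCheckpointed())   # True once the checkpoint has been written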
+ def _getJavaStorageLevel(self, storageLevel: StorageLevel) -> JavaObject:
2083
+ """
2084
+ Returns a Java StorageLevel based on a pyspark.StorageLevel.
2085
+ """
2086
+ if not isinstance(storageLevel, StorageLevel):
2087
+ raise TypeError("storageLevel must be of type pyspark.StorageLevel")
2088
+ assert self._jvm is not None
2089
+ newStorageLevel = self._jvm.org.apache.spark.storage.StorageLevel
2090
+ return newStorageLevel(
2091
+ storageLevel.useDisk,
2092
+ storageLevel.useMemory,
2093
+ storageLevel.useOffHeap,
2094
+ storageLevel.deserialized,
2095
+ storageLevel.replication,
2096
+ )
2097
+
2098
+ def setJobGroup(self, groupId: str, description: str, interruptOnCancel: bool = False) -> None:
2099
+ """
2100
+ Assigns a group ID to all the jobs started by this thread until the group ID is set to a
2101
+ different value or cleared.
2102
+
2103
+ Often, a unit of execution in an application consists of multiple Spark actions or jobs.
2104
+ Application programmers can use this method to group all those jobs together and give a
2105
+ group description. Once set, the Spark web UI will associate such jobs with this group.
2106
+
2107
+ The application can use :meth:`SparkContext.cancelJobGroup` to cancel all
2108
+ running jobs in this group.
2109
+
2110
+ .. versionadded:: 1.0.0
2111
+
2112
+ Parameters
2113
+ ----------
2114
+ groupId : str
2115
+ The group ID to assign.
2116
+ description : str
2117
+ The description to set for the job group.
2118
+ interruptOnCancel : bool, optional, default False
2119
+ whether to interrupt jobs on job cancellation.
2120
+
2121
+ Notes
2122
+ -----
2123
+ If interruptOnCancel is set to true for the job group, then job cancellation will result
2124
+ in Thread.interrupt() being called on the job's executor threads. This is useful to help
2125
+ ensure that the tasks are actually stopped in a timely manner, but is off by default due
2126
+ to HDFS-1208, where HDFS may respond to Thread.interrupt() by marking nodes as dead.
2127
+
2128
+ If you run jobs in parallel, use :class:`pyspark.InheritableThread` for thread
2129
+ local inheritance.
2130
+
2131
+ See Also
2132
+ --------
2133
+ :meth:`SparkContext.cancelJobGroup`
2134
+
2135
+ Examples
2136
+ --------
2137
+ >>> import threading
2138
+ >>> from time import sleep
2139
+ >>> from pyspark import InheritableThread
2140
+ >>> result = "Not Set"
2141
+ >>> lock = threading.Lock()
2142
+ >>> def map_func(x):
2143
+ ... sleep(100)
2144
+ ... raise RuntimeError("Task should have been cancelled")
2145
+ ...
2146
+ >>> def start_job(x):
2147
+ ... global result
2148
+ ... try:
2149
+ ... sc.setJobGroup("job_to_cancel", "some description")
2150
+ ... result = sc.parallelize(range(x)).map(map_func).collect()
2151
+ ... except Exception as e:
2152
+ ... result = "Cancelled"
2153
+ ... lock.release()
2154
+ ...
2155
+ >>> def stop_job():
2156
+ ... sleep(5)
2157
+ ... sc.cancelJobGroup("job_to_cancel")
2158
+ ...
2159
+ >>> suppress = lock.acquire()
2160
+ >>> suppress = InheritableThread(target=start_job, args=(10,)).start()
2161
+ >>> suppress = InheritableThread(target=stop_job).start()
2162
+ >>> suppress = lock.acquire()
2163
+ >>> print(result)
2164
+ Cancelled
2165
+ """
2166
+ self._jsc.setJobGroup(groupId, description, interruptOnCancel)
2167
+
2168
+ def setInterruptOnCancel(self, interruptOnCancel: bool) -> None:
2169
+ """
2170
+ Set the behavior of job cancellation from jobs started in this thread.
2171
+
2172
+ .. versionadded:: 3.5.0
2173
+
2174
+ Parameters
2175
+ ----------
2176
+ interruptOnCancel : bool
2177
+ If true, then job cancellation will result in ``Thread.interrupt()``
2178
+ being called on the job's executor threads. This is useful to help ensure that
2179
+ the tasks are actually stopped in a timely manner, but is off by default due to
2180
+ HDFS-1208, where HDFS may respond to ``Thread.interrupt()`` by marking nodes as dead.
2181
+
2182
+ See Also
2183
+ --------
2184
+ :meth:`SparkContext.addJobTag`
2185
+ :meth:`SparkContext.removeJobTag`
2186
+ :meth:`SparkContext.cancelAllJobs`
2187
+ :meth:`SparkContext.cancelJobGroup`
2188
+ :meth:`SparkContext.cancelJobsWithTag`
2189
+ """
2190
+ self._jsc.setInterruptOnCancel(interruptOnCancel)
2191
+
2192
+ def addJobTag(self, tag: str) -> None:
2193
+ """
2194
+ Add a tag to be assigned to all the jobs started by this thread.
2195
+
2196
+ .. versionadded:: 3.5.0
2197
+
2198
+ Parameters
2199
+ ----------
2200
+ tag : str
2201
+ The tag to be added. Cannot contain ',' (comma) character.
2202
+
2203
+ See Also
2204
+ --------
2205
+ :meth:`SparkContext.removeJobTag`
2206
+ :meth:`SparkContext.getJobTags`
2207
+ :meth:`SparkContext.clearJobTags`
2208
+ :meth:`SparkContext.cancelJobsWithTag`
2209
+ :meth:`SparkContext.setInterruptOnCancel`
2210
+
2211
+ Examples
2212
+ --------
2213
+ >>> import threading
2214
+ >>> from time import sleep
2215
+ >>> from pyspark import InheritableThread
2216
+ >>> sc.setInterruptOnCancel(interruptOnCancel=True)
2217
+ >>> result = "Not Set"
2218
+ >>> lock = threading.Lock()
2219
+ >>> def map_func(x):
2220
+ ... sleep(100)
2221
+ ... raise RuntimeError("Task should have been cancelled")
2222
+ ...
2223
+ >>> def start_job(x):
2224
+ ... global result
2225
+ ... try:
2226
+ ... sc.addJobTag("job_to_cancel")
2227
+ ... result = sc.parallelize(range(x)).map(map_func).collect()
2228
+ ... except Exception as e:
2229
+ ... result = "Cancelled"
2230
+ ... lock.release()
2231
+ ...
2232
+ >>> def stop_job():
2233
+ ... sleep(5)
2234
+ ... sc.cancelJobsWithTag("job_to_cancel")
2235
+ ...
2236
+ >>> suppress = lock.acquire()
2237
+ >>> suppress = InheritableThread(target=start_job, args=(10,)).start()
2238
+ >>> suppress = InheritableThread(target=stop_job).start()
2239
+ >>> suppress = lock.acquire()
2240
+ >>> print(result)
2241
+ Cancelled
2242
+ >>> sc.clearJobTags()
2243
+ """
2244
+ self._jsc.addJobTag(tag)
2245
+
2246
+ def removeJobTag(self, tag: str) -> None:
2247
+ """
2248
+ Remove a tag previously added to be assigned to all the jobs started by this thread.
2249
+ Noop if such a tag was not added earlier.
2250
+
2251
+ .. versionadded:: 3.5.0
2252
+
2253
+ Parameters
2254
+ ----------
2255
+ tag : str
2256
+ The tag to be removed. Cannot contain ',' (comma) character.
2257
+
2258
+ See Also
2259
+ --------
2260
+ :meth:`SparkContext.addJobTag`
2261
+ :meth:`SparkContext.getJobTags`
2262
+ :meth:`SparkContext.clearJobTags`
2263
+ :meth:`SparkContext.cancelJobsWithTag`
2264
+ :meth:`SparkContext.setInterruptOnCancel`
2265
+
2266
+ Examples
2267
+ --------
2268
+ >>> sc.addJobTag("job_to_cancel1")
2269
+ >>> sc.addJobTag("job_to_cancel2")
2270
+ >>> sc.getJobTags()
2271
+ {'job_to_cancel1', 'job_to_cancel2'}
2272
+ >>> sc.removeJobTag("job_to_cancel1")
2273
+ >>> sc.getJobTags()
2274
+ {'job_to_cancel2'}
2275
+ >>> sc.clearJobTags()
2276
+ """
2277
+ self._jsc.removeJobTag(tag)
2278
+
2279
+ def getJobTags(self) -> Set[str]:
2280
+ """
2281
+ Get the tags that are currently set to be assigned to all the jobs started by this thread.
2282
+
2283
+ .. versionadded:: 3.5.0
2284
+
2285
+ Returns
2286
+ -------
2287
+ set of str
2288
+ the tags that are currently set to be assigned to all the jobs started by this thread.
2289
+
2290
+ See Also
2291
+ --------
2292
+ :meth:`SparkContext.addJobTag`
2293
+ :meth:`SparkContext.removeJobTag`
2294
+ :meth:`SparkContext.clearJobTags`
2295
+ :meth:`SparkContext.cancelJobsWithTag`
2296
+ :meth:`SparkContext.setInterruptOnCancel`
2297
+
2298
+ Examples
2299
+ --------
2300
+ >>> sc.addJobTag("job_to_cancel")
2301
+ >>> sc.getJobTags()
2302
+ {'job_to_cancel'}
2303
+ >>> sc.clearJobTags()
2304
+ """
2305
+ return self._jsc.getJobTags()
2306
+
2307
+ def clearJobTags(self) -> None:
2308
+ """
2309
+ Clear the current thread's job tags.
2310
+
2311
+ .. versionadded:: 3.5.0
2312
+
2313
+ See Also
2314
+ --------
2315
+ :meth:`SparkContext.addJobTag`
2316
+ :meth:`SparkContext.removeJobTag`
2317
+ :meth:`SparkContext.getJobTags`
2318
+ :meth:`SparkContext.cancelJobsWithTag`
2319
+ :meth:`SparkContext.setInterruptOnCancel`
2320
+
2321
+ Examples
2322
+ --------
2323
+ >>> sc.addJobTag("job_to_cancel")
2324
+ >>> sc.clearJobTags()
2325
+ >>> sc.getJobTags()
2326
+ set()
2327
+ """
2328
+ self._jsc.clearJobTags()
2329
+
2330
+ def setLocalProperty(self, key: str, value: str) -> None:
2331
+ """
2332
+ Set a local property that affects jobs submitted from this thread, such as the
2333
+ Spark fair scheduler pool.
2334
+
2335
+ .. versionadded:: 1.0.0
2336
+
2337
+ Parameters
2338
+ ----------
2339
+ key : str
2340
+ The key of the local property to set.
2341
+ value : str
2342
+ The value of the local property to set.
2343
+
2344
+ See Also
2345
+ --------
2346
+ :meth:`SparkContext.getLocalProperty`
2347
+
2348
+ Notes
2349
+ -----
2350
+ If you run jobs in parallel, use :class:`pyspark.InheritableThread` for thread
2351
+ local inheritance.
2352
+ """
2353
+ self._jsc.setLocalProperty(key, value)
2354
+
2355
+ def getLocalProperty(self, key: str) -> Optional[str]:
2356
+ """
2357
+ Get a local property set in this thread, or None if it is missing. See
2358
+ :meth:`setLocalProperty`.
2359
+
2360
+ .. versionadded:: 1.0.0
2361
+
2362
+ See Also
2363
+ --------
2364
+ :meth:`SparkContext.setLocalProperty`
2365
+ """
2366
+ return self._jsc.getLocalProperty(key)
2367
+
2368
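A small round-trip sketch for the two property methods, assuming an active ``sc``; the pool name is illustrative and only has an effect when the fair scheduler is actually configured:

    # Route jobs submitted from this thread to a specific fair-scheduler pool.
    sc.setLocalProperty("spark.scheduler.pool", "production")
    print(sc.getLocalProperty("spark.scheduler.pool"))  # 'production'
    print(sc.getLocalProperty("some.unset.key"))        # None for keys never set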
+ def setJobDescription(self, value: str) -> None:
2369
+ """
2370
+ Set a human readable description of the current job.
2371
+
2372
+ .. versionadded:: 2.3.0
2373
+
2374
+ Parameters
2375
+ ----------
2376
+ value : str
2377
+ The job description to set.
2378
+
2379
+ Notes
2380
+ -----
2381
+ If you run jobs in parallel, use :class:`pyspark.InheritableThread` for thread
2382
+ local inheritance.
2383
+ """
2384
+ self._jsc.setJobDescription(value)
2385
+
2386
+ def sparkUser(self) -> str:
2387
+ """
2388
+ Get SPARK_USER for user who is running SparkContext.
2389
+
2390
+ .. versionadded:: 1.0.0
2391
+ """
2392
+ return self._jsc.sc().sparkUser()
2393
+
2394
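A brief sketch combining ``setJobDescription`` and ``sparkUser``, assuming an active ``sc``; the description is free-form text shown next to subsequently triggered jobs in the web UI:

    # Label the next jobs in the web UI, then trigger one with an action.
    sc.setJobDescription("nightly aggregation (example description)")
    _ = sc.parallelize(range(100)).sum()

    # The SPARK_USER / OS account that owns this SparkContext.
    print(sc.sparkUser())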
+ def cancelJobGroup(self, groupId: str) -> None:
2395
+ """
2396
+ Cancel active jobs for the specified group. See :meth:`SparkContext.setJobGroup`
2397
+ for more information.
2398
+
2399
+ .. versionadded:: 1.1.0
2400
+
2401
+ Parameters
2402
+ ----------
2403
+ groupId : str
2404
+ The group ID to cancel the job.
2405
+
2406
+ See Also
2407
+ --------
2408
+ :meth:`SparkContext.setJobGroup`
2409
+ """
2410
+ self._jsc.sc().cancelJobGroup(groupId)
2411
+
2412
+ def cancelJobsWithTag(self, tag: str) -> None:
2413
+ """
2414
+ Cancel active jobs that have the specified tag. See
2415
+ :meth:`SparkContext.addJobTag`.
2416
+
2417
+ .. versionadded:: 3.5.0
2418
+
2419
+ Parameters
2420
+ ----------
2421
+ tag : str
2422
+ The tag to be cancelled. Cannot contain ',' (comma) character.
2423
+
2424
+ See Also
2425
+ --------
2426
+ :meth:`SparkContext.addJobTag`
2427
+ :meth:`SparkContext.removeJobTag`
2428
+ :meth:`SparkContext.getJobTags`
2429
+ :meth:`SparkContext.clearJobTags`
2430
+ :meth:`SparkContext.setInterruptOnCancel`
2431
+ """
2432
+ return self._jsc.cancelJobsWithTag(tag)
2433
+
2434
+ def cancelAllJobs(self) -> None:
2435
+ """
2436
+ Cancel all jobs that have been scheduled or are running.
2437
+
2438
+ .. versionadded:: 1.1.0
2439
+
2440
+ See Also
2441
+ --------
2442
+ :meth:`SparkContext.cancelJobGroup`
2443
+ :meth:`SparkContext.cancelJobsWithTag`
2444
+ :meth:`SparkContext.runJob`
2445
+ """
2446
+ self._jsc.sc().cancelAllJobs()
2447
+
2448
+ def statusTracker(self) -> StatusTracker:
2449
+ """
2450
+ Return :class:`StatusTracker` object
2451
+
2452
+ .. versionadded:: 1.4.0
2453
+ """
2454
+ return StatusTracker(self._jsc.statusTracker())
2455
+
2456
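A quick sketch of inspecting job state through the tracker, assuming an active ``sc``; the exact IDs and statuses depend on what has already run in the session:

    tracker = sc.statusTracker()

    # Run a small job so there is something to inspect.
    _ = sc.parallelize(range(10), 2).count()

    print(tracker.getActiveJobsIds())      # usually [] once the job above has finished
    print(tracker.getActiveStageIds())     # usually []
    job_ids = tracker.getJobIdsForGroup()  # jobs submitted without an explicit group
    if job_ids:
        info = tracker.getJobInfo(job_ids[0])
        print(info.status)                 # e.g. 'SUCCEEDED'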
+ def runJob(
2457
+ self,
2458
+ rdd: RDD[T],
2459
+ partitionFunc: Callable[[Iterable[T]], Iterable[U]],
2460
+ partitions: Optional[Sequence[int]] = None,
2461
+ allowLocal: bool = False,
2462
+ ) -> List[U]:
2463
+ """
2464
+ Executes the given partitionFunc on the specified set of partitions,
2465
+ returning the result as an array of elements.
2466
+
2467
+ If 'partitions' is not specified, this will run over all partitions.
2468
+
2469
+ .. versionadded:: 1.1.0
2470
+
2471
+ Parameters
2472
+ ----------
2473
+ rdd : :class:`RDD`
2474
+ target RDD to run tasks on
2475
+ partitionFunc : function
2476
+ a function to run on each partition of the RDD
2477
+ partitions : list, optional
2478
+ set of partitions to run on; some jobs may not want to compute on all
2479
+ partitions of the target RDD, e.g. for operations like `first`
2480
+ allowLocal : bool, default False
2481
+ this parameter has no effect
2482
+
2483
+ Returns
2484
+ -------
2485
+ list
2486
+ results of specified partitions
2487
+
2488
+ See Also
2489
+ --------
2490
+ :meth:`SparkContext.cancelAllJobs`
2491
+
2492
+ Examples
2493
+ --------
2494
+ >>> myRDD = sc.parallelize(range(6), 3)
2495
+ >>> sc.runJob(myRDD, lambda part: [x * x for x in part])
2496
+ [0, 1, 4, 9, 16, 25]
2497
+
2498
+ >>> myRDD = sc.parallelize(range(6), 3)
2499
+ >>> sc.runJob(myRDD, lambda part: [x * x for x in part], [0, 2], True)
2500
+ [0, 1, 16, 25]
2501
+ """
2502
+ if partitions is None:
2503
+ partitions = list(range(rdd._jrdd.partitions().size()))
2504
+
2505
+ # Implementation note: This is implemented as a mapPartitions followed
2506
+ # by runJob() in order to avoid having to pass a Python lambda into
2507
+ # SparkContext#runJob.
2508
+ mappedRDD = rdd.mapPartitions(partitionFunc)
2509
+ assert self._jvm is not None
2510
+ sock_info = self._jvm.PythonRDD.runJob(self._jsc.sc(), mappedRDD._jrdd, partitions)
2511
+ return list(_load_from_socket(sock_info, mappedRDD._jrdd_deserializer))
2512
+
2513
+ def show_profiles(self) -> None:
2514
+ """Print the profile stats to stdout
2515
+
2516
+ .. versionadded:: 1.2.0
2517
+
2518
+ See Also
2519
+ --------
2520
+ :meth:`SparkContext.dump_profiles`
2521
+ """
2522
+ if self.profiler_collector is not None:
2523
+ self.profiler_collector.show_profiles()
2524
+ else:
2525
+ raise PySparkRuntimeError(
2526
+ error_class="INCORRECT_CONF_FOR_PROFILE",
2527
+ message_parameters={},
2528
+ )
2529
+
2530
+ def dump_profiles(self, path: str) -> None:
2531
+ """Dump the profile stats into directory `path`
2532
+
2533
+ .. versionadded:: 1.2.0
2534
+
2535
+ See Also
2536
+ --------
2537
+ :meth:`SparkContext.show_profiles`
2538
+ """
2539
+ if self.profiler_collector is not None:
2540
+ self.profiler_collector.dump_profiles(path)
2541
+ else:
2542
+ raise PySparkRuntimeError(
2543
+ error_class="INCORRECT_CONF_FOR_PROFILE",
2544
+ message_parameters={},
2545
+ )
2546
+
2547
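Both profiling methods require that Python profiling was enabled when the context was created. A hedged sketch of the intended flow, assuming no other SparkContext is active and that the standard ``spark.python.profile`` switch is used to enable the profiler:

    from pyspark import SparkConf, SparkContext

    # Profiling must be enabled at context-creation time.
    conf = SparkConf().set("spark.python.profile", "true")
    sc2 = SparkContext(master="local[2]", appName="profile-example", conf=conf)

    _ = sc2.parallelize(range(1000)).map(lambda x: x * x).count()

    sc2.show_profiles()                         # print per-RDD cProfile stats to stdout
    sc2.dump_profiles("/tmp/pyspark_profiles")  # write one stats file per profiled RDD
    sc2.stop()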
+ def getConf(self) -> SparkConf:
2548
+ """Return a copy of this SparkContext's configuration :class:`SparkConf`.
2549
+
2550
+ .. versionadded:: 2.1.0
2551
+ """
2552
+ conf = SparkConf()
2553
+ conf.setAll(self._conf.getAll())
2554
+ return conf
2555
+
2556
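A short sketch for ``getConf``, assuming an active ``sc``; note that the returned ``SparkConf`` is a copy, so mutating it does not change the running context:

    conf = sc.getConf()
    print(conf.get("spark.app.name"))  # name of the running application
    print(conf.get("spark.master"))    # e.g. 'local[4]'

    # Changes apply only to the copy; the live context keeps its settings.
    conf.set("spark.app.name", "renamed-copy")
    print(sc.getConf().get("spark.app.name"))  # still the original name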
+ @property
2557
+ def resources(self) -> Dict[str, ResourceInformation]:
2558
+ """
2559
+ Return the resource information of this :class:`SparkContext`.
2560
+ A resource could be a GPU, FPGA, etc.
2561
+
2562
+ .. versionadded:: 3.0.0
2563
+ """
2564
+ resources = {}
2565
+ jresources = self._jsc.resources()
2566
+ for x in jresources:
2567
+ name = jresources[x].name()
2568
+ jaddresses = jresources[x].addresses()
2569
+ addrs = [addr for addr in jaddresses]
2570
+ resources[name] = ResourceInformation(name, addrs)
2571
+ return resources
2572
+
2573
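A small sketch for ``resources``, assuming an active ``sc``; on a plain local setup the mapping is typically empty, and entries only appear when custom resources (GPUs, FPGAs, ...) have been configured for the context:

    # Inspect any custom resources known to this context.
    for name, info in sc.resources.items():
        print(name, info.addresses)  # ResourceInformation exposes name and addresses

    if not sc.resources:
        print("no custom resources configured")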
+ @staticmethod
2574
+ def _assert_on_driver() -> None:
2575
+ """
2576
+ Called to ensure that SparkContext is created only on the Driver.
2577
+
2578
+ Throws an exception if a SparkContext is about to be created in executors.
2579
+ """
2580
+ if TaskContext.get() is not None:
2581
+ raise PySparkRuntimeError(
2582
+ error_class="CONTEXT_ONLY_VALID_ON_DRIVER",
2583
+ message_parameters={},
2584
+ )
2585
+
2586
+
2587
+ def _test() -> None:
2588
+ import doctest
2589
+ from pyspark import SparkConf
2590
+
2591
+ globs = globals().copy()
2592
+ conf = SparkConf().set("spark.ui.enabled", "True")
2593
+ globs["sc"] = SparkContext("local[4]", "context tests", conf=conf)
2594
+ (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
2595
+ globs["sc"].stop()
2596
+ if failure_count:
2597
+ sys.exit(-1)
2598
+
2599
+
2600
+ if __name__ == "__main__":
2601
+ _test()