snowpark-connect 0.20.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of snowpark-connect might be problematic. Click here for more details.

Files changed (879):
  1. snowflake/snowpark_connect/__init__.py +23 -0
  2. snowflake/snowpark_connect/analyze_plan/__init__.py +3 -0
  3. snowflake/snowpark_connect/analyze_plan/map_tree_string.py +38 -0
  4. snowflake/snowpark_connect/column_name_handler.py +735 -0
  5. snowflake/snowpark_connect/config.py +576 -0
  6. snowflake/snowpark_connect/constants.py +47 -0
  7. snowflake/snowpark_connect/control_server.py +52 -0
  8. snowflake/snowpark_connect/dataframe_name_handler.py +54 -0
  9. snowflake/snowpark_connect/date_time_format_mapping.py +399 -0
  10. snowflake/snowpark_connect/empty_dataframe.py +18 -0
  11. snowflake/snowpark_connect/error/__init__.py +11 -0
  12. snowflake/snowpark_connect/error/error_mapping.py +6174 -0
  13. snowflake/snowpark_connect/error/error_utils.py +321 -0
  14. snowflake/snowpark_connect/error/exceptions.py +24 -0
  15. snowflake/snowpark_connect/execute_plan/__init__.py +3 -0
  16. snowflake/snowpark_connect/execute_plan/map_execution_command.py +204 -0
  17. snowflake/snowpark_connect/execute_plan/map_execution_root.py +173 -0
  18. snowflake/snowpark_connect/execute_plan/utils.py +183 -0
  19. snowflake/snowpark_connect/expression/__init__.py +3 -0
  20. snowflake/snowpark_connect/expression/literal.py +90 -0
  21. snowflake/snowpark_connect/expression/map_cast.py +343 -0
  22. snowflake/snowpark_connect/expression/map_expression.py +293 -0
  23. snowflake/snowpark_connect/expression/map_extension.py +104 -0
  24. snowflake/snowpark_connect/expression/map_sql_expression.py +633 -0
  25. snowflake/snowpark_connect/expression/map_udf.py +142 -0
  26. snowflake/snowpark_connect/expression/map_unresolved_attribute.py +241 -0
  27. snowflake/snowpark_connect/expression/map_unresolved_extract_value.py +85 -0
  28. snowflake/snowpark_connect/expression/map_unresolved_function.py +9450 -0
  29. snowflake/snowpark_connect/expression/map_unresolved_star.py +218 -0
  30. snowflake/snowpark_connect/expression/map_update_fields.py +164 -0
  31. snowflake/snowpark_connect/expression/map_window_function.py +258 -0
  32. snowflake/snowpark_connect/expression/typer.py +125 -0
  33. snowflake/snowpark_connect/includes/__init__.py +0 -0
  34. snowflake/snowpark_connect/includes/jars/antlr4-runtime-4.9.3.jar +0 -0
  35. snowflake/snowpark_connect/includes/jars/commons-cli-1.5.0.jar +0 -0
  36. snowflake/snowpark_connect/includes/jars/commons-codec-1.16.1.jar +0 -0
  37. snowflake/snowpark_connect/includes/jars/commons-collections-3.2.2.jar +0 -0
  38. snowflake/snowpark_connect/includes/jars/commons-collections4-4.4.jar +0 -0
  39. snowflake/snowpark_connect/includes/jars/commons-compiler-3.1.9.jar +0 -0
  40. snowflake/snowpark_connect/includes/jars/commons-compress-1.26.0.jar +0 -0
  41. snowflake/snowpark_connect/includes/jars/commons-crypto-1.1.0.jar +0 -0
  42. snowflake/snowpark_connect/includes/jars/commons-dbcp-1.4.jar +0 -0
  43. snowflake/snowpark_connect/includes/jars/commons-io-2.16.1.jar +0 -0
  44. snowflake/snowpark_connect/includes/jars/commons-lang-2.6.jar +0 -0
  45. snowflake/snowpark_connect/includes/jars/commons-lang3-3.12.0.jar +0 -0
  46. snowflake/snowpark_connect/includes/jars/commons-logging-1.1.3.jar +0 -0
  47. snowflake/snowpark_connect/includes/jars/commons-math3-3.6.1.jar +0 -0
  48. snowflake/snowpark_connect/includes/jars/commons-pool-1.5.4.jar +0 -0
  49. snowflake/snowpark_connect/includes/jars/commons-text-1.10.0.jar +0 -0
  50. snowflake/snowpark_connect/includes/jars/hadoop-client-api-3.3.4.jar +0 -0
  51. snowflake/snowpark_connect/includes/jars/jackson-annotations-2.15.2.jar +0 -0
  52. snowflake/snowpark_connect/includes/jars/jackson-core-2.15.2.jar +0 -0
  53. snowflake/snowpark_connect/includes/jars/jackson-core-asl-1.9.13.jar +0 -0
  54. snowflake/snowpark_connect/includes/jars/jackson-databind-2.15.2.jar +0 -0
  55. snowflake/snowpark_connect/includes/jars/jackson-dataformat-yaml-2.15.2.jar +0 -0
  56. snowflake/snowpark_connect/includes/jars/jackson-datatype-jsr310-2.15.2.jar +0 -0
  57. snowflake/snowpark_connect/includes/jars/jackson-mapper-asl-1.9.13.jar +0 -0
  58. snowflake/snowpark_connect/includes/jars/jackson-module-scala_2.12-2.15.2.jar +0 -0
  59. snowflake/snowpark_connect/includes/jars/json4s-ast_2.12-3.7.0-M11.jar +0 -0
  60. snowflake/snowpark_connect/includes/jars/json4s-core_2.12-3.7.0-M11.jar +0 -0
  61. snowflake/snowpark_connect/includes/jars/json4s-jackson_2.12-3.7.0-M11.jar +0 -0
  62. snowflake/snowpark_connect/includes/jars/json4s-scalap_2.12-3.7.0-M11.jar +0 -0
  63. snowflake/snowpark_connect/includes/jars/kryo-shaded-4.0.2.jar +0 -0
  64. snowflake/snowpark_connect/includes/jars/log4j-1.2-api-2.20.0.jar +0 -0
  65. snowflake/snowpark_connect/includes/jars/log4j-api-2.20.0.jar +0 -0
  66. snowflake/snowpark_connect/includes/jars/log4j-core-2.20.0.jar +0 -0
  67. snowflake/snowpark_connect/includes/jars/log4j-slf4j2-impl-2.20.0.jar +0 -0
  68. snowflake/snowpark_connect/includes/jars/paranamer-2.8.jar +0 -0
  69. snowflake/snowpark_connect/includes/jars/scala-collection-compat_2.12-2.7.0.jar +0 -0
  70. snowflake/snowpark_connect/includes/jars/scala-compiler-2.12.18.jar +0 -0
  71. snowflake/snowpark_connect/includes/jars/scala-library-2.12.18.jar +0 -0
  72. snowflake/snowpark_connect/includes/jars/scala-parser-combinators_2.12-2.3.0.jar +0 -0
  73. snowflake/snowpark_connect/includes/jars/scala-reflect-2.12.18.jar +0 -0
  74. snowflake/snowpark_connect/includes/jars/scala-xml_2.12-2.1.0.jar +0 -0
  75. snowflake/snowpark_connect/includes/jars/slf4j-api-2.0.7.jar +0 -0
  76. snowflake/snowpark_connect/includes/jars/spark-catalyst_2.12-3.5.6.jar +0 -0
  77. snowflake/snowpark_connect/includes/jars/spark-common-utils_2.12-3.5.6.jar +0 -0
  78. snowflake/snowpark_connect/includes/jars/spark-core_2.12-3.5.6.jar +0 -0
  79. snowflake/snowpark_connect/includes/jars/spark-graphx_2.12-3.5.6.jar +0 -0
  80. snowflake/snowpark_connect/includes/jars/spark-hive-thriftserver_2.12-3.5.6.jar +0 -0
  81. snowflake/snowpark_connect/includes/jars/spark-hive_2.12-3.5.6.jar +0 -0
  82. snowflake/snowpark_connect/includes/jars/spark-kubernetes_2.12-3.5.6.jar +0 -0
  83. snowflake/snowpark_connect/includes/jars/spark-kvstore_2.12-3.5.6.jar +0 -0
  84. snowflake/snowpark_connect/includes/jars/spark-launcher_2.12-3.5.6.jar +0 -0
  85. snowflake/snowpark_connect/includes/jars/spark-mesos_2.12-3.5.6.jar +0 -0
  86. snowflake/snowpark_connect/includes/jars/spark-mllib-local_2.12-3.5.6.jar +0 -0
  87. snowflake/snowpark_connect/includes/jars/spark-mllib_2.12-3.5.6.jar +0 -0
  88. snowflake/snowpark_connect/includes/jars/spark-network-common_2.12-3.5.6.jar +0 -0
  89. snowflake/snowpark_connect/includes/jars/spark-network-shuffle_2.12-3.5.6.jar +0 -0
  90. snowflake/snowpark_connect/includes/jars/spark-repl_2.12-3.5.6.jar +0 -0
  91. snowflake/snowpark_connect/includes/jars/spark-sketch_2.12-3.5.6.jar +0 -0
  92. snowflake/snowpark_connect/includes/jars/spark-sql-api_2.12-3.5.6.jar +0 -0
  93. snowflake/snowpark_connect/includes/jars/spark-sql_2.12-3.5.6.jar +0 -0
  94. snowflake/snowpark_connect/includes/jars/spark-streaming_2.12-3.5.6.jar +0 -0
  95. snowflake/snowpark_connect/includes/jars/spark-tags_2.12-3.5.6.jar +0 -0
  96. snowflake/snowpark_connect/includes/jars/spark-unsafe_2.12-3.5.6.jar +0 -0
  97. snowflake/snowpark_connect/includes/jars/spark-yarn_2.12-3.5.6.jar +0 -0
  98. snowflake/snowpark_connect/includes/python/__init__.py +21 -0
  99. snowflake/snowpark_connect/includes/python/pyspark/__init__.py +173 -0
  100. snowflake/snowpark_connect/includes/python/pyspark/_globals.py +71 -0
  101. snowflake/snowpark_connect/includes/python/pyspark/_typing.pyi +43 -0
  102. snowflake/snowpark_connect/includes/python/pyspark/accumulators.py +341 -0
  103. snowflake/snowpark_connect/includes/python/pyspark/broadcast.py +383 -0
  104. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/__init__.py +8 -0
  105. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/cloudpickle.py +948 -0
  106. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/cloudpickle_fast.py +844 -0
  107. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/compat.py +18 -0
  108. snowflake/snowpark_connect/includes/python/pyspark/conf.py +276 -0
  109. snowflake/snowpark_connect/includes/python/pyspark/context.py +2601 -0
  110. snowflake/snowpark_connect/includes/python/pyspark/daemon.py +218 -0
  111. snowflake/snowpark_connect/includes/python/pyspark/errors/__init__.py +70 -0
  112. snowflake/snowpark_connect/includes/python/pyspark/errors/error_classes.py +889 -0
  113. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/__init__.py +16 -0
  114. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/base.py +228 -0
  115. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/captured.py +307 -0
  116. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/connect.py +190 -0
  117. snowflake/snowpark_connect/includes/python/pyspark/errors/tests/__init__.py +16 -0
  118. snowflake/snowpark_connect/includes/python/pyspark/errors/tests/test_errors.py +60 -0
  119. snowflake/snowpark_connect/includes/python/pyspark/errors/utils.py +116 -0
  120. snowflake/snowpark_connect/includes/python/pyspark/files.py +165 -0
  121. snowflake/snowpark_connect/includes/python/pyspark/find_spark_home.py +95 -0
  122. snowflake/snowpark_connect/includes/python/pyspark/install.py +203 -0
  123. snowflake/snowpark_connect/includes/python/pyspark/instrumentation_utils.py +190 -0
  124. snowflake/snowpark_connect/includes/python/pyspark/java_gateway.py +248 -0
  125. snowflake/snowpark_connect/includes/python/pyspark/join.py +118 -0
  126. snowflake/snowpark_connect/includes/python/pyspark/ml/__init__.py +71 -0
  127. snowflake/snowpark_connect/includes/python/pyspark/ml/_typing.pyi +84 -0
  128. snowflake/snowpark_connect/includes/python/pyspark/ml/base.py +414 -0
  129. snowflake/snowpark_connect/includes/python/pyspark/ml/classification.py +4332 -0
  130. snowflake/snowpark_connect/includes/python/pyspark/ml/clustering.py +2188 -0
  131. snowflake/snowpark_connect/includes/python/pyspark/ml/common.py +146 -0
  132. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/__init__.py +44 -0
  133. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/base.py +346 -0
  134. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/classification.py +382 -0
  135. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/evaluation.py +291 -0
  136. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/feature.py +258 -0
  137. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/functions.py +77 -0
  138. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/io_utils.py +335 -0
  139. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/pipeline.py +262 -0
  140. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/summarizer.py +120 -0
  141. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/tuning.py +579 -0
  142. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/util.py +173 -0
  143. snowflake/snowpark_connect/includes/python/pyspark/ml/deepspeed/__init__.py +16 -0
  144. snowflake/snowpark_connect/includes/python/pyspark/ml/deepspeed/deepspeed_distributor.py +165 -0
  145. snowflake/snowpark_connect/includes/python/pyspark/ml/deepspeed/tests/test_deepspeed_distributor.py +306 -0
  146. snowflake/snowpark_connect/includes/python/pyspark/ml/dl_util.py +150 -0
  147. snowflake/snowpark_connect/includes/python/pyspark/ml/evaluation.py +1166 -0
  148. snowflake/snowpark_connect/includes/python/pyspark/ml/feature.py +7474 -0
  149. snowflake/snowpark_connect/includes/python/pyspark/ml/fpm.py +543 -0
  150. snowflake/snowpark_connect/includes/python/pyspark/ml/functions.py +842 -0
  151. snowflake/snowpark_connect/includes/python/pyspark/ml/image.py +271 -0
  152. snowflake/snowpark_connect/includes/python/pyspark/ml/linalg/__init__.py +1382 -0
  153. snowflake/snowpark_connect/includes/python/pyspark/ml/model_cache.py +55 -0
  154. snowflake/snowpark_connect/includes/python/pyspark/ml/param/__init__.py +602 -0
  155. snowflake/snowpark_connect/includes/python/pyspark/ml/param/_shared_params_code_gen.py +368 -0
  156. snowflake/snowpark_connect/includes/python/pyspark/ml/param/shared.py +878 -0
  157. snowflake/snowpark_connect/includes/python/pyspark/ml/pipeline.py +451 -0
  158. snowflake/snowpark_connect/includes/python/pyspark/ml/recommendation.py +748 -0
  159. snowflake/snowpark_connect/includes/python/pyspark/ml/regression.py +3335 -0
  160. snowflake/snowpark_connect/includes/python/pyspark/ml/stat.py +523 -0
  161. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/__init__.py +16 -0
  162. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_classification.py +53 -0
  163. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_evaluation.py +50 -0
  164. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_feature.py +43 -0
  165. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_function.py +114 -0
  166. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_pipeline.py +47 -0
  167. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_summarizer.py +43 -0
  168. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_tuning.py +46 -0
  169. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_classification.py +238 -0
  170. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_evaluation.py +194 -0
  171. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_feature.py +156 -0
  172. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_pipeline.py +184 -0
  173. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_summarizer.py +78 -0
  174. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_tuning.py +292 -0
  175. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_parity_torch_data_loader.py +50 -0
  176. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_parity_torch_distributor.py +152 -0
  177. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_algorithms.py +456 -0
  178. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_base.py +96 -0
  179. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_dl_util.py +186 -0
  180. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_evaluation.py +77 -0
  181. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_feature.py +401 -0
  182. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_functions.py +528 -0
  183. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_image.py +82 -0
  184. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_linalg.py +409 -0
  185. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_model_cache.py +55 -0
  186. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_param.py +441 -0
  187. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_persistence.py +546 -0
  188. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_pipeline.py +71 -0
  189. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_stat.py +52 -0
  190. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_training_summary.py +494 -0
  191. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_util.py +85 -0
  192. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_wrapper.py +138 -0
  193. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/__init__.py +16 -0
  194. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_basic.py +151 -0
  195. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_nested.py +97 -0
  196. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_pipeline.py +143 -0
  197. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tuning.py +551 -0
  198. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_basic.py +137 -0
  199. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_nested.py +96 -0
  200. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_pipeline.py +142 -0
  201. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/__init__.py +16 -0
  202. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/data.py +100 -0
  203. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/distributor.py +1133 -0
  204. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/log_communication.py +198 -0
  205. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/__init__.py +16 -0
  206. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_data_loader.py +137 -0
  207. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_distributor.py +561 -0
  208. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_log_communication.py +172 -0
  209. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/torch_run_process_wrapper.py +83 -0
  210. snowflake/snowpark_connect/includes/python/pyspark/ml/tree.py +434 -0
  211. snowflake/snowpark_connect/includes/python/pyspark/ml/tuning.py +1741 -0
  212. snowflake/snowpark_connect/includes/python/pyspark/ml/util.py +749 -0
  213. snowflake/snowpark_connect/includes/python/pyspark/ml/wrapper.py +465 -0
  214. snowflake/snowpark_connect/includes/python/pyspark/mllib/__init__.py +44 -0
  215. snowflake/snowpark_connect/includes/python/pyspark/mllib/_typing.pyi +33 -0
  216. snowflake/snowpark_connect/includes/python/pyspark/mllib/classification.py +989 -0
  217. snowflake/snowpark_connect/includes/python/pyspark/mllib/clustering.py +1318 -0
  218. snowflake/snowpark_connect/includes/python/pyspark/mllib/common.py +174 -0
  219. snowflake/snowpark_connect/includes/python/pyspark/mllib/evaluation.py +691 -0
  220. snowflake/snowpark_connect/includes/python/pyspark/mllib/feature.py +1085 -0
  221. snowflake/snowpark_connect/includes/python/pyspark/mllib/fpm.py +233 -0
  222. snowflake/snowpark_connect/includes/python/pyspark/mllib/linalg/__init__.py +1653 -0
  223. snowflake/snowpark_connect/includes/python/pyspark/mllib/linalg/distributed.py +1662 -0
  224. snowflake/snowpark_connect/includes/python/pyspark/mllib/random.py +698 -0
  225. snowflake/snowpark_connect/includes/python/pyspark/mllib/recommendation.py +389 -0
  226. snowflake/snowpark_connect/includes/python/pyspark/mllib/regression.py +1067 -0
  227. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/KernelDensity.py +59 -0
  228. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/__init__.py +34 -0
  229. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/_statistics.py +409 -0
  230. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/distribution.py +39 -0
  231. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/test.py +86 -0
  232. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/__init__.py +16 -0
  233. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_algorithms.py +353 -0
  234. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_feature.py +192 -0
  235. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_linalg.py +680 -0
  236. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_stat.py +206 -0
  237. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_streaming_algorithms.py +471 -0
  238. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_util.py +108 -0
  239. snowflake/snowpark_connect/includes/python/pyspark/mllib/tree.py +888 -0
  240. snowflake/snowpark_connect/includes/python/pyspark/mllib/util.py +659 -0
  241. snowflake/snowpark_connect/includes/python/pyspark/pandas/__init__.py +165 -0
  242. snowflake/snowpark_connect/includes/python/pyspark/pandas/_typing.py +52 -0
  243. snowflake/snowpark_connect/includes/python/pyspark/pandas/accessors.py +989 -0
  244. snowflake/snowpark_connect/includes/python/pyspark/pandas/base.py +1804 -0
  245. snowflake/snowpark_connect/includes/python/pyspark/pandas/categorical.py +822 -0
  246. snowflake/snowpark_connect/includes/python/pyspark/pandas/config.py +539 -0
  247. snowflake/snowpark_connect/includes/python/pyspark/pandas/correlation.py +262 -0
  248. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/__init__.py +16 -0
  249. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/base.py +519 -0
  250. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/binary_ops.py +98 -0
  251. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/boolean_ops.py +426 -0
  252. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/categorical_ops.py +141 -0
  253. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/complex_ops.py +145 -0
  254. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/date_ops.py +127 -0
  255. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/datetime_ops.py +171 -0
  256. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/null_ops.py +83 -0
  257. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/num_ops.py +588 -0
  258. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/string_ops.py +154 -0
  259. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/timedelta_ops.py +101 -0
  260. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/udt_ops.py +29 -0
  261. snowflake/snowpark_connect/includes/python/pyspark/pandas/datetimes.py +891 -0
  262. snowflake/snowpark_connect/includes/python/pyspark/pandas/exceptions.py +150 -0
  263. snowflake/snowpark_connect/includes/python/pyspark/pandas/extensions.py +388 -0
  264. snowflake/snowpark_connect/includes/python/pyspark/pandas/frame.py +13738 -0
  265. snowflake/snowpark_connect/includes/python/pyspark/pandas/generic.py +3560 -0
  266. snowflake/snowpark_connect/includes/python/pyspark/pandas/groupby.py +4448 -0
  267. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/__init__.py +21 -0
  268. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/base.py +2783 -0
  269. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/category.py +773 -0
  270. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/datetimes.py +843 -0
  271. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/multi.py +1323 -0
  272. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/numeric.py +210 -0
  273. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/timedelta.py +197 -0
  274. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexing.py +1862 -0
  275. snowflake/snowpark_connect/includes/python/pyspark/pandas/internal.py +1680 -0
  276. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/__init__.py +48 -0
  277. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/common.py +76 -0
  278. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/frame.py +63 -0
  279. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/general_functions.py +43 -0
  280. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/groupby.py +93 -0
  281. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/indexes.py +184 -0
  282. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/resample.py +101 -0
  283. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/scalars.py +29 -0
  284. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/series.py +69 -0
  285. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/window.py +168 -0
  286. snowflake/snowpark_connect/includes/python/pyspark/pandas/mlflow.py +238 -0
  287. snowflake/snowpark_connect/includes/python/pyspark/pandas/namespace.py +3807 -0
  288. snowflake/snowpark_connect/includes/python/pyspark/pandas/numpy_compat.py +260 -0
  289. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/__init__.py +17 -0
  290. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/core.py +1213 -0
  291. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/matplotlib.py +928 -0
  292. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/plotly.py +261 -0
  293. snowflake/snowpark_connect/includes/python/pyspark/pandas/resample.py +816 -0
  294. snowflake/snowpark_connect/includes/python/pyspark/pandas/series.py +7440 -0
  295. snowflake/snowpark_connect/includes/python/pyspark/pandas/sql_formatter.py +308 -0
  296. snowflake/snowpark_connect/includes/python/pyspark/pandas/sql_processor.py +394 -0
  297. snowflake/snowpark_connect/includes/python/pyspark/pandas/strings.py +2371 -0
  298. snowflake/snowpark_connect/includes/python/pyspark/pandas/supported_api_gen.py +378 -0
  299. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/__init__.py +16 -0
  300. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/__init__.py +16 -0
  301. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_any_all.py +177 -0
  302. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_apply_func.py +575 -0
  303. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_binary_ops.py +235 -0
  304. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_combine.py +653 -0
  305. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_compute.py +463 -0
  306. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_corrwith.py +86 -0
  307. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_cov.py +151 -0
  308. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_cumulative.py +139 -0
  309. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_describe.py +458 -0
  310. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_eval.py +86 -0
  311. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_melt.py +202 -0
  312. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_missing_data.py +520 -0
  313. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_pivot.py +361 -0
  314. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/__init__.py +16 -0
  315. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/__init__.py +16 -0
  316. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_any_all.py +40 -0
  317. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_apply_func.py +42 -0
  318. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_binary_ops.py +40 -0
  319. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_combine.py +37 -0
  320. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_compute.py +60 -0
  321. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_corrwith.py +40 -0
  322. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_cov.py +40 -0
  323. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_cumulative.py +90 -0
  324. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_describe.py +40 -0
  325. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_eval.py +40 -0
  326. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_melt.py +40 -0
  327. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_missing_data.py +42 -0
  328. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_pivot.py +37 -0
  329. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/__init__.py +16 -0
  330. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_base.py +36 -0
  331. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_binary_ops.py +42 -0
  332. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_boolean_ops.py +47 -0
  333. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_categorical_ops.py +55 -0
  334. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_complex_ops.py +40 -0
  335. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_date_ops.py +47 -0
  336. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_datetime_ops.py +47 -0
  337. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_null_ops.py +42 -0
  338. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_arithmetic.py +43 -0
  339. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_ops.py +47 -0
  340. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_reverse.py +43 -0
  341. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_string_ops.py +47 -0
  342. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_timedelta_ops.py +47 -0
  343. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_udt_ops.py +40 -0
  344. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/testing_utils.py +226 -0
  345. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/__init__.py +16 -0
  346. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_align.py +39 -0
  347. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_basic_slow.py +55 -0
  348. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_cov_corrwith.py +39 -0
  349. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_dot_frame.py +39 -0
  350. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_dot_series.py +39 -0
  351. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_index.py +39 -0
  352. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_series.py +39 -0
  353. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_setitem_frame.py +43 -0
  354. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_setitem_series.py +43 -0
  355. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/__init__.py +16 -0
  356. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_attrs.py +40 -0
  357. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_constructor.py +39 -0
  358. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_conversion.py +42 -0
  359. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_reindexing.py +42 -0
  360. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_reshaping.py +37 -0
  361. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_spark.py +40 -0
  362. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_take.py +42 -0
  363. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_time_series.py +48 -0
  364. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_truncate.py +40 -0
  365. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/__init__.py +16 -0
  366. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_aggregate.py +40 -0
  367. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_apply_func.py +41 -0
  368. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_cumulative.py +67 -0
  369. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_describe.py +40 -0
  370. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_groupby.py +55 -0
  371. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_head_tail.py +40 -0
  372. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_index.py +38 -0
  373. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_missing_data.py +55 -0
  374. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py +39 -0
  375. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_stat.py +38 -0
  376. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/__init__.py +16 -0
  377. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_align.py +40 -0
  378. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_base.py +50 -0
  379. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_category.py +73 -0
  380. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_datetime.py +39 -0
  381. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_indexing.py +40 -0
  382. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_reindex.py +40 -0
  383. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_rename.py +40 -0
  384. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_reset_index.py +48 -0
  385. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_timedelta.py +39 -0
  386. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/io/__init__.py +16 -0
  387. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/io/test_parity_io.py +40 -0
  388. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/__init__.py +16 -0
  389. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot.py +45 -0
  390. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot_matplotlib.py +45 -0
  391. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot_plotly.py +49 -0
  392. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot.py +37 -0
  393. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot_matplotlib.py +53 -0
  394. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot_plotly.py +45 -0
  395. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/__init__.py +16 -0
  396. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_all_any.py +38 -0
  397. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_arg_ops.py +37 -0
  398. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_as_of.py +37 -0
  399. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_as_type.py +38 -0
  400. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_compute.py +37 -0
  401. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_conversion.py +40 -0
  402. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_cumulative.py +40 -0
  403. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_index.py +38 -0
  404. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_missing_data.py +40 -0
  405. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_series.py +37 -0
  406. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_sort.py +38 -0
  407. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_stat.py +38 -0
  408. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_categorical.py +66 -0
  409. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_config.py +37 -0
  410. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_csv.py +37 -0
  411. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_dataframe_conversion.py +42 -0
  412. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_dataframe_spark_io.py +39 -0
  413. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_default_index.py +49 -0
  414. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ewm.py +37 -0
  415. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_expanding.py +39 -0
  416. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_extension.py +49 -0
  417. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_frame_spark.py +53 -0
  418. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_generic_functions.py +43 -0
  419. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_indexing.py +49 -0
  420. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_indexops_spark.py +39 -0
  421. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_internal.py +41 -0
  422. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_namespace.py +39 -0
  423. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_numpy_compat.py +60 -0
  424. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames.py +48 -0
  425. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby.py +39 -0
  426. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_expanding.py +44 -0
  427. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_rolling.py +84 -0
  428. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_repr.py +37 -0
  429. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_resample.py +45 -0
  430. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_reshape.py +39 -0
  431. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_rolling.py +39 -0
  432. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_scalars.py +37 -0
  433. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_conversion.py +39 -0
  434. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_datetime.py +39 -0
  435. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_string.py +39 -0
  436. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_spark_functions.py +39 -0
  437. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_sql.py +43 -0
  438. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_stats.py +37 -0
  439. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_typedef.py +36 -0
  440. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_utils.py +37 -0
  441. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_window.py +39 -0
  442. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/__init__.py +16 -0
  443. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_base.py +107 -0
  444. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_binary_ops.py +224 -0
  445. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py +825 -0
  446. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_categorical_ops.py +562 -0
  447. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_complex_ops.py +368 -0
  448. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_date_ops.py +257 -0
  449. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_datetime_ops.py +260 -0
  450. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_null_ops.py +178 -0
  451. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_arithmetic.py +184 -0
  452. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py +497 -0
  453. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_reverse.py +140 -0
  454. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_string_ops.py +354 -0
  455. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_timedelta_ops.py +219 -0
  456. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_udt_ops.py +192 -0
  457. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/testing_utils.py +228 -0
  458. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/__init__.py +16 -0
  459. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_align.py +118 -0
  460. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_basic_slow.py +198 -0
  461. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_cov_corrwith.py +181 -0
  462. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_dot_frame.py +103 -0
  463. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_dot_series.py +141 -0
  464. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_index.py +109 -0
  465. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_series.py +136 -0
  466. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_setitem_frame.py +125 -0
  467. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_setitem_series.py +217 -0
  468. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/__init__.py +16 -0
  469. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_attrs.py +384 -0
  470. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_constructor.py +598 -0
  471. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_conversion.py +73 -0
  472. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_reindexing.py +869 -0
  473. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_reshaping.py +487 -0
  474. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_spark.py +309 -0
  475. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_take.py +156 -0
  476. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_time_series.py +149 -0
  477. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_truncate.py +163 -0
  478. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/__init__.py +16 -0
  479. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_aggregate.py +311 -0
  480. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_apply_func.py +524 -0
  481. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_cumulative.py +419 -0
  482. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_describe.py +144 -0
  483. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_groupby.py +979 -0
  484. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_head_tail.py +234 -0
  485. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_index.py +206 -0
  486. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_missing_data.py +421 -0
  487. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_split_apply.py +187 -0
  488. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_stat.py +397 -0
  489. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/__init__.py +16 -0
  490. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_align.py +100 -0
  491. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_base.py +2743 -0
  492. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_category.py +484 -0
  493. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_datetime.py +276 -0
  494. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_indexing.py +432 -0
  495. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_reindex.py +310 -0
  496. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_rename.py +257 -0
  497. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_reset_index.py +160 -0
  498. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_timedelta.py +128 -0
  499. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/io/__init__.py +16 -0
  500. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/io/test_io.py +137 -0
  501. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/__init__.py +16 -0
  502. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot.py +170 -0
  503. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot_matplotlib.py +547 -0
  504. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot_plotly.py +285 -0
  505. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot.py +106 -0
  506. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot_matplotlib.py +409 -0
  507. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot_plotly.py +247 -0
  508. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/__init__.py +16 -0
  509. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_all_any.py +105 -0
  510. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_arg_ops.py +197 -0
  511. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_as_of.py +137 -0
  512. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_as_type.py +227 -0
  513. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_compute.py +634 -0
  514. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_conversion.py +88 -0
  515. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_cumulative.py +139 -0
  516. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_index.py +475 -0
  517. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_missing_data.py +265 -0
  518. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_series.py +818 -0
  519. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_sort.py +162 -0
  520. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_stat.py +780 -0
  521. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_categorical.py +741 -0
  522. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_config.py +160 -0
  523. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_csv.py +453 -0
  524. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_dataframe_conversion.py +281 -0
  525. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_dataframe_spark_io.py +487 -0
  526. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_default_index.py +109 -0
  527. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ewm.py +434 -0
  528. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_expanding.py +253 -0
  529. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_extension.py +152 -0
  530. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_frame_spark.py +162 -0
  531. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_generic_functions.py +234 -0
  532. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_indexing.py +1339 -0
  533. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_indexops_spark.py +82 -0
  534. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_internal.py +124 -0
  535. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_namespace.py +638 -0
  536. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_numpy_compat.py +200 -0
  537. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames.py +1355 -0
  538. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py +655 -0
  539. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_expanding.py +113 -0
  540. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_rolling.py +118 -0
  541. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_repr.py +192 -0
  542. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_resample.py +346 -0
  543. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_reshape.py +495 -0
  544. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_rolling.py +263 -0
  545. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_scalars.py +59 -0
  546. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_conversion.py +85 -0
  547. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_datetime.py +364 -0
  548. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_string.py +362 -0
  549. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_spark_functions.py +46 -0
  550. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_sql.py +123 -0
  551. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_stats.py +581 -0
  552. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_typedef.py +447 -0
  553. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_utils.py +301 -0
  554. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_window.py +465 -0
  555. snowflake/snowpark_connect/includes/python/pyspark/pandas/typedef/__init__.py +18 -0
  556. snowflake/snowpark_connect/includes/python/pyspark/pandas/typedef/typehints.py +874 -0
  557. snowflake/snowpark_connect/includes/python/pyspark/pandas/usage_logging/__init__.py +143 -0
  558. snowflake/snowpark_connect/includes/python/pyspark/pandas/usage_logging/usage_logger.py +132 -0
  559. snowflake/snowpark_connect/includes/python/pyspark/pandas/utils.py +1063 -0
  560. snowflake/snowpark_connect/includes/python/pyspark/pandas/window.py +2702 -0
  561. snowflake/snowpark_connect/includes/python/pyspark/profiler.py +489 -0
  562. snowflake/snowpark_connect/includes/python/pyspark/py.typed +1 -0
  563. snowflake/snowpark_connect/includes/python/pyspark/python/pyspark/shell.py +123 -0
  564. snowflake/snowpark_connect/includes/python/pyspark/rdd.py +5518 -0
  565. snowflake/snowpark_connect/includes/python/pyspark/rddsampler.py +115 -0
  566. snowflake/snowpark_connect/includes/python/pyspark/resource/__init__.py +38 -0
  567. snowflake/snowpark_connect/includes/python/pyspark/resource/information.py +69 -0
  568. snowflake/snowpark_connect/includes/python/pyspark/resource/profile.py +317 -0
  569. snowflake/snowpark_connect/includes/python/pyspark/resource/requests.py +539 -0
  570. snowflake/snowpark_connect/includes/python/pyspark/resource/tests/__init__.py +16 -0
  571. snowflake/snowpark_connect/includes/python/pyspark/resource/tests/test_resources.py +83 -0
  572. snowflake/snowpark_connect/includes/python/pyspark/resultiterable.py +45 -0
  573. snowflake/snowpark_connect/includes/python/pyspark/serializers.py +681 -0
  574. snowflake/snowpark_connect/includes/python/pyspark/shell.py +123 -0
  575. snowflake/snowpark_connect/includes/python/pyspark/shuffle.py +854 -0
  576. snowflake/snowpark_connect/includes/python/pyspark/sql/__init__.py +75 -0
  577. snowflake/snowpark_connect/includes/python/pyspark/sql/_typing.pyi +80 -0
  578. snowflake/snowpark_connect/includes/python/pyspark/sql/avro/__init__.py +18 -0
  579. snowflake/snowpark_connect/includes/python/pyspark/sql/avro/functions.py +188 -0
  580. snowflake/snowpark_connect/includes/python/pyspark/sql/catalog.py +1270 -0
  581. snowflake/snowpark_connect/includes/python/pyspark/sql/column.py +1431 -0
  582. snowflake/snowpark_connect/includes/python/pyspark/sql/conf.py +99 -0
  583. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/__init__.py +18 -0
  584. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/_typing.py +90 -0
  585. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/avro/__init__.py +18 -0
  586. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/avro/functions.py +107 -0
  587. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/catalog.py +356 -0
  588. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/__init__.py +22 -0
  589. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/artifact.py +412 -0
  590. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/core.py +1689 -0
  591. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/reattach.py +340 -0
  592. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/column.py +514 -0
  593. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/conf.py +128 -0
  594. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/conversion.py +490 -0
  595. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/dataframe.py +2172 -0
  596. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/expressions.py +1056 -0
  597. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/functions.py +3937 -0
  598. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/group.py +418 -0
  599. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/plan.py +2289 -0
  600. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/__init__.py +25 -0
  601. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/base_pb2.py +203 -0
  602. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/base_pb2.pyi +2718 -0
  603. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/base_pb2_grpc.py +423 -0
  604. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/catalog_pb2.py +109 -0
  605. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/catalog_pb2.pyi +1130 -0
  606. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/commands_pb2.py +141 -0
  607. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/commands_pb2.pyi +1766 -0
  608. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/common_pb2.py +47 -0
  609. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/common_pb2.pyi +123 -0
  610. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/example_plugins_pb2.py +53 -0
  611. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/example_plugins_pb2.pyi +112 -0
  612. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/expressions_pb2.py +107 -0
  613. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/expressions_pb2.pyi +1507 -0
  614. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/relations_pb2.py +195 -0
  615. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/relations_pb2.pyi +3613 -0
  616. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/types_pb2.py +95 -0
  617. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/types_pb2.pyi +980 -0
  618. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/protobuf/__init__.py +18 -0
  619. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/protobuf/functions.py +166 -0
  620. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/readwriter.py +861 -0
  621. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/session.py +952 -0
  622. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/__init__.py +22 -0
  623. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/query.py +295 -0
  624. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/readwriter.py +618 -0
  625. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/__init__.py +18 -0
  626. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/foreach_batch_worker.py +87 -0
  627. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/listener_worker.py +100 -0
  628. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/types.py +301 -0
  629. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/udf.py +296 -0
  630. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/udtf.py +200 -0
  631. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/utils.py +58 -0
  632. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/window.py +266 -0
  633. snowflake/snowpark_connect/includes/python/pyspark/sql/context.py +818 -0
  634. snowflake/snowpark_connect/includes/python/pyspark/sql/dataframe.py +5973 -0
  635. snowflake/snowpark_connect/includes/python/pyspark/sql/functions.py +15889 -0
  636. snowflake/snowpark_connect/includes/python/pyspark/sql/group.py +547 -0
  637. snowflake/snowpark_connect/includes/python/pyspark/sql/observation.py +152 -0
  638. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/__init__.py +21 -0
  639. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/__init__.pyi +344 -0
  640. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/protocols/__init__.pyi +17 -0
  641. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/protocols/frame.pyi +20 -0
  642. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/protocols/series.pyi +20 -0
  643. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/conversion.py +671 -0
  644. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/functions.py +480 -0
  645. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/functions.pyi +132 -0
  646. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/group_ops.py +523 -0
  647. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/map_ops.py +216 -0
  648. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/serializers.py +1019 -0
  649. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/typehints.py +172 -0
  650. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/types.py +972 -0
  651. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/utils.py +86 -0
  652. snowflake/snowpark_connect/includes/python/pyspark/sql/protobuf/__init__.py +18 -0
  653. snowflake/snowpark_connect/includes/python/pyspark/sql/protobuf/functions.py +334 -0
  654. snowflake/snowpark_connect/includes/python/pyspark/sql/readwriter.py +2159 -0
  655. snowflake/snowpark_connect/includes/python/pyspark/sql/session.py +2088 -0
  656. snowflake/snowpark_connect/includes/python/pyspark/sql/sql_formatter.py +84 -0
  657. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/__init__.py +21 -0
  658. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/listener.py +1050 -0
  659. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/query.py +746 -0
  660. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/readwriter.py +1652 -0
  661. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/state.py +288 -0
  662. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/__init__.py +16 -0
  663. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/__init__.py +16 -0
  664. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/__init__.py +16 -0
  665. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/test_artifact.py +420 -0
  666. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/test_client.py +358 -0
  667. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/__init__.py +16 -0
  668. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_foreach.py +36 -0
  669. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_foreach_batch.py +44 -0
  670. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_listener.py +116 -0
  671. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_streaming.py +35 -0
  672. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_basic.py +3612 -0
  673. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_column.py +1042 -0
  674. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_function.py +2381 -0
  675. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_plan.py +1060 -0
  676. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow.py +163 -0
  677. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow_map.py +38 -0
  678. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow_python_udf.py +48 -0
  679. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_catalog.py +36 -0
  680. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_column.py +55 -0
  681. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_conf.py +36 -0
  682. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_dataframe.py +96 -0
  683. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_datasources.py +44 -0
  684. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_errors.py +36 -0
  685. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_functions.py +59 -0
  686. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_group.py +36 -0
  687. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_cogrouped_map.py +59 -0
  688. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_grouped_map.py +74 -0
  689. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_grouped_map_with_state.py +62 -0
  690. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_map.py +58 -0
  691. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf.py +70 -0
  692. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_grouped_agg.py +50 -0
  693. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_scalar.py +68 -0
  694. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_window.py +40 -0
  695. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_readwriter.py +46 -0
  696. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_serde.py +44 -0
  697. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_types.py +100 -0
  698. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_udf.py +100 -0
  699. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_udtf.py +163 -0
  700. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_session.py +181 -0
  701. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_utils.py +42 -0
  702. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/__init__.py +16 -0
  703. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_cogrouped_map.py +623 -0
  704. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_grouped_map.py +869 -0
  705. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_grouped_map_with_state.py +342 -0
  706. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_map.py +436 -0
  707. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf.py +363 -0
  708. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_grouped_agg.py +592 -0
  709. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_scalar.py +1503 -0
  710. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_typehints.py +392 -0
  711. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_typehints_with_future_annotations.py +375 -0
  712. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_window.py +411 -0
  713. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/__init__.py +16 -0
  714. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming.py +401 -0
  715. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_foreach.py +295 -0
  716. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_foreach_batch.py +106 -0
  717. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_listener.py +558 -0
  718. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow.py +1346 -0
  719. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow_map.py +182 -0
  720. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow_python_udf.py +202 -0
  721. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_catalog.py +503 -0
  722. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_column.py +225 -0
  723. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_conf.py +83 -0
  724. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_context.py +201 -0
  725. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_dataframe.py +1931 -0
  726. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_datasources.py +256 -0
  727. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_errors.py +69 -0
  728. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_functions.py +1349 -0
  729. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_group.py +53 -0
  730. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_pandas_sqlmetrics.py +68 -0
  731. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_readwriter.py +283 -0
  732. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_serde.py +155 -0
  733. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_session.py +412 -0
  734. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_types.py +1581 -0
  735. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udf.py +961 -0
  736. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udf_profiler.py +165 -0
  737. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udtf.py +1456 -0
  738. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_utils.py +1686 -0
  739. snowflake/snowpark_connect/includes/python/pyspark/sql/types.py +2558 -0
  740. snowflake/snowpark_connect/includes/python/pyspark/sql/udf.py +714 -0
  741. snowflake/snowpark_connect/includes/python/pyspark/sql/udtf.py +325 -0
  742. snowflake/snowpark_connect/includes/python/pyspark/sql/utils.py +339 -0
  743. snowflake/snowpark_connect/includes/python/pyspark/sql/window.py +492 -0
  744. snowflake/snowpark_connect/includes/python/pyspark/statcounter.py +165 -0
  745. snowflake/snowpark_connect/includes/python/pyspark/status.py +112 -0
  746. snowflake/snowpark_connect/includes/python/pyspark/storagelevel.py +97 -0
  747. snowflake/snowpark_connect/includes/python/pyspark/streaming/__init__.py +22 -0
  748. snowflake/snowpark_connect/includes/python/pyspark/streaming/context.py +471 -0
  749. snowflake/snowpark_connect/includes/python/pyspark/streaming/dstream.py +933 -0
  750. snowflake/snowpark_connect/includes/python/pyspark/streaming/kinesis.py +205 -0
  751. snowflake/snowpark_connect/includes/python/pyspark/streaming/listener.py +83 -0
  752. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/__init__.py +16 -0
  753. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_context.py +184 -0
  754. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_dstream.py +706 -0
  755. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_kinesis.py +118 -0
  756. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_listener.py +160 -0
  757. snowflake/snowpark_connect/includes/python/pyspark/streaming/util.py +168 -0
  758. snowflake/snowpark_connect/includes/python/pyspark/taskcontext.py +502 -0
  759. snowflake/snowpark_connect/includes/python/pyspark/testing/__init__.py +21 -0
  760. snowflake/snowpark_connect/includes/python/pyspark/testing/connectutils.py +199 -0
  761. snowflake/snowpark_connect/includes/python/pyspark/testing/mllibutils.py +30 -0
  762. snowflake/snowpark_connect/includes/python/pyspark/testing/mlutils.py +275 -0
  763. snowflake/snowpark_connect/includes/python/pyspark/testing/objects.py +121 -0
  764. snowflake/snowpark_connect/includes/python/pyspark/testing/pandasutils.py +714 -0
  765. snowflake/snowpark_connect/includes/python/pyspark/testing/sqlutils.py +168 -0
  766. snowflake/snowpark_connect/includes/python/pyspark/testing/streamingutils.py +178 -0
  767. snowflake/snowpark_connect/includes/python/pyspark/testing/utils.py +636 -0
  768. snowflake/snowpark_connect/includes/python/pyspark/tests/__init__.py +16 -0
  769. snowflake/snowpark_connect/includes/python/pyspark/tests/test_appsubmit.py +306 -0
  770. snowflake/snowpark_connect/includes/python/pyspark/tests/test_broadcast.py +196 -0
  771. snowflake/snowpark_connect/includes/python/pyspark/tests/test_conf.py +44 -0
  772. snowflake/snowpark_connect/includes/python/pyspark/tests/test_context.py +346 -0
  773. snowflake/snowpark_connect/includes/python/pyspark/tests/test_daemon.py +89 -0
  774. snowflake/snowpark_connect/includes/python/pyspark/tests/test_install_spark.py +124 -0
  775. snowflake/snowpark_connect/includes/python/pyspark/tests/test_join.py +69 -0
  776. snowflake/snowpark_connect/includes/python/pyspark/tests/test_memory_profiler.py +167 -0
  777. snowflake/snowpark_connect/includes/python/pyspark/tests/test_pin_thread.py +194 -0
  778. snowflake/snowpark_connect/includes/python/pyspark/tests/test_profiler.py +168 -0
  779. snowflake/snowpark_connect/includes/python/pyspark/tests/test_rdd.py +939 -0
  780. snowflake/snowpark_connect/includes/python/pyspark/tests/test_rddbarrier.py +52 -0
  781. snowflake/snowpark_connect/includes/python/pyspark/tests/test_rddsampler.py +66 -0
  782. snowflake/snowpark_connect/includes/python/pyspark/tests/test_readwrite.py +368 -0
  783. snowflake/snowpark_connect/includes/python/pyspark/tests/test_serializers.py +257 -0
  784. snowflake/snowpark_connect/includes/python/pyspark/tests/test_shuffle.py +267 -0
  785. snowflake/snowpark_connect/includes/python/pyspark/tests/test_stage_sched.py +153 -0
  786. snowflake/snowpark_connect/includes/python/pyspark/tests/test_statcounter.py +130 -0
  787. snowflake/snowpark_connect/includes/python/pyspark/tests/test_taskcontext.py +350 -0
  788. snowflake/snowpark_connect/includes/python/pyspark/tests/test_util.py +97 -0
  789. snowflake/snowpark_connect/includes/python/pyspark/tests/test_worker.py +271 -0
  790. snowflake/snowpark_connect/includes/python/pyspark/traceback_utils.py +81 -0
  791. snowflake/snowpark_connect/includes/python/pyspark/util.py +416 -0
  792. snowflake/snowpark_connect/includes/python/pyspark/version.py +19 -0
  793. snowflake/snowpark_connect/includes/python/pyspark/worker.py +1307 -0
  794. snowflake/snowpark_connect/includes/python/pyspark/worker_util.py +46 -0
  795. snowflake/snowpark_connect/proto/__init__.py +10 -0
  796. snowflake/snowpark_connect/proto/control_pb2.py +35 -0
  797. snowflake/snowpark_connect/proto/control_pb2.pyi +38 -0
  798. snowflake/snowpark_connect/proto/control_pb2_grpc.py +183 -0
  799. snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.py +35 -0
  800. snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.pyi +53 -0
  801. snowflake/snowpark_connect/proto/snowflake_rdd_pb2.pyi +39 -0
  802. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.py +47 -0
  803. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.pyi +111 -0
  804. snowflake/snowpark_connect/relation/__init__.py +3 -0
  805. snowflake/snowpark_connect/relation/catalogs/__init__.py +12 -0
  806. snowflake/snowpark_connect/relation/catalogs/abstract_spark_catalog.py +287 -0
  807. snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py +467 -0
  808. snowflake/snowpark_connect/relation/catalogs/utils.py +51 -0
  809. snowflake/snowpark_connect/relation/io_utils.py +76 -0
  810. snowflake/snowpark_connect/relation/map_aggregate.py +322 -0
  811. snowflake/snowpark_connect/relation/map_catalog.py +151 -0
  812. snowflake/snowpark_connect/relation/map_column_ops.py +1068 -0
  813. snowflake/snowpark_connect/relation/map_crosstab.py +48 -0
  814. snowflake/snowpark_connect/relation/map_extension.py +412 -0
  815. snowflake/snowpark_connect/relation/map_join.py +341 -0
  816. snowflake/snowpark_connect/relation/map_local_relation.py +326 -0
  817. snowflake/snowpark_connect/relation/map_map_partitions.py +146 -0
  818. snowflake/snowpark_connect/relation/map_relation.py +253 -0
  819. snowflake/snowpark_connect/relation/map_row_ops.py +716 -0
  820. snowflake/snowpark_connect/relation/map_sample_by.py +35 -0
  821. snowflake/snowpark_connect/relation/map_show_string.py +50 -0
  822. snowflake/snowpark_connect/relation/map_sql.py +1874 -0
  823. snowflake/snowpark_connect/relation/map_stats.py +324 -0
  824. snowflake/snowpark_connect/relation/map_subquery_alias.py +32 -0
  825. snowflake/snowpark_connect/relation/map_udtf.py +288 -0
  826. snowflake/snowpark_connect/relation/read/__init__.py +7 -0
  827. snowflake/snowpark_connect/relation/read/jdbc_read_dbapi.py +668 -0
  828. snowflake/snowpark_connect/relation/read/map_read.py +367 -0
  829. snowflake/snowpark_connect/relation/read/map_read_csv.py +142 -0
  830. snowflake/snowpark_connect/relation/read/map_read_jdbc.py +108 -0
  831. snowflake/snowpark_connect/relation/read/map_read_json.py +344 -0
  832. snowflake/snowpark_connect/relation/read/map_read_parquet.py +194 -0
  833. snowflake/snowpark_connect/relation/read/map_read_socket.py +59 -0
  834. snowflake/snowpark_connect/relation/read/map_read_table.py +109 -0
  835. snowflake/snowpark_connect/relation/read/map_read_text.py +106 -0
  836. snowflake/snowpark_connect/relation/read/reader_config.py +399 -0
  837. snowflake/snowpark_connect/relation/read/utils.py +155 -0
  838. snowflake/snowpark_connect/relation/stage_locator.py +161 -0
  839. snowflake/snowpark_connect/relation/utils.py +219 -0
  840. snowflake/snowpark_connect/relation/write/__init__.py +3 -0
  841. snowflake/snowpark_connect/relation/write/jdbc_write_dbapi.py +339 -0
  842. snowflake/snowpark_connect/relation/write/map_write.py +436 -0
  843. snowflake/snowpark_connect/relation/write/map_write_jdbc.py +48 -0
  844. snowflake/snowpark_connect/resources/java_udfs-1.0-SNAPSHOT.jar +0 -0
  845. snowflake/snowpark_connect/resources_initializer.py +75 -0
  846. snowflake/snowpark_connect/server.py +1136 -0
  847. snowflake/snowpark_connect/start_server.py +32 -0
  848. snowflake/snowpark_connect/tcm.py +8 -0
  849. snowflake/snowpark_connect/type_mapping.py +1003 -0
  850. snowflake/snowpark_connect/typed_column.py +94 -0
  851. snowflake/snowpark_connect/utils/__init__.py +3 -0
  852. snowflake/snowpark_connect/utils/artifacts.py +48 -0
  853. snowflake/snowpark_connect/utils/attribute_handling.py +72 -0
  854. snowflake/snowpark_connect/utils/cache.py +84 -0
  855. snowflake/snowpark_connect/utils/concurrent.py +124 -0
  856. snowflake/snowpark_connect/utils/context.py +390 -0
  857. snowflake/snowpark_connect/utils/describe_query_cache.py +231 -0
  858. snowflake/snowpark_connect/utils/interrupt.py +85 -0
  859. snowflake/snowpark_connect/utils/io_utils.py +35 -0
  860. snowflake/snowpark_connect/utils/pandas_udtf_utils.py +117 -0
  861. snowflake/snowpark_connect/utils/profiling.py +47 -0
  862. snowflake/snowpark_connect/utils/session.py +180 -0
  863. snowflake/snowpark_connect/utils/snowpark_connect_logging.py +38 -0
  864. snowflake/snowpark_connect/utils/telemetry.py +513 -0
  865. snowflake/snowpark_connect/utils/udf_cache.py +392 -0
  866. snowflake/snowpark_connect/utils/udf_helper.py +328 -0
  867. snowflake/snowpark_connect/utils/udf_utils.py +310 -0
  868. snowflake/snowpark_connect/utils/udtf_helper.py +420 -0
  869. snowflake/snowpark_connect/utils/udtf_utils.py +799 -0
  870. snowflake/snowpark_connect/utils/xxhash64.py +247 -0
  871. snowflake/snowpark_connect/version.py +6 -0
  872. snowpark_connect-0.20.2.data/scripts/snowpark-connect +71 -0
  873. snowpark_connect-0.20.2.data/scripts/snowpark-session +11 -0
  874. snowpark_connect-0.20.2.data/scripts/snowpark-submit +354 -0
  875. snowpark_connect-0.20.2.dist-info/METADATA +37 -0
  876. snowpark_connect-0.20.2.dist-info/RECORD +879 -0
  877. snowpark_connect-0.20.2.dist-info/WHEEL +5 -0
  878. snowpark_connect-0.20.2.dist-info/licenses/LICENSE.txt +202 -0
  879. snowpark_connect-0.20.2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1318 @@
1
+ #
2
+ # Licensed to the Apache Software Foundation (ASF) under one or more
3
+ # contributor license agreements. See the NOTICE file distributed with
4
+ # this work for additional information regarding copyright ownership.
5
+ # The ASF licenses this file to You under the Apache License, Version 2.0
6
+ # (the "License"); you may not use this file except in compliance with
7
+ # the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ #
17
+
18
+ import sys
19
+ import array as pyarray
20
+ from math import exp, log
21
+ from collections import namedtuple
22
+ from typing import Any, List, Optional, Tuple, TypeVar, Union, overload, TYPE_CHECKING
23
+
24
+ import numpy as np
25
+ from numpy import array, random, tile
26
+
27
+ from pyspark import SparkContext, since
28
+ from pyspark.rdd import RDD
29
+ from pyspark.mllib.common import JavaModelWrapper, callMLlibFunc, callJavaFunc, _py2java, _java2py
30
+ from pyspark.mllib.linalg import SparseVector, _convert_to_vector, DenseVector # noqa: F401
31
+ from pyspark.mllib.stat.distribution import MultivariateGaussian
32
+ from pyspark.mllib.util import Saveable, Loader, inherit_doc, JavaLoader, JavaSaveable
33
+ from pyspark.streaming import DStream
34
+
35
+ if TYPE_CHECKING:
36
+ from py4j.java_gateway import JavaObject
37
+ from pyspark.mllib._typing import VectorLike
38
+
39
+ T = TypeVar("T")
40
+
41
+ __all__ = [
42
+ "BisectingKMeansModel",
43
+ "BisectingKMeans",
44
+ "KMeansModel",
45
+ "KMeans",
46
+ "GaussianMixtureModel",
47
+ "GaussianMixture",
48
+ "PowerIterationClusteringModel",
49
+ "PowerIterationClustering",
50
+ "StreamingKMeans",
51
+ "StreamingKMeansModel",
52
+ "LDA",
53
+ "LDAModel",
54
+ ]
55
+
56
+
57
@inherit_doc
class BisectingKMeansModel(JavaModelWrapper):
    """
    A clustering model derived from the bisecting k-means method.

    .. versionadded:: 2.0.0

    Examples
    --------
    >>> data = array([0.0,0.0, 1.0,1.0, 9.0,8.0, 8.0,9.0]).reshape(4, 2)
    >>> bskm = BisectingKMeans()
    >>> model = bskm.train(sc.parallelize(data, 2), k=4)
    >>> p = array([0.0, 0.0])
    >>> model.predict(p)
    0
    >>> model.k
    4
    >>> model.computeCost(p)
    0.0
    """

    def __init__(self, java_model: "JavaObject"):
        super(BisectingKMeansModel, self).__init__(java_model)
        # Fetch the centers from the wrapped Java model once and keep them
        # as arrays, so `clusterCenters` does not call into Java again.
        self.centers = [c.toArray() for c in self.call("clusterCenters")]

    @property
    @since("2.0.0")
    def clusterCenters(self) -> List[np.ndarray]:
        """Get the cluster centers, represented as a list of NumPy
        arrays."""
        return self.centers

    @property
    @since("2.0.0")
    def k(self) -> int:
        """Get the number of clusters"""
        return self.call("k")

    @overload
    def predict(self, x: "VectorLike") -> int:
        ...

    @overload
    def predict(self, x: RDD["VectorLike"]) -> RDD[int]:
        ...

    def predict(self, x: Union["VectorLike", RDD["VectorLike"]]) -> Union[int, RDD[int]]:
        """
        Find the cluster that each of the points belongs to in this
        model.

        .. versionadded:: 2.0.0

        Parameters
        ----------
        x : :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD`
            A data point (or RDD of points) to determine cluster index.
            :py:class:`pyspark.mllib.linalg.Vector` can be replaced with equivalent
            objects (list, tuple, numpy.ndarray).

        Returns
        -------
        int or :py:class:`pyspark.RDD` of int
            Predicted cluster index or an RDD of predicted cluster indices
            if the input is an RDD.
        """
        # An RDD is converted element-wise; a single point is converted directly.
        # Either way the converted value is handed to the model's "predict".
        converted = x.map(_convert_to_vector) if isinstance(x, RDD) else _convert_to_vector(x)
        return self.call("predict", converted)

    def computeCost(self, x: Union["VectorLike", RDD["VectorLike"]]) -> float:
        """
        Return the Bisecting K-means cost (sum of squared distances of
        points to their nearest center) for this model on the given
        data. If provided with an RDD of points returns the sum.

        .. versionadded:: 2.0.0

        Parameters
        ----------
        x : :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD`
            A data point (or RDD of points) to compute the cost(s).
            :py:class:`pyspark.mllib.linalg.Vector` can be replaced with equivalent
            objects (list, tuple, numpy.ndarray).

        Returns
        -------
        float
            The cost of the given point, or the summed cost over all points
            if the input is an RDD.
        """
        # Same dispatch as `predict`: convert per element for an RDD,
        # directly for a single point, then delegate to "computeCost".
        converted = x.map(_convert_to_vector) if isinstance(x, RDD) else _convert_to_vector(x)
        return self.call("computeCost", converted)
150
+
151
+
152
class BisectingKMeans:
    """
    A bisecting k-means algorithm based on the paper "A comparison of
    document clustering techniques" by Steinbach, Karypis, and Kumar,
    with modification to fit Spark.
    The algorithm starts from a single cluster that contains all points.
    Iteratively it finds divisible clusters on the bottom level and
    bisects each of them using k-means, until there are `k` leaf
    clusters in total or no leaf clusters are divisible.
    The bisecting steps of clusters on the same level are grouped
    together to increase parallelism. If bisecting all divisible
    clusters on the bottom level would result in more than `k` leaf
    clusters, larger clusters get higher priority.

    .. versionadded:: 2.0.0

    Notes
    -----
    See the original paper [1]_

    .. [1] Steinbach, M. et al. "A Comparison of Document Clustering Techniques." (2000).
        KDD Workshop on Text Mining, 2000
        http://glaros.dtc.umn.edu/gkhome/fetch/papers/docclusterKDDTMW00.pdf
    """

    @classmethod
    def train(
        cls,
        rdd: RDD["VectorLike"],
        k: int = 4,
        maxIterations: int = 20,
        minDivisibleClusterSize: float = 1.0,
        seed: int = -1888008604,
    ) -> BisectingKMeansModel:
        """
        Run the bisecting k-means algorithm and return the model.

        .. versionadded:: 2.0.0

        Parameters
        ----------
        rdd : :py:class:`pyspark.RDD`
            Training points as an `RDD` of `Vector` or convertible
            sequence types.
        k : int, optional
            The desired number of leaf clusters. The actual number could
            be smaller if there are no divisible leaf clusters.
            (default: 4)
        maxIterations : int, optional
            Maximum number of iterations allowed to split clusters.
            (default: 20)
        minDivisibleClusterSize : float, optional
            Minimum number of points (if >= 1.0) or the minimum proportion
            of points (if < 1.0) of a divisible cluster.
            (default: 1)
        seed : int, optional
            Random seed value for cluster initialization.
            (default: -1888008604 from classOf[BisectingKMeans].getName.##)

        Returns
        -------
        :py:class:`BisectingKMeansModel`
            A model wrapping the result of the "trainBisectingKMeans" call.
        """
        # Convert every training point to a vector before handing the RDD over.
        training_vectors = rdd.map(_convert_to_vector)
        trained = callMLlibFunc(
            "trainBisectingKMeans",
            training_vectors,
            k,
            maxIterations,
            minDivisibleClusterSize,
            seed,
        )
        return BisectingKMeansModel(trained)
220
+
221
+
222
@inherit_doc
class KMeansModel(Saveable, Loader["KMeansModel"]):
    """A clustering model derived from the k-means method.

    .. versionadded:: 0.9.0

    Examples
    --------
    >>> data = array([0.0,0.0, 1.0,1.0, 9.0,8.0, 8.0,9.0]).reshape(4, 2)
    >>> model = KMeans.train(
    ...     sc.parallelize(data), 2, maxIterations=10, initializationMode="random",
    ...     seed=50, initializationSteps=5, epsilon=1e-4)
    >>> model.predict(array([0.0, 0.0])) == model.predict(array([1.0, 1.0]))
    True
    >>> model.predict(array([8.0, 9.0])) == model.predict(array([9.0, 8.0]))
    True
    >>> model.k
    2
    >>> model.computeCost(sc.parallelize(data))
    2.0
    >>> model = KMeans.train(sc.parallelize(data), 2)
    >>> sparse_data = [
    ...     SparseVector(3, {1: 1.0}),
    ...     SparseVector(3, {1: 1.1}),
    ...     SparseVector(3, {2: 1.0}),
    ...     SparseVector(3, {2: 1.1})
    ... ]
    >>> model = KMeans.train(sc.parallelize(sparse_data), 2, initializationMode="k-means||",
    ...     seed=50, initializationSteps=5, epsilon=1e-4)
    >>> model.predict(array([0., 1., 0.])) == model.predict(array([0, 1.1, 0.]))
    True
    >>> model.predict(array([0., 0., 1.])) == model.predict(array([0, 0, 1.1]))
    True
    >>> model.predict(sparse_data[0]) == model.predict(sparse_data[1])
    True
    >>> model.predict(sparse_data[2]) == model.predict(sparse_data[3])
    True
    >>> isinstance(model.clusterCenters, list)
    True
    >>> import os, tempfile
    >>> path = tempfile.mkdtemp()
    >>> model.save(sc, path)
    >>> sameModel = KMeansModel.load(sc, path)
    >>> sameModel.predict(sparse_data[0]) == model.predict(sparse_data[0])
    True
    >>> from shutil import rmtree
    >>> try:
    ...     rmtree(path)
    ... except OSError:
    ...     pass

    >>> data = array([-383.1,-382.9, 28.7,31.2, 366.2,367.3]).reshape(3, 2)
    >>> model = KMeans.train(sc.parallelize(data), 3, maxIterations=0,
    ...     initialModel = KMeansModel([(-1000.0,-1000.0),(5.0,5.0),(1000.0,1000.0)]))
    >>> model.clusterCenters
    [array([-1000., -1000.]), array([ 5., 5.]), array([ 1000., 1000.])]
    """

    def __init__(self, centers: List["VectorLike"]):
        # Centers are stored exactly as given; they are converted to vectors
        # only at the points where they are used (computeCost, save).
        self.centers = centers

    @property
    @since("1.0.0")
    def clusterCenters(self) -> List["VectorLike"]:
        """Get the cluster centers, represented as a list of NumPy arrays."""
        return self.centers

    @property
    @since("1.4.0")
    def k(self) -> int:
        """Total number of clusters."""
        return len(self.centers)

    @overload
    def predict(self, x: "VectorLike") -> int:
        ...

    @overload
    def predict(self, x: RDD["VectorLike"]) -> RDD[int]:
        ...

    def predict(self, x: Union["VectorLike", RDD["VectorLike"]]) -> Union[int, RDD[int]]:
        """
        Find the cluster that each of the points belongs to in this
        model.

        .. versionadded:: 0.9.0

        Parameters
        ----------
        x : :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD`
            A data point (or RDD of points) to determine cluster index.
            :py:class:`pyspark.mllib.linalg.Vector` can be replaced with equivalent
            objects (list, tuple, numpy.ndarray).

        Returns
        -------
        int or :py:class:`pyspark.RDD` of int
            Predicted cluster index or an RDD of predicted cluster indices
            if the input is an RDD.
        """
        # For an RDD, apply this same method to every element.
        if isinstance(x, RDD):
            return x.map(self.predict)

        x = _convert_to_vector(x)
        best = 0
        best_distance = float("inf")
        # Linear scan for the nearest center. The strict `<` keeps the first
        # center on ties, and `best` stays 0 when there are no centers.
        for i, center in enumerate(self.centers):
            distance = x.squared_distance(center)  # type: ignore[attr-defined]
            if distance < best_distance:
                best = i
                best_distance = distance
        return best

    def computeCost(self, rdd: RDD["VectorLike"]) -> float:
        """
        Return the K-means cost (sum of squared distances of points to
        their nearest center) for this model on the given
        data.

        .. versionadded:: 1.4.0

        Parameters
        ----------
        rdd : :py:class:`pyspark.RDD`
            The RDD of points to compute the cost on.

        Returns
        -------
        float
            The value returned by the "computeCostKmeansModel" call.
        """
        return callMLlibFunc(
            "computeCostKmeansModel",
            rdd.map(_convert_to_vector),
            [_convert_to_vector(c) for c in self.centers],
        )

    @since("1.4.0")
    def save(self, sc: SparkContext, path: str) -> None:
        """
        Save this model to the given path.
        """
        assert sc._jvm is not None

        # Build a Java-side KMeansModel from the converted centers and let it
        # perform the actual save.
        java_centers = _py2java(sc, [_convert_to_vector(c) for c in self.centers])
        java_model = sc._jvm.org.apache.spark.mllib.clustering.KMeansModel(java_centers)
        java_model.save(sc._jsc.sc(), path)

    @classmethod
    @since("1.4.0")
    def load(cls, sc: SparkContext, path: str) -> "KMeansModel":
        """
        Load a model from the given path.
        """
        assert sc._jvm is not None

        # Load through the Java-side KMeansModel, then wrap its centers in a
        # Python KMeansModel.
        java_model = sc._jvm.org.apache.spark.mllib.clustering.KMeansModel.load(sc._jsc.sc(), path)
        return KMeansModel(_java2py(sc, java_model.clusterCenters()))
378
+
379
+
380
class KMeans:
    """
    K-means clustering.

    .. versionadded:: 0.9.0
    """

    @classmethod
    def train(
        cls,
        rdd: RDD["VectorLike"],
        k: int,
        maxIterations: int = 100,
        initializationMode: str = "k-means||",
        seed: Optional[int] = None,
        initializationSteps: int = 2,
        epsilon: float = 1e-4,
        initialModel: Optional[KMeansModel] = None,
        distanceMeasure: str = "euclidean",
    ) -> "KMeansModel":
        """
        Train a k-means clustering model.

        .. versionadded:: 0.9.0

        Parameters
        ----------
        rdd : :py:class:`pyspark.RDD`
            Training points as an `RDD` of :py:class:`pyspark.mllib.linalg.Vector`
            or convertible sequence types.
        k : int
            Number of clusters to create.
        maxIterations : int, optional
            Maximum number of iterations allowed.
            (default: 100)
        initializationMode : str, optional
            The initialization algorithm. This can be either "random" or
            "k-means||".
            (default: "k-means||")
        seed : int, optional
            Random seed value for cluster initialization. Set as None to
            generate seed based on system time.
            (default: None)
        initializationSteps : int, optional
            Number of steps for the k-means|| initialization mode.
            This is an advanced setting -- the default of 2 is almost
            always enough.
            (default: 2)
        epsilon : float, optional
            Distance threshold within which a center will be considered to
            have converged. If all centers move less than this Euclidean
            distance, iterations are stopped.
            (default: 1e-4)
        initialModel : :py:class:`KMeansModel`, optional
            Initial cluster centers can be provided as a KMeansModel object
            rather than using the random or k-means|| initializationMode.
            (default: None)
        distanceMeasure : str, optional
            The distance measure used by the k-means algorithm.
            (default: "euclidean")

        Returns
        -------
        :py:class:`KMeansModel`
            Model built from the cluster centers of the trained JVM model.

        Raises
        ------
        TypeError
            If ``initialModel`` is given but is not a :py:class:`KMeansModel`.
        """
        if initialModel is None:
            # No starting model: an empty list of centers is passed on.
            initial_centers = []
        elif isinstance(initialModel, KMeansModel):
            initial_centers = [
                _convert_to_vector(center) for center in initialModel.clusterCenters
            ]
        else:
            raise TypeError(
                "initialModel is of " + str(type(initialModel)) + ". It needs "
                "to be of <type 'KMeansModel'>"
            )

        training_points = rdd.map(_convert_to_vector)
        trained = callMLlibFunc(
            "trainKMeansModel",
            training_points,
            k,
            maxIterations,
            initializationMode,
            seed,
            initializationSteps,
            epsilon,
            initial_centers,
            distanceMeasure,
        )
        java_centers = callJavaFunc(rdd.context, trained.clusterCenters)
        return KMeansModel([center.toArray() for center in java_centers])
463
+
464
+
465
+ @inherit_doc
466
+ class GaussianMixtureModel(JavaModelWrapper, JavaSaveable, JavaLoader["GaussianMixtureModel"]):
467
+
468
+ """
469
+ A clustering model derived from the Gaussian Mixture Model method.
470
+
471
+ .. versionadded:: 1.3.0
472
+
473
+ Examples
474
+ --------
475
+ >>> from pyspark.mllib.linalg import Vectors, DenseMatrix
476
+ >>> from numpy.testing import assert_equal
477
+ >>> from shutil import rmtree
478
+ >>> import os, tempfile
479
+
480
+ >>> clusterdata_1 = sc.parallelize(array([-0.1,-0.05,-0.01,-0.1,
481
+ ... 0.9,0.8,0.75,0.935,
482
+ ... -0.83,-0.68,-0.91,-0.76 ]).reshape(6, 2), 2)
483
+ >>> model = GaussianMixture.train(clusterdata_1, 3, convergenceTol=0.0001,
484
+ ... maxIterations=50, seed=10)
485
+ >>> labels = model.predict(clusterdata_1).collect()
486
+ >>> labels[0]==labels[1]
487
+ False
488
+ >>> labels[1]==labels[2]
489
+ False
490
+ >>> labels[4]==labels[5]
491
+ True
492
+ >>> model.predict([-0.1,-0.05])
493
+ 0
494
+ >>> softPredicted = model.predictSoft([-0.1,-0.05])
495
+ >>> abs(softPredicted[0] - 1.0) < 0.03
496
+ True
497
+ >>> abs(softPredicted[1] - 0.0) < 0.03
498
+ True
499
+ >>> abs(softPredicted[2] - 0.0) < 0.03
500
+ True
501
+
502
+ >>> path = tempfile.mkdtemp()
503
+ >>> model.save(sc, path)
504
+ >>> sameModel = GaussianMixtureModel.load(sc, path)
505
+ >>> assert_equal(model.weights, sameModel.weights)
506
+ >>> mus, sigmas = list(
507
+ ... zip(*[(g.mu, g.sigma) for g in model.gaussians]))
508
+ >>> sameMus, sameSigmas = list(
509
+ ... zip(*[(g.mu, g.sigma) for g in sameModel.gaussians]))
510
+ >>> mus == sameMus
511
+ True
512
+ >>> sigmas == sameSigmas
513
+ True
514
+ >>> from shutil import rmtree
515
+ >>> try:
516
+ ... rmtree(path)
517
+ ... except OSError:
518
+ ... pass
519
+
520
+ >>> data = array([-5.1971, -2.5359, -3.8220,
521
+ ... -5.2211, -5.0602, 4.7118,
522
+ ... 6.8989, 3.4592, 4.6322,
523
+ ... 5.7048, 4.6567, 5.5026,
524
+ ... 4.5605, 5.2043, 6.2734])
525
+ >>> clusterdata_2 = sc.parallelize(data.reshape(5,3))
526
+ >>> model = GaussianMixture.train(clusterdata_2, 2, convergenceTol=0.0001,
527
+ ... maxIterations=150, seed=4)
528
+ >>> labels = model.predict(clusterdata_2).collect()
529
+ >>> labels[0]==labels[1]
530
+ True
531
+ >>> labels[2]==labels[3]==labels[4]
532
+ True
533
+ """
534
+
535
@property
@since("1.4.0")
def weights(self) -> np.ndarray:
    """
    Mixture weights as a NumPy array: ``weights[i]`` is the weight of
    Gaussian ``i``, and the weights sum to 1.
    """
    raw_weights = self.call("weights")
    return array(raw_weights)
543
+
544
@property
@since("1.4.0")
def gaussians(self) -> List[MultivariateGaussian]:
    """
    List of MultivariateGaussian where ``gaussians[i]`` represents
    the Multivariate Gaussian (Normal) Distribution for Gaussian i.
    """
    components = []
    for params in self.call("gaussians"):
        # The first two items of each entry feed the MultivariateGaussian
        # constructor, in that order.
        components.append(MultivariateGaussian(params[0], params[1]))
    return components
554
+
555
@property
@since("1.4.0")
def k(self) -> int:
    """Number of gaussians in mixture (the length of ``weights``)."""
    mixture_weights = self.weights
    return len(mixture_weights)
560
+
561
@overload
def predict(self, x: "VectorLike") -> np.int64:
    ...

@overload
def predict(self, x: RDD["VectorLike"]) -> RDD[int]:
    ...

def predict(self, x: Union["VectorLike", RDD["VectorLike"]]) -> Union[np.int64, RDD[int]]:
    """
    Find the cluster to which the point 'x' or each point in RDD 'x'
    has maximum membership in this model.

    .. versionadded:: 1.3.0

    Parameters
    ----------
    x : :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD`
        A feature vector or an RDD of vectors representing data points.

    Returns
    -------
    numpy.int64 or :py:class:`pyspark.RDD` of int
        Predicted cluster label or an RDD of predicted cluster labels
        if the input is an RDD.
    """
    memberships = self.predictSoft(x)
    if isinstance(x, RDD):
        # Each row is one point's memberships; its label is the position
        # of the largest entry.
        return memberships.map(lambda row: row.index(max(row)))
    return memberships.argmax()
593
+
594
@overload
def predictSoft(self, x: "VectorLike") -> np.ndarray:
    ...

@overload
def predictSoft(self, x: RDD["VectorLike"]) -> RDD[pyarray.array]:
    ...

def predictSoft(
    self, x: Union["VectorLike", RDD["VectorLike"]]
) -> Union[np.ndarray, RDD[pyarray.array]]:
    """
    Find the membership of point 'x' or each point in RDD 'x' to all mixture components.

    .. versionadded:: 1.3.0

    Parameters
    ----------
    x : :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD`
        A feature vector or an RDD of vectors representing data points.

    Returns
    -------
    numpy.ndarray or :py:class:`pyspark.RDD`
        The membership value to all mixture components for vector 'x'
        or each vector in RDD 'x'. For an RDD input every element is an
        ``array.array`` of doubles.
    """
    if not isinstance(x, RDD):
        # Single point: ask the wrapped model directly.
        return self.call("predictSoft", _convert_to_vector(x)).toArray()

    components = self.gaussians
    means, sigmas = zip(*((g.mu, g.sigma) for g in components))
    memberships = callMLlibFunc(
        "predictSoftGMM",
        x.map(_convert_to_vector),
        _convert_to_vector(self.weights),
        means,
        sigmas,
    )
    return memberships.map(lambda row: pyarray.array("d", row))
633
+
634
@classmethod
def load(cls, sc: SparkContext, path: str) -> "GaussianMixtureModel":
    """Load the GaussianMixtureModel from disk.

    .. versionadded:: 1.5.0

    Parameters
    ----------
    sc : :py:class:`SparkContext`
    path : str
        Path to where the model is stored.

    Returns
    -------
    :py:class:`GaussianMixtureModel`
        Instance of ``cls`` built around a JVM ``GaussianMixtureModelWrapper``
        of the loaded model.
    """
    jvm = sc._jvm
    assert jvm is not None

    java_model = cls._load_java(sc, path)
    wrapped = jvm.org.apache.spark.mllib.api.python.GaussianMixtureModelWrapper(java_model)
    return cls(wrapped)
651
+
652
+
653
class GaussianMixture:
    """
    Learning algorithm for Gaussian Mixtures using the expectation-maximization algorithm.

    .. versionadded:: 1.3.0
    """

    @classmethod
    def train(
        cls,
        rdd: RDD["VectorLike"],
        k: int,
        convergenceTol: float = 1e-3,
        maxIterations: int = 100,
        seed: Optional[int] = None,
        initialModel: Optional[GaussianMixtureModel] = None,
    ) -> GaussianMixtureModel:
        """
        Train a Gaussian Mixture clustering model.

        .. versionadded:: 1.3.0

        Parameters
        ----------
        rdd : :py:class:`pyspark.RDD`
            Training points as an `RDD` of :py:class:`pyspark.mllib.linalg.Vector`
            or convertible sequence types.
        k : int
            Number of independent Gaussians in the mixture model.
        convergenceTol : float, optional
            Maximum change in log-likelihood at which convergence is
            considered to have occurred.
            (default: 1e-3)
        maxIterations : int, optional
            Maximum number of iterations allowed.
            (default: 100)
        seed : int, optional
            Random seed for initial Gaussian distribution. Set as None to
            generate seed based on system time.
            (default: None)
        initialModel : GaussianMixtureModel, optional
            Initial GMM starting point, bypassing the random
            initialization.
            (default: None)

        Returns
        -------
        :py:class:`GaussianMixtureModel`
            Wrapper around the model returned by the
            ``trainGaussianMixtureModel`` routine.

        Raises
        ------
        ValueError
            If ``initialModel`` is given and its ``k`` differs from ``k``.
        """
        initialModelWeights = None
        initialModelMu = None
        initialModelSigma = None
        if initialModel is not None:
            # `k`, `weights` and `gaussians` are properties that query the
            # wrapped model on every access, so each is read exactly once
            # here instead of once per mixture component.
            initialK = initialModel.k
            if initialK != k:
                raise ValueError(
                    "Mismatched cluster count, initialModel.k = %s, however k = %s"
                    % (initialK, k)
                )
            initialModelWeights = list(initialModel.weights)
            initialGaussians = initialModel.gaussians
            initialModelMu = [gaussian.mu for gaussian in initialGaussians]
            initialModelSigma = [gaussian.sigma for gaussian in initialGaussians]
        java_model = callMLlibFunc(
            "trainGaussianMixtureModel",
            rdd.map(_convert_to_vector),
            k,
            convergenceTol,
            maxIterations,
            seed,
            initialModelWeights,
            initialModelMu,
            initialModelSigma,
        )
        return GaussianMixtureModel(java_model)
722
+
723
+
724
+ class PowerIterationClusteringModel(
725
+ JavaModelWrapper, JavaSaveable, JavaLoader["PowerIterationClusteringModel"]
726
+ ):
727
+
728
+ """
729
+ Model produced by :py:class:`PowerIterationClustering`.
730
+
731
+ .. versionadded:: 1.5.0
732
+
733
+ Examples
734
+ --------
735
+ >>> import math
736
+ >>> def genCircle(r, n):
737
+ ... points = []
738
+ ... for i in range(0, n):
739
+ ... theta = 2.0 * math.pi * i / n
740
+ ... points.append((r * math.cos(theta), r * math.sin(theta)))
741
+ ... return points
742
+ ...
743
+ >>> def sim(x, y):
744
+ ... dist2 = (x[0] - y[0]) * (x[0] - y[0]) + (x[1] - y[1]) * (x[1] - y[1])
745
+ ... return math.exp(-dist2 / 2.0)
746
+ ...
747
+ >>> r1 = 1.0
748
+ >>> n1 = 10
749
+ >>> r2 = 4.0
750
+ >>> n2 = 40
751
+ >>> n = n1 + n2
752
+ >>> points = genCircle(r1, n1) + genCircle(r2, n2)
753
+ >>> similarities = [(i, j, sim(points[i], points[j])) for i in range(1, n) for j in range(0, i)]
754
+ >>> rdd = sc.parallelize(similarities, 2)
755
+ >>> model = PowerIterationClustering.train(rdd, 2, 40)
756
+ >>> model.k
757
+ 2
758
+ >>> result = sorted(model.assignments().collect(), key=lambda x: x.id)
759
+ >>> result[0].cluster == result[1].cluster == result[2].cluster == result[3].cluster
760
+ True
761
+ >>> result[4].cluster == result[5].cluster == result[6].cluster == result[7].cluster
762
+ True
763
+ >>> import os, tempfile
764
+ >>> path = tempfile.mkdtemp()
765
+ >>> model.save(sc, path)
766
+ >>> sameModel = PowerIterationClusteringModel.load(sc, path)
767
+ >>> sameModel.k
768
+ 2
769
+ >>> result = sorted(model.assignments().collect(), key=lambda x: x.id)
770
+ >>> result[0].cluster == result[1].cluster == result[2].cluster == result[3].cluster
771
+ True
772
+ >>> result[4].cluster == result[5].cluster == result[6].cluster == result[7].cluster
773
+ True
774
+ >>> from shutil import rmtree
775
+ >>> try:
776
+ ... rmtree(path)
777
+ ... except OSError:
778
+ ... pass
779
+ """
780
+
781
@property
@since("1.5.0")
def k(self) -> int:
    """
    Number of clusters, as reported by the wrapped model's ``k``.
    """
    cluster_count = self.call("k")
    return cluster_count
788
+
789
@since("1.5.0")
def assignments(self) -> RDD["PowerIterationClustering.Assignment"]:
    """
    Returns the cluster assignments of this model as an RDD of
    :py:class:`PowerIterationClustering.Assignment` tuples.
    """
    raw_assignments = self.call("getAssignments")
    # Every raw element is unpacked into the Assignment namedtuple.
    return raw_assignments.map(lambda entry: PowerIterationClustering.Assignment(*entry))
795
+
796
@classmethod
@since("1.5.0")
def load(cls, sc: SparkContext, path: str) -> "PowerIterationClusteringModel":
    """
    Load a model from the given path and wrap it in a
    JVM ``PowerIterationClusteringModelWrapper``.
    """
    jvm = sc._jvm
    assert jvm is not None

    java_model = cls._load_java(sc, path)
    wrapper_cls = jvm.org.apache.spark.mllib.api.python.PowerIterationClusteringModelWrapper
    return PowerIterationClusteringModel(wrapper_cls(java_model))
809
+
810
+
811
class PowerIterationClustering:
    """
    Power Iteration Clustering (PIC), a scalable graph clustering algorithm.


    Developed by Lin and Cohen [1]_. From the abstract:

        "PIC finds a very low-dimensional embedding of a
        dataset using truncated power iteration on a normalized pair-wise
        similarity matrix of the data."

    .. versionadded:: 1.5.0

    .. [1] Lin, Frank & Cohen, William. (2010). Power Iteration Clustering.
        http://www.cs.cmu.edu/~frank/papers/icml2010-pic-final.pdf
    """

    @classmethod
    def train(
        cls,
        rdd: RDD[Tuple[int, int, float]],
        k: int,
        maxIterations: int = 100,
        initMode: str = "random",
    ) -> PowerIterationClusteringModel:
        r"""
        Train PowerIterationClusteringModel

        .. versionadded:: 1.5.0

        Parameters
        ----------
        rdd : :py:class:`pyspark.RDD`
            An RDD of (i, j, s\ :sub:`ij`\) tuples representing the
            affinity matrix, which is the matrix A in the PIC paper. The
            similarity s\ :sub:`ij`\ must be nonnegative. This is a symmetric
            matrix and hence s\ :sub:`ij`\ = s\ :sub:`ji`\ . For any (i, j) with
            nonzero similarity, there should be either (i, j, s\ :sub:`ij`\) or
            (j, i, s\ :sub:`ji`\) in the input. Tuples with i = j are ignored,
            because it is assumed s\ :sub:`ij`\ = 0.0.
        k : int
            Number of clusters.
        maxIterations : int, optional
            Maximum number of iterations of the PIC algorithm.
            (default: 100)
        initMode : str, optional
            Initialization mode. This can be either "random" to use
            a random vector as vertex properties, or "degree" to use
            normalized sum similarities.
            (default: "random")

        Returns
        -------
        :py:class:`PowerIterationClusteringModel`
            Wrapper around the model returned by the
            ``trainPowerIterationClusteringModel`` routine.
        """
        similarities = rdd.map(_convert_to_vector)
        # k and maxIterations are coerced to int before being passed on.
        java_model = callMLlibFunc(
            "trainPowerIterationClusteringModel",
            similarities,
            int(k),
            int(maxIterations),
            initMode,
        )
        return PowerIterationClusteringModel(java_model)

    class Assignment(namedtuple("Assignment", ["id", "cluster"])):
        """
        Represents an (id, cluster) tuple.

        .. versionadded:: 1.5.0
        """
877
+
878
+
879
+ class StreamingKMeansModel(KMeansModel):
880
+ """
881
+ Clustering model which can perform an online update of the centroids.
882
+
883
+ The update formula for each centroid is given by
884
+
885
+ - c_t+1 = ((c_t * n_t * a) + (x_t * m_t)) / (n_t + m_t)
886
+ - n_t+1 = n_t * a + m_t
887
+
888
+ where
889
+
890
+ - c_t: Centroid at the n_th iteration.
891
+ - n_t: Number of samples (or) weights associated with the centroid
892
+ at the n_th iteration.
893
+ - x_t: Centroid of the new data closest to c_t.
894
+ - m_t: Number of samples (or) weights of the new data closest to c_t
895
+ - c_t+1: New centroid.
896
+ - n_t+1: New number of weights.
897
+ - a: Decay Factor, which gives the forgetfulness.
898
+
899
+ .. versionadded:: 1.5.0
900
+
901
+ Parameters
902
+ ----------
903
+ clusterCenters : list of :py:class:`pyspark.mllib.linalg.Vector` or covertible
904
+ Initial cluster centers.
905
+ clusterWeights : :py:class:`pyspark.mllib.linalg.Vector` or covertible
906
+ List of weights assigned to each cluster.
907
+
908
+ Notes
909
+ -----
910
+ If a is set to 1, it is the weighted mean of the previous
911
+ and new data. If it set to zero, the old centroids are completely
912
+ forgotten.
913
+
914
+ Examples
915
+ --------
916
+ >>> initCenters = [[0.0, 0.0], [1.0, 1.0]]
917
+ >>> initWeights = [1.0, 1.0]
918
+ >>> stkm = StreamingKMeansModel(initCenters, initWeights)
919
+ >>> data = sc.parallelize([[-0.1, -0.1], [0.1, 0.1],
920
+ ... [0.9, 0.9], [1.1, 1.1]])
921
+ >>> stkm = stkm.update(data, 1.0, "batches")
922
+ >>> stkm.centers
923
+ array([[ 0., 0.],
924
+ [ 1., 1.]])
925
+ >>> stkm.predict([-0.1, -0.1])
926
+ 0
927
+ >>> stkm.predict([0.9, 0.9])
928
+ 1
929
+ >>> stkm.clusterWeights
930
+ [3.0, 3.0]
931
+ >>> decayFactor = 0.0
932
+ >>> data = sc.parallelize([DenseVector([1.5, 1.5]), DenseVector([0.2, 0.2])])
933
+ >>> stkm = stkm.update(data, 0.0, "batches")
934
+ >>> stkm.centers
935
+ array([[ 0.2, 0.2],
936
+ [ 1.5, 1.5]])
937
+ >>> stkm.clusterWeights
938
+ [1.0, 1.0]
939
+ >>> stkm.predict([0.2, 0.2])
940
+ 0
941
+ >>> stkm.predict([1.5, 1.5])
942
+ 1
943
+ """
944
+
945
def __init__(self, clusterCenters: List["VectorLike"], clusterWeights: "VectorLike"):
    # The parent initializer receives the centers; the weights are kept
    # here as a plain list copy.
    super().__init__(centers=clusterCenters)
    weights_as_list = list(clusterWeights)  # type: ignore[arg-type]
    self._clusterWeights = weights_as_list
948
+
949
@property
@since("1.5.0")
def clusterWeights(self) -> List[np.float64]:
    """Return the cluster weights.

    This is the internal list itself, not a copy; it is replaced on
    every call to :py:meth:`update`.
    """
    return self._clusterWeights
954
+
955
@since("1.5.0")
def update(
    self, data: RDD["VectorLike"], decayFactor: float, timeUnit: str
) -> "StreamingKMeansModel":
    """Update the centroids, according to data

    .. versionadded:: 1.5.0

    Parameters
    ----------
    data : :py:class:`pyspark.RDD`
        RDD with new data for the model update.
    decayFactor : float
        Forgetfulness of the previous centroids.
    timeUnit : str
        Can be "batches" or "points". If points, then the decay factor
        is raised to the power of number of new points and if batches,
        then decay factor will be used as is.

    Returns
    -------
    :py:class:`StreamingKMeansModel`
        This same instance, with ``centers`` and ``clusterWeights``
        replaced by the updated values.

    Raises
    ------
    TypeError
        If ``data`` is not an RDD.
    ValueError
        If ``timeUnit`` is neither "batches" nor "points".
    """
    if not isinstance(data, RDD):
        raise TypeError("Data should be of an RDD, got %s." % type(data))
    data = data.map(_convert_to_vector)
    decayFactor = float(decayFactor)
    if timeUnit not in ["batches", "points"]:
        # Wrap the value in a one-element tuple: a tuple-valued timeUnit
        # would otherwise be unpacked by %-formatting and raise a
        # TypeError instead of this ValueError.
        raise ValueError("timeUnit should be 'batches' or 'points', got %s." % (timeUnit,))
    vectorCenters = [_convert_to_vector(center) for center in self.centers]
    updatedModel = callMLlibFunc(
        "updateStreamingKMeansModel",
        vectorCenters,
        self._clusterWeights,
        data,
        decayFactor,
        timeUnit,
    )
    # The result is indexed as (new centers, new weights).
    self.centers = array(updatedModel[0])  # type: ignore[assignment]
    self._clusterWeights = list(updatedModel[1])
    return self
992
+
993
+
994
class StreamingKMeans:
    """
    Provides methods to set k, decayFactor, timeUnit to configure the
    KMeans algorithm for fitting and predicting on incoming dstreams.
    More details on how the centroids are updated are provided under the
    docs of StreamingKMeansModel.

    .. versionadded:: 1.5.0

    Parameters
    ----------
    k : int, optional
        Number of clusters.
        (default: 2)
    decayFactor : float, optional
        Forgetfulness of the previous centroids.
        (default: 1.0)
    timeUnit : str, optional
        Can be "batches" or "points". If points, then the decay factor is
        raised to the power of number of new points and if batches, then
        decay factor will be used as is.
        (default: "batches")
    """

    def __init__(self, k: int = 2, decayFactor: float = 1.0, timeUnit: str = "batches"):
        self._k = k
        self._decayFactor = decayFactor
        self._validateTimeUnit(timeUnit)
        self._timeUnit = timeUnit
        # No model exists until setInitialCenters or setRandomCenters is called.
        self._model: Optional[StreamingKMeansModel] = None

    @staticmethod
    def _validateTimeUnit(timeUnit: str) -> None:
        """Raise ValueError unless ``timeUnit`` is "batches" or "points"."""
        if timeUnit not in ["batches", "points"]:
            # One-element tuple so that a tuple-valued argument is reported
            # in the message instead of breaking %-formatting.
            raise ValueError("timeUnit should be 'batches' or 'points', got %s." % (timeUnit,))

    @since("1.5.0")
    def latestModel(self) -> Optional[StreamingKMeansModel]:
        """Return the latest model, or None if no centers have been set."""
        return self._model

    def _validate(self, dstream: Any) -> None:
        """Check that a model has been initialized and that ``dstream`` is a DStream.

        Raises ValueError when no initial centers were set and TypeError
        when ``dstream`` is not a DStream.
        """
        if self._model is None:
            raise ValueError(
                "Initial centers should be set either by setInitialCenters " "or setRandomCenters."
            )
        if not isinstance(dstream, DStream):
            raise TypeError(
                "Expected dstream to be of type DStream, " "got type %s" % type(dstream)
            )

    @since("1.5.0")
    def setK(self, k: int) -> "StreamingKMeans":
        """Set number of clusters."""
        self._k = k
        return self

    @since("1.5.0")
    def setDecayFactor(self, decayFactor: float) -> "StreamingKMeans":
        """Set decay factor."""
        self._decayFactor = decayFactor
        return self

    @since("1.5.0")
    def setHalfLife(self, halfLife: float, timeUnit: str) -> "StreamingKMeans":
        """
        Set the half life: the number of batches or points (depending on
        ``timeUnit``) after which the centroids of that particular batch
        have half the weightage.

        The decay factor is set to ``exp(log(0.5) / halfLife)``.

        Raises
        ------
        ValueError
            If ``timeUnit`` is neither "batches" nor "points".
        """
        self._validateTimeUnit(timeUnit)
        # Compute first so that a failing computation (e.g. halfLife == 0)
        # leaves the current configuration untouched.
        decayFactor = exp(log(0.5) / halfLife)
        self._timeUnit = timeUnit
        self._decayFactor = decayFactor
        return self

    @since("1.5.0")
    def setInitialCenters(
        self, centers: List["VectorLike"], weights: List[float]
    ) -> "StreamingKMeans":
        """
        Set initial centers. Should be set before calling trainOn.
        """
        self._model = StreamingKMeansModel(centers, weights)
        return self

    @since("1.5.0")
    def setRandomCenters(self, dim: int, weight: float, seed: int) -> "StreamingKMeans":
        """
        Set the initial centers to be random samples from
        a gaussian population with constant weights.
        """
        rng = random.RandomState(seed)
        clusterCenters = rng.randn(self._k, dim)
        clusterWeights = tile(weight, self._k)
        self._model = StreamingKMeansModel(clusterCenters, clusterWeights)  # type: ignore[arg-type]
        return self

    @since("1.5.0")
    def trainOn(self, dstream: "DStream[VectorLike]") -> None:
        """Train the model on the incoming dstream."""
        self._validate(dstream)

        def update(rdd: RDD["VectorLike"]) -> None:
            # Decay factor and time unit are read when the RDD is handled,
            # not when trainOn is called.
            self._model.update(rdd, self._decayFactor, self._timeUnit)  # type: ignore[union-attr]

        dstream.foreachRDD(update)

    @since("1.5.0")
    def predictOn(self, dstream: "DStream[VectorLike]") -> "DStream[int]":
        """
        Make predictions on a dstream.
        Returns a transformed dstream object
        """
        self._validate(dstream)
        return dstream.map(lambda x: self._model.predict(x))  # type: ignore[union-attr]

    @since("1.5.0")
    def predictOnValues(self, dstream: "DStream[Tuple[T, VectorLike]]") -> "DStream[Tuple[T, int]]":
        """
        Make predictions on a keyed dstream.
        Returns a transformed dstream object.
        """
        self._validate(dstream)
        return dstream.mapValues(lambda x: self._model.predict(x))  # type: ignore[union-attr]
1112
+
1113
+
1114
+ class LDAModel(JavaModelWrapper, JavaSaveable, Loader["LDAModel"]):
1115
+
1116
+ """A clustering model derived from the LDA method.
1117
+
1118
+ Latent Dirichlet Allocation (LDA), a topic model designed for text documents.
1119
+ Terminology
1120
+
1121
+ - "word" = "term": an element of the vocabulary
1122
+ - "token": instance of a term appearing in a document
1123
+ - "topic": multinomial distribution over words representing some concept
1124
+
1125
+ .. versionadded:: 1.5.0
1126
+
1127
+ Notes
1128
+ -----
1129
+ See the original LDA paper (journal version) [1]_
1130
+
1131
+ .. [1] Blei, D. et al. "Latent Dirichlet Allocation."
1132
+ J. Mach. Learn. Res. 3 (2003): 993-1022.
1133
+ https://www.jmlr.org/papers/v3/blei03a
1134
+
1135
+ Examples
1136
+ --------
1137
+ >>> from pyspark.mllib.linalg import Vectors
1138
+ >>> from numpy.testing import assert_almost_equal, assert_equal
1139
+ >>> data = [
1140
+ ... [1, Vectors.dense([0.0, 1.0])],
1141
+ ... [2, SparseVector(2, {0: 1.0})],
1142
+ ... ]
1143
+ >>> rdd = sc.parallelize(data)
1144
+ >>> model = LDA.train(rdd, k=2, seed=1)
1145
+ >>> model.vocabSize()
1146
+ 2
1147
+ >>> model.describeTopics()
1148
+ [([1, 0], [0.5..., 0.49...]), ([0, 1], [0.5..., 0.49...])]
1149
+ >>> model.describeTopics(1)
1150
+ [([1], [0.5...]), ([0], [0.5...])]
1151
+
1152
+ >>> topics = model.topicsMatrix()
1153
+ >>> topics_expect = array([[0.5, 0.5], [0.5, 0.5]])
1154
+ >>> assert_almost_equal(topics, topics_expect, 1)
1155
+
1156
+ >>> import os, tempfile
1157
+ >>> from shutil import rmtree
1158
+ >>> path = tempfile.mkdtemp()
1159
+ >>> model.save(sc, path)
1160
+ >>> sameModel = LDAModel.load(sc, path)
1161
+ >>> assert_equal(sameModel.topicsMatrix(), model.topicsMatrix())
1162
+ >>> sameModel.vocabSize() == model.vocabSize()
1163
+ True
1164
+ >>> try:
1165
+ ... rmtree(path)
1166
+ ... except OSError:
1167
+ ... pass
1168
+ """
1169
+
1170
@since("1.5.0")
def topicsMatrix(self) -> np.ndarray:
    """Inferred topics as an array, where each topic is represented by a distribution over terms."""
    topics = self.call("topicsMatrix")
    return topics.toArray()
1174
+
1175
@since("1.5.0")
def vocabSize(self) -> int:
    """Vocabulary size (number of terms in the vocabulary)."""
    size = self.call("vocabSize")
    return size
1179
+
1180
def describeTopics(
    self, maxTermsPerTopic: Optional[int] = None
) -> List[Tuple[List[int], List[float]]]:
    """Return the topics described by weighted terms.

    .. versionadded:: 1.6.0
    .. warning:: If vocabSize and k are large, this can return a large object!

    Parameters
    ----------
    maxTermsPerTopic : int, optional
        Maximum number of terms to collect for each topic.
        (default: vocabulary size)

    Returns
    -------
    list
        Array over topics. Each topic is represented as a pair of
        matching arrays: (term indices, term weights in topic).
        Each topic's terms are sorted in order of decreasing weight.
    """
    # The limit is forwarded only when the caller supplied one; otherwise
    # the wrapped method is called without arguments.
    extra_args = () if maxTermsPerTopic is None else (maxTermsPerTopic,)
    return self.call("describeTopics", *extra_args)
1206
+
1207
@classmethod
def load(cls, sc: SparkContext, path: str) -> "LDAModel":
    """Load the LDAModel from disk.

    .. versionadded:: 1.5.0

    Parameters
    ----------
    sc : :py:class:`pyspark.SparkContext`
    path : str
        Path to where the model is stored.

    Raises
    ------
    TypeError
        If ``sc`` is not a SparkContext or ``path`` is not a string.
    """
    # Arguments are checked in declaration order, so a bad `sc` is
    # reported before a bad `path`.
    checks = (
        (sc, SparkContext, "sc should be a SparkContext, got type %s"),
        (path, str, "path should be a string, got type %s"),
    )
    for value, required_type, template in checks:
        if not isinstance(value, required_type):
            raise TypeError(template % type(value))
    java_model = callMLlibFunc("loadLDAModel", sc, path)
    return LDAModel(java_model)
1225
+
1226
+
1227
class LDA:
    """
    Train Latent Dirichlet Allocation (LDA) model.

    .. versionadded:: 1.5.0
    """

    @classmethod
    def train(
        cls,
        rdd: RDD[Tuple[int, "VectorLike"]],
        k: int = 10,
        maxIterations: int = 20,
        docConcentration: float = -1.0,
        topicConcentration: float = -1.0,
        seed: Optional[int] = None,
        checkpointInterval: int = 10,
        optimizer: str = "em",
    ) -> LDAModel:
        """Train a LDA model.

        .. versionadded:: 1.5.0

        Parameters
        ----------
        rdd : :py:class:`pyspark.RDD`
            RDD of documents, which are tuples of document IDs and term
            (word) count vectors. The term count vectors are "bags of
            words" with a fixed-size vocabulary (where the vocabulary size
            is the length of the vector). Document IDs must be unique
            and >= 0.
        k : int, optional
            Number of topics to infer, i.e., the number of soft cluster
            centers.
            (default: 10)
        maxIterations : int, optional
            Maximum number of iterations allowed.
            (default: 20)
        docConcentration : float, optional
            Concentration parameter (commonly named "alpha") for the prior
            placed on documents' distributions over topics ("theta").
            (default: -1.0)
        topicConcentration : float, optional
            Concentration parameter (commonly named "beta" or "eta") for
            the prior placed on topics' distributions over terms.
            (default: -1.0)
        seed : int, optional
            Random seed for cluster initialization. Set as None to generate
            seed based on system time.
            (default: None)
        checkpointInterval : int, optional
            Period (in iterations) between checkpoints.
            (default: 10)
        optimizer : str, optional
            LDAOptimizer used to perform the actual calculation. Currently
            "em", "online" are supported.
            (default: "em")

        Returns
        -------
        :py:class:`LDAModel`
            Wrapper around the model returned by the ``trainLDAModel`` routine.
        """
        # Unlike the other trainers in this module, the RDD is passed
        # through as-is; the remaining arguments keep their declared order.
        training_args = (
            rdd,
            k,
            maxIterations,
            docConcentration,
            topicConcentration,
            seed,
            checkpointInterval,
            optimizer,
        )
        java_model = callMLlibFunc("trainLDAModel", *training_args)
        return LDAModel(java_model)
1297
+
1298
+
1299
def _test() -> None:
    """Run this module's doctests against a local SparkContext.

    Exits the process with status -1 if any doctest fails.
    """
    import doctest
    import numpy
    import pyspark.mllib.clustering

    try:
        # NumPy 1.14+ changed its string format; request the legacy one so
        # the expected doctest output still matches.
        numpy.set_printoptions(legacy="1.13")
    except TypeError:
        # A TypeError here means the `legacy` option was not accepted;
        # the default print options are kept in that case.
        pass
    globs = pyspark.mllib.clustering.__dict__.copy()
    sc = SparkContext("local[4]", "PythonTest", batchSize=2)
    globs["sc"] = sc
    try:
        failure_count, _ = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
    finally:
        # Always stop the context, even if running the doctests raised.
        sc.stop()
    if failure_count:
        sys.exit(-1)
1315
+
1316
+
1317
# Run the module's doctests when the file is executed as a script.
if __name__ == "__main__":
    _test()