snowpark-connect 0.20.2__py3-none-any.whl

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of snowpark-connect might be problematic. Click here for more details.

Files changed (879)
  1. snowflake/snowpark_connect/__init__.py +23 -0
  2. snowflake/snowpark_connect/analyze_plan/__init__.py +3 -0
  3. snowflake/snowpark_connect/analyze_plan/map_tree_string.py +38 -0
  4. snowflake/snowpark_connect/column_name_handler.py +735 -0
  5. snowflake/snowpark_connect/config.py +576 -0
  6. snowflake/snowpark_connect/constants.py +47 -0
  7. snowflake/snowpark_connect/control_server.py +52 -0
  8. snowflake/snowpark_connect/dataframe_name_handler.py +54 -0
  9. snowflake/snowpark_connect/date_time_format_mapping.py +399 -0
  10. snowflake/snowpark_connect/empty_dataframe.py +18 -0
  11. snowflake/snowpark_connect/error/__init__.py +11 -0
  12. snowflake/snowpark_connect/error/error_mapping.py +6174 -0
  13. snowflake/snowpark_connect/error/error_utils.py +321 -0
  14. snowflake/snowpark_connect/error/exceptions.py +24 -0
  15. snowflake/snowpark_connect/execute_plan/__init__.py +3 -0
  16. snowflake/snowpark_connect/execute_plan/map_execution_command.py +204 -0
  17. snowflake/snowpark_connect/execute_plan/map_execution_root.py +173 -0
  18. snowflake/snowpark_connect/execute_plan/utils.py +183 -0
  19. snowflake/snowpark_connect/expression/__init__.py +3 -0
  20. snowflake/snowpark_connect/expression/literal.py +90 -0
  21. snowflake/snowpark_connect/expression/map_cast.py +343 -0
  22. snowflake/snowpark_connect/expression/map_expression.py +293 -0
  23. snowflake/snowpark_connect/expression/map_extension.py +104 -0
  24. snowflake/snowpark_connect/expression/map_sql_expression.py +633 -0
  25. snowflake/snowpark_connect/expression/map_udf.py +142 -0
  26. snowflake/snowpark_connect/expression/map_unresolved_attribute.py +241 -0
  27. snowflake/snowpark_connect/expression/map_unresolved_extract_value.py +85 -0
  28. snowflake/snowpark_connect/expression/map_unresolved_function.py +9450 -0
  29. snowflake/snowpark_connect/expression/map_unresolved_star.py +218 -0
  30. snowflake/snowpark_connect/expression/map_update_fields.py +164 -0
  31. snowflake/snowpark_connect/expression/map_window_function.py +258 -0
  32. snowflake/snowpark_connect/expression/typer.py +125 -0
  33. snowflake/snowpark_connect/includes/__init__.py +0 -0
  34. snowflake/snowpark_connect/includes/jars/antlr4-runtime-4.9.3.jar +0 -0
  35. snowflake/snowpark_connect/includes/jars/commons-cli-1.5.0.jar +0 -0
  36. snowflake/snowpark_connect/includes/jars/commons-codec-1.16.1.jar +0 -0
  37. snowflake/snowpark_connect/includes/jars/commons-collections-3.2.2.jar +0 -0
  38. snowflake/snowpark_connect/includes/jars/commons-collections4-4.4.jar +0 -0
  39. snowflake/snowpark_connect/includes/jars/commons-compiler-3.1.9.jar +0 -0
  40. snowflake/snowpark_connect/includes/jars/commons-compress-1.26.0.jar +0 -0
  41. snowflake/snowpark_connect/includes/jars/commons-crypto-1.1.0.jar +0 -0
  42. snowflake/snowpark_connect/includes/jars/commons-dbcp-1.4.jar +0 -0
  43. snowflake/snowpark_connect/includes/jars/commons-io-2.16.1.jar +0 -0
  44. snowflake/snowpark_connect/includes/jars/commons-lang-2.6.jar +0 -0
  45. snowflake/snowpark_connect/includes/jars/commons-lang3-3.12.0.jar +0 -0
  46. snowflake/snowpark_connect/includes/jars/commons-logging-1.1.3.jar +0 -0
  47. snowflake/snowpark_connect/includes/jars/commons-math3-3.6.1.jar +0 -0
  48. snowflake/snowpark_connect/includes/jars/commons-pool-1.5.4.jar +0 -0
  49. snowflake/snowpark_connect/includes/jars/commons-text-1.10.0.jar +0 -0
  50. snowflake/snowpark_connect/includes/jars/hadoop-client-api-3.3.4.jar +0 -0
  51. snowflake/snowpark_connect/includes/jars/jackson-annotations-2.15.2.jar +0 -0
  52. snowflake/snowpark_connect/includes/jars/jackson-core-2.15.2.jar +0 -0
  53. snowflake/snowpark_connect/includes/jars/jackson-core-asl-1.9.13.jar +0 -0
  54. snowflake/snowpark_connect/includes/jars/jackson-databind-2.15.2.jar +0 -0
  55. snowflake/snowpark_connect/includes/jars/jackson-dataformat-yaml-2.15.2.jar +0 -0
  56. snowflake/snowpark_connect/includes/jars/jackson-datatype-jsr310-2.15.2.jar +0 -0
  57. snowflake/snowpark_connect/includes/jars/jackson-mapper-asl-1.9.13.jar +0 -0
  58. snowflake/snowpark_connect/includes/jars/jackson-module-scala_2.12-2.15.2.jar +0 -0
  59. snowflake/snowpark_connect/includes/jars/json4s-ast_2.12-3.7.0-M11.jar +0 -0
  60. snowflake/snowpark_connect/includes/jars/json4s-core_2.12-3.7.0-M11.jar +0 -0
  61. snowflake/snowpark_connect/includes/jars/json4s-jackson_2.12-3.7.0-M11.jar +0 -0
  62. snowflake/snowpark_connect/includes/jars/json4s-scalap_2.12-3.7.0-M11.jar +0 -0
  63. snowflake/snowpark_connect/includes/jars/kryo-shaded-4.0.2.jar +0 -0
  64. snowflake/snowpark_connect/includes/jars/log4j-1.2-api-2.20.0.jar +0 -0
  65. snowflake/snowpark_connect/includes/jars/log4j-api-2.20.0.jar +0 -0
  66. snowflake/snowpark_connect/includes/jars/log4j-core-2.20.0.jar +0 -0
  67. snowflake/snowpark_connect/includes/jars/log4j-slf4j2-impl-2.20.0.jar +0 -0
  68. snowflake/snowpark_connect/includes/jars/paranamer-2.8.jar +0 -0
  69. snowflake/snowpark_connect/includes/jars/scala-collection-compat_2.12-2.7.0.jar +0 -0
  70. snowflake/snowpark_connect/includes/jars/scala-compiler-2.12.18.jar +0 -0
  71. snowflake/snowpark_connect/includes/jars/scala-library-2.12.18.jar +0 -0
  72. snowflake/snowpark_connect/includes/jars/scala-parser-combinators_2.12-2.3.0.jar +0 -0
  73. snowflake/snowpark_connect/includes/jars/scala-reflect-2.12.18.jar +0 -0
  74. snowflake/snowpark_connect/includes/jars/scala-xml_2.12-2.1.0.jar +0 -0
  75. snowflake/snowpark_connect/includes/jars/slf4j-api-2.0.7.jar +0 -0
  76. snowflake/snowpark_connect/includes/jars/spark-catalyst_2.12-3.5.6.jar +0 -0
  77. snowflake/snowpark_connect/includes/jars/spark-common-utils_2.12-3.5.6.jar +0 -0
  78. snowflake/snowpark_connect/includes/jars/spark-core_2.12-3.5.6.jar +0 -0
  79. snowflake/snowpark_connect/includes/jars/spark-graphx_2.12-3.5.6.jar +0 -0
  80. snowflake/snowpark_connect/includes/jars/spark-hive-thriftserver_2.12-3.5.6.jar +0 -0
  81. snowflake/snowpark_connect/includes/jars/spark-hive_2.12-3.5.6.jar +0 -0
  82. snowflake/snowpark_connect/includes/jars/spark-kubernetes_2.12-3.5.6.jar +0 -0
  83. snowflake/snowpark_connect/includes/jars/spark-kvstore_2.12-3.5.6.jar +0 -0
  84. snowflake/snowpark_connect/includes/jars/spark-launcher_2.12-3.5.6.jar +0 -0
  85. snowflake/snowpark_connect/includes/jars/spark-mesos_2.12-3.5.6.jar +0 -0
  86. snowflake/snowpark_connect/includes/jars/spark-mllib-local_2.12-3.5.6.jar +0 -0
  87. snowflake/snowpark_connect/includes/jars/spark-mllib_2.12-3.5.6.jar +0 -0
  88. snowflake/snowpark_connect/includes/jars/spark-network-common_2.12-3.5.6.jar +0 -0
  89. snowflake/snowpark_connect/includes/jars/spark-network-shuffle_2.12-3.5.6.jar +0 -0
  90. snowflake/snowpark_connect/includes/jars/spark-repl_2.12-3.5.6.jar +0 -0
  91. snowflake/snowpark_connect/includes/jars/spark-sketch_2.12-3.5.6.jar +0 -0
  92. snowflake/snowpark_connect/includes/jars/spark-sql-api_2.12-3.5.6.jar +0 -0
  93. snowflake/snowpark_connect/includes/jars/spark-sql_2.12-3.5.6.jar +0 -0
  94. snowflake/snowpark_connect/includes/jars/spark-streaming_2.12-3.5.6.jar +0 -0
  95. snowflake/snowpark_connect/includes/jars/spark-tags_2.12-3.5.6.jar +0 -0
  96. snowflake/snowpark_connect/includes/jars/spark-unsafe_2.12-3.5.6.jar +0 -0
  97. snowflake/snowpark_connect/includes/jars/spark-yarn_2.12-3.5.6.jar +0 -0
  98. snowflake/snowpark_connect/includes/python/__init__.py +21 -0
  99. snowflake/snowpark_connect/includes/python/pyspark/__init__.py +173 -0
  100. snowflake/snowpark_connect/includes/python/pyspark/_globals.py +71 -0
  101. snowflake/snowpark_connect/includes/python/pyspark/_typing.pyi +43 -0
  102. snowflake/snowpark_connect/includes/python/pyspark/accumulators.py +341 -0
  103. snowflake/snowpark_connect/includes/python/pyspark/broadcast.py +383 -0
  104. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/__init__.py +8 -0
  105. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/cloudpickle.py +948 -0
  106. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/cloudpickle_fast.py +844 -0
  107. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/compat.py +18 -0
  108. snowflake/snowpark_connect/includes/python/pyspark/conf.py +276 -0
  109. snowflake/snowpark_connect/includes/python/pyspark/context.py +2601 -0
  110. snowflake/snowpark_connect/includes/python/pyspark/daemon.py +218 -0
  111. snowflake/snowpark_connect/includes/python/pyspark/errors/__init__.py +70 -0
  112. snowflake/snowpark_connect/includes/python/pyspark/errors/error_classes.py +889 -0
  113. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/__init__.py +16 -0
  114. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/base.py +228 -0
  115. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/captured.py +307 -0
  116. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/connect.py +190 -0
  117. snowflake/snowpark_connect/includes/python/pyspark/errors/tests/__init__.py +16 -0
  118. snowflake/snowpark_connect/includes/python/pyspark/errors/tests/test_errors.py +60 -0
  119. snowflake/snowpark_connect/includes/python/pyspark/errors/utils.py +116 -0
  120. snowflake/snowpark_connect/includes/python/pyspark/files.py +165 -0
  121. snowflake/snowpark_connect/includes/python/pyspark/find_spark_home.py +95 -0
  122. snowflake/snowpark_connect/includes/python/pyspark/install.py +203 -0
  123. snowflake/snowpark_connect/includes/python/pyspark/instrumentation_utils.py +190 -0
  124. snowflake/snowpark_connect/includes/python/pyspark/java_gateway.py +248 -0
  125. snowflake/snowpark_connect/includes/python/pyspark/join.py +118 -0
  126. snowflake/snowpark_connect/includes/python/pyspark/ml/__init__.py +71 -0
  127. snowflake/snowpark_connect/includes/python/pyspark/ml/_typing.pyi +84 -0
  128. snowflake/snowpark_connect/includes/python/pyspark/ml/base.py +414 -0
  129. snowflake/snowpark_connect/includes/python/pyspark/ml/classification.py +4332 -0
  130. snowflake/snowpark_connect/includes/python/pyspark/ml/clustering.py +2188 -0
  131. snowflake/snowpark_connect/includes/python/pyspark/ml/common.py +146 -0
  132. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/__init__.py +44 -0
  133. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/base.py +346 -0
  134. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/classification.py +382 -0
  135. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/evaluation.py +291 -0
  136. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/feature.py +258 -0
  137. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/functions.py +77 -0
  138. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/io_utils.py +335 -0
  139. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/pipeline.py +262 -0
  140. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/summarizer.py +120 -0
  141. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/tuning.py +579 -0
  142. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/util.py +173 -0
  143. snowflake/snowpark_connect/includes/python/pyspark/ml/deepspeed/__init__.py +16 -0
  144. snowflake/snowpark_connect/includes/python/pyspark/ml/deepspeed/deepspeed_distributor.py +165 -0
  145. snowflake/snowpark_connect/includes/python/pyspark/ml/deepspeed/tests/test_deepspeed_distributor.py +306 -0
  146. snowflake/snowpark_connect/includes/python/pyspark/ml/dl_util.py +150 -0
  147. snowflake/snowpark_connect/includes/python/pyspark/ml/evaluation.py +1166 -0
  148. snowflake/snowpark_connect/includes/python/pyspark/ml/feature.py +7474 -0
  149. snowflake/snowpark_connect/includes/python/pyspark/ml/fpm.py +543 -0
  150. snowflake/snowpark_connect/includes/python/pyspark/ml/functions.py +842 -0
  151. snowflake/snowpark_connect/includes/python/pyspark/ml/image.py +271 -0
  152. snowflake/snowpark_connect/includes/python/pyspark/ml/linalg/__init__.py +1382 -0
  153. snowflake/snowpark_connect/includes/python/pyspark/ml/model_cache.py +55 -0
  154. snowflake/snowpark_connect/includes/python/pyspark/ml/param/__init__.py +602 -0
  155. snowflake/snowpark_connect/includes/python/pyspark/ml/param/_shared_params_code_gen.py +368 -0
  156. snowflake/snowpark_connect/includes/python/pyspark/ml/param/shared.py +878 -0
  157. snowflake/snowpark_connect/includes/python/pyspark/ml/pipeline.py +451 -0
  158. snowflake/snowpark_connect/includes/python/pyspark/ml/recommendation.py +748 -0
  159. snowflake/snowpark_connect/includes/python/pyspark/ml/regression.py +3335 -0
  160. snowflake/snowpark_connect/includes/python/pyspark/ml/stat.py +523 -0
  161. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/__init__.py +16 -0
  162. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_classification.py +53 -0
  163. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_evaluation.py +50 -0
  164. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_feature.py +43 -0
  165. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_function.py +114 -0
  166. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_pipeline.py +47 -0
  167. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_summarizer.py +43 -0
  168. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_tuning.py +46 -0
  169. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_classification.py +238 -0
  170. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_evaluation.py +194 -0
  171. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_feature.py +156 -0
  172. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_pipeline.py +184 -0
  173. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_summarizer.py +78 -0
  174. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_tuning.py +292 -0
  175. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_parity_torch_data_loader.py +50 -0
  176. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_parity_torch_distributor.py +152 -0
  177. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_algorithms.py +456 -0
  178. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_base.py +96 -0
  179. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_dl_util.py +186 -0
  180. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_evaluation.py +77 -0
  181. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_feature.py +401 -0
  182. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_functions.py +528 -0
  183. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_image.py +82 -0
  184. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_linalg.py +409 -0
  185. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_model_cache.py +55 -0
  186. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_param.py +441 -0
  187. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_persistence.py +546 -0
  188. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_pipeline.py +71 -0
  189. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_stat.py +52 -0
  190. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_training_summary.py +494 -0
  191. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_util.py +85 -0
  192. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_wrapper.py +138 -0
  193. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/__init__.py +16 -0
  194. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_basic.py +151 -0
  195. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_nested.py +97 -0
  196. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_pipeline.py +143 -0
  197. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tuning.py +551 -0
  198. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_basic.py +137 -0
  199. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_nested.py +96 -0
  200. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_pipeline.py +142 -0
  201. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/__init__.py +16 -0
  202. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/data.py +100 -0
  203. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/distributor.py +1133 -0
  204. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/log_communication.py +198 -0
  205. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/__init__.py +16 -0
  206. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_data_loader.py +137 -0
  207. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_distributor.py +561 -0
  208. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_log_communication.py +172 -0
  209. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/torch_run_process_wrapper.py +83 -0
  210. snowflake/snowpark_connect/includes/python/pyspark/ml/tree.py +434 -0
  211. snowflake/snowpark_connect/includes/python/pyspark/ml/tuning.py +1741 -0
  212. snowflake/snowpark_connect/includes/python/pyspark/ml/util.py +749 -0
  213. snowflake/snowpark_connect/includes/python/pyspark/ml/wrapper.py +465 -0
  214. snowflake/snowpark_connect/includes/python/pyspark/mllib/__init__.py +44 -0
  215. snowflake/snowpark_connect/includes/python/pyspark/mllib/_typing.pyi +33 -0
  216. snowflake/snowpark_connect/includes/python/pyspark/mllib/classification.py +989 -0
  217. snowflake/snowpark_connect/includes/python/pyspark/mllib/clustering.py +1318 -0
  218. snowflake/snowpark_connect/includes/python/pyspark/mllib/common.py +174 -0
  219. snowflake/snowpark_connect/includes/python/pyspark/mllib/evaluation.py +691 -0
  220. snowflake/snowpark_connect/includes/python/pyspark/mllib/feature.py +1085 -0
  221. snowflake/snowpark_connect/includes/python/pyspark/mllib/fpm.py +233 -0
  222. snowflake/snowpark_connect/includes/python/pyspark/mllib/linalg/__init__.py +1653 -0
  223. snowflake/snowpark_connect/includes/python/pyspark/mllib/linalg/distributed.py +1662 -0
  224. snowflake/snowpark_connect/includes/python/pyspark/mllib/random.py +698 -0
  225. snowflake/snowpark_connect/includes/python/pyspark/mllib/recommendation.py +389 -0
  226. snowflake/snowpark_connect/includes/python/pyspark/mllib/regression.py +1067 -0
  227. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/KernelDensity.py +59 -0
  228. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/__init__.py +34 -0
  229. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/_statistics.py +409 -0
  230. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/distribution.py +39 -0
  231. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/test.py +86 -0
  232. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/__init__.py +16 -0
  233. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_algorithms.py +353 -0
  234. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_feature.py +192 -0
  235. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_linalg.py +680 -0
  236. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_stat.py +206 -0
  237. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_streaming_algorithms.py +471 -0
  238. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_util.py +108 -0
  239. snowflake/snowpark_connect/includes/python/pyspark/mllib/tree.py +888 -0
  240. snowflake/snowpark_connect/includes/python/pyspark/mllib/util.py +659 -0
  241. snowflake/snowpark_connect/includes/python/pyspark/pandas/__init__.py +165 -0
  242. snowflake/snowpark_connect/includes/python/pyspark/pandas/_typing.py +52 -0
  243. snowflake/snowpark_connect/includes/python/pyspark/pandas/accessors.py +989 -0
  244. snowflake/snowpark_connect/includes/python/pyspark/pandas/base.py +1804 -0
  245. snowflake/snowpark_connect/includes/python/pyspark/pandas/categorical.py +822 -0
  246. snowflake/snowpark_connect/includes/python/pyspark/pandas/config.py +539 -0
  247. snowflake/snowpark_connect/includes/python/pyspark/pandas/correlation.py +262 -0
  248. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/__init__.py +16 -0
  249. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/base.py +519 -0
  250. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/binary_ops.py +98 -0
  251. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/boolean_ops.py +426 -0
  252. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/categorical_ops.py +141 -0
  253. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/complex_ops.py +145 -0
  254. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/date_ops.py +127 -0
  255. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/datetime_ops.py +171 -0
  256. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/null_ops.py +83 -0
  257. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/num_ops.py +588 -0
  258. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/string_ops.py +154 -0
  259. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/timedelta_ops.py +101 -0
  260. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/udt_ops.py +29 -0
  261. snowflake/snowpark_connect/includes/python/pyspark/pandas/datetimes.py +891 -0
  262. snowflake/snowpark_connect/includes/python/pyspark/pandas/exceptions.py +150 -0
  263. snowflake/snowpark_connect/includes/python/pyspark/pandas/extensions.py +388 -0
  264. snowflake/snowpark_connect/includes/python/pyspark/pandas/frame.py +13738 -0
  265. snowflake/snowpark_connect/includes/python/pyspark/pandas/generic.py +3560 -0
  266. snowflake/snowpark_connect/includes/python/pyspark/pandas/groupby.py +4448 -0
  267. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/__init__.py +21 -0
  268. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/base.py +2783 -0
  269. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/category.py +773 -0
  270. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/datetimes.py +843 -0
  271. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/multi.py +1323 -0
  272. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/numeric.py +210 -0
  273. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/timedelta.py +197 -0
  274. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexing.py +1862 -0
  275. snowflake/snowpark_connect/includes/python/pyspark/pandas/internal.py +1680 -0
  276. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/__init__.py +48 -0
  277. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/common.py +76 -0
  278. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/frame.py +63 -0
  279. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/general_functions.py +43 -0
  280. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/groupby.py +93 -0
  281. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/indexes.py +184 -0
  282. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/resample.py +101 -0
  283. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/scalars.py +29 -0
  284. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/series.py +69 -0
  285. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/window.py +168 -0
  286. snowflake/snowpark_connect/includes/python/pyspark/pandas/mlflow.py +238 -0
  287. snowflake/snowpark_connect/includes/python/pyspark/pandas/namespace.py +3807 -0
  288. snowflake/snowpark_connect/includes/python/pyspark/pandas/numpy_compat.py +260 -0
  289. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/__init__.py +17 -0
  290. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/core.py +1213 -0
  291. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/matplotlib.py +928 -0
  292. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/plotly.py +261 -0
  293. snowflake/snowpark_connect/includes/python/pyspark/pandas/resample.py +816 -0
  294. snowflake/snowpark_connect/includes/python/pyspark/pandas/series.py +7440 -0
  295. snowflake/snowpark_connect/includes/python/pyspark/pandas/sql_formatter.py +308 -0
  296. snowflake/snowpark_connect/includes/python/pyspark/pandas/sql_processor.py +394 -0
  297. snowflake/snowpark_connect/includes/python/pyspark/pandas/strings.py +2371 -0
  298. snowflake/snowpark_connect/includes/python/pyspark/pandas/supported_api_gen.py +378 -0
  299. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/__init__.py +16 -0
  300. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/__init__.py +16 -0
  301. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_any_all.py +177 -0
  302. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_apply_func.py +575 -0
  303. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_binary_ops.py +235 -0
  304. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_combine.py +653 -0
  305. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_compute.py +463 -0
  306. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_corrwith.py +86 -0
  307. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_cov.py +151 -0
  308. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_cumulative.py +139 -0
  309. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_describe.py +458 -0
  310. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_eval.py +86 -0
  311. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_melt.py +202 -0
  312. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_missing_data.py +520 -0
  313. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_pivot.py +361 -0
  314. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/__init__.py +16 -0
  315. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/__init__.py +16 -0
  316. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_any_all.py +40 -0
  317. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_apply_func.py +42 -0
  318. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_binary_ops.py +40 -0
  319. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_combine.py +37 -0
  320. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_compute.py +60 -0
  321. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_corrwith.py +40 -0
  322. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_cov.py +40 -0
  323. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_cumulative.py +90 -0
  324. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_describe.py +40 -0
  325. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_eval.py +40 -0
  326. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_melt.py +40 -0
  327. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_missing_data.py +42 -0
  328. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_pivot.py +37 -0
  329. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/__init__.py +16 -0
  330. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_base.py +36 -0
  331. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_binary_ops.py +42 -0
  332. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_boolean_ops.py +47 -0
  333. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_categorical_ops.py +55 -0
  334. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_complex_ops.py +40 -0
  335. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_date_ops.py +47 -0
  336. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_datetime_ops.py +47 -0
  337. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_null_ops.py +42 -0
  338. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_arithmetic.py +43 -0
  339. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_ops.py +47 -0
  340. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_reverse.py +43 -0
  341. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_string_ops.py +47 -0
  342. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_timedelta_ops.py +47 -0
  343. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_udt_ops.py +40 -0
  344. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/testing_utils.py +226 -0
  345. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/__init__.py +16 -0
  346. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_align.py +39 -0
  347. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_basic_slow.py +55 -0
  348. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_cov_corrwith.py +39 -0
  349. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_dot_frame.py +39 -0
  350. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_dot_series.py +39 -0
  351. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_index.py +39 -0
  352. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_series.py +39 -0
  353. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_setitem_frame.py +43 -0
  354. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_setitem_series.py +43 -0
  355. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/__init__.py +16 -0
  356. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_attrs.py +40 -0
  357. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_constructor.py +39 -0
  358. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_conversion.py +42 -0
  359. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_reindexing.py +42 -0
  360. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_reshaping.py +37 -0
  361. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_spark.py +40 -0
  362. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_take.py +42 -0
  363. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_time_series.py +48 -0
  364. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_truncate.py +40 -0
  365. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/__init__.py +16 -0
  366. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_aggregate.py +40 -0
  367. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_apply_func.py +41 -0
  368. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_cumulative.py +67 -0
  369. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_describe.py +40 -0
  370. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_groupby.py +55 -0
  371. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_head_tail.py +40 -0
  372. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_index.py +38 -0
  373. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_missing_data.py +55 -0
  374. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py +39 -0
  375. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_stat.py +38 -0
  376. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/__init__.py +16 -0
  377. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_align.py +40 -0
  378. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_base.py +50 -0
  379. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_category.py +73 -0
  380. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_datetime.py +39 -0
  381. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_indexing.py +40 -0
  382. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_reindex.py +40 -0
  383. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_rename.py +40 -0
  384. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_reset_index.py +48 -0
  385. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_timedelta.py +39 -0
  386. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/io/__init__.py +16 -0
  387. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/io/test_parity_io.py +40 -0
  388. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/__init__.py +16 -0
  389. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot.py +45 -0
  390. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot_matplotlib.py +45 -0
  391. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot_plotly.py +49 -0
  392. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot.py +37 -0
  393. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot_matplotlib.py +53 -0
  394. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot_plotly.py +45 -0
  395. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/__init__.py +16 -0
  396. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_all_any.py +38 -0
  397. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_arg_ops.py +37 -0
  398. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_as_of.py +37 -0
  399. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_as_type.py +38 -0
  400. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_compute.py +37 -0
  401. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_conversion.py +40 -0
  402. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_cumulative.py +40 -0
  403. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_index.py +38 -0
  404. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_missing_data.py +40 -0
  405. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_series.py +37 -0
  406. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_sort.py +38 -0
  407. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_stat.py +38 -0
  408. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_categorical.py +66 -0
  409. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_config.py +37 -0
  410. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_csv.py +37 -0
  411. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_dataframe_conversion.py +42 -0
  412. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_dataframe_spark_io.py +39 -0
  413. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_default_index.py +49 -0
  414. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ewm.py +37 -0
  415. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_expanding.py +39 -0
  416. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_extension.py +49 -0
  417. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_frame_spark.py +53 -0
  418. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_generic_functions.py +43 -0
  419. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_indexing.py +49 -0
  420. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_indexops_spark.py +39 -0
  421. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_internal.py +41 -0
  422. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_namespace.py +39 -0
  423. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_numpy_compat.py +60 -0
  424. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames.py +48 -0
  425. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby.py +39 -0
  426. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_expanding.py +44 -0
  427. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_rolling.py +84 -0
  428. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_repr.py +37 -0
  429. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_resample.py +45 -0
  430. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_reshape.py +39 -0
  431. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_rolling.py +39 -0
  432. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_scalars.py +37 -0
  433. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_conversion.py +39 -0
  434. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_datetime.py +39 -0
  435. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_string.py +39 -0
  436. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_spark_functions.py +39 -0
  437. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_sql.py +43 -0
  438. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_stats.py +37 -0
  439. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_typedef.py +36 -0
  440. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_utils.py +37 -0
  441. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_window.py +39 -0
  442. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/__init__.py +16 -0
  443. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_base.py +107 -0
  444. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_binary_ops.py +224 -0
  445. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py +825 -0
  446. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_categorical_ops.py +562 -0
  447. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_complex_ops.py +368 -0
  448. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_date_ops.py +257 -0
  449. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_datetime_ops.py +260 -0
  450. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_null_ops.py +178 -0
  451. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_arithmetic.py +184 -0
  452. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py +497 -0
  453. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_reverse.py +140 -0
  454. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_string_ops.py +354 -0
  455. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_timedelta_ops.py +219 -0
  456. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_udt_ops.py +192 -0
  457. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/testing_utils.py +228 -0
  458. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/__init__.py +16 -0
  459. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_align.py +118 -0
  460. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_basic_slow.py +198 -0
  461. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_cov_corrwith.py +181 -0
  462. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_dot_frame.py +103 -0
  463. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_dot_series.py +141 -0
  464. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_index.py +109 -0
  465. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_series.py +136 -0
  466. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_setitem_frame.py +125 -0
  467. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_setitem_series.py +217 -0
  468. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/__init__.py +16 -0
  469. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_attrs.py +384 -0
  470. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_constructor.py +598 -0
  471. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_conversion.py +73 -0
  472. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_reindexing.py +869 -0
  473. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_reshaping.py +487 -0
  474. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_spark.py +309 -0
  475. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_take.py +156 -0
  476. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_time_series.py +149 -0
  477. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_truncate.py +163 -0
  478. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/__init__.py +16 -0
  479. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_aggregate.py +311 -0
  480. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_apply_func.py +524 -0
  481. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_cumulative.py +419 -0
  482. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_describe.py +144 -0
  483. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_groupby.py +979 -0
  484. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_head_tail.py +234 -0
  485. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_index.py +206 -0
  486. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_missing_data.py +421 -0
  487. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_split_apply.py +187 -0
  488. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_stat.py +397 -0
  489. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/__init__.py +16 -0
  490. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_align.py +100 -0
  491. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_base.py +2743 -0
  492. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_category.py +484 -0
  493. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_datetime.py +276 -0
  494. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_indexing.py +432 -0
  495. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_reindex.py +310 -0
  496. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_rename.py +257 -0
  497. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_reset_index.py +160 -0
  498. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_timedelta.py +128 -0
  499. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/io/__init__.py +16 -0
  500. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/io/test_io.py +137 -0
  501. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/__init__.py +16 -0
  502. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot.py +170 -0
  503. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot_matplotlib.py +547 -0
  504. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot_plotly.py +285 -0
  505. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot.py +106 -0
  506. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot_matplotlib.py +409 -0
  507. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot_plotly.py +247 -0
  508. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/__init__.py +16 -0
  509. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_all_any.py +105 -0
  510. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_arg_ops.py +197 -0
  511. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_as_of.py +137 -0
  512. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_as_type.py +227 -0
  513. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_compute.py +634 -0
  514. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_conversion.py +88 -0
  515. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_cumulative.py +139 -0
  516. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_index.py +475 -0
  517. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_missing_data.py +265 -0
  518. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_series.py +818 -0
  519. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_sort.py +162 -0
  520. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_stat.py +780 -0
  521. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_categorical.py +741 -0
  522. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_config.py +160 -0
  523. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_csv.py +453 -0
  524. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_dataframe_conversion.py +281 -0
  525. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_dataframe_spark_io.py +487 -0
  526. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_default_index.py +109 -0
  527. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ewm.py +434 -0
  528. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_expanding.py +253 -0
  529. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_extension.py +152 -0
  530. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_frame_spark.py +162 -0
  531. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_generic_functions.py +234 -0
  532. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_indexing.py +1339 -0
  533. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_indexops_spark.py +82 -0
  534. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_internal.py +124 -0
  535. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_namespace.py +638 -0
  536. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_numpy_compat.py +200 -0
  537. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames.py +1355 -0
  538. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py +655 -0
  539. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_expanding.py +113 -0
  540. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_rolling.py +118 -0
  541. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_repr.py +192 -0
  542. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_resample.py +346 -0
  543. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_reshape.py +495 -0
  544. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_rolling.py +263 -0
  545. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_scalars.py +59 -0
  546. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_conversion.py +85 -0
  547. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_datetime.py +364 -0
  548. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_string.py +362 -0
  549. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_spark_functions.py +46 -0
  550. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_sql.py +123 -0
  551. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_stats.py +581 -0
  552. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_typedef.py +447 -0
  553. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_utils.py +301 -0
  554. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_window.py +465 -0
  555. snowflake/snowpark_connect/includes/python/pyspark/pandas/typedef/__init__.py +18 -0
  556. snowflake/snowpark_connect/includes/python/pyspark/pandas/typedef/typehints.py +874 -0
  557. snowflake/snowpark_connect/includes/python/pyspark/pandas/usage_logging/__init__.py +143 -0
  558. snowflake/snowpark_connect/includes/python/pyspark/pandas/usage_logging/usage_logger.py +132 -0
  559. snowflake/snowpark_connect/includes/python/pyspark/pandas/utils.py +1063 -0
  560. snowflake/snowpark_connect/includes/python/pyspark/pandas/window.py +2702 -0
  561. snowflake/snowpark_connect/includes/python/pyspark/profiler.py +489 -0
  562. snowflake/snowpark_connect/includes/python/pyspark/py.typed +1 -0
  563. snowflake/snowpark_connect/includes/python/pyspark/python/pyspark/shell.py +123 -0
  564. snowflake/snowpark_connect/includes/python/pyspark/rdd.py +5518 -0
  565. snowflake/snowpark_connect/includes/python/pyspark/rddsampler.py +115 -0
  566. snowflake/snowpark_connect/includes/python/pyspark/resource/__init__.py +38 -0
  567. snowflake/snowpark_connect/includes/python/pyspark/resource/information.py +69 -0
  568. snowflake/snowpark_connect/includes/python/pyspark/resource/profile.py +317 -0
  569. snowflake/snowpark_connect/includes/python/pyspark/resource/requests.py +539 -0
  570. snowflake/snowpark_connect/includes/python/pyspark/resource/tests/__init__.py +16 -0
  571. snowflake/snowpark_connect/includes/python/pyspark/resource/tests/test_resources.py +83 -0
  572. snowflake/snowpark_connect/includes/python/pyspark/resultiterable.py +45 -0
  573. snowflake/snowpark_connect/includes/python/pyspark/serializers.py +681 -0
  574. snowflake/snowpark_connect/includes/python/pyspark/shell.py +123 -0
  575. snowflake/snowpark_connect/includes/python/pyspark/shuffle.py +854 -0
  576. snowflake/snowpark_connect/includes/python/pyspark/sql/__init__.py +75 -0
  577. snowflake/snowpark_connect/includes/python/pyspark/sql/_typing.pyi +80 -0
  578. snowflake/snowpark_connect/includes/python/pyspark/sql/avro/__init__.py +18 -0
  579. snowflake/snowpark_connect/includes/python/pyspark/sql/avro/functions.py +188 -0
  580. snowflake/snowpark_connect/includes/python/pyspark/sql/catalog.py +1270 -0
  581. snowflake/snowpark_connect/includes/python/pyspark/sql/column.py +1431 -0
  582. snowflake/snowpark_connect/includes/python/pyspark/sql/conf.py +99 -0
  583. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/__init__.py +18 -0
  584. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/_typing.py +90 -0
  585. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/avro/__init__.py +18 -0
  586. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/avro/functions.py +107 -0
  587. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/catalog.py +356 -0
  588. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/__init__.py +22 -0
  589. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/artifact.py +412 -0
  590. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/core.py +1689 -0
  591. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/reattach.py +340 -0
  592. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/column.py +514 -0
  593. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/conf.py +128 -0
  594. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/conversion.py +490 -0
  595. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/dataframe.py +2172 -0
  596. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/expressions.py +1056 -0
  597. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/functions.py +3937 -0
  598. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/group.py +418 -0
  599. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/plan.py +2289 -0
  600. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/__init__.py +25 -0
  601. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/base_pb2.py +203 -0
  602. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/base_pb2.pyi +2718 -0
  603. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/base_pb2_grpc.py +423 -0
  604. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/catalog_pb2.py +109 -0
  605. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/catalog_pb2.pyi +1130 -0
  606. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/commands_pb2.py +141 -0
  607. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/commands_pb2.pyi +1766 -0
  608. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/common_pb2.py +47 -0
  609. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/common_pb2.pyi +123 -0
  610. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/example_plugins_pb2.py +53 -0
  611. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/example_plugins_pb2.pyi +112 -0
  612. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/expressions_pb2.py +107 -0
  613. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/expressions_pb2.pyi +1507 -0
  614. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/relations_pb2.py +195 -0
  615. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/relations_pb2.pyi +3613 -0
  616. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/types_pb2.py +95 -0
  617. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/types_pb2.pyi +980 -0
  618. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/protobuf/__init__.py +18 -0
  619. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/protobuf/functions.py +166 -0
  620. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/readwriter.py +861 -0
  621. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/session.py +952 -0
  622. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/__init__.py +22 -0
  623. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/query.py +295 -0
  624. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/readwriter.py +618 -0
  625. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/__init__.py +18 -0
  626. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/foreach_batch_worker.py +87 -0
  627. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/listener_worker.py +100 -0
  628. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/types.py +301 -0
  629. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/udf.py +296 -0
  630. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/udtf.py +200 -0
  631. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/utils.py +58 -0
  632. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/window.py +266 -0
  633. snowflake/snowpark_connect/includes/python/pyspark/sql/context.py +818 -0
  634. snowflake/snowpark_connect/includes/python/pyspark/sql/dataframe.py +5973 -0
  635. snowflake/snowpark_connect/includes/python/pyspark/sql/functions.py +15889 -0
  636. snowflake/snowpark_connect/includes/python/pyspark/sql/group.py +547 -0
  637. snowflake/snowpark_connect/includes/python/pyspark/sql/observation.py +152 -0
  638. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/__init__.py +21 -0
  639. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/__init__.pyi +344 -0
  640. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/protocols/__init__.pyi +17 -0
  641. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/protocols/frame.pyi +20 -0
  642. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/protocols/series.pyi +20 -0
  643. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/conversion.py +671 -0
  644. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/functions.py +480 -0
  645. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/functions.pyi +132 -0
  646. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/group_ops.py +523 -0
  647. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/map_ops.py +216 -0
  648. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/serializers.py +1019 -0
  649. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/typehints.py +172 -0
  650. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/types.py +972 -0
  651. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/utils.py +86 -0
  652. snowflake/snowpark_connect/includes/python/pyspark/sql/protobuf/__init__.py +18 -0
  653. snowflake/snowpark_connect/includes/python/pyspark/sql/protobuf/functions.py +334 -0
  654. snowflake/snowpark_connect/includes/python/pyspark/sql/readwriter.py +2159 -0
  655. snowflake/snowpark_connect/includes/python/pyspark/sql/session.py +2088 -0
  656. snowflake/snowpark_connect/includes/python/pyspark/sql/sql_formatter.py +84 -0
  657. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/__init__.py +21 -0
  658. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/listener.py +1050 -0
  659. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/query.py +746 -0
  660. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/readwriter.py +1652 -0
  661. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/state.py +288 -0
  662. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/__init__.py +16 -0
  663. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/__init__.py +16 -0
  664. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/__init__.py +16 -0
  665. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/test_artifact.py +420 -0
  666. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/test_client.py +358 -0
  667. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/__init__.py +16 -0
  668. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_foreach.py +36 -0
  669. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_foreach_batch.py +44 -0
  670. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_listener.py +116 -0
  671. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_streaming.py +35 -0
  672. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_basic.py +3612 -0
  673. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_column.py +1042 -0
  674. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_function.py +2381 -0
  675. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_plan.py +1060 -0
  676. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow.py +163 -0
  677. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow_map.py +38 -0
  678. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow_python_udf.py +48 -0
  679. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_catalog.py +36 -0
  680. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_column.py +55 -0
  681. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_conf.py +36 -0
  682. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_dataframe.py +96 -0
  683. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_datasources.py +44 -0
  684. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_errors.py +36 -0
  685. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_functions.py +59 -0
  686. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_group.py +36 -0
  687. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_cogrouped_map.py +59 -0
  688. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_grouped_map.py +74 -0
  689. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_grouped_map_with_state.py +62 -0
  690. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_map.py +58 -0
  691. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf.py +70 -0
  692. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_grouped_agg.py +50 -0
  693. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_scalar.py +68 -0
  694. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_window.py +40 -0
  695. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_readwriter.py +46 -0
  696. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_serde.py +44 -0
  697. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_types.py +100 -0
  698. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_udf.py +100 -0
  699. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_udtf.py +163 -0
  700. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_session.py +181 -0
  701. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_utils.py +42 -0
  702. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/__init__.py +16 -0
  703. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_cogrouped_map.py +623 -0
  704. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_grouped_map.py +869 -0
  705. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_grouped_map_with_state.py +342 -0
  706. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_map.py +436 -0
  707. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf.py +363 -0
  708. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_grouped_agg.py +592 -0
  709. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_scalar.py +1503 -0
  710. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_typehints.py +392 -0
  711. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_typehints_with_future_annotations.py +375 -0
  712. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_window.py +411 -0
  713. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/__init__.py +16 -0
  714. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming.py +401 -0
  715. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_foreach.py +295 -0
  716. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_foreach_batch.py +106 -0
  717. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_listener.py +558 -0
  718. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow.py +1346 -0
  719. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow_map.py +182 -0
  720. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow_python_udf.py +202 -0
  721. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_catalog.py +503 -0
  722. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_column.py +225 -0
  723. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_conf.py +83 -0
  724. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_context.py +201 -0
  725. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_dataframe.py +1931 -0
  726. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_datasources.py +256 -0
  727. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_errors.py +69 -0
  728. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_functions.py +1349 -0
  729. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_group.py +53 -0
  730. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_pandas_sqlmetrics.py +68 -0
  731. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_readwriter.py +283 -0
  732. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_serde.py +155 -0
  733. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_session.py +412 -0
  734. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_types.py +1581 -0
  735. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udf.py +961 -0
  736. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udf_profiler.py +165 -0
  737. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udtf.py +1456 -0
  738. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_utils.py +1686 -0
  739. snowflake/snowpark_connect/includes/python/pyspark/sql/types.py +2558 -0
  740. snowflake/snowpark_connect/includes/python/pyspark/sql/udf.py +714 -0
  741. snowflake/snowpark_connect/includes/python/pyspark/sql/udtf.py +325 -0
  742. snowflake/snowpark_connect/includes/python/pyspark/sql/utils.py +339 -0
  743. snowflake/snowpark_connect/includes/python/pyspark/sql/window.py +492 -0
  744. snowflake/snowpark_connect/includes/python/pyspark/statcounter.py +165 -0
  745. snowflake/snowpark_connect/includes/python/pyspark/status.py +112 -0
  746. snowflake/snowpark_connect/includes/python/pyspark/storagelevel.py +97 -0
  747. snowflake/snowpark_connect/includes/python/pyspark/streaming/__init__.py +22 -0
  748. snowflake/snowpark_connect/includes/python/pyspark/streaming/context.py +471 -0
  749. snowflake/snowpark_connect/includes/python/pyspark/streaming/dstream.py +933 -0
  750. snowflake/snowpark_connect/includes/python/pyspark/streaming/kinesis.py +205 -0
  751. snowflake/snowpark_connect/includes/python/pyspark/streaming/listener.py +83 -0
  752. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/__init__.py +16 -0
  753. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_context.py +184 -0
  754. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_dstream.py +706 -0
  755. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_kinesis.py +118 -0
  756. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_listener.py +160 -0
  757. snowflake/snowpark_connect/includes/python/pyspark/streaming/util.py +168 -0
  758. snowflake/snowpark_connect/includes/python/pyspark/taskcontext.py +502 -0
  759. snowflake/snowpark_connect/includes/python/pyspark/testing/__init__.py +21 -0
  760. snowflake/snowpark_connect/includes/python/pyspark/testing/connectutils.py +199 -0
  761. snowflake/snowpark_connect/includes/python/pyspark/testing/mllibutils.py +30 -0
  762. snowflake/snowpark_connect/includes/python/pyspark/testing/mlutils.py +275 -0
  763. snowflake/snowpark_connect/includes/python/pyspark/testing/objects.py +121 -0
  764. snowflake/snowpark_connect/includes/python/pyspark/testing/pandasutils.py +714 -0
  765. snowflake/snowpark_connect/includes/python/pyspark/testing/sqlutils.py +168 -0
  766. snowflake/snowpark_connect/includes/python/pyspark/testing/streamingutils.py +178 -0
  767. snowflake/snowpark_connect/includes/python/pyspark/testing/utils.py +636 -0
  768. snowflake/snowpark_connect/includes/python/pyspark/tests/__init__.py +16 -0
  769. snowflake/snowpark_connect/includes/python/pyspark/tests/test_appsubmit.py +306 -0
  770. snowflake/snowpark_connect/includes/python/pyspark/tests/test_broadcast.py +196 -0
  771. snowflake/snowpark_connect/includes/python/pyspark/tests/test_conf.py +44 -0
  772. snowflake/snowpark_connect/includes/python/pyspark/tests/test_context.py +346 -0
  773. snowflake/snowpark_connect/includes/python/pyspark/tests/test_daemon.py +89 -0
  774. snowflake/snowpark_connect/includes/python/pyspark/tests/test_install_spark.py +124 -0
  775. snowflake/snowpark_connect/includes/python/pyspark/tests/test_join.py +69 -0
  776. snowflake/snowpark_connect/includes/python/pyspark/tests/test_memory_profiler.py +167 -0
  777. snowflake/snowpark_connect/includes/python/pyspark/tests/test_pin_thread.py +194 -0
  778. snowflake/snowpark_connect/includes/python/pyspark/tests/test_profiler.py +168 -0
  779. snowflake/snowpark_connect/includes/python/pyspark/tests/test_rdd.py +939 -0
  780. snowflake/snowpark_connect/includes/python/pyspark/tests/test_rddbarrier.py +52 -0
  781. snowflake/snowpark_connect/includes/python/pyspark/tests/test_rddsampler.py +66 -0
  782. snowflake/snowpark_connect/includes/python/pyspark/tests/test_readwrite.py +368 -0
  783. snowflake/snowpark_connect/includes/python/pyspark/tests/test_serializers.py +257 -0
  784. snowflake/snowpark_connect/includes/python/pyspark/tests/test_shuffle.py +267 -0
  785. snowflake/snowpark_connect/includes/python/pyspark/tests/test_stage_sched.py +153 -0
  786. snowflake/snowpark_connect/includes/python/pyspark/tests/test_statcounter.py +130 -0
  787. snowflake/snowpark_connect/includes/python/pyspark/tests/test_taskcontext.py +350 -0
  788. snowflake/snowpark_connect/includes/python/pyspark/tests/test_util.py +97 -0
  789. snowflake/snowpark_connect/includes/python/pyspark/tests/test_worker.py +271 -0
  790. snowflake/snowpark_connect/includes/python/pyspark/traceback_utils.py +81 -0
  791. snowflake/snowpark_connect/includes/python/pyspark/util.py +416 -0
  792. snowflake/snowpark_connect/includes/python/pyspark/version.py +19 -0
  793. snowflake/snowpark_connect/includes/python/pyspark/worker.py +1307 -0
  794. snowflake/snowpark_connect/includes/python/pyspark/worker_util.py +46 -0
  795. snowflake/snowpark_connect/proto/__init__.py +10 -0
  796. snowflake/snowpark_connect/proto/control_pb2.py +35 -0
  797. snowflake/snowpark_connect/proto/control_pb2.pyi +38 -0
  798. snowflake/snowpark_connect/proto/control_pb2_grpc.py +183 -0
  799. snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.py +35 -0
  800. snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.pyi +53 -0
  801. snowflake/snowpark_connect/proto/snowflake_rdd_pb2.pyi +39 -0
  802. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.py +47 -0
  803. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.pyi +111 -0
  804. snowflake/snowpark_connect/relation/__init__.py +3 -0
  805. snowflake/snowpark_connect/relation/catalogs/__init__.py +12 -0
  806. snowflake/snowpark_connect/relation/catalogs/abstract_spark_catalog.py +287 -0
  807. snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py +467 -0
  808. snowflake/snowpark_connect/relation/catalogs/utils.py +51 -0
  809. snowflake/snowpark_connect/relation/io_utils.py +76 -0
  810. snowflake/snowpark_connect/relation/map_aggregate.py +322 -0
  811. snowflake/snowpark_connect/relation/map_catalog.py +151 -0
  812. snowflake/snowpark_connect/relation/map_column_ops.py +1068 -0
  813. snowflake/snowpark_connect/relation/map_crosstab.py +48 -0
  814. snowflake/snowpark_connect/relation/map_extension.py +412 -0
  815. snowflake/snowpark_connect/relation/map_join.py +341 -0
  816. snowflake/snowpark_connect/relation/map_local_relation.py +326 -0
  817. snowflake/snowpark_connect/relation/map_map_partitions.py +146 -0
  818. snowflake/snowpark_connect/relation/map_relation.py +253 -0
  819. snowflake/snowpark_connect/relation/map_row_ops.py +716 -0
  820. snowflake/snowpark_connect/relation/map_sample_by.py +35 -0
  821. snowflake/snowpark_connect/relation/map_show_string.py +50 -0
  822. snowflake/snowpark_connect/relation/map_sql.py +1874 -0
  823. snowflake/snowpark_connect/relation/map_stats.py +324 -0
  824. snowflake/snowpark_connect/relation/map_subquery_alias.py +32 -0
  825. snowflake/snowpark_connect/relation/map_udtf.py +288 -0
  826. snowflake/snowpark_connect/relation/read/__init__.py +7 -0
  827. snowflake/snowpark_connect/relation/read/jdbc_read_dbapi.py +668 -0
  828. snowflake/snowpark_connect/relation/read/map_read.py +367 -0
  829. snowflake/snowpark_connect/relation/read/map_read_csv.py +142 -0
  830. snowflake/snowpark_connect/relation/read/map_read_jdbc.py +108 -0
  831. snowflake/snowpark_connect/relation/read/map_read_json.py +344 -0
  832. snowflake/snowpark_connect/relation/read/map_read_parquet.py +194 -0
  833. snowflake/snowpark_connect/relation/read/map_read_socket.py +59 -0
  834. snowflake/snowpark_connect/relation/read/map_read_table.py +109 -0
  835. snowflake/snowpark_connect/relation/read/map_read_text.py +106 -0
  836. snowflake/snowpark_connect/relation/read/reader_config.py +399 -0
  837. snowflake/snowpark_connect/relation/read/utils.py +155 -0
  838. snowflake/snowpark_connect/relation/stage_locator.py +161 -0
  839. snowflake/snowpark_connect/relation/utils.py +219 -0
  840. snowflake/snowpark_connect/relation/write/__init__.py +3 -0
  841. snowflake/snowpark_connect/relation/write/jdbc_write_dbapi.py +339 -0
  842. snowflake/snowpark_connect/relation/write/map_write.py +436 -0
  843. snowflake/snowpark_connect/relation/write/map_write_jdbc.py +48 -0
  844. snowflake/snowpark_connect/resources/java_udfs-1.0-SNAPSHOT.jar +0 -0
  845. snowflake/snowpark_connect/resources_initializer.py +75 -0
  846. snowflake/snowpark_connect/server.py +1136 -0
  847. snowflake/snowpark_connect/start_server.py +32 -0
  848. snowflake/snowpark_connect/tcm.py +8 -0
  849. snowflake/snowpark_connect/type_mapping.py +1003 -0
  850. snowflake/snowpark_connect/typed_column.py +94 -0
  851. snowflake/snowpark_connect/utils/__init__.py +3 -0
  852. snowflake/snowpark_connect/utils/artifacts.py +48 -0
  853. snowflake/snowpark_connect/utils/attribute_handling.py +72 -0
  854. snowflake/snowpark_connect/utils/cache.py +84 -0
  855. snowflake/snowpark_connect/utils/concurrent.py +124 -0
  856. snowflake/snowpark_connect/utils/context.py +390 -0
  857. snowflake/snowpark_connect/utils/describe_query_cache.py +231 -0
  858. snowflake/snowpark_connect/utils/interrupt.py +85 -0
  859. snowflake/snowpark_connect/utils/io_utils.py +35 -0
  860. snowflake/snowpark_connect/utils/pandas_udtf_utils.py +117 -0
  861. snowflake/snowpark_connect/utils/profiling.py +47 -0
  862. snowflake/snowpark_connect/utils/session.py +180 -0
  863. snowflake/snowpark_connect/utils/snowpark_connect_logging.py +38 -0
  864. snowflake/snowpark_connect/utils/telemetry.py +513 -0
  865. snowflake/snowpark_connect/utils/udf_cache.py +392 -0
  866. snowflake/snowpark_connect/utils/udf_helper.py +328 -0
  867. snowflake/snowpark_connect/utils/udf_utils.py +310 -0
  868. snowflake/snowpark_connect/utils/udtf_helper.py +420 -0
  869. snowflake/snowpark_connect/utils/udtf_utils.py +799 -0
  870. snowflake/snowpark_connect/utils/xxhash64.py +247 -0
  871. snowflake/snowpark_connect/version.py +6 -0
  872. snowpark_connect-0.20.2.data/scripts/snowpark-connect +71 -0
  873. snowpark_connect-0.20.2.data/scripts/snowpark-session +11 -0
  874. snowpark_connect-0.20.2.data/scripts/snowpark-submit +354 -0
  875. snowpark_connect-0.20.2.dist-info/METADATA +37 -0
  876. snowpark_connect-0.20.2.dist-info/RECORD +879 -0
  877. snowpark_connect-0.20.2.dist-info/WHEEL +5 -0
  878. snowpark_connect-0.20.2.dist-info/licenses/LICENSE.txt +202 -0
  879. snowpark_connect-0.20.2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1662 @@
1
+ #
2
+ # Licensed to the Apache Software Foundation (ASF) under one or more
3
+ # contributor license agreements. See the NOTICE file distributed with
4
+ # this work for additional information regarding copyright ownership.
5
+ # The ASF licenses this file to You under the Apache License, Version 2.0
6
+ # (the "License"); you may not use this file except in compliance with
7
+ # the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ #
17
+
18
+ """
19
+ Package for distributed linear algebra.
20
+ """
21
+
22
+ import sys
23
+ from typing import Any, Generic, Optional, Tuple, TypeVar, Union, TYPE_CHECKING
24
+
25
+ from py4j.java_gateway import JavaObject
26
+
27
+ from pyspark import RDD, since
28
+ from pyspark.mllib.common import callMLlibFunc, JavaModelWrapper
29
+ from pyspark.mllib.linalg import _convert_to_vector, DenseMatrix, Matrix, QRDecomposition, Vector
30
+ from pyspark.mllib.stat import MultivariateStatisticalSummary
31
+ from pyspark.sql import DataFrame
32
+ from pyspark.storagelevel import StorageLevel
33
+
34
if TYPE_CHECKING:
    # Needed by static type checkers only; this import is skipped at runtime.
    from pyspark.ml._typing import VectorLike

# Type variables bounded to the distributed and local matrix base types.
UT = TypeVar("UT", bound="DistributedMatrix")
VT = TypeVar("VT", bound="Matrix")

# Names exported by ``from <this module> import *``.
__all__ = [
    "BlockMatrix",
    "CoordinateMatrix",
    "DistributedMatrix",
    "IndexedRow",
    "IndexedRowMatrix",
    "MatrixEntry",
    "RowMatrix",
    "SingularValueDecomposition",
]
50
+
51
+
52
class DistributedMatrix:
    """
    Base class for a matrix that is stored in a distributed fashion,
    backed by one or more RDDs.

    The dimension accessors defined here only raise
    :py:exc:`NotImplementedError`; concrete matrix classes override them.
    """

    def numRows(self) -> int:
        """Get or compute the number of rows."""
        raise NotImplementedError()

    def numCols(self) -> int:
        """Get or compute the number of cols."""
        raise NotImplementedError()
66
+
67
+
68
+ class RowMatrix(DistributedMatrix):
69
+ """
70
+ Represents a row-oriented distributed Matrix with no meaningful
71
+ row indices.
72
+
73
+
74
+ Parameters
75
+ ----------
76
+ rows : :py:class:`pyspark.RDD` or :py:class:`pyspark.sql.DataFrame`
77
+ An RDD or DataFrame of vectors. If a DataFrame is provided, it must have a single
78
+ vector typed column.
79
+ numRows : int, optional
80
+ Number of rows in the matrix. A non-positive
81
+ value means unknown, at which point the number
82
+ of rows will be determined by the number of
83
+ records in the `rows` RDD.
84
+ numCols : int, optional
85
+ Number of columns in the matrix. A non-positive
86
+ value means unknown, at which point the number
87
+ of columns will be determined by the size of
88
+ the first row.
89
+ """
90
+
91
def __init__(
    self,
    rows: Union[RDD[Vector], DataFrame],
    numRows: int = 0,
    numCols: int = 0,
):
    """
    Note: This docstring is not shown publicly.

    Create a wrapper over a Java RowMatrix.

    Publicly, we require that `rows` be an RDD or DataFrame. However, for
    internal usage, `rows` can also be a Java RowMatrix
    object, in which case we can wrap it directly. This
    assists in clean matrix conversions.

    Raises
    ------
    TypeError
        If `rows` is neither an RDD, a DataFrame, nor a Java RowMatrix
        object.

    Examples
    --------
    >>> rows = sc.parallelize([[1, 2, 3], [4, 5, 6]])
    >>> mat = RowMatrix(rows)

    >>> mat_diff = RowMatrix(rows)
    >>> (mat_diff._java_matrix_wrapper._java_model ==
    ...  mat._java_matrix_wrapper._java_model)
    False

    >>> mat_same = RowMatrix(mat._java_matrix_wrapper._java_model)
    >>> (mat_same._java_matrix_wrapper._java_model ==
    ...  mat._java_matrix_wrapper._java_model)
    True
    """
    if isinstance(rows, (RDD, DataFrame)):
        if isinstance(rows, RDD):
            # Only RDD elements are passed through _convert_to_vector;
            # a DataFrame is handed to the JVM side unchanged.
            rows = rows.map(_convert_to_vector)
        java_matrix = callMLlibFunc("createRowMatrix", rows, int(numRows), int(numCols))
    elif isinstance(rows, JavaObject) and rows.getClass().getSimpleName() == "RowMatrix":
        # Internal path: an existing Java RowMatrix is wrapped as-is.
        java_matrix = rows
    else:
        # The message names both public input kinds, since a DataFrame is
        # accepted as well as an RDD.
        raise TypeError(
            "rows should be an RDD of vectors or a DataFrame, got %s" % type(rows)
        )

    self._java_matrix_wrapper = JavaModelWrapper(java_matrix)
133
+
134
@property
def rows(self) -> RDD[Vector]:
    """
    The rows of this RowMatrix, as an RDD of vectors.

    Examples
    --------
    >>> mat = RowMatrix(sc.parallelize([[1, 2, 3], [4, 5, 6]]))
    >>> rows = mat.rows
    >>> rows.first()
    DenseVector([1.0, 2.0, 3.0])
    """
    wrapper = self._java_matrix_wrapper
    return wrapper.call("rows")
147
+
148
def numRows(self) -> int:
    """
    Return the number of rows, as reported by the wrapped Java matrix.

    Examples
    --------
    >>> rows = sc.parallelize([[1, 2, 3], [4, 5, 6],
    ...                        [7, 8, 9], [10, 11, 12]])

    >>> mat = RowMatrix(rows)
    >>> print(mat.numRows())
    4

    >>> mat = RowMatrix(rows, 7, 6)
    >>> print(mat.numRows())
    7
    """
    wrapper = self._java_matrix_wrapper
    return wrapper.call("numRows")
166
+
167
def numCols(self) -> int:
    """
    Return the number of columns, as reported by the wrapped Java matrix.

    Examples
    --------
    >>> rows = sc.parallelize([[1, 2, 3], [4, 5, 6],
    ...                        [7, 8, 9], [10, 11, 12]])

    >>> mat = RowMatrix(rows)
    >>> print(mat.numCols())
    3

    >>> mat = RowMatrix(rows, 7, 6)
    >>> print(mat.numCols())
    6
    """
    wrapper = self._java_matrix_wrapper
    return wrapper.call("numCols")
185
+
186
def computeColumnSummaryStatistics(self) -> MultivariateStatisticalSummary:
    """
    Compute summary statistics for each column of the matrix.

    .. versionadded:: 2.0.0

    Returns
    -------
    :py:class:`MultivariateStatisticalSummary`
        object containing column-wise summary statistics.

    Examples
    --------
    >>> rows = sc.parallelize([[1, 2, 3], [4, 5, 6]])
    >>> mat = RowMatrix(rows)

    >>> colStats = mat.computeColumnSummaryStatistics()
    >>> colStats.mean()
    array([ 2.5, 3.5, 4.5])
    """
    # The Java-side result is wrapped in the Python summary class.
    return MultivariateStatisticalSummary(
        self._java_matrix_wrapper.call("computeColumnSummaryStatistics")
    )
208
+
209
def computeCovariance(self) -> Matrix:
    """
    Compute the covariance matrix, with every row treated as one
    observation.

    .. versionadded:: 2.0.0

    Notes
    -----
    This cannot be computed on matrices with more than 65535 columns.

    Examples
    --------
    >>> rows = sc.parallelize([[1, 2], [2, 1]])
    >>> mat = RowMatrix(rows)

    >>> mat.computeCovariance()
    DenseMatrix(2, 2, [0.5, -0.5, -0.5, 0.5], 0)
    """
    wrapper = self._java_matrix_wrapper
    return wrapper.call("computeCovariance")
229
+
230
def computeGramianMatrix(self) -> Matrix:
    """
    Compute the Gramian matrix `A^T A` of this matrix `A`.

    .. versionadded:: 2.0.0

    Notes
    -----
    This cannot be computed on matrices with more than 65535 columns.

    Examples
    --------
    >>> rows = sc.parallelize([[1, 2, 3], [4, 5, 6]])
    >>> mat = RowMatrix(rows)

    >>> mat.computeGramianMatrix()
    DenseMatrix(3, 3, [17.0, 22.0, 27.0, 22.0, 29.0, 36.0, 27.0, 36.0, 45.0], 0)
    """
    wrapper = self._java_matrix_wrapper
    return wrapper.call("computeGramianMatrix")
249
+
250
@since("2.0.0")
def columnSimilarities(self, threshold: float = 0.0) -> "CoordinateMatrix":
    """
    Compute similarities between columns of this matrix.

    The threshold parameter is a trade-off knob between estimate
    quality and computational cost.

    The default threshold setting of 0 guarantees deterministically
    correct results, but uses the brute-force approach of computing
    normalized dot products.

    Setting the threshold to positive values uses a sampling
    approach and incurs strictly less computational cost than the
    brute-force approach. However, the similarities computed will
    be estimates.

    The sampling guarantees relative-error correctness for those
    pairs of columns that have similarity greater than the given
    similarity threshold.

    To describe the guarantee, we set some notation:

    - Let A be the smallest in magnitude non-zero element of
      this matrix.
    - Let B be the largest in magnitude non-zero element of
      this matrix.
    - Let L be the maximum number of non-zeros per row.

    For example, for {0,1} matrices: A=B=1.
    Another example, for the Netflix matrix: A=1, B=5

    For those column pairs that are above the threshold, the
    computed similarity is correct to within 20% relative error
    with probability at least 1 - (0.981)^(10/B)

    The shuffle size is bounded by the *smaller* of the following
    two expressions:

    - O(n log(n) L / (threshold * A))
    - O(m L^2)

    The latter is the cost of the brute-force approach, so for
    non-zero thresholds, the cost is always cheaper than the
    brute-force approach.

    .. versionadded:: 2.0.0

    Parameters
    ----------
    threshold : float, optional
        Set to 0 for deterministic guaranteed
        correctness. Similarities above this
        threshold are estimated with the cost vs
        estimate quality trade-off described above.

    Returns
    -------
    :py:class:`CoordinateMatrix`
        An n x n sparse upper-triangular CoordinateMatrix of
        cosine similarities between columns of this matrix.

    Examples
    --------
    >>> rows = sc.parallelize([[1, 2], [1, 5]])
    >>> mat = RowMatrix(rows)

    >>> sims = mat.columnSimilarities()
    >>> sims.entries.first().value
    0.91914503...
    """
    # The threshold is coerced to float before being passed to the Java call.
    cutoff = float(threshold)
    return CoordinateMatrix(self._java_matrix_wrapper.call("columnSimilarities", cutoff))
323
+
324
def tallSkinnyQR(
    self, computeQ: bool = False
) -> QRDecomposition[Optional["RowMatrix"], Matrix]:
    """
    Compute the QR decomposition of this RowMatrix.

    The implementation is designed to optimize the QR decomposition
    (factorization) for the RowMatrix of a tall and skinny shape [1]_.

    .. [1] Paul G. Constantine, David F. Gleich. "Tall and skinny QR
        factorizations in MapReduce architectures"
        https://doi.org/10.1145/1996092.1996103

    .. versionadded:: 2.0.0

    Parameters
    ----------
    computeQ : bool, optional
        Whether to compute Q.

    Returns
    -------
    :py:class:`pyspark.mllib.linalg.QRDecomposition`
        QRDecomposition(Q: RowMatrix, R: Matrix), where
        Q = None if computeQ = false.

    Examples
    --------
    >>> rows = sc.parallelize([[3, -6], [4, -8], [0, 1]])
    >>> mat = RowMatrix(rows)
    >>> decomp = mat.tallSkinnyQR(True)
    >>> Q = decomp.Q
    >>> R = decomp.R

    >>> # Test with absolute values
    >>> absQRows = Q.rows.map(lambda row: abs(row.toArray()).tolist())
    >>> absQRows.collect()
    [[0.6..., 0.0], [0.8..., 0.0], [0.0, 1.0]]

    >>> # Test with absolute values
    >>> abs(R.toArray()).tolist()
    [[5.0, 10.0], [0.0, 1.0]]
    """
    qr_wrapper = JavaModelWrapper(self._java_matrix_wrapper.call("tallSkinnyQR", computeQ))
    # Q is fetched from the Java result only when it was requested.
    Q = RowMatrix(qr_wrapper.call("Q")) if computeQ else None
    R = qr_wrapper.call("R")
    return QRDecomposition(Q, R)
375
+
376
def computeSVD(
    self, k: int, computeU: bool = False, rCond: float = 1e-9
) -> "SingularValueDecomposition[RowMatrix, Matrix]":
    """
    Computes the singular value decomposition of the RowMatrix.

    The given row matrix A of dimension (m X n) is decomposed into
    U * s * V^T where

    - U: (m X k) (left singular vectors) is a RowMatrix whose
      columns are the eigenvectors of (A X A')
    - s: DenseVector consisting of square root of the eigenvalues
      (singular values) in descending order.
    - V: (n X k) (right singular vectors) is a Matrix whose columns
      are the eigenvectors of (A' X A)

    For more specific details on implementation, please refer
    to the Scala documentation.

    .. versionadded:: 2.2.0

    Parameters
    ----------
    k : int
        Number of leading singular values to keep (`0 < k <= n`).
        It might return less than k if there are numerically zero singular values
        or there are not enough Ritz values converged before the maximum number of
        Arnoldi update iterations is reached (in case that matrix A is ill-conditioned).
    computeU : bool, optional
        Whether or not to compute U. If set to be
        True, then U is computed by A * V * s^-1
    rCond : float, optional
        Reciprocal condition number. All singular values
        smaller than rCond * s[0] are treated as zero
        where s[0] is the largest singular value.

    Returns
    -------
    :py:class:`SingularValueDecomposition`

    Examples
    --------
    >>> rows = sc.parallelize([[3, 1, 1], [-1, 3, 1]])
    >>> rm = RowMatrix(rows)

    >>> svd_model = rm.computeSVD(2, True)
    >>> svd_model.U.rows.collect()
    [DenseVector([-0.7071, 0.7071]), DenseVector([-0.7071, -0.7071])]
    >>> svd_model.s
    DenseVector([3.4641, 3.1623])
    >>> svd_model.V
    DenseMatrix(3, 2, [-0.4082, -0.8165, -0.4082, 0.8944, -0.4472, ...0.0], 0)
    """
    # Arguments are coerced to plain int/bool/float before the Java call.
    java_args = (int(k), bool(computeU), float(rCond))
    return SingularValueDecomposition(
        self._java_matrix_wrapper.call("computeSVD", *java_args)
    )
431
+
432
def computePrincipalComponents(self, k: int) -> Matrix:
    """
    Computes the k principal components of the given row matrix

    .. versionadded:: 2.2.0

    Notes
    -----
    This cannot be computed on matrices with more than 65535 columns.

    Parameters
    ----------
    k : int
        Number of principal components to keep. The value is coerced
        with ``int(k)`` before it is passed to the Java side, the same
        way ``computeSVD`` treats its ``k`` argument.

    Returns
    -------
    :py:class:`pyspark.mllib.linalg.DenseMatrix`

    Examples
    --------
    >>> rows = sc.parallelize([[1, 2, 3], [2, 4, 5], [3, 6, 1]])
    >>> rm = RowMatrix(rows)

    >>> # Returns the two principal components of rm
    >>> pca = rm.computePrincipalComponents(2)
    >>> pca
    DenseMatrix(3, 2, [-0.349, -0.6981, 0.6252, -0.2796, -0.5592, -0.7805], 0)

    >>> # Transform into new dimensions with the greatest variance.
    >>> rm.multiply(pca).rows.collect() # doctest: +NORMALIZE_WHITESPACE
    [DenseVector([0.1305, -3.7394]), DenseVector([-0.3642, -6.6983]), \
    DenseVector([-4.6102, -4.9745])]
    """
    # Hand the JVM a plain Python int; integer-like inputs such as
    # floats or numpy integers are normalised here rather than being
    # forwarded as-is.
    return self._java_matrix_wrapper.call("computePrincipalComponents", int(k))
467
+
468
def multiply(self, matrix: Matrix) -> "RowMatrix":
    """
    Right-multiply this matrix by a local dense matrix.

    .. versionadded:: 2.2.0

    Parameters
    ----------
    matrix : :py:class:`pyspark.mllib.linalg.Matrix`
        a local dense matrix whose number of rows must match the number of columns
        of this matrix

    Returns
    -------
    :py:class:`RowMatrix`

    Raises
    ------
    TypeError
        If `matrix` is not a DenseMatrix.

    Examples
    --------
    >>> rm = RowMatrix(sc.parallelize([[0, 1], [2, 3]]))
    >>> rm.multiply(DenseMatrix(2, 2, [0, 2, 1, 3])).rows.collect()
    [DenseVector([2.0, 3.0]), DenseVector([6.0, 11.0])]
    """
    if isinstance(matrix, DenseMatrix):
        # Wrap the Java result back into a Python RowMatrix.
        return RowMatrix(self._java_matrix_wrapper.call("multiply", matrix))
    raise TypeError("Only multiplication with DenseMatrix is supported.")
494
+
495
+
496
class SingularValueDecomposition(JavaModelWrapper, Generic[UT, VT]):
    """
    Holder for the factors of a singular value decomposition (SVD).

    .. versionadded:: 2.2.0
    """

    @property
    @since("2.2.0")
    def U(self) -> Optional[UT]:
        """
        Returns a distributed matrix whose columns are the left
        singular vectors of the SingularValueDecomposition if computeU was set to be True.
        """
        java_u = self.call("U")
        if java_u is None:
            # The Java side holds no U factor.
            return None

        # Pick the Python wrapper that matches the Java class of U.
        java_class = java_u.getClass().getSimpleName()
        if java_class == "RowMatrix":
            return RowMatrix(java_u)  # type: ignore[return-value]
        if java_class == "IndexedRowMatrix":
            return IndexedRowMatrix(java_u)  # type: ignore[return-value]
        raise TypeError("Expected RowMatrix/IndexedRowMatrix got %s" % java_class)

    @property
    @since("2.2.0")
    def s(self) -> Vector:
        """
        Returns a DenseVector with singular values in descending order.
        """
        singular_values = self.call("s")
        return singular_values

    @property
    @since("2.2.0")
    def V(self) -> VT:
        """
        Returns a DenseMatrix whose columns are the right singular
        vectors of the SingularValueDecomposition.
        """
        right_vectors = self.call("V")
        return right_vectors
536
+
537
+
538
class IndexedRow:
    """
    A single row of an IndexedRowMatrix.

    This is a thin wrapper around an (int, vector) pair.

    Parameters
    ----------
    index : int
        The index for the given row.
    vector : :py:class:`pyspark.mllib.linalg.Vector` or convertible
        The row in the matrix at the given index.
    """

    def __init__(self, index: int, vector: "VectorLike") -> None:
        # Store the index as an int and pass the row through
        # _convert_to_vector.
        self.index = int(index)
        self.vector = _convert_to_vector(vector)

    def __repr__(self) -> str:
        fields = (self.index, self.vector)
        return "IndexedRow(%s, %s)" % fields
558
+
559
+
560
def _convert_to_indexed_row(row: Any) -> IndexedRow:
    """
    Coerce `row` into an IndexedRow.

    An existing IndexedRow is returned unchanged and a 2-tuple is
    unpacked into the IndexedRow constructor; anything else raises
    TypeError.
    """
    if isinstance(row, IndexedRow):
        return row
    if isinstance(row, tuple) and len(row) == 2:
        index, vector = row
        return IndexedRow(index, vector)
    raise TypeError("Cannot convert type %s into IndexedRow" % type(row))
567
+
568
+
569
class IndexedRowMatrix(DistributedMatrix):
    """
    Represents a row-oriented distributed Matrix with indexed rows.

    Parameters
    ----------
    rows : :py:class:`pyspark.RDD`
        An RDD of IndexedRows or (int, vector) tuples or a DataFrame consisting of a
        int typed column of indices and a vector typed column.
    numRows : int, optional
        Number of rows in the matrix. A non-positive
        value means unknown, at which point the number
        of rows will be determined by the max row
        index plus one.
    numCols : int, optional
        Number of columns in the matrix. A non-positive
        value means unknown, at which point the number
        of columns will be determined by the size of
        the first row.
    """

    def __init__(
        self,
        rows: RDD[Union[Tuple[int, "VectorLike"], IndexedRow]],
        numRows: int = 0,
        numCols: int = 0,
    ):
        """
        Note: This docstring is not shown publicly.

        Create a wrapper over a Java IndexedRowMatrix.

        Publicly, we require that `rows` be an RDD or DataFrame. However, for
        internal usage, `rows` can also be a Java IndexedRowMatrix
        object, in which case we can wrap it directly. This
        assists in clean matrix conversions.

        Examples
        --------
        >>> rows = sc.parallelize([IndexedRow(0, [1, 2, 3]),
        ...                        IndexedRow(1, [4, 5, 6])])
        >>> mat = IndexedRowMatrix(rows)

        >>> mat_diff = IndexedRowMatrix(rows)
        >>> (mat_diff._java_matrix_wrapper._java_model ==
        ...  mat._java_matrix_wrapper._java_model)
        False

        >>> mat_same = IndexedRowMatrix(mat._java_matrix_wrapper._java_model)
        >>> (mat_same._java_matrix_wrapper._java_model ==
        ...  mat._java_matrix_wrapper._java_model)
        True
        """
        if isinstance(rows, RDD):
            # We use DataFrames for serialization of IndexedRows from
            # Python, so first convert the RDD to a DataFrame on this
            # side. This will convert each IndexedRow to a Row
            # containing the 'index' and 'vector' values, which can
            # both be easily serialized. We will convert back to
            # IndexedRows on the Scala side.
            rows_df = rows.map(_convert_to_indexed_row).toDF()
            java_matrix = callMLlibFunc(
                "createIndexedRowMatrix", rows_df, int(numRows), int(numCols)
            )
        elif isinstance(rows, DataFrame):
            java_matrix = callMLlibFunc("createIndexedRowMatrix", rows, int(numRows), int(numCols))
        elif isinstance(rows, JavaObject) and rows.getClass().getSimpleName() == "IndexedRowMatrix":
            # Internal path: wrap an existing Java IndexedRowMatrix directly.
            java_matrix = rows
        else:
            raise TypeError(
                "rows should be an RDD of IndexedRows or (int, vector) tuples, "
                "got %s" % type(rows)
            )

        self._java_matrix_wrapper = JavaModelWrapper(java_matrix)

    @property
    def rows(self) -> RDD[IndexedRow]:
        """
        Rows of the IndexedRowMatrix stored as an RDD of IndexedRows.

        Examples
        --------
        >>> mat = IndexedRowMatrix(sc.parallelize([IndexedRow(0, [1, 2, 3]),
        ...                                        IndexedRow(1, [4, 5, 6])]))
        >>> rows = mat.rows
        >>> rows.first()
        IndexedRow(0, [1.0,2.0,3.0])
        """
        # We use DataFrames for serialization of IndexedRows from
        # Java, so we first convert the RDD of rows to a DataFrame
        # on the Scala/Java side. Then we map each Row in the
        # DataFrame back to an IndexedRow on this side.
        rows_df = callMLlibFunc("getIndexedRows", self._java_matrix_wrapper._java_model)
        return rows_df.rdd.map(lambda row: IndexedRow(row[0], row[1]))

    def numRows(self) -> int:
        """
        Get or compute the number of rows.

        Examples
        --------
        >>> rows = sc.parallelize([IndexedRow(0, [1, 2, 3]),
        ...                        IndexedRow(1, [4, 5, 6]),
        ...                        IndexedRow(2, [7, 8, 9]),
        ...                        IndexedRow(3, [10, 11, 12])])

        >>> mat = IndexedRowMatrix(rows)
        >>> print(mat.numRows())
        4

        >>> mat = IndexedRowMatrix(rows, 7, 6)
        >>> print(mat.numRows())
        7
        """
        return self._java_matrix_wrapper.call("numRows")

    def numCols(self) -> int:
        """
        Get or compute the number of cols.

        Examples
        --------
        >>> rows = sc.parallelize([IndexedRow(0, [1, 2, 3]),
        ...                        IndexedRow(1, [4, 5, 6]),
        ...                        IndexedRow(2, [7, 8, 9]),
        ...                        IndexedRow(3, [10, 11, 12])])

        >>> mat = IndexedRowMatrix(rows)
        >>> print(mat.numCols())
        3

        >>> mat = IndexedRowMatrix(rows, 7, 6)
        >>> print(mat.numCols())
        6
        """
        return self._java_matrix_wrapper.call("numCols")

    def columnSimilarities(self) -> "CoordinateMatrix":
        """
        Compute all cosine similarities between columns.

        Examples
        --------
        >>> rows = sc.parallelize([IndexedRow(0, [1, 2, 3]),
        ...                        IndexedRow(6, [4, 5, 6])])
        >>> mat = IndexedRowMatrix(rows)
        >>> cs = mat.columnSimilarities()
        >>> print(cs.numCols())
        3
        """
        java_coordinate_matrix = self._java_matrix_wrapper.call("columnSimilarities")
        return CoordinateMatrix(java_coordinate_matrix)

    def computeGramianMatrix(self) -> Matrix:
        """
        Computes the Gramian matrix `A^T A`.

        .. versionadded:: 2.0.0

        Notes
        -----
        This cannot be computed on matrices with more than 65535 columns.

        Examples
        --------
        >>> rows = sc.parallelize([IndexedRow(0, [1, 2, 3]),
        ...                        IndexedRow(1, [4, 5, 6])])
        >>> mat = IndexedRowMatrix(rows)

        >>> mat.computeGramianMatrix()
        DenseMatrix(3, 3, [17.0, 22.0, 27.0, 22.0, 29.0, 36.0, 27.0, 36.0, 45.0], 0)
        """
        return self._java_matrix_wrapper.call("computeGramianMatrix")

    def toRowMatrix(self) -> RowMatrix:
        """
        Convert this matrix to a RowMatrix.

        Examples
        --------
        >>> rows = sc.parallelize([IndexedRow(0, [1, 2, 3]),
        ...                        IndexedRow(6, [4, 5, 6])])
        >>> mat = IndexedRowMatrix(rows).toRowMatrix()
        >>> mat.rows.collect()
        [DenseVector([1.0, 2.0, 3.0]), DenseVector([4.0, 5.0, 6.0])]
        """
        java_row_matrix = self._java_matrix_wrapper.call("toRowMatrix")
        return RowMatrix(java_row_matrix)

    def toCoordinateMatrix(self) -> "CoordinateMatrix":
        """
        Convert this matrix to a CoordinateMatrix.

        Examples
        --------
        >>> rows = sc.parallelize([IndexedRow(0, [1, 0]),
        ...                        IndexedRow(6, [0, 5])])
        >>> mat = IndexedRowMatrix(rows).toCoordinateMatrix()
        >>> mat.entries.take(3)
        [MatrixEntry(0, 0, 1.0), MatrixEntry(0, 1, 0.0), MatrixEntry(6, 0, 0.0)]
        """
        java_coordinate_matrix = self._java_matrix_wrapper.call("toCoordinateMatrix")
        return CoordinateMatrix(java_coordinate_matrix)

    def toBlockMatrix(self, rowsPerBlock: int = 1024, colsPerBlock: int = 1024) -> "BlockMatrix":
        """
        Convert this matrix to a BlockMatrix.

        Parameters
        ----------
        rowsPerBlock : int, optional
            Number of rows that make up each block.
            The blocks forming the final rows are not
            required to have the given number of rows.
        colsPerBlock : int, optional
            Number of columns that make up each block.
            The blocks forming the final columns are not
            required to have the given number of columns.

        Examples
        --------
        >>> rows = sc.parallelize([IndexedRow(0, [1, 2, 3]),
        ...                        IndexedRow(6, [4, 5, 6])])
        >>> mat = IndexedRowMatrix(rows).toBlockMatrix()

        >>> # This IndexedRowMatrix will have 7 effective rows, due to
        >>> # the highest row index being 6, and the ensuing
        >>> # BlockMatrix will have 7 rows as well.
        >>> print(mat.numRows())
        7

        >>> print(mat.numCols())
        3
        """
        # Coerce the block sizes to plain ints before they reach the
        # Java side, as the constructor and computeSVD do for their
        # integer arguments.
        rows_per_block = int(rowsPerBlock)
        cols_per_block = int(colsPerBlock)
        java_block_matrix = self._java_matrix_wrapper.call(
            "toBlockMatrix", rows_per_block, cols_per_block
        )
        return BlockMatrix(java_block_matrix, rows_per_block, cols_per_block)

    def computeSVD(
        self, k: int, computeU: bool = False, rCond: float = 1e-9
    ) -> SingularValueDecomposition["IndexedRowMatrix", Matrix]:
        """
        Computes the singular value decomposition of the IndexedRowMatrix.

        The given row matrix A of dimension (m X n) is decomposed into
        U * s * V^T where

        * U: (m X k) (left singular vectors) is a IndexedRowMatrix
          whose columns are the eigenvectors of (A X A')
        * s: DenseVector consisting of square root of the eigenvalues
          (singular values) in descending order.
        * V: (n X k) (right singular vectors) is a Matrix whose columns
          are the eigenvectors of (A' X A)

        For more specific details on implementation, please refer
        to the Scala documentation.

        .. versionadded:: 2.2.0

        Parameters
        ----------
        k : int
            Number of leading singular values to keep (`0 < k <= n`).
            It might return less than k if there are numerically zero singular values
            or there are not enough Ritz values converged before the maximum number of
            Arnoldi update iterations is reached (in case that matrix A is ill-conditioned).
        computeU : bool, optional
            Whether or not to compute U. If set to be
            True, then U is computed by A * V * s^-1
        rCond : float, optional
            Reciprocal condition number. All singular values
            smaller than rCond * s[0] are treated as zero
            where s[0] is the largest singular value.

        Returns
        -------
        :py:class:`SingularValueDecomposition`

        Examples
        --------
        >>> rows = [(0, (3, 1, 1)), (1, (-1, 3, 1))]
        >>> irm = IndexedRowMatrix(sc.parallelize(rows))
        >>> svd_model = irm.computeSVD(2, True)
        >>> svd_model.U.rows.collect() # doctest: +NORMALIZE_WHITESPACE
        [IndexedRow(0, [-0.707106781187,0.707106781187]),\
        IndexedRow(1, [-0.707106781187,-0.707106781187])]
        >>> svd_model.s
        DenseVector([3.4641, 3.1623])
        >>> svd_model.V
        DenseMatrix(3, 2, [-0.4082, -0.8165, -0.4082, 0.8944, -0.4472, ...0.0], 0)
        """
        j_model = self._java_matrix_wrapper.call("computeSVD", int(k), bool(computeU), float(rCond))
        return SingularValueDecomposition(j_model)

    def multiply(self, matrix: Matrix) -> "IndexedRowMatrix":
        """
        Multiply this matrix by a local dense matrix on the right.

        .. versionadded:: 2.2.0

        Parameters
        ----------
        matrix : :py:class:`pyspark.mllib.linalg.Matrix`
            a local dense matrix whose number of rows must match the number of columns
            of this matrix

        Returns
        -------
        :py:class:`IndexedRowMatrix`

        Raises
        ------
        TypeError
            If `matrix` is not a DenseMatrix.

        Examples
        --------
        >>> mat = IndexedRowMatrix(sc.parallelize([(0, (0, 1)), (1, (2, 3))]))
        >>> mat.multiply(DenseMatrix(2, 2, [0, 2, 1, 3])).rows.collect()
        [IndexedRow(0, [2.0,3.0]), IndexedRow(1, [6.0,11.0])]
        """
        if not isinstance(matrix, DenseMatrix):
            raise TypeError("Only multiplication with DenseMatrix is supported.")
        return IndexedRowMatrix(self._java_matrix_wrapper.call("multiply", matrix))
891
+
892
+
893
class MatrixEntry:
    """
    A single entry of a CoordinateMatrix.

    This is a thin wrapper around an (int, int, float) triple.

    Parameters
    ----------
    i : int
        The row index of the matrix.
    j : int
        The column index of the matrix.
    value : float
        The (i, j)th entry of the matrix, as a float.
    """

    def __init__(self, i: int, j: int, value: float) -> None:
        # Coerce each field in turn: indices to int, the value to float.
        row_index = int(i)
        self.i = row_index
        col_index = int(j)
        self.j = col_index
        self.value = float(value)

    def __repr__(self) -> str:
        fields = (self.i, self.j, self.value)
        return "MatrixEntry(%s, %s, %s)" % fields
916
+
917
+
918
def _convert_to_matrix_entry(entry: Any) -> MatrixEntry:
    """
    Coerce `entry` into a MatrixEntry.

    An existing MatrixEntry is returned unchanged and a 3-tuple is
    unpacked into the MatrixEntry constructor; anything else raises
    TypeError.
    """
    if isinstance(entry, MatrixEntry):
        return entry
    if isinstance(entry, tuple) and len(entry) == 3:
        i, j, value = entry
        return MatrixEntry(i, j, value)
    raise TypeError("Cannot convert type %s into MatrixEntry" % type(entry))
925
+
926
+
927
class CoordinateMatrix(DistributedMatrix):
    """
    Represents a matrix in coordinate format.

    Parameters
    ----------
    entries : :py:class:`pyspark.RDD`
        An RDD of MatrixEntry inputs or
        (int, int, float) tuples.
    numRows : int, optional
        Number of rows in the matrix. A non-positive
        value means unknown, at which point the number
        of rows will be determined by the max row
        index plus one.
    numCols : int, optional
        Number of columns in the matrix. A non-positive
        value means unknown, at which point the number
        of columns will be determined by the max column
        index plus one.
    """

    def __init__(
        self,
        entries: RDD[Union[Tuple[int, int, float], MatrixEntry]],
        numRows: int = 0,
        numCols: int = 0,
    ):
        """
        Note: This docstring is not shown publicly.

        Create a wrapper over a Java CoordinateMatrix.

        Publicly, we require that `entries` be an RDD. However, for
        internal usage, `entries` can also be a Java CoordinateMatrix
        object, in which case we can wrap it directly. This
        assists in clean matrix conversions.

        Examples
        --------
        >>> entries = sc.parallelize([MatrixEntry(0, 0, 1.2),
        ...                           MatrixEntry(6, 4, 2.1)])
        >>> mat = CoordinateMatrix(entries)

        >>> mat_diff = CoordinateMatrix(entries)
        >>> (mat_diff._java_matrix_wrapper._java_model ==
        ...  mat._java_matrix_wrapper._java_model)
        False

        >>> mat_same = CoordinateMatrix(mat._java_matrix_wrapper._java_model)
        >>> (mat_same._java_matrix_wrapper._java_model ==
        ...  mat._java_matrix_wrapper._java_model)
        True
        """
        if isinstance(entries, RDD):
            # We use DataFrames for serialization of MatrixEntry entries
            # from Python, so first convert the RDD to a DataFrame on
            # this side. This will convert each MatrixEntry to a Row
            # containing the 'i', 'j', and 'value' values, which can
            # each be easily serialized. We will convert back to
            # MatrixEntry inputs on the Scala side.
            entries_df = entries.map(_convert_to_matrix_entry).toDF()
            java_matrix = callMLlibFunc(
                "createCoordinateMatrix", entries_df, int(numRows), int(numCols)
            )
        elif (
            isinstance(entries, JavaObject)
            and entries.getClass().getSimpleName() == "CoordinateMatrix"
        ):
            # Internal path: wrap an existing Java CoordinateMatrix directly.
            java_matrix = entries
        else:
            raise TypeError(
                "entries should be an RDD of MatrixEntry entries or "
                "(int, int, float) tuples, got %s" % type(entries)
            )

        self._java_matrix_wrapper = JavaModelWrapper(java_matrix)

    @property
    def entries(self) -> RDD[MatrixEntry]:
        """
        Entries of the CoordinateMatrix stored as an RDD of
        MatrixEntries.

        Examples
        --------
        >>> mat = CoordinateMatrix(sc.parallelize([MatrixEntry(0, 0, 1.2),
        ...                                        MatrixEntry(6, 4, 2.1)]))
        >>> entries = mat.entries
        >>> entries.first()
        MatrixEntry(0, 0, 1.2)
        """
        # We use DataFrames for serialization of MatrixEntry entries
        # from Java, so we first convert the RDD of entries to a
        # DataFrame on the Scala/Java side. Then we map each Row in
        # the DataFrame back to a MatrixEntry on this side.
        entries_df = callMLlibFunc("getMatrixEntries", self._java_matrix_wrapper._java_model)
        return entries_df.rdd.map(lambda row: MatrixEntry(row[0], row[1], row[2]))

    def numRows(self) -> int:
        """
        Get or compute the number of rows.

        Examples
        --------
        >>> entries = sc.parallelize([MatrixEntry(0, 0, 1.2),
        ...                           MatrixEntry(1, 0, 2),
        ...                           MatrixEntry(2, 1, 3.7)])

        >>> mat = CoordinateMatrix(entries)
        >>> print(mat.numRows())
        3

        >>> mat = CoordinateMatrix(entries, 7, 6)
        >>> print(mat.numRows())
        7
        """
        return self._java_matrix_wrapper.call("numRows")

    def numCols(self) -> int:
        """
        Get or compute the number of cols.

        Examples
        --------
        >>> entries = sc.parallelize([MatrixEntry(0, 0, 1.2),
        ...                           MatrixEntry(1, 0, 2),
        ...                           MatrixEntry(2, 1, 3.7)])

        >>> mat = CoordinateMatrix(entries)
        >>> print(mat.numCols())
        2

        >>> mat = CoordinateMatrix(entries, 7, 6)
        >>> print(mat.numCols())
        6
        """
        return self._java_matrix_wrapper.call("numCols")

    def transpose(self) -> "CoordinateMatrix":
        """
        Transpose this CoordinateMatrix.

        .. versionadded:: 2.0.0

        Examples
        --------
        >>> entries = sc.parallelize([MatrixEntry(0, 0, 1.2),
        ...                           MatrixEntry(1, 0, 2),
        ...                           MatrixEntry(2, 1, 3.7)])
        >>> mat = CoordinateMatrix(entries)
        >>> mat_transposed = mat.transpose()

        >>> print(mat_transposed.numRows())
        2

        >>> print(mat_transposed.numCols())
        3
        """
        java_transposed_matrix = self._java_matrix_wrapper.call("transpose")
        return CoordinateMatrix(java_transposed_matrix)

    def toRowMatrix(self) -> RowMatrix:
        """
        Convert this matrix to a RowMatrix.

        Examples
        --------
        >>> entries = sc.parallelize([MatrixEntry(0, 0, 1.2),
        ...                           MatrixEntry(6, 4, 2.1)])
        >>> mat = CoordinateMatrix(entries).toRowMatrix()

        >>> # This CoordinateMatrix will have 7 effective rows, due to
        >>> # the highest row index being 6, but the ensuing RowMatrix
        >>> # will only have 2 rows since there are only entries on 2
        >>> # unique rows.
        >>> print(mat.numRows())
        2

        >>> # This CoordinateMatrix will have 5 columns, due to the
        >>> # highest column index being 4, and the ensuing RowMatrix
        >>> # will have 5 columns as well.
        >>> print(mat.numCols())
        5
        """
        java_row_matrix = self._java_matrix_wrapper.call("toRowMatrix")
        return RowMatrix(java_row_matrix)

    def toIndexedRowMatrix(self) -> IndexedRowMatrix:
        """
        Convert this matrix to an IndexedRowMatrix.

        Examples
        --------
        >>> entries = sc.parallelize([MatrixEntry(0, 0, 1.2),
        ...                           MatrixEntry(6, 4, 2.1)])
        >>> mat = CoordinateMatrix(entries).toIndexedRowMatrix()

        >>> # This CoordinateMatrix will have 7 effective rows, due to
        >>> # the highest row index being 6, and the ensuing
        >>> # IndexedRowMatrix will have 7 rows as well.
        >>> print(mat.numRows())
        7

        >>> # This CoordinateMatrix will have 5 columns, due to the
        >>> # highest column index being 4, and the ensuing
        >>> # IndexedRowMatrix will have 5 columns as well.
        >>> print(mat.numCols())
        5
        """
        java_indexed_row_matrix = self._java_matrix_wrapper.call("toIndexedRowMatrix")
        return IndexedRowMatrix(java_indexed_row_matrix)

    def toBlockMatrix(self, rowsPerBlock: int = 1024, colsPerBlock: int = 1024) -> "BlockMatrix":
        """
        Convert this matrix to a BlockMatrix.

        Parameters
        ----------
        rowsPerBlock : int, optional
            Number of rows that make up each block.
            The blocks forming the final rows are not
            required to have the given number of rows.
        colsPerBlock : int, optional
            Number of columns that make up each block.
            The blocks forming the final columns are not
            required to have the given number of columns.

        Examples
        --------
        >>> entries = sc.parallelize([MatrixEntry(0, 0, 1.2),
        ...                           MatrixEntry(6, 4, 2.1)])
        >>> mat = CoordinateMatrix(entries).toBlockMatrix()

        >>> # This CoordinateMatrix will have 7 effective rows, due to
        >>> # the highest row index being 6, and the ensuing
        >>> # BlockMatrix will have 7 rows as well.
        >>> print(mat.numRows())
        7

        >>> # This CoordinateMatrix will have 5 columns, due to the
        >>> # highest column index being 4, and the ensuing
        >>> # BlockMatrix will have 5 columns as well.
        >>> print(mat.numCols())
        5
        """
        # Coerce the block sizes to plain ints before they reach the
        # Java side, as the constructor does for numRows/numCols.
        rows_per_block = int(rowsPerBlock)
        cols_per_block = int(colsPerBlock)
        java_block_matrix = self._java_matrix_wrapper.call(
            "toBlockMatrix", rows_per_block, cols_per_block
        )
        return BlockMatrix(java_block_matrix, rows_per_block, cols_per_block)
1177
+
1178
+
1179
+ def _convert_to_matrix_block_tuple(block: Any) -> Tuple[Tuple[int, int], Matrix]:
1180
+ if (
1181
+ isinstance(block, tuple)
1182
+ and len(block) == 2
1183
+ and isinstance(block[0], tuple)
1184
+ and len(block[0]) == 2
1185
+ and isinstance(block[1], Matrix)
1186
+ ):
1187
+ blockRowIndex = int(block[0][0])
1188
+ blockColIndex = int(block[0][1])
1189
+ subMatrix = block[1]
1190
+ return ((blockRowIndex, blockColIndex), subMatrix)
1191
+ else:
1192
+ raise TypeError("Cannot convert type %s into a sub-matrix block tuple" % type(block))
1193
+
1194
+
1195
+ class BlockMatrix(DistributedMatrix):
1196
+ """
1197
+ Represents a distributed matrix in blocks of local matrices.
1198
+
1199
+ Parameters
1200
+ ----------
1201
+ blocks : :py:class:`pyspark.RDD`
1202
+ An RDD of sub-matrix blocks
1203
+ ((blockRowIndex, blockColIndex), sub-matrix) that
1204
+ form this distributed matrix. If multiple blocks
1205
+ with the same index exist, the results for
1206
+ operations like add and multiply will be
1207
+ unpredictable.
1208
+ rowsPerBlock : int
1209
+ Number of rows that make up each block.
1210
+ The blocks forming the final rows are not
1211
+ required to have the given number of rows.
1212
+ colsPerBlock : int
1213
+ Number of columns that make up each block.
1214
+ The blocks forming the final columns are not
1215
+ required to have the given number of columns.
1216
+ numRows : int, optional
1217
+ Number of rows of this matrix. If the supplied
1218
+ value is less than or equal to zero, the number
1219
+ of rows will be calculated when `numRows` is
1220
+ invoked.
1221
+ numCols : int, optional
1222
+ Number of columns of this matrix. If the supplied
1223
+ value is less than or equal to zero, the number
1224
+ of columns will be calculated when `numCols` is
1225
+ invoked.
1226
+ """
1227
+
1228
def __init__(
    self,
    blocks: RDD[Tuple[Tuple[int, int], Matrix]],
    rowsPerBlock: int,
    colsPerBlock: int,
    numRows: int = 0,
    numCols: int = 0,
):
    """
    Note: This docstring is not shown publicly.

    Build a Python wrapper around a Java BlockMatrix.

    The public contract is that `blocks` is an RDD. Internally,
    `blocks` may also be an existing Java BlockMatrix object, which
    is then wrapped as-is. This keeps matrix conversions clean.

    Examples
    --------
    >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])),
    ...                          ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))])
    >>> mat = BlockMatrix(blocks, 3, 2)

    >>> mat_diff = BlockMatrix(blocks, 3, 2)
    >>> (mat_diff._java_matrix_wrapper._java_model ==
    ...  mat._java_matrix_wrapper._java_model)
    False

    >>> mat_same = BlockMatrix(mat._java_matrix_wrapper._java_model, 3, 2)
    >>> (mat_same._java_matrix_wrapper._java_model ==
    ...  mat._java_matrix_wrapper._java_model)
    True
    """
    if isinstance(blocks, RDD):
        # Sub-matrix blocks are serialized through a DataFrame: the
        # RDD is converted here, turning each block tuple into a Row
        # holding 'blockRowIndex', 'blockColIndex' and 'subMatrix',
        # and the Scala side rebuilds the
        # ((blockRowIndex, blockColIndex), sub-matrix) tuples.
        blocks_df = blocks.map(_convert_to_matrix_block_tuple).toDF()
        java_matrix = callMLlibFunc(
            "createBlockMatrix",
            blocks_df,
            int(rowsPerBlock),
            int(colsPerBlock),
            int(numRows),
            int(numCols),
        )
    else:
        wraps_java_block_matrix = (
            isinstance(blocks, JavaObject)
            and blocks.getClass().getSimpleName() == "BlockMatrix"
        )
        if not wraps_java_block_matrix:
            raise TypeError(
                "blocks should be an RDD of sub-matrix blocks as "
                "((int, int), matrix) tuples, got %s" % type(blocks)
            )
        # Internal path: reuse the Java BlockMatrix that was passed in.
        java_matrix = blocks

    self._java_matrix_wrapper = JavaModelWrapper(java_matrix)
1289
+
1290
@property
def blocks(self) -> RDD[Tuple[Tuple[int, int], Matrix]]:
    """
    The sub-matrix blocks of this distributed matrix, as an RDD of
    ((blockRowIndex, blockColIndex), sub-matrix) tuples.

    Examples
    --------
    >>> mat = BlockMatrix(
    ...     sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])),
    ...                     ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))]), 3, 2)
    >>> blocks = mat.blocks
    >>> blocks.first()
    ((0, 0), DenseMatrix(3, 2, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], 0))

    """

    def _row_to_block(row):
        # row[0] holds the (blockRowIndex, blockColIndex) pair and
        # row[1] the sub-matrix.
        return ((row[0][0], row[0][1]), row[1])

    # Sub-matrix blocks are serialized through a DataFrame: the
    # Scala/Java side converts the RDD of blocks to a DataFrame, and
    # each Row is mapped back to a block tuple here.
    java_model = self._java_matrix_wrapper._java_model
    return callMLlibFunc("getMatrixBlocks", java_model).rdd.map(_row_to_block)
1314
+
1315
@property
def rowsPerBlock(self) -> int:
    """
    How many rows each block is made up of.

    Examples
    --------
    >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])),
    ...                          ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))])
    >>> mat = BlockMatrix(blocks, 3, 2)
    >>> mat.rowsPerBlock
    3
    """
    # The value is read from the wrapped Java matrix.
    java_wrapper = self._java_matrix_wrapper
    return java_wrapper.call("rowsPerBlock")
1329
+
1330
@property
def colsPerBlock(self) -> int:
    """
    How many columns each block is made up of.

    Examples
    --------
    >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])),
    ...                          ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))])
    >>> mat = BlockMatrix(blocks, 3, 2)
    >>> mat.colsPerBlock
    2
    """
    # The value is read from the wrapped Java matrix.
    java_wrapper = self._java_matrix_wrapper
    return java_wrapper.call("colsPerBlock")
1344
+
1345
@property
def numRowBlocks(self) -> int:
    """
    How many rows of blocks the BlockMatrix contains.

    Examples
    --------
    >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])),
    ...                          ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))])
    >>> mat = BlockMatrix(blocks, 3, 2)
    >>> mat.numRowBlocks
    2
    """
    # The value is read from the wrapped Java matrix on every access.
    wrapper = self._java_matrix_wrapper
    return wrapper.call("numRowBlocks")
1359
+
1360
@property
def numColBlocks(self) -> int:
    """
    How many columns of blocks the BlockMatrix contains.

    Examples
    --------
    >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])),
    ...                          ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))])
    >>> mat = BlockMatrix(blocks, 3, 2)
    >>> mat.numColBlocks
    1
    """
    # The value is read from the wrapped Java matrix on every access.
    wrapper = self._java_matrix_wrapper
    return wrapper.call("numColBlocks")
1374
+
1375
def numRows(self) -> int:
    """
    Return the total number of rows, computing it if necessary.

    Examples
    --------
    >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])),
    ...                          ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))])

    >>> mat = BlockMatrix(blocks, 3, 2)
    >>> print(mat.numRows())
    6

    >>> mat = BlockMatrix(blocks, 3, 2, 7, 6)
    >>> print(mat.numRows())
    7
    """
    # Delegates to the wrapped Java matrix.
    wrapper = self._java_matrix_wrapper
    return wrapper.call("numRows")
1393
+
1394
def numCols(self) -> int:
    """
    Return the total number of columns, computing it if necessary.

    Examples
    --------
    >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])),
    ...                          ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))])

    >>> mat = BlockMatrix(blocks, 3, 2)
    >>> print(mat.numCols())
    2

    >>> mat = BlockMatrix(blocks, 3, 2, 7, 6)
    >>> print(mat.numCols())
    6
    """
    # Delegates to the wrapped Java matrix.
    wrapper = self._java_matrix_wrapper
    return wrapper.call("numCols")
1412
+
1413
@since("2.0.0")
def cache(self) -> "BlockMatrix":
    """
    Cache the RDD that backs this matrix and return this same
    instance, so calls can be chained.
    """
    wrapper = self._java_matrix_wrapper
    wrapper.call("cache")
    return self
1420
+
1421
@since("2.0.0")
def persist(self, storageLevel: StorageLevel) -> "BlockMatrix":
    """
    Persist the RDD that backs this matrix at the given storage
    level and return this same instance.

    Raises a TypeError when `storageLevel` is not a StorageLevel.
    """
    if isinstance(storageLevel, StorageLevel):
        wrapper = self._java_matrix_wrapper
        # The Java side needs the JVM counterpart of the Python level.
        java_level = wrapper._sc._getJavaStorageLevel(storageLevel)
        wrapper.call("persist", java_level)
        return self
    raise TypeError("`storageLevel` should be a StorageLevel, got %s" % type(storageLevel))
1431
+
1432
@since("2.0.0")
def validate(self) -> None:
    """
    Check the block matrix info against the matrix data (`blocks`);
    an exception is thrown if any error is found.
    """
    wrapper = self._java_matrix_wrapper
    wrapper.call("validate")
1439
+
1440
def add(self, other: "BlockMatrix") -> "BlockMatrix":
    """
    Return the sum of this block matrix and `other`.

    Both matrices must have the same size and matching
    `rowsPerBlock` and `colsPerBlock` values. Whenever one of the two
    sub matrix blocks being added is a SparseMatrix, the resulting
    block is a SparseMatrix as well, even when the other operand is a
    DenseMatrix. Adding two dense sub matrix blocks yields a
    DenseMatrix block.

    Raises a TypeError when `other` is not a BlockMatrix.

    Examples
    --------
    >>> dm1 = Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])
    >>> dm2 = Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12])
    >>> sm = Matrices.sparse(3, 2, [0, 1, 3], [0, 1, 2], [7, 11, 12])
    >>> blocks1 = sc.parallelize([((0, 0), dm1), ((1, 0), dm2)])
    >>> blocks2 = sc.parallelize([((0, 0), dm1), ((1, 0), dm2)])
    >>> blocks3 = sc.parallelize([((0, 0), sm), ((1, 0), dm2)])
    >>> mat1 = BlockMatrix(blocks1, 3, 2)
    >>> mat2 = BlockMatrix(blocks2, 3, 2)
    >>> mat3 = BlockMatrix(blocks3, 3, 2)

    >>> mat1.add(mat2).toLocalMatrix()
    DenseMatrix(6, 2, [2.0, 4.0, 6.0, 14.0, 16.0, 18.0, 8.0, 10.0, 12.0, 20.0, 22.0, 24.0], 0)

    >>> mat1.add(mat3).toLocalMatrix()
    DenseMatrix(6, 2, [8.0, 2.0, 3.0, 14.0, 16.0, 18.0, 4.0, 16.0, 18.0, 20.0, 22.0, 24.0], 0)
    """
    if isinstance(other, BlockMatrix):
        # The Java `add` operates on the underlying Java matrices.
        java_sum = self._java_matrix_wrapper.call(
            "add", other._java_matrix_wrapper._java_model
        )
        return BlockMatrix(java_sum, self.rowsPerBlock, self.colsPerBlock)
    raise TypeError("Other should be a BlockMatrix, got %s" % type(other))
1474
+
1475
def subtract(self, other: "BlockMatrix") -> "BlockMatrix":
    """
    Return the difference `this - other` of two block matrices.

    Both matrices must have the same size and matching
    `rowsPerBlock` and `colsPerBlock` values. Whenever one of the two
    sub matrix blocks involved is a SparseMatrix, the resulting block
    is a SparseMatrix as well, even when it is subtracted from a
    DenseMatrix. Subtracting two dense sub matrix blocks yields a
    DenseMatrix block.

    Raises a TypeError when `other` is not a BlockMatrix.

    .. versionadded:: 2.0.0

    Examples
    --------
    >>> dm1 = Matrices.dense(3, 2, [3, 1, 5, 4, 6, 2])
    >>> dm2 = Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12])
    >>> sm = Matrices.sparse(3, 2, [0, 1, 3], [0, 1, 2], [1, 2, 3])
    >>> blocks1 = sc.parallelize([((0, 0), dm1), ((1, 0), dm2)])
    >>> blocks2 = sc.parallelize([((0, 0), dm2), ((1, 0), dm1)])
    >>> blocks3 = sc.parallelize([((0, 0), sm), ((1, 0), dm2)])
    >>> mat1 = BlockMatrix(blocks1, 3, 2)
    >>> mat2 = BlockMatrix(blocks2, 3, 2)
    >>> mat3 = BlockMatrix(blocks3, 3, 2)

    >>> mat1.subtract(mat2).toLocalMatrix()
    DenseMatrix(6, 2, [-4.0, -7.0, -4.0, 4.0, 7.0, 4.0, -6.0, -5.0, -10.0, 6.0, 5.0, 10.0], 0)

    >>> mat2.subtract(mat3).toLocalMatrix()
    DenseMatrix(6, 2, [6.0, 8.0, 9.0, -4.0, -7.0, -4.0, 10.0, 9.0, 9.0, -6.0, -5.0, -10.0], 0)
    """
    if isinstance(other, BlockMatrix):
        # The Java `subtract` operates on the underlying Java matrices.
        java_difference = self._java_matrix_wrapper.call(
            "subtract", other._java_matrix_wrapper._java_model
        )
        return BlockMatrix(java_difference, self.rowsPerBlock, self.colsPerBlock)
    raise TypeError("Other should be a BlockMatrix, got %s" % type(other))
1512
+
1513
def multiply(self, other: "BlockMatrix") -> "BlockMatrix":
    """
    Multiply this BlockMatrix by `other`, another BlockMatrix, with
    this matrix as the left operand (`this * other`).

    The `colsPerBlock` of this matrix must equal the `rowsPerBlock`
    of `other`. If `other` contains any SparseMatrix blocks, they
    will have to be converted to DenseMatrix blocks. The output
    BlockMatrix will only consist of DenseMatrix blocks. This may
    cause some performance issues until support for multiplying two
    sparse matrices is added.

    Parameters
    ----------
    other : BlockMatrix
        The right-hand operand of the product.

    Returns
    -------
    BlockMatrix
        The product, described with this matrix's `rowsPerBlock` and
        `other`'s `colsPerBlock`.

    Raises
    ------
    TypeError
        If `other` is not a BlockMatrix.

    Examples
    --------
    >>> dm1 = Matrices.dense(2, 3, [1, 2, 3, 4, 5, 6])
    >>> dm2 = Matrices.dense(2, 3, [7, 8, 9, 10, 11, 12])
    >>> dm3 = Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])
    >>> dm4 = Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12])
    >>> sm = Matrices.sparse(3, 2, [0, 1, 3], [0, 1, 2], [7, 11, 12])
    >>> blocks1 = sc.parallelize([((0, 0), dm1), ((0, 1), dm2)])
    >>> blocks2 = sc.parallelize([((0, 0), dm3), ((1, 0), dm4)])
    >>> blocks3 = sc.parallelize([((0, 0), sm), ((1, 0), dm4)])
    >>> mat1 = BlockMatrix(blocks1, 2, 3)
    >>> mat2 = BlockMatrix(blocks2, 3, 2)
    >>> mat3 = BlockMatrix(blocks3, 3, 2)

    >>> mat1.multiply(mat2).toLocalMatrix()
    DenseMatrix(2, 2, [242.0, 272.0, 350.0, 398.0], 0)

    >>> mat1.multiply(mat3).toLocalMatrix()
    DenseMatrix(2, 2, [227.0, 258.0, 394.0, 450.0], 0)
    """
    if not isinstance(other, BlockMatrix):
        raise TypeError("Other should be a BlockMatrix, got %s" % type(other))

    other_java_block_matrix = other._java_matrix_wrapper._java_model
    java_block_matrix = self._java_matrix_wrapper.call("multiply", other_java_block_matrix)
    # The product takes its row block size from the left operand and
    # its column block size from the right operand, so describe the
    # result with `other.colsPerBlock` rather than `self.colsPerBlock`.
    return BlockMatrix(java_block_matrix, self.rowsPerBlock, other.colsPerBlock)
1549
+
1550
def transpose(self) -> "BlockMatrix":
    """
    Return the transpose of this BlockMatrix as a new BlockMatrix
    instance that shares the same underlying data. The operation is
    lazy.

    .. versionadded:: 2.0.0

    Examples
    --------
    >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])),
    ...                          ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))])
    >>> mat = BlockMatrix(blocks, 3, 2)

    >>> mat_transposed = mat.transpose()
    >>> mat_transposed.toLocalMatrix()
    DenseMatrix(2, 6, [1.0, 4.0, 2.0, 5.0, 3.0, 6.0, 7.0, 10.0, 8.0, 11.0, 9.0, 12.0], 0)
    """
    flipped = self._java_matrix_wrapper.call("transpose")
    # Row and column block sizes trade places in the transposed matrix.
    new_rows_per_block, new_cols_per_block = self.colsPerBlock, self.rowsPerBlock
    return BlockMatrix(flipped, new_rows_per_block, new_cols_per_block)
1569
+
1570
def toLocalMatrix(self) -> Matrix:
    """
    Gather the whole distributed matrix on the driver and return it
    as a DenseMatrix.

    Examples
    --------
    >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])),
    ...                          ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))])
    >>> mat = BlockMatrix(blocks, 3, 2).toLocalMatrix()

    >>> # This BlockMatrix will have 6 effective rows, due to
    >>> # having two sub-matrix blocks stacked, each with 3 rows.
    >>> # The ensuing DenseMatrix will also have 6 rows.
    >>> print(mat.numRows)
    6

    >>> # This BlockMatrix will have 2 effective columns, due to
    >>> # having two sub-matrix blocks stacked, each with 2
    >>> # columns. The ensuing DenseMatrix will also have 2 columns.
    >>> print(mat.numCols)
    2
    """
    # Delegates to the wrapped Java matrix.
    wrapper = self._java_matrix_wrapper
    return wrapper.call("toLocalMatrix")
1593
+
1594
def toIndexedRowMatrix(self) -> IndexedRowMatrix:
    """
    Return this matrix converted to an IndexedRowMatrix.

    Examples
    --------
    >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])),
    ...                          ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))])
    >>> mat = BlockMatrix(blocks, 3, 2).toIndexedRowMatrix()

    >>> # This BlockMatrix will have 6 effective rows, due to
    >>> # having two sub-matrix blocks stacked, each with 3 rows.
    >>> # The ensuing IndexedRowMatrix will also have 6 rows.
    >>> print(mat.numRows())
    6

    >>> # This BlockMatrix will have 2 effective columns, due to
    >>> # having two sub-matrix blocks stacked, each with 2 columns.
    >>> # The ensuing IndexedRowMatrix will also have 2 columns.
    >>> print(mat.numCols())
    2
    """
    # The Java conversion result is wrapped in the Python class.
    wrapper = self._java_matrix_wrapper
    return IndexedRowMatrix(wrapper.call("toIndexedRowMatrix"))
1618
+
1619
def toCoordinateMatrix(self) -> CoordinateMatrix:
    """
    Return this matrix converted to a CoordinateMatrix.

    Examples
    --------
    >>> blocks = sc.parallelize([((0, 0), Matrices.dense(1, 2, [1, 2])),
    ...                          ((1, 0), Matrices.dense(1, 2, [7, 8]))])
    >>> mat = BlockMatrix(blocks, 1, 2).toCoordinateMatrix()
    >>> mat.entries.take(3)
    [MatrixEntry(0, 0, 1.0), MatrixEntry(0, 1, 2.0), MatrixEntry(1, 0, 7.0)]
    """
    # The Java conversion result is wrapped in the Python class.
    wrapper = self._java_matrix_wrapper
    return CoordinateMatrix(wrapper.call("toCoordinateMatrix"))
1633
+
1634
+
1635
def _test() -> None:
    """
    Run this module's doctests against a local Spark session.

    A `local[2]` SparkSession is created, its SparkContext is exposed
    to the doctests as `sc` together with `Matrices`, and the session
    is stopped once the doctests finish, even if running them raises.
    The process exits with status -1 when any doctest fails.
    """
    import doctest
    import sys

    import numpy
    from pyspark.sql import SparkSession
    from pyspark.mllib.linalg import Matrices
    import pyspark.mllib.linalg.distributed

    try:
        # NumPy 1.14+ changed its string format; request the legacy
        # one so the expected doctest output still matches.
        numpy.set_printoptions(legacy="1.13")
    except TypeError:
        # Best effort: this call raised TypeError, so the default
        # print options are kept.
        pass
    globs = pyspark.mllib.linalg.distributed.__dict__.copy()
    spark = (
        SparkSession.builder.master("local[2]")
        .appName("mllib.linalg.distributed tests")
        .getOrCreate()
    )
    try:
        globs["sc"] = spark.sparkContext
        globs["Matrices"] = Matrices
        failure_count, _ = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
    finally:
        # Always release the session, including when testmod raises.
        spark.stop()
    if failure_count:
        sys.exit(-1)
1659
+
1660
+
1661
# Run the module's doctests only when the file is executed as a script.
if __name__ == "__main__":
    _test()