snowpark-connect 0.20.2 (snowpark_connect-0.20.2-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
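
Since a wheel (.whl) is a standard ZIP archive, an inventory like the file list below can be reproduced locally with nothing but the Python standard library. A minimal sketch, assuming the wheel for the release shown here has been downloaded into the working directory:

    import zipfile

    # A .whl file is a plain ZIP archive, so the stdlib can enumerate it.
    # The filename is assumed from the release named at the top of this diff.
    WHEEL = "snowpark_connect-0.20.2-py3-none-any.whl"

    with zipfile.ZipFile(WHEEL) as whl:
        for info in whl.infolist():
            print(f"{info.filename}  ({info.file_size} bytes)")

Comparing the extracted contents of two such archives, file by file, is what produces the per-file +/- line counts in the listing below; since this is the first release of these files, every entry shows only added lines.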

Potentially problematic release: this version of snowpark-connect has been flagged by the registry; see its details page for more information.

Files changed (879)
  1. snowflake/snowpark_connect/__init__.py +23 -0
  2. snowflake/snowpark_connect/analyze_plan/__init__.py +3 -0
  3. snowflake/snowpark_connect/analyze_plan/map_tree_string.py +38 -0
  4. snowflake/snowpark_connect/column_name_handler.py +735 -0
  5. snowflake/snowpark_connect/config.py +576 -0
  6. snowflake/snowpark_connect/constants.py +47 -0
  7. snowflake/snowpark_connect/control_server.py +52 -0
  8. snowflake/snowpark_connect/dataframe_name_handler.py +54 -0
  9. snowflake/snowpark_connect/date_time_format_mapping.py +399 -0
  10. snowflake/snowpark_connect/empty_dataframe.py +18 -0
  11. snowflake/snowpark_connect/error/__init__.py +11 -0
  12. snowflake/snowpark_connect/error/error_mapping.py +6174 -0
  13. snowflake/snowpark_connect/error/error_utils.py +321 -0
  14. snowflake/snowpark_connect/error/exceptions.py +24 -0
  15. snowflake/snowpark_connect/execute_plan/__init__.py +3 -0
  16. snowflake/snowpark_connect/execute_plan/map_execution_command.py +204 -0
  17. snowflake/snowpark_connect/execute_plan/map_execution_root.py +173 -0
  18. snowflake/snowpark_connect/execute_plan/utils.py +183 -0
  19. snowflake/snowpark_connect/expression/__init__.py +3 -0
  20. snowflake/snowpark_connect/expression/literal.py +90 -0
  21. snowflake/snowpark_connect/expression/map_cast.py +343 -0
  22. snowflake/snowpark_connect/expression/map_expression.py +293 -0
  23. snowflake/snowpark_connect/expression/map_extension.py +104 -0
  24. snowflake/snowpark_connect/expression/map_sql_expression.py +633 -0
  25. snowflake/snowpark_connect/expression/map_udf.py +142 -0
  26. snowflake/snowpark_connect/expression/map_unresolved_attribute.py +241 -0
  27. snowflake/snowpark_connect/expression/map_unresolved_extract_value.py +85 -0
  28. snowflake/snowpark_connect/expression/map_unresolved_function.py +9450 -0
  29. snowflake/snowpark_connect/expression/map_unresolved_star.py +218 -0
  30. snowflake/snowpark_connect/expression/map_update_fields.py +164 -0
  31. snowflake/snowpark_connect/expression/map_window_function.py +258 -0
  32. snowflake/snowpark_connect/expression/typer.py +125 -0
  33. snowflake/snowpark_connect/includes/__init__.py +0 -0
  34. snowflake/snowpark_connect/includes/jars/antlr4-runtime-4.9.3.jar +0 -0
  35. snowflake/snowpark_connect/includes/jars/commons-cli-1.5.0.jar +0 -0
  36. snowflake/snowpark_connect/includes/jars/commons-codec-1.16.1.jar +0 -0
  37. snowflake/snowpark_connect/includes/jars/commons-collections-3.2.2.jar +0 -0
  38. snowflake/snowpark_connect/includes/jars/commons-collections4-4.4.jar +0 -0
  39. snowflake/snowpark_connect/includes/jars/commons-compiler-3.1.9.jar +0 -0
  40. snowflake/snowpark_connect/includes/jars/commons-compress-1.26.0.jar +0 -0
  41. snowflake/snowpark_connect/includes/jars/commons-crypto-1.1.0.jar +0 -0
  42. snowflake/snowpark_connect/includes/jars/commons-dbcp-1.4.jar +0 -0
  43. snowflake/snowpark_connect/includes/jars/commons-io-2.16.1.jar +0 -0
  44. snowflake/snowpark_connect/includes/jars/commons-lang-2.6.jar +0 -0
  45. snowflake/snowpark_connect/includes/jars/commons-lang3-3.12.0.jar +0 -0
  46. snowflake/snowpark_connect/includes/jars/commons-logging-1.1.3.jar +0 -0
  47. snowflake/snowpark_connect/includes/jars/commons-math3-3.6.1.jar +0 -0
  48. snowflake/snowpark_connect/includes/jars/commons-pool-1.5.4.jar +0 -0
  49. snowflake/snowpark_connect/includes/jars/commons-text-1.10.0.jar +0 -0
  50. snowflake/snowpark_connect/includes/jars/hadoop-client-api-3.3.4.jar +0 -0
  51. snowflake/snowpark_connect/includes/jars/jackson-annotations-2.15.2.jar +0 -0
  52. snowflake/snowpark_connect/includes/jars/jackson-core-2.15.2.jar +0 -0
  53. snowflake/snowpark_connect/includes/jars/jackson-core-asl-1.9.13.jar +0 -0
  54. snowflake/snowpark_connect/includes/jars/jackson-databind-2.15.2.jar +0 -0
  55. snowflake/snowpark_connect/includes/jars/jackson-dataformat-yaml-2.15.2.jar +0 -0
  56. snowflake/snowpark_connect/includes/jars/jackson-datatype-jsr310-2.15.2.jar +0 -0
  57. snowflake/snowpark_connect/includes/jars/jackson-mapper-asl-1.9.13.jar +0 -0
  58. snowflake/snowpark_connect/includes/jars/jackson-module-scala_2.12-2.15.2.jar +0 -0
  59. snowflake/snowpark_connect/includes/jars/json4s-ast_2.12-3.7.0-M11.jar +0 -0
  60. snowflake/snowpark_connect/includes/jars/json4s-core_2.12-3.7.0-M11.jar +0 -0
  61. snowflake/snowpark_connect/includes/jars/json4s-jackson_2.12-3.7.0-M11.jar +0 -0
  62. snowflake/snowpark_connect/includes/jars/json4s-scalap_2.12-3.7.0-M11.jar +0 -0
  63. snowflake/snowpark_connect/includes/jars/kryo-shaded-4.0.2.jar +0 -0
  64. snowflake/snowpark_connect/includes/jars/log4j-1.2-api-2.20.0.jar +0 -0
  65. snowflake/snowpark_connect/includes/jars/log4j-api-2.20.0.jar +0 -0
  66. snowflake/snowpark_connect/includes/jars/log4j-core-2.20.0.jar +0 -0
  67. snowflake/snowpark_connect/includes/jars/log4j-slf4j2-impl-2.20.0.jar +0 -0
  68. snowflake/snowpark_connect/includes/jars/paranamer-2.8.jar +0 -0
  69. snowflake/snowpark_connect/includes/jars/scala-collection-compat_2.12-2.7.0.jar +0 -0
  70. snowflake/snowpark_connect/includes/jars/scala-compiler-2.12.18.jar +0 -0
  71. snowflake/snowpark_connect/includes/jars/scala-library-2.12.18.jar +0 -0
  72. snowflake/snowpark_connect/includes/jars/scala-parser-combinators_2.12-2.3.0.jar +0 -0
  73. snowflake/snowpark_connect/includes/jars/scala-reflect-2.12.18.jar +0 -0
  74. snowflake/snowpark_connect/includes/jars/scala-xml_2.12-2.1.0.jar +0 -0
  75. snowflake/snowpark_connect/includes/jars/slf4j-api-2.0.7.jar +0 -0
  76. snowflake/snowpark_connect/includes/jars/spark-catalyst_2.12-3.5.6.jar +0 -0
  77. snowflake/snowpark_connect/includes/jars/spark-common-utils_2.12-3.5.6.jar +0 -0
  78. snowflake/snowpark_connect/includes/jars/spark-core_2.12-3.5.6.jar +0 -0
  79. snowflake/snowpark_connect/includes/jars/spark-graphx_2.12-3.5.6.jar +0 -0
  80. snowflake/snowpark_connect/includes/jars/spark-hive-thriftserver_2.12-3.5.6.jar +0 -0
  81. snowflake/snowpark_connect/includes/jars/spark-hive_2.12-3.5.6.jar +0 -0
  82. snowflake/snowpark_connect/includes/jars/spark-kubernetes_2.12-3.5.6.jar +0 -0
  83. snowflake/snowpark_connect/includes/jars/spark-kvstore_2.12-3.5.6.jar +0 -0
  84. snowflake/snowpark_connect/includes/jars/spark-launcher_2.12-3.5.6.jar +0 -0
  85. snowflake/snowpark_connect/includes/jars/spark-mesos_2.12-3.5.6.jar +0 -0
  86. snowflake/snowpark_connect/includes/jars/spark-mllib-local_2.12-3.5.6.jar +0 -0
  87. snowflake/snowpark_connect/includes/jars/spark-mllib_2.12-3.5.6.jar +0 -0
  88. snowflake/snowpark_connect/includes/jars/spark-network-common_2.12-3.5.6.jar +0 -0
  89. snowflake/snowpark_connect/includes/jars/spark-network-shuffle_2.12-3.5.6.jar +0 -0
  90. snowflake/snowpark_connect/includes/jars/spark-repl_2.12-3.5.6.jar +0 -0
  91. snowflake/snowpark_connect/includes/jars/spark-sketch_2.12-3.5.6.jar +0 -0
  92. snowflake/snowpark_connect/includes/jars/spark-sql-api_2.12-3.5.6.jar +0 -0
  93. snowflake/snowpark_connect/includes/jars/spark-sql_2.12-3.5.6.jar +0 -0
  94. snowflake/snowpark_connect/includes/jars/spark-streaming_2.12-3.5.6.jar +0 -0
  95. snowflake/snowpark_connect/includes/jars/spark-tags_2.12-3.5.6.jar +0 -0
  96. snowflake/snowpark_connect/includes/jars/spark-unsafe_2.12-3.5.6.jar +0 -0
  97. snowflake/snowpark_connect/includes/jars/spark-yarn_2.12-3.5.6.jar +0 -0
  98. snowflake/snowpark_connect/includes/python/__init__.py +21 -0
  99. snowflake/snowpark_connect/includes/python/pyspark/__init__.py +173 -0
  100. snowflake/snowpark_connect/includes/python/pyspark/_globals.py +71 -0
  101. snowflake/snowpark_connect/includes/python/pyspark/_typing.pyi +43 -0
  102. snowflake/snowpark_connect/includes/python/pyspark/accumulators.py +341 -0
  103. snowflake/snowpark_connect/includes/python/pyspark/broadcast.py +383 -0
  104. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/__init__.py +8 -0
  105. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/cloudpickle.py +948 -0
  106. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/cloudpickle_fast.py +844 -0
  107. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/compat.py +18 -0
  108. snowflake/snowpark_connect/includes/python/pyspark/conf.py +276 -0
  109. snowflake/snowpark_connect/includes/python/pyspark/context.py +2601 -0
  110. snowflake/snowpark_connect/includes/python/pyspark/daemon.py +218 -0
  111. snowflake/snowpark_connect/includes/python/pyspark/errors/__init__.py +70 -0
  112. snowflake/snowpark_connect/includes/python/pyspark/errors/error_classes.py +889 -0
  113. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/__init__.py +16 -0
  114. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/base.py +228 -0
  115. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/captured.py +307 -0
  116. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/connect.py +190 -0
  117. snowflake/snowpark_connect/includes/python/pyspark/errors/tests/__init__.py +16 -0
  118. snowflake/snowpark_connect/includes/python/pyspark/errors/tests/test_errors.py +60 -0
  119. snowflake/snowpark_connect/includes/python/pyspark/errors/utils.py +116 -0
  120. snowflake/snowpark_connect/includes/python/pyspark/files.py +165 -0
  121. snowflake/snowpark_connect/includes/python/pyspark/find_spark_home.py +95 -0
  122. snowflake/snowpark_connect/includes/python/pyspark/install.py +203 -0
  123. snowflake/snowpark_connect/includes/python/pyspark/instrumentation_utils.py +190 -0
  124. snowflake/snowpark_connect/includes/python/pyspark/java_gateway.py +248 -0
  125. snowflake/snowpark_connect/includes/python/pyspark/join.py +118 -0
  126. snowflake/snowpark_connect/includes/python/pyspark/ml/__init__.py +71 -0
  127. snowflake/snowpark_connect/includes/python/pyspark/ml/_typing.pyi +84 -0
  128. snowflake/snowpark_connect/includes/python/pyspark/ml/base.py +414 -0
  129. snowflake/snowpark_connect/includes/python/pyspark/ml/classification.py +4332 -0
  130. snowflake/snowpark_connect/includes/python/pyspark/ml/clustering.py +2188 -0
  131. snowflake/snowpark_connect/includes/python/pyspark/ml/common.py +146 -0
  132. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/__init__.py +44 -0
  133. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/base.py +346 -0
  134. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/classification.py +382 -0
  135. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/evaluation.py +291 -0
  136. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/feature.py +258 -0
  137. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/functions.py +77 -0
  138. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/io_utils.py +335 -0
  139. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/pipeline.py +262 -0
  140. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/summarizer.py +120 -0
  141. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/tuning.py +579 -0
  142. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/util.py +173 -0
  143. snowflake/snowpark_connect/includes/python/pyspark/ml/deepspeed/__init__.py +16 -0
  144. snowflake/snowpark_connect/includes/python/pyspark/ml/deepspeed/deepspeed_distributor.py +165 -0
  145. snowflake/snowpark_connect/includes/python/pyspark/ml/deepspeed/tests/test_deepspeed_distributor.py +306 -0
  146. snowflake/snowpark_connect/includes/python/pyspark/ml/dl_util.py +150 -0
  147. snowflake/snowpark_connect/includes/python/pyspark/ml/evaluation.py +1166 -0
  148. snowflake/snowpark_connect/includes/python/pyspark/ml/feature.py +7474 -0
  149. snowflake/snowpark_connect/includes/python/pyspark/ml/fpm.py +543 -0
  150. snowflake/snowpark_connect/includes/python/pyspark/ml/functions.py +842 -0
  151. snowflake/snowpark_connect/includes/python/pyspark/ml/image.py +271 -0
  152. snowflake/snowpark_connect/includes/python/pyspark/ml/linalg/__init__.py +1382 -0
  153. snowflake/snowpark_connect/includes/python/pyspark/ml/model_cache.py +55 -0
  154. snowflake/snowpark_connect/includes/python/pyspark/ml/param/__init__.py +602 -0
  155. snowflake/snowpark_connect/includes/python/pyspark/ml/param/_shared_params_code_gen.py +368 -0
  156. snowflake/snowpark_connect/includes/python/pyspark/ml/param/shared.py +878 -0
  157. snowflake/snowpark_connect/includes/python/pyspark/ml/pipeline.py +451 -0
  158. snowflake/snowpark_connect/includes/python/pyspark/ml/recommendation.py +748 -0
  159. snowflake/snowpark_connect/includes/python/pyspark/ml/regression.py +3335 -0
  160. snowflake/snowpark_connect/includes/python/pyspark/ml/stat.py +523 -0
  161. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/__init__.py +16 -0
  162. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_classification.py +53 -0
  163. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_evaluation.py +50 -0
  164. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_feature.py +43 -0
  165. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_function.py +114 -0
  166. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_pipeline.py +47 -0
  167. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_summarizer.py +43 -0
  168. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_tuning.py +46 -0
  169. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_classification.py +238 -0
  170. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_evaluation.py +194 -0
  171. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_feature.py +156 -0
  172. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_pipeline.py +184 -0
  173. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_summarizer.py +78 -0
  174. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_tuning.py +292 -0
  175. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_parity_torch_data_loader.py +50 -0
  176. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_parity_torch_distributor.py +152 -0
  177. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_algorithms.py +456 -0
  178. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_base.py +96 -0
  179. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_dl_util.py +186 -0
  180. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_evaluation.py +77 -0
  181. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_feature.py +401 -0
  182. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_functions.py +528 -0
  183. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_image.py +82 -0
  184. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_linalg.py +409 -0
  185. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_model_cache.py +55 -0
  186. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_param.py +441 -0
  187. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_persistence.py +546 -0
  188. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_pipeline.py +71 -0
  189. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_stat.py +52 -0
  190. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_training_summary.py +494 -0
  191. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_util.py +85 -0
  192. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_wrapper.py +138 -0
  193. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/__init__.py +16 -0
  194. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_basic.py +151 -0
  195. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_nested.py +97 -0
  196. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_pipeline.py +143 -0
  197. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tuning.py +551 -0
  198. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_basic.py +137 -0
  199. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_nested.py +96 -0
  200. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_pipeline.py +142 -0
  201. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/__init__.py +16 -0
  202. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/data.py +100 -0
  203. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/distributor.py +1133 -0
  204. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/log_communication.py +198 -0
  205. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/__init__.py +16 -0
  206. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_data_loader.py +137 -0
  207. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_distributor.py +561 -0
  208. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_log_communication.py +172 -0
  209. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/torch_run_process_wrapper.py +83 -0
  210. snowflake/snowpark_connect/includes/python/pyspark/ml/tree.py +434 -0
  211. snowflake/snowpark_connect/includes/python/pyspark/ml/tuning.py +1741 -0
  212. snowflake/snowpark_connect/includes/python/pyspark/ml/util.py +749 -0
  213. snowflake/snowpark_connect/includes/python/pyspark/ml/wrapper.py +465 -0
  214. snowflake/snowpark_connect/includes/python/pyspark/mllib/__init__.py +44 -0
  215. snowflake/snowpark_connect/includes/python/pyspark/mllib/_typing.pyi +33 -0
  216. snowflake/snowpark_connect/includes/python/pyspark/mllib/classification.py +989 -0
  217. snowflake/snowpark_connect/includes/python/pyspark/mllib/clustering.py +1318 -0
  218. snowflake/snowpark_connect/includes/python/pyspark/mllib/common.py +174 -0
  219. snowflake/snowpark_connect/includes/python/pyspark/mllib/evaluation.py +691 -0
  220. snowflake/snowpark_connect/includes/python/pyspark/mllib/feature.py +1085 -0
  221. snowflake/snowpark_connect/includes/python/pyspark/mllib/fpm.py +233 -0
  222. snowflake/snowpark_connect/includes/python/pyspark/mllib/linalg/__init__.py +1653 -0
  223. snowflake/snowpark_connect/includes/python/pyspark/mllib/linalg/distributed.py +1662 -0
  224. snowflake/snowpark_connect/includes/python/pyspark/mllib/random.py +698 -0
  225. snowflake/snowpark_connect/includes/python/pyspark/mllib/recommendation.py +389 -0
  226. snowflake/snowpark_connect/includes/python/pyspark/mllib/regression.py +1067 -0
  227. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/KernelDensity.py +59 -0
  228. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/__init__.py +34 -0
  229. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/_statistics.py +409 -0
  230. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/distribution.py +39 -0
  231. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/test.py +86 -0
  232. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/__init__.py +16 -0
  233. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_algorithms.py +353 -0
  234. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_feature.py +192 -0
  235. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_linalg.py +680 -0
  236. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_stat.py +206 -0
  237. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_streaming_algorithms.py +471 -0
  238. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_util.py +108 -0
  239. snowflake/snowpark_connect/includes/python/pyspark/mllib/tree.py +888 -0
  240. snowflake/snowpark_connect/includes/python/pyspark/mllib/util.py +659 -0
  241. snowflake/snowpark_connect/includes/python/pyspark/pandas/__init__.py +165 -0
  242. snowflake/snowpark_connect/includes/python/pyspark/pandas/_typing.py +52 -0
  243. snowflake/snowpark_connect/includes/python/pyspark/pandas/accessors.py +989 -0
  244. snowflake/snowpark_connect/includes/python/pyspark/pandas/base.py +1804 -0
  245. snowflake/snowpark_connect/includes/python/pyspark/pandas/categorical.py +822 -0
  246. snowflake/snowpark_connect/includes/python/pyspark/pandas/config.py +539 -0
  247. snowflake/snowpark_connect/includes/python/pyspark/pandas/correlation.py +262 -0
  248. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/__init__.py +16 -0
  249. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/base.py +519 -0
  250. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/binary_ops.py +98 -0
  251. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/boolean_ops.py +426 -0
  252. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/categorical_ops.py +141 -0
  253. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/complex_ops.py +145 -0
  254. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/date_ops.py +127 -0
  255. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/datetime_ops.py +171 -0
  256. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/null_ops.py +83 -0
  257. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/num_ops.py +588 -0
  258. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/string_ops.py +154 -0
  259. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/timedelta_ops.py +101 -0
  260. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/udt_ops.py +29 -0
  261. snowflake/snowpark_connect/includes/python/pyspark/pandas/datetimes.py +891 -0
  262. snowflake/snowpark_connect/includes/python/pyspark/pandas/exceptions.py +150 -0
  263. snowflake/snowpark_connect/includes/python/pyspark/pandas/extensions.py +388 -0
  264. snowflake/snowpark_connect/includes/python/pyspark/pandas/frame.py +13738 -0
  265. snowflake/snowpark_connect/includes/python/pyspark/pandas/generic.py +3560 -0
  266. snowflake/snowpark_connect/includes/python/pyspark/pandas/groupby.py +4448 -0
  267. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/__init__.py +21 -0
  268. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/base.py +2783 -0
  269. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/category.py +773 -0
  270. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/datetimes.py +843 -0
  271. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/multi.py +1323 -0
  272. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/numeric.py +210 -0
  273. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/timedelta.py +197 -0
  274. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexing.py +1862 -0
  275. snowflake/snowpark_connect/includes/python/pyspark/pandas/internal.py +1680 -0
  276. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/__init__.py +48 -0
  277. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/common.py +76 -0
  278. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/frame.py +63 -0
  279. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/general_functions.py +43 -0
  280. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/groupby.py +93 -0
  281. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/indexes.py +184 -0
  282. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/resample.py +101 -0
  283. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/scalars.py +29 -0
  284. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/series.py +69 -0
  285. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/window.py +168 -0
  286. snowflake/snowpark_connect/includes/python/pyspark/pandas/mlflow.py +238 -0
  287. snowflake/snowpark_connect/includes/python/pyspark/pandas/namespace.py +3807 -0
  288. snowflake/snowpark_connect/includes/python/pyspark/pandas/numpy_compat.py +260 -0
  289. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/__init__.py +17 -0
  290. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/core.py +1213 -0
  291. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/matplotlib.py +928 -0
  292. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/plotly.py +261 -0
  293. snowflake/snowpark_connect/includes/python/pyspark/pandas/resample.py +816 -0
  294. snowflake/snowpark_connect/includes/python/pyspark/pandas/series.py +7440 -0
  295. snowflake/snowpark_connect/includes/python/pyspark/pandas/sql_formatter.py +308 -0
  296. snowflake/snowpark_connect/includes/python/pyspark/pandas/sql_processor.py +394 -0
  297. snowflake/snowpark_connect/includes/python/pyspark/pandas/strings.py +2371 -0
  298. snowflake/snowpark_connect/includes/python/pyspark/pandas/supported_api_gen.py +378 -0
  299. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/__init__.py +16 -0
  300. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/__init__.py +16 -0
  301. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_any_all.py +177 -0
  302. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_apply_func.py +575 -0
  303. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_binary_ops.py +235 -0
  304. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_combine.py +653 -0
  305. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_compute.py +463 -0
  306. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_corrwith.py +86 -0
  307. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_cov.py +151 -0
  308. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_cumulative.py +139 -0
  309. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_describe.py +458 -0
  310. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_eval.py +86 -0
  311. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_melt.py +202 -0
  312. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_missing_data.py +520 -0
  313. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_pivot.py +361 -0
  314. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/__init__.py +16 -0
  315. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/__init__.py +16 -0
  316. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_any_all.py +40 -0
  317. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_apply_func.py +42 -0
  318. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_binary_ops.py +40 -0
  319. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_combine.py +37 -0
  320. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_compute.py +60 -0
  321. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_corrwith.py +40 -0
  322. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_cov.py +40 -0
  323. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_cumulative.py +90 -0
  324. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_describe.py +40 -0
  325. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_eval.py +40 -0
  326. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_melt.py +40 -0
  327. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_missing_data.py +42 -0
  328. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_pivot.py +37 -0
  329. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/__init__.py +16 -0
  330. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_base.py +36 -0
  331. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_binary_ops.py +42 -0
  332. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_boolean_ops.py +47 -0
  333. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_categorical_ops.py +55 -0
  334. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_complex_ops.py +40 -0
  335. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_date_ops.py +47 -0
  336. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_datetime_ops.py +47 -0
  337. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_null_ops.py +42 -0
  338. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_arithmetic.py +43 -0
  339. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_ops.py +47 -0
  340. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_reverse.py +43 -0
  341. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_string_ops.py +47 -0
  342. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_timedelta_ops.py +47 -0
  343. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_udt_ops.py +40 -0
  344. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/testing_utils.py +226 -0
  345. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/__init__.py +16 -0
  346. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_align.py +39 -0
  347. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_basic_slow.py +55 -0
  348. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_cov_corrwith.py +39 -0
  349. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_dot_frame.py +39 -0
  350. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_dot_series.py +39 -0
  351. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_index.py +39 -0
  352. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_series.py +39 -0
  353. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_setitem_frame.py +43 -0
  354. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_setitem_series.py +43 -0
  355. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/__init__.py +16 -0
  356. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_attrs.py +40 -0
  357. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_constructor.py +39 -0
  358. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_conversion.py +42 -0
  359. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_reindexing.py +42 -0
  360. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_reshaping.py +37 -0
  361. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_spark.py +40 -0
  362. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_take.py +42 -0
  363. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_time_series.py +48 -0
  364. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_truncate.py +40 -0
  365. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/__init__.py +16 -0
  366. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_aggregate.py +40 -0
  367. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_apply_func.py +41 -0
  368. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_cumulative.py +67 -0
  369. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_describe.py +40 -0
  370. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_groupby.py +55 -0
  371. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_head_tail.py +40 -0
  372. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_index.py +38 -0
  373. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_missing_data.py +55 -0
  374. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py +39 -0
  375. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_stat.py +38 -0
  376. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/__init__.py +16 -0
  377. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_align.py +40 -0
  378. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_base.py +50 -0
  379. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_category.py +73 -0
  380. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_datetime.py +39 -0
  381. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_indexing.py +40 -0
  382. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_reindex.py +40 -0
  383. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_rename.py +40 -0
  384. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_reset_index.py +48 -0
  385. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_timedelta.py +39 -0
  386. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/io/__init__.py +16 -0
  387. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/io/test_parity_io.py +40 -0
  388. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/__init__.py +16 -0
  389. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot.py +45 -0
  390. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot_matplotlib.py +45 -0
  391. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot_plotly.py +49 -0
  392. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot.py +37 -0
  393. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot_matplotlib.py +53 -0
  394. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot_plotly.py +45 -0
  395. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/__init__.py +16 -0
  396. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_all_any.py +38 -0
  397. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_arg_ops.py +37 -0
  398. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_as_of.py +37 -0
  399. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_as_type.py +38 -0
  400. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_compute.py +37 -0
  401. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_conversion.py +40 -0
  402. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_cumulative.py +40 -0
  403. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_index.py +38 -0
  404. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_missing_data.py +40 -0
  405. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_series.py +37 -0
  406. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_sort.py +38 -0
  407. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_stat.py +38 -0
  408. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_categorical.py +66 -0
  409. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_config.py +37 -0
  410. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_csv.py +37 -0
  411. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_dataframe_conversion.py +42 -0
  412. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_dataframe_spark_io.py +39 -0
  413. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_default_index.py +49 -0
  414. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ewm.py +37 -0
  415. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_expanding.py +39 -0
  416. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_extension.py +49 -0
  417. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_frame_spark.py +53 -0
  418. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_generic_functions.py +43 -0
  419. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_indexing.py +49 -0
  420. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_indexops_spark.py +39 -0
  421. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_internal.py +41 -0
  422. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_namespace.py +39 -0
  423. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_numpy_compat.py +60 -0
  424. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames.py +48 -0
  425. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby.py +39 -0
  426. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_expanding.py +44 -0
  427. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_rolling.py +84 -0
  428. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_repr.py +37 -0
  429. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_resample.py +45 -0
  430. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_reshape.py +39 -0
  431. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_rolling.py +39 -0
  432. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_scalars.py +37 -0
  433. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_conversion.py +39 -0
  434. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_datetime.py +39 -0
  435. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_string.py +39 -0
  436. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_spark_functions.py +39 -0
  437. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_sql.py +43 -0
  438. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_stats.py +37 -0
  439. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_typedef.py +36 -0
  440. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_utils.py +37 -0
  441. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_window.py +39 -0
  442. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/__init__.py +16 -0
  443. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_base.py +107 -0
  444. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_binary_ops.py +224 -0
  445. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py +825 -0
  446. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_categorical_ops.py +562 -0
  447. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_complex_ops.py +368 -0
  448. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_date_ops.py +257 -0
  449. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_datetime_ops.py +260 -0
  450. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_null_ops.py +178 -0
  451. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_arithmetic.py +184 -0
  452. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py +497 -0
  453. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_reverse.py +140 -0
  454. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_string_ops.py +354 -0
  455. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_timedelta_ops.py +219 -0
  456. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_udt_ops.py +192 -0
  457. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/testing_utils.py +228 -0
  458. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/__init__.py +16 -0
  459. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_align.py +118 -0
  460. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_basic_slow.py +198 -0
  461. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_cov_corrwith.py +181 -0
  462. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_dot_frame.py +103 -0
  463. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_dot_series.py +141 -0
  464. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_index.py +109 -0
  465. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_series.py +136 -0
  466. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_setitem_frame.py +125 -0
  467. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_setitem_series.py +217 -0
  468. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/__init__.py +16 -0
  469. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_attrs.py +384 -0
  470. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_constructor.py +598 -0
  471. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_conversion.py +73 -0
  472. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_reindexing.py +869 -0
  473. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_reshaping.py +487 -0
  474. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_spark.py +309 -0
  475. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_take.py +156 -0
  476. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_time_series.py +149 -0
  477. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_truncate.py +163 -0
  478. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/__init__.py +16 -0
  479. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_aggregate.py +311 -0
  480. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_apply_func.py +524 -0
  481. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_cumulative.py +419 -0
  482. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_describe.py +144 -0
  483. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_groupby.py +979 -0
  484. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_head_tail.py +234 -0
  485. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_index.py +206 -0
  486. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_missing_data.py +421 -0
  487. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_split_apply.py +187 -0
  488. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_stat.py +397 -0
  489. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/__init__.py +16 -0
  490. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_align.py +100 -0
  491. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_base.py +2743 -0
  492. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_category.py +484 -0
  493. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_datetime.py +276 -0
  494. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_indexing.py +432 -0
  495. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_reindex.py +310 -0
  496. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_rename.py +257 -0
  497. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_reset_index.py +160 -0
  498. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_timedelta.py +128 -0
  499. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/io/__init__.py +16 -0
  500. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/io/test_io.py +137 -0
  501. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/__init__.py +16 -0
  502. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot.py +170 -0
  503. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot_matplotlib.py +547 -0
  504. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot_plotly.py +285 -0
  505. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot.py +106 -0
  506. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot_matplotlib.py +409 -0
  507. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot_plotly.py +247 -0
  508. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/__init__.py +16 -0
  509. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_all_any.py +105 -0
  510. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_arg_ops.py +197 -0
  511. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_as_of.py +137 -0
  512. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_as_type.py +227 -0
  513. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_compute.py +634 -0
  514. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_conversion.py +88 -0
  515. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_cumulative.py +139 -0
  516. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_index.py +475 -0
  517. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_missing_data.py +265 -0
  518. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_series.py +818 -0
  519. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_sort.py +162 -0
  520. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_stat.py +780 -0
  521. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_categorical.py +741 -0
  522. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_config.py +160 -0
  523. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_csv.py +453 -0
  524. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_dataframe_conversion.py +281 -0
  525. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_dataframe_spark_io.py +487 -0
  526. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_default_index.py +109 -0
  527. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ewm.py +434 -0
  528. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_expanding.py +253 -0
  529. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_extension.py +152 -0
  530. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_frame_spark.py +162 -0
  531. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_generic_functions.py +234 -0
  532. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_indexing.py +1339 -0
  533. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_indexops_spark.py +82 -0
  534. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_internal.py +124 -0
  535. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_namespace.py +638 -0
  536. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_numpy_compat.py +200 -0
  537. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames.py +1355 -0
  538. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py +655 -0
  539. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_expanding.py +113 -0
  540. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_rolling.py +118 -0
  541. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_repr.py +192 -0
  542. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_resample.py +346 -0
  543. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_reshape.py +495 -0
  544. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_rolling.py +263 -0
  545. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_scalars.py +59 -0
  546. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_conversion.py +85 -0
  547. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_datetime.py +364 -0
  548. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_string.py +362 -0
  549. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_spark_functions.py +46 -0
  550. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_sql.py +123 -0
  551. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_stats.py +581 -0
  552. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_typedef.py +447 -0
  553. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_utils.py +301 -0
  554. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_window.py +465 -0
  555. snowflake/snowpark_connect/includes/python/pyspark/pandas/typedef/__init__.py +18 -0
  556. snowflake/snowpark_connect/includes/python/pyspark/pandas/typedef/typehints.py +874 -0
  557. snowflake/snowpark_connect/includes/python/pyspark/pandas/usage_logging/__init__.py +143 -0
  558. snowflake/snowpark_connect/includes/python/pyspark/pandas/usage_logging/usage_logger.py +132 -0
  559. snowflake/snowpark_connect/includes/python/pyspark/pandas/utils.py +1063 -0
  560. snowflake/snowpark_connect/includes/python/pyspark/pandas/window.py +2702 -0
  561. snowflake/snowpark_connect/includes/python/pyspark/profiler.py +489 -0
  562. snowflake/snowpark_connect/includes/python/pyspark/py.typed +1 -0
  563. snowflake/snowpark_connect/includes/python/pyspark/python/pyspark/shell.py +123 -0
  564. snowflake/snowpark_connect/includes/python/pyspark/rdd.py +5518 -0
  565. snowflake/snowpark_connect/includes/python/pyspark/rddsampler.py +115 -0
  566. snowflake/snowpark_connect/includes/python/pyspark/resource/__init__.py +38 -0
  567. snowflake/snowpark_connect/includes/python/pyspark/resource/information.py +69 -0
  568. snowflake/snowpark_connect/includes/python/pyspark/resource/profile.py +317 -0
  569. snowflake/snowpark_connect/includes/python/pyspark/resource/requests.py +539 -0
  570. snowflake/snowpark_connect/includes/python/pyspark/resource/tests/__init__.py +16 -0
  571. snowflake/snowpark_connect/includes/python/pyspark/resource/tests/test_resources.py +83 -0
  572. snowflake/snowpark_connect/includes/python/pyspark/resultiterable.py +45 -0
  573. snowflake/snowpark_connect/includes/python/pyspark/serializers.py +681 -0
  574. snowflake/snowpark_connect/includes/python/pyspark/shell.py +123 -0
  575. snowflake/snowpark_connect/includes/python/pyspark/shuffle.py +854 -0
  576. snowflake/snowpark_connect/includes/python/pyspark/sql/__init__.py +75 -0
  577. snowflake/snowpark_connect/includes/python/pyspark/sql/_typing.pyi +80 -0
  578. snowflake/snowpark_connect/includes/python/pyspark/sql/avro/__init__.py +18 -0
  579. snowflake/snowpark_connect/includes/python/pyspark/sql/avro/functions.py +188 -0
  580. snowflake/snowpark_connect/includes/python/pyspark/sql/catalog.py +1270 -0
  581. snowflake/snowpark_connect/includes/python/pyspark/sql/column.py +1431 -0
  582. snowflake/snowpark_connect/includes/python/pyspark/sql/conf.py +99 -0
  583. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/__init__.py +18 -0
  584. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/_typing.py +90 -0
  585. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/avro/__init__.py +18 -0
  586. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/avro/functions.py +107 -0
  587. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/catalog.py +356 -0
  588. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/__init__.py +22 -0
  589. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/artifact.py +412 -0
  590. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/core.py +1689 -0
  591. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/reattach.py +340 -0
  592. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/column.py +514 -0
  593. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/conf.py +128 -0
  594. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/conversion.py +490 -0
  595. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/dataframe.py +2172 -0
  596. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/expressions.py +1056 -0
  597. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/functions.py +3937 -0
  598. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/group.py +418 -0
  599. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/plan.py +2289 -0
  600. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/__init__.py +25 -0
  601. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/base_pb2.py +203 -0
  602. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/base_pb2.pyi +2718 -0
  603. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/base_pb2_grpc.py +423 -0
  604. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/catalog_pb2.py +109 -0
  605. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/catalog_pb2.pyi +1130 -0
  606. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/commands_pb2.py +141 -0
  607. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/commands_pb2.pyi +1766 -0
  608. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/common_pb2.py +47 -0
  609. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/common_pb2.pyi +123 -0
  610. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/example_plugins_pb2.py +53 -0
  611. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/example_plugins_pb2.pyi +112 -0
  612. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/expressions_pb2.py +107 -0
  613. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/expressions_pb2.pyi +1507 -0
  614. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/relations_pb2.py +195 -0
  615. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/relations_pb2.pyi +3613 -0
  616. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/types_pb2.py +95 -0
  617. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/types_pb2.pyi +980 -0
  618. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/protobuf/__init__.py +18 -0
  619. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/protobuf/functions.py +166 -0
  620. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/readwriter.py +861 -0
  621. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/session.py +952 -0
  622. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/__init__.py +22 -0
  623. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/query.py +295 -0
  624. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/readwriter.py +618 -0
  625. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/__init__.py +18 -0
  626. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/foreach_batch_worker.py +87 -0
  627. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/listener_worker.py +100 -0
  628. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/types.py +301 -0
  629. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/udf.py +296 -0
  630. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/udtf.py +200 -0
  631. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/utils.py +58 -0
  632. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/window.py +266 -0
  633. snowflake/snowpark_connect/includes/python/pyspark/sql/context.py +818 -0
  634. snowflake/snowpark_connect/includes/python/pyspark/sql/dataframe.py +5973 -0
  635. snowflake/snowpark_connect/includes/python/pyspark/sql/functions.py +15889 -0
  636. snowflake/snowpark_connect/includes/python/pyspark/sql/group.py +547 -0
  637. snowflake/snowpark_connect/includes/python/pyspark/sql/observation.py +152 -0
  638. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/__init__.py +21 -0
  639. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/__init__.pyi +344 -0
  640. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/protocols/__init__.pyi +17 -0
  641. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/protocols/frame.pyi +20 -0
  642. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/protocols/series.pyi +20 -0
  643. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/conversion.py +671 -0
  644. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/functions.py +480 -0
  645. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/functions.pyi +132 -0
  646. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/group_ops.py +523 -0
  647. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/map_ops.py +216 -0
  648. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/serializers.py +1019 -0
  649. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/typehints.py +172 -0
  650. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/types.py +972 -0
  651. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/utils.py +86 -0
  652. snowflake/snowpark_connect/includes/python/pyspark/sql/protobuf/__init__.py +18 -0
  653. snowflake/snowpark_connect/includes/python/pyspark/sql/protobuf/functions.py +334 -0
  654. snowflake/snowpark_connect/includes/python/pyspark/sql/readwriter.py +2159 -0
  655. snowflake/snowpark_connect/includes/python/pyspark/sql/session.py +2088 -0
  656. snowflake/snowpark_connect/includes/python/pyspark/sql/sql_formatter.py +84 -0
  657. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/__init__.py +21 -0
  658. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/listener.py +1050 -0
  659. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/query.py +746 -0
  660. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/readwriter.py +1652 -0
  661. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/state.py +288 -0
  662. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/__init__.py +16 -0
  663. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/__init__.py +16 -0
  664. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/__init__.py +16 -0
  665. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/test_artifact.py +420 -0
  666. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/test_client.py +358 -0
  667. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/__init__.py +16 -0
  668. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_foreach.py +36 -0
  669. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_foreach_batch.py +44 -0
  670. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_listener.py +116 -0
  671. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_streaming.py +35 -0
  672. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_basic.py +3612 -0
  673. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_column.py +1042 -0
  674. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_function.py +2381 -0
  675. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_plan.py +1060 -0
  676. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow.py +163 -0
  677. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow_map.py +38 -0
  678. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow_python_udf.py +48 -0
  679. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_catalog.py +36 -0
  680. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_column.py +55 -0
  681. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_conf.py +36 -0
  682. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_dataframe.py +96 -0
  683. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_datasources.py +44 -0
  684. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_errors.py +36 -0
  685. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_functions.py +59 -0
  686. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_group.py +36 -0
  687. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_cogrouped_map.py +59 -0
  688. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_grouped_map.py +74 -0
  689. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_grouped_map_with_state.py +62 -0
  690. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_map.py +58 -0
  691. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf.py +70 -0
  692. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_grouped_agg.py +50 -0
  693. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_scalar.py +68 -0
  694. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_window.py +40 -0
  695. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_readwriter.py +46 -0
  696. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_serde.py +44 -0
  697. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_types.py +100 -0
  698. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_udf.py +100 -0
  699. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_udtf.py +163 -0
  700. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_session.py +181 -0
  701. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_utils.py +42 -0
  702. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/__init__.py +16 -0
  703. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_cogrouped_map.py +623 -0
  704. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_grouped_map.py +869 -0
  705. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_grouped_map_with_state.py +342 -0
  706. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_map.py +436 -0
  707. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf.py +363 -0
  708. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_grouped_agg.py +592 -0
  709. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_scalar.py +1503 -0
  710. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_typehints.py +392 -0
  711. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_typehints_with_future_annotations.py +375 -0
  712. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_window.py +411 -0
  713. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/__init__.py +16 -0
  714. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming.py +401 -0
  715. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_foreach.py +295 -0
  716. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_foreach_batch.py +106 -0
  717. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_listener.py +558 -0
  718. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow.py +1346 -0
  719. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow_map.py +182 -0
  720. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow_python_udf.py +202 -0
  721. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_catalog.py +503 -0
  722. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_column.py +225 -0
  723. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_conf.py +83 -0
  724. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_context.py +201 -0
  725. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_dataframe.py +1931 -0
  726. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_datasources.py +256 -0
  727. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_errors.py +69 -0
  728. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_functions.py +1349 -0
  729. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_group.py +53 -0
  730. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_pandas_sqlmetrics.py +68 -0
  731. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_readwriter.py +283 -0
  732. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_serde.py +155 -0
  733. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_session.py +412 -0
  734. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_types.py +1581 -0
  735. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udf.py +961 -0
  736. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udf_profiler.py +165 -0
  737. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udtf.py +1456 -0
  738. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_utils.py +1686 -0
  739. snowflake/snowpark_connect/includes/python/pyspark/sql/types.py +2558 -0
  740. snowflake/snowpark_connect/includes/python/pyspark/sql/udf.py +714 -0
  741. snowflake/snowpark_connect/includes/python/pyspark/sql/udtf.py +325 -0
  742. snowflake/snowpark_connect/includes/python/pyspark/sql/utils.py +339 -0
  743. snowflake/snowpark_connect/includes/python/pyspark/sql/window.py +492 -0
  744. snowflake/snowpark_connect/includes/python/pyspark/statcounter.py +165 -0
  745. snowflake/snowpark_connect/includes/python/pyspark/status.py +112 -0
  746. snowflake/snowpark_connect/includes/python/pyspark/storagelevel.py +97 -0
  747. snowflake/snowpark_connect/includes/python/pyspark/streaming/__init__.py +22 -0
  748. snowflake/snowpark_connect/includes/python/pyspark/streaming/context.py +471 -0
  749. snowflake/snowpark_connect/includes/python/pyspark/streaming/dstream.py +933 -0
  750. snowflake/snowpark_connect/includes/python/pyspark/streaming/kinesis.py +205 -0
  751. snowflake/snowpark_connect/includes/python/pyspark/streaming/listener.py +83 -0
  752. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/__init__.py +16 -0
  753. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_context.py +184 -0
  754. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_dstream.py +706 -0
  755. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_kinesis.py +118 -0
  756. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_listener.py +160 -0
  757. snowflake/snowpark_connect/includes/python/pyspark/streaming/util.py +168 -0
  758. snowflake/snowpark_connect/includes/python/pyspark/taskcontext.py +502 -0
  759. snowflake/snowpark_connect/includes/python/pyspark/testing/__init__.py +21 -0
  760. snowflake/snowpark_connect/includes/python/pyspark/testing/connectutils.py +199 -0
  761. snowflake/snowpark_connect/includes/python/pyspark/testing/mllibutils.py +30 -0
  762. snowflake/snowpark_connect/includes/python/pyspark/testing/mlutils.py +275 -0
  763. snowflake/snowpark_connect/includes/python/pyspark/testing/objects.py +121 -0
  764. snowflake/snowpark_connect/includes/python/pyspark/testing/pandasutils.py +714 -0
  765. snowflake/snowpark_connect/includes/python/pyspark/testing/sqlutils.py +168 -0
  766. snowflake/snowpark_connect/includes/python/pyspark/testing/streamingutils.py +178 -0
  767. snowflake/snowpark_connect/includes/python/pyspark/testing/utils.py +636 -0
  768. snowflake/snowpark_connect/includes/python/pyspark/tests/__init__.py +16 -0
  769. snowflake/snowpark_connect/includes/python/pyspark/tests/test_appsubmit.py +306 -0
  770. snowflake/snowpark_connect/includes/python/pyspark/tests/test_broadcast.py +196 -0
  771. snowflake/snowpark_connect/includes/python/pyspark/tests/test_conf.py +44 -0
  772. snowflake/snowpark_connect/includes/python/pyspark/tests/test_context.py +346 -0
  773. snowflake/snowpark_connect/includes/python/pyspark/tests/test_daemon.py +89 -0
  774. snowflake/snowpark_connect/includes/python/pyspark/tests/test_install_spark.py +124 -0
  775. snowflake/snowpark_connect/includes/python/pyspark/tests/test_join.py +69 -0
  776. snowflake/snowpark_connect/includes/python/pyspark/tests/test_memory_profiler.py +167 -0
  777. snowflake/snowpark_connect/includes/python/pyspark/tests/test_pin_thread.py +194 -0
  778. snowflake/snowpark_connect/includes/python/pyspark/tests/test_profiler.py +168 -0
  779. snowflake/snowpark_connect/includes/python/pyspark/tests/test_rdd.py +939 -0
  780. snowflake/snowpark_connect/includes/python/pyspark/tests/test_rddbarrier.py +52 -0
  781. snowflake/snowpark_connect/includes/python/pyspark/tests/test_rddsampler.py +66 -0
  782. snowflake/snowpark_connect/includes/python/pyspark/tests/test_readwrite.py +368 -0
  783. snowflake/snowpark_connect/includes/python/pyspark/tests/test_serializers.py +257 -0
  784. snowflake/snowpark_connect/includes/python/pyspark/tests/test_shuffle.py +267 -0
  785. snowflake/snowpark_connect/includes/python/pyspark/tests/test_stage_sched.py +153 -0
  786. snowflake/snowpark_connect/includes/python/pyspark/tests/test_statcounter.py +130 -0
  787. snowflake/snowpark_connect/includes/python/pyspark/tests/test_taskcontext.py +350 -0
  788. snowflake/snowpark_connect/includes/python/pyspark/tests/test_util.py +97 -0
  789. snowflake/snowpark_connect/includes/python/pyspark/tests/test_worker.py +271 -0
  790. snowflake/snowpark_connect/includes/python/pyspark/traceback_utils.py +81 -0
  791. snowflake/snowpark_connect/includes/python/pyspark/util.py +416 -0
  792. snowflake/snowpark_connect/includes/python/pyspark/version.py +19 -0
  793. snowflake/snowpark_connect/includes/python/pyspark/worker.py +1307 -0
  794. snowflake/snowpark_connect/includes/python/pyspark/worker_util.py +46 -0
  795. snowflake/snowpark_connect/proto/__init__.py +10 -0
  796. snowflake/snowpark_connect/proto/control_pb2.py +35 -0
  797. snowflake/snowpark_connect/proto/control_pb2.pyi +38 -0
  798. snowflake/snowpark_connect/proto/control_pb2_grpc.py +183 -0
  799. snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.py +35 -0
  800. snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.pyi +53 -0
  801. snowflake/snowpark_connect/proto/snowflake_rdd_pb2.pyi +39 -0
  802. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.py +47 -0
  803. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.pyi +111 -0
  804. snowflake/snowpark_connect/relation/__init__.py +3 -0
  805. snowflake/snowpark_connect/relation/catalogs/__init__.py +12 -0
  806. snowflake/snowpark_connect/relation/catalogs/abstract_spark_catalog.py +287 -0
  807. snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py +467 -0
  808. snowflake/snowpark_connect/relation/catalogs/utils.py +51 -0
  809. snowflake/snowpark_connect/relation/io_utils.py +76 -0
  810. snowflake/snowpark_connect/relation/map_aggregate.py +322 -0
  811. snowflake/snowpark_connect/relation/map_catalog.py +151 -0
  812. snowflake/snowpark_connect/relation/map_column_ops.py +1068 -0
  813. snowflake/snowpark_connect/relation/map_crosstab.py +48 -0
  814. snowflake/snowpark_connect/relation/map_extension.py +412 -0
  815. snowflake/snowpark_connect/relation/map_join.py +341 -0
  816. snowflake/snowpark_connect/relation/map_local_relation.py +326 -0
  817. snowflake/snowpark_connect/relation/map_map_partitions.py +146 -0
  818. snowflake/snowpark_connect/relation/map_relation.py +253 -0
  819. snowflake/snowpark_connect/relation/map_row_ops.py +716 -0
  820. snowflake/snowpark_connect/relation/map_sample_by.py +35 -0
  821. snowflake/snowpark_connect/relation/map_show_string.py +50 -0
  822. snowflake/snowpark_connect/relation/map_sql.py +1874 -0
  823. snowflake/snowpark_connect/relation/map_stats.py +324 -0
  824. snowflake/snowpark_connect/relation/map_subquery_alias.py +32 -0
  825. snowflake/snowpark_connect/relation/map_udtf.py +288 -0
  826. snowflake/snowpark_connect/relation/read/__init__.py +7 -0
  827. snowflake/snowpark_connect/relation/read/jdbc_read_dbapi.py +668 -0
  828. snowflake/snowpark_connect/relation/read/map_read.py +367 -0
  829. snowflake/snowpark_connect/relation/read/map_read_csv.py +142 -0
  830. snowflake/snowpark_connect/relation/read/map_read_jdbc.py +108 -0
  831. snowflake/snowpark_connect/relation/read/map_read_json.py +344 -0
  832. snowflake/snowpark_connect/relation/read/map_read_parquet.py +194 -0
  833. snowflake/snowpark_connect/relation/read/map_read_socket.py +59 -0
  834. snowflake/snowpark_connect/relation/read/map_read_table.py +109 -0
  835. snowflake/snowpark_connect/relation/read/map_read_text.py +106 -0
  836. snowflake/snowpark_connect/relation/read/reader_config.py +399 -0
  837. snowflake/snowpark_connect/relation/read/utils.py +155 -0
  838. snowflake/snowpark_connect/relation/stage_locator.py +161 -0
  839. snowflake/snowpark_connect/relation/utils.py +219 -0
  840. snowflake/snowpark_connect/relation/write/__init__.py +3 -0
  841. snowflake/snowpark_connect/relation/write/jdbc_write_dbapi.py +339 -0
  842. snowflake/snowpark_connect/relation/write/map_write.py +436 -0
  843. snowflake/snowpark_connect/relation/write/map_write_jdbc.py +48 -0
  844. snowflake/snowpark_connect/resources/java_udfs-1.0-SNAPSHOT.jar +0 -0
  845. snowflake/snowpark_connect/resources_initializer.py +75 -0
  846. snowflake/snowpark_connect/server.py +1136 -0
  847. snowflake/snowpark_connect/start_server.py +32 -0
  848. snowflake/snowpark_connect/tcm.py +8 -0
  849. snowflake/snowpark_connect/type_mapping.py +1003 -0
  850. snowflake/snowpark_connect/typed_column.py +94 -0
  851. snowflake/snowpark_connect/utils/__init__.py +3 -0
  852. snowflake/snowpark_connect/utils/artifacts.py +48 -0
  853. snowflake/snowpark_connect/utils/attribute_handling.py +72 -0
  854. snowflake/snowpark_connect/utils/cache.py +84 -0
  855. snowflake/snowpark_connect/utils/concurrent.py +124 -0
  856. snowflake/snowpark_connect/utils/context.py +390 -0
  857. snowflake/snowpark_connect/utils/describe_query_cache.py +231 -0
  858. snowflake/snowpark_connect/utils/interrupt.py +85 -0
  859. snowflake/snowpark_connect/utils/io_utils.py +35 -0
  860. snowflake/snowpark_connect/utils/pandas_udtf_utils.py +117 -0
  861. snowflake/snowpark_connect/utils/profiling.py +47 -0
  862. snowflake/snowpark_connect/utils/session.py +180 -0
  863. snowflake/snowpark_connect/utils/snowpark_connect_logging.py +38 -0
  864. snowflake/snowpark_connect/utils/telemetry.py +513 -0
  865. snowflake/snowpark_connect/utils/udf_cache.py +392 -0
  866. snowflake/snowpark_connect/utils/udf_helper.py +328 -0
  867. snowflake/snowpark_connect/utils/udf_utils.py +310 -0
  868. snowflake/snowpark_connect/utils/udtf_helper.py +420 -0
  869. snowflake/snowpark_connect/utils/udtf_utils.py +799 -0
  870. snowflake/snowpark_connect/utils/xxhash64.py +247 -0
  871. snowflake/snowpark_connect/version.py +6 -0
  872. snowpark_connect-0.20.2.data/scripts/snowpark-connect +71 -0
  873. snowpark_connect-0.20.2.data/scripts/snowpark-session +11 -0
  874. snowpark_connect-0.20.2.data/scripts/snowpark-submit +354 -0
  875. snowpark_connect-0.20.2.dist-info/METADATA +37 -0
  876. snowpark_connect-0.20.2.dist-info/RECORD +879 -0
  877. snowpark_connect-0.20.2.dist-info/WHEEL +5 -0
  878. snowpark_connect-0.20.2.dist-info/licenses/LICENSE.txt +202 -0
  879. snowpark_connect-0.20.2.dist-info/top_level.txt +1 -0
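Between the manifest above and the per-file diffs, the largest single addition in this wheel is a vendored copy of pyspark/pandas/window.py (entry 560 above, 2,702 added lines), reproduced next. As a quick orientation — a minimal sketch, not code shipped in the package — this is how the Rolling API implemented in that file behaves, assuming a running Spark session; the expected values match the file's own doctests:

    import pyspark.pandas as ps

    s = ps.Series([4, 3, 5, 2, 6])
    # rolling(2): each window spans the current and the previous row; the first
    # result is NaN because min_periods defaults to the window size.
    print(s.rolling(2).sum())  # NaN, 7.0, 8.0, 7.0, 8.0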
snowflake/snowpark_connect/includes/python/pyspark/pandas/window.py (new file)
@@ -0,0 +1,2702 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from abc import ABCMeta, abstractmethod
from functools import partial
from typing import Any, Callable, Generic, List, Optional

import numpy as np

from pyspark.sql import Window
from pyspark.sql import functions as F
from pyspark.pandas.missing.window import (
    MissingPandasLikeRolling,
    MissingPandasLikeRollingGroupby,
    MissingPandasLikeExpanding,
    MissingPandasLikeExpandingGroupby,
    MissingPandasLikeExponentialMoving,
    MissingPandasLikeExponentialMovingGroupby,
)

# For running doctests and reference resolution in PyCharm.
from pyspark import pandas as ps  # noqa: F401
from pyspark.pandas._typing import FrameLike
from pyspark.pandas.groupby import GroupBy, DataFrameGroupBy
from pyspark.pandas.internal import NATURAL_ORDER_COLUMN_NAME, SPARK_INDEX_NAME_FORMAT
from pyspark.pandas.spark import functions as SF
from pyspark.pandas.utils import scol_for
from pyspark.sql.column import Column
from pyspark.sql.types import (
    DoubleType,
)
from pyspark.sql.window import WindowSpec


class RollingAndExpanding(Generic[FrameLike], metaclass=ABCMeta):
    def __init__(self, window: WindowSpec, min_periods: int):
        self._window = window
        # This unbounded Window is later used to handle 'min_periods' for now.
        self._unbounded_window = Window.orderBy(NATURAL_ORDER_COLUMN_NAME).rowsBetween(
            Window.unboundedPreceding, Window.currentRow
        )
        self._min_periods = min_periods

    @abstractmethod
    def _apply_as_series_or_frame(self, func: Callable[[Column], Column]) -> FrameLike:
        """
        Wraps a function that handles Spark column in order
        to support it in both pandas-on-Spark Series and DataFrame.
        Note that the given `func` name should be same as the API's method name.
        """
        pass

    @abstractmethod
    def count(self) -> FrameLike:
        pass

    def sum(self) -> FrameLike:
        def sum(scol: Column) -> Column:
            return F.when(
                F.row_number().over(self._unbounded_window) >= self._min_periods,
                F.sum(scol).over(self._window),
            ).otherwise(F.lit(None))

        return self._apply_as_series_or_frame(sum)

    def min(self) -> FrameLike:
        def min(scol: Column) -> Column:
            return F.when(
                F.row_number().over(self._unbounded_window) >= self._min_periods,
                F.min(scol).over(self._window),
            ).otherwise(F.lit(None))

        return self._apply_as_series_or_frame(min)

    def max(self) -> FrameLike:
        def max(scol: Column) -> Column:
            return F.when(
                F.row_number().over(self._unbounded_window) >= self._min_periods,
                F.max(scol).over(self._window),
            ).otherwise(F.lit(None))

        return self._apply_as_series_or_frame(max)

    def mean(self) -> FrameLike:
        def mean(scol: Column) -> Column:
            return F.when(
                F.row_number().over(self._unbounded_window) >= self._min_periods,
                F.mean(scol).over(self._window),
            ).otherwise(F.lit(None))

        return self._apply_as_series_or_frame(mean)

    def quantile(self, q: float, accuracy: int = 10000) -> FrameLike:
        def quantile(scol: Column) -> Column:
            return F.when(
                F.row_number().over(self._unbounded_window) >= self._min_periods,
                F.percentile_approx(scol.cast(DoubleType()), q, accuracy).over(self._window),
            ).otherwise(F.lit(None))

        return self._apply_as_series_or_frame(quantile)

    def std(self) -> FrameLike:
        def std(scol: Column) -> Column:
            return F.when(
                F.row_number().over(self._unbounded_window) >= self._min_periods,
                F.stddev(scol).over(self._window),
            ).otherwise(F.lit(None))

        return self._apply_as_series_or_frame(std)

    def var(self) -> FrameLike:
        def var(scol: Column) -> Column:
            return F.when(
                F.row_number().over(self._unbounded_window) >= self._min_periods,
                F.variance(scol).over(self._window),
            ).otherwise(F.lit(None))

        return self._apply_as_series_or_frame(var)

    def skew(self) -> FrameLike:
        def skew(scol: Column) -> Column:
            return F.when(
                F.row_number().over(self._unbounded_window) >= self._min_periods,
                SF.skew(scol).over(self._window),
            ).otherwise(F.lit(None))

        return self._apply_as_series_or_frame(skew)

    def kurt(self) -> FrameLike:
        def kurt(scol: Column) -> Column:
            return F.when(
                F.row_number().over(self._unbounded_window) >= self._min_periods,
                SF.kurt(scol).over(self._window),
            ).otherwise(F.lit(None))

        return self._apply_as_series_or_frame(kurt)


class RollingLike(RollingAndExpanding[FrameLike]):
    def __init__(
        self,
        window: int,
        min_periods: Optional[int] = None,
    ):
        if window < 0:
            raise ValueError("window must be >= 0")
        if (min_periods is not None) and (min_periods < 0):
            raise ValueError("min_periods must be >= 0")
        if min_periods is None:
            # TODO: 'min_periods' is not equivalent in pandas because it does not count NA as
            #  a value.
            min_periods = window

        window_spec = Window.orderBy(NATURAL_ORDER_COLUMN_NAME).rowsBetween(
            Window.currentRow - (window - 1), Window.currentRow
        )

        super().__init__(window_spec, min_periods)

    def count(self) -> FrameLike:
        def count(scol: Column) -> Column:
            return F.count(scol).over(self._window)

        return self._apply_as_series_or_frame(count).astype("float64")  # type: ignore[attr-defined]


class Rolling(RollingLike[FrameLike]):
    def __init__(
        self,
        psdf_or_psser: FrameLike,
        window: int,
        min_periods: Optional[int] = None,
    ):
        from pyspark.pandas.frame import DataFrame
        from pyspark.pandas.series import Series

        super().__init__(window, min_periods)

        self._psdf_or_psser = psdf_or_psser

        if not isinstance(psdf_or_psser, (DataFrame, Series)):
            raise TypeError(
                "psdf_or_psser must be a series or dataframe; however, got: %s"
                % type(psdf_or_psser)
            )

    def __getattr__(self, item: str) -> Any:
        if hasattr(MissingPandasLikeRolling, item):
            property_or_func = getattr(MissingPandasLikeRolling, item)
            if isinstance(property_or_func, property):
                return property_or_func.fget(self)
            else:
                return partial(property_or_func, self)
        raise AttributeError(item)

    def _apply_as_series_or_frame(self, func: Callable[[Column], Column]) -> FrameLike:
        return self._psdf_or_psser._apply_series_op(
            lambda psser: psser._with_new_scol(func(psser.spark.column)),  # TODO: dtype?
            should_resolve=True,
        )

    def count(self) -> FrameLike:
        """
        The rolling count of any non-NaN observations inside the window.

        .. note:: the current implementation of this API uses Spark's Window without
            specifying a partition specification. This moves all data into a
            single partition on a single machine and could cause serious
            performance degradation. Avoid this method with very large datasets.

        Returns
        -------
        Series or DataFrame
            Return type is the same as the original object with `np.float64` dtype.

        See Also
        --------
        pyspark.pandas.Series.rolling : Calling object with Series data.
        pyspark.pandas.DataFrame.rolling : Calling object with DataFrames.
        pyspark.pandas.Series.count : Count of the full Series.
        pyspark.pandas.DataFrame.count : Count of the full DataFrame.

        Examples
        --------
        >>> s = ps.Series([2, 3, float("nan"), 10])
        >>> s.rolling(1).count()
        0    1.0
        1    1.0
        2    0.0
        3    1.0
        dtype: float64

        >>> s.rolling(3).count()
        0    1.0
        1    2.0
        2    2.0
        3    2.0
        dtype: float64

        >>> s.to_frame().rolling(1).count()
             0
        0  1.0
        1  1.0
        2  0.0
        3  1.0

        >>> s.to_frame().rolling(3).count()
             0
        0  1.0
        1  2.0
        2  2.0
        3  2.0
        """
        return super().count()

    def sum(self) -> FrameLike:
        """
        Calculate rolling summation of given DataFrame or Series.

        .. note:: the current implementation of this API uses Spark's Window without
            specifying a partition specification. This moves all data into a
            single partition on a single machine and could cause serious
            performance degradation. Avoid this method with very large datasets.

        Returns
        -------
        Series or DataFrame
            Same type as the input, with the same index, containing the
            rolling summation.

        See Also
        --------
        pyspark.pandas.Series.rolling : Calling object with Series data.
        pyspark.pandas.DataFrame.rolling : Calling object with DataFrames.
        pyspark.pandas.Series.sum : Reducing sum for Series.
        pyspark.pandas.DataFrame.sum : Reducing sum for DataFrame.

        Examples
        --------
        >>> s = ps.Series([4, 3, 5, 2, 6])
        >>> s
        0    4
        1    3
        2    5
        3    2
        4    6
        dtype: int64

        >>> s.rolling(2).sum()
        0    NaN
        1    7.0
        2    8.0
        3    7.0
        4    8.0
        dtype: float64

        >>> s.rolling(3).sum()
        0     NaN
        1     NaN
        2    12.0
        3    10.0
        4    13.0
        dtype: float64

        For DataFrame, each rolling summation is computed column-wise.

        >>> df = ps.DataFrame({"A": s.to_numpy(), "B": s.to_numpy() ** 2})
        >>> df
           A   B
        0  4  16
        1  3   9
        2  5  25
        3  2   4
        4  6  36

        >>> df.rolling(2).sum()
             A     B
        0  NaN   NaN
        1  7.0  25.0
        2  8.0  34.0
        3  7.0  29.0
        4  8.0  40.0

        >>> df.rolling(3).sum()
              A     B
        0   NaN   NaN
        1   NaN   NaN
        2  12.0  50.0
        3  10.0  38.0
        4  13.0  65.0
        """
        return super().sum()

    def min(self) -> FrameLike:
        """
        Calculate the rolling minimum.

        .. note:: the current implementation of this API uses Spark's Window without
            specifying a partition specification. This moves all data into a
            single partition on a single machine and could cause serious
            performance degradation. Avoid this method with very large datasets.

        Returns
        -------
        Series or DataFrame
            Returned object type is determined by the caller of the rolling
            calculation.

        See Also
        --------
        pyspark.pandas.Series.rolling : Calling object with a Series.
        pyspark.pandas.DataFrame.rolling : Calling object with a DataFrame.
        pyspark.pandas.Series.min : Similar method for Series.
        pyspark.pandas.DataFrame.min : Similar method for DataFrame.

        Examples
        --------
        >>> s = ps.Series([4, 3, 5, 2, 6])
        >>> s
        0    4
        1    3
        2    5
        3    2
        4    6
        dtype: int64

        >>> s.rolling(2).min()
        0    NaN
        1    3.0
        2    3.0
        3    2.0
        4    2.0
        dtype: float64

        >>> s.rolling(3).min()
        0    NaN
        1    NaN
        2    3.0
        3    2.0
        4    2.0
        dtype: float64

        For DataFrame, each rolling minimum is computed column-wise.

        >>> df = ps.DataFrame({"A": s.to_numpy(), "B": s.to_numpy() ** 2})
        >>> df
           A   B
        0  4  16
        1  3   9
        2  5  25
        3  2   4
        4  6  36

        >>> df.rolling(2).min()
             A    B
        0  NaN  NaN
        1  3.0  9.0
        2  3.0  9.0
        3  2.0  4.0
        4  2.0  4.0

        >>> df.rolling(3).min()
             A    B
        0  NaN  NaN
        1  NaN  NaN
        2  3.0  9.0
        3  2.0  4.0
        4  2.0  4.0
        """
        return super().min()

    def max(self) -> FrameLike:
        """
        Calculate the rolling maximum.

        .. note:: the current implementation of this API uses Spark's Window without
            specifying a partition specification. This moves all data into a
            single partition on a single machine and could cause serious
            performance degradation. Avoid this method with very large datasets.

        Returns
        -------
        Series or DataFrame
            Return type is determined by the caller.

        See Also
        --------
        pyspark.pandas.Series.rolling : Series rolling.
        pyspark.pandas.DataFrame.rolling : DataFrame rolling.
        pyspark.pandas.Series.max : Similar method for Series.
        pyspark.pandas.DataFrame.max : Similar method for DataFrame.

        Examples
        --------
        >>> s = ps.Series([4, 3, 5, 2, 6])
        >>> s
        0    4
        1    3
        2    5
        3    2
        4    6
        dtype: int64

        >>> s.rolling(2).max()
        0    NaN
        1    4.0
        2    5.0
        3    5.0
        4    6.0
        dtype: float64

        >>> s.rolling(3).max()
        0    NaN
        1    NaN
        2    5.0
        3    5.0
        4    6.0
        dtype: float64

        For DataFrame, each rolling maximum is computed column-wise.

        >>> df = ps.DataFrame({"A": s.to_numpy(), "B": s.to_numpy() ** 2})
        >>> df
           A   B
        0  4  16
        1  3   9
        2  5  25
        3  2   4
        4  6  36

        >>> df.rolling(2).max()
             A     B
        0  NaN   NaN
        1  4.0  16.0
        2  5.0  25.0
        3  5.0  25.0
        4  6.0  36.0

        >>> df.rolling(3).max()
             A     B
        0  NaN   NaN
        1  NaN   NaN
        2  5.0  25.0
        3  5.0  25.0
        4  6.0  36.0
        """
        return super().max()

    def mean(self) -> FrameLike:
        """
        Calculate the rolling mean of the values.

        .. note:: the current implementation of this API uses Spark's Window without
            specifying a partition specification. This moves all data into a
            single partition on a single machine and could cause serious
            performance degradation. Avoid this method with very large datasets.

        Returns
        -------
        Series or DataFrame
            Returned object type is determined by the caller of the rolling
            calculation.

        See Also
        --------
        pyspark.pandas.Series.rolling : Calling object with Series data.
        pyspark.pandas.DataFrame.rolling : Calling object with DataFrames.
        pyspark.pandas.Series.mean : Equivalent method for Series.
        pyspark.pandas.DataFrame.mean : Equivalent method for DataFrame.

        Examples
        --------
        >>> s = ps.Series([4, 3, 5, 2, 6])
        >>> s
        0    4
        1    3
        2    5
        3    2
        4    6
        dtype: int64

        >>> s.rolling(2).mean()
        0    NaN
        1    3.5
        2    4.0
        3    3.5
        4    4.0
        dtype: float64

        >>> s.rolling(3).mean()
        0         NaN
        1         NaN
        2    4.000000
        3    3.333333
        4    4.333333
        dtype: float64

        For DataFrame, each rolling mean is computed column-wise.

        >>> df = ps.DataFrame({"A": s.to_numpy(), "B": s.to_numpy() ** 2})
        >>> df
           A   B
        0  4  16
        1  3   9
        2  5  25
        3  2   4
        4  6  36

        >>> df.rolling(2).mean()
             A     B
        0  NaN   NaN
        1  3.5  12.5
        2  4.0  17.0
        3  3.5  14.5
        4  4.0  20.0

        >>> df.rolling(3).mean()
                  A          B
        0       NaN        NaN
        1       NaN        NaN
        2  4.000000  16.666667
        3  3.333333  12.666667
        4  4.333333  21.666667
        """
        return super().mean()

    def quantile(self, quantile: float, accuracy: int = 10000) -> FrameLike:
        """
        Calculate the rolling quantile of the values.

        .. versionadded:: 3.4.0

        Parameters
        ----------
        quantile : float
            Value between 0 and 1 providing the quantile to compute.
        accuracy : int, optional
            Default accuracy of approximation. Larger value means better accuracy.
            The relative error can be deduced by 1.0 / accuracy.
            This is a pandas-on-Spark specific parameter.

        Returns
        -------
        Series or DataFrame
            Returned object type is determined by the caller of the rolling
            calculation.

        Notes
        -----
        `quantile` in pandas-on-Spark uses a distributed percentile approximation
        algorithm, unlike pandas, so the result may differ from pandas; the
        `interpolation` parameter is not supported yet.

        The current implementation of this API uses Spark's Window without
        specifying a partition specification. This moves all data into a
        single partition on a single machine and could cause serious
        performance degradation. Avoid this method with very large datasets.

        See Also
        --------
        pyspark.pandas.Series.rolling : Calling rolling with Series data.
        pyspark.pandas.DataFrame.rolling : Calling rolling with DataFrames.
        pyspark.pandas.Series.quantile : Aggregating quantile for Series.
        pyspark.pandas.DataFrame.quantile : Aggregating quantile for DataFrame.

        Examples
        --------
        >>> s = ps.Series([4, 3, 5, 2, 6])
        >>> s
        0    4
        1    3
        2    5
        3    2
        4    6
        dtype: int64

        >>> s.rolling(2).quantile(0.5)
        0    NaN
        1    3.0
        2    3.0
        3    2.0
        4    2.0
        dtype: float64

        >>> s.rolling(3).quantile(0.5)
        0    NaN
        1    NaN
        2    4.0
        3    3.0
        4    5.0
        dtype: float64

        For DataFrame, each rolling quantile is computed column-wise.

        >>> df = ps.DataFrame({"A": s.to_numpy(), "B": s.to_numpy() ** 2})
        >>> df
           A   B
        0  4  16
        1  3   9
        2  5  25
        3  2   4
        4  6  36

        >>> df.rolling(2).quantile(0.5)
             A    B
        0  NaN  NaN
        1  3.0  9.0
        2  3.0  9.0
        3  2.0  4.0
        4  2.0  4.0

        >>> df.rolling(3).quantile(0.5)
             A     B
        0  NaN   NaN
        1  NaN   NaN
        2  4.0  16.0
        3  3.0   9.0
        4  5.0  25.0
        """
        return super().quantile(quantile, accuracy)

    def std(self) -> FrameLike:
        """
        Calculate rolling standard deviation.

        .. note:: the current implementation of this API uses Spark's Window without
            specifying a partition specification. This moves all data into a
            single partition on a single machine and could cause serious
            performance degradation. Avoid this method with very large datasets.

        Returns
        -------
        Series or DataFrame
            Returns the same object type as the caller of the rolling calculation.

        See Also
        --------
        pyspark.pandas.Series.rolling : Calling object with Series data.
        pyspark.pandas.DataFrame.rolling : Calling object with DataFrames.
        pyspark.pandas.Series.std : Equivalent method for Series.
        pyspark.pandas.DataFrame.std : Equivalent method for DataFrame.
        numpy.std : Equivalent method for Numpy array.

        Examples
        --------
        >>> s = ps.Series([5, 5, 6, 7, 5, 5, 5])
        >>> s.rolling(3).std()
        0         NaN
        1         NaN
        2    0.577350
        3    1.000000
        4    1.000000
        5    1.154701
        6    0.000000
        dtype: float64

        For DataFrame, each rolling standard deviation is computed column-wise.

        >>> df = ps.DataFrame({"A": s.to_numpy(), "B": s.to_numpy() ** 2})
        >>> df.rolling(2).std()
                  A          B
        0       NaN        NaN
        1  0.000000   0.000000
        2  0.707107   7.778175
        3  0.707107   9.192388
        4  1.414214  16.970563
        5  0.000000   0.000000
        6  0.000000   0.000000
        """
        return super().std()

    def var(self) -> FrameLike:
        """
        Calculate unbiased rolling variance.

        .. note:: the current implementation of this API uses Spark's Window without
            specifying a partition specification. This moves all data into a
            single partition on a single machine and could cause serious
            performance degradation. Avoid this method with very large datasets.

        Returns
        -------
        Series or DataFrame
            Returns the same object type as the caller of the rolling calculation.

        See Also
        --------
        Series.rolling : Calling object with Series data.
        DataFrame.rolling : Calling object with DataFrames.
        Series.var : Equivalent method for Series.
        DataFrame.var : Equivalent method for DataFrame.
        numpy.var : Equivalent method for Numpy array.

        Examples
        --------
        >>> s = ps.Series([5, 5, 6, 7, 5, 5, 5])
        >>> s.rolling(3).var()
        0         NaN
        1         NaN
        2    0.333333
        3    1.000000
        4    1.000000
        5    1.333333
        6    0.000000
        dtype: float64

        For DataFrame, each unbiased rolling variance is computed column-wise.

        >>> df = ps.DataFrame({"A": s.to_numpy(), "B": s.to_numpy() ** 2})
        >>> df.rolling(2).var()
             A      B
        0  NaN    NaN
        1  0.0    0.0
        2  0.5   60.5
        3  0.5   84.5
        4  2.0  288.0
        5  0.0    0.0
        6  0.0    0.0
        """
        return super().var()

    def skew(self) -> FrameLike:
        """
        Calculate unbiased rolling skew.

        .. note:: the current implementation of this API uses Spark's Window without
            specifying a partition specification. This moves all data into a
            single partition on a single machine and could cause serious
            performance degradation. Avoid this method with very large datasets.

        Returns
        -------
        Series or DataFrame
            Returns the same object type as the caller of the rolling calculation.

        See Also
        --------
        pyspark.pandas.Series.rolling : Calling object with Series data.
        pyspark.pandas.DataFrame.rolling : Calling object with DataFrames.
        pyspark.pandas.Series.skew : Equivalent method for Series.
        pyspark.pandas.DataFrame.skew : Equivalent method for DataFrame.
        scipy.stats.skew : Equivalent function for ndarrays.

        Examples
        --------
        >>> s = ps.Series([5, 5, 6, 7, 5, 1, 5, 9])
        >>> s.rolling(3).skew()
        0         NaN
        1         NaN
        2    1.732051
        3    0.000000
        4    0.000000
        5   -0.935220
        6   -1.732051
        7    0.000000
        dtype: float64

        For DataFrame, each rolling skewness is computed column-wise.

        >>> df = ps.DataFrame({"A": s.to_numpy(), "B": s.to_numpy() ** 2})
        >>> df.rolling(5).skew()
                  A         B
        0       NaN       NaN
        1       NaN       NaN
        2       NaN       NaN
        3       NaN       NaN
        4  1.257788  1.369456
        5 -1.492685 -0.526039
        6 -1.492685 -0.526039
        7 -0.551618  0.686072
        """
        return super().skew()

    def kurt(self) -> FrameLike:
        """
        Calculate unbiased rolling kurtosis.

        .. note:: the current implementation of this API uses Spark's Window without
            specifying a partition specification. This moves all data into a
            single partition on a single machine and could cause serious
            performance degradation. Avoid this method with very large datasets.

        Returns
        -------
        Series or DataFrame
            Returns the same object type as the caller of the rolling calculation.

        See Also
        --------
        pyspark.pandas.Series.rolling : Calling object with Series data.
        pyspark.pandas.DataFrame.rolling : Calling object with DataFrames.
        pyspark.pandas.Series.kurt : Equivalent method for Series.
        pyspark.pandas.DataFrame.kurt : Equivalent method for DataFrame.
        scipy.stats.kurtosis : Equivalent function for ndarrays.

        Examples
        --------
        >>> s = ps.Series([5, 5, 6, 7, 5, 1, 5, 9])
        >>> s.rolling(4).kurt()
        0         NaN
        1         NaN
        2         NaN
        3   -1.289256
        4   -1.289256
        5    2.234867
        6    2.227147
        7    1.500000
        dtype: float64

        For DataFrame, each unbiased rolling kurtosis is computed column-wise.

        >>> df = ps.DataFrame({"A": s.to_numpy(), "B": s.to_numpy() ** 2})
        >>> df.rolling(5).kurt()
                  A         B
        0       NaN       NaN
        1       NaN       NaN
        2       NaN       NaN
        3       NaN       NaN
        4  0.312500  0.906336
        5  2.818047  1.016942
        6  2.818047  1.016942
        7  0.867769  0.389750
        """
        return super().kurt()


class RollingGroupby(RollingLike[FrameLike]):
    def __init__(
        self,
        groupby: GroupBy[FrameLike],
        window: int,
        min_periods: Optional[int] = None,
    ):
        super().__init__(window, min_periods)

        self._groupby = groupby
        self._window = self._window.partitionBy(*[ser.spark.column for ser in groupby._groupkeys])
        self._unbounded_window = self._unbounded_window.partitionBy(
            *[ser.spark.column for ser in groupby._groupkeys]
        )

    def __getattr__(self, item: str) -> Any:
        if hasattr(MissingPandasLikeRollingGroupby, item):
            property_or_func = getattr(MissingPandasLikeRollingGroupby, item)
            if isinstance(property_or_func, property):
                return property_or_func.fget(self)
            else:
                return partial(property_or_func, self)
        raise AttributeError(item)

    def _apply_as_series_or_frame(self, func: Callable[[Column], Column]) -> FrameLike:
        """
        Wraps a function that handles Spark column in order
        to support it in both pandas-on-Spark Series and DataFrame.
        Note that the given `func` name should be same as the API's method name.
        """
        from pyspark.pandas import DataFrame

        groupby = self._groupby
        psdf = groupby._psdf

        # Here we need to include grouped key as an index, and shift previous index.
        # [index_column0, index_column1] -> [grouped key, index_column0, index_column1]
        new_index_scols: List[Column] = []
        new_index_spark_column_names = []
        new_index_names = []
        new_index_fields = []
        for groupkey in groupby._groupkeys:
            index_column_name = SPARK_INDEX_NAME_FORMAT(len(new_index_scols))
            new_index_scols.append(groupkey.spark.column.alias(index_column_name))
            new_index_spark_column_names.append(index_column_name)
            new_index_names.append(groupkey._column_label)
            new_index_fields.append(groupkey._internal.data_fields[0].copy(name=index_column_name))

        for new_index_scol, index_name, index_field in zip(
            psdf._internal.index_spark_columns,
            psdf._internal.index_names,
            psdf._internal.index_fields,
        ):
            index_column_name = SPARK_INDEX_NAME_FORMAT(len(new_index_scols))
            new_index_scols.append(new_index_scol.alias(index_column_name))
            new_index_spark_column_names.append(index_column_name)
            new_index_names.append(index_name)
            new_index_fields.append(index_field.copy(name=index_column_name))

        if groupby._agg_columns_selected:
            agg_columns = groupby._agg_columns
        else:
            # pandas doesn't keep the groupkey as a column from 1.3 for DataFrameGroupBy
            column_labels_to_exclude = groupby._column_labels_to_exclude.copy()
            if isinstance(groupby, DataFrameGroupBy):
                for groupkey in groupby._groupkeys:  # type: ignore[attr-defined]
                    column_labels_to_exclude.add(groupkey._internal.column_labels[0])
            agg_columns = [
                psdf._psser_for(label)
                for label in psdf._internal.column_labels
                if label not in column_labels_to_exclude
            ]

        applied = []
        for agg_column in agg_columns:
            applied.append(agg_column._with_new_scol(func(agg_column.spark.column)))  # TODO: dtype?

        # Seems like pandas filters out when grouped key is NA.
        cond = groupby._groupkeys[0].spark.column.isNotNull()
        for c in groupby._groupkeys[1:]:
            cond = cond | c.spark.column.isNotNull()

        sdf = psdf._internal.spark_frame.filter(cond).select(
            new_index_scols + [c.spark.column for c in applied]
        )

        internal = psdf._internal.copy(
            spark_frame=sdf,
            index_spark_columns=[scol_for(sdf, col) for col in new_index_spark_column_names],
            index_names=new_index_names,
            index_fields=new_index_fields,
            column_labels=[c._column_label for c in applied],
            data_spark_columns=[
                scol_for(sdf, c._internal.data_spark_column_names[0]) for c in applied
            ],
            data_fields=[c._internal.data_fields[0] for c in applied],
        )

        return groupby._handle_output(DataFrame(internal))

    def count(self) -> FrameLike:
        """
        The rolling count of any non-NaN observations inside the window.

        Returns
        -------
        Series or DataFrame
            Returned object type is determined by the caller of the rolling
            calculation.

        See Also
        --------
        pyspark.pandas.Series.rolling : Calling object with Series data.
        pyspark.pandas.DataFrame.rolling : Calling object with DataFrames.
        pyspark.pandas.Series.count : Count of the full Series.
        pyspark.pandas.DataFrame.count : Count of the full DataFrame.

        Examples
        --------
        >>> s = ps.Series([2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5])
        >>> s.groupby(s).rolling(3).count().sort_index()
        2  0     1.0
           1     2.0
        3  2     1.0
           3     2.0
           4     3.0
        4  5     1.0
           6     2.0
           7     3.0
           8     3.0
        5  9     1.0
           10    2.0
        dtype: float64

        For DataFrame, each rolling count is computed column-wise.

        >>> df = ps.DataFrame({"A": s.to_numpy(), "B": s.to_numpy() ** 2})
        >>> df.groupby(df.A).rolling(2).count().sort_index()  # doctest: +NORMALIZE_WHITESPACE
                B
        A
        2 0   1.0
          1   2.0
        3 2   1.0
          3   2.0
          4   2.0
        4 5   1.0
          6   2.0
          7   2.0
          8   2.0
        5 9   1.0
          10  2.0
        """
        return super().count()

    def sum(self) -> FrameLike:
        """
        The rolling summation of any non-NaN observations inside the window.

        Returns
        -------
        Series or DataFrame
            Returned object type is determined by the caller of the rolling
            calculation.

        See Also
        --------
        pyspark.pandas.Series.rolling : Calling object with Series data.
        pyspark.pandas.DataFrame.rolling : Calling object with DataFrames.
        pyspark.pandas.Series.sum : Sum of the full Series.
        pyspark.pandas.DataFrame.sum : Sum of the full DataFrame.

        Examples
        --------
        >>> s = ps.Series([2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5])
        >>> s.groupby(s).rolling(3).sum().sort_index()
        2  0      NaN
           1      NaN
        3  2      NaN
           3      NaN
           4      9.0
        4  5      NaN
           6      NaN
           7     12.0
           8     12.0
        5  9      NaN
           10     NaN
        dtype: float64

        For DataFrame, each rolling summation is computed column-wise.

        >>> df = ps.DataFrame({"A": s.to_numpy(), "B": s.to_numpy() ** 2})
        >>> df.groupby(df.A).rolling(2).sum().sort_index()  # doctest: +NORMALIZE_WHITESPACE
                 B
        A
        2 0    NaN
          1    8.0
        3 2    NaN
          3   18.0
          4   18.0
        4 5    NaN
          6   32.0
          7   32.0
          8   32.0
        5 9    NaN
          10  50.0
        """
        return super().sum()

+ def min(self) -> FrameLike:
1089
+ """
1090
+ The rolling minimum of any non-NaN observations inside the window.
1091
+
1092
+ Returns
1093
+ -------
1094
+ Series or DataFrame
1095
+ Returned object type is determined by the caller of the rolling
1096
+ calculation.
1097
+
1098
+ See Also
1099
+ --------
1100
+ pyspark.pandas.Series.rolling : Calling object with Series data.
1101
+ pyspark.pandas.DataFrame.rolling : Calling object with DataFrames.
1102
+ pyspark.pandas.Series.min : Min of the full Series.
1103
+ pyspark.pandas.DataFrame.min : Min of the full DataFrame.
1104
+
1105
+ Examples
1106
+ --------
1107
+ >>> s = ps.Series([2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5])
1108
+ >>> s.groupby(s).rolling(3).min().sort_index()
1109
+ 2 0 NaN
1110
+ 1 NaN
1111
+ 3 2 NaN
1112
+ 3 NaN
1113
+ 4 3.0
1114
+ 4 5 NaN
1115
+ 6 NaN
1116
+ 7 4.0
1117
+ 8 4.0
1118
+ 5 9 NaN
1119
+ 10 NaN
1120
+ dtype: float64
1121
+
1122
+ For DataFrame, each rolling minimum is computed column-wise.
1123
+
1124
+ >>> df = ps.DataFrame({"A": s.to_numpy(), "B": s.to_numpy() ** 2})
1125
+ >>> df.groupby(df.A).rolling(2).min().sort_index() # doctest: +NORMALIZE_WHITESPACE
1126
+ B
1127
+ A
1128
+ 2 0 NaN
1129
+ 1 4.0
1130
+ 3 2 NaN
1131
+ 3 9.0
1132
+ 4 9.0
1133
+ 4 5 NaN
1134
+ 6 16.0
1135
+ 7 16.0
1136
+ 8 16.0
1137
+ 5 9 NaN
1138
+ 10 25.0
1139
+ """
1140
+ return super().min()
1141
+
1142
+ def max(self) -> FrameLike:
1143
+ """
1144
+ The rolling maximum of any non-NaN observations inside the window.
1145
+
1146
+ Returns
1147
+ -------
1148
+ Series or DataFrame
1149
+ Returned object type is determined by the caller of the rolling
1150
+ calculation.
1151
+
1152
+ See Also
1153
+ --------
1154
+ pyspark.pandas.Series.rolling : Calling object with Series data.
1155
+ pyspark.pandas.DataFrame.rolling : Calling object with DataFrames.
1156
+ pyspark.pandas.Series.max : Max of the full Series.
1157
+ pyspark.pandas.DataFrame.max : Max of the full DataFrame.
1158
+
1159
+ Examples
1160
+ --------
1161
+ >>> s = ps.Series([2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5])
1162
+ >>> s.groupby(s).rolling(3).max().sort_index()
1163
+ 2 0 NaN
1164
+ 1 NaN
1165
+ 3 2 NaN
1166
+ 3 NaN
1167
+ 4 3.0
1168
+ 4 5 NaN
1169
+ 6 NaN
1170
+ 7 4.0
1171
+ 8 4.0
1172
+ 5 9 NaN
1173
+ 10 NaN
1174
+ dtype: float64
1175
+
1176
+ For DataFrame, each rolling maximum is computed column-wise.
1177
+
1178
+ >>> df = ps.DataFrame({"A": s.to_numpy(), "B": s.to_numpy() ** 2})
1179
+ >>> df.groupby(df.A).rolling(2).max().sort_index() # doctest: +NORMALIZE_WHITESPACE
1180
+ B
1181
+ A
1182
+ 2 0 NaN
1183
+ 1 4.0
1184
+ 3 2 NaN
1185
+ 3 9.0
1186
+ 4 9.0
1187
+ 4 5 NaN
1188
+ 6 16.0
1189
+ 7 16.0
1190
+ 8 16.0
1191
+ 5 9 NaN
1192
+ 10 25.0
1193
+ """
1194
+ return super().max()
1195
+
1196
+ def mean(self) -> FrameLike:
1197
+ """
1198
+ The rolling mean of any non-NaN observations inside the window.
1199
+
1200
+ Returns
1201
+ -------
1202
+ Series or DataFrame
1203
+ Returned object type is determined by the caller of the rolling
1204
+ calculation.
1205
+
1206
+ See Also
1207
+ --------
1208
+ pyspark.pandas.Series.rolling : Calling object with Series data.
1209
+ pyspark.pandas.DataFrame.rolling : Calling object with DataFrames.
1210
+ pyspark.pandas.Series.mean : Mean of the full Series.
1211
+ pyspark.pandas.DataFrame.mean : Mean of the full DataFrame.
1212
+
1213
+ Examples
1214
+ --------
1215
+ >>> s = ps.Series([2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5])
1216
+ >>> s.groupby(s).rolling(3).mean().sort_index()
1217
+ 2 0 NaN
1218
+ 1 NaN
1219
+ 3 2 NaN
1220
+ 3 NaN
1221
+ 4 3.0
1222
+ 4 5 NaN
1223
+ 6 NaN
1224
+ 7 4.0
1225
+ 8 4.0
1226
+ 5 9 NaN
1227
+ 10 NaN
1228
+ dtype: float64
1229
+
1230
+ For DataFrame, each rolling mean is computed column-wise.
1231
+
1232
+ >>> df = ps.DataFrame({"A": s.to_numpy(), "B": s.to_numpy() ** 2})
1233
+ >>> df.groupby(df.A).rolling(2).mean().sort_index() # doctest: +NORMALIZE_WHITESPACE
1234
+ B
1235
+ A
1236
+ 2 0 NaN
1237
+ 1 4.0
1238
+ 3 2 NaN
1239
+ 3 9.0
1240
+ 4 9.0
1241
+ 4 5 NaN
1242
+ 6 16.0
1243
+ 7 16.0
1244
+ 8 16.0
1245
+ 5 9 NaN
1246
+ 10 25.0
1247
+ """
1248
+ return super().mean()
1249
+
1250
+ def quantile(self, quantile: float, accuracy: int = 10000) -> FrameLike:
1251
+ """
1252
+ Calculate rolling quantile.
1253
+
1254
+ .. versionadded:: 3.4.0
1255
+
1256
+ Parameters
1257
+ ----------
1258
+ quantile : float
1259
+ Value between 0 and 1 providing the quantile to compute.
1260
+ accuracy : int, optional
1261
+ Default accuracy of approximation. Larger value means better accuracy.
1262
+ The relative error can be deduced by 1.0 / accuracy.
1263
+ This is a panda-on-Spark specific parameter.
1264
+
1265
+ Returns
1266
+ -------
1267
+ Series or DataFrame
1268
+ Returned object type is determined by the caller of the rolling
1269
+ calculation.
1270
+
1271
+ Notes
1272
+ -----
1273
+ `quantile` in pandas-on-Spark are using distributed percentile approximation
1274
+ algorithm unlike pandas, the result might be different with pandas, also `interpolation`
1275
+ parameter is not supported yet.
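+
+        For example, the default ``accuracy=10000`` targets a relative error of
+        roughly 1.0 / 10000 = 0.01% in the approximation.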
1276
+
+        See Also
+        --------
+        pyspark.pandas.Series.rolling : Calling rolling with Series data.
+        pyspark.pandas.DataFrame.rolling : Calling rolling with DataFrames.
+        pyspark.pandas.Series.quantile : Aggregating quantile for Series.
+        pyspark.pandas.DataFrame.quantile : Aggregating quantile for DataFrame.
+
+        Examples
+        --------
+        >>> s = ps.Series([2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5])
+        >>> s.groupby(s).rolling(3).quantile(0.5).sort_index()
+        2  0     NaN
+           1     NaN
+        3  2     NaN
+           3     NaN
+           4     3.0
+        4  5     NaN
+           6     NaN
+           7     4.0
+           8     4.0
+        5  9     NaN
+           10    NaN
+        dtype: float64
+
+        For DataFrame, each rolling quantile is computed column-wise.
+
+        >>> df = ps.DataFrame({"A": s.to_numpy(), "B": s.to_numpy() ** 2})
+        >>> df.groupby(df.A).rolling(2).quantile(0.5).sort_index()
+                 B
+        A
+        2 0    NaN
+          1    4.0
+        3 2    NaN
+          3    9.0
+          4    9.0
+        4 5    NaN
+          6   16.0
+          7   16.0
+          8   16.0
+        5 9    NaN
+          10  25.0
+        """
+        return super().quantile(quantile, accuracy)
1320
+
+    def std(self) -> FrameLike:
+        """
+        Calculate rolling standard deviation.
+
+        Returns
+        -------
+        Series or DataFrame
+            Returns the same object type as the caller of the rolling calculation.
+
+        See Also
+        --------
+        pyspark.pandas.Series.rolling : Calling object with Series data.
+        pyspark.pandas.DataFrame.rolling : Calling object with DataFrames.
+        pyspark.pandas.Series.std : Equivalent method for Series.
+        pyspark.pandas.DataFrame.std : Equivalent method for DataFrame.
+        numpy.std : Equivalent method for Numpy array.
+        """
+        return super().std()
+
+    def var(self) -> FrameLike:
+        """
+        Calculate unbiased rolling variance.
+
+        Returns
+        -------
+        Series or DataFrame
+            Returns the same object type as the caller of the rolling calculation.
+
+        See Also
+        --------
+        pyspark.pandas.Series.rolling : Calling object with Series data.
+        pyspark.pandas.DataFrame.rolling : Calling object with DataFrames.
+        pyspark.pandas.Series.var : Equivalent method for Series.
+        pyspark.pandas.DataFrame.var : Equivalent method for DataFrame.
+        numpy.var : Equivalent method for Numpy array.
+        """
+        return super().var()
+
+    def skew(self) -> FrameLike:
+        """
+        Calculate unbiased rolling skew.
+
+        Returns
+        -------
+        Series or DataFrame
+            Returns the same object type as the caller of the rolling calculation.
+
+        See Also
+        --------
+        pyspark.pandas.Series.rolling : Calling object with Series data.
+        pyspark.pandas.DataFrame.rolling : Calling object with DataFrames.
+        pyspark.pandas.Series.std : Equivalent method for Series.
+        pyspark.pandas.DataFrame.std : Equivalent method for DataFrame.
+        numpy.std : Equivalent method for Numpy array.
+        """
+        return super().skew()
+
+    def kurt(self) -> FrameLike:
+        """
+        Calculate unbiased rolling kurtosis.
+
+        Returns
+        -------
+        Series or DataFrame
+            Returns the same object type as the caller of the rolling calculation.
+
+        See Also
+        --------
+        pyspark.pandas.Series.rolling : Calling object with Series data.
+        pyspark.pandas.DataFrame.rolling : Calling object with DataFrames.
+        pyspark.pandas.Series.var : Equivalent method for Series.
+        pyspark.pandas.DataFrame.var : Equivalent method for DataFrame.
+        numpy.var : Equivalent method for Numpy array.
+        """
+        return super().kurt()
+
+
+class ExpandingLike(RollingAndExpanding[FrameLike]):
+    def __init__(self, min_periods: int = 1):
+        if min_periods < 0:
+            raise ValueError("min_periods must be >= 0")
+
+        window = Window.orderBy(NATURAL_ORDER_COLUMN_NAME).rowsBetween(
+            Window.unboundedPreceding, Window.currentRow
+        )
+
+        super().__init__(window, min_periods)
+
+    def count(self) -> FrameLike:
+        def count(scol: Column) -> Column:
+            return F.when(
+                F.row_number().over(self._unbounded_window) >= self._min_periods,
+                F.count(scol).over(self._window),
+            ).otherwise(F.lit(None))
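+        # Gating pattern: a row gets a value only once at least `min_periods`
+        # rows (tracked via row_number over the unbounded window) have been seen;
+        # earlier rows come out as nulls, mirroring pandas' min_periods behavior.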
1415
+
+        return self._apply_as_series_or_frame(count).astype("float64")  # type: ignore[attr-defined]
+
+
+class Expanding(ExpandingLike[FrameLike]):
+    def __init__(self, psdf_or_psser: FrameLike, min_periods: int = 1):
+        from pyspark.pandas.frame import DataFrame
+        from pyspark.pandas.series import Series
+
+        super().__init__(min_periods)
+
+        if not isinstance(psdf_or_psser, (DataFrame, Series)):
+            raise TypeError(
+                "psdf_or_psser must be a series or dataframe; however, got: %s"
+                % type(psdf_or_psser)
+            )
+        self._psdf_or_psser = psdf_or_psser
+
+    def __getattr__(self, item: str) -> Any:
+        if hasattr(MissingPandasLikeExpanding, item):
+            property_or_func = getattr(MissingPandasLikeExpanding, item)
+            if isinstance(property_or_func, property):
+                return property_or_func.fget(self)
+            else:
+                return partial(property_or_func, self)
+        raise AttributeError(item)
+
+    # TODO: when the 'axis' parameter is added, add it here too.
+    def __repr__(self) -> str:
+        return "Expanding [min_periods={}]".format(self._min_periods)
+
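+    # Expanding reuses Rolling's dispatch helper unchanged: the two classes
+    # differ only in the WindowSpec built in __init__, not in how a column
+    # function is applied to a Series or DataFrame.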
1446
+    _apply_as_series_or_frame = Rolling._apply_as_series_or_frame
+
+    def count(self) -> FrameLike:
+        """
+        The expanding count of any non-NaN observations inside the window.
+
+        .. note:: the current implementation of this API uses Spark's Window without
+            specifying a partition specification. This moves all the data into a
+            single partition on a single machine and could cause serious
+            performance degradation. Avoid calling this method on very large datasets.
+
+        Returns
+        -------
+        Series or DataFrame
+            Returned object type is determined by the caller of the expanding
+            calculation.
+
+        See Also
+        --------
+        pyspark.pandas.Series.expanding : Calling object with Series data.
+        pyspark.pandas.DataFrame.expanding : Calling object with DataFrames.
+        pyspark.pandas.Series.count : Count of the full Series.
+        pyspark.pandas.DataFrame.count : Count of the full DataFrame.
+
+        Examples
+        --------
+        >>> s = ps.Series([2, 3, float("nan"), 10])
+        >>> s.expanding().count()
+        0    1.0
+        1    2.0
+        2    2.0
+        3    3.0
+        dtype: float64
+
+        >>> s.to_frame().expanding().count()
+             0
+        0  1.0
+        1  2.0
+        2  2.0
+        3  3.0
+        """
+        return super().count()
+
+    def sum(self) -> FrameLike:
+        """
+        Calculate expanding summation of given DataFrame or Series.
+
+        .. note:: the current implementation of this API uses Spark's Window without
+            specifying a partition specification. This moves all the data into a
+            single partition on a single machine and could cause serious
+            performance degradation. Avoid calling this method on very large datasets.
+
+        Returns
+        -------
+        Series or DataFrame
+            Same type as the input, with the same index, containing the
+            expanding summation.
+
+        See Also
+        --------
+        pyspark.pandas.Series.expanding : Calling object with Series data.
+        pyspark.pandas.DataFrame.expanding : Calling object with DataFrames.
+        pyspark.pandas.Series.sum : Reducing sum for Series.
+        pyspark.pandas.DataFrame.sum : Reducing sum for DataFrame.
+
+        Examples
+        --------
+        >>> s = ps.Series([1, 2, 3, 4, 5])
+        >>> s
+        0    1
+        1    2
+        2    3
+        3    4
+        4    5
+        dtype: int64
+
+        >>> s.expanding(3).sum()
+        0     NaN
+        1     NaN
+        2     6.0
+        3    10.0
+        4    15.0
+        dtype: float64
+
+        For DataFrame, each expanding summation is computed column-wise.
+
+        >>> df = ps.DataFrame({"A": s.to_numpy(), "B": s.to_numpy() ** 2})
+        >>> df
+           A   B
+        0  1   1
+        1  2   4
+        2  3   9
+        3  4  16
+        4  5  25
+
+        >>> df.expanding(3).sum()
+              A     B
+        0   NaN   NaN
+        1   NaN   NaN
+        2   6.0  14.0
+        3  10.0  30.0
+        4  15.0  55.0
+        """
+        return super().sum()
+
+    def min(self) -> FrameLike:
+        """
+        Calculate the expanding minimum.
+
+        .. note:: the current implementation of this API uses Spark's Window without
+            specifying a partition specification. This moves all the data into a
+            single partition on a single machine and could cause serious
+            performance degradation. Avoid calling this method on very large datasets.
+
+        Returns
+        -------
+        Series or DataFrame
+            Returned object type is determined by the caller of the expanding
+            calculation.
+
+        See Also
+        --------
+        pyspark.pandas.Series.expanding : Calling object with a Series.
+        pyspark.pandas.DataFrame.expanding : Calling object with a DataFrame.
+        pyspark.pandas.Series.min : Similar method for Series.
+        pyspark.pandas.DataFrame.min : Similar method for DataFrame.
+
+        Examples
+        --------
+        Performing an expanding minimum with a window size of 3.
+
+        >>> s = ps.Series([4, 3, 5, 2, 6])
+        >>> s.expanding(3).min()
+        0    NaN
+        1    NaN
+        2    3.0
+        3    2.0
+        4    2.0
+        dtype: float64
+        """
+        return super().min()
+
+    def max(self) -> FrameLike:
+        """
+        Calculate the expanding maximum.
+
+        .. note:: the current implementation of this API uses Spark's Window without
+            specifying a partition specification. This moves all the data into a
+            single partition on a single machine and could cause serious
+            performance degradation. Avoid calling this method on very large datasets.
+
+        Returns
+        -------
+        Series or DataFrame
+            Return type is determined by the caller.
+
+        See Also
+        --------
+        pyspark.pandas.Series.expanding : Calling object with Series data.
+        pyspark.pandas.DataFrame.expanding : Calling object with DataFrames.
+        pyspark.pandas.Series.max : Similar method for Series.
+        pyspark.pandas.DataFrame.max : Similar method for DataFrame.
+
+        Examples
+        --------
+        Performing an expanding maximum with a window size of 3.
+
+        >>> s = ps.Series([4, 3, 5, 2, 6])
+        >>> s.expanding(3).max()
+        0    NaN
+        1    NaN
+        2    5.0
+        3    5.0
+        4    6.0
+        dtype: float64
+        """
+        return super().max()
+
+    def mean(self) -> FrameLike:
+        """
+        Calculate the expanding mean of the values.
+
+        .. note:: the current implementation of this API uses Spark's Window without
+            specifying a partition specification. This moves all the data into a
+            single partition on a single machine and could cause serious
+            performance degradation. Avoid calling this method on very large datasets.
+
+        Returns
+        -------
+        Series or DataFrame
+            Returned object type is determined by the caller of the expanding
+            calculation.
+
+        See Also
+        --------
+        pyspark.pandas.Series.expanding : Calling object with Series data.
+        pyspark.pandas.DataFrame.expanding : Calling object with DataFrames.
+        pyspark.pandas.Series.mean : Equivalent method for Series.
+        pyspark.pandas.DataFrame.mean : Equivalent method for DataFrame.
+
+        Examples
+        --------
+        The below examples will show expanding mean calculations with window sizes of
+        two and three, respectively.
+
+        >>> s = ps.Series([1, 2, 3, 4])
+        >>> s.expanding(2).mean()
+        0    NaN
+        1    1.5
+        2    2.0
+        3    2.5
+        dtype: float64
+
+        >>> s.expanding(3).mean()
+        0    NaN
+        1    NaN
+        2    2.0
+        3    2.5
+        dtype: float64
+        """
+        return super().mean()
+
+    def quantile(self, quantile: float, accuracy: int = 10000) -> FrameLike:
+        """
+        Calculate the expanding quantile of the values.
+
+        Returns
+        -------
+        Series or DataFrame
+            Returned object type is determined by the caller of the expanding
+            calculation.
+
+        Parameters
+        ----------
+        quantile : float
+            Value between 0 and 1 providing the quantile to compute.
+        accuracy : int, optional
+            Default accuracy of approximation. Larger value means better accuracy.
+            The relative error can be deduced as 1.0 / accuracy.
+            This is a pandas-on-Spark specific parameter.
+
+        Notes
+        -----
+        `quantile` in pandas-on-Spark uses a distributed percentile approximation
+        algorithm, unlike pandas, so the result might differ from pandas (the result
+        is similar to setting `interpolation` to `lower`); the `interpolation`
+        parameter is not supported yet.
+
+        The current implementation of this API uses Spark's Window without
+        specifying a partition specification. This moves all the data into a
+        single partition on a single machine and could cause serious
+        performance degradation. Avoid calling this method on very large datasets.
+
+        See Also
+        --------
+        pyspark.pandas.Series.expanding : Calling expanding with Series data.
+        pyspark.pandas.DataFrame.expanding : Calling expanding with DataFrames.
+        pyspark.pandas.Series.quantile : Aggregating quantile for Series.
+        pyspark.pandas.DataFrame.quantile : Aggregating quantile for DataFrame.
+
+        Examples
+        --------
+        The below examples will show expanding quantile calculations with window sizes of
+        two and three, respectively.
+
+        >>> s = ps.Series([1, 2, 3, 4])
+        >>> s.expanding(2).quantile(0.5)
+        0    NaN
+        1    1.0
+        2    2.0
+        3    2.0
+        dtype: float64
+
+        >>> s.expanding(3).quantile(0.5)
+        0    NaN
+        1    NaN
+        2    2.0
+        3    2.0
+        dtype: float64
+        """
+        return super().quantile(quantile, accuracy)
+
+    def std(self) -> FrameLike:
+        """
+        Calculate expanding standard deviation.
+
+        .. note:: the current implementation of this API uses Spark's Window without
+            specifying a partition specification. This moves all the data into a
+            single partition on a single machine and could cause serious
+            performance degradation. Avoid calling this method on very large datasets.
+
+        Returns
+        -------
+        Series or DataFrame
+            Returns the same object type as the caller of the expanding calculation.
+
+        See Also
+        --------
+        pyspark.pandas.Series.expanding : Calling object with Series data.
+        pyspark.pandas.DataFrame.expanding : Calling object with DataFrames.
+        pyspark.pandas.Series.std : Equivalent method for Series.
+        pyspark.pandas.DataFrame.std : Equivalent method for DataFrame.
+        numpy.std : Equivalent method for Numpy array.
+
+        Examples
+        --------
+        >>> s = ps.Series([5, 5, 6, 7, 5, 5, 5])
+        >>> s.expanding(3).std()
+        0         NaN
+        1         NaN
+        2    0.577350
+        3    0.957427
+        4    0.894427
+        5    0.836660
+        6    0.786796
+        dtype: float64
+
+        For DataFrame, each expanding standard deviation is computed column-wise.
+
+        >>> df = ps.DataFrame({"A": s.to_numpy(), "B": s.to_numpy() ** 2})
+        >>> df.expanding(2).std()
+                  A          B
+        0       NaN        NaN
+        1  0.000000   0.000000
+        2  0.577350   6.350853
+        3  0.957427  11.412712
+        4  0.894427  10.630146
+        5  0.836660   9.928075
+        6  0.786796   9.327379
+        """
+        return super().std()
+
+    def var(self) -> FrameLike:
+        """
+        Calculate unbiased expanding variance.
+
+        .. note:: the current implementation of this API uses Spark's Window without
+            specifying a partition specification. This moves all the data into a
+            single partition on a single machine and could cause serious
+            performance degradation. Avoid calling this method on very large datasets.
+
+        Returns
+        -------
+        Series or DataFrame
+            Returns the same object type as the caller of the expanding calculation.
+
+        See Also
+        --------
+        pyspark.pandas.Series.expanding : Calling object with Series data.
+        pyspark.pandas.DataFrame.expanding : Calling object with DataFrames.
+        pyspark.pandas.Series.var : Equivalent method for Series.
+        pyspark.pandas.DataFrame.var : Equivalent method for DataFrame.
+        numpy.var : Equivalent method for Numpy array.
+
+        Examples
+        --------
+        >>> s = ps.Series([5, 5, 6, 7, 5, 5, 5])
+        >>> s.expanding(3).var()
+        0         NaN
+        1         NaN
+        2    0.333333
+        3    0.916667
+        4    0.800000
+        5    0.700000
+        6    0.619048
+        dtype: float64
+
+        For DataFrame, each unbiased expanding variance is computed column-wise.
+
+        >>> df = ps.DataFrame({"A": s.to_numpy(), "B": s.to_numpy() ** 2})
+        >>> df.expanding(2).var()
+                  A           B
+        0       NaN         NaN
+        1  0.000000    0.000000
+        2  0.333333   40.333333
+        3  0.916667  130.250000
+        4  0.800000  113.000000
+        5  0.700000   98.566667
+        6  0.619048   87.000000
+        """
+        return super().var()
+
+    def skew(self) -> FrameLike:
+        """
+        Calculate unbiased expanding skew.
+
+        .. note:: the current implementation of this API uses Spark's Window without
+            specifying a partition specification. This moves all the data into a
+            single partition on a single machine and could cause serious
+            performance degradation. Avoid calling this method on very large datasets.
+
+        Returns
+        -------
+        Series or DataFrame
+            Returns the same object type as the caller of the expanding calculation.
+
+        See Also
+        --------
+        pyspark.pandas.Series.expanding : Calling object with Series data.
+        pyspark.pandas.DataFrame.expanding : Calling object with DataFrames.
+        pyspark.pandas.Series.std : Equivalent method for Series.
+        pyspark.pandas.DataFrame.std : Equivalent method for DataFrame.
+        numpy.std : Equivalent method for Numpy array.
+
+        Examples
+        --------
+        >>> s = ps.Series([5, 5, 6, 7, 5, 1, 5, 9])
+        >>> s.expanding(3).skew()
+        0         NaN
+        1         NaN
+        2    1.732051
+        3    0.854563
+        4    1.257788
+        5   -1.571593
+        6   -1.657542
+        7   -0.521760
+        dtype: float64
+
+        For DataFrame, each unbiased expanding skew is computed column-wise.
+
+        >>> df = ps.DataFrame({"A": s.to_numpy(), "B": s.to_numpy() ** 2})
+        >>> df.expanding(5).skew()
+                  A         B
+        0       NaN       NaN
+        1       NaN       NaN
+        2       NaN       NaN
+        3       NaN       NaN
+        4  1.257788  1.369456
+        5 -1.571593 -0.423309
+        6 -1.657542 -0.355737
+        7 -0.521760  1.116874
+        """
+        return super().skew()
+
+    def kurt(self) -> FrameLike:
+        """
+        Calculate unbiased expanding kurtosis.
+
+        .. note:: the current implementation of this API uses Spark's Window without
+            specifying a partition specification. This moves all the data into a
+            single partition on a single machine and could cause serious
+            performance degradation. Avoid calling this method on very large datasets.
+
+        Returns
+        -------
+        Series or DataFrame
+            Returns the same object type as the caller of the expanding calculation.
+
+        See Also
+        --------
+        pyspark.pandas.Series.expanding : Calling object with Series data.
+        pyspark.pandas.DataFrame.expanding : Calling object with DataFrames.
+        pyspark.pandas.Series.var : Equivalent method for Series.
+        pyspark.pandas.DataFrame.var : Equivalent method for DataFrame.
+        numpy.var : Equivalent method for Numpy array.
+
+        Examples
+        --------
+        >>> s = ps.Series([5, 5, 6, 7, 5, 1, 5, 9])
+        >>> s.expanding(4).kurt()
+        0         NaN
+        1         NaN
+        2         NaN
+        3   -1.289256
+        4    0.312500
+        5    3.419520
+        6    4.028185
+        7    2.230373
+        dtype: float64
+
+        For DataFrame, each unbiased expanding kurtosis is computed column-wise.
+
+        >>> df = ps.DataFrame({"A": s.to_numpy(), "B": s.to_numpy() ** 2})
+        >>> df.expanding(5).kurt()
+                  A         B
+        0       NaN       NaN
+        1       NaN       NaN
+        2       NaN       NaN
+        3       NaN       NaN
+        4  0.312500  0.906336
+        5  3.419520  1.486581
+        6  4.028185  1.936169
+        7  2.230373  2.273792
+        """
+        return super().kurt()
+
+
+class ExpandingGroupby(ExpandingLike[FrameLike]):
+    def __init__(self, groupby: GroupBy[FrameLike], min_periods: int = 1):
+        super().__init__(min_periods)
+
+        self._groupby = groupby
+        self._window = self._window.partitionBy(*[ser.spark.column for ser in groupby._groupkeys])
+        self._unbounded_window = self._window.partitionBy(
+            *[ser.spark.column for ser in groupby._groupkeys]
+        )
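+        # Note: the expanding window is already unboundedPreceding..currentRow,
+        # so `_window` doubles as the unbounded gate; deriving `_unbounded_window`
+        # from it (rather than from a fresh spec) appears intentional here.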
1942
+
+    def __getattr__(self, item: str) -> Any:
+        if hasattr(MissingPandasLikeExpandingGroupby, item):
+            property_or_func = getattr(MissingPandasLikeExpandingGroupby, item)
+            if isinstance(property_or_func, property):
+                return property_or_func.fget(self)
+            else:
+                return partial(property_or_func, self)
+        raise AttributeError(item)
+
+    _apply_as_series_or_frame = RollingGroupby._apply_as_series_or_frame
+
+    def count(self) -> FrameLike:
+        """
+        The expanding count of any non-NaN observations inside the window.
+
+        Returns
+        -------
+        Series or DataFrame
+            Returned object type is determined by the caller of the expanding
+            calculation.
+
+        See Also
+        --------
+        pyspark.pandas.Series.expanding : Calling object with Series data.
+        pyspark.pandas.DataFrame.expanding : Calling object with DataFrames.
+        pyspark.pandas.Series.count : Count of the full Series.
+        pyspark.pandas.DataFrame.count : Count of the full DataFrame.
+
+        Examples
+        --------
+        >>> s = ps.Series([2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5])
+        >>> s.groupby(s).expanding(3).count().sort_index()
+        2  0     NaN
+           1     NaN
+        3  2     NaN
+           3     NaN
+           4     3.0
+        4  5     NaN
+           6     NaN
+           7     3.0
+           8     4.0
+        5  9     NaN
+           10    NaN
+        dtype: float64
+
+        For DataFrame, each expanding count is computed column-wise.
+
+        >>> df = ps.DataFrame({"A": s.to_numpy(), "B": s.to_numpy() ** 2})
+        >>> df.groupby(df.A).expanding(2).count().sort_index()  # doctest: +NORMALIZE_WHITESPACE
+                B
+        A
+        2 0   NaN
+          1   2.0
+        3 2   NaN
+          3   2.0
+          4   3.0
+        4 5   NaN
+          6   2.0
+          7   3.0
+          8   4.0
+        5 9   NaN
+          10  2.0
+        """
+        return super().count()
+
+    def sum(self) -> FrameLike:
+        """
+        Calculate expanding summation of given DataFrame or Series.
+
+        Returns
+        -------
+        Series or DataFrame
+            Same type as the input, with the same index, containing the
+            expanding summation.
+
+        See Also
+        --------
+        pyspark.pandas.Series.expanding : Calling object with Series data.
+        pyspark.pandas.DataFrame.expanding : Calling object with DataFrames.
+        pyspark.pandas.Series.sum : Reducing sum for Series.
+        pyspark.pandas.DataFrame.sum : Reducing sum for DataFrame.
+
+        Examples
+        --------
+        >>> s = ps.Series([2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5])
+        >>> s.groupby(s).expanding(3).sum().sort_index()
+        2  0      NaN
+           1      NaN
+        3  2      NaN
+           3      NaN
+           4      9.0
+        4  5      NaN
+           6      NaN
+           7     12.0
+           8     16.0
+        5  9      NaN
+           10     NaN
+        dtype: float64
+
+        For DataFrame, each expanding summation is computed column-wise.
+
+        >>> df = ps.DataFrame({"A": s.to_numpy(), "B": s.to_numpy() ** 2})
+        >>> df.groupby(df.A).expanding(2).sum().sort_index()  # doctest: +NORMALIZE_WHITESPACE
+                 B
+        A
+        2 0    NaN
+          1    8.0
+        3 2    NaN
+          3   18.0
+          4   27.0
+        4 5    NaN
+          6   32.0
+          7   48.0
+          8   64.0
+        5 9    NaN
+          10  50.0
+        """
+        return super().sum()
+
+    def min(self) -> FrameLike:
+        """
+        Calculate the expanding minimum.
+
+        Returns
+        -------
+        Series or DataFrame
+            Returned object type is determined by the caller of the expanding
+            calculation.
+
+        See Also
+        --------
+        pyspark.pandas.Series.expanding : Calling object with a Series.
+        pyspark.pandas.DataFrame.expanding : Calling object with a DataFrame.
+        pyspark.pandas.Series.min : Similar method for Series.
+        pyspark.pandas.DataFrame.min : Similar method for DataFrame.
+
+        Examples
+        --------
+        >>> s = ps.Series([2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5])
+        >>> s.groupby(s).expanding(3).min().sort_index()
+        2  0     NaN
+           1     NaN
+        3  2     NaN
+           3     NaN
+           4     3.0
+        4  5     NaN
+           6     NaN
+           7     4.0
+           8     4.0
+        5  9     NaN
+           10    NaN
+        dtype: float64
+
+        For DataFrame, each expanding minimum is computed column-wise.
+
+        >>> df = ps.DataFrame({"A": s.to_numpy(), "B": s.to_numpy() ** 2})
+        >>> df.groupby(df.A).expanding(2).min().sort_index()  # doctest: +NORMALIZE_WHITESPACE
+                 B
+        A
+        2 0    NaN
+          1    4.0
+        3 2    NaN
+          3    9.0
+          4    9.0
+        4 5    NaN
+          6   16.0
+          7   16.0
+          8   16.0
+        5 9    NaN
+          10  25.0
+        """
+        return super().min()
+
+    def max(self) -> FrameLike:
+        """
+        Calculate the expanding maximum.
+
+        Returns
+        -------
+        Series or DataFrame
+            Return type is determined by the caller.
+
+        See Also
+        --------
+        pyspark.pandas.Series.expanding : Calling object with Series data.
+        pyspark.pandas.DataFrame.expanding : Calling object with DataFrames.
+        pyspark.pandas.Series.max : Similar method for Series.
+        pyspark.pandas.DataFrame.max : Similar method for DataFrame.
+
+        Examples
+        --------
+        >>> s = ps.Series([2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5])
+        >>> s.groupby(s).expanding(3).max().sort_index()
+        2  0     NaN
+           1     NaN
+        3  2     NaN
+           3     NaN
+           4     3.0
+        4  5     NaN
+           6     NaN
+           7     4.0
+           8     4.0
+        5  9     NaN
+           10    NaN
+        dtype: float64
+
+        For DataFrame, each expanding maximum is computed column-wise.
+
+        >>> df = ps.DataFrame({"A": s.to_numpy(), "B": s.to_numpy() ** 2})
+        >>> df.groupby(df.A).expanding(2).max().sort_index()  # doctest: +NORMALIZE_WHITESPACE
+                 B
+        A
+        2 0    NaN
+          1    4.0
+        3 2    NaN
+          3    9.0
+          4    9.0
+        4 5    NaN
+          6   16.0
+          7   16.0
+          8   16.0
+        5 9    NaN
+          10  25.0
+        """
+        return super().max()
+
+    def mean(self) -> FrameLike:
+        """
+        Calculate the expanding mean of the values.
+
+        Returns
+        -------
+        Series or DataFrame
+            Returned object type is determined by the caller of the expanding
+            calculation.
+
+        See Also
+        --------
+        pyspark.pandas.Series.expanding : Calling object with Series data.
+        pyspark.pandas.DataFrame.expanding : Calling object with DataFrames.
+        pyspark.pandas.Series.mean : Equivalent method for Series.
+        pyspark.pandas.DataFrame.mean : Equivalent method for DataFrame.
+
+        Examples
+        --------
+        >>> s = ps.Series([2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5])
+        >>> s.groupby(s).expanding(3).mean().sort_index()
+        2  0     NaN
+           1     NaN
+        3  2     NaN
+           3     NaN
+           4     3.0
+        4  5     NaN
+           6     NaN
+           7     4.0
+           8     4.0
+        5  9     NaN
+           10    NaN
+        dtype: float64
+
+        For DataFrame, each expanding mean is computed column-wise.
+
+        >>> df = ps.DataFrame({"A": s.to_numpy(), "B": s.to_numpy() ** 2})
+        >>> df.groupby(df.A).expanding(2).mean().sort_index()  # doctest: +NORMALIZE_WHITESPACE
+                 B
+        A
+        2 0    NaN
+          1    4.0
+        3 2    NaN
+          3    9.0
+          4    9.0
+        4 5    NaN
+          6   16.0
+          7   16.0
+          8   16.0
+        5 9    NaN
+          10  25.0
+        """
+        return super().mean()
+
+    def quantile(self, quantile: float, accuracy: int = 10000) -> FrameLike:
+        """
+        Calculate the expanding quantile of the values.
+
+        .. versionadded:: 3.4.0
+
+        Parameters
+        ----------
+        quantile : float
+            Value between 0 and 1 providing the quantile to compute.
+        accuracy : int, optional
+            Default accuracy of approximation. Larger value means better accuracy.
+            The relative error can be deduced as 1.0 / accuracy.
+            This is a pandas-on-Spark specific parameter.
+
+        Returns
+        -------
+        Series or DataFrame
+            Returned object type is determined by the caller of the expanding
+            calculation.
+
+        Notes
+        -----
+        `quantile` in pandas-on-Spark uses a distributed percentile approximation
+        algorithm, unlike pandas, so the result might differ from pandas; the
+        `interpolation` parameter is not supported yet.
+
+        See Also
+        --------
+        pyspark.pandas.Series.expanding : Calling expanding with Series data.
+        pyspark.pandas.DataFrame.expanding : Calling expanding with DataFrames.
+        pyspark.pandas.Series.quantile : Aggregating quantile for Series.
+        pyspark.pandas.DataFrame.quantile : Aggregating quantile for DataFrame.
+
+        Examples
+        --------
+        >>> s = ps.Series([2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5])
+        >>> s.groupby(s).expanding(3).quantile(0.5).sort_index()
+        2  0     NaN
+           1     NaN
+        3  2     NaN
+           3     NaN
+           4     3.0
+        4  5     NaN
+           6     NaN
+           7     4.0
+           8     4.0
+        5  9     NaN
+           10    NaN
+        dtype: float64
+
+        For DataFrame, each expanding quantile is computed column-wise.
+
+        >>> df = ps.DataFrame({"A": s.to_numpy(), "B": s.to_numpy() ** 2})
+        >>> df.groupby(df.A).expanding(2).quantile(0.5).sort_index()
+                 B
+        A
+        2 0    NaN
+          1    4.0
+        3 2    NaN
+          3    9.0
+          4    9.0
+        4 5    NaN
+          6   16.0
+          7   16.0
+          8   16.0
+        5 9    NaN
+          10  25.0
+        """
+        return super().quantile(quantile, accuracy)
+
+    def std(self) -> FrameLike:
+        """
+        Calculate expanding standard deviation.
+
+        Returns
+        -------
+        Series or DataFrame
+            Returns the same object type as the caller of the expanding calculation.
+
+        See Also
+        --------
+        pyspark.pandas.Series.expanding : Calling object with Series data.
+        pyspark.pandas.DataFrame.expanding : Calling object with DataFrames.
+        pyspark.pandas.Series.std : Equivalent method for Series.
+        pyspark.pandas.DataFrame.std : Equivalent method for DataFrame.
+        numpy.std : Equivalent method for Numpy array.
+        """
+        return super().std()
+
+    def var(self) -> FrameLike:
+        """
+        Calculate unbiased expanding variance.
+
+        Returns
+        -------
+        Series or DataFrame
+            Returns the same object type as the caller of the expanding calculation.
+
+        See Also
+        --------
+        pyspark.pandas.Series.expanding : Calling object with Series data.
+        pyspark.pandas.DataFrame.expanding : Calling object with DataFrames.
+        pyspark.pandas.Series.var : Equivalent method for Series.
+        pyspark.pandas.DataFrame.var : Equivalent method for DataFrame.
+        numpy.var : Equivalent method for Numpy array.
+        """
+        return super().var()
+
+    def skew(self) -> FrameLike:
+        """
+        Calculate unbiased expanding skew.
+
+        Returns
+        -------
+        Series or DataFrame
+            Returns the same object type as the caller of the expanding calculation.
+
+        See Also
+        --------
+        pyspark.pandas.Series.expanding : Calling object with Series data.
+        pyspark.pandas.DataFrame.expanding : Calling object with DataFrames.
+        pyspark.pandas.Series.std : Equivalent method for Series.
+        pyspark.pandas.DataFrame.std : Equivalent method for DataFrame.
+        numpy.std : Equivalent method for Numpy array.
+        """
+        return super().skew()
+
+    def kurt(self) -> FrameLike:
+        """
+        Calculate unbiased expanding kurtosis.
+
+        Returns
+        -------
+        Series or DataFrame
+            Returns the same object type as the caller of the expanding calculation.
+
+        See Also
+        --------
+        pyspark.pandas.Series.expanding : Calling object with Series data.
+        pyspark.pandas.DataFrame.expanding : Calling object with DataFrames.
+        pyspark.pandas.Series.var : Equivalent method for Series.
+        pyspark.pandas.DataFrame.var : Equivalent method for DataFrame.
+        numpy.var : Equivalent method for Numpy array.
+        """
+        return super().kurt()
+
+
+class ExponentialMovingLike(Generic[FrameLike], metaclass=ABCMeta):
+    def __init__(
+        self,
+        window: WindowSpec,
+        com: Optional[float] = None,
+        span: Optional[float] = None,
+        halflife: Optional[float] = None,
+        alpha: Optional[float] = None,
+        min_periods: Optional[int] = None,
+        ignore_na: bool = False,
+    ):
+        if (min_periods is not None) and (min_periods < 0):
+            raise ValueError("min_periods must be >= 0")
+        if min_periods is None:
+            min_periods = 0
+        self._min_periods = min_periods
+        self._ignore_na = ignore_na
+
+        self._window = window
+        # This unbounded Window is later used to handle 'min_periods' for now.
+        self._unbounded_window = Window.orderBy(NATURAL_ORDER_COLUMN_NAME).rowsBetween(
+            Window.unboundedPreceding, Window.currentRow
+        )
+
+        if (com is not None) and (not com >= 0):
+            raise ValueError("com must be >= 0")
+        self._com = com
+
+        if (span is not None) and (not span >= 1):
+            raise ValueError("span must be >= 1")
+        self._span = span
+
+        if (halflife is not None) and (not halflife > 0):
+            raise ValueError("halflife must be > 0")
+        self._halflife = halflife
+
+        if (alpha is not None) and (not 0 < alpha <= 1):
+            raise ValueError("alpha must be in (0, 1]")
+        self._alpha = alpha
+
+    def _compute_unified_alpha(self) -> float:
+        unified_alpha = np.nan
+        opt_count = 0
+
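+        # Each accepted option maps onto a single smoothing factor (the same
+        # parameterization pandas documents for `ewm`):
+        #   com      -> alpha = 1 / (1 + com)
+        #   span     -> alpha = 2 / (1 + span)
+        #   halflife -> alpha = 1 - exp(-ln(2) / halflife)
+        #   alpha    -> used as-is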
2417
+        if self._com is not None:
+            unified_alpha = 1.0 / (1 + self._com)
+            opt_count += 1
+        if self._span is not None:
+            unified_alpha = 2.0 / (1 + self._span)
+            opt_count += 1
+        if self._halflife is not None:
+            unified_alpha = 1.0 - np.exp(-np.log(2) / self._halflife)
+            opt_count += 1
+        if self._alpha is not None:
+            unified_alpha = self._alpha
+            opt_count += 1
+
+        if opt_count == 0:
+            raise ValueError("Must pass one of com, span, halflife, or alpha")
+        if opt_count != 1:
+            raise ValueError("com, span, halflife, and alpha are mutually exclusive")
+
+        return unified_alpha
+
+    @abstractmethod
+    def _apply_as_series_or_frame(self, func: Callable[[Column], Column]) -> FrameLike:
+        """
+        Wraps a function that handles a Spark column so that it can be applied
+        to both a pandas-on-Spark Series and DataFrame.
+        Note that the given `func` name should be the same as the API's method name.
+        """
+        pass
+
+    def mean(self) -> FrameLike:
+        unified_alpha = self._compute_unified_alpha()
+
+        def mean(scol: Column) -> Column:
+            col_ewm = SF.ewm(scol, unified_alpha, self._ignore_na)
+            return F.when(
+                F.count(F.when(~scol.isNull(), 1).otherwise(None)).over(self._unbounded_window)
+                >= self._min_periods,
+                col_ewm.over(self._window),
+            ).otherwise(F.lit(None))
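+        # Unlike the row_number gate used for the expanding count above,
+        # min_periods here counts only the non-null observations seen so far
+        # before letting the EWM value through.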
2456
+
+        return self._apply_as_series_or_frame(mean)
+
+
+class ExponentialMoving(ExponentialMovingLike[FrameLike]):
+    def __init__(
+        self,
+        psdf_or_psser: FrameLike,
+        com: Optional[float] = None,
+        span: Optional[float] = None,
+        halflife: Optional[float] = None,
+        alpha: Optional[float] = None,
+        min_periods: Optional[int] = None,
+        ignore_na: bool = False,
+    ):
+        from pyspark.pandas.frame import DataFrame
+        from pyspark.pandas.series import Series
+
+        if not isinstance(psdf_or_psser, (DataFrame, Series)):
+            raise TypeError(
+                "psdf_or_psser must be a series or dataframe; however, got: %s"
+                % type(psdf_or_psser)
+            )
+        self._psdf_or_psser = psdf_or_psser
+
+        window_spec = Window.orderBy(NATURAL_ORDER_COLUMN_NAME).rowsBetween(
+            Window.unboundedPreceding, Window.currentRow
+        )
+
+        super().__init__(window_spec, com, span, halflife, alpha, min_periods, ignore_na)
+
+    def __getattr__(self, item: str) -> Any:
+        if hasattr(MissingPandasLikeExponentialMoving, item):
+            property_or_func = getattr(MissingPandasLikeExponentialMoving, item)
+            if isinstance(property_or_func, property):
+                return property_or_func.fget(self)
+            else:
+                return partial(property_or_func, self)
+        raise AttributeError(item)
+
+    _apply_as_series_or_frame = Rolling._apply_as_series_or_frame
+
+    def mean(self) -> FrameLike:
+        """
+        Calculate an online exponentially weighted mean.
+
+        Notes
+        -----
+        There are behavior differences between pandas-on-Spark and pandas.
+
+        * the current implementation of this API uses Spark's Window without
+          specifying a partition specification. This moves all the data into a
+          single partition on a single machine and could cause serious
+          performance degradation. Avoid calling this method on very large datasets.
+
+        Returns
+        -------
+        Series or DataFrame
+            Returned object type is determined by the caller of the exponentially
+            weighted calculation.
+
+        See Also
+        --------
+        pyspark.pandas.Series.ewm : Calling object with Series data.
+        pyspark.pandas.DataFrame.ewm : Calling object with DataFrames.
+        pyspark.pandas.Series.mean : Equivalent method for Series.
+        pyspark.pandas.DataFrame.mean : Equivalent method for DataFrame.
+
+        Examples
+        --------
+        The below examples will show computing an exponentially weighted moving
+        average.
+
+        >>> df = ps.DataFrame({'s1': [.2, .0, .6, .2, .4, .5, .6], 's2': [2, 1, 3, 1, 0, 0, 0]})
+        >>> df.ewm(com=0.1).mean()
+                 s1        s2
+        0  0.200000  2.000000
+        1  0.016667  1.083333
+        2  0.547368  2.827068
+        3  0.231557  1.165984
+        4  0.384688  0.105992
+        5  0.489517  0.009636
+        6  0.589956  0.000876
+
+        >>> df.s2.ewm(halflife=1.5, min_periods=3).mean()
+        0         NaN
+        1         NaN
+        2    2.182572
+        3    1.663174
+        4    0.979949
+        5    0.593155
+        6    0.364668
+        Name: s2, dtype: float64
+        """
+        return super().mean()
+
+    # TODO: when the 'adjust' parameter is added, add it here too.
+    def __repr__(self) -> str:
+        return (
+            "ExponentialMoving [com={}, span={}, halflife={}, alpha={}, "
+            "min_periods={}, ignore_na={}]".format(
+                self._com,
+                self._span,
+                self._halflife,
+                self._alpha,
+                self._min_periods,
+                self._ignore_na,
+            )
+        )
+
+
+class ExponentialMovingGroupby(ExponentialMovingLike[FrameLike]):
+    def __init__(
+        self,
+        groupby: GroupBy[FrameLike],
+        com: Optional[float] = None,
+        span: Optional[float] = None,
+        halflife: Optional[float] = None,
+        alpha: Optional[float] = None,
+        min_periods: Optional[int] = None,
+        ignore_na: bool = False,
+    ):
+        window_spec = Window.orderBy(NATURAL_ORDER_COLUMN_NAME).rowsBetween(
+            Window.unboundedPreceding, Window.currentRow
+        )
+        super().__init__(window_spec, com, span, halflife, alpha, min_periods, ignore_na)
+
+        self._groupby = groupby
+        self._window = self._window.partitionBy(*[ser.spark.column for ser in groupby._groupkeys])
+        self._unbounded_window = self._unbounded_window.partitionBy(
+            *[ser.spark.column for ser in groupby._groupkeys]
+        )
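+        # Both the EWM window and the min_periods gating window are partitioned
+        # by the group keys, so each group's mean and gate are computed
+        # independently.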
2587
+
+    def __getattr__(self, item: str) -> Any:
+        if hasattr(MissingPandasLikeExponentialMovingGroupby, item):
+            property_or_func = getattr(MissingPandasLikeExponentialMovingGroupby, item)
+            if isinstance(property_or_func, property):
+                return property_or_func.fget(self)
+            else:
+                return partial(property_or_func, self)
+        raise AttributeError(item)
+
+    _apply_as_series_or_frame = RollingGroupby._apply_as_series_or_frame
+
+    def mean(self) -> FrameLike:
+        """
+        Calculate an online exponentially weighted mean.
+
+        Notes
+        -----
+        There are behavior differences between pandas-on-Spark and pandas.
+
+        * the current implementation of this API uses Spark's Window without
+          specifying a partition specification. This moves all the data into a
+          single partition on a single machine and could cause serious
+          performance degradation. Avoid calling this method on very large datasets.
+
+        Returns
+        -------
+        Series or DataFrame
+            Returned object type is determined by the caller of the exponentially
+            weighted calculation.
+
+        See Also
+        --------
+        pyspark.pandas.Series.ewm : Calling object with Series data.
+        pyspark.pandas.DataFrame.ewm : Calling object with DataFrames.
+        pyspark.pandas.Series.mean : Equivalent method for Series.
+        pyspark.pandas.DataFrame.mean : Equivalent method for DataFrame.
+
+        Examples
+        --------
+        >>> s = ps.Series([2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5])
+        >>> s.groupby(s).ewm(alpha=0.5).mean().sort_index()
+        2  0     2.0
+           1     2.0
+        3  2     3.0
+           3     3.0
+           4     3.0
+        4  5     4.0
+           6     4.0
+           7     4.0
+           8     4.0
+        5  9     5.0
+           10    5.0
+        dtype: float64
+
+        For DataFrame, each ewm mean is computed column-wise.
+
+        >>> df = ps.DataFrame({"A": s.to_numpy(), "B": s.to_numpy() ** 2})
+        >>> df.groupby(df.A).ewm(alpha=0.5).mean().sort_index()  # doctest: +NORMALIZE_WHITESPACE
+                 B
+        A
+        2 0    4.0
+          1    4.0
+        3 2    9.0
+          3    9.0
+          4    9.0
+        4 5   16.0
+          6   16.0
+          7   16.0
+          8   16.0
+        5 9   25.0
+          10  25.0
+        """
+        return super().mean()
+
+    # TODO: when the 'adjust' parameter is added, add it here too.
+    def __repr__(self) -> str:
+        return (
+            "ExponentialMovingGroupby [com={}, span={}, halflife={}, alpha={}, "
+            "min_periods={}, ignore_na={}]".format(
+                self._com,
+                self._span,
+                self._halflife,
+                self._alpha,
+                self._min_periods,
+                self._ignore_na,
+            )
+        )
+
+
+def _test() -> None:
+    import os
+    import doctest
+    import sys
+    from pyspark.sql import SparkSession
+    import pyspark.pandas.window
+
+    os.chdir(os.environ["SPARK_HOME"])
+
+    globs = pyspark.pandas.window.__dict__.copy()
+    globs["ps"] = pyspark.pandas
+    spark = (
+        SparkSession.builder.master("local[4]").appName("pyspark.pandas.window tests").getOrCreate()
+    )
+    (failure_count, test_count) = doctest.testmod(
+        pyspark.pandas.window,
+        globs=globs,
+        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
+    )
+    spark.stop()
+    if failure_count:
+        sys.exit(-1)
+
+
+if __name__ == "__main__":
+    _test()
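+
+# Running this module directly executes the doctests above. Note that _test()
+# chdirs into SPARK_HOME first, so that environment variable must point at a
+# local Spark installation, e.g. (path is illustrative):
+#
+#     SPARK_HOME=/opt/spark python -m pyspark.pandas.window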