snowpark_connect-0.20.2-py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of snowpark-connect might be problematic.

Files changed (879)
  1. snowflake/snowpark_connect/__init__.py +23 -0
  2. snowflake/snowpark_connect/analyze_plan/__init__.py +3 -0
  3. snowflake/snowpark_connect/analyze_plan/map_tree_string.py +38 -0
  4. snowflake/snowpark_connect/column_name_handler.py +735 -0
  5. snowflake/snowpark_connect/config.py +576 -0
  6. snowflake/snowpark_connect/constants.py +47 -0
  7. snowflake/snowpark_connect/control_server.py +52 -0
  8. snowflake/snowpark_connect/dataframe_name_handler.py +54 -0
  9. snowflake/snowpark_connect/date_time_format_mapping.py +399 -0
  10. snowflake/snowpark_connect/empty_dataframe.py +18 -0
  11. snowflake/snowpark_connect/error/__init__.py +11 -0
  12. snowflake/snowpark_connect/error/error_mapping.py +6174 -0
  13. snowflake/snowpark_connect/error/error_utils.py +321 -0
  14. snowflake/snowpark_connect/error/exceptions.py +24 -0
  15. snowflake/snowpark_connect/execute_plan/__init__.py +3 -0
  16. snowflake/snowpark_connect/execute_plan/map_execution_command.py +204 -0
  17. snowflake/snowpark_connect/execute_plan/map_execution_root.py +173 -0
  18. snowflake/snowpark_connect/execute_plan/utils.py +183 -0
  19. snowflake/snowpark_connect/expression/__init__.py +3 -0
  20. snowflake/snowpark_connect/expression/literal.py +90 -0
  21. snowflake/snowpark_connect/expression/map_cast.py +343 -0
  22. snowflake/snowpark_connect/expression/map_expression.py +293 -0
  23. snowflake/snowpark_connect/expression/map_extension.py +104 -0
  24. snowflake/snowpark_connect/expression/map_sql_expression.py +633 -0
  25. snowflake/snowpark_connect/expression/map_udf.py +142 -0
  26. snowflake/snowpark_connect/expression/map_unresolved_attribute.py +241 -0
  27. snowflake/snowpark_connect/expression/map_unresolved_extract_value.py +85 -0
  28. snowflake/snowpark_connect/expression/map_unresolved_function.py +9450 -0
  29. snowflake/snowpark_connect/expression/map_unresolved_star.py +218 -0
  30. snowflake/snowpark_connect/expression/map_update_fields.py +164 -0
  31. snowflake/snowpark_connect/expression/map_window_function.py +258 -0
  32. snowflake/snowpark_connect/expression/typer.py +125 -0
  33. snowflake/snowpark_connect/includes/__init__.py +0 -0
  34. snowflake/snowpark_connect/includes/jars/antlr4-runtime-4.9.3.jar +0 -0
  35. snowflake/snowpark_connect/includes/jars/commons-cli-1.5.0.jar +0 -0
  36. snowflake/snowpark_connect/includes/jars/commons-codec-1.16.1.jar +0 -0
  37. snowflake/snowpark_connect/includes/jars/commons-collections-3.2.2.jar +0 -0
  38. snowflake/snowpark_connect/includes/jars/commons-collections4-4.4.jar +0 -0
  39. snowflake/snowpark_connect/includes/jars/commons-compiler-3.1.9.jar +0 -0
  40. snowflake/snowpark_connect/includes/jars/commons-compress-1.26.0.jar +0 -0
  41. snowflake/snowpark_connect/includes/jars/commons-crypto-1.1.0.jar +0 -0
  42. snowflake/snowpark_connect/includes/jars/commons-dbcp-1.4.jar +0 -0
  43. snowflake/snowpark_connect/includes/jars/commons-io-2.16.1.jar +0 -0
  44. snowflake/snowpark_connect/includes/jars/commons-lang-2.6.jar +0 -0
  45. snowflake/snowpark_connect/includes/jars/commons-lang3-3.12.0.jar +0 -0
  46. snowflake/snowpark_connect/includes/jars/commons-logging-1.1.3.jar +0 -0
  47. snowflake/snowpark_connect/includes/jars/commons-math3-3.6.1.jar +0 -0
  48. snowflake/snowpark_connect/includes/jars/commons-pool-1.5.4.jar +0 -0
  49. snowflake/snowpark_connect/includes/jars/commons-text-1.10.0.jar +0 -0
  50. snowflake/snowpark_connect/includes/jars/hadoop-client-api-3.3.4.jar +0 -0
  51. snowflake/snowpark_connect/includes/jars/jackson-annotations-2.15.2.jar +0 -0
  52. snowflake/snowpark_connect/includes/jars/jackson-core-2.15.2.jar +0 -0
  53. snowflake/snowpark_connect/includes/jars/jackson-core-asl-1.9.13.jar +0 -0
  54. snowflake/snowpark_connect/includes/jars/jackson-databind-2.15.2.jar +0 -0
  55. snowflake/snowpark_connect/includes/jars/jackson-dataformat-yaml-2.15.2.jar +0 -0
  56. snowflake/snowpark_connect/includes/jars/jackson-datatype-jsr310-2.15.2.jar +0 -0
  57. snowflake/snowpark_connect/includes/jars/jackson-mapper-asl-1.9.13.jar +0 -0
  58. snowflake/snowpark_connect/includes/jars/jackson-module-scala_2.12-2.15.2.jar +0 -0
  59. snowflake/snowpark_connect/includes/jars/json4s-ast_2.12-3.7.0-M11.jar +0 -0
  60. snowflake/snowpark_connect/includes/jars/json4s-core_2.12-3.7.0-M11.jar +0 -0
  61. snowflake/snowpark_connect/includes/jars/json4s-jackson_2.12-3.7.0-M11.jar +0 -0
  62. snowflake/snowpark_connect/includes/jars/json4s-scalap_2.12-3.7.0-M11.jar +0 -0
  63. snowflake/snowpark_connect/includes/jars/kryo-shaded-4.0.2.jar +0 -0
  64. snowflake/snowpark_connect/includes/jars/log4j-1.2-api-2.20.0.jar +0 -0
  65. snowflake/snowpark_connect/includes/jars/log4j-api-2.20.0.jar +0 -0
  66. snowflake/snowpark_connect/includes/jars/log4j-core-2.20.0.jar +0 -0
  67. snowflake/snowpark_connect/includes/jars/log4j-slf4j2-impl-2.20.0.jar +0 -0
  68. snowflake/snowpark_connect/includes/jars/paranamer-2.8.jar +0 -0
  69. snowflake/snowpark_connect/includes/jars/scala-collection-compat_2.12-2.7.0.jar +0 -0
  70. snowflake/snowpark_connect/includes/jars/scala-compiler-2.12.18.jar +0 -0
  71. snowflake/snowpark_connect/includes/jars/scala-library-2.12.18.jar +0 -0
  72. snowflake/snowpark_connect/includes/jars/scala-parser-combinators_2.12-2.3.0.jar +0 -0
  73. snowflake/snowpark_connect/includes/jars/scala-reflect-2.12.18.jar +0 -0
  74. snowflake/snowpark_connect/includes/jars/scala-xml_2.12-2.1.0.jar +0 -0
  75. snowflake/snowpark_connect/includes/jars/slf4j-api-2.0.7.jar +0 -0
  76. snowflake/snowpark_connect/includes/jars/spark-catalyst_2.12-3.5.6.jar +0 -0
  77. snowflake/snowpark_connect/includes/jars/spark-common-utils_2.12-3.5.6.jar +0 -0
  78. snowflake/snowpark_connect/includes/jars/spark-core_2.12-3.5.6.jar +0 -0
  79. snowflake/snowpark_connect/includes/jars/spark-graphx_2.12-3.5.6.jar +0 -0
  80. snowflake/snowpark_connect/includes/jars/spark-hive-thriftserver_2.12-3.5.6.jar +0 -0
  81. snowflake/snowpark_connect/includes/jars/spark-hive_2.12-3.5.6.jar +0 -0
  82. snowflake/snowpark_connect/includes/jars/spark-kubernetes_2.12-3.5.6.jar +0 -0
  83. snowflake/snowpark_connect/includes/jars/spark-kvstore_2.12-3.5.6.jar +0 -0
  84. snowflake/snowpark_connect/includes/jars/spark-launcher_2.12-3.5.6.jar +0 -0
  85. snowflake/snowpark_connect/includes/jars/spark-mesos_2.12-3.5.6.jar +0 -0
  86. snowflake/snowpark_connect/includes/jars/spark-mllib-local_2.12-3.5.6.jar +0 -0
  87. snowflake/snowpark_connect/includes/jars/spark-mllib_2.12-3.5.6.jar +0 -0
  88. snowflake/snowpark_connect/includes/jars/spark-network-common_2.12-3.5.6.jar +0 -0
  89. snowflake/snowpark_connect/includes/jars/spark-network-shuffle_2.12-3.5.6.jar +0 -0
  90. snowflake/snowpark_connect/includes/jars/spark-repl_2.12-3.5.6.jar +0 -0
  91. snowflake/snowpark_connect/includes/jars/spark-sketch_2.12-3.5.6.jar +0 -0
  92. snowflake/snowpark_connect/includes/jars/spark-sql-api_2.12-3.5.6.jar +0 -0
  93. snowflake/snowpark_connect/includes/jars/spark-sql_2.12-3.5.6.jar +0 -0
  94. snowflake/snowpark_connect/includes/jars/spark-streaming_2.12-3.5.6.jar +0 -0
  95. snowflake/snowpark_connect/includes/jars/spark-tags_2.12-3.5.6.jar +0 -0
  96. snowflake/snowpark_connect/includes/jars/spark-unsafe_2.12-3.5.6.jar +0 -0
  97. snowflake/snowpark_connect/includes/jars/spark-yarn_2.12-3.5.6.jar +0 -0
  98. snowflake/snowpark_connect/includes/python/__init__.py +21 -0
  99. snowflake/snowpark_connect/includes/python/pyspark/__init__.py +173 -0
  100. snowflake/snowpark_connect/includes/python/pyspark/_globals.py +71 -0
  101. snowflake/snowpark_connect/includes/python/pyspark/_typing.pyi +43 -0
  102. snowflake/snowpark_connect/includes/python/pyspark/accumulators.py +341 -0
  103. snowflake/snowpark_connect/includes/python/pyspark/broadcast.py +383 -0
  104. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/__init__.py +8 -0
  105. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/cloudpickle.py +948 -0
  106. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/cloudpickle_fast.py +844 -0
  107. snowflake/snowpark_connect/includes/python/pyspark/cloudpickle/compat.py +18 -0
  108. snowflake/snowpark_connect/includes/python/pyspark/conf.py +276 -0
  109. snowflake/snowpark_connect/includes/python/pyspark/context.py +2601 -0
  110. snowflake/snowpark_connect/includes/python/pyspark/daemon.py +218 -0
  111. snowflake/snowpark_connect/includes/python/pyspark/errors/__init__.py +70 -0
  112. snowflake/snowpark_connect/includes/python/pyspark/errors/error_classes.py +889 -0
  113. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/__init__.py +16 -0
  114. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/base.py +228 -0
  115. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/captured.py +307 -0
  116. snowflake/snowpark_connect/includes/python/pyspark/errors/exceptions/connect.py +190 -0
  117. snowflake/snowpark_connect/includes/python/pyspark/errors/tests/__init__.py +16 -0
  118. snowflake/snowpark_connect/includes/python/pyspark/errors/tests/test_errors.py +60 -0
  119. snowflake/snowpark_connect/includes/python/pyspark/errors/utils.py +116 -0
  120. snowflake/snowpark_connect/includes/python/pyspark/files.py +165 -0
  121. snowflake/snowpark_connect/includes/python/pyspark/find_spark_home.py +95 -0
  122. snowflake/snowpark_connect/includes/python/pyspark/install.py +203 -0
  123. snowflake/snowpark_connect/includes/python/pyspark/instrumentation_utils.py +190 -0
  124. snowflake/snowpark_connect/includes/python/pyspark/java_gateway.py +248 -0
  125. snowflake/snowpark_connect/includes/python/pyspark/join.py +118 -0
  126. snowflake/snowpark_connect/includes/python/pyspark/ml/__init__.py +71 -0
  127. snowflake/snowpark_connect/includes/python/pyspark/ml/_typing.pyi +84 -0
  128. snowflake/snowpark_connect/includes/python/pyspark/ml/base.py +414 -0
  129. snowflake/snowpark_connect/includes/python/pyspark/ml/classification.py +4332 -0
  130. snowflake/snowpark_connect/includes/python/pyspark/ml/clustering.py +2188 -0
  131. snowflake/snowpark_connect/includes/python/pyspark/ml/common.py +146 -0
  132. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/__init__.py +44 -0
  133. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/base.py +346 -0
  134. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/classification.py +382 -0
  135. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/evaluation.py +291 -0
  136. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/feature.py +258 -0
  137. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/functions.py +77 -0
  138. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/io_utils.py +335 -0
  139. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/pipeline.py +262 -0
  140. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/summarizer.py +120 -0
  141. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/tuning.py +579 -0
  142. snowflake/snowpark_connect/includes/python/pyspark/ml/connect/util.py +173 -0
  143. snowflake/snowpark_connect/includes/python/pyspark/ml/deepspeed/__init__.py +16 -0
  144. snowflake/snowpark_connect/includes/python/pyspark/ml/deepspeed/deepspeed_distributor.py +165 -0
  145. snowflake/snowpark_connect/includes/python/pyspark/ml/deepspeed/tests/test_deepspeed_distributor.py +306 -0
  146. snowflake/snowpark_connect/includes/python/pyspark/ml/dl_util.py +150 -0
  147. snowflake/snowpark_connect/includes/python/pyspark/ml/evaluation.py +1166 -0
  148. snowflake/snowpark_connect/includes/python/pyspark/ml/feature.py +7474 -0
  149. snowflake/snowpark_connect/includes/python/pyspark/ml/fpm.py +543 -0
  150. snowflake/snowpark_connect/includes/python/pyspark/ml/functions.py +842 -0
  151. snowflake/snowpark_connect/includes/python/pyspark/ml/image.py +271 -0
  152. snowflake/snowpark_connect/includes/python/pyspark/ml/linalg/__init__.py +1382 -0
  153. snowflake/snowpark_connect/includes/python/pyspark/ml/model_cache.py +55 -0
  154. snowflake/snowpark_connect/includes/python/pyspark/ml/param/__init__.py +602 -0
  155. snowflake/snowpark_connect/includes/python/pyspark/ml/param/_shared_params_code_gen.py +368 -0
  156. snowflake/snowpark_connect/includes/python/pyspark/ml/param/shared.py +878 -0
  157. snowflake/snowpark_connect/includes/python/pyspark/ml/pipeline.py +451 -0
  158. snowflake/snowpark_connect/includes/python/pyspark/ml/recommendation.py +748 -0
  159. snowflake/snowpark_connect/includes/python/pyspark/ml/regression.py +3335 -0
  160. snowflake/snowpark_connect/includes/python/pyspark/ml/stat.py +523 -0
  161. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/__init__.py +16 -0
  162. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_classification.py +53 -0
  163. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_evaluation.py +50 -0
  164. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_feature.py +43 -0
  165. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_function.py +114 -0
  166. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_pipeline.py +47 -0
  167. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_summarizer.py +43 -0
  168. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_connect_tuning.py +46 -0
  169. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_classification.py +238 -0
  170. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_evaluation.py +194 -0
  171. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_feature.py +156 -0
  172. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_pipeline.py +184 -0
  173. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_summarizer.py +78 -0
  174. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_legacy_mode_tuning.py +292 -0
  175. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_parity_torch_data_loader.py +50 -0
  176. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/connect/test_parity_torch_distributor.py +152 -0
  177. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_algorithms.py +456 -0
  178. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_base.py +96 -0
  179. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_dl_util.py +186 -0
  180. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_evaluation.py +77 -0
  181. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_feature.py +401 -0
  182. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_functions.py +528 -0
  183. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_image.py +82 -0
  184. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_linalg.py +409 -0
  185. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_model_cache.py +55 -0
  186. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_param.py +441 -0
  187. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_persistence.py +546 -0
  188. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_pipeline.py +71 -0
  189. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_stat.py +52 -0
  190. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_training_summary.py +494 -0
  191. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_util.py +85 -0
  192. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/test_wrapper.py +138 -0
  193. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/__init__.py +16 -0
  194. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_basic.py +151 -0
  195. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_nested.py +97 -0
  196. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_cv_io_pipeline.py +143 -0
  197. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tuning.py +551 -0
  198. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_basic.py +137 -0
  199. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_nested.py +96 -0
  200. snowflake/snowpark_connect/includes/python/pyspark/ml/tests/tuning/test_tvs_io_pipeline.py +142 -0
  201. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/__init__.py +16 -0
  202. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/data.py +100 -0
  203. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/distributor.py +1133 -0
  204. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/log_communication.py +198 -0
  205. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/__init__.py +16 -0
  206. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_data_loader.py +137 -0
  207. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_distributor.py +561 -0
  208. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/tests/test_log_communication.py +172 -0
  209. snowflake/snowpark_connect/includes/python/pyspark/ml/torch/torch_run_process_wrapper.py +83 -0
  210. snowflake/snowpark_connect/includes/python/pyspark/ml/tree.py +434 -0
  211. snowflake/snowpark_connect/includes/python/pyspark/ml/tuning.py +1741 -0
  212. snowflake/snowpark_connect/includes/python/pyspark/ml/util.py +749 -0
  213. snowflake/snowpark_connect/includes/python/pyspark/ml/wrapper.py +465 -0
  214. snowflake/snowpark_connect/includes/python/pyspark/mllib/__init__.py +44 -0
  215. snowflake/snowpark_connect/includes/python/pyspark/mllib/_typing.pyi +33 -0
  216. snowflake/snowpark_connect/includes/python/pyspark/mllib/classification.py +989 -0
  217. snowflake/snowpark_connect/includes/python/pyspark/mllib/clustering.py +1318 -0
  218. snowflake/snowpark_connect/includes/python/pyspark/mllib/common.py +174 -0
  219. snowflake/snowpark_connect/includes/python/pyspark/mllib/evaluation.py +691 -0
  220. snowflake/snowpark_connect/includes/python/pyspark/mllib/feature.py +1085 -0
  221. snowflake/snowpark_connect/includes/python/pyspark/mllib/fpm.py +233 -0
  222. snowflake/snowpark_connect/includes/python/pyspark/mllib/linalg/__init__.py +1653 -0
  223. snowflake/snowpark_connect/includes/python/pyspark/mllib/linalg/distributed.py +1662 -0
  224. snowflake/snowpark_connect/includes/python/pyspark/mllib/random.py +698 -0
  225. snowflake/snowpark_connect/includes/python/pyspark/mllib/recommendation.py +389 -0
  226. snowflake/snowpark_connect/includes/python/pyspark/mllib/regression.py +1067 -0
  227. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/KernelDensity.py +59 -0
  228. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/__init__.py +34 -0
  229. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/_statistics.py +409 -0
  230. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/distribution.py +39 -0
  231. snowflake/snowpark_connect/includes/python/pyspark/mllib/stat/test.py +86 -0
  232. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/__init__.py +16 -0
  233. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_algorithms.py +353 -0
  234. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_feature.py +192 -0
  235. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_linalg.py +680 -0
  236. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_stat.py +206 -0
  237. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_streaming_algorithms.py +471 -0
  238. snowflake/snowpark_connect/includes/python/pyspark/mllib/tests/test_util.py +108 -0
  239. snowflake/snowpark_connect/includes/python/pyspark/mllib/tree.py +888 -0
  240. snowflake/snowpark_connect/includes/python/pyspark/mllib/util.py +659 -0
  241. snowflake/snowpark_connect/includes/python/pyspark/pandas/__init__.py +165 -0
  242. snowflake/snowpark_connect/includes/python/pyspark/pandas/_typing.py +52 -0
  243. snowflake/snowpark_connect/includes/python/pyspark/pandas/accessors.py +989 -0
  244. snowflake/snowpark_connect/includes/python/pyspark/pandas/base.py +1804 -0
  245. snowflake/snowpark_connect/includes/python/pyspark/pandas/categorical.py +822 -0
  246. snowflake/snowpark_connect/includes/python/pyspark/pandas/config.py +539 -0
  247. snowflake/snowpark_connect/includes/python/pyspark/pandas/correlation.py +262 -0
  248. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/__init__.py +16 -0
  249. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/base.py +519 -0
  250. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/binary_ops.py +98 -0
  251. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/boolean_ops.py +426 -0
  252. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/categorical_ops.py +141 -0
  253. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/complex_ops.py +145 -0
  254. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/date_ops.py +127 -0
  255. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/datetime_ops.py +171 -0
  256. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/null_ops.py +83 -0
  257. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/num_ops.py +588 -0
  258. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/string_ops.py +154 -0
  259. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/timedelta_ops.py +101 -0
  260. snowflake/snowpark_connect/includes/python/pyspark/pandas/data_type_ops/udt_ops.py +29 -0
  261. snowflake/snowpark_connect/includes/python/pyspark/pandas/datetimes.py +891 -0
  262. snowflake/snowpark_connect/includes/python/pyspark/pandas/exceptions.py +150 -0
  263. snowflake/snowpark_connect/includes/python/pyspark/pandas/extensions.py +388 -0
  264. snowflake/snowpark_connect/includes/python/pyspark/pandas/frame.py +13738 -0
  265. snowflake/snowpark_connect/includes/python/pyspark/pandas/generic.py +3560 -0
  266. snowflake/snowpark_connect/includes/python/pyspark/pandas/groupby.py +4448 -0
  267. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/__init__.py +21 -0
  268. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/base.py +2783 -0
  269. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/category.py +773 -0
  270. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/datetimes.py +843 -0
  271. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/multi.py +1323 -0
  272. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/numeric.py +210 -0
  273. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexes/timedelta.py +197 -0
  274. snowflake/snowpark_connect/includes/python/pyspark/pandas/indexing.py +1862 -0
  275. snowflake/snowpark_connect/includes/python/pyspark/pandas/internal.py +1680 -0
  276. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/__init__.py +48 -0
  277. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/common.py +76 -0
  278. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/frame.py +63 -0
  279. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/general_functions.py +43 -0
  280. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/groupby.py +93 -0
  281. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/indexes.py +184 -0
  282. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/resample.py +101 -0
  283. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/scalars.py +29 -0
  284. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/series.py +69 -0
  285. snowflake/snowpark_connect/includes/python/pyspark/pandas/missing/window.py +168 -0
  286. snowflake/snowpark_connect/includes/python/pyspark/pandas/mlflow.py +238 -0
  287. snowflake/snowpark_connect/includes/python/pyspark/pandas/namespace.py +3807 -0
  288. snowflake/snowpark_connect/includes/python/pyspark/pandas/numpy_compat.py +260 -0
  289. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/__init__.py +17 -0
  290. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/core.py +1213 -0
  291. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/matplotlib.py +928 -0
  292. snowflake/snowpark_connect/includes/python/pyspark/pandas/plot/plotly.py +261 -0
  293. snowflake/snowpark_connect/includes/python/pyspark/pandas/resample.py +816 -0
  294. snowflake/snowpark_connect/includes/python/pyspark/pandas/series.py +7440 -0
  295. snowflake/snowpark_connect/includes/python/pyspark/pandas/sql_formatter.py +308 -0
  296. snowflake/snowpark_connect/includes/python/pyspark/pandas/sql_processor.py +394 -0
  297. snowflake/snowpark_connect/includes/python/pyspark/pandas/strings.py +2371 -0
  298. snowflake/snowpark_connect/includes/python/pyspark/pandas/supported_api_gen.py +378 -0
  299. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/__init__.py +16 -0
  300. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/__init__.py +16 -0
  301. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_any_all.py +177 -0
  302. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_apply_func.py +575 -0
  303. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_binary_ops.py +235 -0
  304. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_combine.py +653 -0
  305. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_compute.py +463 -0
  306. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_corrwith.py +86 -0
  307. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_cov.py +151 -0
  308. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_cumulative.py +139 -0
  309. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_describe.py +458 -0
  310. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_eval.py +86 -0
  311. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_melt.py +202 -0
  312. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_missing_data.py +520 -0
  313. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/computation/test_pivot.py +361 -0
  314. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/__init__.py +16 -0
  315. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/__init__.py +16 -0
  316. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_any_all.py +40 -0
  317. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_apply_func.py +42 -0
  318. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_binary_ops.py +40 -0
  319. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_combine.py +37 -0
  320. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_compute.py +60 -0
  321. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_corrwith.py +40 -0
  322. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_cov.py +40 -0
  323. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_cumulative.py +90 -0
  324. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_describe.py +40 -0
  325. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_eval.py +40 -0
  326. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_melt.py +40 -0
  327. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_missing_data.py +42 -0
  328. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/computation/test_parity_pivot.py +37 -0
  329. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/__init__.py +16 -0
  330. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_base.py +36 -0
  331. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_binary_ops.py +42 -0
  332. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_boolean_ops.py +47 -0
  333. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_categorical_ops.py +55 -0
  334. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_complex_ops.py +40 -0
  335. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_date_ops.py +47 -0
  336. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_datetime_ops.py +47 -0
  337. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_null_ops.py +42 -0
  338. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_arithmetic.py +43 -0
  339. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_ops.py +47 -0
  340. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_reverse.py +43 -0
  341. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_string_ops.py +47 -0
  342. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_timedelta_ops.py +47 -0
  343. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_udt_ops.py +40 -0
  344. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/data_type_ops/testing_utils.py +226 -0
  345. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/__init__.py +16 -0
  346. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_align.py +39 -0
  347. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_basic_slow.py +55 -0
  348. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_cov_corrwith.py +39 -0
  349. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_dot_frame.py +39 -0
  350. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_dot_series.py +39 -0
  351. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_index.py +39 -0
  352. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_series.py +39 -0
  353. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_setitem_frame.py +43 -0
  354. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_setitem_series.py +43 -0
  355. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/__init__.py +16 -0
  356. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_attrs.py +40 -0
  357. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_constructor.py +39 -0
  358. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_conversion.py +42 -0
  359. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_reindexing.py +42 -0
  360. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_reshaping.py +37 -0
  361. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_spark.py +40 -0
  362. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_take.py +42 -0
  363. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_time_series.py +48 -0
  364. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/frame/test_parity_truncate.py +40 -0
  365. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/__init__.py +16 -0
  366. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_aggregate.py +40 -0
  367. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_apply_func.py +41 -0
  368. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_cumulative.py +67 -0
  369. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_describe.py +40 -0
  370. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_groupby.py +55 -0
  371. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_head_tail.py +40 -0
  372. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_index.py +38 -0
  373. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_missing_data.py +55 -0
  374. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py +39 -0
  375. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/groupby/test_parity_stat.py +38 -0
  376. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/__init__.py +16 -0
  377. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_align.py +40 -0
  378. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_base.py +50 -0
  379. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_category.py +73 -0
  380. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_datetime.py +39 -0
  381. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_indexing.py +40 -0
  382. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_reindex.py +40 -0
  383. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_rename.py +40 -0
  384. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_reset_index.py +48 -0
  385. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/indexes/test_parity_timedelta.py +39 -0
  386. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/io/__init__.py +16 -0
  387. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/io/test_parity_io.py +40 -0
  388. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/__init__.py +16 -0
  389. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot.py +45 -0
  390. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot_matplotlib.py +45 -0
  391. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_frame_plot_plotly.py +49 -0
  392. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot.py +37 -0
  393. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot_matplotlib.py +53 -0
  394. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot_plotly.py +45 -0
  395. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/__init__.py +16 -0
  396. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_all_any.py +38 -0
  397. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_arg_ops.py +37 -0
  398. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_as_of.py +37 -0
  399. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_as_type.py +38 -0
  400. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_compute.py +37 -0
  401. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_conversion.py +40 -0
  402. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_cumulative.py +40 -0
  403. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_index.py +38 -0
  404. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_missing_data.py +40 -0
  405. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_series.py +37 -0
  406. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_sort.py +38 -0
  407. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/series/test_parity_stat.py +38 -0
  408. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_categorical.py +66 -0
  409. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_config.py +37 -0
  410. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_csv.py +37 -0
  411. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_dataframe_conversion.py +42 -0
  412. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_dataframe_spark_io.py +39 -0
  413. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_default_index.py +49 -0
  414. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ewm.py +37 -0
  415. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_expanding.py +39 -0
  416. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_extension.py +49 -0
  417. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_frame_spark.py +53 -0
  418. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_generic_functions.py +43 -0
  419. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_indexing.py +49 -0
  420. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_indexops_spark.py +39 -0
  421. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_internal.py +41 -0
  422. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_namespace.py +39 -0
  423. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_numpy_compat.py +60 -0
  424. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames.py +48 -0
  425. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby.py +39 -0
  426. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_expanding.py +44 -0
  427. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_rolling.py +84 -0
  428. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_repr.py +37 -0
  429. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_resample.py +45 -0
  430. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_reshape.py +39 -0
  431. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_rolling.py +39 -0
  432. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_scalars.py +37 -0
  433. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_conversion.py +39 -0
  434. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_datetime.py +39 -0
  435. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_series_string.py +39 -0
  436. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_spark_functions.py +39 -0
  437. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_sql.py +43 -0
  438. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_stats.py +37 -0
  439. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_typedef.py +36 -0
  440. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_utils.py +37 -0
  441. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/connect/test_parity_window.py +39 -0
  442. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/__init__.py +16 -0
  443. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_base.py +107 -0
  444. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_binary_ops.py +224 -0
  445. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py +825 -0
  446. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_categorical_ops.py +562 -0
  447. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_complex_ops.py +368 -0
  448. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_date_ops.py +257 -0
  449. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_datetime_ops.py +260 -0
  450. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_null_ops.py +178 -0
  451. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_arithmetic.py +184 -0
  452. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py +497 -0
  453. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_num_reverse.py +140 -0
  454. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_string_ops.py +354 -0
  455. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_timedelta_ops.py +219 -0
  456. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/test_udt_ops.py +192 -0
  457. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/data_type_ops/testing_utils.py +228 -0
  458. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/__init__.py +16 -0
  459. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_align.py +118 -0
  460. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_basic_slow.py +198 -0
  461. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_cov_corrwith.py +181 -0
  462. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_dot_frame.py +103 -0
  463. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_dot_series.py +141 -0
  464. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_index.py +109 -0
  465. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_series.py +136 -0
  466. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_setitem_frame.py +125 -0
  467. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/diff_frames_ops/test_setitem_series.py +217 -0
  468. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/__init__.py +16 -0
  469. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_attrs.py +384 -0
  470. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_constructor.py +598 -0
  471. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_conversion.py +73 -0
  472. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_reindexing.py +869 -0
  473. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_reshaping.py +487 -0
  474. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_spark.py +309 -0
  475. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_take.py +156 -0
  476. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_time_series.py +149 -0
  477. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/frame/test_truncate.py +163 -0
  478. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/__init__.py +16 -0
  479. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_aggregate.py +311 -0
  480. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_apply_func.py +524 -0
  481. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_cumulative.py +419 -0
  482. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_describe.py +144 -0
  483. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_groupby.py +979 -0
  484. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_head_tail.py +234 -0
  485. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_index.py +206 -0
  486. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_missing_data.py +421 -0
  487. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_split_apply.py +187 -0
  488. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/groupby/test_stat.py +397 -0
  489. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/__init__.py +16 -0
  490. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_align.py +100 -0
  491. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_base.py +2743 -0
  492. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_category.py +484 -0
  493. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_datetime.py +276 -0
  494. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_indexing.py +432 -0
  495. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_reindex.py +310 -0
  496. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_rename.py +257 -0
  497. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_reset_index.py +160 -0
  498. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/indexes/test_timedelta.py +128 -0
  499. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/io/__init__.py +16 -0
  500. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/io/test_io.py +137 -0
  501. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/__init__.py +16 -0
  502. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot.py +170 -0
  503. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot_matplotlib.py +547 -0
  504. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_frame_plot_plotly.py +285 -0
  505. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot.py +106 -0
  506. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot_matplotlib.py +409 -0
  507. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/plot/test_series_plot_plotly.py +247 -0
  508. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/__init__.py +16 -0
  509. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_all_any.py +105 -0
  510. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_arg_ops.py +197 -0
  511. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_as_of.py +137 -0
  512. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_as_type.py +227 -0
  513. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_compute.py +634 -0
  514. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_conversion.py +88 -0
  515. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_cumulative.py +139 -0
  516. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_index.py +475 -0
  517. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_missing_data.py +265 -0
  518. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_series.py +818 -0
  519. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_sort.py +162 -0
  520. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/series/test_stat.py +780 -0
  521. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_categorical.py +741 -0
  522. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_config.py +160 -0
  523. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_csv.py +453 -0
  524. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_dataframe_conversion.py +281 -0
  525. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_dataframe_spark_io.py +487 -0
  526. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_default_index.py +109 -0
  527. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ewm.py +434 -0
  528. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_expanding.py +253 -0
  529. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_extension.py +152 -0
  530. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_frame_spark.py +162 -0
  531. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_generic_functions.py +234 -0
  532. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_indexing.py +1339 -0
  533. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_indexops_spark.py +82 -0
  534. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_internal.py +124 -0
  535. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_namespace.py +638 -0
  536. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_numpy_compat.py +200 -0
  537. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames.py +1355 -0
  538. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py +655 -0
  539. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_expanding.py +113 -0
  540. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_rolling.py +118 -0
  541. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_repr.py +192 -0
  542. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_resample.py +346 -0
  543. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_reshape.py +495 -0
  544. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_rolling.py +263 -0
  545. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_scalars.py +59 -0
  546. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_conversion.py +85 -0
  547. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_datetime.py +364 -0
  548. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_series_string.py +362 -0
  549. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_spark_functions.py +46 -0
  550. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_sql.py +123 -0
  551. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_stats.py +581 -0
  552. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_typedef.py +447 -0
  553. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_utils.py +301 -0
  554. snowflake/snowpark_connect/includes/python/pyspark/pandas/tests/test_window.py +465 -0
  555. snowflake/snowpark_connect/includes/python/pyspark/pandas/typedef/__init__.py +18 -0
  556. snowflake/snowpark_connect/includes/python/pyspark/pandas/typedef/typehints.py +874 -0
  557. snowflake/snowpark_connect/includes/python/pyspark/pandas/usage_logging/__init__.py +143 -0
  558. snowflake/snowpark_connect/includes/python/pyspark/pandas/usage_logging/usage_logger.py +132 -0
  559. snowflake/snowpark_connect/includes/python/pyspark/pandas/utils.py +1063 -0
  560. snowflake/snowpark_connect/includes/python/pyspark/pandas/window.py +2702 -0
  561. snowflake/snowpark_connect/includes/python/pyspark/profiler.py +489 -0
  562. snowflake/snowpark_connect/includes/python/pyspark/py.typed +1 -0
  563. snowflake/snowpark_connect/includes/python/pyspark/python/pyspark/shell.py +123 -0
  564. snowflake/snowpark_connect/includes/python/pyspark/rdd.py +5518 -0
  565. snowflake/snowpark_connect/includes/python/pyspark/rddsampler.py +115 -0
  566. snowflake/snowpark_connect/includes/python/pyspark/resource/__init__.py +38 -0
  567. snowflake/snowpark_connect/includes/python/pyspark/resource/information.py +69 -0
  568. snowflake/snowpark_connect/includes/python/pyspark/resource/profile.py +317 -0
  569. snowflake/snowpark_connect/includes/python/pyspark/resource/requests.py +539 -0
  570. snowflake/snowpark_connect/includes/python/pyspark/resource/tests/__init__.py +16 -0
  571. snowflake/snowpark_connect/includes/python/pyspark/resource/tests/test_resources.py +83 -0
  572. snowflake/snowpark_connect/includes/python/pyspark/resultiterable.py +45 -0
  573. snowflake/snowpark_connect/includes/python/pyspark/serializers.py +681 -0
  574. snowflake/snowpark_connect/includes/python/pyspark/shell.py +123 -0
  575. snowflake/snowpark_connect/includes/python/pyspark/shuffle.py +854 -0
  576. snowflake/snowpark_connect/includes/python/pyspark/sql/__init__.py +75 -0
  577. snowflake/snowpark_connect/includes/python/pyspark/sql/_typing.pyi +80 -0
  578. snowflake/snowpark_connect/includes/python/pyspark/sql/avro/__init__.py +18 -0
  579. snowflake/snowpark_connect/includes/python/pyspark/sql/avro/functions.py +188 -0
  580. snowflake/snowpark_connect/includes/python/pyspark/sql/catalog.py +1270 -0
  581. snowflake/snowpark_connect/includes/python/pyspark/sql/column.py +1431 -0
  582. snowflake/snowpark_connect/includes/python/pyspark/sql/conf.py +99 -0
  583. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/__init__.py +18 -0
  584. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/_typing.py +90 -0
  585. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/avro/__init__.py +18 -0
  586. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/avro/functions.py +107 -0
  587. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/catalog.py +356 -0
  588. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/__init__.py +22 -0
  589. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/artifact.py +412 -0
  590. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/core.py +1689 -0
  591. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/client/reattach.py +340 -0
  592. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/column.py +514 -0
  593. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/conf.py +128 -0
  594. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/conversion.py +490 -0
  595. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/dataframe.py +2172 -0
  596. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/expressions.py +1056 -0
  597. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/functions.py +3937 -0
  598. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/group.py +418 -0
  599. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/plan.py +2289 -0
  600. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/__init__.py +25 -0
  601. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/base_pb2.py +203 -0
  602. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/base_pb2.pyi +2718 -0
  603. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/base_pb2_grpc.py +423 -0
  604. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/catalog_pb2.py +109 -0
  605. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/catalog_pb2.pyi +1130 -0
  606. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/commands_pb2.py +141 -0
  607. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/commands_pb2.pyi +1766 -0
  608. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/common_pb2.py +47 -0
  609. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/common_pb2.pyi +123 -0
  610. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/example_plugins_pb2.py +53 -0
  611. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/example_plugins_pb2.pyi +112 -0
  612. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/expressions_pb2.py +107 -0
  613. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/expressions_pb2.pyi +1507 -0
  614. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/relations_pb2.py +195 -0
  615. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/relations_pb2.pyi +3613 -0
  616. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/types_pb2.py +95 -0
  617. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/proto/types_pb2.pyi +980 -0
  618. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/protobuf/__init__.py +18 -0
  619. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/protobuf/functions.py +166 -0
  620. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/readwriter.py +861 -0
  621. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/session.py +952 -0
  622. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/__init__.py +22 -0
  623. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/query.py +295 -0
  624. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/readwriter.py +618 -0
  625. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/__init__.py +18 -0
  626. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/foreach_batch_worker.py +87 -0
  627. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/listener_worker.py +100 -0
  628. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/types.py +301 -0
  629. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/udf.py +296 -0
  630. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/udtf.py +200 -0
  631. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/utils.py +58 -0
  632. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/window.py +266 -0
  633. snowflake/snowpark_connect/includes/python/pyspark/sql/context.py +818 -0
  634. snowflake/snowpark_connect/includes/python/pyspark/sql/dataframe.py +5973 -0
  635. snowflake/snowpark_connect/includes/python/pyspark/sql/functions.py +15889 -0
  636. snowflake/snowpark_connect/includes/python/pyspark/sql/group.py +547 -0
  637. snowflake/snowpark_connect/includes/python/pyspark/sql/observation.py +152 -0
  638. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/__init__.py +21 -0
  639. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/__init__.pyi +344 -0
  640. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/protocols/__init__.pyi +17 -0
  641. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/protocols/frame.pyi +20 -0
  642. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/_typing/protocols/series.pyi +20 -0
  643. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/conversion.py +671 -0
  644. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/functions.py +480 -0
  645. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/functions.pyi +132 -0
  646. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/group_ops.py +523 -0
  647. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/map_ops.py +216 -0
  648. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/serializers.py +1019 -0
  649. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/typehints.py +172 -0
  650. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/types.py +972 -0
  651. snowflake/snowpark_connect/includes/python/pyspark/sql/pandas/utils.py +86 -0
  652. snowflake/snowpark_connect/includes/python/pyspark/sql/protobuf/__init__.py +18 -0
  653. snowflake/snowpark_connect/includes/python/pyspark/sql/protobuf/functions.py +334 -0
  654. snowflake/snowpark_connect/includes/python/pyspark/sql/readwriter.py +2159 -0
  655. snowflake/snowpark_connect/includes/python/pyspark/sql/session.py +2088 -0
  656. snowflake/snowpark_connect/includes/python/pyspark/sql/sql_formatter.py +84 -0
  657. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/__init__.py +21 -0
  658. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/listener.py +1050 -0
  659. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/query.py +746 -0
  660. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/readwriter.py +1652 -0
  661. snowflake/snowpark_connect/includes/python/pyspark/sql/streaming/state.py +288 -0
  662. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/__init__.py +16 -0
  663. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/__init__.py +16 -0
  664. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/__init__.py +16 -0
  665. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/test_artifact.py +420 -0
  666. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/client/test_client.py +358 -0
  667. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/__init__.py +16 -0
  668. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_foreach.py +36 -0
  669. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_foreach_batch.py +44 -0
  670. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_listener.py +116 -0
  671. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/streaming/test_parity_streaming.py +35 -0
  672. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_basic.py +3612 -0
  673. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_column.py +1042 -0
  674. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_function.py +2381 -0
  675. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_connect_plan.py +1060 -0
  676. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow.py +163 -0
  677. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow_map.py +38 -0
  678. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_arrow_python_udf.py +48 -0
  679. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_catalog.py +36 -0
  680. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_column.py +55 -0
  681. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_conf.py +36 -0
  682. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_dataframe.py +96 -0
  683. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_datasources.py +44 -0
  684. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_errors.py +36 -0
  685. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_functions.py +59 -0
  686. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_group.py +36 -0
  687. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_cogrouped_map.py +59 -0
  688. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_grouped_map.py +74 -0
  689. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_grouped_map_with_state.py +62 -0
  690. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_map.py +58 -0
  691. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf.py +70 -0
  692. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_grouped_agg.py +50 -0
  693. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_scalar.py +68 -0
  694. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_pandas_udf_window.py +40 -0
  695. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_readwriter.py +46 -0
  696. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_serde.py +44 -0
  697. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_types.py +100 -0
  698. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_udf.py +100 -0
  699. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_parity_udtf.py +163 -0
  700. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_session.py +181 -0
  701. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/connect/test_utils.py +42 -0
  702. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/__init__.py +16 -0
  703. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_cogrouped_map.py +623 -0
  704. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_grouped_map.py +869 -0
  705. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_grouped_map_with_state.py +342 -0
  706. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_map.py +436 -0
  707. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf.py +363 -0
  708. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_grouped_agg.py +592 -0
  709. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_scalar.py +1503 -0
  710. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_typehints.py +392 -0
  711. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_typehints_with_future_annotations.py +375 -0
  712. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/pandas/test_pandas_udf_window.py +411 -0
  713. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/__init__.py +16 -0
  714. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming.py +401 -0
  715. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_foreach.py +295 -0
  716. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_foreach_batch.py +106 -0
  717. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/streaming/test_streaming_listener.py +558 -0
  718. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow.py +1346 -0
  719. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow_map.py +182 -0
  720. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_arrow_python_udf.py +202 -0
  721. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_catalog.py +503 -0
  722. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_column.py +225 -0
  723. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_conf.py +83 -0
  724. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_context.py +201 -0
  725. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_dataframe.py +1931 -0
  726. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_datasources.py +256 -0
  727. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_errors.py +69 -0
  728. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_functions.py +1349 -0
  729. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_group.py +53 -0
  730. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_pandas_sqlmetrics.py +68 -0
  731. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_readwriter.py +283 -0
  732. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_serde.py +155 -0
  733. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_session.py +412 -0
  734. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_types.py +1581 -0
  735. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udf.py +961 -0
  736. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udf_profiler.py +165 -0
  737. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_udtf.py +1456 -0
  738. snowflake/snowpark_connect/includes/python/pyspark/sql/tests/test_utils.py +1686 -0
  739. snowflake/snowpark_connect/includes/python/pyspark/sql/types.py +2558 -0
  740. snowflake/snowpark_connect/includes/python/pyspark/sql/udf.py +714 -0
  741. snowflake/snowpark_connect/includes/python/pyspark/sql/udtf.py +325 -0
  742. snowflake/snowpark_connect/includes/python/pyspark/sql/utils.py +339 -0
  743. snowflake/snowpark_connect/includes/python/pyspark/sql/window.py +492 -0
  744. snowflake/snowpark_connect/includes/python/pyspark/statcounter.py +165 -0
  745. snowflake/snowpark_connect/includes/python/pyspark/status.py +112 -0
  746. snowflake/snowpark_connect/includes/python/pyspark/storagelevel.py +97 -0
  747. snowflake/snowpark_connect/includes/python/pyspark/streaming/__init__.py +22 -0
  748. snowflake/snowpark_connect/includes/python/pyspark/streaming/context.py +471 -0
  749. snowflake/snowpark_connect/includes/python/pyspark/streaming/dstream.py +933 -0
  750. snowflake/snowpark_connect/includes/python/pyspark/streaming/kinesis.py +205 -0
  751. snowflake/snowpark_connect/includes/python/pyspark/streaming/listener.py +83 -0
  752. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/__init__.py +16 -0
  753. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_context.py +184 -0
  754. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_dstream.py +706 -0
  755. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_kinesis.py +118 -0
  756. snowflake/snowpark_connect/includes/python/pyspark/streaming/tests/test_listener.py +160 -0
  757. snowflake/snowpark_connect/includes/python/pyspark/streaming/util.py +168 -0
  758. snowflake/snowpark_connect/includes/python/pyspark/taskcontext.py +502 -0
  759. snowflake/snowpark_connect/includes/python/pyspark/testing/__init__.py +21 -0
  760. snowflake/snowpark_connect/includes/python/pyspark/testing/connectutils.py +199 -0
  761. snowflake/snowpark_connect/includes/python/pyspark/testing/mllibutils.py +30 -0
  762. snowflake/snowpark_connect/includes/python/pyspark/testing/mlutils.py +275 -0
  763. snowflake/snowpark_connect/includes/python/pyspark/testing/objects.py +121 -0
  764. snowflake/snowpark_connect/includes/python/pyspark/testing/pandasutils.py +714 -0
  765. snowflake/snowpark_connect/includes/python/pyspark/testing/sqlutils.py +168 -0
  766. snowflake/snowpark_connect/includes/python/pyspark/testing/streamingutils.py +178 -0
  767. snowflake/snowpark_connect/includes/python/pyspark/testing/utils.py +636 -0
  768. snowflake/snowpark_connect/includes/python/pyspark/tests/__init__.py +16 -0
  769. snowflake/snowpark_connect/includes/python/pyspark/tests/test_appsubmit.py +306 -0
  770. snowflake/snowpark_connect/includes/python/pyspark/tests/test_broadcast.py +196 -0
  771. snowflake/snowpark_connect/includes/python/pyspark/tests/test_conf.py +44 -0
  772. snowflake/snowpark_connect/includes/python/pyspark/tests/test_context.py +346 -0
  773. snowflake/snowpark_connect/includes/python/pyspark/tests/test_daemon.py +89 -0
  774. snowflake/snowpark_connect/includes/python/pyspark/tests/test_install_spark.py +124 -0
  775. snowflake/snowpark_connect/includes/python/pyspark/tests/test_join.py +69 -0
  776. snowflake/snowpark_connect/includes/python/pyspark/tests/test_memory_profiler.py +167 -0
  777. snowflake/snowpark_connect/includes/python/pyspark/tests/test_pin_thread.py +194 -0
  778. snowflake/snowpark_connect/includes/python/pyspark/tests/test_profiler.py +168 -0
  779. snowflake/snowpark_connect/includes/python/pyspark/tests/test_rdd.py +939 -0
  780. snowflake/snowpark_connect/includes/python/pyspark/tests/test_rddbarrier.py +52 -0
  781. snowflake/snowpark_connect/includes/python/pyspark/tests/test_rddsampler.py +66 -0
  782. snowflake/snowpark_connect/includes/python/pyspark/tests/test_readwrite.py +368 -0
  783. snowflake/snowpark_connect/includes/python/pyspark/tests/test_serializers.py +257 -0
  784. snowflake/snowpark_connect/includes/python/pyspark/tests/test_shuffle.py +267 -0
  785. snowflake/snowpark_connect/includes/python/pyspark/tests/test_stage_sched.py +153 -0
  786. snowflake/snowpark_connect/includes/python/pyspark/tests/test_statcounter.py +130 -0
  787. snowflake/snowpark_connect/includes/python/pyspark/tests/test_taskcontext.py +350 -0
  788. snowflake/snowpark_connect/includes/python/pyspark/tests/test_util.py +97 -0
  789. snowflake/snowpark_connect/includes/python/pyspark/tests/test_worker.py +271 -0
  790. snowflake/snowpark_connect/includes/python/pyspark/traceback_utils.py +81 -0
  791. snowflake/snowpark_connect/includes/python/pyspark/util.py +416 -0
  792. snowflake/snowpark_connect/includes/python/pyspark/version.py +19 -0
  793. snowflake/snowpark_connect/includes/python/pyspark/worker.py +1307 -0
  794. snowflake/snowpark_connect/includes/python/pyspark/worker_util.py +46 -0
  795. snowflake/snowpark_connect/proto/__init__.py +10 -0
  796. snowflake/snowpark_connect/proto/control_pb2.py +35 -0
  797. snowflake/snowpark_connect/proto/control_pb2.pyi +38 -0
  798. snowflake/snowpark_connect/proto/control_pb2_grpc.py +183 -0
  799. snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.py +35 -0
  800. snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.pyi +53 -0
  801. snowflake/snowpark_connect/proto/snowflake_rdd_pb2.pyi +39 -0
  802. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.py +47 -0
  803. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.pyi +111 -0
  804. snowflake/snowpark_connect/relation/__init__.py +3 -0
  805. snowflake/snowpark_connect/relation/catalogs/__init__.py +12 -0
  806. snowflake/snowpark_connect/relation/catalogs/abstract_spark_catalog.py +287 -0
  807. snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py +467 -0
  808. snowflake/snowpark_connect/relation/catalogs/utils.py +51 -0
  809. snowflake/snowpark_connect/relation/io_utils.py +76 -0
  810. snowflake/snowpark_connect/relation/map_aggregate.py +322 -0
  811. snowflake/snowpark_connect/relation/map_catalog.py +151 -0
  812. snowflake/snowpark_connect/relation/map_column_ops.py +1068 -0
  813. snowflake/snowpark_connect/relation/map_crosstab.py +48 -0
  814. snowflake/snowpark_connect/relation/map_extension.py +412 -0
  815. snowflake/snowpark_connect/relation/map_join.py +341 -0
  816. snowflake/snowpark_connect/relation/map_local_relation.py +326 -0
  817. snowflake/snowpark_connect/relation/map_map_partitions.py +146 -0
  818. snowflake/snowpark_connect/relation/map_relation.py +253 -0
  819. snowflake/snowpark_connect/relation/map_row_ops.py +716 -0
  820. snowflake/snowpark_connect/relation/map_sample_by.py +35 -0
  821. snowflake/snowpark_connect/relation/map_show_string.py +50 -0
  822. snowflake/snowpark_connect/relation/map_sql.py +1874 -0
  823. snowflake/snowpark_connect/relation/map_stats.py +324 -0
  824. snowflake/snowpark_connect/relation/map_subquery_alias.py +32 -0
  825. snowflake/snowpark_connect/relation/map_udtf.py +288 -0
  826. snowflake/snowpark_connect/relation/read/__init__.py +7 -0
  827. snowflake/snowpark_connect/relation/read/jdbc_read_dbapi.py +668 -0
  828. snowflake/snowpark_connect/relation/read/map_read.py +367 -0
  829. snowflake/snowpark_connect/relation/read/map_read_csv.py +142 -0
  830. snowflake/snowpark_connect/relation/read/map_read_jdbc.py +108 -0
  831. snowflake/snowpark_connect/relation/read/map_read_json.py +344 -0
  832. snowflake/snowpark_connect/relation/read/map_read_parquet.py +194 -0
  833. snowflake/snowpark_connect/relation/read/map_read_socket.py +59 -0
  834. snowflake/snowpark_connect/relation/read/map_read_table.py +109 -0
  835. snowflake/snowpark_connect/relation/read/map_read_text.py +106 -0
  836. snowflake/snowpark_connect/relation/read/reader_config.py +399 -0
  837. snowflake/snowpark_connect/relation/read/utils.py +155 -0
  838. snowflake/snowpark_connect/relation/stage_locator.py +161 -0
  839. snowflake/snowpark_connect/relation/utils.py +219 -0
  840. snowflake/snowpark_connect/relation/write/__init__.py +3 -0
  841. snowflake/snowpark_connect/relation/write/jdbc_write_dbapi.py +339 -0
  842. snowflake/snowpark_connect/relation/write/map_write.py +436 -0
  843. snowflake/snowpark_connect/relation/write/map_write_jdbc.py +48 -0
  844. snowflake/snowpark_connect/resources/java_udfs-1.0-SNAPSHOT.jar +0 -0
  845. snowflake/snowpark_connect/resources_initializer.py +75 -0
  846. snowflake/snowpark_connect/server.py +1136 -0
  847. snowflake/snowpark_connect/start_server.py +32 -0
  848. snowflake/snowpark_connect/tcm.py +8 -0
  849. snowflake/snowpark_connect/type_mapping.py +1003 -0
  850. snowflake/snowpark_connect/typed_column.py +94 -0
  851. snowflake/snowpark_connect/utils/__init__.py +3 -0
  852. snowflake/snowpark_connect/utils/artifacts.py +48 -0
  853. snowflake/snowpark_connect/utils/attribute_handling.py +72 -0
  854. snowflake/snowpark_connect/utils/cache.py +84 -0
  855. snowflake/snowpark_connect/utils/concurrent.py +124 -0
  856. snowflake/snowpark_connect/utils/context.py +390 -0
  857. snowflake/snowpark_connect/utils/describe_query_cache.py +231 -0
  858. snowflake/snowpark_connect/utils/interrupt.py +85 -0
  859. snowflake/snowpark_connect/utils/io_utils.py +35 -0
  860. snowflake/snowpark_connect/utils/pandas_udtf_utils.py +117 -0
  861. snowflake/snowpark_connect/utils/profiling.py +47 -0
  862. snowflake/snowpark_connect/utils/session.py +180 -0
  863. snowflake/snowpark_connect/utils/snowpark_connect_logging.py +38 -0
  864. snowflake/snowpark_connect/utils/telemetry.py +513 -0
  865. snowflake/snowpark_connect/utils/udf_cache.py +392 -0
  866. snowflake/snowpark_connect/utils/udf_helper.py +328 -0
  867. snowflake/snowpark_connect/utils/udf_utils.py +310 -0
  868. snowflake/snowpark_connect/utils/udtf_helper.py +420 -0
  869. snowflake/snowpark_connect/utils/udtf_utils.py +799 -0
  870. snowflake/snowpark_connect/utils/xxhash64.py +247 -0
  871. snowflake/snowpark_connect/version.py +6 -0
  872. snowpark_connect-0.20.2.data/scripts/snowpark-connect +71 -0
  873. snowpark_connect-0.20.2.data/scripts/snowpark-session +11 -0
  874. snowpark_connect-0.20.2.data/scripts/snowpark-submit +354 -0
  875. snowpark_connect-0.20.2.dist-info/METADATA +37 -0
  876. snowpark_connect-0.20.2.dist-info/RECORD +879 -0
  877. snowpark_connect-0.20.2.dist-info/WHEEL +5 -0
  878. snowpark_connect-0.20.2.dist-info/licenses/LICENSE.txt +202 -0
  879. snowpark_connect-0.20.2.dist-info/top_level.txt +1 -0
snowflake/snowpark_connect/includes/python/pyspark/sql/types.py
@@ -0,0 +1,2558 @@
+ #
+ # Licensed to the Apache Software Foundation (ASF) under one or more
+ # contributor license agreements.  See the NOTICE file distributed with
+ # this work for additional information regarding copyright ownership.
+ # The ASF licenses this file to You under the Apache License, Version 2.0
+ # (the "License"); you may not use this file except in compliance with
+ # the License.  You may obtain a copy of the License at
+ #
+ #    http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ import sys
+ import decimal
+ import time
+ import math
+ import datetime
+ import calendar
+ import json
+ import re
+ import base64
+ from array import array
+ import ctypes
+ from collections.abc import Iterable
+ from functools import reduce
+ from typing import (
+     cast,
+     overload,
+     Any,
+     Callable,
+     ClassVar,
+     Dict,
+     Iterator,
+     List,
+     Optional,
+     Union,
+     Tuple,
+     Type,
+     TypeVar,
+     TYPE_CHECKING,
+ )
+
+ from py4j.protocol import register_input_converter
+ from py4j.java_gateway import GatewayClient, JavaClass, JavaGateway, JavaObject, JVMView
+
+ from pyspark.serializers import CloudPickleSerializer
+ from pyspark.sql.utils import has_numpy, get_active_spark_context
+ from pyspark.errors import PySparkNotImplementedError, PySparkTypeError, PySparkValueError
+
+ if has_numpy:
+     import numpy as np
+
+ T = TypeVar("T")
+ U = TypeVar("U")
+
+ __all__ = [
+     "DataType",
+     "NullType",
+     "CharType",
+     "StringType",
+     "VarcharType",
+     "BinaryType",
+     "BooleanType",
+     "DateType",
+     "TimestampType",
+     "TimestampNTZType",
+     "DecimalType",
+     "DoubleType",
+     "FloatType",
+     "ByteType",
+     "IntegerType",
+     "LongType",
+     "DayTimeIntervalType",
+     "YearMonthIntervalType",
+     "Row",
+     "ShortType",
+     "ArrayType",
+     "MapType",
+     "StructField",
+     "StructType",
+ ]
+
+
+ if TYPE_CHECKING:
+     import numpy as np
+
+
+ class DataType:
+     """Base class for data types."""
+
+     def __repr__(self) -> str:
+         return self.__class__.__name__ + "()"
+
+     def __hash__(self) -> int:
+         return hash(str(self))
+
+     def __eq__(self, other: Any) -> bool:
+         return isinstance(other, self.__class__) and self.__dict__ == other.__dict__
+
+     def __ne__(self, other: Any) -> bool:
+         return not self.__eq__(other)
+
+     @classmethod
+     def typeName(cls) -> str:
+         return cls.__name__[:-4].lower()
+
+     def simpleString(self) -> str:
+         return self.typeName()
+
+     def jsonValue(self) -> Union[str, Dict[str, Any]]:
+         return self.typeName()
+
+     def json(self) -> str:
+         return json.dumps(self.jsonValue(), separators=(",", ":"), sort_keys=True)
+
+     def needConversion(self) -> bool:
+         """
+         Does this type need conversion between a Python object and an internal SQL object?
+
+         This is used to avoid the unnecessary conversion for ArrayType/MapType/StructType.
+         """
+         return False
+
+     def toInternal(self, obj: Any) -> Any:
+         """
+         Converts a Python object into an internal SQL object.
+         """
+         return obj
+
+     def fromInternal(self, obj: Any) -> Any:
+         """
+         Converts an internal SQL object into a native Python object.
+         """
+         return obj
+
+
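[editorial sketch: illustrative only, not part of the packaged diff] The helpers above compose: typeName() is derived from the class name, simpleString() defaults to typeName(), and json() serializes jsonValue(). A doctest-style sketch, assuming the module is importable as pyspark.sql.types:

    >>> from pyspark.sql.types import StringType
    >>> StringType().typeName()
    'string'
    >>> StringType().simpleString()
    'string'
    >>> StringType().json()
    '"string"'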
+ # This singleton pattern does not work with pickle; you will get
+ # another object after pickle and unpickle.
+ class DataTypeSingleton(type):
+     """Metaclass for DataType"""
+
+     _instances: ClassVar[Dict[Type["DataTypeSingleton"], "DataTypeSingleton"]] = {}
+
+     def __call__(cls: Type[T]) -> T:
+         if cls not in cls._instances:  # type: ignore[attr-defined]
+             cls._instances[cls] = super(  # type: ignore[misc, attr-defined]
+                 DataTypeSingleton, cls
+             ).__call__()
+         return cls._instances[cls]  # type: ignore[attr-defined]
+
+
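[editorial sketch: illustrative only, not part of the packaged diff] The metaclass above makes each parameterless type a process-wide singleton, so identity comparison holds for fresh construction; as the comment notes, an instance that round-trips through pickle is a different object (though still equal under __eq__):

    >>> from pyspark.sql.types import StringType
    >>> StringType() is StringType()
    True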
+ class NullType(DataType, metaclass=DataTypeSingleton):
+     """Null type.
+
+     The data type representing None, used for the types that cannot be inferred.
+     """
+
+     @classmethod
+     def typeName(cls) -> str:
+         return "void"
+
+
+ class AtomicType(DataType):
+     """An internal type used to represent everything that is not
+     null, a UDT, an array, a struct, or a map."""
+
+
+ class NumericType(AtomicType):
+     """Numeric data types."""
+
+
+ class IntegralType(NumericType, metaclass=DataTypeSingleton):
+     """Integral data types."""
+
+     pass
+
+
+ class FractionalType(NumericType):
+     """Fractional data types."""
+
+
+ class StringType(AtomicType, metaclass=DataTypeSingleton):
+     """String data type."""
+
+     pass
+
+
+ class CharType(AtomicType):
+     """Char data type
+
+     Parameters
+     ----------
+     length : int
+         the length limitation.
+     """
+
+     def __init__(self, length: int):
+         self.length = length
+
+     def simpleString(self) -> str:
+         return "char(%d)" % (self.length)
+
+     def jsonValue(self) -> str:
+         return "char(%d)" % (self.length)
+
+     def __repr__(self) -> str:
+         return "CharType(%d)" % (self.length)
+
+
+ class VarcharType(AtomicType):
+     """Varchar data type
+
+     Parameters
+     ----------
+     length : int
+         the length limitation.
+     """
+
+     def __init__(self, length: int):
+         self.length = length
+
+     def simpleString(self) -> str:
+         return "varchar(%d)" % (self.length)
+
+     def jsonValue(self) -> str:
+         return "varchar(%d)" % (self.length)
+
+     def __repr__(self) -> str:
+         return "VarcharType(%d)" % (self.length)
+
+
+ class BinaryType(AtomicType, metaclass=DataTypeSingleton):
+     """Binary (byte array) data type."""
+
+     pass
+
+
+ class BooleanType(AtomicType, metaclass=DataTypeSingleton):
+     """Boolean data type."""
+
+     pass
+
+
+ class DateType(AtomicType, metaclass=DataTypeSingleton):
+     """Date (datetime.date) data type."""
+
+     EPOCH_ORDINAL = datetime.datetime(1970, 1, 1).toordinal()
+
+     def needConversion(self) -> bool:
+         return True
+
+     def toInternal(self, d: datetime.date) -> int:
+         if d is not None:
+             return d.toordinal() - self.EPOCH_ORDINAL
+
+     def fromInternal(self, v: int) -> datetime.date:
+         if v is not None:
+             return datetime.date.fromordinal(v + self.EPOCH_ORDINAL)
+
+
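[editorial sketch: illustrative only, not part of the packaged diff] DateType stores dates internally as days since the Unix epoch, so toInternal and fromInternal are exact inverses:

    >>> import datetime
    >>> from pyspark.sql.types import DateType
    >>> DateType().toInternal(datetime.date(1970, 1, 2))
    1
    >>> DateType().fromInternal(1)
    datetime.date(1970, 1, 2)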
+ class TimestampType(AtomicType, metaclass=DataTypeSingleton):
+     """Timestamp (datetime.datetime) data type."""
+
+     def needConversion(self) -> bool:
+         return True
+
+     def toInternal(self, dt: datetime.datetime) -> int:
+         if dt is not None:
+             seconds = (
+                 calendar.timegm(dt.utctimetuple()) if dt.tzinfo else time.mktime(dt.timetuple())
+             )
+             return int(seconds) * 1000000 + dt.microsecond
+
+     def fromInternal(self, ts: int) -> datetime.datetime:
+         if ts is not None:
+             # using int to avoid precision loss in float
+             return datetime.datetime.fromtimestamp(ts // 1000000).replace(microsecond=ts % 1000000)
+
+
+ class TimestampNTZType(AtomicType, metaclass=DataTypeSingleton):
+     """Timestamp (datetime.datetime) data type without timezone information."""
+
+     def needConversion(self) -> bool:
+         return True
+
+     @classmethod
+     def typeName(cls) -> str:
+         return "timestamp_ntz"
+
+     def toInternal(self, dt: datetime.datetime) -> int:
+         if dt is not None:
+             seconds = calendar.timegm(dt.timetuple())
+             return int(seconds) * 1000000 + dt.microsecond
+
+     def fromInternal(self, ts: int) -> datetime.datetime:
+         if ts is not None:
+             # using int to avoid precision loss in float
+             return datetime.datetime.utcfromtimestamp(ts // 1000000).replace(
+                 microsecond=ts % 1000000
+             )
+
+
+ class DecimalType(FractionalType):
+     """Decimal (decimal.Decimal) data type.
+
+     The DecimalType must have fixed precision (the maximum total number of digits)
+     and scale (the number of digits on the right of the dot). For example, (5, 2) can
+     support values from -999.99 to 999.99.
+
+     The precision can be up to 38; the scale must be less than or equal to the precision.
+
+     When creating a DecimalType, the default precision and scale is (10, 0). When inferring
+     schema from decimal.Decimal objects, it will be DecimalType(38, 18).
+
+     Parameters
+     ----------
+     precision : int, optional
+         the maximum (i.e. total) number of digits (default: 10)
+     scale : int, optional
+         the number of digits on the right side of the dot (default: 0)
+     """
+
+     def __init__(self, precision: int = 10, scale: int = 0):
+         self.precision = precision
+         self.scale = scale
+         self.hasPrecisionInfo = True  # this is a public API
+
+     def simpleString(self) -> str:
+         return "decimal(%d,%d)" % (self.precision, self.scale)
+
+     def jsonValue(self) -> str:
+         return "decimal(%d,%d)" % (self.precision, self.scale)
+
+     def __repr__(self) -> str:
+         return "DecimalType(%d,%d)" % (self.precision, self.scale)
+
+
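[editorial sketch: illustrative only, not part of the packaged diff] Precision and scale feed directly into the string forms defined above:

    >>> from pyspark.sql.types import DecimalType
    >>> DecimalType().simpleString()
    'decimal(10,0)'
    >>> DecimalType(5, 2).simpleString()
    'decimal(5,2)'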
+ class DoubleType(FractionalType, metaclass=DataTypeSingleton):
+     """Double data type, representing double precision floats."""
+
+     pass
+
+
+ class FloatType(FractionalType, metaclass=DataTypeSingleton):
+     """Float data type, representing single precision floats."""
+
+     pass
+
+
+ class ByteType(IntegralType):
+     """Byte data type, i.e. a signed integer in a single byte."""
+
+     def simpleString(self) -> str:
+         return "tinyint"
+
+
+ class IntegerType(IntegralType):
+     """Int data type, i.e. a signed 32-bit integer."""
+
+     def simpleString(self) -> str:
+         return "int"
+
+
+ class LongType(IntegralType):
+     """Long data type, i.e. a signed 64-bit integer.
+
+     If the values are beyond the range of [-9223372036854775808, 9223372036854775807],
+     please use :class:`DecimalType`.
+     """
+
+     def simpleString(self) -> str:
+         return "bigint"
+
+
+ class ShortType(IntegralType):
+     """Short data type, i.e. a signed 16-bit integer."""
+
+     def simpleString(self) -> str:
+         return "smallint"
+
+
+ class AnsiIntervalType(AtomicType):
+     """The interval type which conforms to the ANSI SQL standard."""
+
+     pass
+
+
+ class DayTimeIntervalType(AnsiIntervalType):
+     """DayTimeIntervalType (datetime.timedelta)."""
+
+     DAY = 0
+     HOUR = 1
+     MINUTE = 2
+     SECOND = 3
+
+     _fields = {
+         DAY: "day",
+         HOUR: "hour",
+         MINUTE: "minute",
+         SECOND: "second",
+     }
+
+     _inverted_fields = dict(zip(_fields.values(), _fields.keys()))
+
+     def __init__(self, startField: Optional[int] = None, endField: Optional[int] = None):
+         if startField is None and endField is None:
+             # Default matched to scala side.
+             startField = DayTimeIntervalType.DAY
+             endField = DayTimeIntervalType.SECOND
+         elif startField is not None and endField is None:
+             endField = startField
+
+         fields = DayTimeIntervalType._fields
+         if startField not in fields.keys() or endField not in fields.keys():
+             raise RuntimeError("interval %s to %s is invalid" % (startField, endField))
+         self.startField = cast(int, startField)
+         self.endField = cast(int, endField)
+
+     def _str_repr(self) -> str:
+         fields = DayTimeIntervalType._fields
+         start_field_name = fields[self.startField]
+         end_field_name = fields[self.endField]
+         if start_field_name == end_field_name:
+             return "interval %s" % start_field_name
+         else:
+             return "interval %s to %s" % (start_field_name, end_field_name)
+
+     simpleString = _str_repr
+
+     jsonValue = _str_repr
+
+     def __repr__(self) -> str:
+         return "%s(%d, %d)" % (type(self).__name__, self.startField, self.endField)
+
+     def needConversion(self) -> bool:
+         return True
+
+     def toInternal(self, dt: datetime.timedelta) -> Optional[int]:
+         if dt is not None:
+             return (((dt.days * 86400) + dt.seconds) * 1_000_000) + dt.microseconds
+
+     def fromInternal(self, micros: int) -> Optional[datetime.timedelta]:
+         if micros is not None:
+             return datetime.timedelta(microseconds=micros)
+
+
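[editorial sketch: illustrative only, not part of the packaged diff] Day-time intervals are stored as a single microsecond count, per the arithmetic in toInternal above:

    >>> import datetime
    >>> from pyspark.sql.types import DayTimeIntervalType
    >>> DayTimeIntervalType().toInternal(datetime.timedelta(days=1, seconds=2, microseconds=3))
    86402000003
    >>> DayTimeIntervalType().fromInternal(86402000003)
    datetime.timedelta(days=1, seconds=2, microseconds=3)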
+ class YearMonthIntervalType(AnsiIntervalType):
+     """YearMonthIntervalType, represents year-month intervals of the SQL standard"""
+
+     YEAR = 0
+     MONTH = 1
+
+     _fields = {
+         YEAR: "year",
+         MONTH: "month",
+     }
+
+     _inverted_fields = dict(zip(_fields.values(), _fields.keys()))
+
+     def __init__(self, startField: Optional[int] = None, endField: Optional[int] = None):
+         if startField is None and endField is None:
+             # Default matched to scala side.
+             startField = YearMonthIntervalType.YEAR
+             endField = YearMonthIntervalType.MONTH
+         elif startField is not None and endField is None:
+             endField = startField
+
+         fields = YearMonthIntervalType._fields
+         if startField not in fields.keys() or endField not in fields.keys():
+             raise RuntimeError("interval %s to %s is invalid" % (startField, endField))
+         self.startField = cast(int, startField)
+         self.endField = cast(int, endField)
+
+     def _str_repr(self) -> str:
+         fields = YearMonthIntervalType._fields
+         start_field_name = fields[self.startField]
+         end_field_name = fields[self.endField]
+         if start_field_name == end_field_name:
+             return "interval %s" % start_field_name
+         else:
+             return "interval %s to %s" % (start_field_name, end_field_name)
+
+     simpleString = _str_repr
+
+     jsonValue = _str_repr
+
+     def __repr__(self) -> str:
+         return "%s(%d, %d)" % (type(self).__name__, self.startField, self.endField)
+
+
+ class ArrayType(DataType):
+     """Array data type.
+
+     Parameters
+     ----------
+     elementType : :class:`DataType`
+         :class:`DataType` of each element in the array.
+     containsNull : bool, optional
+         whether the array can contain null (None) values.
+
+     Examples
+     --------
+     >>> from pyspark.sql.types import ArrayType, StringType, StructField, StructType
+
+     The below example demonstrates how to create :class:`ArrayType`:
+
+     >>> arr = ArrayType(StringType())
+
+     The array can contain null (None) values by default:
+
+     >>> ArrayType(StringType()) == ArrayType(StringType(), True)
+     True
+     >>> ArrayType(StringType(), False) == ArrayType(StringType())
+     False
+     """
+
+     def __init__(self, elementType: DataType, containsNull: bool = True):
+         assert isinstance(elementType, DataType), "elementType %s should be an instance of %s" % (
+             elementType,
+             DataType,
+         )
+         self.elementType = elementType
+         self.containsNull = containsNull
+
+     def simpleString(self) -> str:
+         return "array<%s>" % self.elementType.simpleString()
+
+     def __repr__(self) -> str:
+         return "ArrayType(%s, %s)" % (self.elementType, str(self.containsNull))
+
+     def jsonValue(self) -> Dict[str, Any]:
+         return {
+             "type": self.typeName(),
+             "elementType": self.elementType.jsonValue(),
+             "containsNull": self.containsNull,
+         }
+
+     @classmethod
+     def fromJson(cls, json: Dict[str, Any]) -> "ArrayType":
+         return ArrayType(_parse_datatype_json_value(json["elementType"]), json["containsNull"])
+
+     def needConversion(self) -> bool:
+         return self.elementType.needConversion()
+
+     def toInternal(self, obj: List[Optional[T]]) -> List[Optional[T]]:
+         if not self.needConversion():
+             return obj
+         return obj and [self.elementType.toInternal(v) for v in obj]
+
+     def fromInternal(self, obj: List[Optional[T]]) -> List[Optional[T]]:
+         if not self.needConversion():
+             return obj
+         return obj and [self.elementType.fromInternal(v) for v in obj]
+
+
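[editorial sketch: illustrative only, not part of the packaged diff] ArrayType only pays the per-element conversion cost when its element type needs it, which is what the needConversion() short-circuit above buys:

    >>> import datetime
    >>> from pyspark.sql.types import ArrayType, DateType, StringType
    >>> ArrayType(StringType()).needConversion()
    False
    >>> ArrayType(DateType()).toInternal([datetime.date(1970, 1, 2), None])
    [1, None]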
+ class MapType(DataType):
+     """Map data type.
+
+     Parameters
+     ----------
+     keyType : :class:`DataType`
+         :class:`DataType` of the keys in the map.
+     valueType : :class:`DataType`
+         :class:`DataType` of the values in the map.
+     valueContainsNull : bool, optional
+         indicates whether values can contain null (None) values.
+
+     Notes
+     -----
+     Keys in a map data type are not allowed to be null (None).
+
+     Examples
+     --------
+     >>> from pyspark.sql.types import IntegerType, FloatType, MapType, StringType
+
+     The below example demonstrates how to create :class:`MapType`:
+
+     >>> map_type = MapType(StringType(), IntegerType())
+
+     The values of the map can contain null (``None``) values by default:
+
+     >>> (MapType(StringType(), IntegerType())
+     ...     == MapType(StringType(), IntegerType(), True))
+     True
+     >>> (MapType(StringType(), IntegerType(), False)
+     ...     == MapType(StringType(), FloatType()))
+     False
+     """
+
+     def __init__(self, keyType: DataType, valueType: DataType, valueContainsNull: bool = True):
+         assert isinstance(keyType, DataType), "keyType %s should be an instance of %s" % (
+             keyType,
+             DataType,
+         )
+         assert isinstance(valueType, DataType), "valueType %s should be an instance of %s" % (
+             valueType,
+             DataType,
+         )
+         self.keyType = keyType
+         self.valueType = valueType
+         self.valueContainsNull = valueContainsNull
+
+     def simpleString(self) -> str:
+         return "map<%s,%s>" % (self.keyType.simpleString(), self.valueType.simpleString())
+
+     def __repr__(self) -> str:
+         return "MapType(%s, %s, %s)" % (self.keyType, self.valueType, str(self.valueContainsNull))
+
+     def jsonValue(self) -> Dict[str, Any]:
+         return {
+             "type": self.typeName(),
+             "keyType": self.keyType.jsonValue(),
+             "valueType": self.valueType.jsonValue(),
+             "valueContainsNull": self.valueContainsNull,
+         }
+
+     @classmethod
+     def fromJson(cls, json: Dict[str, Any]) -> "MapType":
+         return MapType(
+             _parse_datatype_json_value(json["keyType"]),
+             _parse_datatype_json_value(json["valueType"]),
+             json["valueContainsNull"],
+         )
+
+     def needConversion(self) -> bool:
+         return self.keyType.needConversion() or self.valueType.needConversion()
+
+     def toInternal(self, obj: Dict[T, Optional[U]]) -> Dict[T, Optional[U]]:
+         if not self.needConversion():
+             return obj
+         return obj and dict(
+             (self.keyType.toInternal(k), self.valueType.toInternal(v)) for k, v in obj.items()
+         )
+
+     def fromInternal(self, obj: Dict[T, Optional[U]]) -> Dict[T, Optional[U]]:
+         if not self.needConversion():
+             return obj
+         return obj and dict(
+             (self.keyType.fromInternal(k), self.valueType.fromInternal(v)) for k, v in obj.items()
+         )
+
+
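[editorial sketch: illustrative only, not part of the packaged diff] MapType converts keys and values independently; a string-keyed map of dates only converts the values:

    >>> import datetime
    >>> from pyspark.sql.types import DateType, MapType, StringType
    >>> MapType(StringType(), DateType()).toInternal({"d": datetime.date(1970, 1, 2)})
    {'d': 1}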
+ class StructField(DataType):
+     """A field in :class:`StructType`.
+
+     Parameters
+     ----------
+     name : str
+         name of the field.
+     dataType : :class:`DataType`
+         :class:`DataType` of the field.
+     nullable : bool, optional
+         whether the field can be null (None) or not.
+     metadata : dict, optional
+         a dict from string to simple type that can be serialized to JSON automatically
+
+     Examples
+     --------
+     >>> from pyspark.sql.types import StringType, StructField
+     >>> (StructField("f1", StringType(), True)
+     ...     == StructField("f1", StringType(), True))
+     True
+     >>> (StructField("f1", StringType(), True)
+     ...     == StructField("f2", StringType(), True))
+     False
+     """
+
+     def __init__(
+         self,
+         name: str,
+         dataType: DataType,
+         nullable: bool = True,
+         metadata: Optional[Dict[str, Any]] = None,
+     ):
+         assert isinstance(dataType, DataType), "dataType %s should be an instance of %s" % (
+             dataType,
+             DataType,
+         )
+         assert isinstance(name, str), "field name %s should be a string" % (name)
+         self.name = name
+         self.dataType = dataType
+         self.nullable = nullable
+         self.metadata = metadata or {}
+
+     def simpleString(self) -> str:
+         return "%s:%s" % (self.name, self.dataType.simpleString())
+
+     def __repr__(self) -> str:
+         return "StructField('%s', %s, %s)" % (self.name, self.dataType, str(self.nullable))
+
+     def jsonValue(self) -> Dict[str, Any]:
+         return {
+             "name": self.name,
+             "type": self.dataType.jsonValue(),
+             "nullable": self.nullable,
+             "metadata": self.metadata,
+         }
+
+     @classmethod
+     def fromJson(cls, json: Dict[str, Any]) -> "StructField":
+         return StructField(
+             json["name"],
+             _parse_datatype_json_value(json["type"]),
+             json["nullable"],
+             json["metadata"],
+         )
+
+     def needConversion(self) -> bool:
+         return self.dataType.needConversion()
+
+     def toInternal(self, obj: T) -> T:
+         return self.dataType.toInternal(obj)
+
+     def fromInternal(self, obj: T) -> T:
+         return self.dataType.fromInternal(obj)
+
+     def typeName(self) -> str:  # type: ignore[override]
+         raise PySparkTypeError(
+             error_class="INVALID_TYPENAME_CALL",
+             message_parameters={},
+         )
+
+
729
+ class StructType(DataType):
730
+ """Struct type, consisting of a list of :class:`StructField`.
731
+
732
+ This is the data type representing a :class:`Row`.
733
+
734
+ Iterating a :class:`StructType` will iterate over its :class:`StructField`\\s.
735
+ A contained :class:`StructField` can be accessed by its name or position.
736
+
737
+ Examples
738
+ --------
739
+ >>> from pyspark.sql.types import *
740
+ >>> struct1 = StructType([StructField("f1", StringType(), True)])
741
+ >>> struct1["f1"]
742
+ StructField('f1', StringType(), True)
743
+ >>> struct1[0]
744
+ StructField('f1', StringType(), True)
745
+
746
+ >>> struct1 = StructType([StructField("f1", StringType(), True)])
747
+ >>> struct2 = StructType([StructField("f1", StringType(), True)])
748
+ >>> struct1 == struct2
749
+ True
750
+ >>> struct1 = StructType([StructField("f1", CharType(10), True)])
751
+ >>> struct2 = StructType([StructField("f1", CharType(10), True)])
752
+ >>> struct1 == struct2
753
+ True
754
+ >>> struct1 = StructType([StructField("f1", VarcharType(10), True)])
755
+ >>> struct2 = StructType([StructField("f1", VarcharType(10), True)])
756
+ >>> struct1 == struct2
757
+ True
758
+ >>> struct1 = StructType([StructField("f1", StringType(), True)])
759
+ >>> struct2 = StructType([StructField("f1", StringType(), True),
760
+ ... StructField("f2", IntegerType(), False)])
761
+ >>> struct1 == struct2
762
+ False
763
+
764
+ The below example demonstrates how to create a DataFrame based on a struct created
765
+ using class:`StructType` and class:`StructField`:
766
+
767
+ >>> data = [("Alice", ["Java", "Scala"]), ("Bob", ["Python", "Scala"])]
768
+ >>> schema = StructType([
769
+ ... StructField("name", StringType()),
770
+ ... StructField("languagesSkills", ArrayType(StringType())),
771
+ ... ])
772
+ >>> df = spark.createDataFrame(data=data, schema=schema)
773
+ >>> df.printSchema()
774
+ root
775
+ |-- name: string (nullable = true)
776
+ |-- languagesSkills: array (nullable = true)
777
+ | |-- element: string (containsNull = true)
778
+ >>> df.show()
779
+ +-----+---------------+
780
+ | name|languagesSkills|
781
+ +-----+---------------+
782
+ |Alice| [Java, Scala]|
783
+ | Bob|[Python, Scala]|
784
+ +-----+---------------+
785
+ """
786
+
787
+ def __init__(self, fields: Optional[List[StructField]] = None):
788
+ if not fields:
789
+ self.fields = []
790
+ self.names = []
791
+ else:
792
+ self.fields = fields
793
+ self.names = [f.name for f in fields]
794
+ assert all(
795
+ isinstance(f, StructField) for f in fields
796
+ ), "fields should be a list of StructField"
797
+ # Precalculated list of fields that need conversion with fromInternal/toInternal functions
798
+ self._needConversion = [f.needConversion() for f in self]
799
+ self._needSerializeAnyField = any(self._needConversion)
800
+
801
+ @overload
802
+ def add(
803
+ self,
804
+ field: str,
805
+ data_type: Union[str, DataType],
806
+ nullable: bool = True,
807
+ metadata: Optional[Dict[str, Any]] = None,
808
+ ) -> "StructType":
809
+ ...
810
+
811
+ @overload
812
+ def add(self, field: StructField) -> "StructType":
813
+ ...
814
+
815
+ def add(
816
+ self,
817
+ field: Union[str, StructField],
818
+ data_type: Optional[Union[str, DataType]] = None,
819
+ nullable: bool = True,
820
+ metadata: Optional[Dict[str, Any]] = None,
821
+ ) -> "StructType":
822
+ """
823
+ Construct a :class:`StructType` by adding new elements to it, to define the schema.
824
+ The method accepts either:
825
+
826
+ a) A single parameter which is a :class:`StructField` object.
827
+ b) Between 2 and 4 parameters as (name, data_type, nullable (optional),
828
+ metadata(optional). The data_type parameter may be either a String or a
829
+ :class:`DataType` object.
830
+
831
+ Parameters
832
+ ----------
833
+ field : str or :class:`StructField`
834
+ Either the name of the field or a :class:`StructField` object
835
+ data_type : :class:`DataType`, optional
836
+ If present, the DataType of the :class:`StructField` to create
837
+ nullable : bool, optional
838
+ Whether the field to add should be nullable (default True)
839
+ metadata : dict, optional
840
+ Any additional metadata (default None)
841
+
842
+ Returns
843
+ -------
844
+ :class:`StructType`
845
+
846
+ Examples
847
+ --------
848
+ >>> from pyspark.sql.types import IntegerType, StringType, StructField, StructType
849
+ >>> struct1 = StructType().add("f1", StringType(), True).add("f2", StringType(), True, None)
850
+ >>> struct2 = StructType([StructField("f1", StringType(), True),
851
+ ... StructField("f2", StringType(), True, None)])
852
+ >>> struct1 == struct2
853
+ True
854
+ >>> struct1 = StructType().add(StructField("f1", StringType(), True))
855
+ >>> struct2 = StructType([StructField("f1", StringType(), True)])
856
+ >>> struct1 == struct2
857
+ True
858
+ >>> struct1 = StructType().add("f1", "string", True)
859
+ >>> struct2 = StructType([StructField("f1", StringType(), True)])
860
+ >>> struct1 == struct2
861
+ True
862
+ """
863
+ if isinstance(field, StructField):
864
+ self.fields.append(field)
865
+ self.names.append(field.name)
866
+ else:
867
+ if isinstance(field, str) and data_type is None:
868
+ raise PySparkValueError(
869
+ error_class="ARGUMENT_REQUIRED",
870
+ message_parameters={
871
+ "arg_name": "data_type",
872
+ "condition": "passing name of struct_field to create",
873
+ },
874
+ )
875
+
876
+ if isinstance(data_type, str):
877
+ data_type_f = _parse_datatype_json_value(data_type)
878
+ else:
879
+ data_type_f = data_type
880
+ self.fields.append(StructField(field, data_type_f, nullable, metadata))
881
+ self.names.append(field)
882
+ # Precalculated list of fields that need conversion with fromInternal/toInternal functions
883
+ self._needConversion = [f.needConversion() for f in self]
884
+ self._needSerializeAnyField = any(self._needConversion)
885
+ return self
886
+
887
+ def __iter__(self) -> Iterator[StructField]:
888
+ """Iterate the fields"""
889
+ return iter(self.fields)
890
+
891
+ def __len__(self) -> int:
892
+ """Return the number of fields."""
893
+ return len(self.fields)
894
+
895
+ def __getitem__(self, key: Union[str, int]) -> StructField:
896
+ """Access fields by name or slice."""
897
+ if isinstance(key, str):
898
+ for field in self:
899
+ if field.name == key:
900
+ return field
901
+ raise KeyError("No StructField named {0}".format(key))
902
+ elif isinstance(key, int):
903
+ try:
904
+ return self.fields[key]
905
+ except IndexError:
906
+ raise IndexError("StructType index out of range")
907
+ elif isinstance(key, slice):
908
+ return StructType(self.fields[key])
909
+ else:
910
+ raise PySparkTypeError(
911
+ error_class="NOT_INT_OR_SLICE_OR_STR",
912
+ message_parameters={"arg_name": "key", "arg_type": type(key).__name__},
913
+ )
914
+
915
+ def simpleString(self) -> str:
916
+ return "struct<%s>" % (",".join(f.simpleString() for f in self))
917
+
918
+ def __repr__(self) -> str:
919
+ return "StructType([%s])" % ", ".join(str(field) for field in self)
920
+
921
+ def jsonValue(self) -> Dict[str, Any]:
922
+ return {"type": self.typeName(), "fields": [f.jsonValue() for f in self]}
923
+
924
+     @classmethod
+     def fromJson(cls, json: Dict[str, Any]) -> "StructType":
+         """
+         Constructs :class:`StructType` from a schema defined in JSON format.
+
+         Below is a JSON schema it must adhere to::
+
+             {
+               "title":"StructType",
+               "description":"Schema of StructType in json format",
+               "type":"object",
+               "properties":{
+                  "fields":{
+                     "description":"Array of struct fields",
+                     "type":"array",
+                     "items":{
+                         "type":"object",
+                         "properties":{
+                            "name":{
+                               "description":"Name of the field",
+                               "type":"string"
+                            },
+                            "type":{
+                               "description": "Type of the field. Can either be
+                                               another nested StructType or primitive type",
+                               "type":"object/string"
+                            },
+                            "nullable":{
+                               "description":"If nulls are allowed",
+                               "type":"boolean"
+                            },
+                            "metadata":{
+                               "description":"Additional metadata to supply",
+                               "type":"object"
+                            },
+                            "required":[
+                               "name",
+                               "type",
+                               "nullable",
+                               "metadata"
+                            ]
+                         }
+                     }
+                  }
+               }
+             }
+
+         Parameters
+         ----------
+         json : dict or a dict-like object e.g. JSON object
+             This "dict" must have "fields" key that returns an array of fields
+             each of which must have specific keys (name, type, nullable, metadata).
+
+         Returns
+         -------
+         :class:`StructType`
+
+         Examples
+         --------
+         >>> json_str = '''
+         ... {
+         ...     "fields": [
+         ...         {
+         ...             "metadata": {},
+         ...             "name": "Person",
+         ...             "nullable": true,
+         ...             "type": {
+         ...                 "fields": [
+         ...                     {
+         ...                         "metadata": {},
+         ...                         "name": "name",
+         ...                         "nullable": false,
+         ...                         "type": "string"
+         ...                     },
+         ...                     {
+         ...                         "metadata": {},
+         ...                         "name": "surname",
+         ...                         "nullable": false,
+         ...                         "type": "string"
+         ...                     }
+         ...                 ],
+         ...                 "type": "struct"
+         ...             }
+         ...         }
+         ...     ],
+         ...     "type": "struct"
+         ... }
+         ... '''
+         >>> import json
+         >>> scheme = StructType.fromJson(json.loads(json_str))
+         >>> scheme.simpleString()
+         'struct<Person:struct<name:string,surname:string>>'
+         """
+         return StructType([StructField.fromJson(f) for f in json["fields"]])
+
+     def fieldNames(self) -> List[str]:
+         """
+         Returns all field names in a list.
+
+         Examples
+         --------
+         >>> from pyspark.sql.types import StringType, StructField, StructType
+         >>> struct = StructType([StructField("f1", StringType(), True)])
+         >>> struct.fieldNames()
+         ['f1']
+         """
+         return list(self.names)
+
+     def needConversion(self) -> bool:
+         # We need convert Row()/namedtuple into tuple()
+         return True
+
+     def toInternal(self, obj: Tuple) -> Tuple:
+         if obj is None:
+             return
+
+         if self._needSerializeAnyField:
+             # Only calling toInternal function for fields that need conversion
+             if isinstance(obj, dict):
+                 return tuple(
+                     f.toInternal(obj.get(n)) if c else obj.get(n)
+                     for n, f, c in zip(self.names, self.fields, self._needConversion)
+                 )
+             elif isinstance(obj, (tuple, list)):
+                 return tuple(
+                     f.toInternal(v) if c else v
+                     for f, v, c in zip(self.fields, obj, self._needConversion)
+                 )
+             elif hasattr(obj, "__dict__"):
+                 d = obj.__dict__
+                 return tuple(
+                     f.toInternal(d.get(n)) if c else d.get(n)
+                     for n, f, c in zip(self.names, self.fields, self._needConversion)
+                 )
+             else:
+                 raise PySparkValueError(
+                     error_class="UNEXPECTED_TUPLE_WITH_STRUCT",
+                     message_parameters={"tuple": str(obj)},
+                 )
+         else:
+             if isinstance(obj, dict):
+                 return tuple(obj.get(n) for n in self.names)
+             elif isinstance(obj, (list, tuple)):
+                 return tuple(obj)
+             elif hasattr(obj, "__dict__"):
+                 d = obj.__dict__
+                 return tuple(d.get(n) for n in self.names)
+             else:
+                 raise PySparkValueError(
+                     error_class="UNEXPECTED_TUPLE_WITH_STRUCT",
+                     message_parameters={"tuple": str(obj)},
+                 )
+
+     def fromInternal(self, obj: Tuple) -> "Row":
+         if obj is None:
+             return
+         if isinstance(obj, Row):
+             # it's already converted by pickler
+             return obj
+
+         values: Union[Tuple, List]
+         if self._needSerializeAnyField:
+             # Only calling fromInternal function for fields that need conversion
+             values = [
+                 f.fromInternal(v) if c else v
+                 for f, v, c in zip(self.fields, obj, self._needConversion)
+             ]
+         else:
+             values = obj
+         return _create_row(self.names, values)
+
+
+ class UserDefinedType(DataType):
+     """User-defined type (UDT).
+
+     .. note:: WARN: Spark Internal Use Only
+     """
+
+     @classmethod
+     def typeName(cls) -> str:
+         return cls.__name__.lower()
+
+     @classmethod
+     def sqlType(cls) -> DataType:
+         """
+         Underlying SQL storage type for this UDT.
+         """
+         raise PySparkNotImplementedError(
+             error_class="NOT_IMPLEMENTED",
+             message_parameters={"feature": "sqlType()"},
+         )
+
+     @classmethod
+     def module(cls) -> str:
+         """
+         The Python module of the UDT.
+         """
+         raise PySparkNotImplementedError(
+             error_class="NOT_IMPLEMENTED",
+             message_parameters={"feature": "module()"},
+         )
+
+     @classmethod
+     def scalaUDT(cls) -> str:
+         """
+         The class name of the paired Scala UDT (could be '', if there
+         is no corresponding one).
+         """
+         return ""
+
+     def needConversion(self) -> bool:
+         return True
+
+     @classmethod
+     def _cachedSqlType(cls) -> DataType:
+         """
+         Cache the sqlType() into class, because it's heavily used in `toInternal`.
+         """
+         if not hasattr(cls, "_cached_sql_type"):
+             cls._cached_sql_type = cls.sqlType()  # type: ignore[attr-defined]
+         return cls._cached_sql_type  # type: ignore[attr-defined]
+
+     def toInternal(self, obj: Any) -> Any:
+         if obj is not None:
+             return self._cachedSqlType().toInternal(self.serialize(obj))
+
+     def fromInternal(self, obj: Any) -> Any:
+         v = self._cachedSqlType().fromInternal(obj)
+         if v is not None:
+             return self.deserialize(v)
+
+     def serialize(self, obj: Any) -> Any:
+         """
+         Converts a user-type object into a SQL datum.
+         """
+         raise PySparkNotImplementedError(
+             error_class="NOT_IMPLEMENTED",
+             message_parameters={"feature": "toInternal()"},
+         )
+
+     def deserialize(self, datum: Any) -> Any:
+         """
+         Converts a SQL datum into a user-type object.
+         """
+         raise PySparkNotImplementedError(
+             error_class="NOT_IMPLEMENTED",
+             message_parameters={"feature": "fromInternal()"},
+         )
+
+     def simpleString(self) -> str:
+         return "udt"
+
+     def json(self) -> str:
+         return json.dumps(self.jsonValue(), separators=(",", ":"), sort_keys=True)
+
+     def jsonValue(self) -> Dict[str, Any]:
+         if self.scalaUDT():
+             assert self.module() != "__main__", "UDT in __main__ cannot work with ScalaUDT"
+             schema = {
+                 "type": "udt",
+                 "class": self.scalaUDT(),
+                 "pyClass": "%s.%s" % (self.module(), type(self).__name__),
+                 "sqlType": self.sqlType().jsonValue(),
+             }
+         else:
+             ser = CloudPickleSerializer()
+             b = ser.dumps(type(self))
+             schema = {
+                 "type": "udt",
+                 "pyClass": "%s.%s" % (self.module(), type(self).__name__),
+                 "serializedClass": base64.b64encode(b).decode("utf8"),
+                 "sqlType": self.sqlType().jsonValue(),
+             }
+         return schema
+
+     @classmethod
+     def fromJson(cls, json: Dict[str, Any]) -> "UserDefinedType":
+         pyUDT = str(json["pyClass"])  # convert unicode to str
+         split = pyUDT.rfind(".")
+         pyModule = pyUDT[:split]
+         pyClass = pyUDT[split + 1 :]
+         m = __import__(pyModule, globals(), locals(), [pyClass])
+         if not hasattr(m, pyClass):
+             s = base64.b64decode(json["serializedClass"].encode("utf-8"))
+             UDT = CloudPickleSerializer().loads(s)
+         else:
+             UDT = getattr(m, pyClass)
+         return UDT()
+
+     def __eq__(self, other: Any) -> bool:
+         return type(self) == type(other)
+
+
+ _atomic_types: List[Type[DataType]] = [
+     StringType,
+     CharType,
+     VarcharType,
+     BinaryType,
+     BooleanType,
+     DecimalType,
+     FloatType,
+     DoubleType,
+     ByteType,
+     ShortType,
+     IntegerType,
+     LongType,
+     DateType,
+     TimestampType,
+     TimestampNTZType,
+     NullType,
+ ]
+ _all_atomic_types: Dict[str, Type[DataType]] = dict((t.typeName(), t) for t in _atomic_types)
+
+ _complex_types: List[Type[Union[ArrayType, MapType, StructType]]] = [ArrayType, MapType, StructType]
+ _all_complex_types: Dict[str, Type[Union[ArrayType, MapType, StructType]]] = dict(
+     (v.typeName(), v) for v in _complex_types
+ )
+
+ _LENGTH_CHAR = re.compile(r"char\(\s*(\d+)\s*\)")
+ _LENGTH_VARCHAR = re.compile(r"varchar\(\s*(\d+)\s*\)")
+ _FIXED_DECIMAL = re.compile(r"decimal\(\s*(\d+)\s*,\s*(-?\d+)\s*\)")
+ _INTERVAL_DAYTIME = re.compile(r"interval (day|hour|minute|second)( to (day|hour|minute|second))?")
+ _INTERVAL_YEARMONTH = re.compile(r"interval (year|month)( to (year|month))?")
+
+
+ def _parse_datatype_string(s: str) -> DataType:
+     """
+     Parses the given data type string to a :class:`DataType`. The data type string format equals
+     :class:`DataType.simpleString`, except that the top level struct type can omit
+     the ``struct<>``. Since Spark 2.3, this also supports a schema in a DDL-formatted
+     string and case-insensitive strings.
+
+     Examples
+     --------
+     >>> _parse_datatype_string("int ")
+     IntegerType()
+     >>> _parse_datatype_string("INT ")
+     IntegerType()
+     >>> _parse_datatype_string("a: byte, b: decimal(  16 , 8   ) ")
+     StructType([StructField('a', ByteType(), True), StructField('b', DecimalType(16,8), True)])
+     >>> _parse_datatype_string("a DOUBLE, b STRING")
+     StructType([StructField('a', DoubleType(), True), StructField('b', StringType(), True)])
+     >>> _parse_datatype_string("a DOUBLE, b CHAR( 50 )")
+     StructType([StructField('a', DoubleType(), True), StructField('b', CharType(50), True)])
+     >>> _parse_datatype_string("a DOUBLE, b VARCHAR( 50 )")
+     StructType([StructField('a', DoubleType(), True), StructField('b', VarcharType(50), True)])
+     >>> _parse_datatype_string("a: array< short>")
+     StructType([StructField('a', ArrayType(ShortType(), True), True)])
+     >>> _parse_datatype_string(" map<string , string > ")
+     MapType(StringType(), StringType(), True)
+
+     >>> # Error cases
+     >>> _parse_datatype_string("blabla") # doctest: +IGNORE_EXCEPTION_DETAIL
+     Traceback (most recent call last):
+         ...
+     ParseException:...
+     >>> _parse_datatype_string("a: int,") # doctest: +IGNORE_EXCEPTION_DETAIL
+     Traceback (most recent call last):
+         ...
+     ParseException:...
+     >>> _parse_datatype_string("array<int") # doctest: +IGNORE_EXCEPTION_DETAIL
+     Traceback (most recent call last):
+         ...
+     ParseException:...
+     >>> _parse_datatype_string("map<int, boolean>>") # doctest: +IGNORE_EXCEPTION_DETAIL
+     Traceback (most recent call last):
+         ...
+     ParseException:...
+     """
+     sc = get_active_spark_context()
+
+     def from_ddl_schema(type_str: str) -> DataType:
+         return _parse_datatype_json_string(
+             cast(JVMView, sc._jvm).org.apache.spark.sql.types.StructType.fromDDL(type_str).json()
+         )
+
+     def from_ddl_datatype(type_str: str) -> DataType:
+         return _parse_datatype_json_string(
+             cast(JVMView, sc._jvm)
+             .org.apache.spark.sql.api.python.PythonSQLUtils.parseDataType(type_str)
+             .json()
+         )
+
+     try:
+         # DDL format, "fieldname datatype, fieldname datatype".
+         return from_ddl_schema(s)
+     except Exception as e:
+         try:
+             # For backwards compatibility, "integer", "struct<fieldname: datatype>" and etc.
+             return from_ddl_datatype(s)
+         except BaseException:
+             try:
+                 # For backwards compatibility, "fieldname: datatype, fieldname: datatype" case.
+                 return from_ddl_datatype("struct<%s>" % s.strip())
+             except BaseException:
+                 raise e
+
+
+ def _parse_datatype_json_string(json_string: str) -> DataType:
+     """Parses the given data type JSON string.
+
+     Examples
+     --------
+     >>> import pickle
+     >>> def check_datatype(datatype):
+     ...     pickled = pickle.loads(pickle.dumps(datatype))
+     ...     assert datatype == pickled
+     ...     scala_datatype = spark._jsparkSession.parseDataType(datatype.json())
+     ...     python_datatype = _parse_datatype_json_string(scala_datatype.json())
+     ...     assert datatype == python_datatype
+     ...
+     >>> for cls in _all_atomic_types.values():
+     ...     if cls is not VarcharType and cls is not CharType:
+     ...         check_datatype(cls())
+     ...     else:
+     ...         check_datatype(cls(1))
+
+     >>> # Simple ArrayType.
+     >>> simple_arraytype = ArrayType(StringType(), True)
+     >>> check_datatype(simple_arraytype)
+
+     >>> # Simple MapType.
+     >>> simple_maptype = MapType(StringType(), LongType())
+     >>> check_datatype(simple_maptype)
+
+     >>> # Simple StructType.
+     >>> simple_structtype = StructType([
+     ...     StructField("a", DecimalType(), False),
+     ...     StructField("b", BooleanType(), True),
+     ...     StructField("c", LongType(), True),
+     ...     StructField("d", BinaryType(), False)])
+     >>> check_datatype(simple_structtype)
+
+     >>> # Complex StructType.
+     >>> complex_structtype = StructType([
+     ...     StructField("simpleArray", simple_arraytype, True),
+     ...     StructField("simpleMap", simple_maptype, True),
+     ...     StructField("simpleStruct", simple_structtype, True),
+     ...     StructField("boolean", BooleanType(), False),
+     ...     StructField("chars", CharType(10), False),
+     ...     StructField("words", VarcharType(10), False),
+     ...     StructField("withMeta", DoubleType(), False, {"name": "age"})])
+     >>> check_datatype(complex_structtype)
+
+     >>> # Complex ArrayType.
+     >>> complex_arraytype = ArrayType(complex_structtype, True)
+     >>> check_datatype(complex_arraytype)
+
+     >>> # Complex MapType.
+     >>> complex_maptype = MapType(complex_structtype,
+     ...                           complex_arraytype, False)
+     >>> check_datatype(complex_maptype)
+     """
+     return _parse_datatype_json_value(json.loads(json_string))
+
+
+ def _parse_datatype_json_value(json_value: Union[dict, str]) -> DataType:
+     if not isinstance(json_value, dict):
+         if json_value in _all_atomic_types.keys():
+             return _all_atomic_types[json_value]()
+         elif json_value == "decimal":
+             return DecimalType()
+         elif _FIXED_DECIMAL.match(json_value):
+             m = _FIXED_DECIMAL.match(json_value)
+             return DecimalType(int(m.group(1)), int(m.group(2)))  # type: ignore[union-attr]
+         elif _INTERVAL_DAYTIME.match(json_value):
+             m = _INTERVAL_DAYTIME.match(json_value)
+             inverted_fields = DayTimeIntervalType._inverted_fields
+             first_field = inverted_fields.get(m.group(1))  # type: ignore[union-attr]
+             second_field = inverted_fields.get(m.group(3))  # type: ignore[union-attr]
+             if first_field is not None and second_field is None:
+                 return DayTimeIntervalType(first_field)
+             return DayTimeIntervalType(first_field, second_field)
+         elif _INTERVAL_YEARMONTH.match(json_value):
+             m = _INTERVAL_YEARMONTH.match(json_value)
+             inverted_fields = YearMonthIntervalType._inverted_fields
+             first_field = inverted_fields.get(m.group(1))  # type: ignore[union-attr]
+             second_field = inverted_fields.get(m.group(3))  # type: ignore[union-attr]
+             if first_field is not None and second_field is None:
+                 return YearMonthIntervalType(first_field)
+             return YearMonthIntervalType(first_field, second_field)
+         elif _LENGTH_CHAR.match(json_value):
+             m = _LENGTH_CHAR.match(json_value)
+             return CharType(int(m.group(1)))  # type: ignore[union-attr]
+         elif _LENGTH_VARCHAR.match(json_value):
+             m = _LENGTH_VARCHAR.match(json_value)
+             return VarcharType(int(m.group(1)))  # type: ignore[union-attr]
+         else:
+             raise PySparkValueError(
+                 error_class="CANNOT_PARSE_DATATYPE",
+                 message_parameters={"error": str(json_value)},
+             )
+     else:
+         tpe = json_value["type"]
+         if tpe in _all_complex_types:
+             return _all_complex_types[tpe].fromJson(json_value)
+         elif tpe == "udt":
+             return UserDefinedType.fromJson(json_value)
+         else:
+             raise PySparkValueError(
+                 error_class="UNSUPPORTED_DATA_TYPE",
+                 message_parameters={"data_type": str(tpe)},
+             )
+
+
+ # Mapping Python types to Spark SQL DataType
+ _type_mappings = {
+     type(None): NullType,
+     bool: BooleanType,
+     int: LongType,
+     float: DoubleType,
+     str: StringType,
+     bytearray: BinaryType,
+     decimal.Decimal: DecimalType,
+     datetime.date: DateType,
+     datetime.datetime: TimestampType,  # can be TimestampNTZType
+     datetime.time: TimestampType,  # can be TimestampNTZType
+     datetime.timedelta: DayTimeIntervalType,
+     bytes: BinaryType,
+ }
+
+ # Mapping Python array types to Spark SQL DataType
+ # We should be careful here. The size of these types in python depends on C
+ # implementation. We need to make sure that this conversion does not lose any
+ # precision. Also, JVM only support signed types, when converting unsigned types,
+ # keep in mind that it require 1 more bit when stored as signed types.
+ #
+ # Reference for C integer size, see:
+ # ISO/IEC 9899:201x specification, chapter 5.2.4.2.1 Sizes of integer types <limits.h>.
+ # Reference for python array typecode, see:
+ # https://docs.python.org/2/library/array.html
+ # https://docs.python.org/3.6/library/array.html
+ # Reference for JVM's supported integral types:
+ # http://docs.oracle.com/javase/specs/jvms/se8/html/jvms-2.html#jvms-2.3.1
+
+ _array_signed_int_typecode_ctype_mappings = {
+     "b": ctypes.c_byte,
+     "h": ctypes.c_short,
+     "i": ctypes.c_int,
+     "l": ctypes.c_long,
+ }
+
+ _array_unsigned_int_typecode_ctype_mappings = {
+     "B": ctypes.c_ubyte,
+     "H": ctypes.c_ushort,
+     "I": ctypes.c_uint,
+     "L": ctypes.c_ulong,
+ }
+
+
+ def _int_size_to_type(
+     size: int,
+ ) -> Optional[Union[Type[ByteType], Type[ShortType], Type[IntegerType], Type[LongType]]]:
+     """
+     Return the Catalyst datatype from the size of integers.
+     """
+     if size <= 8:
+         return ByteType
+     elif size <= 16:
+         return ShortType
+     elif size <= 32:
+         return IntegerType
+     elif size <= 64:
+         return LongType
+     else:
+         return None
+
+
+ # The list of all supported array typecodes, is stored here
+ _array_type_mappings: Dict[str, Type[DataType]] = {
+     # Warning: Actual properties for float and double in C is not specified in C.
+     # On almost every system supported by both python and JVM, they are IEEE 754
+     # single-precision binary floating-point format and IEEE 754 double-precision
+     # binary floating-point format. And we do assume the same thing here for now.
+     "f": FloatType,
+     "d": DoubleType,
+ }
+
+ # compute array typecode mappings for signed integer types
+ for _typecode in _array_signed_int_typecode_ctype_mappings.keys():
+     size = ctypes.sizeof(_array_signed_int_typecode_ctype_mappings[_typecode]) * 8
+     dt = _int_size_to_type(size)
+     if dt is not None:
+         _array_type_mappings[_typecode] = dt
+
+ # compute array typecode mappings for unsigned integer types
+ for _typecode in _array_unsigned_int_typecode_ctype_mappings.keys():
+     # JVM does not have unsigned types, so use signed types that is at least 1
+     # bit larger to store
+     size = ctypes.sizeof(_array_unsigned_int_typecode_ctype_mappings[_typecode]) * 8 + 1
+     dt = _int_size_to_type(size)
+     if dt is not None:
+         _array_type_mappings[_typecode] = dt
+
+ # Type code 'u' in Python's array is deprecated since version 3.3, and will be
+ # removed in version 4.0. See: https://docs.python.org/3/library/array.html
+ if sys.version_info[0] < 4:
+     _array_type_mappings["u"] = StringType
+
+
+ def _from_numpy_type(nt: "np.dtype") -> Optional[DataType]:
+     """Convert NumPy type to Spark data type."""
+     import numpy as np
+
+     if nt == np.dtype("int8"):
+         return ByteType()
+     elif nt == np.dtype("int16"):
+         return ShortType()
+     elif nt == np.dtype("int32"):
+         return IntegerType()
+     elif nt == np.dtype("int64"):
+         return LongType()
+     elif nt == np.dtype("float32"):
+         return FloatType()
+     elif nt == np.dtype("float64"):
+         return DoubleType()
+
+     return None
+
+
+ def _infer_type(
+     obj: Any,
+     infer_dict_as_struct: bool = False,
+     infer_array_from_first_element: bool = False,
+     prefer_timestamp_ntz: bool = False,
+ ) -> DataType:
+     """Infer the DataType from obj"""
+     if obj is None:
+         return NullType()
+
+     if hasattr(obj, "__UDT__"):
+         return obj.__UDT__
+
+     dataType = _type_mappings.get(type(obj))
+     if dataType is DecimalType:
+         # the precision and scale of `obj` may be different from row to row.
+         return DecimalType(38, 18)
+     if dataType is TimestampType and prefer_timestamp_ntz and obj.tzinfo is None:
+         return TimestampNTZType()
+     if dataType is DayTimeIntervalType:
+         return DayTimeIntervalType()
+     if dataType is YearMonthIntervalType:
+         return YearMonthIntervalType()
+     elif dataType is not None:
+         return dataType()
+
+     if isinstance(obj, dict):
+         if infer_dict_as_struct:
+             struct = StructType()
+             for key, value in obj.items():
+                 if key is not None and value is not None:
+                     struct.add(
+                         key,
+                         _infer_type(
+                             value,
+                             infer_dict_as_struct,
+                             infer_array_from_first_element,
+                             prefer_timestamp_ntz,
+                         ),
+                         True,
+                     )
+             return struct
+         else:
+             for key, value in obj.items():
+                 if key is not None and value is not None:
+                     return MapType(
+                         _infer_type(
+                             key,
+                             infer_dict_as_struct,
+                             infer_array_from_first_element,
+                             prefer_timestamp_ntz,
+                         ),
+                         _infer_type(
+                             value,
+                             infer_dict_as_struct,
+                             infer_array_from_first_element,
+                             prefer_timestamp_ntz,
+                         ),
+                         True,
+                     )
+             return MapType(NullType(), NullType(), True)
+     elif isinstance(obj, list):
+         if len(obj) > 0:
+             if infer_array_from_first_element:
+                 return ArrayType(
+                     _infer_type(
+                         obj[0],
+                         infer_dict_as_struct,
+                         infer_array_from_first_element,
+                         prefer_timestamp_ntz,
+                     ),
+                     True,
+                 )
+             else:
+                 return ArrayType(
+                     reduce(
+                         _merge_type,
+                         (
+                             _infer_type(
+                                 v,
+                                 infer_dict_as_struct,
+                                 infer_array_from_first_element,
+                                 prefer_timestamp_ntz,
+                             )
+                             for v in obj
+                         ),
+                     ),
+                     True,
+                 )
+         return ArrayType(NullType(), True)
+     elif isinstance(obj, array):
+         if obj.typecode in _array_type_mappings:
+             return ArrayType(_array_type_mappings[obj.typecode](), False)
+         else:
+             raise PySparkTypeError(
+                 error_class="UNSUPPORTED_DATA_TYPE",
+                 message_parameters={"data_type": f"array({obj.typecode})"},
+             )
+     else:
+         try:
+             return _infer_schema(
+                 obj,
+                 infer_dict_as_struct=infer_dict_as_struct,
+                 infer_array_from_first_element=infer_array_from_first_element,
+             )
+         except TypeError:
+             raise PySparkTypeError(
+                 error_class="UNSUPPORTED_DATA_TYPE",
+                 message_parameters={"data_type": type(obj).__name__},
+             )
+
+
+ def _infer_schema(
+     row: Any,
+     names: Optional[List[str]] = None,
+     infer_dict_as_struct: bool = False,
+     infer_array_from_first_element: bool = False,
+     prefer_timestamp_ntz: bool = False,
+ ) -> StructType:
+     """Infer the schema from dict/namedtuple/object"""
+     items: Iterable[Tuple[str, Any]]
+     if isinstance(row, dict):
+         items = sorted(row.items())
+
+     elif isinstance(row, (tuple, list)):
+         if hasattr(row, "__fields__"):  # Row
+             items = zip(row.__fields__, tuple(row))  # type: ignore[union-attr]
+         elif hasattr(row, "_fields"):  # namedtuple
+             items = zip(row._fields, tuple(row))  # type: ignore[union-attr]
+         else:
+             if names is None:
+                 names = ["_%d" % i for i in range(1, len(row) + 1)]
+             elif len(names) < len(row):
+                 names.extend("_%d" % i for i in range(len(names) + 1, len(row) + 1))
+             items = zip(names, row)
+
+     elif hasattr(row, "__dict__"):  # object
+         items = sorted(row.__dict__.items())
+
+     else:
+         raise PySparkTypeError(
+             error_class="CANNOT_INFER_SCHEMA_FOR_TYPE",
+             message_parameters={"data_type": type(row).__name__},
+         )
+
+     fields = []
+     for k, v in items:
+         try:
+             fields.append(
+                 StructField(
+                     k,
+                     _infer_type(
+                         v,
+                         infer_dict_as_struct,
+                         infer_array_from_first_element,
+                         prefer_timestamp_ntz,
+                     ),
+                     True,
+                 )
+             )
+         except TypeError:
+             raise PySparkTypeError(
+                 error_class="CANNOT_INFER_TYPE_FOR_FIELD",
+                 message_parameters={"field_name": k},
+             )
+     return StructType(fields)
+
+
+ def _has_nulltype(dt: DataType) -> bool:
+     """Return whether there is a NullType in `dt` or not"""
+     if isinstance(dt, StructType):
+         return any(_has_nulltype(f.dataType) for f in dt.fields)
+     elif isinstance(dt, ArrayType):
+         return _has_nulltype((dt.elementType))
+     elif isinstance(dt, MapType):
+         return _has_nulltype(dt.keyType) or _has_nulltype(dt.valueType)
+     else:
+         return isinstance(dt, NullType)
+
+
+ def _has_type(dt: DataType, dts: Union[type, Tuple[type, ...]]) -> bool:
+     """Return whether there are specified types"""
+     if isinstance(dt, dts):
+         return True
+     elif isinstance(dt, StructType):
+         return any(_has_type(f.dataType, dts) for f in dt.fields)
+     elif isinstance(dt, ArrayType):
+         return _has_type(dt.elementType, dts)
+     elif isinstance(dt, MapType):
+         return _has_type(dt.keyType, dts) or _has_type(dt.valueType, dts)
+     else:
+         return False
+
+
+ @overload
+ def _merge_type(a: StructType, b: StructType, name: Optional[str] = None) -> StructType:
+     ...
+
+
+ @overload
+ def _merge_type(a: ArrayType, b: ArrayType, name: Optional[str] = None) -> ArrayType:
+     ...
+
+
+ @overload
+ def _merge_type(a: MapType, b: MapType, name: Optional[str] = None) -> MapType:
+     ...
+
+
+ @overload
+ def _merge_type(a: DataType, b: DataType, name: Optional[str] = None) -> DataType:
+     ...
+
+
+ def _merge_type(
+     a: Union[StructType, ArrayType, MapType, DataType],
+     b: Union[StructType, ArrayType, MapType, DataType],
+     name: Optional[str] = None,
+ ) -> Union[StructType, ArrayType, MapType, DataType]:
+     if name is None:
+
+         def new_msg(msg: str) -> str:
+             return msg
+
+         def new_name(n: str) -> str:
+             return "field %s" % n
+
+     else:
+
+         def new_msg(msg: str) -> str:
+             return "%s: %s" % (name, msg)
+
+         def new_name(n: str) -> str:
+             return "field %s in %s" % (n, name)
+
+     if isinstance(a, NullType):
+         return b
+     elif isinstance(b, NullType):
+         return a
+     elif isinstance(a, TimestampType) and isinstance(b, TimestampNTZType):
+         return a
+     elif isinstance(a, TimestampNTZType) and isinstance(b, TimestampType):
+         return b
+     elif isinstance(a, AtomicType) and isinstance(b, StringType):
+         return b
+     elif isinstance(a, StringType) and isinstance(b, AtomicType):
+         return a
+     elif type(a) is not type(b):
+         # TODO: type cast (such as int -> long)
+         raise PySparkTypeError(
+             error_class="CANNOT_MERGE_TYPE",
+             message_parameters={"data_type1": type(a).__name__, "data_type2": type(b).__name__},
+         )
+
+     # same type
+     if isinstance(a, StructType):
+         nfs = dict((f.name, f.dataType) for f in cast(StructType, b).fields)
+         fields = [
+             StructField(
+                 f.name, _merge_type(f.dataType, nfs.get(f.name, NullType()), name=new_name(f.name))
+             )
+             for f in a.fields
+         ]
+         names = set([f.name for f in fields])
+         for n in nfs:
+             if n not in names:
+                 fields.append(StructField(n, nfs[n]))
+         return StructType(fields)
+
+     elif isinstance(a, ArrayType):
+         return ArrayType(
+             _merge_type(
+                 a.elementType, cast(ArrayType, b).elementType, name="element in array %s" % name
+             ),
+             True,
+         )
+
+     elif isinstance(a, MapType):
+         return MapType(
+             _merge_type(a.keyType, cast(MapType, b).keyType, name="key of map %s" % name),
+             _merge_type(a.valueType, cast(MapType, b).valueType, name="value of map %s" % name),
+             True,
+         )
+     else:
+         return a
+
+
+ def _need_converter(dataType: DataType) -> bool:
+     if isinstance(dataType, StructType):
+         return True
+     elif isinstance(dataType, ArrayType):
+         return _need_converter(dataType.elementType)
+     elif isinstance(dataType, MapType):
+         return _need_converter(dataType.keyType) or _need_converter(dataType.valueType)
+     elif isinstance(dataType, NullType):
+         return True
+     else:
+         return False
+
+
+ def _create_converter(dataType: DataType) -> Callable:
+     """Create a converter to drop the names of fields in obj"""
+     if not _need_converter(dataType):
+         return lambda x: x
+
+     if isinstance(dataType, ArrayType):
+         conv = _create_converter(dataType.elementType)
+         return lambda row: [conv(v) for v in row]
+
+     elif isinstance(dataType, MapType):
+         kconv = _create_converter(dataType.keyType)
+         vconv = _create_converter(dataType.valueType)
+         return lambda row: dict((kconv(k), vconv(v)) for k, v in row.items())
+
+     elif isinstance(dataType, NullType):
+         return lambda x: None
+
+     elif not isinstance(dataType, StructType):
+         return lambda x: x
+
+     # dataType must be StructType
+     names = [f.name for f in dataType.fields]
+     converters = [_create_converter(f.dataType) for f in dataType.fields]
+     convert_fields = any(_need_converter(f.dataType) for f in dataType.fields)
+
+     def convert_struct(obj: Any) -> Optional[Tuple]:
+         if obj is None:
+             return None
+
+         if isinstance(obj, (tuple, list)):
+             if convert_fields:
+                 return tuple(conv(v) for v, conv in zip(obj, converters))
+             else:
+                 return tuple(obj)
+
+         if isinstance(obj, dict):
+             d = obj
+         elif hasattr(obj, "__dict__"):  # object
+             d = obj.__dict__
+         else:
+             raise PySparkTypeError(
+                 error_class="UNSUPPORTED_DATA_TYPE",
+                 message_parameters={"data_type": type(obj).__name__},
+             )
+
+         if convert_fields:
+             return tuple([conv(d.get(name)) for name, conv in zip(names, converters)])
+         else:
+             return tuple([d.get(name) for name in names])
+
+     return convert_struct
+
+
+ _acceptable_types = {
+     BooleanType: (bool,),
+     ByteType: (int,),
+     ShortType: (int,),
+     IntegerType: (int,),
+     LongType: (int,),
+     FloatType: (float,),
+     DoubleType: (float,),
+     DecimalType: (decimal.Decimal,),
+     StringType: (str,),
+     CharType: (str,),
+     VarcharType: (str,),
+     BinaryType: (bytearray, bytes),
+     DateType: (datetime.date, datetime.datetime),
+     TimestampType: (datetime.datetime,),
+     TimestampNTZType: (datetime.datetime,),
+     DayTimeIntervalType: (datetime.timedelta,),
+     ArrayType: (list, tuple, array),
+     MapType: (dict,),
+     StructType: (tuple, list, dict),
+ }
+
+
+ def _make_type_verifier(
+     dataType: DataType,
+     nullable: bool = True,
+     name: Optional[str] = None,
+ ) -> Callable:
+     """
+     Make a verifier that checks the type of obj against dataType and raises a TypeError if they do
+     not match.
+
+     This verifier also checks the value of obj against datatype and raises a ValueError if it's not
+     within the allowed range, e.g. using 128 as ByteType will overflow. Note that, Python float is
+     not checked, so it will become infinity when cast to Java float, if it overflows.
+
+     Examples
+     --------
+     >>> _make_type_verifier(StructType([]))(None)
+     >>> _make_type_verifier(StringType())("")
+     >>> _make_type_verifier(LongType())(0)
+     >>> _make_type_verifier(LongType())(1 << 64) # doctest: +IGNORE_EXCEPTION_DETAIL
+     Traceback (most recent call last):
+         ...
+     pyspark.errors.exceptions.base.PySparkValueError:...
+     >>> _make_type_verifier(ArrayType(ShortType()))(list(range(3)))
+     >>> _make_type_verifier(ArrayType(StringType()))(set()) # doctest: +IGNORE_EXCEPTION_DETAIL
+     Traceback (most recent call last):
+         ...
+     pyspark.errors.exceptions.base.PySparkTypeError:...
+     >>> _make_type_verifier(MapType(StringType(), IntegerType()))({})
+     >>> _make_type_verifier(StructType([]))(())
+     >>> _make_type_verifier(StructType([]))([])
+     >>> _make_type_verifier(StructType([]))([1]) # doctest: +IGNORE_EXCEPTION_DETAIL
+     Traceback (most recent call last):
+         ...
+     pyspark.errors.exceptions.base.PySparkValueError:...
+     >>> # Check if numeric values are within the allowed range.
+     >>> _make_type_verifier(ByteType())(12)
+     >>> _make_type_verifier(ByteType())(1234) # doctest: +IGNORE_EXCEPTION_DETAIL
+     Traceback (most recent call last):
+         ...
+     pyspark.errors.exceptions.base.PySparkValueError:...
+     >>> _make_type_verifier(ByteType(), False)(None) # doctest: +IGNORE_EXCEPTION_DETAIL
+     Traceback (most recent call last):
+         ...
+     pyspark.errors.exceptions.base.PySparkValueError:...
+     >>> _make_type_verifier(
+     ...     ArrayType(ShortType(), False))([1, None]) # doctest: +IGNORE_EXCEPTION_DETAIL
+     Traceback (most recent call last):
+         ...
+     pyspark.errors.exceptions.base.PySparkValueError:...
+     >>> _make_type_verifier(  # doctest: +IGNORE_EXCEPTION_DETAIL
+     ...     MapType(StringType(), IntegerType())
+     ... )({None: 1})
+     Traceback (most recent call last):
+         ...
+     pyspark.errors.exceptions.base.PySparkValueError:...
+     >>> schema = StructType().add("a", IntegerType()).add("b", StringType(), False)
+     >>> _make_type_verifier(schema)((1, None)) # doctest: +IGNORE_EXCEPTION_DETAIL
+     Traceback (most recent call last):
+         ...
+     pyspark.errors.exceptions.base.PySparkValueError:...
+     """
+
+     if name is None:
+
+         def new_msg(msg: str) -> str:
+             return msg
+
+         def new_name(n: str) -> str:
+             return "field %s" % n
+
+     else:
+
+         def new_msg(msg: str) -> str:
+             return "%s: %s" % (name, msg)
+
+         def new_name(n: str) -> str:
+             return "field %s in %s" % (n, name)
+
+     def verify_nullability(obj: Any) -> bool:
+         if obj is None:
+             if nullable:
+                 return True
+             else:
+                 raise PySparkValueError(
+                     error_class="CANNOT_BE_NONE",
+                     message_parameters={"arg_name": "obj"},
+                 )
+         else:
+             return False
+
+     _type = type(dataType)
+
+     def assert_acceptable_types(obj: Any) -> None:
+         assert _type in _acceptable_types, new_msg(
+             "unknown datatype: %s for object %r" % (dataType, obj)
+         )
+
+     def verify_acceptable_types(obj: Any) -> None:
+         # subclass of them can not be fromInternal in JVM
+         if type(obj) not in _acceptable_types[_type]:
+             raise PySparkTypeError(
+                 error_class="CANNOT_ACCEPT_OBJECT_IN_TYPE",
+                 message_parameters={
+                     "data_type": str(dataType),
+                     "obj_name": str(obj),
+                     "obj_type": type(obj).__name__,
+                 },
+             )
+
+     if isinstance(dataType, (StringType, CharType, VarcharType)):
+         # StringType, CharType and VarcharType can work with any types
+         def verify_value(obj: Any) -> None:
+             pass
+
+     elif isinstance(dataType, UserDefinedType):
+         verifier = _make_type_verifier(dataType.sqlType(), name=name)
+
+         def verify_udf(obj: Any) -> None:
+             if not (hasattr(obj, "__UDT__") and obj.__UDT__ == dataType):
+                 raise PySparkValueError(
+                     error_class="NOT_INSTANCE_OF",
+                     message_parameters={
+                         "value": str(obj),
+                         "data_type": str(dataType),
+                     },
+                 )
+             verifier(dataType.toInternal(obj))
+
+         verify_value = verify_udf
+
+     elif isinstance(dataType, ByteType):
+
+         def verify_byte(obj: Any) -> None:
+             assert_acceptable_types(obj)
+             verify_acceptable_types(obj)
+             if obj < -128 or obj > 127:
+                 raise PySparkValueError(
+                     error_class="VALUE_OUT_OF_BOUND",
+                     message_parameters={
+                         "arg_name": "obj",
+ "lower_bound": "127",
2061
+ "upper_bound": "-127",
2062
+ "actual": str(obj),
2063
+ },
2064
+ )
2065
+
2066
+ verify_value = verify_byte
2067
+
2068
+ elif isinstance(dataType, ShortType):
2069
+
2070
+ def verify_short(obj: Any) -> None:
2071
+ assert_acceptable_types(obj)
2072
+ verify_acceptable_types(obj)
2073
+ if obj < -32768 or obj > 32767:
2074
+ raise PySparkValueError(
2075
+ error_class="VALUE_OUT_OF_BOUND",
2076
+ message_parameters={
2077
+ "arg_name": "obj",
2078
+ "lower_bound": "32767",
2079
+ "upper_bound": "-32768",
2080
+ "actual": str(obj),
2081
+ },
2082
+ )
2083
+
2084
+ verify_value = verify_short
2085
+
2086
+ elif isinstance(dataType, IntegerType):
2087
+
2088
+ def verify_integer(obj: Any) -> None:
2089
+ assert_acceptable_types(obj)
2090
+ verify_acceptable_types(obj)
2091
+ if obj < -2147483648 or obj > 2147483647:
2092
+ raise PySparkValueError(
2093
+ error_class="VALUE_OUT_OF_BOUND",
2094
+ message_parameters={
2095
+ "arg_name": "obj",
2096
+ "lower_bound": "2147483647",
2097
+ "upper_bound": "-2147483648",
2098
+ "actual": str(obj),
2099
+ },
2100
+ )
2101
+
2102
+ verify_value = verify_integer
2103
+
2104
+ elif isinstance(dataType, LongType):
2105
+
2106
+ def verify_long(obj: Any) -> None:
2107
+ assert_acceptable_types(obj)
2108
+ verify_acceptable_types(obj)
2109
+ if obj < -9223372036854775808 or obj > 9223372036854775807:
2110
+ raise PySparkValueError(
2111
+ error_class="VALUE_OUT_OF_BOUND",
2112
+ message_parameters={
2113
+ "arg_name": "obj",
2114
+ "lower_bound": "9223372036854775807",
2115
+ "upper_bound": "-9223372036854775808",
2116
+ "actual": str(obj),
2117
+ },
2118
+ )
2119
+
2120
+ verify_value = verify_long
2121
+
2122
+ elif isinstance(dataType, ArrayType):
2123
+ element_verifier = _make_type_verifier(
2124
+ dataType.elementType, dataType.containsNull, name="element in array %s" % name
2125
+ )
2126
+
2127
+ def verify_array(obj: Any) -> None:
2128
+ assert_acceptable_types(obj)
2129
+ verify_acceptable_types(obj)
2130
+ for i in obj:
2131
+ element_verifier(i)
2132
+
2133
+ verify_value = verify_array
2134
+
2135
+ elif isinstance(dataType, MapType):
2136
+ key_verifier = _make_type_verifier(dataType.keyType, False, name="key of map %s" % name)
2137
+ value_verifier = _make_type_verifier(
2138
+ dataType.valueType, dataType.valueContainsNull, name="value of map %s" % name
2139
+ )
2140
+
2141
+ def verify_map(obj: Any) -> None:
2142
+ assert_acceptable_types(obj)
2143
+ verify_acceptable_types(obj)
2144
+ for k, v in obj.items():
2145
+ key_verifier(k)
2146
+ value_verifier(v)
2147
+
2148
+ verify_value = verify_map
2149
+
2150
+ elif isinstance(dataType, StructType):
2151
+ verifiers = []
2152
+ for f in dataType.fields:
2153
+ verifier = _make_type_verifier(f.dataType, f.nullable, name=new_name(f.name))
2154
+ verifiers.append((f.name, verifier))
2155
+
2156
+ def verify_struct(obj: Any) -> None:
2157
+ assert_acceptable_types(obj)
2158
+
2159
+ if isinstance(obj, dict):
2160
+ for f, verifier in verifiers:
2161
+ verifier(obj.get(f))
2162
+ elif isinstance(obj, (tuple, list)):
2163
+ if len(obj) != len(verifiers):
2164
+ raise PySparkValueError(
2165
+ error_class="LENGTH_SHOULD_BE_THE_SAME",
2166
+ message_parameters={
2167
+ "arg1": "obj",
2168
+ "arg2": "fields",
2169
+ "arg1_length": str(len(obj)),
2170
+ "arg2_length": str(len(verifiers)),
2171
+ },
2172
+ )
2173
+ for v, (_, verifier) in zip(obj, verifiers):
2174
+ verifier(v)
2175
+ elif hasattr(obj, "__dict__"):
2176
+ d = obj.__dict__
2177
+ for f, verifier in verifiers:
2178
+ verifier(d.get(f))
2179
+ else:
2180
+ raise PySparkTypeError(
2181
+ error_class="CANNOT_ACCEPT_OBJECT_IN_TYPE",
2182
+ message_parameters={
2183
+ "data_type": "StructType",
2184
+ "obj_name": str(obj),
2185
+ "obj_type": type(obj).__name__,
2186
+ },
2187
+ )
2188
+
2189
+ verify_value = verify_struct
2190
+
2191
+ else:
2192
+
2193
+ def verify_default(obj: Any) -> None:
2194
+ assert_acceptable_types(obj)
2195
+ verify_acceptable_types(obj)
2196
+
2197
+ verify_value = verify_default
2198
+
2199
+ def verify(obj: Any) -> None:
2200
+ if not verify_nullability(obj):
2201
+ verify_value(obj)
2202
+
2203
+ return verify
2204
+
2205
+
2206
+ # This is used to unpickle a Row from JVM
+ def _create_row_inbound_converter(dataType: DataType) -> Callable:
+     return lambda *a: dataType.fromInternal(a)
+
+
+ def _create_row(
+     fields: Union["Row", List[str]], values: Union[Tuple[Any, ...], List[Any]]
+ ) -> "Row":
+     row = Row(*values)
+     row.__fields__ = fields
+     return row
+
+
+ class Row(tuple):
+
+     """
+     A row in :class:`DataFrame`.
+     The fields in it can be accessed:
+
+     * like attributes (``row.key``)
+     * like dictionary values (``row[key]``)
+
+     ``key in row`` will search through row keys.
+
+     Row can be used to create a row object by using named arguments.
+     It is not allowed to omit a named argument to represent that the value is
+     None or missing. This should be explicitly set to None in this case.
+
+     .. versionchanged:: 3.0.0
+         Rows created from named arguments no longer have
+         field names sorted alphabetically and will be ordered in the position as
+         entered.
+
+     Examples
+     --------
+     >>> from pyspark.sql import Row
+     >>> row = Row(name="Alice", age=11)
+     >>> row
+     Row(name='Alice', age=11)
+     >>> row['name'], row['age']
+     ('Alice', 11)
+     >>> row.name, row.age
+     ('Alice', 11)
+     >>> 'name' in row
+     True
+     >>> 'wrong_key' in row
+     False
+
+     Row also can be used to create another Row like class, then it
+     could be used to create Row objects, such as
+
+     >>> Person = Row("name", "age")
+     >>> Person
+     <Row('name', 'age')>
+     >>> 'name' in Person
+     True
+     >>> 'wrong_key' in Person
+     False
+     >>> Person("Alice", 11)
+     Row(name='Alice', age=11)
+
+     This form can also be used to create rows as tuple values, i.e. with unnamed
+     fields.
+
+     >>> row1 = Row("Alice", 11)
+     >>> row2 = Row(name="Alice", age=11)
+     >>> row1 == row2
+     True
+     """
+
+     @overload
+     def __new__(cls, *args: str) -> "Row":
+         ...
+
+     @overload
+     def __new__(cls, **kwargs: Any) -> "Row":
+         ...
+
+     def __new__(cls, *args: Optional[str], **kwargs: Optional[Any]) -> "Row":
+         if args and kwargs:
+             raise PySparkValueError(
+                 error_class="CANNOT_SET_TOGETHER",
+                 message_parameters={"arg_list": "args and kwargs"},
+             )
+         if kwargs:
+             # create row objects
+             row = tuple.__new__(cls, list(kwargs.values()))
+             row.__fields__ = list(kwargs.keys())
+             return row
+         else:
+             # create row class or objects
+             return tuple.__new__(cls, args)
+
+     def asDict(self, recursive: bool = False) -> Dict[str, Any]:
+         """
+         Return as a dict
+
+         Parameters
+         ----------
+         recursive : bool, optional
+             turns the nested Rows to dict (default: False).
+
+         Notes
+         -----
+         If a row contains duplicate field names, e.g., the rows of a join
+         between two :class:`DataFrame` that both have the fields of same names,
+         one of the duplicate fields will be selected by ``asDict``. ``__getitem__``
+         will also return one of the duplicate fields, however returned value might
+         be different to ``asDict``.
+
+         Examples
+         --------
+         >>> from pyspark.sql import Row
+         >>> Row(name="Alice", age=11).asDict() == {'name': 'Alice', 'age': 11}
+         True
+         >>> row = Row(key=1, value=Row(name='a', age=2))
+         >>> row.asDict() == {'key': 1, 'value': Row(name='a', age=2)}
+         True
+         >>> row.asDict(True) == {'key': 1, 'value': {'name': 'a', 'age': 2}}
+         True
+         """
+         if not hasattr(self, "__fields__"):
+             raise PySparkTypeError(
+                 error_class="CANNOT_CONVERT_TYPE",
+                 message_parameters={
+                     "from_type": "Row",
+                     "to_type": "dict",
+                 },
+             )
+
+         if recursive:
+
+             def conv(obj: Any) -> Any:
+                 if isinstance(obj, Row):
+                     return obj.asDict(True)
+                 elif isinstance(obj, list):
+                     return [conv(o) for o in obj]
+                 elif isinstance(obj, dict):
+                     return dict((k, conv(v)) for k, v in obj.items())
+                 else:
+                     return obj
+
+             return dict(zip(self.__fields__, (conv(o) for o in self)))
+         else:
+             return dict(zip(self.__fields__, self))
+
+     def __contains__(self, item: Any) -> bool:
+         if hasattr(self, "__fields__"):
+             return item in self.__fields__
+         else:
+             return super(Row, self).__contains__(item)
+
+     # let object acts like class
+     def __call__(self, *args: Any) -> "Row":
+         """create new Row object"""
+         if len(args) > len(self):
+             raise PySparkValueError(
+                 error_class="TOO_MANY_VALUES",
+                 message_parameters={
+                     "expected": str(len(self)),
+                     "item": "fields",
+                     "actual": str(len(args)),
+                 },
+             )
+         return _create_row(self, args)
+
+     def __getitem__(self, item: Any) -> Any:
+         if isinstance(item, (int, slice)):
+             return super(Row, self).__getitem__(item)
+         try:
+             # it will be slow when it has many fields,
+             # but this will not be used in normal cases
+             idx = self.__fields__.index(item)
+             return super(Row, self).__getitem__(idx)
+         except IndexError:
+             raise KeyError(item)
+         except ValueError:
+             raise PySparkValueError(item)
+
+     def __getattr__(self, item: str) -> Any:
+         if item.startswith("__"):
+             raise AttributeError(item)
+         try:
+             # it will be slow when it has many fields,
+             # but this will not be used in normal cases
+             idx = self.__fields__.index(item)
+             return self[idx]
+         except IndexError:
+             raise AttributeError(item)
+         except ValueError:
+             raise AttributeError(item)
+
+     def __setattr__(self, key: Any, value: Any) -> None:
+         if key != "__fields__":
+             raise RuntimeError("Row is read-only")
+         self.__dict__[key] = value
+
+     def __reduce__(
+         self,
+     ) -> Union[str, Tuple[Any, ...]]:
+         """Returns a tuple so Python knows how to pickle Row."""
+         if hasattr(self, "__fields__"):
+             return (_create_row, (self.__fields__, tuple(self)))
+         else:
+             return tuple.__reduce__(self)
+
+     def __repr__(self) -> str:
+         """Printable representation of Row used in Python REPL."""
+         if hasattr(self, "__fields__"):
+             return "Row(%s)" % ", ".join(
+                 "%s=%r" % (k, v) for k, v in zip(self.__fields__, tuple(self))
+             )
+         else:
+             return "<Row(%s)>" % ", ".join(repr(field) for field in self)
+
+
+ class DateConverter:
+     def can_convert(self, obj: Any) -> bool:
+         return isinstance(obj, datetime.date)
+
+     def convert(self, obj: datetime.date, gateway_client: GatewayClient) -> JavaObject:
+         Date = JavaClass("java.sql.Date", gateway_client)
+         return Date.valueOf(obj.strftime("%Y-%m-%d"))
+
+
+ class DatetimeConverter:
+     def can_convert(self, obj: Any) -> bool:
+         return isinstance(obj, datetime.datetime)
+
+     def convert(self, obj: datetime.datetime, gateway_client: GatewayClient) -> JavaObject:
+         Timestamp = JavaClass("java.sql.Timestamp", gateway_client)
+         seconds = (
+             calendar.timegm(obj.utctimetuple()) if obj.tzinfo else time.mktime(obj.timetuple())
+         )
+         t = Timestamp(int(seconds) * 1000)
+         t.setNanos(obj.microsecond * 1000)
+         return t
+
+
+ class DatetimeNTZConverter:
+     def can_convert(self, obj: Any) -> bool:
+         from pyspark.sql.utils import is_timestamp_ntz_preferred
+
+         return (
+             isinstance(obj, datetime.datetime)
+             and obj.tzinfo is None
+             and is_timestamp_ntz_preferred()
+         )
+
+     def convert(self, obj: datetime.datetime, gateway_client: GatewayClient) -> JavaObject:
+         seconds = calendar.timegm(obj.utctimetuple())
+         DateTimeUtils = JavaClass(
+             "org.apache.spark.sql.catalyst.util.DateTimeUtils",
+             gateway_client,
+         )
+         return DateTimeUtils.microsToLocalDateTime(int(seconds) * 1000000 + obj.microsecond)
+
+
+ class DayTimeIntervalTypeConverter:
+     def can_convert(self, obj: Any) -> bool:
+         return isinstance(obj, datetime.timedelta)
+
+     def convert(self, obj: datetime.timedelta, gateway_client: GatewayClient) -> JavaObject:
+         IntervalUtils = JavaClass(
+             "org.apache.spark.sql.catalyst.util.IntervalUtils",
+             gateway_client,
+         )
+         return IntervalUtils.microsToDuration(
+             (math.floor(obj.total_seconds()) * 1000000) + obj.microseconds
+         )
+
+
+ class NumpyScalarConverter:
+     def can_convert(self, obj: Any) -> bool:
+         return has_numpy and isinstance(obj, np.generic)
+
+     def convert(self, obj: "np.generic", gateway_client: GatewayClient) -> Any:
+         return obj.item()
+
+
+ class NumpyArrayConverter:
+     def _from_numpy_type_to_java_type(
+         self, nt: "np.dtype", gateway: JavaGateway
+     ) -> Optional[JavaClass]:
+         """Convert NumPy type to Py4J Java type."""
+         if nt in [np.dtype("int8"), np.dtype("int16")]:
+             # Mapping int8 to gateway.jvm.byte causes
+             # TypeError: 'bytes' object does not support item assignment
+             return gateway.jvm.short
+         elif nt == np.dtype("int32"):
+             return gateway.jvm.int
+         elif nt == np.dtype("int64"):
+             return gateway.jvm.long
+         elif nt == np.dtype("float32"):
+             return gateway.jvm.float
+         elif nt == np.dtype("float64"):
+             return gateway.jvm.double
+         elif nt == np.dtype("bool"):
+             return gateway.jvm.boolean
+
+         return None
+
+     def can_convert(self, obj: Any) -> bool:
+         return has_numpy and isinstance(obj, np.ndarray) and obj.ndim == 1
+
+     def convert(self, obj: "np.ndarray", gateway_client: GatewayClient) -> JavaObject:
+         from pyspark import SparkContext
+
+         gateway = SparkContext._gateway
+         assert gateway is not None
+         plist = obj.tolist()
+
+         if len(obj) > 0 and isinstance(plist[0], str):
+             jtpe = gateway.jvm.String
+         else:
+             jtpe = self._from_numpy_type_to_java_type(obj.dtype, gateway)
+             if jtpe is None:
+                 raise PySparkTypeError(
+                     error_class="UNSUPPORTED_NUMPY_ARRAY_SCALAR",
+                     message_parameters={"dtype": str(obj.dtype)},
+                 )
+         jarr = gateway.new_array(jtpe, len(obj))
+         for i in range(len(plist)):
+             jarr[i] = plist[i]
+         return jarr
+
+
+ # datetime is a subclass of date, we should register DatetimeConverter first
+ register_input_converter(DatetimeNTZConverter())
+ register_input_converter(DatetimeConverter())
+ register_input_converter(DateConverter())
+ register_input_converter(DayTimeIntervalTypeConverter())
+ register_input_converter(NumpyScalarConverter())
+ # NumPy array satisfies py4j.java_collections.ListConverter,
+ # so prepend NumpyArrayConverter
+ register_input_converter(NumpyArrayConverter(), prepend=True)
+
+
+ def _test() -> None:
+     import doctest
+     from pyspark.sql import SparkSession
+
+     globs = globals()
+     globs["spark"] = SparkSession.builder.getOrCreate()
+     (failure_count, test_count) = doctest.testmod(
+         globs=globs, optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE
+     )
+     if failure_count:
+         sys.exit(-1)
+
+
+ if __name__ == "__main__":
+     _test()