sparkless 3.23.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (234) hide show
  1. sparkless-3.23.0/PKG-INFO +623 -0
  2. sparkless-3.23.0/README.md +569 -0
  3. sparkless-3.23.0/pyproject.toml +236 -0
  4. sparkless-3.23.0/setup.cfg +4 -0
  5. sparkless-3.23.0/sparkless/__init__.py +218 -0
  6. sparkless-3.23.0/sparkless/_version.py +19 -0
  7. sparkless-3.23.0/sparkless/backend/__init__.py +33 -0
  8. sparkless-3.23.0/sparkless/backend/factory.py +247 -0
  9. sparkless-3.23.0/sparkless/backend/polars/__init__.py +29 -0
  10. sparkless-3.23.0/sparkless/backend/polars/export.py +114 -0
  11. sparkless-3.23.0/sparkless/backend/polars/expression_translator.py +3693 -0
  12. sparkless-3.23.0/sparkless/backend/polars/materializer.py +1274 -0
  13. sparkless-3.23.0/sparkless/backend/polars/operation_executor.py +1344 -0
  14. sparkless-3.23.0/sparkless/backend/polars/parquet_storage.py +119 -0
  15. sparkless-3.23.0/sparkless/backend/polars/schema_registry.py +222 -0
  16. sparkless-3.23.0/sparkless/backend/polars/schema_utils.py +69 -0
  17. sparkless-3.23.0/sparkless/backend/polars/storage.py +771 -0
  18. sparkless-3.23.0/sparkless/backend/polars/type_mapper.py +158 -0
  19. sparkless-3.23.0/sparkless/backend/polars/window_handler.py +377 -0
  20. sparkless-3.23.0/sparkless/backend/protocols.py +121 -0
  21. sparkless-3.23.0/sparkless/compat/__init__.py +17 -0
  22. sparkless-3.23.0/sparkless/compat/datetime.py +180 -0
  23. sparkless-3.23.0/sparkless/config.py +105 -0
  24. sparkless-3.23.0/sparkless/core/__init__.py +77 -0
  25. sparkless-3.23.0/sparkless/core/condition_evaluator.py +1393 -0
  26. sparkless-3.23.0/sparkless/core/data_validation.py +221 -0
  27. sparkless-3.23.0/sparkless/core/ddl_adapter.py +133 -0
  28. sparkless-3.23.0/sparkless/core/exceptions/__init__.py +45 -0
  29. sparkless-3.23.0/sparkless/core/exceptions/analysis.py +323 -0
  30. sparkless-3.23.0/sparkless/core/exceptions/base.py +91 -0
  31. sparkless-3.23.0/sparkless/core/exceptions/execution.py +153 -0
  32. sparkless-3.23.0/sparkless/core/exceptions/operation.py +206 -0
  33. sparkless-3.23.0/sparkless/core/exceptions/py4j_compat.py +47 -0
  34. sparkless-3.23.0/sparkless/core/exceptions/runtime.py +193 -0
  35. sparkless-3.23.0/sparkless/core/exceptions/validation.py +207 -0
  36. sparkless-3.23.0/sparkless/core/interfaces/__init__.py +43 -0
  37. sparkless-3.23.0/sparkless/core/interfaces/dataframe.py +320 -0
  38. sparkless-3.23.0/sparkless/core/interfaces/functions.py +348 -0
  39. sparkless-3.23.0/sparkless/core/interfaces/session.py +219 -0
  40. sparkless-3.23.0/sparkless/core/interfaces/storage.py +253 -0
  41. sparkless-3.23.0/sparkless/core/protocols.py +120 -0
  42. sparkless-3.23.0/sparkless/core/safe_evaluator.py +215 -0
  43. sparkless-3.23.0/sparkless/core/schema_inference.py +298 -0
  44. sparkless-3.23.0/sparkless/core/type_utils.py +199 -0
  45. sparkless-3.23.0/sparkless/core/types/__init__.py +35 -0
  46. sparkless-3.23.0/sparkless/core/types/data_types.py +264 -0
  47. sparkless-3.23.0/sparkless/core/types/metadata.py +288 -0
  48. sparkless-3.23.0/sparkless/core/types/schema.py +226 -0
  49. sparkless-3.23.0/sparkless/data_generation/__init__.py +18 -0
  50. sparkless-3.23.0/sparkless/data_generation/builder.py +85 -0
  51. sparkless-3.23.0/sparkless/data_generation/convenience.py +64 -0
  52. sparkless-3.23.0/sparkless/data_generation/generator.py +337 -0
  53. sparkless-3.23.0/sparkless/dataframe/__init__.py +27 -0
  54. sparkless-3.23.0/sparkless/dataframe/aggregations/__init__.py +10 -0
  55. sparkless-3.23.0/sparkless/dataframe/aggregations/operations.py +208 -0
  56. sparkless-3.23.0/sparkless/dataframe/assertions/__init__.py +10 -0
  57. sparkless-3.23.0/sparkless/dataframe/assertions/assertions.py +88 -0
  58. sparkless-3.23.0/sparkless/dataframe/assertions/operations.py +45 -0
  59. sparkless-3.23.0/sparkless/dataframe/attribute_handler.py +76 -0
  60. sparkless-3.23.0/sparkless/dataframe/casting/__init__.py +5 -0
  61. sparkless-3.23.0/sparkless/dataframe/casting/type_converter.py +180 -0
  62. sparkless-3.23.0/sparkless/dataframe/collection_handler.py +62 -0
  63. sparkless-3.23.0/sparkless/dataframe/condition_handler.py +178 -0
  64. sparkless-3.23.0/sparkless/dataframe/dataframe.py +1380 -0
  65. sparkless-3.23.0/sparkless/dataframe/display/__init__.py +6 -0
  66. sparkless-3.23.0/sparkless/dataframe/display/formatter.py +57 -0
  67. sparkless-3.23.0/sparkless/dataframe/display/operations.py +274 -0
  68. sparkless-3.23.0/sparkless/dataframe/evaluation/__init__.py +5 -0
  69. sparkless-3.23.0/sparkless/dataframe/evaluation/expression_evaluator.py +4130 -0
  70. sparkless-3.23.0/sparkless/dataframe/export.py +46 -0
  71. sparkless-3.23.0/sparkless/dataframe/grouped/__init__.py +18 -0
  72. sparkless-3.23.0/sparkless/dataframe/grouped/base.py +2016 -0
  73. sparkless-3.23.0/sparkless/dataframe/grouped/cube.py +212 -0
  74. sparkless-3.23.0/sparkless/dataframe/grouped/pivot.py +252 -0
  75. sparkless-3.23.0/sparkless/dataframe/grouped/rollup.py +215 -0
  76. sparkless-3.23.0/sparkless/dataframe/joins/__init__.py +10 -0
  77. sparkless-3.23.0/sparkless/dataframe/joins/operations.py +350 -0
  78. sparkless-3.23.0/sparkless/dataframe/lazy.py +2396 -0
  79. sparkless-3.23.0/sparkless/dataframe/operations/__init__.py +10 -0
  80. sparkless-3.23.0/sparkless/dataframe/operations/aggregation_operations.py +313 -0
  81. sparkless-3.23.0/sparkless/dataframe/operations/join_operations.py +329 -0
  82. sparkless-3.23.0/sparkless/dataframe/operations/misc.py +1377 -0
  83. sparkless-3.23.0/sparkless/dataframe/operations/set_operations.py +261 -0
  84. sparkless-3.23.0/sparkless/dataframe/protocols.py +450 -0
  85. sparkless-3.23.0/sparkless/dataframe/rdd.py +264 -0
  86. sparkless-3.23.0/sparkless/dataframe/reader.py +480 -0
  87. sparkless-3.23.0/sparkless/dataframe/schema/__init__.py +6 -0
  88. sparkless-3.23.0/sparkless/dataframe/schema/operations.py +51 -0
  89. sparkless-3.23.0/sparkless/dataframe/schema/schema_manager.py +651 -0
  90. sparkless-3.23.0/sparkless/dataframe/services/__init__.py +24 -0
  91. sparkless-3.23.0/sparkless/dataframe/services/aggregation_service.py +208 -0
  92. sparkless-3.23.0/sparkless/dataframe/services/assertion_service.py +53 -0
  93. sparkless-3.23.0/sparkless/dataframe/services/display_service.py +280 -0
  94. sparkless-3.23.0/sparkless/dataframe/services/join_service.py +475 -0
  95. sparkless-3.23.0/sparkless/dataframe/services/misc_service.py +1344 -0
  96. sparkless-3.23.0/sparkless/dataframe/services/schema_service.py +24 -0
  97. sparkless-3.23.0/sparkless/dataframe/services/transformation_service.py +735 -0
  98. sparkless-3.23.0/sparkless/dataframe/transformations/__init__.py +10 -0
  99. sparkless-3.23.0/sparkless/dataframe/transformations/operations.py +602 -0
  100. sparkless-3.23.0/sparkless/dataframe/types.py +25 -0
  101. sparkless-3.23.0/sparkless/dataframe/validation/__init__.py +9 -0
  102. sparkless-3.23.0/sparkless/dataframe/validation/column_validator.py +444 -0
  103. sparkless-3.23.0/sparkless/dataframe/validation_handler.py +92 -0
  104. sparkless-3.23.0/sparkless/dataframe/window_handler.py +598 -0
  105. sparkless-3.23.0/sparkless/dataframe/writer.py +1027 -0
  106. sparkless-3.23.0/sparkless/delta.py +626 -0
  107. sparkless-3.23.0/sparkless/error_simulation.py +338 -0
  108. sparkless-3.23.0/sparkless/errors.py +153 -0
  109. sparkless-3.23.0/sparkless/functions/__init__.py +602 -0
  110. sparkless-3.23.0/sparkless/functions/aggregate.py +1175 -0
  111. sparkless-3.23.0/sparkless/functions/array.py +1076 -0
  112. sparkless-3.23.0/sparkless/functions/base.py +236 -0
  113. sparkless-3.23.0/sparkless/functions/bitwise.py +428 -0
  114. sparkless-3.23.0/sparkless/functions/conditional.py +916 -0
  115. sparkless-3.23.0/sparkless/functions/core/__init__.py +35 -0
  116. sparkless-3.23.0/sparkless/functions/core/column.py +766 -0
  117. sparkless-3.23.0/sparkless/functions/core/expressions.py +323 -0
  118. sparkless-3.23.0/sparkless/functions/core/lambda_parser.py +385 -0
  119. sparkless-3.23.0/sparkless/functions/core/literals.py +292 -0
  120. sparkless-3.23.0/sparkless/functions/core/operations.py +226 -0
  121. sparkless-3.23.0/sparkless/functions/core/sql_expr_parser.py +436 -0
  122. sparkless-3.23.0/sparkless/functions/crypto.py +155 -0
  123. sparkless-3.23.0/sparkless/functions/datetime.py +1761 -0
  124. sparkless-3.23.0/sparkless/functions/functions.py +3172 -0
  125. sparkless-3.23.0/sparkless/functions/json_csv.py +163 -0
  126. sparkless-3.23.0/sparkless/functions/map.py +403 -0
  127. sparkless-3.23.0/sparkless/functions/math.py +1040 -0
  128. sparkless-3.23.0/sparkless/functions/metadata.py +114 -0
  129. sparkless-3.23.0/sparkless/functions/ordering.py +95 -0
  130. sparkless-3.23.0/sparkless/functions/pandas_types.py +30 -0
  131. sparkless-3.23.0/sparkless/functions/string.py +1918 -0
  132. sparkless-3.23.0/sparkless/functions/udf.py +155 -0
  133. sparkless-3.23.0/sparkless/functions/window_execution.py +994 -0
  134. sparkless-3.23.0/sparkless/functions/xml.py +283 -0
  135. sparkless-3.23.0/sparkless/optimizer/__init__.py +24 -0
  136. sparkless-3.23.0/sparkless/optimizer/optimization_rules.py +374 -0
  137. sparkless-3.23.0/sparkless/optimizer/query_optimizer.py +524 -0
  138. sparkless-3.23.0/sparkless/performance_simulation.py +329 -0
  139. sparkless-3.23.0/sparkless/py.typed +0 -0
  140. sparkless-3.23.0/sparkless/session/__init__.py +34 -0
  141. sparkless-3.23.0/sparkless/session/catalog.py +719 -0
  142. sparkless-3.23.0/sparkless/session/config/__init__.py +20 -0
  143. sparkless-3.23.0/sparkless/session/config/configuration.py +232 -0
  144. sparkless-3.23.0/sparkless/session/context.py +131 -0
  145. sparkless-3.23.0/sparkless/session/core/__init__.py +18 -0
  146. sparkless-3.23.0/sparkless/session/core/builder.py +90 -0
  147. sparkless-3.23.0/sparkless/session/core/session.py +668 -0
  148. sparkless-3.23.0/sparkless/session/performance_tracker.py +117 -0
  149. sparkless-3.23.0/sparkless/session/services/__init__.py +31 -0
  150. sparkless-3.23.0/sparkless/session/services/dataframe_factory.py +519 -0
  151. sparkless-3.23.0/sparkless/session/services/lifecycle_manager.py +61 -0
  152. sparkless-3.23.0/sparkless/session/services/mocking_coordinator.py +109 -0
  153. sparkless-3.23.0/sparkless/session/services/protocols.py +97 -0
  154. sparkless-3.23.0/sparkless/session/services/sql_parameter_binder.py +74 -0
  155. sparkless-3.23.0/sparkless/session/session.py +29 -0
  156. sparkless-3.23.0/sparkless/session/sql/__init__.py +27 -0
  157. sparkless-3.23.0/sparkless/session/sql/executor.py +2393 -0
  158. sparkless-3.23.0/sparkless/session/sql/optimizer.py +260 -0
  159. sparkless-3.23.0/sparkless/session/sql/parser.py +1127 -0
  160. sparkless-3.23.0/sparkless/session/sql/validation.py +323 -0
  161. sparkless-3.23.0/sparkless/spark_types.py +944 -0
  162. sparkless-3.23.0/sparkless/sql/__init__.py +69 -0
  163. sparkless-3.23.0/sparkless/sql/functions.py +88 -0
  164. sparkless-3.23.0/sparkless/sql/types.py +57 -0
  165. sparkless-3.23.0/sparkless/sql/utils.py +84 -0
  166. sparkless-3.23.0/sparkless/storage/__init__.py +86 -0
  167. sparkless-3.23.0/sparkless/storage/backends/__init__.py +0 -0
  168. sparkless-3.23.0/sparkless/storage/backends/file.py +527 -0
  169. sparkless-3.23.0/sparkless/storage/backends/memory.py +430 -0
  170. sparkless-3.23.0/sparkless/storage/manager.py +432 -0
  171. sparkless-3.23.0/sparkless/storage/models.py +115 -0
  172. sparkless-3.23.0/sparkless/storage/serialization/__init__.py +0 -0
  173. sparkless-3.23.0/sparkless/storage/serialization/csv.py +120 -0
  174. sparkless-3.23.0/sparkless/storage/serialization/json.py +118 -0
  175. sparkless-3.23.0/sparkless/utils/profiling.py +188 -0
  176. sparkless-3.23.0/sparkless/utils/statistics.py +83 -0
  177. sparkless-3.23.0/sparkless/window.py +205 -0
  178. sparkless-3.23.0/sparkless.egg-info/PKG-INFO +623 -0
  179. sparkless-3.23.0/sparkless.egg-info/SOURCES.txt +232 -0
  180. sparkless-3.23.0/sparkless.egg-info/dependency_links.txt +1 -0
  181. sparkless-3.23.0/sparkless.egg-info/requires.txt +36 -0
  182. sparkless-3.23.0/sparkless.egg-info/top_level.txt +1 -0
  183. sparkless-3.23.0/tests/test_backend_capability_model.py +194 -0
  184. sparkless-3.23.0/tests/test_column_availability.py +123 -0
  185. sparkless-3.23.0/tests/test_delta_lake_schema_evolution.py +639 -0
  186. sparkless-3.23.0/tests/test_fixture_compatibility.py +75 -0
  187. sparkless-3.23.0/tests/test_function_api_compatibility.py +83 -0
  188. sparkless-3.23.0/tests/test_groupby_rollup_cube_with_list.py +147 -0
  189. sparkless-3.23.0/tests/test_issue_135_datetime_filter.py +167 -0
  190. sparkless-3.23.0/tests/test_issue_136_column_rename_validation.py +123 -0
  191. sparkless-3.23.0/tests/test_issue_137_datetime_validation.py +100 -0
  192. sparkless-3.23.0/tests/test_issue_138_column_drop_reference.py +144 -0
  193. sparkless-3.23.0/tests/test_issue_139_datetime_validation_compatibility.py +185 -0
  194. sparkless-3.23.0/tests/test_issue_145_string_cast.py +91 -0
  195. sparkless-3.23.0/tests/test_issue_149_to_timestamp_string.py +106 -0
  196. sparkless-3.23.0/tests/test_issue_151_to_timestamp_validation.py +102 -0
  197. sparkless-3.23.0/tests/test_issue_152_sql_column_aliases.py +106 -0
  198. sparkless-3.23.0/tests/test_issue_153_to_timestamp_returns_none.py +85 -0
  199. sparkless-3.23.0/tests/test_issue_156_select_dropped_column.py +67 -0
  200. sparkless-3.23.0/tests/test_issue_158_dropped_column_error.py +118 -0
  201. sparkless-3.23.0/tests/test_issue_160_actual_bug_reproduction.py +191 -0
  202. sparkless-3.23.0/tests/test_issue_160_cache_key_reproduction.py +205 -0
  203. sparkless-3.23.0/tests/test_issue_160_dropped_column_execution_plan.py +191 -0
  204. sparkless-3.23.0/tests/test_issue_160_exact_150_rows.py +130 -0
  205. sparkless-3.23.0/tests/test_issue_160_force_bug_reproduction.py +125 -0
  206. sparkless-3.23.0/tests/test_issue_160_lazy_frame_execution_plan.py +314 -0
  207. sparkless-3.23.0/tests/test_issue_160_lazy_polars_expr.py +264 -0
  208. sparkless-3.23.0/tests/test_issue_160_manual_cache_manipulation.py +143 -0
  209. sparkless-3.23.0/tests/test_issue_160_nested_operations.py +159 -0
  210. sparkless-3.23.0/tests/test_issue_160_reproduce_actual_bug.py +171 -0
  211. sparkless-3.23.0/tests/test_issue_160_reproduce_bug.py +197 -0
  212. sparkless-3.23.0/tests/test_issue_160_with_cache_enabled.py +84 -0
  213. sparkless-3.23.0/tests/test_issue_160_without_fix.py +154 -0
  214. sparkless-3.23.0/tests/test_issue_163_validation_after_drop.py +58 -0
  215. sparkless-3.23.0/tests/test_issue_164_schema_inference_numeric.py +108 -0
  216. sparkless-3.23.0/tests/test_issue_165_to_date_timestamp_type.py +130 -0
  217. sparkless-3.23.0/tests/test_issue_166_unix_timestamp.py +124 -0
  218. sparkless-3.23.0/tests/test_issue_168_validation_after_drop.py +148 -0
  219. sparkless-3.23.0/tests/test_issue_169_to_timestamp_drop_error.py +165 -0
  220. sparkless-3.23.0/tests/test_issue_170_to_date_timestamp_type.py +152 -0
  221. sparkless-3.23.0/tests/test_issue_173_validation_during_materialization.py +69 -0
  222. sparkless-3.23.0/tests/test_issue_188_string_concat_cache.py +222 -0
  223. sparkless-3.23.0/tests/test_issue_200_list_rows_with_column_schema.py +146 -0
  224. sparkless-3.23.0/tests/test_issue_202_select_with_list.py +144 -0
  225. sparkless-3.23.0/tests/test_issue_203_filter_with_string.py +150 -0
  226. sparkless-3.23.0/tests/test_issue_212_select_with_column_list.py +83 -0
  227. sparkless-3.23.0/tests/test_issue_213_createDataFrame_with_single_type.py +74 -0
  228. sparkless-3.23.0/tests/test_issue_214_sort_with_list.py +101 -0
  229. sparkless-3.23.0/tests/test_issue_215_row_kwargs_init.py +101 -0
  230. sparkless-3.23.0/tests/test_notebooks.py +211 -0
  231. sparkless-3.23.0/tests/test_sparkcontext_validation.py +141 -0
  232. sparkless-3.23.0/tests/test_to_timestamp_compatibility.py +233 -0
  233. sparkless-3.23.0/tests/test_type_strictness.py +165 -0
  234. sparkless-3.23.0/tests/test_with_timeout.py +99 -0
@@ -0,0 +1,623 @@
1
+ Metadata-Version: 2.1
2
+ Name: sparkless
3
+ Version: 3.23.0
4
+ Summary: Lightning-fast PySpark testing without JVM - 10x faster with 100% API compatibility
5
+ Author-email: Odos Matthews <odosmatthews@gmail.com>
6
+ Maintainer-email: Odos Matthews <odosmatthews@gmail.com>
7
+ License: MIT
8
+ Project-URL: Homepage, https://github.com/eddiethedean/sparkless
9
+ Project-URL: Repository, https://github.com/eddiethedean/sparkless
10
+ Project-URL: Issues, https://github.com/eddiethedean/sparkless/issues
11
+ Keywords: spark,pyspark,testing,development,data-engineering,dataframe,spark-session,unit-testing,type-safe,mypy,error-simulation,performance-testing,data-generation,enterprise
12
+ Classifier: Development Status :: 5 - Production/Stable
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Operating System :: OS Independent
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.8
17
+ Classifier: Programming Language :: Python :: 3.9
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Topic :: Software Development :: Testing
21
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
22
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
23
+ Requires-Python: >=3.8
24
+ Description-Content-Type: text/markdown
25
+ Requires-Dist: spark-ddl-parser>=0.1.0
26
+ Requires-Dist: polars>=0.20.0
27
+ Requires-Dist: psutil>=5.8.0
28
+ Requires-Dist: typing_extensions>=4.0.0; python_version < "3.9"
29
+ Provides-Extra: pandas
30
+ Requires-Dist: pandas>=1.3.0; extra == "pandas"
31
+ Requires-Dist: pandas-stubs>=2.0.0; extra == "pandas"
32
+ Provides-Extra: analytics
33
+ Requires-Dist: pandas>=1.3.0; extra == "analytics"
34
+ Requires-Dist: pandas-stubs>=2.0.0; extra == "analytics"
35
+ Requires-Dist: numpy>=1.20.0; extra == "analytics"
36
+ Requires-Dist: polars[pyarrow]>=0.20.0; extra == "analytics"
37
+ Provides-Extra: dev
38
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
39
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
40
+ Requires-Dist: pytest-xdist>=3.0.0; extra == "dev"
41
+ Requires-Dist: mypy>=1.19.0; extra == "dev"
42
+ Requires-Dist: ruff>=0.4.0; extra == "dev"
43
+ Requires-Dist: pandas>=1.3.0; extra == "dev"
44
+ Requires-Dist: pandas-stubs>=2.0.0; extra == "dev"
45
+ Requires-Dist: types-psutil>=6.0.0; extra == "dev"
46
+ Provides-Extra: test
47
+ Requires-Dist: pytest>=7.0.0; extra == "test"
48
+ Requires-Dist: pytest-cov>=4.0.0; extra == "test"
49
+ Requires-Dist: pytest-xdist>=3.0.0; extra == "test"
50
+ Requires-Dist: hypothesis>=6.0.0; extra == "test"
51
+ Provides-Extra: generate-outputs
52
+ Requires-Dist: pyspark<3.6.0,>=3.5.0; extra == "generate-outputs"
53
+ Requires-Dist: delta-spark<4.0.0,>=3.0.0; extra == "generate-outputs"
54
+
55
+ # Sparkless
56
+
57
+ <div align="center">
58
+
59
+ **🚀 Test PySpark code at lightning speed—no JVM required**
60
+
61
+ [![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/)
62
+ [![PySpark 3.2-3.5](https://img.shields.io/badge/pyspark-3.2--3.5-orange.svg)](https://spark.apache.org/)
63
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
64
+ [![PyPI version](https://badge.fury.io/py/sparkless.svg)](https://badge.fury.io/py/sparkless)
65
+ [![Tests](https://img.shields.io/badge/tests-572+%20passing%20%7C%200%20failing-brightgreen.svg)](https://github.com/eddiethedean/sparkless)
66
+ [![Type Checked](https://img.shields.io/badge/mypy-260%20files%20clean-blue.svg)](https://github.com/python/mypy)
67
+ [![Code Style](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
68
+
69
+ *⚡ 10x faster tests • 🎯 Drop-in PySpark replacement • 📦 Zero JVM overhead • 🧵 Thread-safe Polars backend*
70
+
71
+ </div>
72
+
73
+ ---
74
+
75
+ ## Why Sparkless?
76
+
77
+ **Tired of waiting 30+ seconds for Spark to initialize in every test?**
78
+
79
+ Sparkless is a lightweight PySpark replacement that runs your tests **10x faster** by eliminating JVM overhead. Your existing PySpark code works unchanged—just swap the import.
80
+
81
+ ```python
82
+ # Before
83
+ from pyspark.sql import SparkSession
84
+
85
+ # After
86
+ from sparkless.sql import SparkSession
87
+ ```
88
+
89
+ ### Key Benefits
90
+
91
+ | Feature | Description |
92
+ |---------|-------------|
93
+ | ⚡ **10x Faster** | No JVM startup (30s → 0.1s) |
94
+ | 🎯 **Drop-in Replacement** | Use existing PySpark code unchanged |
95
+ | 📦 **Zero Java** | Pure Python with Polars backend (thread-safe, no SQL required) |
96
+ | 🧪 **100% Compatible** | Full PySpark 3.2-3.5 API support |
97
+ | 🔄 **Lazy Evaluation** | Mirrors PySpark's execution model |
98
+ | 🏭 **Production Ready** | 572+ passing tests, 100% mypy typed |
99
+ | 🧵 **Thread-Safe** | Polars backend designed for parallel execution |
100
+ | 🔧 **Modular Design** | DDL parsing via standalone spark-ddl-parser package |
101
+ | 🎯 **Type Safe** | Full type checking with `ty`, comprehensive type annotations |
102
+
103
+ ### Perfect For
104
+
105
+ - **Unit Testing** - Fast, isolated test execution with automatic cleanup
106
+ - **CI/CD Pipelines** - Reliable tests without infrastructure or resource leaks
107
+ - **Local Development** - Prototype without Spark cluster
108
+ - **Documentation** - Runnable examples without setup
109
+ - **Learning** - Understand PySpark without complexity
110
+ - **Integration Tests** - Configurable memory limits for large dataset testing
111
+
112
+ ---
113
+
114
+ ## Quick Start
115
+
116
+ ### Installation
117
+
118
+ ```bash
119
+ pip install sparkless
120
+ ```
121
+
122
+ ### Basic Usage
123
+
124
+ ```python
125
+ from sparkless.sql import SparkSession, functions as F
126
+
127
+ # Create session
128
+ spark = SparkSession("MyApp")
129
+
130
+ # Your PySpark code works as-is
131
+ data = [{"name": "Alice", "age": 25}, {"name": "Bob", "age": 30}]
132
+ df = spark.createDataFrame(data)
133
+
134
+ # All operations work
135
+ result = df.filter(F.col("age") > 25).select("name").collect()
136
+ print(result)
137
+ # Output: [Row(name='Bob')]
138
+
139
+ # Show the DataFrame
140
+ df.show()
141
+ # Output:
142
+ # DataFrame[2 rows, 2 columns]
143
+ # age name
144
+ # 25 Alice
145
+ # 30 Bob
146
+ ```
147
+
148
+ ### Testing Example
149
+
150
+ ```python
151
+ import pytest
152
+ from sparkless.sql import SparkSession, functions as F
153
+
154
+ def test_data_pipeline():
155
+ """Test PySpark logic without Spark cluster."""
156
+ spark = SparkSession("TestApp")
157
+
158
+ # Test data
159
+ data = [{"score": 95}, {"score": 87}, {"score": 92}]
160
+ df = spark.createDataFrame(data)
161
+
162
+ # Business logic
163
+ high_scores = df.filter(F.col("score") > 90)
164
+
165
+ # Assertions
166
+ assert high_scores.count() == 2
167
+ assert high_scores.agg(F.avg("score")).collect()[0][0] == 93.5
168
+
169
+ # Always clean up
170
+ spark.stop()
171
+ ```
172
+
173
+ ---
174
+
175
+ ## Core Features
176
+
177
+ ### 🚀 Complete PySpark API Compatibility
178
+
179
+ Sparkless implements **120+ functions** and **70+ DataFrame methods** across PySpark 3.2-3.5:
180
+
181
+ | Category | Functions | Examples |
182
+ |----------|-----------|----------|
183
+ | **String** (40+) | Text manipulation, regex, formatting | `upper`, `concat`, `regexp_extract`, `soundex` |
184
+ | **Math** (35+) | Arithmetic, trigonometry, rounding | `abs`, `sqrt`, `sin`, `cos`, `ln` |
185
+ | **DateTime** (30+) | Date/time operations, timezones | `date_add`, `hour`, `weekday`, `convert_timezone` |
186
+ | **Array** (25+) | Array manipulation, lambdas | `array_distinct`, `transform`, `filter`, `aggregate` |
187
+ | **Aggregate** (20+) | Statistical functions | `sum`, `avg`, `median`, `percentile`, `max_by` |
188
+ | **Map** (10+) | Dictionary operations | `map_keys`, `map_filter`, `transform_values` |
189
+ | **Conditional** (8+) | Logic and null handling | `when`, `coalesce`, `ifnull`, `nullif` |
190
+ | **Window** (8+) | Ranking and analytics | `row_number`, `rank`, `lag`, `lead` |
191
+ | **XML** (9+) | XML parsing and generation | `from_xml`, `to_xml`, `xpath_*` |
192
+ | **Bitwise** (6+) | Bit manipulation | `bit_count`, `bit_and`, `bit_xor` |
193
+
194
+ 📖 **See complete function list**: [`PYSPARK_FUNCTION_MATRIX.md`](PYSPARK_FUNCTION_MATRIX.md)
195
+
196
+ ### DataFrame Operations
197
+
198
+ - **Transformations**: `select`, `filter`, `withColumn`, `drop`, `distinct`, `orderBy`, `replace`
199
+ - **Aggregations**: `groupBy`, `agg`, `count`, `sum`, `avg`, `min`, `max`, `median`, `mode`
200
+ - **Joins**: `inner`, `left`, `right`, `outer`, `cross`
201
+ - **Advanced**: `union`, `pivot`, `unpivot`, `explode`, `transform`
202
+
203
+ ### Window Functions
204
+
205
+ ```python
206
+ from sparkless.sql import Window, functions as F
207
+
208
+ # Ranking and analytics
209
+ df = spark.createDataFrame([
210
+ {"name": "Alice", "dept": "IT", "salary": 50000},
211
+ {"name": "Bob", "dept": "HR", "salary": 60000},
212
+ {"name": "Charlie", "dept": "IT", "salary": 70000},
213
+ ])
214
+
215
+ result = df.withColumn("rank", F.row_number().over(
216
+ Window.partitionBy("dept").orderBy("salary")
217
+ ))
218
+
219
+ # Show results
220
+ for row in result.collect():
221
+ print(row)
222
+ # Output:
223
+ # Row(dept='HR', name='Bob', salary=60000, rank=1)
224
+ # Row(dept='IT', name='Alice', salary=50000, rank=1)
225
+ # Row(dept='IT', name='Charlie', salary=70000, rank=2)
226
+ ```
227
+
228
+ ### SQL Support
229
+
230
+ ```python
231
+ df = spark.createDataFrame([
232
+ {"name": "Alice", "salary": 50000},
233
+ {"name": "Bob", "salary": 60000},
234
+ {"name": "Charlie", "salary": 70000},
235
+ ])
236
+
237
+ # Create temporary view for SQL queries
238
+ df.createOrReplaceTempView("employees")
239
+
240
+ # Execute SQL queries
241
+ result = spark.sql("SELECT name, salary FROM employees WHERE salary > 50000")
242
+ result.show()
243
+ # SQL support enables querying DataFrames using SQL syntax
244
+ ```
245
+
246
+ ### Delta Lake Format
247
+
248
+ Full Delta Lake table format support:
249
+
250
+ ```python
251
+ # Write as Delta table
252
+ df.write.format("delta").mode("overwrite").saveAsTable("catalog.users")
253
+
254
+ # Time travel - query historical versions
255
+ v0_data = spark.read.format("delta").option("versionAsOf", 0).table("catalog.users")
256
+
257
+ # Schema evolution
258
+ new_df.write.format("delta") \
259
+ .mode("append") \
260
+ .option("mergeSchema", "true") \
261
+ .saveAsTable("catalog.users")
262
+
263
+ # MERGE operations for upserts
264
+ spark.sql("""
265
+ MERGE INTO catalog.users AS target
266
+ USING updates AS source
267
+ ON target.id = source.id
268
+ WHEN MATCHED THEN UPDATE SET *
269
+ WHEN NOT MATCHED THEN INSERT *
270
+ """)
271
+ ```
272
+
273
+ ### Lazy Evaluation
274
+
275
+ Sparkless mirrors PySpark's lazy execution model:
276
+
277
+ ```python
278
+ # Transformations are queued (not executed)
279
+ result = df.filter(F.col("age") > 25).select("name")
280
+
281
+ # Actions trigger execution
282
+ rows = result.collect() # ← Execution happens here
283
+ count = result.count() # ← Or here
284
+ ```
285
+
286
+ ### CTE Query Optimization
287
+
288
+ DataFrame operation chains are automatically optimized using Common Table Expressions:
289
+
290
+ ```python
291
+ # Enable lazy evaluation for CTE optimization
292
+ data = [
293
+ {"name": "Alice", "age": 25, "salary": 50000},
294
+ {"name": "Bob", "age": 30, "salary": 60000},
295
+ {"name": "Charlie", "age": 35, "salary": 70000},
296
+ {"name": "David", "age": 28, "salary": 55000},
297
+ ]
298
+ df = spark.createDataFrame(data)
299
+
300
+ # This entire chain executes as ONE optimized query:
301
+ result = (
302
+ df.filter(F.col("age") > 25) # CTE 0: WHERE clause
303
+ .select("name", "age", "salary") # CTE 1: Column selection
304
+ .withColumn("bonus", F.col("salary") * 0.1) # CTE 2: New column
305
+ .orderBy(F.desc("salary")) # CTE 3: ORDER BY
306
+ .limit(2) # CTE 4: LIMIT
307
+ ).collect() # Single query execution here
308
+
309
+ # Result:
310
+ # [Row(name='Charlie', age=35, salary=70000, bonus=7000.0),
311
+ # Row(name='Bob', age=30, salary=60000, bonus=6000.0)]
312
+
313
+ # Performance: 5-10x faster than creating 5 intermediate tables
314
+ ```
315
+
316
+ ---
317
+
318
+ ## Backend Architecture
319
+
320
+ ### Polars Backend (Default)
321
+
322
+ Sparkless uses **Polars** as the default backend, providing:
323
+
324
+ - 🧵 **Thread Safety** - Designed for parallel execution
325
+ - ⚡ **High Performance** - Optimized DataFrame operations
326
+ - 📊 **Parquet Storage** - Tables persist as Parquet files
327
+ - 🔄 **Lazy Evaluation** - Automatic query optimization
328
+
329
+ ```python
330
+ # Default backend (Polars) - thread-safe, high-performance
331
+ spark = SparkSession("MyApp")
332
+
333
+ # Explicit backend selection
334
+ spark = SparkSession.builder \
335
+ .config("spark.sparkless.backend", "polars") \
336
+ .getOrCreate()
337
+ ```
338
+
339
+ ### Alternative Backends
340
+
341
+ ```python
342
+ # Memory backend for lightweight testing
343
+ spark = SparkSession.builder \
344
+ .config("spark.sparkless.backend", "memory") \
345
+ .getOrCreate()
346
+
347
+ # File backend for persistent storage
348
+ spark = SparkSession.builder \
349
+ .config("spark.sparkless.backend", "file") \
350
+ .config("spark.sparkless.backend.basePath", "/tmp/sparkless") \
351
+ .getOrCreate()
352
+ ```
353
+
354
+ ---
355
+
356
+ ## Advanced Features
357
+
358
+ ### Table Persistence
359
+
360
+ Tables created with `saveAsTable()` can persist across multiple sessions:
361
+
362
+ ```python
363
+ # First session - create table
364
+ spark1 = SparkSession("App1", db_path="test.db")
365
+ df = spark1.createDataFrame([{"id": 1, "name": "Alice"}])
366
+ df.write.mode("overwrite").saveAsTable("schema.my_table")
367
+ spark1.stop()
368
+
369
+ # Second session - table persists
370
+ spark2 = SparkSession("App2", db_path="test.db")
371
+ assert spark2.catalog.tableExists("schema", "my_table") # ✅ True
372
+ result = spark2.table("schema.my_table").collect() # ✅ Works!
373
+ spark2.stop()
374
+ ```
375
+
376
+ **Key Features:**
377
+ - **Cross-Session Persistence**: Tables persist when using `db_path` parameter
378
+ - **Schema Discovery**: Automatically discovers existing schemas and tables
379
+ - **Catalog Synchronization**: Reliable `catalog.tableExists()` checks
380
+ - **Data Integrity**: Full support for `append` and `overwrite` modes
381
+
382
+ ### Configurable Memory & Isolation
383
+
384
+ Control memory usage and test isolation:
385
+
386
+ ```python
387
+ # Default: 1GB memory limit, no disk spillover (best for tests)
388
+ spark = SparkSession("MyApp")
389
+
390
+ # Custom memory limit
391
+ spark = SparkSession("MyApp", max_memory="4GB")
392
+
393
+ # Allow disk spillover for large datasets
394
+ spark = SparkSession(
395
+ "MyApp",
396
+ max_memory="8GB",
397
+ allow_disk_spillover=True # Uses unique temp directory per session
398
+ )
399
+ ```
400
+
401
+ ---
402
+
403
+ ## Performance Comparison
404
+
405
+ Real-world test suite improvements:
406
+
407
+ | Operation | PySpark | Sparkless | Speedup |
408
+ |-----------|---------|------------|---------|
409
+ | Session Creation | 30-45s | 0.1s | **300x** |
410
+ | Simple Query | 2-5s | 0.01s | **200x** |
411
+ | Window Functions | 5-10s | 0.05s | **100x** |
412
+ | Full Test Suite | 5-10min | 30-60s | **10x** |
413
+
414
+ ### Performance Tooling
415
+
416
+ - [Hot path profiling guide](docs/performance/profiling.md)
417
+ - [Pandas fallback vs native benchmarks](docs/performance/pandas_fallback.md)
418
+
419
+ ---
420
+
421
+
422
+
423
+ ## Recent Updates
424
+
425
+ ### Version 3.23.0 - Issues 225-231 Fixes & PySpark Compatibility Improvements
426
+
427
+ - ๐Ÿ› **Issue Fixes** โ€“ Fixed 7 critical issues (225-231) improving PySpark compatibility:
428
+ - **Issue #225**: String-to-numeric type coercion for comparison operations
429
+ - **Issue #226**: `isin()` method with `*values` arguments and type coercion
430
+ - **Issue #227**: `getItem()` out-of-bounds handling (returns `None` instead of errors)
431
+ - **Issue #228**: Regex look-ahead/look-behind fallback support
432
+ - **Issue #229**: Pandas DataFrame support with proper recognition
433
+ - **Issue #230**: Case-insensitive column name matching across all operations
434
+ - **Issue #231**: `simpleString()` method for all DataType classes
435
+ - ๐Ÿ”ง **SQL JOIN Parsing** โ€“ Fixed SQL JOIN condition parsing and validation
436
+ - โœ… **select() Validation** โ€“ Fixed validation to properly handle ColumnOperation expressions
437
+ - ๐Ÿงช **Test Coverage** โ€“ All 50 tests passing for issues 225-231, including pandas DataFrame support
438
+ - ๐Ÿ“ฆ **Code Quality** โ€“ Applied ruff formatting, fixed linting issues, and resolved mypy type errors
439
+
440
+ ### Version 3.20.0 - Logic Bug Fixes & Code Quality Improvements
441
+
442
+ - ๐Ÿ› **Exception Handling Fixes** โ€“ Fixed critical exception handling issues (issue #183): replaced bare `except:` clause with `except Exception:` and added comprehensive logging to exception handlers for better debuggability.
443
+ - ๐Ÿงช **Comprehensive Test Coverage** โ€“ Added 10 comprehensive test cases for string concatenation cache handling edge cases (issue #188), covering empty strings, None values, nested operations, and numeric vs string operations.
444
+ - ๐Ÿ“š **Improved Documentation** โ€“ Enhanced documentation for string concatenation cache heuristic, documenting limitations and expected behavior vs PySpark.
445
+ - ๐Ÿ” **Code Quality Review** โ€“ Systematic review of dictionary.get() usage patterns throughout codebase, confirming all patterns are safe with appropriate default values.
446
+ - โœ… **Type Safety** โ€“ Fixed mypy errors in CI: improved type narrowing for ColumnOperation.operation and removed redundant casts in writer.py.
447
+
448
+ ### Version 3.7.0 - Full SQL DDL/DML Support
449
+
450
+ - ๐Ÿ—„๏ธ **Complete SQL DDL/DML** โ€“ Full implementation of `CREATE TABLE`, `DROP TABLE`, `INSERT INTO`, `UPDATE`, and `DELETE FROM` statements in the SQL executor.
451
+ - ๐Ÿ“ **Enhanced SQL Parser** โ€“ Comprehensive support for DDL statements with column definitions, `IF NOT EXISTS`, and `IF EXISTS` clauses.
452
+ - ๐Ÿ’พ **INSERT Operations** โ€“ Support for `INSERT INTO ... VALUES (...)` with multiple rows and `INSERT INTO ... SELECT ...` sub-queries.
453
+ - ๐Ÿ”„ **UPDATE & DELETE** โ€“ Full support for `UPDATE ... SET ... WHERE ...` and `DELETE FROM ... WHERE ...` with Python-based expression evaluation.
454
+ - ๐Ÿ› **Bug Fixes** โ€“ Fixed recursion errors in schema projection and resolved import shadowing issues in SQL executor.
455
+ - โœจ **Code Quality** โ€“ Improved linting, formatting, and type safety across the codebase.
456
+
457
+ ### Version 3.6.0 - Profiling & Adaptive Execution
458
+
459
+ - โšก **Feature-Flagged Profiling** โ€“ Introduced `sparkless.utils.profiling` with opt-in instrumentation for Polars hot paths and expression evaluation, plus a new guide at `docs/performance/profiling.md`.
460
+ - ๐Ÿ” **Adaptive Execution Simulation** โ€“ Query plans can now inject synthetic `REPARTITION` steps based on skew metrics, configurable via `QueryOptimizer.configure_adaptive_execution` and covered by new regression tests.
461
+ - ๐Ÿผ **Pandas Backend Choice** โ€“ Added an optional native pandas mode (`MOCK_SPARK_PANDAS_MODE`) with benchmarking support (`scripts/benchmark_pandas_fallback.py`) and documentation in `docs/performance/pandas_fallback.md`.
462
+
463
+ ### Version 3.5.0 - Session-Aware Catalog & Safer Fallbacks
464
+
465
+ - ๐Ÿงญ **Session-Literal Helpers** โ€“ `F.current_catalog`, `F.current_database`, `F.current_schema`, and `F.current_user` return PySpark-compatible literals and understand the active session (with new regression coverage).
466
+ - ๐Ÿ—ƒ๏ธ **Reliable Catalog Context** โ€“ The Polars backend and unified storage manager now track the selected schema so `setCurrentDatabase` works end-to-end, and `SparkContext.sparkUser()` mirrors PySpark behaviour.
467
+ - ๐Ÿงฎ **Pure-Python Stats** โ€“ Lightweight `percentile` and `covariance` helpers keep percentile/cov tests green even without NumPy, eliminating native-crash regressions.
468
+ - ๐Ÿ› ๏ธ **Dynamic Dispatch** โ€“ `F.call_function("func_name", ...)` lets wrappers dynamically invoke registered Sparkless functions with PySpark-style error messages.
469
+
470
+ ### Version 3.4.0 - Workflow & CI Refresh
471
+
472
+ - โ™ป๏ธ **Unified Commands** โ€“ `Makefile`, `install.sh`, and docs now point to `bash tests/run_all_tests.sh`, `ruff`, and `mypy` as the standard dev workflow.
473
+ - ๐Ÿ›ก๏ธ **Automated Gates** โ€“ New GitHub Actions pipeline runs linting, type-checking, and the full test suite on every push and PR.
474
+ - ๐Ÿ—บ๏ธ **Forward Roadmap** โ€“ Published `plans/typing_delta_roadmap.md` to track mypy debt reduction and Delta feature milestones.
475
+ - ๐Ÿ“ **Documentation Sweep** โ€“ README and quick-start docs highlight the 3.4.0 tooling changes and contributor expectations.
476
+
477
+ ### Version 3.3.0 - Type Hardening & Clean Type Check
478
+
479
+ - ๐Ÿงฎ **Zero mypy Debt** โ€“ `mypy sparkless` now runs clean after migrating the Polars executor,
480
+ expression evaluator, Delta merge helpers, and reader/writer stack to Python 3.8+ compatible type syntax.
481
+ - ๐Ÿงพ **Accurate DataFrame Interfaces** โ€“ `DataFrameReader.load()` and related helpers now return
482
+ `IDataFrame` consistently while keeping type-only imports behind `TYPE_CHECKING`.
483
+ - ๐Ÿงฑ **Safer Delta & Projection Fallbacks** โ€“ Python-evaluated select columns always receive string
484
+ aliases, and Delta merge alias handling no longer leaks `None` keys into evaluation contexts.
485
+ - ๐Ÿ“š **Docs & Metadata Updated** โ€“ README highlights the new type guarantees and all packaging
486
+ metadata points to v3.3.0.
487
+
488
+ ### Version 3.2.0 - Python 3.8 Baseline & Tooling Refresh
489
+
490
+ - ๐Ÿ **Python 3.8+ Required** โ€“ Packaging metadata, tooling configs, and installation docs now align on Python 3.8 as the minimum supported runtime.
491
+ - ๐Ÿงฉ **Compatibility Layer** โ€“ Uses `typing_extensions` for Python 3.8 compatibility; datetime helpers use native typing with proper fallbacks.
492
+ - ๐Ÿช„ **Type Hint Modernization** โ€“ Uses `typing` module generics (`List`, `Dict`, `Tuple`) for Python 3.8 compatibility, with `from __future__ import annotations` for deferred evaluation.
493
+ - ๐Ÿงผ **Ruff Formatting by Default** โ€“ Adopted `ruff format` across the repository, keeping style consistent with the Ruff rule set.
494
+
495
+ ### Version 3.1.0 - Type-Safe Protocols & Tooling
496
+
497
+ - โœ… **260-File Type Coverage** โ€“ DataFrame mixins now implement structural typing protocols (`SupportsDataFrameOps`), giving a clean `mypy` run across the entire project.
498
+ - ๐Ÿงน **Zero Ruff Debt** โ€“ Repository-wide linting is enabled by default; `ruff check` passes with no warnings thanks to tighter casts, imports, and configuration.
499
+ - ๐Ÿงญ **Backend Selection Docs** โ€“ Updated configuration builder and new `docs/backend_selection.md` make it trivial to toggle between Polars, Memory, File, or DuckDB backends.
500
+ - ๐Ÿงช **Delta Schema Evolution Fixes** โ€“ Polars mergeSchema appends now align frames to the on-disk schema, restoring compatibility with evolving Delta tables.
501
+ - ๐Ÿงฐ **Improved Test Harness** โ€“ `tests/run_all_tests.sh` respects virtual environments and ensures documentation examples are executed with the correct interpreter.
502
+
503
+ ### Version 3.0.0+ - Code Quality & Cleanup
504
+
505
+ **Dependency Cleanup & Type Safety:**
506
+
507
+ - ๐Ÿงน **Removed Legacy Dependencies** - Removed unused `sqlglot` dependency (legacy DuckDB/SQL backend code)
508
+ - ๐Ÿ—‘๏ธ **Code Cleanup** - Removed unused legacy SQL translation modules (`sql_translator.py`, `spark_function_mapper.py`)
509
+ - โœ… **Type Safety** - Fixed 177 type errors using `ty` type checker, improved return type annotations
510
+ - ๐Ÿ” **Linting** - Fixed all 63 ruff linting errors, codebase fully formatted
511
+ - โœ… **All Tests Passing** - Full test suite validated (572+ tests, all passing)
512
+ - ๐Ÿ“ฆ **Cleaner Dependencies** - Reduced dependency footprint, faster installation
513
+
514
+ ### Version 3.0.0 - MAJOR UPDATE
515
+
516
+ **Polars Backend Migration:**
517
+
518
+ - ๐Ÿš€ **Polars Backend** - Complete migration to Polars for thread-safe, high-performance operations
519
+ - ๐Ÿงต **Thread Safety** - Polars is thread-safe by design - no more connection locks or threading issues
520
+ - ๐Ÿ“Š **Parquet Storage** - Tables now persist as Parquet files
521
+ - โšก **Performance** - Better performance for DataFrame operations
522
+ - โœ… **All tests passing** - Full test suite validated with Polars backend
523
+ - ๐Ÿ“ฆ **Production-ready** - Stable release with improved architecture
524
+
525
+ See [Migration Guide](docs/migration_from_v2_to_v3.md) for details.
526
+
527
+ ---
528
+
529
+ ## Documentation
530
+
531
+ ### Getting Started
532
+ - ๐Ÿ“– [Installation & Setup](docs/getting_started.md)
533
+ - ๐ŸŽฏ [Quick Start Guide](docs/getting_started.md#quick-start)
534
+ - ๐Ÿ”„ [Migration from PySpark](docs/guides/migration.md)
535
+
536
+ ### Related Packages
537
+ - ๐Ÿ”ง [spark-ddl-parser](https://github.com/eddiethedean/spark-ddl-parser) - Zero-dependency PySpark DDL schema parser
538
+
539
+ ### Core Concepts
540
+ - ๐Ÿ“Š [API Reference](docs/api_reference.md)
541
+ - ๐Ÿ”„ [Lazy Evaluation](docs/guides/lazy_evaluation.md)
542
+ - ๐Ÿ—„๏ธ [SQL Operations](docs/sql_operations_guide.md)
543
+ - ๐Ÿ’พ [Storage & Persistence](docs/storage_serialization_guide.md)
544
+
545
+ ### Advanced Topics
546
+ - โš™๏ธ [Configuration](docs/guides/configuration.md)
547
+ - ๐Ÿ“ˆ [Benchmarking](docs/guides/benchmarking.md)
548
+ - ๐Ÿ”Œ [Plugins & Hooks](docs/guides/plugins.md)
549
+ - ๐Ÿ [Pytest Integration](docs/guides/pytest_integration.md)
550
+
551
+ ---
552
+
553
+ ## Development Setup
554
+
555
+ ```bash
556
+ # Install for development
557
+ git clone https://github.com/eddiethedean/sparkless.git
558
+ cd sparkless
559
+ pip install -e ".[dev]"
560
+
561
+ # Run all tests (with proper isolation)
562
+ bash tests/run_all_tests.sh
563
+
564
+ # Format code
565
+ ruff format .
566
+ ruff check . --fix
567
+
568
+ # Type checking
569
+ mypy sparkless tests
570
+
571
+ # Linting
572
+ ruff check .
573
+ ```
574
+
575
+ ---
576
+
577
+ ## Contributing
578
+
579
+ We welcome contributions! Areas of interest:
580
+
581
+ - โšก **Performance** - Further Polars optimizations
582
+ - ๐Ÿ“š **Documentation** - Examples, guides, tutorials
583
+ - ๐Ÿ› **Bug Fixes** - Edge cases and compatibility issues
584
+ - ๐Ÿงช **PySpark API Coverage** - Additional functions and methods
585
+ - ๐Ÿงช **Tests** - Additional test coverage and scenarios
586
+
587
+ ---
588
+
589
+ ## Known Limitations
590
+
591
+ While Sparkless provides comprehensive PySpark compatibility, some advanced features are planned for future releases:
592
+
593
+ - **Error Handling**: Enhanced error messages with recovery strategies
594
+ - **Performance**: Advanced query optimization, parallel execution, intelligent caching
595
+ - **Enterprise**: Schema evolution, data lineage, audit logging
596
+ - **Compatibility**: PySpark 3.6+, Iceberg support
597
+
598
+ **Want to contribute?** These are great opportunities for community contributions!
599
+
600
+ ---
601
+
602
+ ## License
603
+
604
+ MIT License - see [LICENSE](LICENSE) file for details.
605
+
606
+ ---
607
+
608
+ ## Links
609
+
610
+ - **GitHub**: [github.com/eddiethedean/sparkless](https://github.com/eddiethedean/sparkless)
611
+ - **PyPI**: [pypi.org/project/sparkless](https://pypi.org/project/sparkless/)
612
+ - **Issues**: [github.com/eddiethedean/sparkless/issues](https://github.com/eddiethedean/sparkless/issues)
613
+ - **Documentation**: [Full documentation](docs/)
614
+
615
+ ---
616
+
617
+ <div align="center">
618
+
619
+ **Built with โค๏ธ for the PySpark community**
620
+
621
+ *Star โญ this repo if Sparkless helps speed up your tests!*
622
+
623
+ </div>