sparkless 3.20.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (231) hide show
  1. sparkless-3.20.0/PKG-INFO +636 -0
  2. sparkless-3.20.0/README.md +584 -0
  3. sparkless-3.20.0/pyproject.toml +232 -0
  4. sparkless-3.20.0/setup.cfg +4 -0
  5. sparkless-3.20.0/sparkless/__init__.py +218 -0
  6. sparkless-3.20.0/sparkless/_version.py +19 -0
  7. sparkless-3.20.0/sparkless/backend/__init__.py +33 -0
  8. sparkless-3.20.0/sparkless/backend/factory.py +247 -0
  9. sparkless-3.20.0/sparkless/backend/polars/__init__.py +29 -0
  10. sparkless-3.20.0/sparkless/backend/polars/export.py +114 -0
  11. sparkless-3.20.0/sparkless/backend/polars/expression_translator.py +3264 -0
  12. sparkless-3.20.0/sparkless/backend/polars/materializer.py +1069 -0
  13. sparkless-3.20.0/sparkless/backend/polars/operation_executor.py +1250 -0
  14. sparkless-3.20.0/sparkless/backend/polars/parquet_storage.py +119 -0
  15. sparkless-3.20.0/sparkless/backend/polars/schema_registry.py +222 -0
  16. sparkless-3.20.0/sparkless/backend/polars/schema_utils.py +69 -0
  17. sparkless-3.20.0/sparkless/backend/polars/storage.py +771 -0
  18. sparkless-3.20.0/sparkless/backend/polars/type_mapper.py +146 -0
  19. sparkless-3.20.0/sparkless/backend/polars/window_handler.py +377 -0
  20. sparkless-3.20.0/sparkless/backend/protocols.py +121 -0
  21. sparkless-3.20.0/sparkless/compat/__init__.py +17 -0
  22. sparkless-3.20.0/sparkless/compat/datetime.py +168 -0
  23. sparkless-3.20.0/sparkless/config.py +105 -0
  24. sparkless-3.20.0/sparkless/core/__init__.py +77 -0
  25. sparkless-3.20.0/sparkless/core/condition_evaluator.py +1343 -0
  26. sparkless-3.20.0/sparkless/core/data_validation.py +221 -0
  27. sparkless-3.20.0/sparkless/core/ddl_adapter.py +133 -0
  28. sparkless-3.20.0/sparkless/core/exceptions/__init__.py +45 -0
  29. sparkless-3.20.0/sparkless/core/exceptions/analysis.py +323 -0
  30. sparkless-3.20.0/sparkless/core/exceptions/base.py +91 -0
  31. sparkless-3.20.0/sparkless/core/exceptions/execution.py +153 -0
  32. sparkless-3.20.0/sparkless/core/exceptions/operation.py +206 -0
  33. sparkless-3.20.0/sparkless/core/exceptions/py4j_compat.py +47 -0
  34. sparkless-3.20.0/sparkless/core/exceptions/runtime.py +193 -0
  35. sparkless-3.20.0/sparkless/core/exceptions/validation.py +207 -0
  36. sparkless-3.20.0/sparkless/core/interfaces/__init__.py +43 -0
  37. sparkless-3.20.0/sparkless/core/interfaces/dataframe.py +320 -0
  38. sparkless-3.20.0/sparkless/core/interfaces/functions.py +348 -0
  39. sparkless-3.20.0/sparkless/core/interfaces/session.py +219 -0
  40. sparkless-3.20.0/sparkless/core/interfaces/storage.py +253 -0
  41. sparkless-3.20.0/sparkless/core/protocols.py +120 -0
  42. sparkless-3.20.0/sparkless/core/safe_evaluator.py +215 -0
  43. sparkless-3.20.0/sparkless/core/schema_inference.py +285 -0
  44. sparkless-3.20.0/sparkless/core/type_utils.py +199 -0
  45. sparkless-3.20.0/sparkless/core/types/__init__.py +35 -0
  46. sparkless-3.20.0/sparkless/core/types/data_types.py +264 -0
  47. sparkless-3.20.0/sparkless/core/types/metadata.py +288 -0
  48. sparkless-3.20.0/sparkless/core/types/schema.py +226 -0
  49. sparkless-3.20.0/sparkless/data_generation/__init__.py +18 -0
  50. sparkless-3.20.0/sparkless/data_generation/builder.py +85 -0
  51. sparkless-3.20.0/sparkless/data_generation/convenience.py +64 -0
  52. sparkless-3.20.0/sparkless/data_generation/generator.py +337 -0
  53. sparkless-3.20.0/sparkless/dataframe/__init__.py +27 -0
  54. sparkless-3.20.0/sparkless/dataframe/aggregations/__init__.py +10 -0
  55. sparkless-3.20.0/sparkless/dataframe/aggregations/operations.py +183 -0
  56. sparkless-3.20.0/sparkless/dataframe/assertions/__init__.py +10 -0
  57. sparkless-3.20.0/sparkless/dataframe/assertions/assertions.py +88 -0
  58. sparkless-3.20.0/sparkless/dataframe/assertions/operations.py +45 -0
  59. sparkless-3.20.0/sparkless/dataframe/attribute_handler.py +76 -0
  60. sparkless-3.20.0/sparkless/dataframe/casting/__init__.py +5 -0
  61. sparkless-3.20.0/sparkless/dataframe/casting/type_converter.py +180 -0
  62. sparkless-3.20.0/sparkless/dataframe/collection_handler.py +63 -0
  63. sparkless-3.20.0/sparkless/dataframe/condition_handler.py +178 -0
  64. sparkless-3.20.0/sparkless/dataframe/dataframe.py +1380 -0
  65. sparkless-3.20.0/sparkless/dataframe/display/__init__.py +6 -0
  66. sparkless-3.20.0/sparkless/dataframe/display/formatter.py +57 -0
  67. sparkless-3.20.0/sparkless/dataframe/display/operations.py +274 -0
  68. sparkless-3.20.0/sparkless/dataframe/evaluation/__init__.py +5 -0
  69. sparkless-3.20.0/sparkless/dataframe/evaluation/expression_evaluator.py +4110 -0
  70. sparkless-3.20.0/sparkless/dataframe/export.py +46 -0
  71. sparkless-3.20.0/sparkless/dataframe/grouped/__init__.py +18 -0
  72. sparkless-3.20.0/sparkless/dataframe/grouped/base.py +2012 -0
  73. sparkless-3.20.0/sparkless/dataframe/grouped/cube.py +212 -0
  74. sparkless-3.20.0/sparkless/dataframe/grouped/pivot.py +252 -0
  75. sparkless-3.20.0/sparkless/dataframe/grouped/rollup.py +215 -0
  76. sparkless-3.20.0/sparkless/dataframe/joins/__init__.py +10 -0
  77. sparkless-3.20.0/sparkless/dataframe/joins/operations.py +350 -0
  78. sparkless-3.20.0/sparkless/dataframe/lazy.py +2392 -0
  79. sparkless-3.20.0/sparkless/dataframe/operations/__init__.py +10 -0
  80. sparkless-3.20.0/sparkless/dataframe/operations/aggregation_operations.py +313 -0
  81. sparkless-3.20.0/sparkless/dataframe/operations/join_operations.py +329 -0
  82. sparkless-3.20.0/sparkless/dataframe/operations/misc.py +1376 -0
  83. sparkless-3.20.0/sparkless/dataframe/operations/set_operations.py +261 -0
  84. sparkless-3.20.0/sparkless/dataframe/protocols.py +443 -0
  85. sparkless-3.20.0/sparkless/dataframe/rdd.py +262 -0
  86. sparkless-3.20.0/sparkless/dataframe/reader.py +477 -0
  87. sparkless-3.20.0/sparkless/dataframe/schema/__init__.py +6 -0
  88. sparkless-3.20.0/sparkless/dataframe/schema/operations.py +51 -0
  89. sparkless-3.20.0/sparkless/dataframe/schema/schema_manager.py +639 -0
  90. sparkless-3.20.0/sparkless/dataframe/services/__init__.py +24 -0
  91. sparkless-3.20.0/sparkless/dataframe/services/aggregation_service.py +176 -0
  92. sparkless-3.20.0/sparkless/dataframe/services/assertion_service.py +53 -0
  93. sparkless-3.20.0/sparkless/dataframe/services/display_service.py +280 -0
  94. sparkless-3.20.0/sparkless/dataframe/services/join_service.py +439 -0
  95. sparkless-3.20.0/sparkless/dataframe/services/misc_service.py +1343 -0
  96. sparkless-3.20.0/sparkless/dataframe/services/schema_service.py +24 -0
  97. sparkless-3.20.0/sparkless/dataframe/services/transformation_service.py +609 -0
  98. sparkless-3.20.0/sparkless/dataframe/sqlalchemy_query_builder.py +305 -0
  99. sparkless-3.20.0/sparkless/dataframe/transformations/__init__.py +10 -0
  100. sparkless-3.20.0/sparkless/dataframe/transformations/operations.py +582 -0
  101. sparkless-3.20.0/sparkless/dataframe/types.py +25 -0
  102. sparkless-3.20.0/sparkless/dataframe/validation/__init__.py +9 -0
  103. sparkless-3.20.0/sparkless/dataframe/validation/column_validator.py +416 -0
  104. sparkless-3.20.0/sparkless/dataframe/validation_handler.py +92 -0
  105. sparkless-3.20.0/sparkless/dataframe/window_handler.py +598 -0
  106. sparkless-3.20.0/sparkless/dataframe/writer.py +1027 -0
  107. sparkless-3.20.0/sparkless/delta.py +624 -0
  108. sparkless-3.20.0/sparkless/error_simulation.py +338 -0
  109. sparkless-3.20.0/sparkless/errors.py +153 -0
  110. sparkless-3.20.0/sparkless/functions/__init__.py +602 -0
  111. sparkless-3.20.0/sparkless/functions/aggregate.py +1175 -0
  112. sparkless-3.20.0/sparkless/functions/array.py +1076 -0
  113. sparkless-3.20.0/sparkless/functions/base.py +236 -0
  114. sparkless-3.20.0/sparkless/functions/bitwise.py +428 -0
  115. sparkless-3.20.0/sparkless/functions/conditional.py +916 -0
  116. sparkless-3.20.0/sparkless/functions/core/__init__.py +35 -0
  117. sparkless-3.20.0/sparkless/functions/core/column.py +717 -0
  118. sparkless-3.20.0/sparkless/functions/core/expressions.py +323 -0
  119. sparkless-3.20.0/sparkless/functions/core/lambda_parser.py +385 -0
  120. sparkless-3.20.0/sparkless/functions/core/literals.py +287 -0
  121. sparkless-3.20.0/sparkless/functions/core/operations.py +221 -0
  122. sparkless-3.20.0/sparkless/functions/core/sql_expr_parser.py +436 -0
  123. sparkless-3.20.0/sparkless/functions/crypto.py +155 -0
  124. sparkless-3.20.0/sparkless/functions/datetime.py +1761 -0
  125. sparkless-3.20.0/sparkless/functions/functions.py +3172 -0
  126. sparkless-3.20.0/sparkless/functions/json_csv.py +163 -0
  127. sparkless-3.20.0/sparkless/functions/map.py +403 -0
  128. sparkless-3.20.0/sparkless/functions/math.py +1040 -0
  129. sparkless-3.20.0/sparkless/functions/metadata.py +114 -0
  130. sparkless-3.20.0/sparkless/functions/ordering.py +95 -0
  131. sparkless-3.20.0/sparkless/functions/pandas_types.py +30 -0
  132. sparkless-3.20.0/sparkless/functions/string.py +1910 -0
  133. sparkless-3.20.0/sparkless/functions/udf.py +155 -0
  134. sparkless-3.20.0/sparkless/functions/window_execution.py +994 -0
  135. sparkless-3.20.0/sparkless/functions/xml.py +279 -0
  136. sparkless-3.20.0/sparkless/optimizer/__init__.py +24 -0
  137. sparkless-3.20.0/sparkless/optimizer/optimization_rules.py +374 -0
  138. sparkless-3.20.0/sparkless/optimizer/query_optimizer.py +524 -0
  139. sparkless-3.20.0/sparkless/performance_simulation.py +329 -0
  140. sparkless-3.20.0/sparkless/py.typed +0 -0
  141. sparkless-3.20.0/sparkless/session/__init__.py +34 -0
  142. sparkless-3.20.0/sparkless/session/catalog.py +719 -0
  143. sparkless-3.20.0/sparkless/session/config/__init__.py +20 -0
  144. sparkless-3.20.0/sparkless/session/config/configuration.py +232 -0
  145. sparkless-3.20.0/sparkless/session/context.py +131 -0
  146. sparkless-3.20.0/sparkless/session/core/__init__.py +18 -0
  147. sparkless-3.20.0/sparkless/session/core/builder.py +90 -0
  148. sparkless-3.20.0/sparkless/session/core/session.py +663 -0
  149. sparkless-3.20.0/sparkless/session/performance_tracker.py +117 -0
  150. sparkless-3.20.0/sparkless/session/services/__init__.py +31 -0
  151. sparkless-3.20.0/sparkless/session/services/dataframe_factory.py +405 -0
  152. sparkless-3.20.0/sparkless/session/services/lifecycle_manager.py +61 -0
  153. sparkless-3.20.0/sparkless/session/services/mocking_coordinator.py +109 -0
  154. sparkless-3.20.0/sparkless/session/services/protocols.py +97 -0
  155. sparkless-3.20.0/sparkless/session/services/sql_parameter_binder.py +74 -0
  156. sparkless-3.20.0/sparkless/session/session.py +29 -0
  157. sparkless-3.20.0/sparkless/session/sql/__init__.py +27 -0
  158. sparkless-3.20.0/sparkless/session/sql/executor.py +2345 -0
  159. sparkless-3.20.0/sparkless/session/sql/optimizer.py +260 -0
  160. sparkless-3.20.0/sparkless/session/sql/parser.py +1127 -0
  161. sparkless-3.20.0/sparkless/session/sql/validation.py +323 -0
  162. sparkless-3.20.0/sparkless/spark_types.py +888 -0
  163. sparkless-3.20.0/sparkless/sql/__init__.py +69 -0
  164. sparkless-3.20.0/sparkless/sql/functions.py +88 -0
  165. sparkless-3.20.0/sparkless/sql/types.py +57 -0
  166. sparkless-3.20.0/sparkless/sql/utils.py +84 -0
  167. sparkless-3.20.0/sparkless/storage/__init__.py +86 -0
  168. sparkless-3.20.0/sparkless/storage/backends/__init__.py +0 -0
  169. sparkless-3.20.0/sparkless/storage/backends/file.py +527 -0
  170. sparkless-3.20.0/sparkless/storage/backends/memory.py +430 -0
  171. sparkless-3.20.0/sparkless/storage/manager.py +432 -0
  172. sparkless-3.20.0/sparkless/storage/models.py +153 -0
  173. sparkless-3.20.0/sparkless/storage/serialization/__init__.py +0 -0
  174. sparkless-3.20.0/sparkless/storage/serialization/csv.py +120 -0
  175. sparkless-3.20.0/sparkless/storage/serialization/json.py +118 -0
  176. sparkless-3.20.0/sparkless/storage/sqlalchemy_helpers.py +343 -0
  177. sparkless-3.20.0/sparkless/utils/profiling.py +188 -0
  178. sparkless-3.20.0/sparkless/utils/statistics.py +83 -0
  179. sparkless-3.20.0/sparkless/window.py +205 -0
  180. sparkless-3.20.0/sparkless.egg-info/PKG-INFO +636 -0
  181. sparkless-3.20.0/sparkless.egg-info/SOURCES.txt +229 -0
  182. sparkless-3.20.0/sparkless.egg-info/dependency_links.txt +1 -0
  183. sparkless-3.20.0/sparkless.egg-info/requires.txt +33 -0
  184. sparkless-3.20.0/sparkless.egg-info/top_level.txt +1 -0
  185. sparkless-3.20.0/tests/test_backend_capability_model.py +194 -0
  186. sparkless-3.20.0/tests/test_column_availability.py +123 -0
  187. sparkless-3.20.0/tests/test_delta_lake_schema_evolution.py +639 -0
  188. sparkless-3.20.0/tests/test_fixture_compatibility.py +75 -0
  189. sparkless-3.20.0/tests/test_function_api_compatibility.py +83 -0
  190. sparkless-3.20.0/tests/test_issue_135_datetime_filter.py +167 -0
  191. sparkless-3.20.0/tests/test_issue_136_column_rename_validation.py +123 -0
  192. sparkless-3.20.0/tests/test_issue_137_datetime_validation.py +100 -0
  193. sparkless-3.20.0/tests/test_issue_138_column_drop_reference.py +144 -0
  194. sparkless-3.20.0/tests/test_issue_139_datetime_validation_compatibility.py +185 -0
  195. sparkless-3.20.0/tests/test_issue_145_string_cast.py +91 -0
  196. sparkless-3.20.0/tests/test_issue_149_to_timestamp_string.py +106 -0
  197. sparkless-3.20.0/tests/test_issue_151_to_timestamp_validation.py +102 -0
  198. sparkless-3.20.0/tests/test_issue_152_sql_column_aliases.py +106 -0
  199. sparkless-3.20.0/tests/test_issue_153_to_timestamp_returns_none.py +85 -0
  200. sparkless-3.20.0/tests/test_issue_156_select_dropped_column.py +67 -0
  201. sparkless-3.20.0/tests/test_issue_158_dropped_column_error.py +118 -0
  202. sparkless-3.20.0/tests/test_issue_160_actual_bug_reproduction.py +191 -0
  203. sparkless-3.20.0/tests/test_issue_160_cache_key_reproduction.py +205 -0
  204. sparkless-3.20.0/tests/test_issue_160_dropped_column_execution_plan.py +191 -0
  205. sparkless-3.20.0/tests/test_issue_160_exact_150_rows.py +130 -0
  206. sparkless-3.20.0/tests/test_issue_160_force_bug_reproduction.py +125 -0
  207. sparkless-3.20.0/tests/test_issue_160_lazy_frame_execution_plan.py +314 -0
  208. sparkless-3.20.0/tests/test_issue_160_lazy_polars_expr.py +264 -0
  209. sparkless-3.20.0/tests/test_issue_160_manual_cache_manipulation.py +143 -0
  210. sparkless-3.20.0/tests/test_issue_160_nested_operations.py +159 -0
  211. sparkless-3.20.0/tests/test_issue_160_reproduce_actual_bug.py +171 -0
  212. sparkless-3.20.0/tests/test_issue_160_reproduce_bug.py +197 -0
  213. sparkless-3.20.0/tests/test_issue_160_with_cache_enabled.py +84 -0
  214. sparkless-3.20.0/tests/test_issue_160_without_fix.py +154 -0
  215. sparkless-3.20.0/tests/test_issue_163_validation_after_drop.py +58 -0
  216. sparkless-3.20.0/tests/test_issue_164_schema_inference_numeric.py +108 -0
  217. sparkless-3.20.0/tests/test_issue_165_to_date_timestamp_type.py +130 -0
  218. sparkless-3.20.0/tests/test_issue_166_unix_timestamp.py +124 -0
  219. sparkless-3.20.0/tests/test_issue_168_validation_after_drop.py +148 -0
  220. sparkless-3.20.0/tests/test_issue_169_to_timestamp_drop_error.py +165 -0
  221. sparkless-3.20.0/tests/test_issue_170_to_date_timestamp_type.py +152 -0
  222. sparkless-3.20.0/tests/test_issue_173_validation_during_materialization.py +69 -0
  223. sparkless-3.20.0/tests/test_issue_188_string_concat_cache.py +222 -0
  224. sparkless-3.20.0/tests/test_issue_200_list_rows_with_column_schema.py +146 -0
  225. sparkless-3.20.0/tests/test_issue_202_select_with_list.py +144 -0
  226. sparkless-3.20.0/tests/test_issue_203_filter_with_string.py +150 -0
  227. sparkless-3.20.0/tests/test_notebooks.py +211 -0
  228. sparkless-3.20.0/tests/test_sparkcontext_validation.py +141 -0
  229. sparkless-3.20.0/tests/test_to_timestamp_compatibility.py +233 -0
  230. sparkless-3.20.0/tests/test_type_strictness.py +165 -0
  231. sparkless-3.20.0/tests/test_with_timeout.py +99 -0
@@ -0,0 +1,636 @@
1
+ Metadata-Version: 2.1
2
+ Name: sparkless
3
+ Version: 3.20.0
4
+ Summary: Lightning-fast PySpark testing without JVM - 10x faster with 100% API compatibility
5
+ Author-email: Odos Matthews <odosmatthews@gmail.com>
6
+ Maintainer-email: Odos Matthews <odosmatthews@gmail.com>
7
+ License: MIT
8
+ Project-URL: Homepage, https://github.com/eddiethedean/sparkless
9
+ Project-URL: Repository, https://github.com/eddiethedean/sparkless
10
+ Project-URL: Issues, https://github.com/eddiethedean/sparkless/issues
11
+ Keywords: spark,pyspark,testing,development,data-engineering,dataframe,spark-session,unit-testing,type-safe,mypy,error-simulation,performance-testing,data-generation,enterprise
12
+ Classifier: Development Status :: 5 - Production/Stable
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Operating System :: OS Independent
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.9
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Topic :: Software Development :: Testing
20
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
21
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
22
+ Requires-Python: >=3.9
23
+ Description-Content-Type: text/markdown
24
+ Requires-Dist: spark-ddl-parser>=0.1.0
25
+ Requires-Dist: polars>=0.20.0
26
+ Requires-Dist: psutil>=5.8.0
27
+ Provides-Extra: pandas
28
+ Requires-Dist: pandas>=1.3.0; extra == "pandas"
29
+ Requires-Dist: pandas-stubs>=2.0.0; extra == "pandas"
30
+ Provides-Extra: analytics
31
+ Requires-Dist: pandas>=1.3.0; extra == "analytics"
32
+ Requires-Dist: pandas-stubs>=2.0.0; extra == "analytics"
33
+ Requires-Dist: numpy>=1.20.0; extra == "analytics"
34
+ Requires-Dist: polars[pyarrow]>=0.20.0; extra == "analytics"
35
+ Provides-Extra: dev
36
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
37
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
38
+ Requires-Dist: pytest-xdist>=3.0.0; extra == "dev"
39
+ Requires-Dist: mypy>=1.19.0; extra == "dev"
40
+ Requires-Dist: ruff>=0.4.0; extra == "dev"
41
+ Requires-Dist: pandas>=1.3.0; extra == "dev"
42
+ Requires-Dist: pandas-stubs>=2.0.0; extra == "dev"
43
+ Requires-Dist: types-psutil>=6.0.0; extra == "dev"
44
+ Provides-Extra: test
45
+ Requires-Dist: pytest>=7.0.0; extra == "test"
46
+ Requires-Dist: pytest-cov>=4.0.0; extra == "test"
47
+ Requires-Dist: pytest-xdist>=3.0.0; extra == "test"
48
+ Requires-Dist: hypothesis>=6.0.0; extra == "test"
49
+ Provides-Extra: generate-outputs
50
+ Requires-Dist: pyspark<3.6.0,>=3.5.0; extra == "generate-outputs"
51
+ Requires-Dist: delta-spark<4.0.0,>=3.0.0; extra == "generate-outputs"
52
+
53
+ # Sparkless
54
+
55
+ <div align="center">
56
+
57
+ **🚀 Test PySpark code at lightning speed—no JVM required**
58
+
59
+ [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
60
+ [![PySpark 3.2-3.5](https://img.shields.io/badge/pyspark-3.2--3.5-orange.svg)](https://spark.apache.org/)
61
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
62
+ [![PyPI version](https://badge.fury.io/py/sparkless.svg)](https://badge.fury.io/py/sparkless)
63
+ [![Tests](https://img.shields.io/badge/tests-650+%20passing%20%7C%200%20failing-brightgreen.svg)](https://github.com/eddiethedean/sparkless)
64
+ [![Type Checked](https://img.shields.io/badge/mypy-260%20files%20clean-blue.svg)](https://github.com/python/mypy)
65
+ [![Code Style](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
66
+
67
+ *⚡ 10x faster tests • 🎯 Drop-in PySpark replacement • 📦 Zero JVM overhead • 🧵 Thread-safe Polars backend*
68
+
69
+ </div>
70
+
71
+ ---
72
+
73
+ ## Why Sparkless?
74
+
75
+ **Tired of waiting 30+ seconds for Spark to initialize in every test?**
76
+
77
+ Sparkless is a lightweight PySpark replacement that runs your tests **10x faster** by eliminating JVM overhead. Your existing PySpark code works unchanged—just swap the import.
78
+
79
+ ```python
80
+ # Before
81
+ from pyspark.sql import SparkSession
82
+
83
+ # After
84
+ from sparkless.sql import SparkSession
85
+ ```
86
+
87
+ ### Key Benefits
88
+
89
+ | Feature | Description |
90
+ |---------|-------------|
91
+ | ⚡ **10x Faster** | No JVM startup (30s → 0.1s) |
92
+ | 🎯 **Drop-in Replacement** | Use existing PySpark code unchanged |
93
+ | 📦 **Zero Java** | Pure Python with Polars backend (thread-safe, no SQL required) |
94
+ | 🧪 **100% Compatible** | Full PySpark 3.2-3.5 API support |
95
+ | 🔄 **Lazy Evaluation** | Mirrors PySpark's execution model |
96
+ | 🏭 **Production Ready** | 650+ passing tests, 100% mypy typed |
97
+ | 🧵 **Thread-Safe** | Polars backend designed for parallel execution |
98
+ | 🔧 **Modular Design** | DDL parsing via standalone spark-ddl-parser package |
99
+ | 🎯 **Type Safe** | Full type checking with `ty`, comprehensive type annotations |
100
+
101
+ ### Perfect For
102
+
103
+ - **Unit Testing** - Fast, isolated test execution with automatic cleanup
104
+ - **CI/CD Pipelines** - Reliable tests without infrastructure or resource leaks
105
+ - **Local Development** - Prototype without Spark cluster
106
+ - **Documentation** - Runnable examples without setup
107
+ - **Learning** - Understand PySpark without complexity
108
+ - **Integration Tests** - Configurable memory limits for large dataset testing
109
+
110
+ ---
111
+
112
+ ## Quick Start
113
+
114
+ ### Installation
115
+
116
+ ```bash
117
+ pip install sparkless
118
+ ```
119
+
120
+ ### Basic Usage
121
+
122
+ ```python
123
+ from sparkless.sql import SparkSession, functions as F
124
+
125
+ # Create session
126
+ spark = SparkSession("MyApp")
127
+
128
+ # Your PySpark code works as-is
129
+ data = [{"name": "Alice", "age": 25}, {"name": "Bob", "age": 30}]
130
+ df = spark.createDataFrame(data)
131
+
132
+ # All operations work
133
+ result = df.filter(F.col("age") > 25).select("name").collect()
134
+ print(result)
135
+ # Output: [Row(name='Bob')]
136
+
137
+ # Show the DataFrame
138
+ df.show()
139
+ # Output:
140
+ # DataFrame[2 rows, 2 columns]
141
+ # age name
142
+ # 25 Alice
143
+ # 30 Bob
144
+ ```
145
+
146
+ ### Storage API (Sparkless-Specific)
147
+
148
+ Sparkless provides a convenient `._storage` API for managing databases and tables. **Note:** This is a **sparkless-specific convenience API** that does not exist in PySpark. For PySpark compatibility, use SQL commands or DataFrame operations instead:
149
+
150
+ ```python
151
+ # Sparkless: Using .storage API (convenient but NOT PySpark-compatible)
152
+ spark._storage.create_schema("test_db")
153
+ spark._storage.create_table("test_db", "users", schema)
154
+ spark._storage.insert_data("test_db", "users", data)
155
+ df = spark._storage.query_table("test_db", "users")
156
+
157
+ # Both Sparkless and PySpark: Using SQL commands (recommended for compatibility)
158
+ spark.sql("CREATE DATABASE IF NOT EXISTS test_db")
159
+ spark.sql("CREATE TABLE test_db.users (name STRING, age INT)")
160
+ df.write.saveAsTable("test_db.users") # Write DataFrame to table
161
+ df = spark.table("test_db.users") # Read table as DataFrame
162
+
163
+ # PySpark equivalent for insert_data:
164
+ # df = spark.createDataFrame(data, schema)
165
+ # df.write.mode("append").saveAsTable("test_db.users")
166
+ ```
167
+
168
+ **Migration Guide:**
169
+ - `spark._storage.create_schema()` → `spark.sql("CREATE DATABASE IF NOT EXISTS ...")`
170
+ - `spark._storage.create_table()` → `spark.sql("CREATE TABLE ...")` or `df.write.saveAsTable()`
171
+ - `spark._storage.insert_data()` → `df.write.mode("append").saveAsTable()`
172
+ - `spark._storage.query_table()` → `spark.table()` or `spark.sql("SELECT * FROM ...")`
173
+
174
+ See the [Storage API Guide](docs/storage_api_guide.md) and [Migration Guide](docs/migration_from_pyspark.md) for more details.
175
+
176
+ ### Testing Example
177
+
178
+ ```python
179
+ import pytest
180
+ from sparkless.sql import SparkSession, functions as F
181
+
182
+ def test_data_pipeline():
183
+ """Test PySpark logic without Spark cluster."""
184
+ spark = SparkSession("TestApp")
185
+
186
+ # Test data
187
+ data = [{"score": 95}, {"score": 87}, {"score": 92}]
188
+ df = spark.createDataFrame(data)
189
+
190
+ # Business logic
191
+ high_scores = df.filter(F.col("score") > 90)
192
+
193
+ # Assertions
194
+ assert high_scores.count() == 2
195
+ assert high_scores.agg(F.avg("score")).collect()[0][0] == 93.5
196
+
197
+ # Always clean up
198
+ spark.stop()
199
+ ```
200
+
201
+ ---
202
+
203
+ ## Core Features
204
+
205
+ ### 🚀 Complete PySpark API Compatibility
206
+
207
+ Sparkless implements **120+ functions** and **70+ DataFrame methods** across PySpark 3.2-3.5:
208
+
209
+ | Category | Functions | Examples |
210
+ |----------|-----------|----------|
211
+ | **String** (40+) | Text manipulation, regex, formatting | `upper`, `concat`, `regexp_extract`, `soundex` |
212
+ | **Math** (35+) | Arithmetic, trigonometry, rounding | `abs`, `sqrt`, `sin`, `cos`, `ln` |
213
+ | **DateTime** (30+) | Date/time operations, timezones | `date_add`, `hour`, `weekday`, `convert_timezone` |
214
+ | **Array** (25+) | Array manipulation, lambdas | `array_distinct`, `transform`, `filter`, `aggregate` |
215
+ | **Aggregate** (20+) | Statistical functions | `sum`, `avg`, `median`, `percentile`, `max_by` |
216
+ | **Map** (10+) | Dictionary operations | `map_keys`, `map_filter`, `transform_values` |
217
+ | **Conditional** (8+) | Logic and null handling | `when`, `coalesce`, `ifnull`, `nullif` |
218
+ | **Window** (8+) | Ranking and analytics | `row_number`, `rank`, `lag`, `lead` |
219
+ | **XML** (9+) | XML parsing and generation | `from_xml`, `to_xml`, `xpath_*` |
220
+ | **Bitwise** (6+) | Bit manipulation | `bit_count`, `bit_and`, `bit_xor` |
221
+
222
+ 📖 **See complete function list**: [`PYSPARK_FUNCTION_MATRIX.md`](PYSPARK_FUNCTION_MATRIX.md)
223
+
224
+ ### DataFrame Operations
225
+
226
+ - **Transformations**: `select`, `filter`, `withColumn`, `drop`, `distinct`, `orderBy`, `replace`
227
+ - **Aggregations**: `groupBy`, `agg`, `count`, `sum`, `avg`, `min`, `max`, `median`, `mode`
228
+ - **Joins**: `inner`, `left`, `right`, `outer`, `cross`
229
+ - **Advanced**: `union`, `pivot`, `unpivot`, `explode`, `transform`
230
+
231
+ ### Window Functions
232
+
233
+ ```python
234
+ from sparkless.sql import Window, functions as F
235
+
236
+ # Ranking and analytics
237
+ df = spark.createDataFrame([
238
+ {"name": "Alice", "dept": "IT", "salary": 50000},
239
+ {"name": "Bob", "dept": "HR", "salary": 60000},
240
+ {"name": "Charlie", "dept": "IT", "salary": 70000},
241
+ ])
242
+
243
+ result = df.withColumn("rank", F.row_number().over(
244
+ Window.partitionBy("dept").orderBy("salary")
245
+ ))
246
+
247
+ # Show results
248
+ for row in result.collect():
249
+ print(row)
250
+ # Output:
251
+ # Row(dept='HR', name='Bob', salary=60000, rank=1)
252
+ # Row(dept='IT', name='Alice', salary=50000, rank=1)
253
+ # Row(dept='IT', name='Charlie', salary=70000, rank=2)
254
+ ```
255
+
256
+ ### SQL Support
257
+
258
+ ```python
259
+ df = spark.createDataFrame([
260
+ {"name": "Alice", "salary": 50000},
261
+ {"name": "Bob", "salary": 60000},
262
+ {"name": "Charlie", "salary": 70000},
263
+ ])
264
+
265
+ # Create temporary view for SQL queries
266
+ df.createOrReplaceTempView("employees")
267
+
268
+ # Execute SQL queries
269
+ result = spark.sql("SELECT name, salary FROM employees WHERE salary > 50000")
270
+ result.show()
271
+ # SQL support enables querying DataFrames using SQL syntax
272
+ ```
273
+
274
+ ### Delta Lake Format
275
+
276
+ Full Delta Lake table format support:
277
+
278
+ ```python
279
+ # Write as Delta table
280
+ df.write.format("delta").mode("overwrite").saveAsTable("catalog.users")
281
+
282
+ # Time travel - query historical versions
283
+ v0_data = spark.read.format("delta").option("versionAsOf", 0).table("catalog.users")
284
+
285
+ # Schema evolution
286
+ new_df.write.format("delta") \
287
+ .mode("append") \
288
+ .option("mergeSchema", "true") \
289
+ .saveAsTable("catalog.users")
290
+
291
+ # MERGE operations for upserts
292
+ spark.sql("""
293
+ MERGE INTO catalog.users AS target
294
+ USING updates AS source
295
+ ON target.id = source.id
296
+ WHEN MATCHED THEN UPDATE SET *
297
+ WHEN NOT MATCHED THEN INSERT *
298
+ """)
299
+ ```
300
+
301
+ ### Lazy Evaluation
302
+
303
+ Sparkless mirrors PySpark's lazy execution model:
304
+
305
+ ```python
306
+ # Transformations are queued (not executed)
307
+ result = df.filter(F.col("age") > 25).select("name")
308
+
309
+ # Actions trigger execution
310
+ rows = result.collect() # ← Execution happens here
311
+ count = result.count() # ← Or here
312
+ ```
313
+
314
+ ### CTE Query Optimization
315
+
316
+ DataFrame operation chains are automatically optimized using Common Table Expressions:
317
+
318
+ ```python
319
+ # Enable lazy evaluation for CTE optimization
320
+ data = [
321
+ {"name": "Alice", "age": 25, "salary": 50000},
322
+ {"name": "Bob", "age": 30, "salary": 60000},
323
+ {"name": "Charlie", "age": 35, "salary": 70000},
324
+ {"name": "David", "age": 28, "salary": 55000},
325
+ ]
326
+ df = spark.createDataFrame(data)
327
+
328
+ # This entire chain executes as ONE optimized query:
329
+ result = (
330
+ df.filter(F.col("age") > 25) # CTE 0: WHERE clause
331
+ .select("name", "age", "salary") # CTE 1: Column selection
332
+ .withColumn("bonus", F.col("salary") * 0.1) # CTE 2: New column
333
+ .orderBy(F.desc("salary")) # CTE 3: ORDER BY
334
+ .limit(2) # CTE 4: LIMIT
335
+ ).collect() # Single query execution here
336
+
337
+ # Result:
338
+ # [Row(name='Charlie', age=35, salary=70000, bonus=7000.0),
339
+ # Row(name='Bob', age=30, salary=60000, bonus=6000.0)]
340
+
341
+ # Performance: 5-10x faster than creating 5 intermediate tables
342
+ ```
343
+
344
+ ---
345
+
346
+ ## Backend Architecture
347
+
348
+ ### Polars Backend (Default)
349
+
350
+ Sparkless uses **Polars** as the default backend, providing:
351
+
352
+ - 🧵 **Thread Safety** - Designed for parallel execution
353
+ - ⚡ **High Performance** - Optimized DataFrame operations
354
+ - 📊 **Parquet Storage** - Tables persist as Parquet files
355
+ - 🔄 **Lazy Evaluation** - Automatic query optimization
356
+
357
+ ```python
358
+ # Default backend (Polars) - thread-safe, high-performance
359
+ spark = SparkSession("MyApp")
360
+
361
+ # Explicit backend selection
362
+ spark = SparkSession.builder \
363
+ .config("spark.sparkless.backend", "polars") \
364
+ .getOrCreate()
365
+ ```
366
+
367
+ ### Alternative Backends
368
+
369
+ ```python
370
+ # Memory backend for lightweight testing
371
+ spark = SparkSession.builder \
372
+ .config("spark.sparkless.backend", "memory") \
373
+ .getOrCreate()
374
+
375
+ # File backend for persistent storage
376
+ spark = SparkSession.builder \
377
+ .config("spark.sparkless.backend", "file") \
378
+ .config("spark.sparkless.backend.basePath", "/tmp/sparkless") \
379
+ .getOrCreate()
380
+ ```
381
+
382
+ ---
383
+
384
+ ## Advanced Features
385
+
386
+ ### Table Persistence
387
+
388
+ Tables created with `saveAsTable()` can persist across multiple sessions:
389
+
390
+ ```python
391
+ # First session - create table
392
+ spark1 = SparkSession("App1", db_path="test.db")
393
+ df = spark1.createDataFrame([{"id": 1, "name": "Alice"}])
394
+ df.write.mode("overwrite").saveAsTable("schema.my_table")
395
+ spark1.stop()
396
+
397
+ # Second session - table persists
398
+ spark2 = SparkSession("App2", db_path="test.db")
399
+ assert spark2.catalog.tableExists("schema", "my_table") # โœ… True
400
+ result = spark2.table("schema.my_table").collect() # โœ… Works!
401
+ spark2.stop()
402
+ ```
403
+
404
+ **Key Features:**
405
+ - **Cross-Session Persistence**: Tables persist when using `db_path` parameter
406
+ - **Schema Discovery**: Automatically discovers existing schemas and tables
407
+ - **Catalog Synchronization**: Reliable `catalog.tableExists()` checks
408
+ - **Data Integrity**: Full support for `append` and `overwrite` modes
409
+
410
+ ### Configurable Memory & Isolation
411
+
412
+ Control memory usage and test isolation:
413
+
414
+ ```python
415
+ # Default: 1GB memory limit, no disk spillover (best for tests)
416
+ spark = SparkSession("MyApp")
417
+
418
+ # Custom memory limit
419
+ spark = SparkSession("MyApp", max_memory="4GB")
420
+
421
+ # Allow disk spillover for large datasets
422
+ spark = SparkSession(
423
+ "MyApp",
424
+ max_memory="8GB",
425
+ allow_disk_spillover=True # Uses unique temp directory per session
426
+ )
427
+ ```
428
+
429
+ ---
430
+
431
+ ## Performance Comparison
432
+
433
+ Real-world test suite improvements:
434
+
435
+ | Operation | PySpark | Sparkless | Speedup |
436
+ |-----------|---------|------------|---------|
437
+ | Session Creation | 30-45s | 0.1s | **300x** |
438
+ | Simple Query | 2-5s | 0.01s | **200x** |
439
+ | Window Functions | 5-10s | 0.05s | **100x** |
440
+ | Full Test Suite | 5-10min | 30-60s | **10x** |
441
+
442
+ ### Performance Tooling
443
+
444
+ - [Hot path profiling guide](docs/performance/profiling.md)
445
+ - [Pandas fallback vs native benchmarks](docs/performance/pandas_fallback.md)
446
+
447
+ ---
448
+
449
+
450
+
451
+ ## Recent Updates
452
+
453
+ ### Version 3.20.0 - Logic Bug Fixes & Code Quality Improvements
454
+
455
+ - ๐Ÿ› **Exception Handling Fixes** โ€“ Fixed critical exception handling issues (issue #183): replaced bare `except:` clause with `except Exception:` and added comprehensive logging to exception handlers for better debuggability.
456
+ - ๐Ÿงช **Comprehensive Test Coverage** โ€“ Added 10 comprehensive test cases for string concatenation cache handling edge cases (issue #188), covering empty strings, None values, nested operations, and numeric vs string operations.
457
+ - ๐Ÿ“š **Improved Documentation** โ€“ Enhanced documentation for string concatenation cache heuristic, documenting limitations and expected behavior vs PySpark.
458
+ - ๐Ÿ” **Code Quality Review** โ€“ Systematic review of dictionary.get() usage patterns throughout codebase, confirming all patterns are safe with appropriate default values.
459
+ - โœ… **Type Safety** โ€“ Fixed mypy errors in CI: improved type narrowing for ColumnOperation.operation and removed redundant casts in writer.py.
460
+
461
+ ### Version 3.7.0 - Full SQL DDL/DML Support
462
+
463
+ - ๐Ÿ—„๏ธ **Complete SQL DDL/DML** โ€“ Full implementation of `CREATE TABLE`, `DROP TABLE`, `INSERT INTO`, `UPDATE`, and `DELETE FROM` statements in the SQL executor.
464
+ - ๐Ÿ“ **Enhanced SQL Parser** โ€“ Comprehensive support for DDL statements with column definitions, `IF NOT EXISTS`, and `IF EXISTS` clauses.
465
+ - ๐Ÿ’พ **INSERT Operations** โ€“ Support for `INSERT INTO ... VALUES (...)` with multiple rows and `INSERT INTO ... SELECT ...` sub-queries.
466
+ - ๐Ÿ”„ **UPDATE & DELETE** โ€“ Full support for `UPDATE ... SET ... WHERE ...` and `DELETE FROM ... WHERE ...` with Python-based expression evaluation.
467
+ - ๐Ÿ› **Bug Fixes** โ€“ Fixed recursion errors in schema projection and resolved import shadowing issues in SQL executor.
468
+ - โœจ **Code Quality** โ€“ Improved linting, formatting, and type safety across the codebase.
469
+
470
+ ### Version 3.6.0 - Profiling & Adaptive Execution
471
+
472
+ - โšก **Feature-Flagged Profiling** โ€“ Introduced `sparkless.utils.profiling` with opt-in instrumentation for Polars hot paths and expression evaluation, plus a new guide at `docs/performance/profiling.md`.
473
+ - ๐Ÿ” **Adaptive Execution Simulation** โ€“ Query plans can now inject synthetic `REPARTITION` steps based on skew metrics, configurable via `QueryOptimizer.configure_adaptive_execution` and covered by new regression tests.
474
+ - ๐Ÿผ **Pandas Backend Choice** โ€“ Added an optional native pandas mode (`MOCK_SPARK_PANDAS_MODE`) with benchmarking support (`scripts/benchmark_pandas_fallback.py`) and documentation in `docs/performance/pandas_fallback.md`.
475
+
476
+ ### Version 3.5.0 - Session-Aware Catalog & Safer Fallbacks
477
+
478
+ - ๐Ÿงญ **Session-Literal Helpers** โ€“ `F.current_catalog`, `F.current_database`, `F.current_schema`, and `F.current_user` return PySpark-compatible literals and understand the active session (with new regression coverage).
479
+ - ๐Ÿ—ƒ๏ธ **Reliable Catalog Context** โ€“ The Polars backend and unified storage manager now track the selected schema so `setCurrentDatabase` works end-to-end, and `SparkContext.sparkUser()` mirrors PySpark behaviour.
480
+ - ๐Ÿงฎ **Pure-Python Stats** โ€“ Lightweight `percentile` and `covariance` helpers keep percentile/cov tests green even without NumPy, eliminating native-crash regressions.
481
+ - ๐Ÿ› ๏ธ **Dynamic Dispatch** โ€“ `F.call_function("func_name", ...)` lets wrappers dynamically invoke registered Sparkless functions with PySpark-style error messages.
482
+
483
+ ### Version 3.4.0 - Workflow & CI Refresh
484
+
485
+ - โ™ป๏ธ **Unified Commands** โ€“ `Makefile`, `install.sh`, and docs now point to `bash tests/run_all_tests.sh`, `ruff`, and `mypy` as the standard dev workflow.
486
+ - ๐Ÿ›ก๏ธ **Automated Gates** โ€“ New GitHub Actions pipeline runs linting, type-checking, and the full test suite on every push and PR.
487
+ - ๐Ÿ—บ๏ธ **Forward Roadmap** โ€“ Published `plans/typing_delta_roadmap.md` to track mypy debt reduction and Delta feature milestones.
488
+ - ๐Ÿ“ **Documentation Sweep** โ€“ README and quick-start docs highlight the 3.4.0 tooling changes and contributor expectations.
489
+
490
+ ### Version 3.3.0 - Type Hardening & Clean Type Check
491
+
492
+ - ๐Ÿงฎ **Zero mypy Debt** โ€“ `mypy sparkless` now runs clean after migrating the Polars executor,
493
+ expression evaluator, Delta merge helpers, and reader/writer stack to Python 3.9 union syntax.
494
+ - ๐Ÿงพ **Accurate DataFrame Interfaces** โ€“ `DataFrameReader.load()` and related helpers now return
495
+ `IDataFrame` consistently while keeping type-only imports behind `TYPE_CHECKING`.
496
+ - ๐Ÿงฑ **Safer Delta & Projection Fallbacks** โ€“ Python-evaluated select columns always receive string
497
+ aliases, and Delta merge alias handling no longer leaks `None` keys into evaluation contexts.
498
+ - ๐Ÿ“š **Docs & Metadata Updated** โ€“ README highlights the new type guarantees and all packaging
499
+ metadata points to v3.3.0.
500
+
501
+ ### Version 3.2.0 - Python 3.9 Baseline & Tooling Refresh
502
+
503
+ - ๐Ÿ **Python 3.9+ Required** โ€“ Packaging metadata, tooling configs, and installation docs now align on Python 3.9 as the minimum supported runtime.
504
+ - ๐Ÿงฉ **Lean Compatibility Layer** โ€“ The Python 3.8 `sitecustomize` shim has been retired; datetime helpers use native typing without runtime fallbacks.
505
+ - ๐Ÿช„ **Type Hint Modernisation** โ€“ Replaced legacy `typing.List`/`Dict` usage with built-in generics (`list`, `dict`, `tuple`) and moved iterators to `collections.abc`.
506
+ - ๐Ÿงผ **Ruff Formatting by Default** โ€“ Adopted `ruff format` across the repository, keeping style consistent with the Ruff rule set.
507
+
508
+ ### Version 3.1.0 - Type-Safe Protocols & Tooling
509
+
510
+ - โœ… **260-File Type Coverage** โ€“ DataFrame mixins now implement structural typing protocols (`SupportsDataFrameOps`), giving a clean `mypy` run across the entire project.
511
+ - ๐Ÿงน **Zero Ruff Debt** โ€“ Repository-wide linting is enabled by default; `ruff check` passes with no warnings thanks to tighter casts, imports, and configuration.
512
+ - ๐Ÿงญ **Backend Selection Docs** โ€“ Updated configuration builder and new `docs/backend_selection.md` make it trivial to toggle between Polars, Memory, File, or DuckDB backends.
513
+ - ๐Ÿงช **Delta Schema Evolution Fixes** โ€“ Polars mergeSchema appends now align frames to the on-disk schema, restoring compatibility with evolving Delta tables.
514
+ - ๐Ÿงฐ **Improved Test Harness** โ€“ `tests/run_all_tests.sh` respects virtual environments and ensures documentation examples are executed with the correct interpreter.
515
+
516
+ ### Version 3.0.0+ - Code Quality & Cleanup
517
+
518
+ **Dependency Cleanup & Type Safety:**
519
+
520
+ - ๐Ÿงน **Removed Legacy Dependencies** - Removed unused `sqlglot` dependency (legacy DuckDB/SQL backend code)
521
+ - ๐Ÿ—‘๏ธ **Code Cleanup** - Removed unused legacy SQL translation modules (`sql_translator.py`, `spark_function_mapper.py`)
522
+ - โœ… **Type Safety** - Fixed 177 type errors using `ty` type checker, improved return type annotations
523
+ - ๐Ÿ” **Linting** - Fixed all 63 ruff linting errors, codebase fully formatted
524
+ - โœ… **All Tests Passing** - Full test suite validated (641+ tests, all passing)
525
+ - ๐Ÿ“ฆ **Cleaner Dependencies** - Reduced dependency footprint, faster installation
526
+
527
+ ### Version 3.0.0 - MAJOR UPDATE
528
+
529
+ **Polars Backend Migration:**
530
+
531
+ - ๐Ÿš€ **Polars Backend** - Complete migration to Polars for thread-safe, high-performance operations
532
+ - ๐Ÿงต **Thread Safety** - Polars is thread-safe by design - no more connection locks or threading issues
533
+ - ๐Ÿ“Š **Parquet Storage** - Tables now persist as Parquet files
534
+ - โšก **Performance** - Better performance for DataFrame operations
535
+ - โœ… **All tests passing** - Full test suite validated with Polars backend
536
+ - ๐Ÿ“ฆ **Production-ready** - Stable release with improved architecture
537
+
538
+ See [Migration Guide](docs/migration_from_v2_to_v3.md) for details.
539
+
540
+ ---
541
+
542
+ ## Documentation
543
+
544
+ ### Getting Started
545
+ - ๐Ÿ“– [Installation & Setup](docs/getting_started.md)
546
+ - ๐ŸŽฏ [Quick Start Guide](docs/getting_started.md#quick-start)
547
+ - ๐Ÿ”„ [Migration from PySpark](docs/guides/migration.md)
548
+
549
+ ### Related Packages
550
+ - ๐Ÿ”ง [spark-ddl-parser](https://github.com/eddiethedean/spark-ddl-parser) - Zero-dependency PySpark DDL schema parser
551
+
552
+ ### Core Concepts
553
+ - ๐Ÿ“Š [API Reference](docs/api_reference.md)
554
+ - ๐Ÿ”„ [Lazy Evaluation](docs/guides/lazy_evaluation.md)
555
+ - ๐Ÿ—„๏ธ [SQL Operations](docs/sql_operations_guide.md)
556
+ - ๐Ÿ’พ [Storage & Persistence](docs/storage_serialization_guide.md)
557
+
558
+ ### Advanced Topics
559
+ - โš™๏ธ [Configuration](docs/guides/configuration.md)
560
+ - ๐Ÿ“ˆ [Benchmarking](docs/guides/benchmarking.md)
561
+ - ๐Ÿ”Œ [Plugins & Hooks](docs/guides/plugins.md)
562
+ - ๐Ÿ [Pytest Integration](docs/guides/pytest_integration.md)
563
+
564
+ ---
565
+
566
+ ## Development Setup
567
+
568
+ ```bash
569
+ # Install for development
570
+ git clone https://github.com/eddiethedean/sparkless.git
571
+ cd sparkless
572
+ pip install -e ".[dev]"
573
+
574
+ # Run all tests (with proper isolation)
575
+ bash tests/run_all_tests.sh
576
+
577
+ # Format code
578
+ ruff format .
579
+ ruff check . --fix
580
+
581
+ # Type checking
582
+ mypy sparkless tests
583
+
584
+ # Linting
585
+ ruff check .
586
+ ```
587
+
588
+ ---
589
+
590
+ ## Contributing
591
+
592
+ We welcome contributions! Areas of interest:
593
+
594
+ - ⚡ **Performance** - Further Polars optimizations
595
+ - 📚 **Documentation** - Examples, guides, tutorials
596
+ - 🐛 **Bug Fixes** - Edge cases and compatibility issues
597
+ - 🧪 **PySpark API Coverage** - Additional functions and methods
598
+ - 🧪 **Tests** - Additional test coverage and scenarios
599
+
600
+ ---
601
+
602
+ ## Known Limitations
603
+
604
+ While Sparkless provides comprehensive PySpark compatibility, some advanced features are planned for future releases:
605
+
606
+ - **Error Handling**: Enhanced error messages with recovery strategies
607
+ - **Performance**: Advanced query optimization, parallel execution, intelligent caching
608
+ - **Enterprise**: Schema evolution, data lineage, audit logging
609
+ - **Compatibility**: PySpark 3.6+, Iceberg support
610
+
611
+ **Want to contribute?** These are great opportunities for community contributions!
612
+
613
+ ---
614
+
615
+ ## License
616
+
617
+ MIT License - see [LICENSE](LICENSE) file for details.
618
+
619
+ ---
620
+
621
+ ## Links
622
+
623
+ - **GitHub**: [github.com/eddiethedean/sparkless](https://github.com/eddiethedean/sparkless)
624
+ - **PyPI**: [pypi.org/project/sparkless](https://pypi.org/project/sparkless/)
625
+ - **Issues**: [github.com/eddiethedean/sparkless/issues](https://github.com/eddiethedean/sparkless/issues)
626
+ - **Documentation**: [Full documentation](docs/)
627
+
628
+ ---
629
+
630
+ <div align="center">
631
+
632
+ **Built with ❤️ for the PySpark community**
633
+
634
+ *Star โญ this repo if Sparkless helps speed up your tests!*
635
+
636
+ </div>