pycanopy 0.2.0__tar.gz → 0.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. {pycanopy-0.2.0 → pycanopy-0.2.2}/.github/workflows/CI.yml +2 -1
  2. {pycanopy-0.2.0 → pycanopy-0.2.2}/Cargo.lock +3 -3
  3. {pycanopy-0.2.0 → pycanopy-0.2.2}/Cargo.toml +9 -3
  4. pycanopy-0.2.2/Makefile +60 -0
  5. pycanopy-0.2.2/PKG-INFO +349 -0
  6. pycanopy-0.2.2/README.md +322 -0
  7. {pycanopy-0.2.0 → pycanopy-0.2.2}/pyproject.toml +5 -3
  8. pycanopy-0.2.2/python/pycanopy/__init__.py +5 -0
  9. pycanopy-0.2.2/python/pycanopy/engine.py +446 -0
  10. pycanopy-0.2.2/python/pycanopy/executor.py +410 -0
  11. pycanopy-0.2.2/python/pycanopy/frame.py +85 -0
  12. pycanopy-0.2.2/python/pycanopy/lazy.py +364 -0
  13. pycanopy-0.2.2/python/pycanopy/nodes.py +128 -0
  14. pycanopy-0.2.2/python/pycanopy/optimizer.py +351 -0
  15. pycanopy-0.2.2/python/pycanopy/py.typed +0 -0
  16. pycanopy-0.2.2/src/index/brute.rs +192 -0
  17. pycanopy-0.2.2/src/index/grid.rs +299 -0
  18. {pycanopy-0.2.0 → pycanopy-0.2.2}/src/index/kdtree.rs +46 -53
  19. pycanopy-0.2.2/src/index/mod.rs +19 -0
  20. pycanopy-0.2.2/src/index/rtree.rs +133 -0
  21. pycanopy-0.2.2/src/lib.rs +953 -0
  22. {pycanopy-0.2.0 → pycanopy-0.2.2}/src/planner/cost.rs +3 -2
  23. pycanopy-0.2.2/src/query/batch.rs +170 -0
  24. {pycanopy-0.2.0 → pycanopy-0.2.2}/src/query/mod.rs +1 -0
  25. {pycanopy-0.2.0 → pycanopy-0.2.2}/src/query/nearest.rs +3 -4
  26. pycanopy-0.2.2/src/query/range.rs +244 -0
  27. pycanopy-0.2.2/src/stats/collector.rs +320 -0
  28. {pycanopy-0.2.0 → pycanopy-0.2.2}/src/stats/types.rs +5 -0
  29. pycanopy-0.2.2/tests/python/test_delta.py +105 -0
  30. {pycanopy-0.2.0 → pycanopy-0.2.2}/tests/python/test_engine.py +74 -33
  31. pycanopy-0.2.2/tests/python/test_fanout.py +156 -0
  32. pycanopy-0.2.2/tests/python/test_frame.py +195 -0
  33. {pycanopy-0.2.0 → pycanopy-0.2.2}/tests/python/test_ingestion.py +28 -12
  34. pycanopy-0.2.2/tests/python/test_joins.py +136 -0
  35. pycanopy-0.2.2/tests/rust/index_tests.rs +318 -0
  36. {pycanopy-0.2.0 → pycanopy-0.2.2}/tests/rust/planner_tests.rs +22 -35
  37. pycanopy-0.2.2/tests/rust/stats_tests.rs +85 -0
  38. pycanopy-0.2.0/Makefile +0 -23
  39. pycanopy-0.2.0/PKG-INFO +0 -155
  40. pycanopy-0.2.0/README.md +0 -129
  41. pycanopy-0.2.0/assets/pycanopy_logo2.png +0 -0
  42. pycanopy-0.2.0/python/pycanopy/__init__.py +0 -4
  43. pycanopy-0.2.0/python/pycanopy/engine.py +0 -226
  44. pycanopy-0.2.0/python/pycanopy/query.py +0 -18
  45. pycanopy-0.2.0/src/index/brute.rs +0 -152
  46. pycanopy-0.2.0/src/index/grid.rs +0 -287
  47. pycanopy-0.2.0/src/index/mod.rs +0 -35
  48. pycanopy-0.2.0/src/index/rtree.rs +0 -114
  49. pycanopy-0.2.0/src/lib.rs +0 -281
  50. pycanopy-0.2.0/src/query/range.rs +0 -119
  51. pycanopy-0.2.0/src/stats/collector.rs +0 -312
  52. pycanopy-0.2.0/tests/rust/index_tests.rs +0 -241
  53. pycanopy-0.2.0/tests/rust/stats_tests.rs +0 -76
  54. {pycanopy-0.2.0 → pycanopy-0.2.2}/.cargo/config.toml +0 -0
  55. {pycanopy-0.2.0 → pycanopy-0.2.2}/.github/workflows/release.yml +0 -0
  56. {pycanopy-0.2.0 → pycanopy-0.2.2}/.gitignore +0 -0
  57. {pycanopy-0.2.0 → pycanopy-0.2.2}/LICENSE +0 -0
  58. {pycanopy-0.2.0 → pycanopy-0.2.2}/assets/pycanopy_logo3.png +0 -0
  59. {pycanopy-0.2.0 → pycanopy-0.2.2}/rustfmt.toml +0 -0
  60. {pycanopy-0.2.0 → pycanopy-0.2.2}/src/planner/calibration.rs +0 -0
  61. {pycanopy-0.2.0 → pycanopy-0.2.2}/src/planner/mod.rs +0 -0
  62. {pycanopy-0.2.0 → pycanopy-0.2.2}/src/planner/selector.rs +0 -0
  63. {pycanopy-0.2.0 → pycanopy-0.2.2}/src/query/join.rs +0 -0
  64. {pycanopy-0.2.0 → pycanopy-0.2.2}/src/query/types.rs +0 -0
  65. {pycanopy-0.2.0 → pycanopy-0.2.2}/src/stats/mod.rs +0 -0
  66. {pycanopy-0.2.0 → pycanopy-0.2.2}/tests/rust.rs +0 -0
@@ -14,7 +14,8 @@ jobs:
14
14
  - uses: actions/checkout@v4
15
15
  - uses: dtolnay/rust-toolchain@stable
16
16
  - uses: Swatinem/rust-cache@v2
17
- - run: cargo test
17
+ - uses: taiki-e/install-action@cargo-nextest
18
+ - run: cargo nextest run
18
19
  - run: cargo fmt --check
19
20
  - run: cargo clippy -- -D warnings
20
21
 
@@ -274,9 +274,9 @@ checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981"
274
274
 
275
275
  [[package]]
276
276
  name = "log"
277
- version = "0.4.31"
277
+ version = "0.4.32"
278
278
  source = "registry+https://github.com/rust-lang/crates.io-index"
279
- checksum = "113b30b4cd05f7c06868fdb2854f66a7b9fece9a48425351cd532e810d74024f"
279
+ checksum = "953f07c43838f8e6f9758cab68bf5bed85465e7587ebe0b823f1bcd81978ad3a"
280
280
 
281
281
  [[package]]
282
282
  name = "matrixmultiply"
@@ -396,7 +396,7 @@ dependencies = [
396
396
 
397
397
  [[package]]
398
398
  name = "pycanopy"
399
- version = "0.1.0"
399
+ version = "0.2.2"
400
400
  dependencies = [
401
401
  "geo",
402
402
  "geo-index",
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "pycanopy"
3
- version = "0.1.0"
3
+ version = "0.2.2"
4
4
  edition = "2021"
5
5
  readme = "README.md"
6
6
 
@@ -19,7 +19,13 @@ ordered-float = "4"
19
19
 
20
20
  [profile.release]
21
21
  opt-level = 3
22
- lto = true
22
+ lto = "thin"
23
23
  codegen-units = 1
24
24
 
25
- [dev-dependencies]
25
+ # Local dev builds: same opt-level as release but skips LTO and uses more
26
+ # codegen units to stay within 8GB RAM. Use for make check / make build.
27
+ [profile.dev-release]
28
+ inherits = "release"
29
+ lto = false
30
+ codegen-units = 8
31
+
@@ -0,0 +1,60 @@
1
+ .DEFAULT_GOAL := check
2
+
3
+ sources = python/ tests/python/ benchmarks/
4
+
5
+ # Preserve colour in cargo output when running from a tty.
6
+ export CARGO_TERM_COLOR=$(shell (test -t 0 && echo "always") || echo "auto")
7
+
8
+ .PHONY: format ## Auto-format Rust and Python source files
9
+ format:
10
+ cargo fmt
11
+ .venv/bin/ruff check --fix $(sources)
12
+ .venv/bin/ruff format $(sources)
13
+
14
+ .PHONY: lint-python ## Lint Python source files
15
+ lint-python:
16
+ .venv/bin/ruff check $(sources)
17
+ .venv/bin/ruff format --check $(sources)
18
+
19
+ .PHONY: lint-rust ## Lint Rust source files (fmt check + clippy over all code incl. tests)
20
+ lint-rust:
21
+ cargo fmt --all -- --check
22
+ cargo clippy --tests -- -D warnings
23
+
24
+ .PHONY: lint ## Lint Rust and Python source files
25
+ lint: lint-python lint-rust
26
+
27
+ .PHONY: build ## Debug build — fast compile, use for local iteration
28
+ build:
29
+ @rm -f python/pycanopy/*.so
30
+ maturin develop
31
+
32
+ .PHONY: build-prod ## Optimised build — use for benchmarks and profiling
33
+ build-prod:
34
+ @rm -f python/pycanopy/*.so
35
+ maturin develop --release
36
+
37
+ # Build first so clippy and cargo nextest reuse compiled objects from maturin
38
+ # instead of each triggering a second Rust compile pass.
39
+ # sccache (via RUSTC_WRAPPER in .cargo/config.toml) caches across runs.
40
+ .PHONY: check ## Format, build, lint, and test — run before every commit
41
+ check: format build lint
42
+ cargo nextest run
43
+ .venv/bin/pytest tests/python/ --durations=5
44
+
45
+ .PHONY: test ## Build and run all tests without formatting or linting
46
+ test: build
47
+ cargo nextest run
48
+ .venv/bin/pytest tests/python/
49
+
50
+ .PHONY: clean ## Remove build artifacts and caches
51
+ clean:
52
+ rm -rf `find . -name __pycache__`
53
+ rm -f `find . -type f -name '*.py[co]'`
54
+ rm -rf .pytest_cache .ruff_cache
55
+ rm -f python/pycanopy/*.so
56
+
57
+ .PHONY: help ## Display this help message
58
+ help:
59
+ @grep -E '^\.PHONY: .*?## .*$$' $(MAKEFILE_LIST) | \
60
+ awk 'BEGIN {FS = ".PHONY: |## "}; {printf "\033[36m%-15s\033[0m %s\n", $$2, $$3}'
@@ -0,0 +1,349 @@
1
+ Metadata-Version: 2.4
2
+ Name: pycanopy
3
+ Version: 0.2.2
4
+ Classifier: License :: OSI Approved :: MIT License
5
+ Classifier: Programming Language :: Rust
6
+ Classifier: Programming Language :: Python :: 3
7
+ Classifier: Programming Language :: Python :: 3.9
8
+ Classifier: Programming Language :: Python :: 3.10
9
+ Classifier: Programming Language :: Python :: 3.11
10
+ Classifier: Programming Language :: Python :: 3.12
11
+ Classifier: Topic :: Scientific/Engineering :: GIS
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Intended Audience :: Science/Research
14
+ Requires-Dist: pyarrow>=12.0
15
+ Requires-Dist: numpy>=1.24
16
+ Requires-Dist: polars>=0.20
17
+ License-File: LICENSE
18
+ Summary: Declarative spatial query layer for Polars
19
+ Keywords: geospatial,spatial-index,rtree,kdtree,knn,geoarrow
20
+ Author-email: Pranav Walimbe <pranav1077@gmail.com>
21
+ License: MIT
22
+ Requires-Python: >=3.9
23
+ Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
24
+ Project-URL: Issues, https://github.com/pranavwalimbe/pycanopy/issues
25
+ Project-URL: Repository, https://github.com/pranavwalimbe/pycanopy
26
+
27
+ <p align="center">
28
+ <img src="assets/pycanopy_logo3.png" alt="PyCanopy" width="800"/>
29
+ </p>
30
+
31
+ <p align="center">
32
+ <a href="https://pypi.org/project/pycanopy/"><img src="https://img.shields.io/pypi/v/pycanopy" alt="PyPI version"/></a>
33
+ <a href="https://pypi.org/project/pycanopy/"><img src="https://img.shields.io/pypi/pyversions/pycanopy" alt="Python versions"/></a>
34
+ <a href="https://github.com/pranav-walimbe/pycanopy/actions/workflows/CI.yml"><img src="https://img.shields.io/github/actions/workflow/status/pranav-walimbe/pycanopy/CI.yml?branch=main&label=tests" alt="CI"/></a>
35
+ <a href="LICENSE"><img src="https://img.shields.io/badge/License-MIT-yellow.svg" alt="License: MIT"/></a>
36
+ </p>
37
+
38
+ <p align="center">A spatial query layer for Polars. Rust core, Python API.</p>
39
+
40
+ ---
41
+
42
+ > [!NOTE]
43
+ > Up to **155x** on range queries · up to **1,949x** on kNN · up to **1,521x** on polygon contains · up to **8,522x** on within joins · [Full benchmarks](#benchmarks)
44
+
45
+ ---
46
+
47
+ ## Installation
48
+
49
+ ```bash
50
+ pip install pycanopy
51
+ ```
52
+
53
+ > Pre-built wheels for Linux, macOS, and Windows. No Rust toolchain required.
54
+
55
+ ```python
56
+ import polars as pl
57
+ from pycanopy import SpatialFrame
58
+
59
+ sf = SpatialFrame(pl.read_parquet("cities.parquet"), x_col="lon", y_col="lat")
60
+ result = sf.lazy().filter(pl.col("population") > 100_000).range_query(-10.0, 35.0, 40.0, 70.0).collect()
61
+ ```
62
+
63
+ ---
64
+
65
+ ## Why PyCanopy
66
+
67
+ Polars has no native spatial query support. Getting bounding-box filters, k-nearest neighbours, or point-in-polygon tests typically means converting to GeoPandas, managing an index manually, or scanning every row in Python. GeoPandas applies linear scans by default for containment and range tests; its STRtree requires explicit opt-in via `.sindex` and is the only available index type regardless of data distribution. KNN has no built-in path at all.
68
+
69
+ PyCanopy adds a declarative lazy query layer directly on Polars DataFrames. You describe the operations you want, and PyCanopy decides which index to build, in what order to run each operation, and what to hand off to Polars to execute. It currently targets in-memory workloads only.
70
+
71
+ | | PyCanopy | GeoPandas | Manual STRtree |
72
+ |:----------------------------|:-------------:|:----------------:|:--------------:|
73
+ | Works natively in Polars | ✓ | ✗ | ✗ |
74
+ | Lazy / declarative API | ✓ | ✗ | ✗ |
75
+ | Auto index selection | ✓ | ✗ (STR only) | ✗ |
76
+ | KNN join built-in | ✓ | ✗ | ✗ |
77
+ | Delta buffer (live append) | ✓ | ✗ | ✗ |
78
+
79
+ - **Lazy planning** -- declare ops, the optimizer reorders and fuses them before any index is built
80
+ - **Auto index selection** -- KD-tree, R-tree, uniform grid, or brute force chosen per query
81
+ - **Native Polars** -- results are `pl.DataFrame`, no round-trip through GeoPandas
82
+ - **Rust hot paths** -- zero-copy at the Python boundary, loop-level parallelism via Rayon
83
+ - **Delta buffer** -- append new points and query immediately without rebuilding the index
84
+
85
+ ---
86
+
87
+ ## Usage
88
+
89
+ ### Point dataset: range and KNN
90
+
91
+ ```python
92
+ import polars as pl
93
+ from pycanopy import SpatialFrame
94
+
95
+ df = pl.read_parquet("cities.parquet")
96
+ sf = SpatialFrame(df, x_col="lon", y_col="lat")
97
+
98
+ # Bounding-box filter combined with a scalar predicate.
99
+ # Optimizer places the scalar filter first, then runs the range query
100
+ # on the reduced row set.
101
+ result = (
102
+ sf.lazy()
103
+ .filter(pl.col("population") > 100_000)
104
+ .range_query(min_x=-10.0, min_y=35.0, max_x=40.0, max_y=70.0)
105
+ .collect()
106
+ )
107
+
108
+ # k-nearest neighbours
109
+ nearest = sf.lazy().knn(x=2.35, y=48.85, k=5).collect()
110
+ ```
111
+
112
+ <details>
113
+ <summary>More examples -- KNN join, polygon contains, within-distance join, branching, delta buffer</summary>
114
+
115
+ ### Chaining multiple spatial predicates
116
+
117
+ ```python
118
+ # Two range predicates are fused into a single index build on large datasets.
119
+ result = (
120
+ sf.lazy()
121
+ .range_query(0.0, 0.0, 50.0, 50.0)
122
+ .range_query(10.0, 10.0, 40.0, 40.0)
123
+ .collect()
124
+ )
125
+ ```
126
+
127
+ ### Inspecting the optimizer plan
128
+
129
+ ```python
130
+ # Declare ops in any order — explain() shows what the optimizer will actually run.
131
+ lf = (
132
+ sf.lazy()
133
+ .range_query(min_x=-10.0, min_y=35.0, max_x=40.0, max_y=70.0)
134
+ .filter(pl.col("population") > 100_000)
135
+ )
136
+
137
+ print(lf.explain())
138
+ # RANGE_QUERY [(-10, 35) → (40, 70)]
139
+ # FROM
140
+ # FILTER [(col("population")) > (dyn int: 100000)]
141
+ # FROM
142
+ # DF [N=100,000; path: EXPR]
143
+
144
+ print(lf.explain(optimized=False))
145
+ # FILTER [(col("population")) > (dyn int: 100000)]
146
+ # FROM
147
+ # RANGE_QUERY [(-10, 35) → (40, 70)]
148
+ # FROM
149
+ # DF [N=100,000]
150
+ ```
151
+
152
+ The output follows Polars' FROM-chain convention: the bottom node runs first and the top node is the outermost result. In the optimized plan, FILTER appears below RANGE_QUERY — the scalar filter runs first on the raw data, and RANGE_QUERY receives the already-filtered subset. `explain(optimized=False)` shows the declaration order for comparison.
153
+
154
+ ### KNN join
155
+
156
+ ```python
157
+ query_df = pl.DataFrame({"qx": [2.35, 13.4], "qy": [48.85, 52.5]})
158
+
159
+ # For each row in query_df, find the 3 nearest rows in sf.
160
+ result = sf.lazy().knn_join(query_df, x_col="qx", y_col="qy", k=3).collect()
161
+ ```
162
+
163
+ ### Polygon dataset: contains and range
164
+
165
+ ```python
166
+ from shapely.geometry import box
167
+ from pycanopy import SpatialFrame
168
+
169
+ polygons = [box(i, 0, i + 0.9, 0.9) for i in range(100_000)]
170
+ df = pl.DataFrame({"id": list(range(100_000)), "geom": polygons})
171
+ sf = SpatialFrame.from_polygons(df, geometry_col="geom")
172
+
173
+ # Which polygons contain this point?
174
+ containing = sf.lazy().contains(x=5.5, y=0.5).collect()
175
+
176
+ # Which polygon MBRs intersect this bbox?
177
+ intersecting = sf.lazy().range_query(0.0, 0.0, 10.0, 1.0).collect()
178
+ ```
179
+
180
+ ### Polygon holes
181
+
182
+ ```python
183
+ from shapely.geometry import Polygon
184
+
185
+ # Interior rings (holes) are fully supported.
186
+ outer = [(0, 0), (10, 0), (10, 10), (0, 10)]
187
+ hole = [(2, 2), (8, 2), (8, 8), (2, 8)]
188
+ donut = Polygon(outer, [hole])
189
+
190
+ sf = SpatialFrame.from_polygons(pl.DataFrame({"id": [0], "geom": [donut]}), geometry_col="geom")
191
+
192
+ # Point inside the hole is NOT contained.
193
+ sf.lazy().contains(x=5.0, y=5.0).collect() # empty
194
+
195
+ # Point outside the hole but inside the outer ring IS contained.
196
+ sf.lazy().contains(x=1.0, y=1.0).collect() # returns the polygon row
197
+ ```
198
+
199
+ ### Within join
200
+
201
+ ```python
202
+ # For each query point, find which polygons in sf contain it.
203
+ query_df = pl.DataFrame({"qx": [5.5, 12.3], "qy": [0.5, 0.5]})
204
+ result = sf.lazy().within_join(query_df, x_col="qx", y_col="qy").collect()
205
+ ```
206
+
207
+ ### Within-distance join
208
+
209
+ ```python
210
+ # For each query point, find all sf points within 50 km.
211
+ query_df = pl.DataFrame({"qx": [2.35, 13.4], "qy": [48.85, 52.5]})
212
+ result = sf.lazy().within_distance_join(query_df, x_col="qx", y_col="qy", distance=50.0).collect()
213
+ ```
214
+
215
+ ### Branching from a shared base
216
+
217
+ ```python
218
+ from pycanopy import SpatialFrame, SpatialLazyFrame
219
+
220
+ # Expensive filter applied once; two queries branch from the result.
221
+ base = sf.lazy().filter(pl.col("population") > 100_000).range_query(-10.0, 35.0, 40.0, 70.0)
222
+
223
+ major = base.filter(pl.col("population") > 1_000_000)
224
+ minor = base.filter(pl.col("population") <= 1_000_000)
225
+
226
+ # collect_all detects the shared prefix, caches it in Polars,
227
+ # and executes both branches in a single pass.
228
+ results = SpatialLazyFrame.collect_all([major, minor])
229
+ df_major, df_minor = results
230
+ ```
231
+
232
+ ### Live updates via delta buffer
233
+
234
+ ```python
235
+ # Append new points -- visible to queries immediately, no index rebuild yet.
236
+ import numpy as np
237
+ sf.engine.append_delta(np.array([2.5]), np.array([48.9]))
238
+
239
+ # Queries probe the main index and scan the delta in parallel.
240
+ result = sf.lazy().range_query(-10.0, 35.0, 40.0, 70.0).collect()
241
+
242
+ # The buffer flushes automatically when accumulated query cost exceeds
243
+ # the estimated index rebuild cost, or when it exceeds 10% of N.
244
+ # Force a flush manually if needed.
245
+ sf.engine.flush()
246
+ ```
247
+
248
+ </details>
249
+
250
+ ---
251
+
252
+ ## Benchmarks
253
+
254
+ Benchmarks were run on an Apple M-series machine. **Warm** = cached index, second call. **Index build** = one-time cost, amortised across queries. All datasets use a uniform distribution; see the clustered note below.
255
+
256
+ ### Single-query operations
257
+
258
+ | Operation | N | Index build | Warm | Naive | Speedup | Idx mem |
259
+ |:-----------------------|--------:|------------:|--------:|--------:|-----------:|--------:|
260
+ | Range query (points) | 100,000 | 1.3 ms | 29 µs | 4.4 ms | **155x** | 783 KB |
261
+ | kNN k=10 | 100,000 | 9.3 ms | 3 µs | 5.4 ms | **1,949x** | 1.9 MB |
262
+ | Polygon contains | 100,000 | 6.2 ms | 5 µs | 7.0 ms | **1,521x** | 3.7 MB |
263
+ | Polygon range | 100,000 | 5.6 ms | 8 µs | 3.3 ms | **391x** | 3.7 MB |
264
+ | kNN join k=5 | 10,000 | 7.3 ms | 2.1 ms | 5.4 s | **2,601x** | 180 KB |
265
+ | Within-distance join | 10,000 | 0.5 ms | 12.6 ms | 1.3 s | **102x** | — |
266
+ | Within join (polygons) | 10,000 | 1.6 ms | 0.52 ms | 4.4 s | **8,522x** | 354 KB |
267
+
268
+ ### Chained lazy queries (N = 100,000, uniform)
269
+
270
+ The optimizer moves scalar filters ahead of spatial ops regardless of declared order, and fuses consecutive wide spatial predicates into one index pass.
271
+
272
+ | Chain | Optimizer action | Index build | Warm | GeoPandas | Speedup |
273
+ |:---------------------------------------------|:-----------------|------------:|--------:|----------:|---------:|
274
+ | circ\_scalar → range³ | scalar first | 2.5 ms | 0.19 ms | 9.2 ms | **50x** |
275
+ | range² → 3× scalar (spatial declared first) | scalars first | 1.0 ms | 0.23 ms | 6.0 ms | **26x** |
276
+ | range⁴ at 10% selectivity | fused | 1.0 ms | 0.92 ms | 13 ms | **14x** |
277
+ | wide\_scalar (95%) → tight\_range (1%) | scalar first | 4.1 ms | 0.30 ms | 3.1 ms | **11x** |
278
+ | circ\_scalar + diag\_scalar → kNN k=50 | scalar first | 15 ms | 1.25 ms | 3.6 ms | **3x** |
279
+
280
+ ---
281
+
282
+ ## How It Works
283
+
284
+ ### Query flow
285
+
286
+ ```
287
+ sf.lazy().filter(...).range_query(...).knn_join(...).collect()
288
+ |
289
+ +------------+------------+
290
+ | SpatialOptimizer |
291
+ | * reorder ops by cost |
292
+ | * fuse spatial preds |
293
+ | * select index type |
294
+ | * spatial join order |
295
+ +------------+------------+
296
+ |
297
+ +------------+------------+
298
+ | Polars executes |
299
+ | scalar filters first |
300
+ | then spatial queries |
301
+ +------------+------------+
302
+ |
303
+ pl.DataFrame
304
+ ```
305
+
306
+ ### Optimizer decisions
307
+
308
+ - **Predicate pushdown:** scalar predicates are placed before spatial ones and sorted cheapest-first using AST cost estimation. They cost nothing extra and shrink the row count before any index is touched.
309
+ - **Fusion:** consecutive spatial predicates on large datasets are merged into a single index build and one pass over the data.
310
+ - **Index type:** selected per query based on geometry type, data distribution, and selectivity.
311
+ - **Join order:** for symmetric joins (`within_join`, `within_distance_join`), the optimizer indexes the smaller side when it is less than half the size of the other. `knn_join` is asymmetric and always indexes the engine side.
312
+
313
+ ### Index management
314
+
315
+ Indexes are built lazily: no index is constructed at load time. Only the stats (extent, point distribution, a 32x32 histogram) are computed eagerly, and they drive index selection at the first query. The selected index is then cached for all subsequent queries.
316
+
317
+ | Condition | Index |
318
+ |:----------------------------------------------|:-------------|
319
+ | N < 500, selectivity > 50%, or k/N > 10% | Brute force |
320
+ | Point range, uniform distribution | Uniform grid |
321
+ | Point range, clustered distribution | KD-tree |
322
+ | Point KNN or contains | KD-tree |
323
+ | Polygons, any query | R-tree |
324
+
325
+ All index types share the same underlying coordinate arrays with no duplication.
326
+
327
+ ### Why Rust
328
+
329
+ The hot paths need packed immutable index structures, zero-copy array slices at the Python boundary, and loop-level parallelism. C++ would require a separate FFI layer and lose the native Polars plugin integration that PyO3/Maturin provides for free.
330
+
331
+ ---
332
+
333
+ ## Accepted input formats
334
+
335
+ | Format | Example |
336
+ |:-----------------------------------|:-------------------------------------------|
337
+ | numpy `(N, 2)` array | `np.array([[x, y], ...])` |
338
+ | GeoArrow PyArrow array | `pa.StructArray` or `FixedSizeList<2>` |
339
+ | geopandas `GeoSeries` | `gdf.geometry` |
340
+ | list of shapely Points or Polygons | `[Point(x, y), ...]` |
341
+ | list of `(x, y)` tuples | `[(x, y), ...]` |
342
+ | Separate coordinate sequences | `Engine.from_coords(xs, ys)` |
343
+
344
+ ---
345
+
346
+ ## License
347
+
348
+ MIT
349
+