pycanopy 0.2.2__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. {pycanopy-0.2.2 → pycanopy-0.3.0}/.gitignore +5 -1
  2. {pycanopy-0.2.2 → pycanopy-0.3.0}/Cargo.lock +5 -5
  3. {pycanopy-0.2.2 → pycanopy-0.3.0}/Cargo.toml +1 -1
  4. {pycanopy-0.2.2 → pycanopy-0.3.0}/Makefile +1 -1
  5. pycanopy-0.3.0/PKG-INFO +513 -0
  6. pycanopy-0.3.0/README.md +486 -0
  7. pycanopy-0.3.0/assets/pycanopy_logo3.png +0 -0
  8. pycanopy-0.3.0/assets/spatialbench_sf1_auto.png +0 -0
  9. {pycanopy-0.2.2 → pycanopy-0.3.0}/pyproject.toml +21 -3
  10. pycanopy-0.3.0/python/pycanopy/__init__.py +13 -0
  11. pycanopy-0.3.0/python/pycanopy/agg.py +155 -0
  12. pycanopy-0.3.0/python/pycanopy/engine.py +748 -0
  13. pycanopy-0.3.0/python/pycanopy/executor.py +600 -0
  14. pycanopy-0.3.0/python/pycanopy/frame.py +224 -0
  15. pycanopy-0.3.0/python/pycanopy/lazy.py +585 -0
  16. pycanopy-0.3.0/python/pycanopy/nodes.py +200 -0
  17. {pycanopy-0.2.2 → pycanopy-0.3.0}/python/pycanopy/optimizer.py +55 -38
  18. {pycanopy-0.2.2 → pycanopy-0.3.0}/src/lib.rs +625 -107
  19. pycanopy-0.3.0/src/planner/calibration.rs +28 -0
  20. {pycanopy-0.2.2 → pycanopy-0.3.0}/src/planner/cost.rs +59 -0
  21. {pycanopy-0.2.2 → pycanopy-0.3.0}/src/planner/selector.rs +114 -1
  22. pycanopy-0.3.0/src/query/batch.rs +384 -0
  23. pycanopy-0.3.0/src/query/geometry.rs +211 -0
  24. {pycanopy-0.2.2 → pycanopy-0.3.0}/src/query/mod.rs +3 -1
  25. pycanopy-0.3.0/src/query/multipoly.rs +96 -0
  26. pycanopy-0.3.0/src/query/prepared.rs +208 -0
  27. pycanopy-0.3.0/src/wkb.rs +220 -0
  28. pycanopy-0.2.2/PKG-INFO +0 -349
  29. pycanopy-0.2.2/README.md +0 -322
  30. pycanopy-0.2.2/assets/pycanopy_logo3.png +0 -0
  31. pycanopy-0.2.2/python/pycanopy/__init__.py +0 -5
  32. pycanopy-0.2.2/python/pycanopy/engine.py +0 -446
  33. pycanopy-0.2.2/python/pycanopy/executor.py +0 -410
  34. pycanopy-0.2.2/python/pycanopy/frame.py +0 -85
  35. pycanopy-0.2.2/python/pycanopy/lazy.py +0 -364
  36. pycanopy-0.2.2/python/pycanopy/nodes.py +0 -128
  37. pycanopy-0.2.2/src/planner/calibration.rs +0 -19
  38. pycanopy-0.2.2/src/query/batch.rs +0 -170
  39. pycanopy-0.2.2/src/query/join.rs +0 -1
  40. pycanopy-0.2.2/tests/python/test_delta.py +0 -105
  41. pycanopy-0.2.2/tests/python/test_engine.py +0 -293
  42. pycanopy-0.2.2/tests/python/test_fanout.py +0 -156
  43. pycanopy-0.2.2/tests/python/test_frame.py +0 -195
  44. pycanopy-0.2.2/tests/python/test_ingestion.py +0 -182
  45. pycanopy-0.2.2/tests/python/test_joins.py +0 -136
  46. pycanopy-0.2.2/tests/rust/index_tests.rs +0 -318
  47. pycanopy-0.2.2/tests/rust/planner_tests.rs +0 -168
  48. pycanopy-0.2.2/tests/rust/stats_tests.rs +0 -85
  49. pycanopy-0.2.2/tests/rust.rs +0 -8
  50. {pycanopy-0.2.2 → pycanopy-0.3.0}/.cargo/config.toml +0 -0
  51. {pycanopy-0.2.2 → pycanopy-0.3.0}/.github/workflows/CI.yml +0 -0
  52. {pycanopy-0.2.2 → pycanopy-0.3.0}/.github/workflows/release.yml +0 -0
  53. {pycanopy-0.2.2 → pycanopy-0.3.0}/LICENSE +0 -0
  54. {pycanopy-0.2.2 → pycanopy-0.3.0}/python/pycanopy/py.typed +0 -0
  55. {pycanopy-0.2.2 → pycanopy-0.3.0}/rustfmt.toml +0 -0
  56. {pycanopy-0.2.2 → pycanopy-0.3.0}/src/index/brute.rs +0 -0
  57. {pycanopy-0.2.2 → pycanopy-0.3.0}/src/index/grid.rs +0 -0
  58. {pycanopy-0.2.2 → pycanopy-0.3.0}/src/index/kdtree.rs +0 -0
  59. {pycanopy-0.2.2 → pycanopy-0.3.0}/src/index/mod.rs +0 -0
  60. {pycanopy-0.2.2 → pycanopy-0.3.0}/src/index/rtree.rs +0 -0
  61. {pycanopy-0.2.2 → pycanopy-0.3.0}/src/planner/mod.rs +0 -0
  62. {pycanopy-0.2.2 → pycanopy-0.3.0}/src/query/nearest.rs +0 -0
  63. {pycanopy-0.2.2 → pycanopy-0.3.0}/src/query/range.rs +0 -0
  64. {pycanopy-0.2.2 → pycanopy-0.3.0}/src/query/types.rs +0 -0
  65. {pycanopy-0.2.2 → pycanopy-0.3.0}/src/stats/collector.rs +0 -0
  66. {pycanopy-0.2.2 → pycanopy-0.3.0}/src/stats/mod.rs +0 -0
  67. {pycanopy-0.2.2 → pycanopy-0.3.0}/src/stats/types.rs +0 -0
@@ -40,4 +40,8 @@ env/
40
40
  # Internal development files
41
41
  CLAUDE.md
42
42
  pycanopy_logo_files/
43
- benchmarks/
43
+
44
+ # Generated benchmark outputs in assets (keep only the logo and the SpatialBench chart)
45
+ /assets/*
46
+ !/assets/pycanopy_logo3.png
47
+ !/assets/spatialbench_sf1_auto.png
@@ -396,7 +396,7 @@ dependencies = [
396
396
 
397
397
  [[package]]
398
398
  name = "pycanopy"
399
- version = "0.2.2"
399
+ version = "0.3.0"
400
400
  dependencies = [
401
401
  "geo",
402
402
  "geo-index",
@@ -566,9 +566,9 @@ dependencies = [
566
566
 
567
567
  [[package]]
568
568
  name = "smallvec"
569
- version = "1.15.1"
569
+ version = "1.15.2"
570
570
  source = "registry+https://github.com/rust-lang/crates.io-index"
571
- checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03"
571
+ checksum = "8ed6a63f02c8539c91a8685a86f4099661ba3da017932f6ebbea6de3f0fa7c90"
572
572
 
573
573
  [[package]]
574
574
  name = "spade"
@@ -590,9 +590,9 @@ checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596"
590
590
 
591
591
  [[package]]
592
592
  name = "syn"
593
- version = "2.0.117"
593
+ version = "2.0.118"
594
594
  source = "registry+https://github.com/rust-lang/crates.io-index"
595
- checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99"
595
+ checksum = "1b9ae57f904213ebb649ce6895b8a66c66f0203b9319718f69a5612a065b1422"
596
596
  dependencies = [
597
597
  "proc-macro2",
598
598
  "quote",
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "pycanopy"
3
- version = "0.2.2"
3
+ version = "0.3.0"
4
4
  edition = "2021"
5
5
  readme = "README.md"
6
6
 
@@ -1,6 +1,6 @@
1
1
  .DEFAULT_GOAL := check
2
2
 
3
- sources = python/ tests/python/ benchmarks/
3
+ sources = python/ tests/python/ bench/
4
4
 
5
5
  # Preserve colour in cargo output when running from a tty.
6
6
  export CARGO_TERM_COLOR=$(shell (test -t 0 && echo "always") || echo "auto")
@@ -0,0 +1,513 @@
1
+ Metadata-Version: 2.4
2
+ Name: pycanopy
3
+ Version: 0.3.0
4
+ Classifier: License :: OSI Approved :: MIT License
5
+ Classifier: Programming Language :: Rust
6
+ Classifier: Programming Language :: Python :: 3
7
+ Classifier: Programming Language :: Python :: 3.9
8
+ Classifier: Programming Language :: Python :: 3.10
9
+ Classifier: Programming Language :: Python :: 3.11
10
+ Classifier: Programming Language :: Python :: 3.12
11
+ Classifier: Topic :: Scientific/Engineering :: GIS
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Intended Audience :: Science/Research
14
+ Requires-Dist: pyarrow>=12.0
15
+ Requires-Dist: numpy>=1.24
16
+ Requires-Dist: polars>=0.20
17
+ License-File: LICENSE
18
+ Summary: Declarative spatial query layer for Polars
19
+ Keywords: geospatial,spatial-index,rtree,kdtree,knn,geoarrow
20
+ Author-email: Pranav Walimbe <pranav1077@gmail.com>
21
+ License: MIT
22
+ Requires-Python: >=3.9
23
+ Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
24
+ Project-URL: Issues, https://github.com/pranav-walimbe/PyCanopy/issues
25
+ Project-URL: Repository, https://github.com/pranav-walimbe/PyCanopy
26
+
27
+ <p align="center">
28
+ <img src="https://raw.githubusercontent.com/pranav-walimbe/PyCanopy/main/assets/pycanopy_logo3.png" alt="PyCanopy" width="800"/>
29
+ </p>
30
+
31
+ <p align="center">
32
+ <a href="https://pypi.org/project/pycanopy/"><img src="https://img.shields.io/pypi/v/pycanopy" alt="PyPI version"/></a>
33
+ <a href="https://pypi.org/project/pycanopy/"><img src="https://img.shields.io/pypi/pyversions/pycanopy" alt="Python versions"/></a>
34
+ <a href="https://github.com/pranav-walimbe/pycanopy/actions/workflows/CI.yml"><img src="https://img.shields.io/github/actions/workflow/status/pranav-walimbe/pycanopy/CI.yml?branch=main&label=tests" alt="CI"/></a>
35
+ <a href="LICENSE"><img src="https://img.shields.io/badge/License-MIT-yellow.svg" alt="License: MIT"/></a>
36
+ </p>
37
+
38
+ <p align="center">A spatial query layer for Polars. Rust core, Python API.</p>
39
+
40
+ ---
41
+
42
+ ## State of the art on Apache SpatialBench
43
+
44
+ PyCanopy reaches state of the art on [Apache SpatialBench](https://sedona.apache.org/spatialbench/single-node-benchmarks/), the standard single-node spatial-analytics benchmark whose 12 queries span range filters, distance and kNN joins, and point-in-polygon aggregation over millions of trips and zones. On matched hardware it beats the best open-source engines like Apache SedonaDB and DuckDB on most queries, without leaving Polars.
45
+
46
+ <p align="center">
47
+ <img src="assets/spatialbench_sf1_auto.png" alt="PyCanopy vs SedonaDB, DuckDB, and GeoPandas on Apache SpatialBench SF1" width="100%"/>
48
+ </p>
49
+
50
+ <p align="center"><sub>Apache SpatialBench SF1 · log scale, lower is better · missing bars are TIMEOUT / ERROR</sub></p>
51
+
52
+ > [!NOTE]
53
+ > Versus GeoPandas microbenchmarks: up to **199×** on range queries · **1,024×** on kNN · **931×** on polygon contains · **3,307×** on within joins · [Full benchmarks](#benchmarks)
54
+
55
+ ---
56
+
57
+ ## Installation
58
+
59
+ ```bash
60
+ pip install pycanopy
61
+ ```
62
+
63
+ > Pre-built wheels for Linux, macOS, and Windows. No Rust toolchain required.
64
+
65
+ ```python
66
+ import polars as pl
67
+ from pycanopy import SpatialFrame
68
+
69
+ sf = SpatialFrame(pl.read_parquet("cities.parquet"), x_col="lon", y_col="lat")
70
+ result = sf.lazy().filter(pl.col("population") > 100_000).range_query(-10.0, 35.0, 40.0, 70.0).collect()
71
+ ```
72
+
73
+ ---
74
+
75
+ ## Why PyCanopy
76
+
77
+ Every spatial option for a Polars user asks you to give something up:
78
+
79
+ - **GeoPandas** is eager and pandas-based. Its one index (STRtree) is opt-in, and a join larger than memory simply fails.
80
+ - **DuckDB spatial** is fast and out-of-core, but you leave Polars for SQL and create the R-tree index by hand.
81
+ - **SedonaDB** is a capable spatial engine, but it is a separate SQL engine rather than a Polars-native API.
82
+
83
+ PyCanopy's principle is to stay inside Polars and add a real query planner. You declare spatial ops in any order. The planner reorders them, fuses adjacent predicates, pushes projections into joins, and uses a cost model to decide per query whether to build an index at all (and, if so, which kind). kNN and within-distance joins are first-class, and results larger than RAM stream and spill to disk.
84
+
85
+ How the options compare:
86
+
87
+ | | PyCanopy | GeoPandas | DuckDB spatial | SedonaDB | GeoPolars |
88
+ |:-----------------------------------------|:--------:|:-----------:|:--------------:|:--------:|:---------:|
89
+ | Runs inside Polars (no SQL, no convert) | ✓ | ✗ | ✗ (SQL) | ✗ (SQL) | ✓ |
90
+ | Lazy, declarative API | ✓ | ✗ (eager) | SQL | SQL | ✓ |
91
+ | Automatic index, no manual setup | ✓ | ✗ (manual) | ✗ (manual) | ✓ | ✗ |
92
+ | Cost-based index vs scan, per query | ✓ | ✗ | ✗ | ✗ | ✗ |
93
+ | kNN join built in | ✓ | ✓ (nearest) | ✗ | ✓ | ✗ |
94
+ | Within-distance / point-in-polygon join | ✓ | ✓ | ✓ | ✓ | ✗ |
95
+ | Larger-than-RAM joins | ✓ | ✗ | ✓ | ✓ | ✗ |
96
+
97
+ ---
98
+
99
+ ## Operations
100
+
101
+ **Point datasets**
102
+
103
+ | Operation | Call | Returns |
104
+ |:-----------------------|:------------------------------------------------------|:-------------------------------------------------|
105
+ | Range query | `.range_query(min_x, min_y, max_x, max_y)` | Rows inside the bounding box |
106
+ | k-nearest neighbours | `.knn(x, y, k)` | The `k` rows nearest a point |
107
+ | kNN join | `.knn_join(df, x_col, y_col, k)` | The `k` nearest rows for each query point |
108
+ | Within-distance join | `.within_distance_join(df, x_col, y_col, distance)` | Rows within `distance` of each query point |
109
+ | Convex-hull area | `SpatialFrame.convex_hull_area(xs, ys)` | Area of the convex hull of a point set |
110
+
111
+ **Polygon datasets**
112
+
113
+ | Operation | Call | Returns |
114
+ |:------------------------------|:-------------------------------------------------------------|:--------------------------------------------------------|
115
+ | Point in polygon | `.contains(x, y)` | Polygons that contain the point |
116
+ | MBR range | `.range_query(min_x, min_y, max_x, max_y)` | Polygons whose bounding box meets the query box |
117
+ | Within join | `.within_join(df, x_col, y_col)` | Polygons that contain each query point |
118
+ | Point-to-polygon distance join | `.polygon_within_distance_join(df, x_col, y_col, distance)` | Polygons within `distance` of each query point |
119
+ | Point-to-polygon kNN join | `.polygon_knn_join(df, x_col, y_col, k)` | The `k` nearest polygons for each query point |
120
+ | Intersects self-join | `.intersects_pairs()` | Intersecting polygon pairs with overlap area and IoU |
121
+ | Area | `.polygon_areas()` | Area of each polygon |
122
+ | Points near a polygon | `.points_within_distance_of_polygon(polygon, distance)` | Points within `distance` of a single polygon |
123
+
124
+ **Reductions and streaming** (compose with any join)
125
+
126
+ | Operation | Call | Returns |
127
+ |:-----------------------|:-----------------------------------------------------------|:-------------------------------------------------------------|
128
+ | Aggregate-join | `.group_by(keys).agg(pc.agg.count/sum/mean/min/max(...))` | One row per group, reduced over the join with no pair frame |
129
+ | Projection pushdown | `.select(cols)` | Narrows both join sides before the gather |
130
+ | Stream in batches | `.collect_batched()` | An iterator of result morsels, bounded memory |
131
+ | Stream to Parquet | `.sink_parquet(path)` | Writes the result to disk in bounded memory |
132
+ | Out-of-core pipeline | `.lazy_source()` | A Polars source that fuses join + sort + sink, spilling to disk |
133
+
134
+ ---
135
+
136
+ ## Usage
137
+
138
+ ### Point dataset: range and KNN
139
+
140
+ ```python
141
+ import polars as pl
142
+ from pycanopy import SpatialFrame
143
+
144
+ df = pl.read_parquet("cities.parquet")
145
+ sf = SpatialFrame(df, x_col="lon", y_col="lat")
146
+
147
+ # Bounding-box filter combined with a scalar predicate.
148
+ # Optimizer places the scalar filter first, then runs the range query
149
+ # on the reduced row set.
150
+ result = (
151
+ sf.lazy()
152
+ .filter(pl.col("population") > 100_000)
153
+ .range_query(min_x=-10.0, min_y=35.0, max_x=40.0, max_y=70.0)
154
+ .collect()
155
+ )
156
+
157
+ # k-nearest neighbours
158
+ nearest = sf.lazy().knn(x=2.35, y=48.85, k=5).collect()
159
+ ```
160
+
161
+ ### Inspecting the plan
162
+
163
+ ```python
164
+ # Declare ops in any order. explain() shows what the optimizer will actually run.
165
+ lf = (
166
+ sf.lazy()
167
+ .range_query(min_x=-10.0, min_y=35.0, max_x=40.0, max_y=70.0)
168
+ .filter(pl.col("population") > 100_000)
169
+ )
170
+
171
+ print(lf.explain())
172
+ # RANGE_QUERY [(-10, 35) → (40, 70)]
173
+ # FROM
174
+ # FILTER [(col("population")) > (dyn int: 100000)]
175
+ # FROM
176
+ # DF [N=100,000; path: EXPR]
177
+ ```
178
+
179
+ The optimizer flipped the declaration order. The scalar filter runs first on all rows, then the spatial query runs on the smaller survivor set. Plans follow Polars' FROM-chain convention, so the bottom runs first and the top is the final result.
180
+
181
+ ### Aggregate over a join
182
+
183
+ ```python
184
+ import pycanopy as pc
185
+
186
+ # Count trips per zone and average their fare, reduced over a streamed
187
+ # point-in-polygon join. The full pair frame is never materialised: each
188
+ # morsel reduces to per-group partials that combine into the final result.
189
+ stats = (
190
+ zones.lazy()
191
+ .within_join(trips, x_col="lon", y_col="lat")
192
+ .group_by(["zone_id", "zone_name"])
193
+ .agg(trip_count=pc.agg.count(), avg_fare=pc.agg.mean("fare"))
194
+ )
195
+ ```
196
+
197
+ ### Out-of-core joins (larger than RAM)
198
+
199
+ ```python
200
+ # A join whose result exceeds memory: stream it straight to Parquet,
201
+ # bounded to one morsel at a time.
202
+ sf.lazy().polygon_knn_join(trips, "lon", "lat", k=5).sink_parquet("nearest.parquet")
203
+
204
+ # Or fuse the join with a sort and sink into a single spilling Polars
205
+ # pipeline, so even an ordered result larger than RAM never materialises.
206
+ (
207
+ sf.lazy()
208
+ .polygon_knn_join(trips, "lon", "lat", k=5)
209
+ .select(["trip_id", "building_id", "distance_to_polygon"])
210
+ .lazy_source()
211
+ .sort("distance_to_polygon")
212
+ .sink_parquet("nearest_sorted.parquet")
213
+ )
214
+ ```
215
+
216
+ <details>
217
+ <summary>More examples: point and polygon joins, aggregations, branching, delta buffer, index modes</summary>
218
+
219
+ ### Chaining multiple spatial predicates
220
+
221
+ ```python
222
+ # Two range predicates are fused into a single index build on large datasets.
223
+ result = (
224
+ sf.lazy()
225
+ .range_query(0.0, 0.0, 50.0, 50.0)
226
+ .range_query(10.0, 10.0, 40.0, 40.0)
227
+ .collect()
228
+ )
229
+ ```
230
+
231
+ ### KNN join
232
+
233
+ ```python
234
+ query_df = pl.DataFrame({"qx": [2.35, 13.4], "qy": [48.85, 52.5]})
235
+
236
+ # For each row in query_df, find the 3 nearest rows in sf.
237
+ result = sf.lazy().knn_join(query_df, x_col="qx", y_col="qy", k=3).collect()
238
+ ```
239
+
240
+ ### Polygon dataset: contains and range
241
+
242
+ ```python
243
+ from shapely.geometry import box
244
+ from pycanopy import SpatialFrame
245
+
246
+ polygons = [box(i, 0, i + 0.9, 0.9) for i in range(100_000)]
247
+ df = pl.DataFrame({"id": list(range(100_000)), "geom": polygons})
248
+ sf = SpatialFrame.from_polygons(df, geometry_col="geom")
249
+
250
+ # Which polygons contain this point?
251
+ containing = sf.lazy().contains(x=5.5, y=0.5).collect()
252
+
253
+ # Which polygon MBRs intersect this bbox?
254
+ intersecting = sf.lazy().range_query(0.0, 0.0, 10.0, 1.0).collect()
255
+ ```
256
+
257
+ ### Polygon holes
258
+
259
+ ```python
260
+ from shapely.geometry import Polygon
261
+
262
+ # Interior rings (holes) are fully supported.
263
+ outer = [(0, 0), (10, 0), (10, 10), (0, 10)]
264
+ hole = [(2, 2), (8, 2), (8, 8), (2, 8)]
265
+ donut = Polygon(outer, [hole])
266
+
267
+ sf = SpatialFrame.from_polygons(pl.DataFrame({"id": [0], "geom": [donut]}), geometry_col="geom")
268
+
269
+ # Point inside the hole is NOT contained.
270
+ sf.lazy().contains(x=5.0, y=5.0).collect() # empty
271
+
272
+ # Point outside the hole but inside the outer ring IS contained.
273
+ sf.lazy().contains(x=1.0, y=1.0).collect() # returns the polygon row
274
+ ```
275
+
276
+ ### Within join
277
+
278
+ ```python
279
+ # For each query point, find which polygons in sf contain it.
280
+ query_df = pl.DataFrame({"qx": [5.5, 12.3], "qy": [0.5, 0.5]})
281
+ result = sf.lazy().within_join(query_df, x_col="qx", y_col="qy").collect()
282
+ ```
283
+
284
+ ### Within-distance join
285
+
286
+ ```python
287
+ # For each query point, find all sf points within 50 km.
288
+ query_df = pl.DataFrame({"qx": [2.35, 13.4], "qy": [48.85, 52.5]})
289
+ result = sf.lazy().within_distance_join(query_df, x_col="qx", y_col="qy", distance=50.0).collect()
290
+ ```
291
+
292
+ ### Point-to-polygon joins
293
+
294
+ ```python
295
+ # (polygon SpatialFrame) For each query point, the polygons within a distance
296
+ # of it. Distance is to the polygon boundary, and zero when the point is inside.
297
+ query_df = pl.DataFrame({"qx": [5.5, 12.3], "qy": [0.5, 0.5]})
298
+ near = sf.lazy().polygon_within_distance_join(query_df, x_col="qx", y_col="qy", distance=2.0).collect()
299
+
300
+ # For each query point, its k nearest polygons (adds a distance_to_polygon column).
301
+ nearest = sf.lazy().polygon_knn_join(query_df, x_col="qx", y_col="qy", k=3).collect()
302
+ ```
303
+
304
+ ### Polygon aggregations
305
+
306
+ ```python
307
+ # Area of every polygon (appends an 'area' column).
308
+ areas = sf.polygon_areas()
309
+
310
+ # All intersecting polygon pairs, with overlap area and IoU.
311
+ overlaps = sf.intersects_pairs()
312
+
313
+ # (point SpatialFrame) rows whose point lies within a distance of one polygon.
314
+ from shapely.geometry import box
315
+ pts = point_sf.points_within_distance_of_polygon(box(0.0, 0.0, 1.0, 1.0), distance=0.5)
316
+ ```
317
+
318
+ ### Convex-hull area
319
+
320
+ ```python
321
+ import numpy as np
322
+
323
+ # Area of the convex hull of a standalone point set (no frame needed).
324
+ area = SpatialFrame.convex_hull_area(np.array([0.0, 1.0, 0.5]), np.array([0.0, 0.0, 1.0]))
325
+ ```
326
+
327
+ ### Index mode
328
+
329
+ ```python
330
+ # Fixed per frame. "auto" lets the cost model choose index vs scan per query;
331
+ # "none" always scans; "eager" (default) always builds the selected index.
332
+ sf = SpatialFrame(df, x_col="lon", y_col="lat", index_mode="auto")
333
+ ```
334
+
335
+ ### Branching from a shared base
336
+
337
+ ```python
338
+ from pycanopy import SpatialFrame, SpatialLazyFrame
339
+
340
+ # Expensive filter applied once; two queries branch from the result.
341
+ base = sf.lazy().filter(pl.col("population") > 100_000).range_query(-10.0, 35.0, 40.0, 70.0)
342
+
343
+ major = base.filter(pl.col("population") > 1_000_000)
344
+ minor = base.filter(pl.col("population") <= 1_000_000)
345
+
346
+ # collect_all detects the shared prefix, caches it in Polars,
347
+ # and executes both branches in a single pass.
348
+ results = SpatialLazyFrame.collect_all([major, minor])
349
+ df_major, df_minor = results
350
+ ```
351
+
352
+ ### Live updates via delta buffer
353
+
354
+ ```python
355
+ # Append new points -- visible to queries immediately, no index rebuild yet.
356
+ import numpy as np
357
+ sf.engine.append_delta(np.array([2.5]), np.array([48.9]))
358
+
359
+ # Queries probe the main index and scan the delta in parallel.
360
+ result = sf.lazy().range_query(-10.0, 35.0, 40.0, 70.0).collect()
361
+
362
+ # The buffer flushes automatically when accumulated query cost exceeds
363
+ # the estimated index rebuild cost, or when the buffer grows past 10% of N.
364
+ # Force a flush manually if needed.
365
+ sf.engine.flush()
366
+ ```
367
+
368
+ </details>
369
+
370
+ ---
371
+
372
+ ## Benchmarks
373
+
374
+ ### Apache SpatialBench
375
+
376
+ Run on a single `m7i.2xlarge` (8 vCPU, 32 GB), the same instance as the published [SedonaDB / DuckDB / GeoPandas numbers](https://sedona.apache.org/spatialbench/single-node-benchmarks/).
377
+
378
+ **SF1** (~6M trips). PyCanopy beats SedonaDB on 11 of 12 queries and wins the heavy cross-zone joins q10/q11/q12 by 2 to 4x.
379
+
380
+ <p align="center">
381
+ <img src="assets/spatialbench_sf1_auto.png" alt="PyCanopy vs SedonaDB, DuckDB, and GeoPandas on Apache SpatialBench SF1" width="100%"/>
382
+ </p>
383
+ <p align="center"><sub>Apache SpatialBench SF1 · log scale, lower is better · missing bars are TIMEOUT / ERROR</sub></p>
384
+
385
+ **SF10** (~60M trips). PyCanopy wins 8 of 12. q12 returns a result larger than the machine's 32 GB of RAM, so it streams the join and spills the sort to disk, completing where DuckDB errors and GeoPandas times out.
386
+
387
+ <p align="center">
388
+ <img src="assets/spatialbench_sf10_auto.png" alt="PyCanopy vs SedonaDB, DuckDB, and GeoPandas on Apache SpatialBench SF10" width="100%"/>
389
+ </p>
390
+ <p align="center"><sub>Apache SpatialBench SF10 · log scale, lower is better · missing bars are TIMEOUT / ERROR</sub></p>
391
+
392
+ ### Per-operation vs GeoPandas
393
+
394
+ Measured on Apple M-series hardware. **Cold** = fresh engine, index build included. **Warm** = cached index, second call. **GeoPandas** is the naive baseline (no spatial index). **Speedup** = GeoPandas time ÷ warm time. Uniform random data.
395
+
396
+ | Operation | N | Cold | Warm | GeoPandas | Speedup |
397
+ |:-----------------------------------|--------:|--------:|--------:|----------:|----------:|
398
+ | Range query (points) | 100,000 | 2.6 ms | 28 µs | 5.6 ms | **199×** |
399
+ | kNN k=10 | 100,000 | 9.9 ms | 7 µs | 7.3 ms | **1,024×** |
400
+ | Contains (polygons) | 100,000 | 6.1 ms | 6 µs | 5.4 ms | **931×** |
401
+ | Range (polygons) | 100,000 | 6.1 ms | 9 µs | 4.4 ms | **503×** |
402
+ | kNN join k=5 | 10,000 | 10.4 ms | 2.2 ms | 5.5 s | **2,463×** |
403
+ | Within-distance join | 10,000 | 14.1 ms | 13.6 ms | 3.5 s | **260×** |
404
+ | Within join (polygons) | 5,000 | 2.8 ms | 0.37 ms | 1.2 s | **3,307×** |
405
+ | Point→polygon kNN join k=5 | 5,000 | 6.7 ms | 5.7 ms | 6.1 s | **1,076×** |
406
+ | Point→polygon within-distance join | 5,000 | 6.6 ms | 6.4 ms | 5.4 s | **845×** |
407
+ | Intersects self-join | 5,000 | 2.2 ms | 1.1 ms | 0.86 s | **796×** |
408
+
409
+ ---
410
+
411
+ ## How It Works
412
+
413
+ PyCanopy plans a query in two layers, then hands the result to Polars to run.
414
+
415
+ ### Query flow
416
+
417
+ ```
418
+ sf.lazy().filter(...).range_query(...).knn_join(...).collect()
419
+ |
420
+ +---------------+----------------+
421
+ | Logical plan (whole chain) |
422
+ | order ops, fuse predicates, |
423
+ | pick join side, EXPR vs IO |
424
+ +---------------+----------------+
425
+ |
426
+ +---------------+----------------+
427
+ | Access path (per operation) |
428
+ | index or scan, and which |
429
+ | kind: a cost model decides |
430
+ +---------------+----------------+
431
+ |
432
+ +---------------+----------------+
433
+ | Polars runs the emitted ops |
434
+ +---------------+----------------+
435
+ |
436
+ pl.DataFrame
437
+ ```
438
+
439
+ ### Logical planning
440
+
441
+ Decisions about the whole chain, made before any data is touched:
442
+
443
+ - **Predicate pushdown:** scalar filters run before spatial ops, cheapest first (cost estimated from the Polars expression tree). They shrink the row count at little cost.
444
+ - **Fusion:** consecutive spatial predicates merge into one index build and one pass.
445
+ - **Join side:** symmetric joins (`within_join`, `within_distance_join`) index the smaller side. `knn_join` always indexes the engine side.
446
+ - **Projection pushdown:** a terminal `.select()` is pushed into the join, so only the requested columns are gathered from each side instead of the full width.
447
+ - **Execution path:** very selective filters slice the prebuilt index directly (IO path). Otherwise filters run first and a small index is built on the survivors (EXPR path).
448
+
449
+ ### Cost model: index or scan?
450
+
451
+ Building an index costs about `N log N`, so an index only pays off if it is queried enough times. For each operation the planner compares two estimates (`N` is the dataset size, `Q` the number of query points):
452
+
453
+ ```
454
+ scan = Q * N every row, for every query point
455
+ index = N log N (build once) + Q * log N (probe per query point)
456
+ ```
457
+
458
+ Building wins once `Q` passes roughly `log N`. A one-off lookup scans; a join with many probes builds the index and reuses it. Selectivity refines this: if a predicate keeps most rows, the planner skips the index, since a tree that prunes nothing loses to a plain scan.
459
+
460
+ `index_mode`, set per frame, picks how the estimate is used:
461
+
462
+ - **`eager`** (default): always build the selected index.
463
+ - **`auto`**: build only when the estimate beats a scan for this `Q`.
464
+ - **`none`**: always scan.
465
+
466
+ ### Index management
467
+
468
+ Indexes build lazily, never at load time. Dataset stats (extent, distribution, a 32x32 histogram) are computed once up front and drive the first query's choice, after which the index is cached for all later queries. The kind of index (including the brute-force fallback) is chosen as follows:
469
+
470
+ | Condition | Index |
471
+ |:----------------------------------------------|:-------------|
472
+ | N < 500, selectivity > 50%, or k/N > 10% | Brute force |
473
+ | Point range, uniform distribution | Uniform grid |
474
+ | Point range, clustered distribution | KD-tree |
475
+ | Point KNN or contains | KD-tree |
476
+ | Polygons, any query | R-tree |
477
+
478
+ All index types share the same coordinate arrays with no duplication.
479
+
480
+ ### Streaming and out-of-core
481
+
482
+ A join never has to fit in memory. The probe side is sliced into fixed-size morsels run one at a time, so the join intermediate stays bounded:
483
+
484
+ - `collect()` auto-streams a large probe, bounding the transient join intermediate.
485
+ - `collect_batched()` and `sink_parquet()` bound the full output (to an iterator, or straight to a Parquet file).
486
+ - `lazy_source()` exposes the streamed join as a native Polars source, fusing it with a downstream `sort` and `sink_parquet` into one pipeline that spills to disk, so an ordered result larger than RAM still completes.
487
+ - `group_by(keys).agg(...)` reduces each morsel to associative partials that combine into the per-group result, so the join is never materialised at all.
488
+
489
+ ### Why Rust
490
+
491
+ The hot paths need packed immutable index structures, zero-copy array slices at the Python boundary, and loop-level parallelism. C++ would require a separate FFI layer and would lose the native Polars plugin integration that PyO3/Maturin provides for free.
492
+
493
+ ---
494
+
495
+ ## Accepted input formats
496
+
497
+ | Format | Example |
498
+ |:-----------------------------------|:-------------------------------------------|
499
+ | numpy `(N, 2)` array | `np.array([[x, y], ...])` |
500
+ | GeoArrow PyArrow array | `pa.StructArray` or `FixedSizeList<2>` |
501
+ | geopandas `GeoSeries` | `gdf.geometry` |
502
+ | shapely Points / Polygons / MultiPolygons | `[Point(x, y), ...]` |
503
+ | list of `(x, y)` tuples | `[(x, y), ...]` |
504
+ | Separate coordinate sequences | `Engine.from_coords(xs, ys)` |
505
+ | WKB point column (Binary) | `SpatialFrame.from_wkb_points(df, "geom")` |
506
+ | WKB polygon column (Binary) | `SpatialFrame.from_wkb_polygons(df, "geom")` |
507
+
508
+ ---
509
+
510
+ ## License
511
+
512
+ MIT
513
+