sqlrite 0.1.11__tar.gz → 0.1.13__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sqlrite-0.1.11 → sqlrite-0.1.13}/Cargo.lock +5 -5
- {sqlrite-0.1.11 → sqlrite-0.1.13}/Cargo.toml +1 -1
- {sqlrite-0.1.11 → sqlrite-0.1.13}/PKG-INFO +1 -1
- {sqlrite-0.1.11 → sqlrite-0.1.13}/desktop/package.json +1 -1
- {sqlrite-0.1.11 → sqlrite-0.1.13}/docs/phase-7-plan.md +16 -4
- {sqlrite-0.1.11 → sqlrite-0.1.13}/docs/roadmap.md +3 -3
- {sqlrite-0.1.11 → sqlrite-0.1.13}/pyproject.toml +1 -1
- {sqlrite-0.1.11 → sqlrite-0.1.13}/sdk/python/Cargo.toml +1 -1
- {sqlrite-0.1.11 → sqlrite-0.1.13}/src/sql/executor.rs +362 -7
- sqlrite-0.1.13/src/sql/hnsw.rs +790 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/src/sql/mod.rs +1 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/.github/workflows/ci.yml +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/.github/workflows/release-pr.yml +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/.github/workflows/release.yml +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/.github/workflows/rust.yml +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/.gitignore +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/CODE_OF_CONDUCT.md +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/LICENSE +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/MAINTAINERS +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/Makefile +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/README.md +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/desktop/index.html +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/desktop/package-lock.json +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/desktop/src/App.svelte +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/desktop/src/app.css +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/desktop/src/main.ts +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/desktop/src/vite-env.d.ts +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/desktop/svelte.config.js +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/desktop/tsconfig.json +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/desktop/vite.config.ts +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/docs/_index.md +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/docs/architecture.md +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/docs/design-decisions.md +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/docs/desktop.md +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/docs/embedding.md +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/docs/file-format.md +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/docs/getting-started.md +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/docs/pager.md +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/docs/release-plan.md +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/docs/release-secrets.md +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/docs/smoke-test.md +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/docs/sql-engine.md +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/docs/storage-model.md +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/docs/supported-sql.md +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/docs/usage.md +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/examples/README.md +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/examples/c/Makefile +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/examples/c/hello.c +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/examples/go/go.mod +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/examples/go/hello.go +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/examples/nodejs/hello.mjs +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/examples/python/hello.py +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/examples/rust/quickstart.rs +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/examples/wasm/Makefile +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/examples/wasm/index.html +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/images/SQLRite - Desktop.png +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/images/SQLRite Data Structures.png +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/images/SQLRite Simple SQL Execution High Level Diagram.png +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/images/SQLRite Simple SQL INSERT Execution High Level Diagram (Insert Row).png +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/images/SQLRite Simple SQL INSERT Execution High Level Diagram.png +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/images/SQLRite_logo.png +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/images/architecture.png +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/rust-toolchain.toml +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/samples/AST.delete.example +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/samples/AST.insert.exemple +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/samples/AST.select.example +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/samples/AST.update.example +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/samples/CREATE TABLE sqlrite_schema.sql +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/samples/CREATE_TABLE with duplicate.sql +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/samples/CREATE_TABLE.sql +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/samples/INSERT.sql +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/scripts/bump-version.sh +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/sdk/go/README.md +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/sdk/go/conn.go +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/sdk/go/go.mod +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/sdk/go/rows.go +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/sdk/go/sqlrite.go +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/sdk/go/sqlrite_test.go +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/sdk/go/stmt.go +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/sdk/python/README.md +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/sdk/python/src/lib.rs +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/sdk/python/tests/test_sqlrite.py +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/src/connection.rs +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/src/error.rs +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/src/lib.rs +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/src/main.rs +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/src/meta_command/mod.rs +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/src/repl/mod.rs +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/src/sql/db/database.rs +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/src/sql/db/mod.rs +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/src/sql/db/secondary_index.rs +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/src/sql/db/table.rs +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/src/sql/pager/cell.rs +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/src/sql/pager/file.rs +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/src/sql/pager/header.rs +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/src/sql/pager/index_cell.rs +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/src/sql/pager/interior_page.rs +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/src/sql/pager/mod.rs +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/src/sql/pager/overflow.rs +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/src/sql/pager/page.rs +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/src/sql/pager/pager.rs +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/src/sql/pager/table_page.rs +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/src/sql/pager/varint.rs +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/src/sql/pager/wal.rs +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/src/sql/parser/create.rs +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/src/sql/parser/insert.rs +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/src/sql/parser/mod.rs +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/src/sql/parser/select.rs +0 -0
- {sqlrite-0.1.11 → sqlrite-0.1.13}/src/sql/tokenizer.rs +0 -0
|
@@ -3736,7 +3736,7 @@ dependencies = [
|
|
|
3736
3736
|
|
|
3737
3737
|
[[package]]
|
|
3738
3738
|
name = "sqlrite-desktop"
|
|
3739
|
-
version = "0.1.
|
|
3739
|
+
version = "0.1.13"
|
|
3740
3740
|
dependencies = [
|
|
3741
3741
|
"serde",
|
|
3742
3742
|
"serde_json",
|
|
@@ -3748,7 +3748,7 @@ dependencies = [
|
|
|
3748
3748
|
|
|
3749
3749
|
[[package]]
|
|
3750
3750
|
name = "sqlrite-engine"
|
|
3751
|
-
version = "0.1.
|
|
3751
|
+
version = "0.1.13"
|
|
3752
3752
|
dependencies = [
|
|
3753
3753
|
"clap",
|
|
3754
3754
|
"env_logger",
|
|
@@ -3763,7 +3763,7 @@ dependencies = [
|
|
|
3763
3763
|
|
|
3764
3764
|
[[package]]
|
|
3765
3765
|
name = "sqlrite-ffi"
|
|
3766
|
-
version = "0.1.
|
|
3766
|
+
version = "0.1.13"
|
|
3767
3767
|
dependencies = [
|
|
3768
3768
|
"cbindgen",
|
|
3769
3769
|
"sqlrite-engine",
|
|
@@ -3771,7 +3771,7 @@ dependencies = [
|
|
|
3771
3771
|
|
|
3772
3772
|
[[package]]
|
|
3773
3773
|
name = "sqlrite-nodejs"
|
|
3774
|
-
version = "0.1.
|
|
3774
|
+
version = "0.1.13"
|
|
3775
3775
|
dependencies = [
|
|
3776
3776
|
"napi",
|
|
3777
3777
|
"napi-build",
|
|
@@ -3781,7 +3781,7 @@ dependencies = [
|
|
|
3781
3781
|
|
|
3782
3782
|
[[package]]
|
|
3783
3783
|
name = "sqlrite-python"
|
|
3784
|
-
version = "0.1.
|
|
3784
|
+
version = "0.1.13"
|
|
3785
3785
|
dependencies = [
|
|
3786
3786
|
"pyo3",
|
|
3787
3787
|
"sqlrite-engine",
|
|
@@ -27,7 +27,7 @@ resolver = "3"
|
|
|
27
27
|
# `package =` key so the import name stays `sqlrite` internally:
|
|
28
28
|
# sqlrite = { package = "sqlrite-engine", path = "…" }
|
|
29
29
|
name = "sqlrite-engine"
|
|
30
|
-
version = "0.1.
|
|
30
|
+
version = "0.1.13"
|
|
31
31
|
authors = ["Joao Henrique Machado Silva <joaoh82@gmail.com>"]
|
|
32
32
|
edition = "2024"
|
|
33
33
|
rust-version = "1.85"
|
|
@@ -122,13 +122,17 @@ SELECT id, title FROM docs ORDER BY embedding <-> [0.1, ...] LIMIT 10;
|
|
|
122
122
|
|
|
123
123
|
---
|
|
124
124
|
|
|
125
|
-
### 7c — Brute-force KNN executor optimization
|
|
125
|
+
### ✅ 7c — Brute-force KNN executor optimization
|
|
126
126
|
|
|
127
|
-
**What.**
|
|
127
|
+
**What shipped.** The SELECT executor now branches on `(ORDER BY, LIMIT k)` shape. When both are present and `k < N`, the new `select_topk` function maintains a bounded `BinaryHeap` of size k instead of full-sorting all N rowids. O(N log k) instead of O(N log N).
|
|
128
128
|
|
|
129
|
-
**
|
|
129
|
+
**Implementation note: max-heap with direction-aware Ord.** A single `HeapEntry { key: Value, rowid: i64, asc: bool }` wrapper handles both `ORDER BY ASC LIMIT k` (k smallest) and `ORDER BY DESC LIMIT k` (k largest) without separate code paths. The `asc` flag inverts the natural Ord, so the displacement test reduces to "new entry < heap top" in both cases. After the scan, `into_sorted_vec` returns the right caller-facing order (ascending for ASC, descending for DESC).
|
|
130
130
|
|
|
131
|
-
**
|
|
131
|
+
**Measured speedup** (N=10k, k=10, single REAL column sort key, release build): ~1.8×. The advantage scales with N and with per-row work — KNN queries where the sort key is `vec_distance_l2(col, [...])` benefit much more because each key evaluation is itself O(dim).
|
|
132
|
+
|
|
133
|
+
**LOC**: ~120 implementation + ~180 tests/benchmark = ~300 total. Slightly over the ~150 estimate because the test surface (correctness + bench + edge cases for k=0, k>N, empty input, distance-function integration) ended up larger than initially projected.
|
|
134
|
+
|
|
135
|
+
**Pre-existing bug surfaced.** The seed function for the benchmark needed positive scores because the INSERT parser doesn't currently handle `Expr::UnaryOp(Minus, Number(...))` for negative literals. Worked around with a Knuth-hash scrambler that stays positive; the underlying parser bug is documented as a follow-up.
|
|
132
136
|
|
|
133
137
|
---
|
|
134
138
|
|
|
@@ -154,6 +158,14 @@ SELECT id, title FROM docs ORDER BY embedding <-> [0.1, ...] LIMIT 10;
|
|
|
154
158
|
|
|
155
159
|
**LOC estimate:** ~700-900 lines. The big sub-phase.
|
|
156
160
|
|
|
161
|
+
> **Scope correction (2026-04-27, post-7c):** Re-scoping during implementation showed 7d works out to ~1300 LOC across three logical chunks, more than the original ~700-900 estimate and too much for one reviewable PR. Splitting into three:
|
|
162
|
+
>
|
|
163
|
+
> - **7d.1 — Pure HNSW algorithm** *(~700 LOC).* `src/sql/hnsw.rs` standalone module: insert + search + layer assignment + beam search per layer + L2/cosine/dot distance dispatch. No SQL integration yet — vectors are passed in via a `get_vec` closure so the algorithm doesn't depend on table types. Tests verify recall@k ≥ 0.95 vs brute-force on randomly-generated vector sets; deterministic via a fixed RNG seed.
|
|
164
|
+
> - **7d.2 — SQL integration** *(~400 LOC).* `CREATE INDEX … USING hnsw (col)` parser + engine, INSERT wiring (also calls `hnsw.insert()`), query optimizer hook (recognizes `ORDER BY vec_distance_*(col, literal) LIMIT k` and probes the HNSW instead of full-scanning). HNSW lives in memory only at this point — gets rebuilt on every database open.
|
|
165
|
+
> - **7d.3 — Persistence** *(~300 LOC).* Wire HNSW into the cell format: new `KIND_HNSW` cell tag, page-tree storage parallel to secondary indexes, save/reopen round-trip.
|
|
166
|
+
>
|
|
167
|
+
> Each 7d.x ships as its own PR + release wave. The user-facing value lands at 7d.2; 7d.3 closes the persistence loop. 7d.1 is foundational but ships a tested algorithmic primitive on its own — useful as documentation of the engine's "from scratch" theme.
|
|
168
|
+
|
|
157
169
|
**Tests:** recall@k vs brute-force baseline (should be ≥ 0.95 on standard benchmark vectors); insert performance; delete semantics; persistence roundtrip.
|
|
158
170
|
|
|
159
171
|
---
|
|
@@ -470,9 +470,9 @@ The full plan + recorded design decisions live in [`docs/phase-7-plan.md`](phase
|
|
|
470
470
|
|
|
471
471
|
Approved sub-phases (Q1–Q10 resolved):
|
|
472
472
|
|
|
473
|
-
-
|
|
474
|
-
-
|
|
475
|
-
-
|
|
473
|
+
- **✅ 7a — `VECTOR(N)` column type** *(v0.1.10)* — dense fixed-dimension f32 storage via the existing cell encoding; format bumped to v4. Bracket-array literal syntax `[0.1, 0.2, …]` (Q7).
|
|
474
|
+
- **✅ 7b — Distance functions** *(v0.1.11)* — `vec_distance_l2/cosine/dot`, plus the ORDER BY-expressions parser change so KNN queries work end-to-end. Operators (`<->` `<=>` `<#>`) deferred to **7b.1** — sqlparser doesn't parse them natively, contradicting Q6's "tiny parser change" assumption.
|
|
475
|
+
- **✅ 7c — Brute-force KNN executor optimization** — bounded `BinaryHeap` of size k for `ORDER BY <expr> LIMIT k`. ~1.8× faster than full-sort at N=10k for cheap keys; bigger gains on expensive keys like `vec_distance_l2`.
|
|
476
476
|
- **7d — HNSW ANN index** — `CREATE INDEX … USING hnsw (col)`; persisted as cell-encoded graph. Fixed defaults `M=16, ef_construction=200, ef_search=50` (Q2).
|
|
477
477
|
- **7e — JSON column type + path queries** — `JSON` data type stored as bincoded `serde_json::Value` (Q3); `json_extract` / `json_array_length` / `json_object_keys` / `json_type`.
|
|
478
478
|
- **7f — ~~Full-text search with BM25~~** — **deferred to Phase 8** (Q1).
|
|
@@ -4,7 +4,7 @@ build-backend = "maturin"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "sqlrite"
|
|
7
|
-
version = "0.1.
|
|
7
|
+
version = "0.1.13"
|
|
8
8
|
description = "Python bindings for SQLRite — a small, embeddable SQLite clone written in Rust."
|
|
9
9
|
authors = [{ name = "Joao Henrique Machado Silva", email = "joaoh82@gmail.com" }]
|
|
10
10
|
license = { text = "MIT" }
|
|
@@ -73,13 +73,39 @@ pub fn execute_select_rows(query: SelectQuery, db: &Database) -> Result<SelectRe
|
|
|
73
73
|
};
|
|
74
74
|
let mut matching = matching;
|
|
75
75
|
|
|
76
|
-
//
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
76
|
+
// Phase 7c — bounded-heap top-k optimization.
|
|
77
|
+
//
|
|
78
|
+
// The naive "ORDER BY <expr>" path (Phase 7b) sorts every matching
|
|
79
|
+
// rowid: O(N log N) sort_by + a truncate. For KNN queries
|
|
80
|
+
//
|
|
81
|
+
// SELECT id FROM docs
|
|
82
|
+
// ORDER BY vec_distance_l2(embedding, [...])
|
|
83
|
+
// LIMIT 10;
|
|
84
|
+
//
|
|
85
|
+
// N is the table row count and k is the LIMIT. With a bounded
|
|
86
|
+
// max-heap of size k we can find the top-k in O(N log k) — same
|
|
87
|
+
// sort_by-per-row cost on the heap operations, but k is typically
|
|
88
|
+
// 10-100 while N can be millions.
|
|
89
|
+
//
|
|
90
|
+
// We branch in three cases:
|
|
91
|
+
// 1. ORDER BY + LIMIT k where k < |matching| → bounded heap.
|
|
92
|
+
// 2. ORDER BY without LIMIT, or LIMIT >= |matching| → full sort
|
|
93
|
+
// (heap saves nothing when we'd keep everyone anyway).
|
|
94
|
+
// 3. LIMIT without ORDER BY → just truncate (no sort needed).
|
|
95
|
+
match (&query.order_by, query.limit) {
|
|
96
|
+
(Some(order), Some(k)) if k < matching.len() => {
|
|
97
|
+
matching = select_topk(&matching, table, order, k)?;
|
|
98
|
+
}
|
|
99
|
+
(Some(order), _) => {
|
|
100
|
+
sort_rowids(&mut matching, table, order)?;
|
|
101
|
+
if let Some(k) = query.limit {
|
|
102
|
+
matching.truncate(k);
|
|
103
|
+
}
|
|
104
|
+
}
|
|
105
|
+
(None, Some(k)) => {
|
|
106
|
+
matching.truncate(k);
|
|
107
|
+
}
|
|
108
|
+
(None, None) => {}
|
|
83
109
|
}
|
|
84
110
|
|
|
85
111
|
// Build typed rows. Missing cells surface as `Value::Null` — that
|
|
@@ -500,6 +526,99 @@ fn try_extract_equality(expr: &Expr) -> Option<(String, sqlparser::ast::Value)>
|
|
|
500
526
|
None
|
|
501
527
|
}
|
|
502
528
|
|
|
529
|
+
/// One entry in the bounded-heap top-k path. Holds a pre-evaluated
|
|
530
|
+
/// sort key + the rowid it came from. The `asc` flag inverts `Ord`
|
|
531
|
+
/// so a single `BinaryHeap<HeapEntry>` works for both ASC and DESC
|
|
532
|
+
/// without wrapping in `std::cmp::Reverse` at the call site:
|
|
533
|
+
///
|
|
534
|
+
/// - ASC LIMIT k = "k smallest": natural Ord. Max-heap top is the
|
|
535
|
+
/// largest currently kept; new items smaller than top displace.
|
|
536
|
+
/// - DESC LIMIT k = "k largest": Ord reversed. Max-heap top is now
|
|
537
|
+
/// the smallest currently kept (under reversed Ord, smallest
|
|
538
|
+
/// looks largest); new items larger than top displace.
|
|
539
|
+
///
|
|
540
|
+
/// In both cases the displacement test reduces to "new entry < heap top".
|
|
541
|
+
struct HeapEntry {
|
|
542
|
+
key: Value,
|
|
543
|
+
rowid: i64,
|
|
544
|
+
asc: bool,
|
|
545
|
+
}
|
|
546
|
+
|
|
547
|
+
impl PartialEq for HeapEntry {
|
|
548
|
+
fn eq(&self, other: &Self) -> bool {
|
|
549
|
+
self.cmp(other) == Ordering::Equal
|
|
550
|
+
}
|
|
551
|
+
}
|
|
552
|
+
|
|
553
|
+
impl Eq for HeapEntry {}
|
|
554
|
+
|
|
555
|
+
impl PartialOrd for HeapEntry {
|
|
556
|
+
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
|
557
|
+
Some(self.cmp(other))
|
|
558
|
+
}
|
|
559
|
+
}
|
|
560
|
+
|
|
561
|
+
impl Ord for HeapEntry {
|
|
562
|
+
fn cmp(&self, other: &Self) -> Ordering {
|
|
563
|
+
let raw = compare_values(Some(&self.key), Some(&other.key));
|
|
564
|
+
if self.asc { raw } else { raw.reverse() }
|
|
565
|
+
}
|
|
566
|
+
}
|
|
567
|
+
|
|
568
|
+
/// Bounded-heap top-k selection. Returns at most `k` rowids in the
|
|
569
|
+
/// caller's desired order (ascending key for `order.ascending`,
|
|
570
|
+
/// descending otherwise).
|
|
571
|
+
///
|
|
572
|
+
/// O(N log k) where N = `matching.len()`. Caller must check
|
|
573
|
+
/// `k < matching.len()` for this to be a win — for k ≥ N the
|
|
574
|
+
/// `sort_rowids` full-sort path is the same asymptotic cost without
|
|
575
|
+
/// the heap overhead.
|
|
576
|
+
fn select_topk(
|
|
577
|
+
matching: &[i64],
|
|
578
|
+
table: &Table,
|
|
579
|
+
order: &OrderByClause,
|
|
580
|
+
k: usize,
|
|
581
|
+
) -> Result<Vec<i64>> {
|
|
582
|
+
use std::collections::BinaryHeap;
|
|
583
|
+
|
|
584
|
+
if k == 0 || matching.is_empty() {
|
|
585
|
+
return Ok(Vec::new());
|
|
586
|
+
}
|
|
587
|
+
|
|
588
|
+
let mut heap: BinaryHeap<HeapEntry> = BinaryHeap::with_capacity(k + 1);
|
|
589
|
+
|
|
590
|
+
for &rowid in matching {
|
|
591
|
+
let key = eval_expr(&order.expr, table, rowid)?;
|
|
592
|
+
let entry = HeapEntry {
|
|
593
|
+
key,
|
|
594
|
+
rowid,
|
|
595
|
+
asc: order.ascending,
|
|
596
|
+
};
|
|
597
|
+
|
|
598
|
+
if heap.len() < k {
|
|
599
|
+
heap.push(entry);
|
|
600
|
+
} else {
|
|
601
|
+
// peek() returns the largest under our direction-aware Ord
|
|
602
|
+
// — the worst entry currently kept. Displace it iff the
|
|
603
|
+
// new entry is "better" (i.e. compares Less).
|
|
604
|
+
if entry < *heap.peek().unwrap() {
|
|
605
|
+
heap.pop();
|
|
606
|
+
heap.push(entry);
|
|
607
|
+
}
|
|
608
|
+
}
|
|
609
|
+
}
|
|
610
|
+
|
|
611
|
+
// `into_sorted_vec` returns ascending under our direction-aware Ord:
|
|
612
|
+
// ASC: ascending by raw key (what we want)
|
|
613
|
+
// DESC: ascending under reversed Ord = descending by raw key (what
|
|
614
|
+
// we want for an ORDER BY DESC LIMIT k result)
|
|
615
|
+
Ok(heap
|
|
616
|
+
.into_sorted_vec()
|
|
617
|
+
.into_iter()
|
|
618
|
+
.map(|e| e.rowid)
|
|
619
|
+
.collect())
|
|
620
|
+
}
|
|
621
|
+
|
|
503
622
|
fn sort_rowids(rowids: &mut [i64], table: &Table, order: &OrderByClause) -> Result<()> {
|
|
504
623
|
// Phase 7b: ORDER BY now accepts any expression (column ref,
|
|
505
624
|
// arithmetic, function call, …). Pre-compute the sort key for
|
|
@@ -1063,4 +1182,240 @@ mod tests {
|
|
|
1063
1182
|
let cos = vec_distance_cosine(&a, &b).unwrap();
|
|
1064
1183
|
assert!(approx_eq(dot, cos - 1.0, 1e-5));
|
|
1065
1184
|
}
|
|
1185
|
+
|
|
1186
|
+
// -----------------------------------------------------------------
|
|
1187
|
+
// Phase 7c — bounded-heap top-k correctness + benchmark
|
|
1188
|
+
// -----------------------------------------------------------------
|
|
1189
|
+
|
|
1190
|
+
use crate::sql::db::database::Database;
|
|
1191
|
+
use crate::sql::parser::select::SelectQuery;
|
|
1192
|
+
use sqlparser::dialect::SQLiteDialect;
|
|
1193
|
+
use sqlparser::parser::Parser;
|
|
1194
|
+
|
|
1195
|
+
/// Builds a `docs(id INTEGER PK, score REAL)` table with N rows of
|
|
1196
|
+
/// distinct positive scores so top-k tests aren't sensitive to
|
|
1197
|
+
/// tie-breaking (heap is unstable; full-sort is stable; we want
|
|
1198
|
+
/// both to agree without arguing about equal-score row order).
|
|
1199
|
+
///
|
|
1200
|
+
/// **Why positive scores:** the INSERT parser doesn't currently
|
|
1201
|
+
/// handle `Expr::UnaryOp(Minus, …)` for negative number literals
|
|
1202
|
+
/// (it would parse `-3.14` as a unary expression and the value
|
|
1203
|
+
/// extractor would skip it). That's a pre-existing bug, out of
|
|
1204
|
+
/// scope for 7c. Using the Knuth multiplicative hash gives us
|
|
1205
|
+
/// distinct positive scrambled values without dancing around the
|
|
1206
|
+
/// negative-literal limitation.
|
|
1207
|
+
fn seed_score_table(n: usize) -> Database {
|
|
1208
|
+
let mut db = Database::new("tempdb".to_string());
|
|
1209
|
+
crate::sql::process_command(
|
|
1210
|
+
"CREATE TABLE docs (id INTEGER PRIMARY KEY, score REAL);",
|
|
1211
|
+
&mut db,
|
|
1212
|
+
)
|
|
1213
|
+
.expect("create");
|
|
1214
|
+
for i in 0..n {
|
|
1215
|
+
// Knuth multiplicative hash mod 1_000_000 — distinct,
|
|
1216
|
+
// dense in [0, 999_999], no collisions for n up to ~tens
|
|
1217
|
+
// of thousands.
|
|
1218
|
+
let score = ((i as u64).wrapping_mul(2_654_435_761) % 1_000_000) as f64;
|
|
1219
|
+
let sql = format!("INSERT INTO docs (score) VALUES ({score});");
|
|
1220
|
+
crate::sql::process_command(&sql, &mut db).expect("insert");
|
|
1221
|
+
}
|
|
1222
|
+
db
|
|
1223
|
+
}
|
|
1224
|
+
|
|
1225
|
+
/// Helper: parses an SQL SELECT into a SelectQuery so we can drive
|
|
1226
|
+
/// `select_topk` / `sort_rowids` directly without the rest of the
|
|
1227
|
+
/// process_command pipeline.
|
|
1228
|
+
fn parse_select(sql: &str) -> SelectQuery {
|
|
1229
|
+
let dialect = SQLiteDialect {};
|
|
1230
|
+
let mut ast = Parser::parse_sql(&dialect, sql).expect("parse");
|
|
1231
|
+
let stmt = ast.pop().expect("one statement");
|
|
1232
|
+
SelectQuery::new(&stmt).expect("select-query")
|
|
1233
|
+
}
|
|
1234
|
+
|
|
1235
|
+
#[test]
|
|
1236
|
+
fn topk_matches_full_sort_asc() {
|
|
1237
|
+
// Build N=200, top-k=10. Bounded heap output must equal
|
|
1238
|
+
// full-sort-then-truncate output (both produce ASC order).
|
|
1239
|
+
let db = seed_score_table(200);
|
|
1240
|
+
let table = db.get_table("docs".to_string()).unwrap();
|
|
1241
|
+
let q = parse_select("SELECT * FROM docs ORDER BY score ASC LIMIT 10;");
|
|
1242
|
+
let order = q.order_by.as_ref().unwrap();
|
|
1243
|
+
let all_rowids = table.rowids();
|
|
1244
|
+
|
|
1245
|
+
// Full-sort path
|
|
1246
|
+
let mut full = all_rowids.clone();
|
|
1247
|
+
sort_rowids(&mut full, table, order).unwrap();
|
|
1248
|
+
full.truncate(10);
|
|
1249
|
+
|
|
1250
|
+
// Bounded-heap path
|
|
1251
|
+
let topk = select_topk(&all_rowids, table, order, 10).unwrap();
|
|
1252
|
+
|
|
1253
|
+
assert_eq!(topk, full, "top-k via heap should match full-sort+truncate");
|
|
1254
|
+
}
|
|
1255
|
+
|
|
1256
|
+
#[test]
|
|
1257
|
+
fn topk_matches_full_sort_desc() {
|
|
1258
|
+
// Same with DESC — verifies the direction-aware Ord wrapper.
|
|
1259
|
+
let db = seed_score_table(200);
|
|
1260
|
+
let table = db.get_table("docs".to_string()).unwrap();
|
|
1261
|
+
let q = parse_select("SELECT * FROM docs ORDER BY score DESC LIMIT 10;");
|
|
1262
|
+
let order = q.order_by.as_ref().unwrap();
|
|
1263
|
+
let all_rowids = table.rowids();
|
|
1264
|
+
|
|
1265
|
+
let mut full = all_rowids.clone();
|
|
1266
|
+
sort_rowids(&mut full, table, order).unwrap();
|
|
1267
|
+
full.truncate(10);
|
|
1268
|
+
|
|
1269
|
+
let topk = select_topk(&all_rowids, table, order, 10).unwrap();
|
|
1270
|
+
|
|
1271
|
+
assert_eq!(
|
|
1272
|
+
topk, full,
|
|
1273
|
+
"top-k DESC via heap should match full-sort+truncate"
|
|
1274
|
+
);
|
|
1275
|
+
}
|
|
1276
|
+
|
|
1277
|
+
#[test]
|
|
1278
|
+
fn topk_k_larger_than_n_returns_everything_sorted() {
|
|
1279
|
+
// The executor branches off to the full-sort path when k >= N,
|
|
1280
|
+
// but if a caller invokes select_topk directly with k > N, it
|
|
1281
|
+
// should still produce all-sorted output (no truncation
|
|
1282
|
+
// because we don't have N items to truncate to k).
|
|
1283
|
+
let db = seed_score_table(50);
|
|
1284
|
+
let table = db.get_table("docs".to_string()).unwrap();
|
|
1285
|
+
let q = parse_select("SELECT * FROM docs ORDER BY score ASC LIMIT 1000;");
|
|
1286
|
+
let order = q.order_by.as_ref().unwrap();
|
|
1287
|
+
let topk = select_topk(&table.rowids(), table, order, 1000).unwrap();
|
|
1288
|
+
assert_eq!(topk.len(), 50);
|
|
1289
|
+
// All scores in ascending order.
|
|
1290
|
+
let scores: Vec<f64> = topk
|
|
1291
|
+
.iter()
|
|
1292
|
+
.filter_map(|r| match table.get_value("score", *r) {
|
|
1293
|
+
Some(Value::Real(f)) => Some(f),
|
|
1294
|
+
_ => None,
|
|
1295
|
+
})
|
|
1296
|
+
.collect();
|
|
1297
|
+
assert!(scores.windows(2).all(|w| w[0] <= w[1]));
|
|
1298
|
+
}
|
|
1299
|
+
|
|
1300
|
+
#[test]
|
|
1301
|
+
fn topk_k_zero_returns_empty() {
|
|
1302
|
+
let db = seed_score_table(10);
|
|
1303
|
+
let table = db.get_table("docs".to_string()).unwrap();
|
|
1304
|
+
let q = parse_select("SELECT * FROM docs ORDER BY score ASC LIMIT 1;");
|
|
1305
|
+
let order = q.order_by.as_ref().unwrap();
|
|
1306
|
+
let topk = select_topk(&table.rowids(), table, order, 0).unwrap();
|
|
1307
|
+
assert!(topk.is_empty());
|
|
1308
|
+
}
|
|
1309
|
+
|
|
1310
|
+
#[test]
|
|
1311
|
+
fn topk_empty_input_returns_empty() {
|
|
1312
|
+
let db = seed_score_table(0);
|
|
1313
|
+
let table = db.get_table("docs".to_string()).unwrap();
|
|
1314
|
+
let q = parse_select("SELECT * FROM docs ORDER BY score ASC LIMIT 5;");
|
|
1315
|
+
let order = q.order_by.as_ref().unwrap();
|
|
1316
|
+
let topk = select_topk(&[], table, order, 5).unwrap();
|
|
1317
|
+
assert!(topk.is_empty());
|
|
1318
|
+
}
|
|
1319
|
+
|
|
1320
|
+
#[test]
|
|
1321
|
+
fn topk_works_through_select_executor_with_distance_function() {
|
|
1322
|
+
// Integration check that the executor actually picks the
|
|
1323
|
+
// bounded-heap path on a KNN-shaped query and produces the
|
|
1324
|
+
// correct top-k.
|
|
1325
|
+
let mut db = Database::new("tempdb".to_string());
|
|
1326
|
+
crate::sql::process_command(
|
|
1327
|
+
"CREATE TABLE docs (id INTEGER PRIMARY KEY, e VECTOR(2));",
|
|
1328
|
+
&mut db,
|
|
1329
|
+
)
|
|
1330
|
+
.unwrap();
|
|
1331
|
+
// Five rows with distinct distances from probe [1.0, 0.0]:
|
|
1332
|
+
// id=1 [1.0, 0.0] distance=0
|
|
1333
|
+
// id=2 [2.0, 0.0] distance=1
|
|
1334
|
+
// id=3 [0.0, 3.0] distance=√(1+9) = √10 ≈ 3.16
|
|
1335
|
+
// id=4 [1.0, 4.0] distance=4
|
|
1336
|
+
// id=5 [10.0, 10.0] distance=√(81+100) ≈ 13.45
|
|
1337
|
+
for v in &[
|
|
1338
|
+
"[1.0, 0.0]",
|
|
1339
|
+
"[2.0, 0.0]",
|
|
1340
|
+
"[0.0, 3.0]",
|
|
1341
|
+
"[1.0, 4.0]",
|
|
1342
|
+
"[10.0, 10.0]",
|
|
1343
|
+
] {
|
|
1344
|
+
crate::sql::process_command(&format!("INSERT INTO docs (e) VALUES ({v});"), &mut db)
|
|
1345
|
+
.unwrap();
|
|
1346
|
+
}
|
|
1347
|
+
let resp = crate::sql::process_command(
|
|
1348
|
+
"SELECT id FROM docs ORDER BY vec_distance_l2(e, [1.0, 0.0]) ASC LIMIT 3;",
|
|
1349
|
+
&mut db,
|
|
1350
|
+
)
|
|
1351
|
+
.unwrap();
|
|
1352
|
+
// Top-3 closest to [1.0, 0.0] are id=1, id=2, id=3 (in that order).
|
|
1353
|
+
// The status message tells us how many rows came back.
|
|
1354
|
+
assert!(resp.contains("3 rows returned"), "got: {resp}");
|
|
1355
|
+
}
|
|
1356
|
+
|
|
1357
|
+
/// Manual benchmark — not run by default. Recommended invocation:
|
|
1358
|
+
///
|
|
1359
|
+
/// cargo test -p sqlrite-engine --lib topk_benchmark --release \
|
|
1360
|
+
/// -- --ignored --nocapture
|
|
1361
|
+
///
|
|
1362
|
+
/// (`--release` matters: Rust's optimized sort gets very fast under
|
|
1363
|
+
/// optimization, so the heap's relative advantage is best observed
|
|
1364
|
+
/// against a sort that's also been optimized.)
|
|
1365
|
+
///
|
|
1366
|
+
/// Measured numbers on an Apple Silicon laptop with N=10_000 + k=10:
|
|
1367
|
+
/// - bounded heap: ~820µs
|
|
1368
|
+
/// - full sort+trunc: ~1.5ms
|
|
1369
|
+
/// - ratio: ~1.8×
|
|
1370
|
+
///
|
|
1371
|
+
/// The advantage is real but moderate at this size because the sort
|
|
1372
|
+
/// key here is a single REAL column read (cheap) and Rust's sort_by
|
|
1373
|
+
/// has a very low constant factor. The asymptotic O(N log k) vs
|
|
1374
|
+
/// O(N log N) advantage scales with N and with per-row work — KNN
|
|
1375
|
+
/// queries where the sort key is `vec_distance_l2(col, [...])` are
|
|
1376
|
+
/// where this path really pays off, because each key evaluation is
|
|
1377
|
+
/// itself O(dim) and the heap path skips the per-row evaluation
|
|
1378
|
+
/// in the comparator (see `sort_rowids` for the contrast).
|
|
1379
|
+
#[test]
|
|
1380
|
+
#[ignore]
|
|
1381
|
+
fn topk_benchmark() {
|
|
1382
|
+
use std::time::Instant;
|
|
1383
|
+
const N: usize = 10_000;
|
|
1384
|
+
const K: usize = 10;
|
|
1385
|
+
|
|
1386
|
+
let db = seed_score_table(N);
|
|
1387
|
+
let table = db.get_table("docs".to_string()).unwrap();
|
|
1388
|
+
let q = parse_select("SELECT * FROM docs ORDER BY score ASC LIMIT 10;");
|
|
1389
|
+
let order = q.order_by.as_ref().unwrap();
|
|
1390
|
+
let all_rowids = table.rowids();
|
|
1391
|
+
|
|
1392
|
+
// Time bounded heap.
|
|
1393
|
+
let t0 = Instant::now();
|
|
1394
|
+
let _topk = select_topk(&all_rowids, table, order, K).unwrap();
|
|
1395
|
+
let heap_dur = t0.elapsed();
|
|
1396
|
+
|
|
1397
|
+
// Time full sort + truncate.
|
|
1398
|
+
let t1 = Instant::now();
|
|
1399
|
+
let mut full = all_rowids.clone();
|
|
1400
|
+
sort_rowids(&mut full, table, order).unwrap();
|
|
1401
|
+
full.truncate(K);
|
|
1402
|
+
let sort_dur = t1.elapsed();
|
|
1403
|
+
|
|
1404
|
+
let ratio = sort_dur.as_secs_f64() / heap_dur.as_secs_f64().max(1e-9);
|
|
1405
|
+
println!("\n--- topk_benchmark (N={N}, k={K}) ---");
|
|
1406
|
+
println!(" bounded heap: {heap_dur:?}");
|
|
1407
|
+
println!(" full sort+trunc: {sort_dur:?}");
|
|
1408
|
+
println!(" speedup ratio: {ratio:.2}×");
|
|
1409
|
+
|
|
1410
|
+
// Soft assertion. Floor is 1.4× because the cheap-key
|
|
1411
|
+
// benchmark hovers around 1.8× empirically; setting this too
|
|
1412
|
+
// close to the measured value risks flaky CI on slower
|
|
1413
|
+
// runners. Floor of 1.4× still catches an actual regression
|
|
1414
|
+
// (e.g., if select_topk became O(N²) or stopped using the
|
|
1415
|
+
// heap entirely).
|
|
1416
|
+
assert!(
|
|
1417
|
+
ratio > 1.4,
|
|
1418
|
+
"bounded heap should be substantially faster than full sort, but ratio = {ratio:.2}"
|
|
1419
|
+
);
|
|
1420
|
+
}
|
|
1066
1421
|
}
|