sqlrite 0.1.11__tar.gz → 0.1.12__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (108) hide show
  1. {sqlrite-0.1.11 → sqlrite-0.1.12}/Cargo.lock +5 -5
  2. {sqlrite-0.1.11 → sqlrite-0.1.12}/Cargo.toml +1 -1
  3. {sqlrite-0.1.11 → sqlrite-0.1.12}/PKG-INFO +1 -1
  4. {sqlrite-0.1.11 → sqlrite-0.1.12}/desktop/package.json +1 -1
  5. {sqlrite-0.1.11 → sqlrite-0.1.12}/docs/phase-7-plan.md +8 -4
  6. {sqlrite-0.1.11 → sqlrite-0.1.12}/docs/roadmap.md +3 -3
  7. {sqlrite-0.1.11 → sqlrite-0.1.12}/pyproject.toml +1 -1
  8. {sqlrite-0.1.11 → sqlrite-0.1.12}/sdk/python/Cargo.toml +1 -1
  9. {sqlrite-0.1.11 → sqlrite-0.1.12}/src/sql/executor.rs +362 -7
  10. {sqlrite-0.1.11 → sqlrite-0.1.12}/.github/workflows/ci.yml +0 -0
  11. {sqlrite-0.1.11 → sqlrite-0.1.12}/.github/workflows/release-pr.yml +0 -0
  12. {sqlrite-0.1.11 → sqlrite-0.1.12}/.github/workflows/release.yml +0 -0
  13. {sqlrite-0.1.11 → sqlrite-0.1.12}/.github/workflows/rust.yml +0 -0
  14. {sqlrite-0.1.11 → sqlrite-0.1.12}/.gitignore +0 -0
  15. {sqlrite-0.1.11 → sqlrite-0.1.12}/CODE_OF_CONDUCT.md +0 -0
  16. {sqlrite-0.1.11 → sqlrite-0.1.12}/LICENSE +0 -0
  17. {sqlrite-0.1.11 → sqlrite-0.1.12}/MAINTAINERS +0 -0
  18. {sqlrite-0.1.11 → sqlrite-0.1.12}/Makefile +0 -0
  19. {sqlrite-0.1.11 → sqlrite-0.1.12}/README.md +0 -0
  20. {sqlrite-0.1.11 → sqlrite-0.1.12}/desktop/index.html +0 -0
  21. {sqlrite-0.1.11 → sqlrite-0.1.12}/desktop/package-lock.json +0 -0
  22. {sqlrite-0.1.11 → sqlrite-0.1.12}/desktop/src/App.svelte +0 -0
  23. {sqlrite-0.1.11 → sqlrite-0.1.12}/desktop/src/app.css +0 -0
  24. {sqlrite-0.1.11 → sqlrite-0.1.12}/desktop/src/main.ts +0 -0
  25. {sqlrite-0.1.11 → sqlrite-0.1.12}/desktop/src/vite-env.d.ts +0 -0
  26. {sqlrite-0.1.11 → sqlrite-0.1.12}/desktop/svelte.config.js +0 -0
  27. {sqlrite-0.1.11 → sqlrite-0.1.12}/desktop/tsconfig.json +0 -0
  28. {sqlrite-0.1.11 → sqlrite-0.1.12}/desktop/vite.config.ts +0 -0
  29. {sqlrite-0.1.11 → sqlrite-0.1.12}/docs/_index.md +0 -0
  30. {sqlrite-0.1.11 → sqlrite-0.1.12}/docs/architecture.md +0 -0
  31. {sqlrite-0.1.11 → sqlrite-0.1.12}/docs/design-decisions.md +0 -0
  32. {sqlrite-0.1.11 → sqlrite-0.1.12}/docs/desktop.md +0 -0
  33. {sqlrite-0.1.11 → sqlrite-0.1.12}/docs/embedding.md +0 -0
  34. {sqlrite-0.1.11 → sqlrite-0.1.12}/docs/file-format.md +0 -0
  35. {sqlrite-0.1.11 → sqlrite-0.1.12}/docs/getting-started.md +0 -0
  36. {sqlrite-0.1.11 → sqlrite-0.1.12}/docs/pager.md +0 -0
  37. {sqlrite-0.1.11 → sqlrite-0.1.12}/docs/release-plan.md +0 -0
  38. {sqlrite-0.1.11 → sqlrite-0.1.12}/docs/release-secrets.md +0 -0
  39. {sqlrite-0.1.11 → sqlrite-0.1.12}/docs/smoke-test.md +0 -0
  40. {sqlrite-0.1.11 → sqlrite-0.1.12}/docs/sql-engine.md +0 -0
  41. {sqlrite-0.1.11 → sqlrite-0.1.12}/docs/storage-model.md +0 -0
  42. {sqlrite-0.1.11 → sqlrite-0.1.12}/docs/supported-sql.md +0 -0
  43. {sqlrite-0.1.11 → sqlrite-0.1.12}/docs/usage.md +0 -0
  44. {sqlrite-0.1.11 → sqlrite-0.1.12}/examples/README.md +0 -0
  45. {sqlrite-0.1.11 → sqlrite-0.1.12}/examples/c/Makefile +0 -0
  46. {sqlrite-0.1.11 → sqlrite-0.1.12}/examples/c/hello.c +0 -0
  47. {sqlrite-0.1.11 → sqlrite-0.1.12}/examples/go/go.mod +0 -0
  48. {sqlrite-0.1.11 → sqlrite-0.1.12}/examples/go/hello.go +0 -0
  49. {sqlrite-0.1.11 → sqlrite-0.1.12}/examples/nodejs/hello.mjs +0 -0
  50. {sqlrite-0.1.11 → sqlrite-0.1.12}/examples/python/hello.py +0 -0
  51. {sqlrite-0.1.11 → sqlrite-0.1.12}/examples/rust/quickstart.rs +0 -0
  52. {sqlrite-0.1.11 → sqlrite-0.1.12}/examples/wasm/Makefile +0 -0
  53. {sqlrite-0.1.11 → sqlrite-0.1.12}/examples/wasm/index.html +0 -0
  54. {sqlrite-0.1.11 → sqlrite-0.1.12}/images/SQLRite - Desktop.png +0 -0
  55. {sqlrite-0.1.11 → sqlrite-0.1.12}/images/SQLRite Data Structures.png +0 -0
  56. {sqlrite-0.1.11 → sqlrite-0.1.12}/images/SQLRite Simple SQL Execution High Level Diagram.png +0 -0
  57. {sqlrite-0.1.11 → sqlrite-0.1.12}/images/SQLRite Simple SQL INSERT Execution High Level Diagram (Insert Row).png +0 -0
  58. {sqlrite-0.1.11 → sqlrite-0.1.12}/images/SQLRite Simple SQL INSERT Execution High Level Diagram.png +0 -0
  59. {sqlrite-0.1.11 → sqlrite-0.1.12}/images/SQLRite_logo.png +0 -0
  60. {sqlrite-0.1.11 → sqlrite-0.1.12}/images/architecture.png +0 -0
  61. {sqlrite-0.1.11 → sqlrite-0.1.12}/rust-toolchain.toml +0 -0
  62. {sqlrite-0.1.11 → sqlrite-0.1.12}/samples/AST.delete.example +0 -0
  63. {sqlrite-0.1.11 → sqlrite-0.1.12}/samples/AST.insert.exemple +0 -0
  64. {sqlrite-0.1.11 → sqlrite-0.1.12}/samples/AST.select.example +0 -0
  65. {sqlrite-0.1.11 → sqlrite-0.1.12}/samples/AST.update.example +0 -0
  66. {sqlrite-0.1.11 → sqlrite-0.1.12}/samples/CREATE TABLE sqlrite_schema.sql +0 -0
  67. {sqlrite-0.1.11 → sqlrite-0.1.12}/samples/CREATE_TABLE with duplicate.sql +0 -0
  68. {sqlrite-0.1.11 → sqlrite-0.1.12}/samples/CREATE_TABLE.sql +0 -0
  69. {sqlrite-0.1.11 → sqlrite-0.1.12}/samples/INSERT.sql +0 -0
  70. {sqlrite-0.1.11 → sqlrite-0.1.12}/scripts/bump-version.sh +0 -0
  71. {sqlrite-0.1.11 → sqlrite-0.1.12}/sdk/go/README.md +0 -0
  72. {sqlrite-0.1.11 → sqlrite-0.1.12}/sdk/go/conn.go +0 -0
  73. {sqlrite-0.1.11 → sqlrite-0.1.12}/sdk/go/go.mod +0 -0
  74. {sqlrite-0.1.11 → sqlrite-0.1.12}/sdk/go/rows.go +0 -0
  75. {sqlrite-0.1.11 → sqlrite-0.1.12}/sdk/go/sqlrite.go +0 -0
  76. {sqlrite-0.1.11 → sqlrite-0.1.12}/sdk/go/sqlrite_test.go +0 -0
  77. {sqlrite-0.1.11 → sqlrite-0.1.12}/sdk/go/stmt.go +0 -0
  78. {sqlrite-0.1.11 → sqlrite-0.1.12}/sdk/python/README.md +0 -0
  79. {sqlrite-0.1.11 → sqlrite-0.1.12}/sdk/python/src/lib.rs +0 -0
  80. {sqlrite-0.1.11 → sqlrite-0.1.12}/sdk/python/tests/test_sqlrite.py +0 -0
  81. {sqlrite-0.1.11 → sqlrite-0.1.12}/src/connection.rs +0 -0
  82. {sqlrite-0.1.11 → sqlrite-0.1.12}/src/error.rs +0 -0
  83. {sqlrite-0.1.11 → sqlrite-0.1.12}/src/lib.rs +0 -0
  84. {sqlrite-0.1.11 → sqlrite-0.1.12}/src/main.rs +0 -0
  85. {sqlrite-0.1.11 → sqlrite-0.1.12}/src/meta_command/mod.rs +0 -0
  86. {sqlrite-0.1.11 → sqlrite-0.1.12}/src/repl/mod.rs +0 -0
  87. {sqlrite-0.1.11 → sqlrite-0.1.12}/src/sql/db/database.rs +0 -0
  88. {sqlrite-0.1.11 → sqlrite-0.1.12}/src/sql/db/mod.rs +0 -0
  89. {sqlrite-0.1.11 → sqlrite-0.1.12}/src/sql/db/secondary_index.rs +0 -0
  90. {sqlrite-0.1.11 → sqlrite-0.1.12}/src/sql/db/table.rs +0 -0
  91. {sqlrite-0.1.11 → sqlrite-0.1.12}/src/sql/mod.rs +0 -0
  92. {sqlrite-0.1.11 → sqlrite-0.1.12}/src/sql/pager/cell.rs +0 -0
  93. {sqlrite-0.1.11 → sqlrite-0.1.12}/src/sql/pager/file.rs +0 -0
  94. {sqlrite-0.1.11 → sqlrite-0.1.12}/src/sql/pager/header.rs +0 -0
  95. {sqlrite-0.1.11 → sqlrite-0.1.12}/src/sql/pager/index_cell.rs +0 -0
  96. {sqlrite-0.1.11 → sqlrite-0.1.12}/src/sql/pager/interior_page.rs +0 -0
  97. {sqlrite-0.1.11 → sqlrite-0.1.12}/src/sql/pager/mod.rs +0 -0
  98. {sqlrite-0.1.11 → sqlrite-0.1.12}/src/sql/pager/overflow.rs +0 -0
  99. {sqlrite-0.1.11 → sqlrite-0.1.12}/src/sql/pager/page.rs +0 -0
  100. {sqlrite-0.1.11 → sqlrite-0.1.12}/src/sql/pager/pager.rs +0 -0
  101. {sqlrite-0.1.11 → sqlrite-0.1.12}/src/sql/pager/table_page.rs +0 -0
  102. {sqlrite-0.1.11 → sqlrite-0.1.12}/src/sql/pager/varint.rs +0 -0
  103. {sqlrite-0.1.11 → sqlrite-0.1.12}/src/sql/pager/wal.rs +0 -0
  104. {sqlrite-0.1.11 → sqlrite-0.1.12}/src/sql/parser/create.rs +0 -0
  105. {sqlrite-0.1.11 → sqlrite-0.1.12}/src/sql/parser/insert.rs +0 -0
  106. {sqlrite-0.1.11 → sqlrite-0.1.12}/src/sql/parser/mod.rs +0 -0
  107. {sqlrite-0.1.11 → sqlrite-0.1.12}/src/sql/parser/select.rs +0 -0
  108. {sqlrite-0.1.11 → sqlrite-0.1.12}/src/sql/tokenizer.rs +0 -0
@@ -3736,7 +3736,7 @@ dependencies = [
3736
3736
 
3737
3737
  [[package]]
3738
3738
  name = "sqlrite-desktop"
3739
- version = "0.1.11"
3739
+ version = "0.1.12"
3740
3740
  dependencies = [
3741
3741
  "serde",
3742
3742
  "serde_json",
@@ -3748,7 +3748,7 @@ dependencies = [
3748
3748
 
3749
3749
  [[package]]
3750
3750
  name = "sqlrite-engine"
3751
- version = "0.1.11"
3751
+ version = "0.1.12"
3752
3752
  dependencies = [
3753
3753
  "clap",
3754
3754
  "env_logger",
@@ -3763,7 +3763,7 @@ dependencies = [
3763
3763
 
3764
3764
  [[package]]
3765
3765
  name = "sqlrite-ffi"
3766
- version = "0.1.11"
3766
+ version = "0.1.12"
3767
3767
  dependencies = [
3768
3768
  "cbindgen",
3769
3769
  "sqlrite-engine",
@@ -3771,7 +3771,7 @@ dependencies = [
3771
3771
 
3772
3772
  [[package]]
3773
3773
  name = "sqlrite-nodejs"
3774
- version = "0.1.11"
3774
+ version = "0.1.12"
3775
3775
  dependencies = [
3776
3776
  "napi",
3777
3777
  "napi-build",
@@ -3781,7 +3781,7 @@ dependencies = [
3781
3781
 
3782
3782
  [[package]]
3783
3783
  name = "sqlrite-python"
3784
- version = "0.1.11"
3784
+ version = "0.1.12"
3785
3785
  dependencies = [
3786
3786
  "pyo3",
3787
3787
  "sqlrite-engine",
@@ -27,7 +27,7 @@ resolver = "3"
27
27
  # `package =` key so the import name stays `sqlrite` internally:
28
28
  # sqlrite = { package = "sqlrite-engine", path = "…" }
29
29
  name = "sqlrite-engine"
30
- version = "0.1.11"
30
+ version = "0.1.12"
31
31
  authors = ["Joao Henrique Machado Silva <joaoh82@gmail.com>"]
32
32
  edition = "2024"
33
33
  rust-version = "1.85"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sqlrite
3
- Version: 0.1.11
3
+ Version: 0.1.12
4
4
  Classifier: Development Status :: 3 - Alpha
5
5
  Classifier: Intended Audience :: Developers
6
6
  Classifier: License :: OSI Approved :: MIT License
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "sqlrite-desktop-frontend",
3
3
  "private": true,
4
- "version": "0.1.11",
4
+ "version": "0.1.12",
5
5
  "type": "module",
6
6
  "scripts": {
7
7
  "dev": "vite",
@@ -122,13 +122,17 @@ SELECT id, title FROM docs ORDER BY embedding <-> [0.1, ...] LIMIT 10;
122
122
 
123
123
  ---
124
124
 
125
- ### 7c — Brute-force KNN executor optimization
125
+ ### 7c — Brute-force KNN executor optimization
126
126
 
127
- **What.** Recognize the pattern `ORDER BY <distance-expr> LIMIT k` and execute it with a bounded min-heap (size k) instead of a full sort. O(N log k) instead of O(N log N).
127
+ **What shipped.** The SELECT executor now branches on `(ORDER BY, LIMIT k)` shape. When both are present and `k < N`, the new `select_topk` function maintains a bounded `BinaryHeap` of size k instead of full-sorting all N rowids. O(N log k) instead of O(N log N).
128
128
 
129
- **Why a separate sub-phase.** 7b makes it work; 7c makes it fast enough to be useful on millions of rows. Worth shipping as its own commit so the perf delta is visible in benchmarks.
129
+ **Implementation note: max-heap with direction-aware Ord.** A single `HeapEntry { key: Value, rowid: i64, asc: bool }` wrapper handles both `ORDER BY ASC LIMIT k` (k smallest) and `ORDER BY DESC LIMIT k` (k largest) without separate code paths. The `asc` flag inverts the natural Ord, so the displacement test reduces to "new entry < heap top" in both cases. After the scan, `into_sorted_vec` returns the right caller-facing order (ascending for ASC, descending for DESC).
130
130
 
131
- **LOC estimate:** ~150 lines including a tiny benchmark to prove the speedup.
131
+ **Measured speedup** (N=10k, k=10, single REAL column sort key, release build): ~1.8×. The advantage scales with N and with per-row work — KNN queries where the sort key is `vec_distance_l2(col, [...])` benefit much more because each key evaluation is itself O(dim).
132
+
133
+ **LOC**: ~120 implementation + ~180 tests/benchmark = ~300 total. Slightly over the ~150 estimate because the test surface (correctness + bench + edge cases for k=0, k>N, empty input, distance-function integration) ended up larger than initially projected.
134
+
135
+ **Pre-existing bug surfaced.** The seed function for the benchmark needed positive scores because the INSERT parser doesn't currently handle `Expr::UnaryOp(Minus, Number(...))` for negative literals. Worked around with a Knuth-hash scrambler that stays positive; the underlying parser bug is documented as a follow-up.
132
136
 
133
137
  ---
134
138
 
@@ -470,9 +470,9 @@ The full plan + recorded design decisions live in [`docs/phase-7-plan.md`](phase
470
470
 
471
471
  Approved sub-phases (Q1–Q10 resolved):
472
472
 
473
- - **7a — `VECTOR(N)` column type** — dense fixed-dimension f32 storage via the existing cell encoding; bump file format to v4. Bracket-array literal syntax `[0.1, 0.2, …]` (Q7).
474
- - **7b — Distance functions + KNN operators** — `vec_distance_l2/cosine/dot` plus pgvector-style `<->` `<=>` `<#>` operators (Q6).
475
- - **7c — Brute-force KNN executor optimization** — recognize `ORDER BY <distance> LIMIT k`, use bounded min-heap.
473
+ - **✅ 7a — `VECTOR(N)` column type** *(v0.1.10)* — dense fixed-dimension f32 storage via the existing cell encoding; format bumped to v4. Bracket-array literal syntax `[0.1, 0.2, …]` (Q7).
474
+ - **✅ 7b — Distance functions** *(v0.1.11)* — `vec_distance_l2/cosine/dot`, plus the ORDER BY-expressions parser change so KNN queries work end-to-end. Operators (`<->` `<=>` `<#>`) deferred to **7b.1** — sqlparser doesn't parse them natively, contradicting Q6's "tiny parser change" assumption.
475
+ - **✅ 7c — Brute-force KNN executor optimization** — bounded `BinaryHeap` of size k for `ORDER BY <expr> LIMIT k`. ~1.8× faster than full-sort at N=10k for cheap keys; bigger gains on expensive keys like `vec_distance_l2`.
476
476
  - **7d — HNSW ANN index** — `CREATE INDEX … USING hnsw (col)`; persisted as cell-encoded graph. Fixed defaults `M=16, ef_construction=200, ef_search=50` (Q2).
477
477
  - **7e — JSON column type + path queries** — `JSON` data type stored as bincoded `serde_json::Value` (Q3); `json_extract` / `json_array_length` / `json_object_keys` / `json_type`.
478
478
  - **7f — ~~Full-text search with BM25~~** — **deferred to Phase 8** (Q1).
@@ -4,7 +4,7 @@ build-backend = "maturin"
4
4
 
5
5
  [project]
6
6
  name = "sqlrite"
7
- version = "0.1.11"
7
+ version = "0.1.12"
8
8
  description = "Python bindings for SQLRite — a small, embeddable SQLite clone written in Rust."
9
9
  authors = [{ name = "Joao Henrique Machado Silva", email = "joaoh82@gmail.com" }]
10
10
  license = { text = "MIT" }
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "sqlrite-python"
3
- version = "0.1.11"
3
+ version = "0.1.12"
4
4
  authors = ["Joao Henrique Machado Silva <joaoh82@gmail.com>"]
5
5
  edition = "2024"
6
6
  rust-version = "1.85"
@@ -73,13 +73,39 @@ pub fn execute_select_rows(query: SelectQuery, db: &Database) -> Result<SelectRe
73
73
  };
74
74
  let mut matching = matching;
75
75
 
76
- // Sort before applying LIMIT, matching SQL semantics.
77
- if let Some(order) = &query.order_by {
78
- sort_rowids(&mut matching, table, order)?;
79
- }
80
-
81
- if let Some(n) = query.limit {
82
- matching.truncate(n);
76
+ // Phase 7c bounded-heap top-k optimization.
77
+ //
78
+ // The naive "ORDER BY <expr>" path (Phase 7b) sorts every matching
79
+ // rowid: O(N log N) sort_by + a truncate. For KNN queries
80
+ //
81
+ // SELECT id FROM docs
82
+ // ORDER BY vec_distance_l2(embedding, [...])
83
+ // LIMIT 10;
84
+ //
85
+ // N is the table row count and k is the LIMIT. With a bounded
86
+ // max-heap of size k we can find the top-k in O(N log k) — same
87
+ // sort_by-per-row cost on the heap operations, but k is typically
88
+ // 10-100 while N can be millions.
89
+ //
90
+ // We branch in three cases:
91
+ // 1. ORDER BY + LIMIT k where k < |matching| → bounded heap.
92
+ // 2. ORDER BY without LIMIT, or LIMIT >= |matching| → full sort
93
+ // (heap saves nothing when we'd keep everyone anyway).
94
+ // 3. LIMIT without ORDER BY → just truncate (no sort needed).
95
+ match (&query.order_by, query.limit) {
96
+ (Some(order), Some(k)) if k < matching.len() => {
97
+ matching = select_topk(&matching, table, order, k)?;
98
+ }
99
+ (Some(order), _) => {
100
+ sort_rowids(&mut matching, table, order)?;
101
+ if let Some(k) = query.limit {
102
+ matching.truncate(k);
103
+ }
104
+ }
105
+ (None, Some(k)) => {
106
+ matching.truncate(k);
107
+ }
108
+ (None, None) => {}
83
109
  }
84
110
 
85
111
  // Build typed rows. Missing cells surface as `Value::Null` — that
@@ -500,6 +526,99 @@ fn try_extract_equality(expr: &Expr) -> Option<(String, sqlparser::ast::Value)>
500
526
  None
501
527
  }
502
528
 
529
+ /// One entry in the bounded-heap top-k path. Holds a pre-evaluated
530
+ /// sort key + the rowid it came from. The `asc` flag inverts `Ord`
531
+ /// so a single `BinaryHeap<HeapEntry>` works for both ASC and DESC
532
+ /// without wrapping in `std::cmp::Reverse` at the call site:
533
+ ///
534
+ /// - ASC LIMIT k = "k smallest": natural Ord. Max-heap top is the
535
+ /// largest currently kept; new items smaller than top displace.
536
+ /// - DESC LIMIT k = "k largest": Ord reversed. Max-heap top is now
537
+ /// the smallest currently kept (under reversed Ord, smallest
538
+ /// looks largest); new items larger than top displace.
539
+ ///
540
+ /// In both cases the displacement test reduces to "new entry < heap top".
541
+ struct HeapEntry {
542
+ key: Value,
543
+ rowid: i64,
544
+ asc: bool,
545
+ }
546
+
547
+ impl PartialEq for HeapEntry {
548
+ fn eq(&self, other: &Self) -> bool {
549
+ self.cmp(other) == Ordering::Equal
550
+ }
551
+ }
552
+
553
+ impl Eq for HeapEntry {}
554
+
555
+ impl PartialOrd for HeapEntry {
556
+ fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
557
+ Some(self.cmp(other))
558
+ }
559
+ }
560
+
561
+ impl Ord for HeapEntry {
562
+ fn cmp(&self, other: &Self) -> Ordering {
563
+ let raw = compare_values(Some(&self.key), Some(&other.key));
564
+ if self.asc { raw } else { raw.reverse() }
565
+ }
566
+ }
567
+
568
+ /// Bounded-heap top-k selection. Returns at most `k` rowids in the
569
+ /// caller's desired order (ascending key for `order.ascending`,
570
+ /// descending otherwise).
571
+ ///
572
+ /// O(N log k) where N = `matching.len()`. Caller must check
573
+ /// `k < matching.len()` for this to be a win — for k ≥ N the
574
+ /// `sort_rowids` full-sort path is the same asymptotic cost without
575
+ /// the heap overhead.
576
+ fn select_topk(
577
+ matching: &[i64],
578
+ table: &Table,
579
+ order: &OrderByClause,
580
+ k: usize,
581
+ ) -> Result<Vec<i64>> {
582
+ use std::collections::BinaryHeap;
583
+
584
+ if k == 0 || matching.is_empty() {
585
+ return Ok(Vec::new());
586
+ }
587
+
588
+ let mut heap: BinaryHeap<HeapEntry> = BinaryHeap::with_capacity(k + 1);
589
+
590
+ for &rowid in matching {
591
+ let key = eval_expr(&order.expr, table, rowid)?;
592
+ let entry = HeapEntry {
593
+ key,
594
+ rowid,
595
+ asc: order.ascending,
596
+ };
597
+
598
+ if heap.len() < k {
599
+ heap.push(entry);
600
+ } else {
601
+ // peek() returns the largest under our direction-aware Ord
602
+ // — the worst entry currently kept. Displace it iff the
603
+ // new entry is "better" (i.e. compares Less).
604
+ if entry < *heap.peek().unwrap() {
605
+ heap.pop();
606
+ heap.push(entry);
607
+ }
608
+ }
609
+ }
610
+
611
+ // `into_sorted_vec` returns ascending under our direction-aware Ord:
612
+ // ASC: ascending by raw key (what we want)
613
+ // DESC: ascending under reversed Ord = descending by raw key (what
614
+ // we want for an ORDER BY DESC LIMIT k result)
615
+ Ok(heap
616
+ .into_sorted_vec()
617
+ .into_iter()
618
+ .map(|e| e.rowid)
619
+ .collect())
620
+ }
621
+
503
622
  fn sort_rowids(rowids: &mut [i64], table: &Table, order: &OrderByClause) -> Result<()> {
504
623
  // Phase 7b: ORDER BY now accepts any expression (column ref,
505
624
  // arithmetic, function call, …). Pre-compute the sort key for
@@ -1063,4 +1182,240 @@ mod tests {
1063
1182
  let cos = vec_distance_cosine(&a, &b).unwrap();
1064
1183
  assert!(approx_eq(dot, cos - 1.0, 1e-5));
1065
1184
  }
1185
+
1186
+ // -----------------------------------------------------------------
1187
+ // Phase 7c — bounded-heap top-k correctness + benchmark
1188
+ // -----------------------------------------------------------------
1189
+
1190
+ use crate::sql::db::database::Database;
1191
+ use crate::sql::parser::select::SelectQuery;
1192
+ use sqlparser::dialect::SQLiteDialect;
1193
+ use sqlparser::parser::Parser;
1194
+
1195
+ /// Builds a `docs(id INTEGER PK, score REAL)` table with N rows of
1196
+ /// distinct positive scores so top-k tests aren't sensitive to
1197
+ /// tie-breaking (heap is unstable; full-sort is stable; we want
1198
+ /// both to agree without arguing about equal-score row order).
1199
+ ///
1200
+ /// **Why positive scores:** the INSERT parser doesn't currently
1201
+ /// handle `Expr::UnaryOp(Minus, …)` for negative number literals
1202
+ /// (it would parse `-3.14` as a unary expression and the value
1203
+ /// extractor would skip it). That's a pre-existing bug, out of
1204
+ /// scope for 7c. Using the Knuth multiplicative hash gives us
1205
+ /// distinct positive scrambled values without dancing around the
1206
+ /// negative-literal limitation.
1207
+ fn seed_score_table(n: usize) -> Database {
1208
+ let mut db = Database::new("tempdb".to_string());
1209
+ crate::sql::process_command(
1210
+ "CREATE TABLE docs (id INTEGER PRIMARY KEY, score REAL);",
1211
+ &mut db,
1212
+ )
1213
+ .expect("create");
1214
+ for i in 0..n {
1215
+ // Knuth multiplicative hash mod 1_000_000 — distinct,
1216
+ // dense in [0, 999_999], no collisions for n up to ~tens
1217
+ // of thousands.
1218
+ let score = ((i as u64).wrapping_mul(2_654_435_761) % 1_000_000) as f64;
1219
+ let sql = format!("INSERT INTO docs (score) VALUES ({score});");
1220
+ crate::sql::process_command(&sql, &mut db).expect("insert");
1221
+ }
1222
+ db
1223
+ }
1224
+
1225
+ /// Helper: parses an SQL SELECT into a SelectQuery so we can drive
1226
+ /// `select_topk` / `sort_rowids` directly without the rest of the
1227
+ /// process_command pipeline.
1228
+ fn parse_select(sql: &str) -> SelectQuery {
1229
+ let dialect = SQLiteDialect {};
1230
+ let mut ast = Parser::parse_sql(&dialect, sql).expect("parse");
1231
+ let stmt = ast.pop().expect("one statement");
1232
+ SelectQuery::new(&stmt).expect("select-query")
1233
+ }
1234
+
1235
+ #[test]
1236
+ fn topk_matches_full_sort_asc() {
1237
+ // Build N=200, top-k=10. Bounded heap output must equal
1238
+ // full-sort-then-truncate output (both produce ASC order).
1239
+ let db = seed_score_table(200);
1240
+ let table = db.get_table("docs".to_string()).unwrap();
1241
+ let q = parse_select("SELECT * FROM docs ORDER BY score ASC LIMIT 10;");
1242
+ let order = q.order_by.as_ref().unwrap();
1243
+ let all_rowids = table.rowids();
1244
+
1245
+ // Full-sort path
1246
+ let mut full = all_rowids.clone();
1247
+ sort_rowids(&mut full, table, order).unwrap();
1248
+ full.truncate(10);
1249
+
1250
+ // Bounded-heap path
1251
+ let topk = select_topk(&all_rowids, table, order, 10).unwrap();
1252
+
1253
+ assert_eq!(topk, full, "top-k via heap should match full-sort+truncate");
1254
+ }
1255
+
1256
+ #[test]
1257
+ fn topk_matches_full_sort_desc() {
1258
+ // Same with DESC — verifies the direction-aware Ord wrapper.
1259
+ let db = seed_score_table(200);
1260
+ let table = db.get_table("docs".to_string()).unwrap();
1261
+ let q = parse_select("SELECT * FROM docs ORDER BY score DESC LIMIT 10;");
1262
+ let order = q.order_by.as_ref().unwrap();
1263
+ let all_rowids = table.rowids();
1264
+
1265
+ let mut full = all_rowids.clone();
1266
+ sort_rowids(&mut full, table, order).unwrap();
1267
+ full.truncate(10);
1268
+
1269
+ let topk = select_topk(&all_rowids, table, order, 10).unwrap();
1270
+
1271
+ assert_eq!(
1272
+ topk, full,
1273
+ "top-k DESC via heap should match full-sort+truncate"
1274
+ );
1275
+ }
1276
+
1277
+ #[test]
1278
+ fn topk_k_larger_than_n_returns_everything_sorted() {
1279
+ // The executor branches off to the full-sort path when k >= N,
1280
+ // but if a caller invokes select_topk directly with k > N, it
1281
+ // should still produce all-sorted output (no truncation
1282
+ // because we don't have N items to truncate to k).
1283
+ let db = seed_score_table(50);
1284
+ let table = db.get_table("docs".to_string()).unwrap();
1285
+ let q = parse_select("SELECT * FROM docs ORDER BY score ASC LIMIT 1000;");
1286
+ let order = q.order_by.as_ref().unwrap();
1287
+ let topk = select_topk(&table.rowids(), table, order, 1000).unwrap();
1288
+ assert_eq!(topk.len(), 50);
1289
+ // All scores in ascending order.
1290
+ let scores: Vec<f64> = topk
1291
+ .iter()
1292
+ .filter_map(|r| match table.get_value("score", *r) {
1293
+ Some(Value::Real(f)) => Some(f),
1294
+ _ => None,
1295
+ })
1296
+ .collect();
1297
+ assert!(scores.windows(2).all(|w| w[0] <= w[1]));
1298
+ }
1299
+
1300
+ #[test]
1301
+ fn topk_k_zero_returns_empty() {
1302
+ let db = seed_score_table(10);
1303
+ let table = db.get_table("docs".to_string()).unwrap();
1304
+ let q = parse_select("SELECT * FROM docs ORDER BY score ASC LIMIT 1;");
1305
+ let order = q.order_by.as_ref().unwrap();
1306
+ let topk = select_topk(&table.rowids(), table, order, 0).unwrap();
1307
+ assert!(topk.is_empty());
1308
+ }
1309
+
1310
+ #[test]
1311
+ fn topk_empty_input_returns_empty() {
1312
+ let db = seed_score_table(0);
1313
+ let table = db.get_table("docs".to_string()).unwrap();
1314
+ let q = parse_select("SELECT * FROM docs ORDER BY score ASC LIMIT 5;");
1315
+ let order = q.order_by.as_ref().unwrap();
1316
+ let topk = select_topk(&[], table, order, 5).unwrap();
1317
+ assert!(topk.is_empty());
1318
+ }
1319
+
1320
+ #[test]
1321
+ fn topk_works_through_select_executor_with_distance_function() {
1322
+ // Integration check that the executor actually picks the
1323
+ // bounded-heap path on a KNN-shaped query and produces the
1324
+ // correct top-k.
1325
+ let mut db = Database::new("tempdb".to_string());
1326
+ crate::sql::process_command(
1327
+ "CREATE TABLE docs (id INTEGER PRIMARY KEY, e VECTOR(2));",
1328
+ &mut db,
1329
+ )
1330
+ .unwrap();
1331
+ // Five rows with distinct distances from probe [1.0, 0.0]:
1332
+ // id=1 [1.0, 0.0] distance=0
1333
+ // id=2 [2.0, 0.0] distance=1
1334
+ // id=3 [0.0, 3.0] distance=√(1+9) = √10 ≈ 3.16
1335
+ // id=4 [1.0, 4.0] distance=4
1336
+ // id=5 [10.0, 10.0] distance=√(81+100) ≈ 13.45
1337
+ for v in &[
1338
+ "[1.0, 0.0]",
1339
+ "[2.0, 0.0]",
1340
+ "[0.0, 3.0]",
1341
+ "[1.0, 4.0]",
1342
+ "[10.0, 10.0]",
1343
+ ] {
1344
+ crate::sql::process_command(&format!("INSERT INTO docs (e) VALUES ({v});"), &mut db)
1345
+ .unwrap();
1346
+ }
1347
+ let resp = crate::sql::process_command(
1348
+ "SELECT id FROM docs ORDER BY vec_distance_l2(e, [1.0, 0.0]) ASC LIMIT 3;",
1349
+ &mut db,
1350
+ )
1351
+ .unwrap();
1352
+ // Top-3 closest to [1.0, 0.0] are id=1, id=2, id=3 (in that order).
1353
+ // The status message tells us how many rows came back.
1354
+ assert!(resp.contains("3 rows returned"), "got: {resp}");
1355
+ }
1356
+
1357
+ /// Manual benchmark — not run by default. Recommended invocation:
1358
+ ///
1359
+ /// cargo test -p sqlrite-engine --lib topk_benchmark --release \
1360
+ /// -- --ignored --nocapture
1361
+ ///
1362
+ /// (`--release` matters: Rust's optimized sort gets very fast under
1363
+ /// optimization, so the heap's relative advantage is best observed
1364
+ /// against a sort that's also been optimized.)
1365
+ ///
1366
+ /// Measured numbers on an Apple Silicon laptop with N=10_000 + k=10:
1367
+ /// - bounded heap: ~820µs
1368
+ /// - full sort+trunc: ~1.5ms
1369
+ /// - ratio: ~1.8×
1370
+ ///
1371
+ /// The advantage is real but moderate at this size because the sort
1372
+ /// key here is a single REAL column read (cheap) and Rust's sort_by
1373
+ /// has a very low constant factor. The asymptotic O(N log k) vs
1374
+ /// O(N log N) advantage scales with N and with per-row work — KNN
1375
+ /// queries where the sort key is `vec_distance_l2(col, [...])` are
1376
+ /// where this path really pays off, because each key evaluation is
1377
+ /// itself O(dim) and the heap path skips the per-row evaluation
1378
+ /// in the comparator (see `sort_rowids` for the contrast).
1379
+ #[test]
1380
+ #[ignore]
1381
+ fn topk_benchmark() {
1382
+ use std::time::Instant;
1383
+ const N: usize = 10_000;
1384
+ const K: usize = 10;
1385
+
1386
+ let db = seed_score_table(N);
1387
+ let table = db.get_table("docs".to_string()).unwrap();
1388
+ let q = parse_select("SELECT * FROM docs ORDER BY score ASC LIMIT 10;");
1389
+ let order = q.order_by.as_ref().unwrap();
1390
+ let all_rowids = table.rowids();
1391
+
1392
+ // Time bounded heap.
1393
+ let t0 = Instant::now();
1394
+ let _topk = select_topk(&all_rowids, table, order, K).unwrap();
1395
+ let heap_dur = t0.elapsed();
1396
+
1397
+ // Time full sort + truncate.
1398
+ let t1 = Instant::now();
1399
+ let mut full = all_rowids.clone();
1400
+ sort_rowids(&mut full, table, order).unwrap();
1401
+ full.truncate(K);
1402
+ let sort_dur = t1.elapsed();
1403
+
1404
+ let ratio = sort_dur.as_secs_f64() / heap_dur.as_secs_f64().max(1e-9);
1405
+ println!("\n--- topk_benchmark (N={N}, k={K}) ---");
1406
+ println!(" bounded heap: {heap_dur:?}");
1407
+ println!(" full sort+trunc: {sort_dur:?}");
1408
+ println!(" speedup ratio: {ratio:.2}×");
1409
+
1410
+ // Soft assertion. Floor is 1.4× because the cheap-key
1411
+ // benchmark hovers around 1.8× empirically; setting this too
1412
+ // close to the measured value risks flaky CI on slower
1413
+ // runners. Floor of 1.4× still catches an actual regression
1414
+ // (e.g., if select_topk became O(N²) or stopped using the
1415
+ // heap entirely).
1416
+ assert!(
1417
+ ratio > 1.4,
1418
+ "bounded heap should be substantially faster than full sort, but ratio = {ratio:.2}"
1419
+ );
1420
+ }
1066
1421
  }
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes