sqlite-muninn 0.1.0__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,43 @@
1
+ """sqlite-muninn: HNSW vector search + graph traversal + Node2Vec for SQLite.
2
+
3
+ Zero-dependency C11 SQLite extension. Three subsystems in one .load:
4
+ HNSW approximate nearest neighbor search, graph traversal TVFs, and Node2Vec.
5
+ """
6
+
7
+ import importlib.metadata
8
+ import pathlib
9
+ import sqlite3
10
+
11
# Directory containing this package; used to locate the bundled extension binary.
_PKG_DIR = pathlib.Path(__file__).parent

# Distribution metadata is absent when the package is imported from a source
# checkout that was never installed, so fall back to a placeholder version
# instead of letting the import itself fail.
try:
    __version__ = importlib.metadata.version("sqlite-muninn")
except importlib.metadata.PackageNotFoundError:
    __version__ = "0.0.0+unknown"
14
+
15
+
16
def loadable_path() -> str:
    """Return the path to the muninn loadable extension, without its file suffix.

    SQLite's load_extension() appends the platform suffix (.so/.dylib/.dll)
    itself, so the returned path deliberately omits it.

    Two locations are checked, in order:

    1. The package directory (wheel install: binary sits next to this module).
    2. The ``build`` directory beside the package directory (development /
       git install).

    Returns:
        The extension path as a string, ending in ``muninn`` with no suffix.

    Raises:
        FileNotFoundError: If neither location contains a muninn binary.
    """
    # Only genuine shared-library suffixes count. A bare "muninn.*" glob would
    # also accept unrelated files with the same stem (e.g. a stray muninn.lib
    # or muninn.pdb) and hand SQLite a path it cannot load.
    binary_suffixes = (".so", ".dylib", ".dll")
    search_dirs = (_PKG_DIR, _PKG_DIR.parent / "build")

    for directory in search_dirs:
        if any((directory / f"muninn{suffix}").is_file() for suffix in binary_suffixes):
            return str(directory / "muninn")

    raise FileNotFoundError("muninn extension not found. Build it with: make all")
33
+
34
+
35
def load(conn: sqlite3.Connection) -> None:
    """Register the muninn extension on an open SQLite connection.

    Extension loading has to be switched on by the caller beforehand,
    for example::

        conn.enable_load_extension(True)
        sqlite_muninn.load(conn)
        conn.enable_load_extension(False)
    """
    extension_path = loadable_path()
    conn.load_extension(extension_path)
Binary file
@@ -0,0 +1,406 @@
1
+ Metadata-Version: 2.4
2
+ Name: sqlite-muninn
3
+ Version: 0.1.0
4
+ Summary: HNSW vector search + graph traversal + Node2Vec for SQLite
5
+ License-Expression: MIT
6
+ Project-URL: Homepage, https://github.com/user/sqlite-muninn
7
+ Project-URL: Repository, https://github.com/user/sqlite-muninn
8
+ Keywords: sqlite,vector,hnsw,graph,node2vec,search
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: Programming Language :: C
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Topic :: Database
14
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
15
+ Requires-Python: <3.14,>=3.12
16
+ Description-Content-Type: text/markdown
17
+ License-File: LICENSE
18
+ Dynamic: license-file
19
+
20
+ # sqlite-muninn
21
+
22
+ <div align="center">
23
+ <img src="https://joshpeak.net/sqlite-muninn/assets/muninn_logo_transparent.png" alt="Muninn Raven Logo" width=480px/>
24
+ <p><i>Odin's mythic <a href="https://en.wikipedia.org/wiki/Huginn_and_Muninn">raven of Memory</a>.</i></p>
25
+ </div>
26
+
27
+
28
+
29
+ A zero-dependency C extension for SQLite that adds an advanced collection of knowledge-graph primitives: vector similarity search, HNSW indexes, graph database queries, community detection, and Node2Vec embeddings.
30
+
31
+ **[Documentation](https://neozenith.github.io/sqlite-muninn/)** | **[GitHub](https://github.com/neozenith/sqlite-muninn)**
32
+
33
+
34
+ ## Features
35
+
36
+ - **HNSW Vector Index** &mdash; O(log N) approximate nearest neighbor search with incremental insert/delete
37
+ - **Graph Traversal** &mdash; BFS, DFS, shortest path, connected components, PageRank on any edge table
38
+ - **Centrality Measures** &mdash; Degree, betweenness (Brandes), and closeness centrality with weighted/temporal support
39
+ - **Community Detection** &mdash; Leiden algorithm for discovering graph communities with modularity scoring
40
+ - **Node2Vec** &mdash; Learn structural node embeddings from graph topology, store in HNSW for similarity search
41
+ - **Zero dependencies** &mdash; Pure C11, compiles to a single `.dylib`/`.so`/`.dll`
42
+ - **SIMD accelerated** &mdash; ARM NEON and x86 SSE distance functions
43
+
44
+ ## Build
45
+
46
+ Requires SQLite development headers and a C11 compiler.
47
+
48
+ ```bash
49
+ # macOS (Homebrew SQLite recommended)
50
+ brew install sqlite
51
+ make all
52
+
53
+ # Linux
54
+ sudo apt-get install libsqlite3-dev
55
+ make all
56
+
57
+ # Run tests
58
+ make test # C unit tests
59
+ make test-python # Python integration tests
60
+ make test-all # Both
61
+ ```
62
+
63
+ ## Quick Start
64
+
65
+ ```sql
66
+ .load ./muninn
67
+
68
+ -- Create an HNSW vector index
69
+ CREATE VIRTUAL TABLE my_vectors USING hnsw_index(
70
+ dimensions=384, metric='cosine', m=16, ef_construction=200
71
+ );
72
+
73
+ -- Insert vectors
74
+ INSERT INTO my_vectors (rowid, vector) VALUES (1, ?); -- 384-dim float32 blob
75
+
76
+ -- KNN search
77
+ SELECT rowid, distance FROM my_vectors
78
+ WHERE vector MATCH ?query AND k = 10 AND ef_search = 64;
79
+
80
+ -- Graph traversal on any edge table
81
+ SELECT node, depth FROM graph_bfs
82
+ WHERE edge_table = 'friendships' AND src_col = 'user_a'
83
+ AND dst_col = 'user_b' AND start_node = 'alice' AND max_depth = 3
84
+ AND direction = 'both';
85
+
86
+ -- Connected components
87
+ SELECT node, component_id, component_size FROM graph_components
88
+ WHERE edge_table = 'friendships' AND src_col = 'user_a' AND dst_col = 'user_b';
89
+
90
+ -- PageRank
91
+ SELECT node, rank FROM graph_pagerank
92
+ WHERE edge_table = 'citations' AND src_col = 'citing' AND dst_col = 'cited'
93
+ AND damping = 0.85 AND iterations = 20;
94
+
95
+ -- Betweenness centrality (find bridge nodes)
96
+ SELECT node, centrality FROM graph_betweenness
97
+ WHERE edge_table = 'friendships' AND src_col = 'user_a' AND dst_col = 'user_b'
98
+ AND direction = 'both'
99
+ ORDER BY centrality DESC LIMIT 10;
100
+
101
+ -- Community detection (Leiden algorithm)
102
+ SELECT node, community_id, modularity FROM graph_leiden
103
+ WHERE edge_table = 'friendships' AND src_col = 'user_a' AND dst_col = 'user_b';
104
+
105
+ -- Learn structural embeddings from graph topology
106
+ SELECT node2vec_train(
107
+ 'friendships', 'user_a', 'user_b', 'my_vectors',
108
+ 64, 1.0, 1.0, 10, 80, 5, 5, 0.025, 5
109
+ );
110
+ ```
111
+
112
+ ## Examples
113
+
114
+ Self-contained examples in the [`examples/`](examples/) directory:
115
+
116
+ | Example | Demonstrates |
117
+ |---------|-------------|
118
+ | [Semantic Search](examples/semantic_search/) | HNSW index, KNN queries, point lookup, delete |
119
+ | [Movie Recommendations](examples/movie_recommendations/) | Vector similarity for content-based recommendations |
120
+ | [Social Network](examples/social_network/) | Graph TVFs on a social graph (BFS, components, PageRank) |
121
+ | [Research Papers](examples/research_papers/) | Citation graph analysis with Node2Vec embeddings |
122
+ | [Transit Routes](examples/transit_routes/) | Shortest path and graph traversal on route networks |
123
+
124
+ ```bash
125
+ make all
126
+ python examples/semantic_search/example.py
127
+ ```
128
+
129
+ ## API Reference
130
+
131
+ ### HNSW Virtual Table (`hnsw_index`)
132
+
133
+ ```sql
134
+ CREATE VIRTUAL TABLE name USING hnsw_index(
135
+ dimensions=N, -- vector dimensionality (required)
136
+ metric='l2', -- 'l2' | 'cosine' | 'inner_product'
137
+ m=16, -- max connections per node per layer
138
+ ef_construction=200 -- beam width during index construction
139
+ );
140
+ ```
141
+
142
+ **Columns:**
143
+
144
+ | Column | Type | Hidden | Description |
145
+ |--------|------|--------|-------------|
146
+ | `rowid` | INTEGER | Yes | User-assigned ID for joining with application tables |
147
+ | `vector` | BLOB | No | `float32[dim]` &mdash; input for INSERT, MATCH constraint for search |
148
+ | `distance` | REAL | No | Computed distance (output only, during search) |
149
+ | `k` | INTEGER | Yes | Top-k parameter (search constraint) |
150
+ | `ef_search` | INTEGER | Yes | Search beam width (search constraint) |
151
+
152
+ **Operations:**
153
+
154
+ ```sql
155
+ -- Insert
156
+ INSERT INTO t (rowid, vector) VALUES (42, ?blob);
157
+
158
+ -- KNN search
159
+ SELECT rowid, distance FROM t WHERE vector MATCH ?query AND k = 10;
160
+
161
+ -- Point lookup
162
+ SELECT vector FROM t WHERE rowid = 42;
163
+
164
+ -- Delete (with automatic neighbor reconnection)
165
+ DELETE FROM t WHERE rowid = 42;
166
+
167
+ -- Drop (removes index and all shadow tables)
168
+ DROP TABLE t;
169
+ ```
170
+
171
+ **Shadow tables** (auto-managed):
172
+ - `{name}_config` &mdash; HNSW parameters
173
+ - `{name}_nodes` &mdash; stored vectors and level assignments
174
+ - `{name}_edges` &mdash; the proximity graph (usable by graph TVFs)
175
+
176
+ ### Graph Table-Valued Functions
177
+
178
+ All graph TVFs work on **any** existing SQLite table with source/target columns. Table and column names are validated against SQL injection.
179
+
180
+ #### `graph_bfs` / `graph_dfs`
181
+
182
+ Breadth-first or depth-first traversal from a start node.
183
+
184
+ ```sql
185
+ SELECT node, depth, parent FROM graph_bfs
186
+ WHERE edge_table = 'edges'
187
+ AND src_col = 'src'
188
+ AND dst_col = 'dst'
189
+ AND start_node = 'node-42'
190
+ AND max_depth = 5
191
+ AND direction = 'forward'; -- 'forward' | 'reverse' | 'both'
192
+ ```
193
+
194
+ | Output Column | Type | Description |
195
+ |---------------|------|-------------|
196
+ | `node` | TEXT | Node identifier |
197
+ | `depth` | INTEGER | Hop distance from start |
198
+ | `parent` | TEXT | Parent node in traversal tree (NULL for start) |
199
+
200
+ #### `graph_shortest_path`
201
+
202
+ Unweighted (BFS) or weighted (Dijkstra) shortest path.
203
+
204
+ ```sql
205
+ -- Unweighted
206
+ SELECT node, distance, path_order FROM graph_shortest_path
207
+ WHERE edge_table = 'edges' AND src_col = 'src' AND dst_col = 'dst'
208
+ AND start_node = 'A' AND end_node = 'Z' AND weight_col IS NULL;
209
+
210
+ -- Weighted (Dijkstra)
211
+ SELECT node, distance, path_order FROM graph_shortest_path
212
+ WHERE edge_table = 'edges' AND src_col = 'src' AND dst_col = 'dst'
213
+ AND start_node = 'A' AND end_node = 'Z' AND weight_col = 'weight';
214
+ ```
215
+
216
+ | Output Column | Type | Description |
217
+ |---------------|------|-------------|
218
+ | `node` | TEXT | Node on the path |
219
+ | `distance` | REAL | Cumulative distance from start |
220
+ | `path_order` | INTEGER | Position in path (0-indexed) |
221
+
222
+ #### `graph_components`
223
+
224
+ Connected components via Union-Find with path compression.
225
+
226
+ ```sql
227
+ SELECT node, component_id, component_size FROM graph_components
228
+ WHERE edge_table = 'edges' AND src_col = 'src' AND dst_col = 'dst';
229
+ ```
230
+
231
+ | Output Column | Type | Description |
232
+ |---------------|------|-------------|
233
+ | `node` | TEXT | Node identifier |
234
+ | `component_id` | INTEGER | Component index (0-based) |
235
+ | `component_size` | INTEGER | Number of nodes in this component |
236
+
237
+ #### `graph_pagerank`
238
+
239
+ Iterative power method PageRank with configurable damping and iterations.
240
+
241
+ ```sql
242
+ SELECT node, rank FROM graph_pagerank
243
+ WHERE edge_table = 'edges' AND src_col = 'src' AND dst_col = 'dst'
244
+ AND damping = 0.85 -- optional, default 0.85
245
+ AND iterations = 20; -- optional, default 20
246
+ ```
247
+
248
+ | Output Column | Type | Description |
249
+ |---------------|------|-------------|
250
+ | `node` | TEXT | Node identifier |
251
+ | `rank` | REAL | PageRank score (sums to ~1.0) |
252
+
253
+ #### `graph_degree`
254
+
255
+ Degree centrality for all nodes.
256
+
257
+ ```sql
258
+ SELECT node, in_degree, out_degree, degree, centrality FROM graph_degree
259
+ WHERE edge_table = 'edges' AND src_col = 'src' AND dst_col = 'dst';
260
+ ```
261
+
262
+ | Output Column | Type | Description |
263
+ |---------------|------|-------------|
264
+ | `node` | TEXT | Node identifier |
265
+ | `in_degree` | REAL | Count (or weighted sum) of incoming edges |
266
+ | `out_degree` | REAL | Count (or weighted sum) of outgoing edges |
267
+ | `degree` | REAL | Total degree (in + out) |
268
+ | `centrality` | REAL | Normalized degree centrality |
269
+
270
+ Optional constraints: `weight_col`, `direction`, `normalized`, `timestamp_col`, `time_start`, `time_end`.
271
+
272
+ #### `graph_betweenness`
273
+
274
+ Betweenness centrality via Brandes' O(VE) algorithm.
275
+
276
+ ```sql
277
+ SELECT node, centrality FROM graph_betweenness
278
+ WHERE edge_table = 'edges' AND src_col = 'src' AND dst_col = 'dst'
279
+ AND direction = 'both';
280
+ ```
281
+
282
+ | Output Column | Type | Description |
283
+ |---------------|------|-------------|
284
+ | `node` | TEXT | Node identifier |
285
+ | `centrality` | REAL | Betweenness centrality score |
286
+
287
+ Optional constraints: `weight_col`, `direction`, `normalized`, `timestamp_col`, `time_start`, `time_end`.
288
+
289
+ #### `graph_closeness`
290
+
291
+ Closeness centrality with Wasserman-Faust normalization for disconnected graphs.
292
+
293
+ ```sql
294
+ SELECT node, centrality FROM graph_closeness
295
+ WHERE edge_table = 'edges' AND src_col = 'src' AND dst_col = 'dst'
296
+ AND direction = 'both';
297
+ ```
298
+
299
+ | Output Column | Type | Description |
300
+ |---------------|------|-------------|
301
+ | `node` | TEXT | Node identifier |
302
+ | `centrality` | REAL | Closeness centrality score |
303
+
304
+ Optional constraints: `weight_col`, `direction`, `timestamp_col`, `time_start`, `time_end`.
305
+
306
+ #### `graph_leiden`
307
+
308
+ Community detection via the Leiden algorithm (Traag et al., 2019).
309
+
310
+ ```sql
311
+ SELECT node, community_id, modularity FROM graph_leiden
312
+ WHERE edge_table = 'edges' AND src_col = 'src' AND dst_col = 'dst';
313
+ ```
314
+
315
+ | Output Column | Type | Description |
316
+ |---------------|------|-------------|
317
+ | `node` | TEXT | Node identifier |
318
+ | `community_id` | INTEGER | Community assignment (0-based) |
319
+ | `modularity` | REAL | Global modularity score of the partition |
320
+
321
+ Optional constraints: `weight_col`, `resolution` (default 1.0), `timestamp_col`, `time_start`, `time_end`.
322
+
323
+ ### `node2vec_train()`
324
+
325
+ Learn vector embeddings from graph structure using biased random walks (Node2Vec) and Skip-gram with Negative Sampling (SGNS).
326
+
327
+ ```sql
328
+ SELECT node2vec_train(
329
+ edge_table, -- name of edge table
330
+ src_col, -- source column name
331
+ dst_col, -- destination column name
332
+ output_table, -- HNSW table to store embeddings (must exist)
333
+ dimensions, -- embedding size (must match HNSW table)
334
+ p, -- return parameter (1.0 = uniform/DeepWalk)
335
+ q, -- in-out parameter (1.0 = uniform/DeepWalk)
336
+ num_walks, -- walks per node
337
+ walk_length, -- max steps per walk
338
+ window_size, -- SGNS context window
339
+ negative_samples, -- negative samples per positive
340
+ learning_rate, -- initial learning rate (decays linearly)
341
+ epochs -- training epochs
342
+ );
343
+ -- Returns: number of nodes embedded
344
+ ```
345
+
346
+ **p, q parameter guide:**
347
+
348
+ | Setting | Walk Behavior | Best For |
349
+ |---------|--------------|----------|
350
+ | p=1, q=1 | Uniform (DeepWalk) | General structural similarity |
351
+ | Low p (0.25) | BFS-like, stays local | Community/cluster detection |
352
+ | Low q (0.5) | DFS-like, explores far | Structural role similarity |
353
+
354
+ ## Benchmarks
355
+
356
+ The project includes a comprehensive benchmark suite comparing muninn against other SQLite extensions across real-world workloads.
357
+
358
+ **Vector search** benchmarks compare against [sqlite-vector](https://github.com/nicepkg/sqlite-vector), [sqlite-vec](https://github.com/asg017/sqlite-vec), and [vectorlite](https://github.com/nicepkg/vectorlite) using 3 embedding models (MiniLM, MPNet, BGE-Large) and 2 text datasets (AG News, Wealth of Nations) at scales up to 250K vectors.
359
+
360
+ **Graph traversal** benchmarks compare muninn TVFs against recursive CTEs and [GraphQLite](https://github.com/nicepkg/graphqlite) on synthetic graphs (Erdos-Renyi, Barabasi-Albert) at scales up to 100K nodes.
361
+
362
+ Results include interactive Plotly charts for insert throughput, search latency, recall, database size, and tipping-point analysis. See the [full benchmark results](https://neozenith.github.io/sqlite-muninn/benchmarks/) on the documentation site.
363
+
364
+ ```bash
365
+ make -C benchmarks help # List all benchmark targets
366
+ make -C benchmarks analyze # Generate charts and reports from existing results
367
+ ```
368
+
369
+ ## Project Structure
370
+
371
+ ```
372
+ src/ C11 source (extension entry point, HNSW, graph TVFs, Node2Vec)
373
+ test/ C unit tests (custom minimal framework)
374
+ pytests/ Python integration tests (pytest)
375
+ examples/ Self-contained usage examples
376
+ benchmarks/
377
+ scripts/ Benchmark runners and analysis scripts
378
+ charts/ Plotly JSON chart specs (committed for docs site)
379
+ results/ JSONL benchmark data (generated, not committed)
380
+ docs/ MkDocs documentation source
381
+ ```
382
+
383
+ ## Documentation
384
+
385
+ Full documentation is published at **[neozenith.github.io/sqlite-muninn](https://neozenith.github.io/sqlite-muninn/)** via MkDocs Material with interactive Plotly charts.
386
+
387
+ ```bash
388
+ make docs-serve # Local dev server with live reload
389
+ make docs-build # Build static site
390
+ ```
391
+
392
+ ## Research References
393
+
394
+ | Feature | Paper |
395
+ |---------|-------|
396
+ | HNSW | Malkov & Yashunin, TPAMI 2020 |
397
+ | MN-RU insert repair | arXiv:2407.07871, 2024 |
398
+ | Patience early termination | SISAP 2025 |
399
+ | Betweenness centrality | Brandes, J. Math. Sociol. 2001 |
400
+ | Leiden community detection | Traag, Waltman & van Eck, Sci. Rep. 2019 |
401
+ | Node2Vec | Grover & Leskovec, KDD 2016 |
402
+ | SGNS | Mikolov et al., 2013 |
403
+
404
+ ## License
405
+
406
+ MIT. See [LICENSE](LICENSE).
@@ -0,0 +1,7 @@
1
+ sqlite_muninn/__init__.py,sha256=ZwYXrogu4GCeWaRzL5YzQ-kChpgyj14b8Zg5RCE-p-o,1465
2
+ sqlite_muninn/muninn.dll,sha256=GI1pcQvQEGw73qUe15fFlzjQm6oBDs60sHNiiGJxm2o,208384
3
+ sqlite_muninn-0.1.0.dist-info/licenses/LICENSE,sha256=QyyYwcmZwbq4FumXkfaKrlEDCv-bgsCNGVWuCkIlZCc,1107
4
+ sqlite_muninn-0.1.0.dist-info/METADATA,sha256=sK9jGD14-qBljsoMzZA5inZiyREEpBtvhTv_v5wvhxA,14827
5
+ sqlite_muninn-0.1.0.dist-info/WHEEL,sha256=I7N5ZCUvoGccb6ODwz-ZRxcDg9ip88rBxOm2U6c41XQ,97
6
+ sqlite_muninn-0.1.0.dist-info/top_level.txt,sha256=bHCxOUoDiepH3tv0QWQA25FwkMccry8BHRtOb2Ncae0,14
7
+ sqlite_muninn-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-win_amd64
5
+
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 sqlite-vec-graph contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1 @@
1
+ sqlite_muninn