sqlitesearch 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sqlitesearch-0.0.1/.github/workflows/tests.yml +38 -0
- sqlitesearch-0.0.1/.gitignore +207 -0
- sqlitesearch-0.0.1/CLAUDE.md +68 -0
- sqlitesearch-0.0.1/Makefile +25 -0
- sqlitesearch-0.0.1/PKG-INFO +200 -0
- sqlitesearch-0.0.1/README.md +181 -0
- sqlitesearch-0.0.1/notebooks/faq_search.ipynb +250 -0
- sqlitesearch-0.0.1/notebooks/faq_search.py +105 -0
- sqlitesearch-0.0.1/plan.md +103 -0
- sqlitesearch-0.0.1/pyproject.toml +56 -0
- sqlitesearch-0.0.1/sqlitesearch/__init__.py +16 -0
- sqlitesearch-0.0.1/sqlitesearch/__version__.py +1 -0
- sqlitesearch-0.0.1/sqlitesearch/text/__init__.py +9 -0
- sqlitesearch-0.0.1/sqlitesearch/text/fts.py +356 -0
- sqlitesearch-0.0.1/sqlitesearch/vector/__init__.py +10 -0
- sqlitesearch-0.0.1/sqlitesearch/vector/lsh.py +503 -0
- sqlitesearch-0.0.1/tests/__init__.py +1 -0
- sqlitesearch-0.0.1/tests/test_integration.py +301 -0
- sqlitesearch-0.0.1/tests/test_performance.py +422 -0
- sqlitesearch-0.0.1/tests/test_text_search.py +518 -0
- sqlitesearch-0.0.1/tests/test_vector_search.py +502 -0
- sqlitesearch-0.0.1/uv.lock +1705 -0
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
name: Tests
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
workflow_dispatch:
|
|
9
|
+
|
|
10
|
+
jobs:
|
|
11
|
+
test:
|
|
12
|
+
runs-on: ubuntu-latest
|
|
13
|
+
strategy:
|
|
14
|
+
matrix:
|
|
15
|
+
python-version: ['3.10', '3.11', '3.12', '3.13']
|
|
16
|
+
|
|
17
|
+
steps:
|
|
18
|
+
- uses: actions/checkout@v4
|
|
19
|
+
|
|
20
|
+
- name: Set up Python ${{ matrix.python-version }}
|
|
21
|
+
uses: actions/setup-python@v5
|
|
22
|
+
with:
|
|
23
|
+
python-version: ${{ matrix.python-version }}
|
|
24
|
+
|
|
25
|
+
- name: Install uv
|
|
26
|
+
run: curl -LsSf https://astral.sh/uv/install.sh | sh
|
|
27
|
+
|
|
28
|
+
- name: Install dependencies
|
|
29
|
+
run: uv sync --dev
|
|
30
|
+
|
|
31
|
+
- name: Run tests with coverage
|
|
32
|
+
run: uv run pytest --cov=sqlitesearch --cov-report=xml --cov-report=term-missing
|
|
33
|
+
|
|
34
|
+
- name: Upload coverage to Codecov
|
|
35
|
+
uses: codecov/codecov-action@v4
|
|
36
|
+
with:
|
|
37
|
+
file: ./coverage.xml
|
|
38
|
+
fail_ci_if_error: false
|
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[codz]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# C extensions
|
|
7
|
+
*.so
|
|
8
|
+
|
|
9
|
+
# Distribution / packaging
|
|
10
|
+
.Python
|
|
11
|
+
build/
|
|
12
|
+
develop-eggs/
|
|
13
|
+
dist/
|
|
14
|
+
downloads/
|
|
15
|
+
eggs/
|
|
16
|
+
.eggs/
|
|
17
|
+
lib/
|
|
18
|
+
lib64/
|
|
19
|
+
parts/
|
|
20
|
+
sdist/
|
|
21
|
+
var/
|
|
22
|
+
wheels/
|
|
23
|
+
share/python-wheels/
|
|
24
|
+
*.egg-info/
|
|
25
|
+
.installed.cfg
|
|
26
|
+
*.egg
|
|
27
|
+
MANIFEST
|
|
28
|
+
|
|
29
|
+
# PyInstaller
|
|
30
|
+
# Usually these files are written by a python script from a template
|
|
31
|
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
|
32
|
+
*.manifest
|
|
33
|
+
*.spec
|
|
34
|
+
|
|
35
|
+
# Installer logs
|
|
36
|
+
pip-log.txt
|
|
37
|
+
pip-delete-this-directory.txt
|
|
38
|
+
|
|
39
|
+
# Unit test / coverage reports
|
|
40
|
+
htmlcov/
|
|
41
|
+
.tox/
|
|
42
|
+
.nox/
|
|
43
|
+
.coverage
|
|
44
|
+
.coverage.*
|
|
45
|
+
.cache
|
|
46
|
+
nosetests.xml
|
|
47
|
+
coverage.xml
|
|
48
|
+
*.cover
|
|
49
|
+
*.py.cover
|
|
50
|
+
.hypothesis/
|
|
51
|
+
.pytest_cache/
|
|
52
|
+
cover/
|
|
53
|
+
|
|
54
|
+
# Translations
|
|
55
|
+
*.mo
|
|
56
|
+
*.pot
|
|
57
|
+
|
|
58
|
+
# Django stuff:
|
|
59
|
+
*.log
|
|
60
|
+
local_settings.py
|
|
61
|
+
db.sqlite3
|
|
62
|
+
db.sqlite3-journal
|
|
63
|
+
|
|
64
|
+
# Flask stuff:
|
|
65
|
+
instance/
|
|
66
|
+
.webassets-cache
|
|
67
|
+
|
|
68
|
+
# Scrapy stuff:
|
|
69
|
+
.scrapy
|
|
70
|
+
|
|
71
|
+
# Sphinx documentation
|
|
72
|
+
docs/_build/
|
|
73
|
+
|
|
74
|
+
# PyBuilder
|
|
75
|
+
.pybuilder/
|
|
76
|
+
target/
|
|
77
|
+
|
|
78
|
+
# Jupyter Notebook
|
|
79
|
+
.ipynb_checkpoints
|
|
80
|
+
|
|
81
|
+
# IPython
|
|
82
|
+
profile_default/
|
|
83
|
+
ipython_config.py
|
|
84
|
+
|
|
85
|
+
# pyenv
|
|
86
|
+
# For a library or package, you might want to ignore these files since the code is
|
|
87
|
+
# intended to run in multiple environments; otherwise, check them in:
|
|
88
|
+
# .python-version
|
|
89
|
+
|
|
90
|
+
# pipenv
|
|
91
|
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
|
92
|
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
|
93
|
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
|
94
|
+
# install all needed dependencies.
|
|
95
|
+
#Pipfile.lock
|
|
96
|
+
|
|
97
|
+
# UV
|
|
98
|
+
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
|
99
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
100
|
+
# commonly ignored for libraries.
|
|
101
|
+
#uv.lock
|
|
102
|
+
|
|
103
|
+
# poetry
|
|
104
|
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
|
105
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
106
|
+
# commonly ignored for libraries.
|
|
107
|
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
|
108
|
+
#poetry.lock
|
|
109
|
+
#poetry.toml
|
|
110
|
+
|
|
111
|
+
# pdm
|
|
112
|
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
|
113
|
+
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
|
|
114
|
+
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
|
|
115
|
+
#pdm.lock
|
|
116
|
+
#pdm.toml
|
|
117
|
+
.pdm-python
|
|
118
|
+
.pdm-build/
|
|
119
|
+
|
|
120
|
+
# pixi
|
|
121
|
+
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
|
|
122
|
+
#pixi.lock
|
|
123
|
+
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
|
|
124
|
+
# in the .venv directory. It is recommended not to include this directory in version control.
|
|
125
|
+
.pixi
|
|
126
|
+
|
|
127
|
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
|
128
|
+
__pypackages__/
|
|
129
|
+
|
|
130
|
+
# Celery stuff
|
|
131
|
+
celerybeat-schedule
|
|
132
|
+
celerybeat.pid
|
|
133
|
+
|
|
134
|
+
# SageMath parsed files
|
|
135
|
+
*.sage.py
|
|
136
|
+
|
|
137
|
+
# Environments
|
|
138
|
+
.env
|
|
139
|
+
.envrc
|
|
140
|
+
.venv
|
|
141
|
+
env/
|
|
142
|
+
venv/
|
|
143
|
+
ENV/
|
|
144
|
+
env.bak/
|
|
145
|
+
venv.bak/
|
|
146
|
+
|
|
147
|
+
# Spyder project settings
|
|
148
|
+
.spyderproject
|
|
149
|
+
.spyproject
|
|
150
|
+
|
|
151
|
+
# Rope project settings
|
|
152
|
+
.ropeproject
|
|
153
|
+
|
|
154
|
+
# mkdocs documentation
|
|
155
|
+
/site
|
|
156
|
+
|
|
157
|
+
# mypy
|
|
158
|
+
.mypy_cache/
|
|
159
|
+
.dmypy.json
|
|
160
|
+
dmypy.json
|
|
161
|
+
|
|
162
|
+
# Pyre type checker
|
|
163
|
+
.pyre/
|
|
164
|
+
|
|
165
|
+
# pytype static type analyzer
|
|
166
|
+
.pytype/
|
|
167
|
+
|
|
168
|
+
# Cython debug symbols
|
|
169
|
+
cython_debug/
|
|
170
|
+
|
|
171
|
+
# PyCharm
|
|
172
|
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
|
173
|
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
|
174
|
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
|
175
|
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
|
176
|
+
#.idea/
|
|
177
|
+
|
|
178
|
+
# Abstra
|
|
179
|
+
# Abstra is an AI-powered process automation framework.
|
|
180
|
+
# Ignore directories containing user credentials, local state, and settings.
|
|
181
|
+
# Learn more at https://abstra.io/docs
|
|
182
|
+
.abstra/
|
|
183
|
+
|
|
184
|
+
# Visual Studio Code
|
|
185
|
+
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
|
|
186
|
+
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
|
|
187
|
+
# and can be added to the global gitignore or merged into this file. However, if you prefer,
|
|
188
|
+
# you could uncomment the following to ignore the entire vscode folder
|
|
189
|
+
# .vscode/
|
|
190
|
+
|
|
191
|
+
# Ruff stuff:
|
|
192
|
+
.ruff_cache/
|
|
193
|
+
|
|
194
|
+
# PyPI configuration file
|
|
195
|
+
.pypirc
|
|
196
|
+
|
|
197
|
+
# Cursor
|
|
198
|
+
# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
|
|
199
|
+
# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
|
|
200
|
+
# refer to https://docs.cursor.com/context/ignore-files
|
|
201
|
+
.cursorignore
|
|
202
|
+
.cursorindexingignore
|
|
203
|
+
|
|
204
|
+
# Marimo
|
|
205
|
+
marimo/_static/
|
|
206
|
+
marimo/_lsp/
|
|
207
|
+
__marimo__/
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
## CRITICAL: File Editing on Windows
|
|
2
|
+
|
|
3
|
+
### ⚠️ MANDATORY: Always Use Backslashes on Windows for File Paths
|
|
4
|
+
|
|
5
|
+
When using Edit or MultiEdit tools on Windows, you MUST use backslashes (`\`) in file paths, NOT forward slashes (`/`).
|
|
6
|
+
|
|
7
|
+
❌ WRONG - Will cause errors:
|
|
8
|
+
```
|
|
9
|
+
Edit(file_path: "D:/repos/project/file.tsx", ...)
|
|
10
|
+
MultiEdit(file_path: "D:/repos/project/file.tsx", ...)
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
✅ CORRECT - Always works:
|
|
14
|
+
```
|
|
15
|
+
Edit(file_path: "D:\repos\project\file.tsx", ...)
|
|
16
|
+
MultiEdit(file_path: "D:\repos\project\file.tsx", ...)
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
### ⚠️ "File has been unexpectedly modified" Error
|
|
20
|
+
|
|
21
|
+
If you get this error: **"File has been unexpectedly modified. Read it again before attempting to write it"**
|
|
22
|
+
|
|
23
|
+
**Root cause:** The file was modified after you last read it (by linter, formatter, git, or external process).
|
|
24
|
+
|
|
25
|
+
**Solution: Re-read the file immediately before editing:**
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
# 1. Read the file again
|
|
29
|
+
Read(file_path: "path\to\file.txt")
|
|
30
|
+
|
|
31
|
+
# 2. Then immediately edit
|
|
32
|
+
Edit(file_path: "path\to\file.txt", old_string="...", new_string="...")
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
**Tool requirements:**
|
|
36
|
+
|
|
37
|
+
- Edit - Must `Read` immediately before - `old_string` must match current content
|
|
38
|
+
- Write - Must `Read` once per conversation before first write
|
|
39
|
+
|
|
40
|
+
**Common triggers:**
|
|
41
|
+
- Linters/formatters running on save
|
|
42
|
+
- Git operations (checkout, merge, rebase)
|
|
43
|
+
- File watchers or build processes
|
|
44
|
+
|
|
45
|
+
**Tip:** If this happens repeatedly, consider disabling auto-formatting for files you're actively editing with Claude Code.
|
|
46
|
+
|
|
47
|
+
### ⚠️ Use UV for Python Package Management
|
|
48
|
+
|
|
49
|
+
When installing Python packages, use `uv` instead of `pip`. See `/uv` for details.
|
|
50
|
+
|
|
51
|
+
❌ WRONG:
|
|
52
|
+
```bash
|
|
53
|
+
pip install djangorestframework
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
✅ CORRECT:
|
|
57
|
+
```bash
|
|
58
|
+
cd backend-django
|
|
59
|
+
uv add djangorestframework
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
Run Django commands:
|
|
63
|
+
```bash
|
|
64
|
+
cd backend-django
|
|
65
|
+
uv run python manage.py makemigrations
|
|
66
|
+
uv run python manage.py migrate
|
|
67
|
+
uv run python manage.py test
|
|
68
|
+
```
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
.PHONY: test setup shell coverage publish-build publish-test publish publish-clean
|
|
2
|
+
|
|
3
|
+
test:
|
|
4
|
+
uv run pytest
|
|
5
|
+
|
|
6
|
+
coverage:
|
|
7
|
+
uv run pytest --cov=sqlitesearch --cov-report=term-missing --cov-report=html
|
|
8
|
+
|
|
9
|
+
setup:
|
|
10
|
+
uv sync --dev
|
|
11
|
+
|
|
12
|
+
shell:
|
|
13
|
+
uv shell
|
|
14
|
+
|
|
15
|
+
publish-build:
|
|
16
|
+
uv run hatch build
|
|
17
|
+
|
|
18
|
+
publish-test:
|
|
19
|
+
uv run hatch publish --repo test
|
|
20
|
+
|
|
21
|
+
publish:
|
|
22
|
+
uv run hatch publish
|
|
23
|
+
|
|
24
|
+
publish-clean:
|
|
25
|
+
rm -r dist/
|
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: sqlitesearch
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: A tiny, SQLite-backed search library for small, local projects
|
|
5
|
+
Author: sqlitesearch contributors
|
|
6
|
+
License: MIT
|
|
7
|
+
Keywords: fts,hybrid-search,lsh,search,sqlite,vector-search
|
|
8
|
+
Classifier: Development Status :: 3 - Alpha
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
16
|
+
Requires-Python: >=3.10
|
|
17
|
+
Requires-Dist: numpy>=1.24.0
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
|
|
20
|
+
# sqlitesearch
|
|
21
|
+
|
|
22
|
+
A tiny, SQLite-backed search library for small, local projects. sqlitesearch is a persistent sibling of [minsearch](https://github.com/alexeygrigorev/minsearch) - same API, but stores data on disk.
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
sqlitesearch provides
|
|
26
|
+
|
|
27
|
+
- persistent text search using SQLite FTS5
|
|
28
|
+
- vector search using LSH (random projections) with exact reranking
|
|
29
|
+
- hybrid search (TODO: explain)
|
|
30
|
+
|
|
31
|
+
It stores the index in a single SQLite file, making it perfect for applications
|
|
32
|
+
that need search functionality without running a separate search server.
|
|
33
|
+
|
|
34
|
+
## When to use
|
|
35
|
+
|
|
36
|
+
- minsearch - in-memory, for experiments and notebooks
|
|
37
|
+
- sqlitesearch - persistent (SQLite file), for pet projects and prototypes
|
|
38
|
+
- Postgres/Elasticsearch/Qdrant/etc - production workloads with high traffic
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
## Installation
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
uv add sqlitesearch
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## Text Search
|
|
48
|
+
|
|
49
|
+
Text search uses SQLite's FTS5 (Full-Text Search) extension with BM25 ranking.
|
|
50
|
+
|
|
51
|
+
### Basic usage
|
|
52
|
+
|
|
53
|
+
```python
|
|
54
|
+
from sqlitesearch import TextSearchIndex
|
|
55
|
+
|
|
56
|
+
# Create an index
|
|
57
|
+
index = TextSearchIndex(
|
|
58
|
+
text_fields=["title", "description"],
|
|
59
|
+
keyword_fields=["category"],
|
|
60
|
+
db_path="search.db"
|
|
61
|
+
)
|
|
62
|
+
|
|
63
|
+
# Index some documents
|
|
64
|
+
documents = [
|
|
65
|
+
{"id": 1, "title": "Python Tutorial", "description": "Learn Python basics", "category": "tutorial"},
|
|
66
|
+
{"id": 2, "title": "Java Guide", "description": "Java programming guide", "category": "guide"},
|
|
67
|
+
]
|
|
68
|
+
index.fit(documents)
|
|
69
|
+
|
|
70
|
+
# Search
|
|
71
|
+
results = index.search("python programming")
|
|
72
|
+
for result in results:
|
|
73
|
+
print(result["title"], result["score"])
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
### Filtering
|
|
77
|
+
|
|
78
|
+
```python
|
|
79
|
+
# Filter by keyword fields
|
|
80
|
+
results = index.search(
|
|
81
|
+
"python",
|
|
82
|
+
filter_dict={"category": "tutorial"}
|
|
83
|
+
)
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
### Field boosting
|
|
87
|
+
|
|
88
|
+
```python
|
|
89
|
+
# Boost title matches higher than description
|
|
90
|
+
results = index.search(
|
|
91
|
+
"python",
|
|
92
|
+
boost_dict={"title": 2.0, "description": 1.0}
|
|
93
|
+
)
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
### Stemming
|
|
97
|
+
|
|
98
|
+
Enable Porter stemming for better matching of word variants:
|
|
99
|
+
|
|
100
|
+
```python
|
|
101
|
+
# With stemming, "running" matches "run", "courses" matches "course", etc.
|
|
102
|
+
index = TextSearchIndex(
|
|
103
|
+
text_fields=["title", "description"],
|
|
104
|
+
stemming=True,
|
|
105
|
+
db_path="search.db"
|
|
106
|
+
)
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
By default, stemming is disabled to match minsearch behavior.
|
|
110
|
+
|
|
111
|
+
### Adding documents
|
|
112
|
+
|
|
113
|
+
```python
|
|
114
|
+
# Add documents one by one
|
|
115
|
+
index.add({
|
|
116
|
+
"id": 3,
|
|
117
|
+
"title": "Advanced Python",
|
|
118
|
+
"description": "Deep dive into Python",
|
|
119
|
+
"category": "tutorial"
|
|
120
|
+
})
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
### Custom ID field
|
|
124
|
+
|
|
125
|
+
```python
|
|
126
|
+
index = TextSearchIndex(
|
|
127
|
+
text_fields=["title", "description"],
|
|
128
|
+
id_field="doc_id",
|
|
129
|
+
db_path="search.db"
|
|
130
|
+
)
|
|
131
|
+
|
|
132
|
+
results = index.search("python", output_ids=True)
|
|
133
|
+
# Results will include 'id' field with the doc_id value
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
## Vector Search
|
|
137
|
+
|
|
138
|
+
Vector search uses Locality-Sensitive Hashing (LSH) with random projections
|
|
139
|
+
for fast approximate nearest neighbor search, followed by exact cosine
|
|
140
|
+
similarity reranking.
|
|
141
|
+
|
|
142
|
+
### Basic usage
|
|
143
|
+
|
|
144
|
+
```python
|
|
145
|
+
import numpy as np
|
|
146
|
+
from sqlitesearch import VectorSearchIndex
|
|
147
|
+
|
|
148
|
+
# Create an index
|
|
149
|
+
index = VectorSearchIndex(
|
|
150
|
+
keyword_fields=["category"],
|
|
151
|
+
n_tables=8, # Number of hash tables (more = better recall)
|
|
152
|
+
hash_size=16, # Bits per hash (more = better precision)
|
|
153
|
+
db_path="vectors.db"
|
|
154
|
+
)
|
|
155
|
+
|
|
156
|
+
# Index vectors with documents
|
|
157
|
+
vectors = np.random.rand(100, 384) # 100 documents, 384 dimensions
|
|
158
|
+
documents = [{"category": "test"} for _ in range(100)]
|
|
159
|
+
index.fit(vectors, documents)
|
|
160
|
+
|
|
161
|
+
# Search
|
|
162
|
+
query = np.random.rand(384)
|
|
163
|
+
results = index.search(query)
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
### Filtering
|
|
167
|
+
|
|
168
|
+
```python
|
|
169
|
+
results = index.search(
|
|
170
|
+
query,
|
|
171
|
+
filter_dict={"category": "test"}
|
|
172
|
+
)
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
## Persistence
|
|
176
|
+
|
|
177
|
+
Both index types automatically persist to disk. You can reopen an existing index:
|
|
178
|
+
|
|
179
|
+
```python
|
|
180
|
+
# Open existing index
|
|
181
|
+
index = TextSearchIndex(
|
|
182
|
+
text_fields=["title", "description"],
|
|
183
|
+
db_path="search.db"
|
|
184
|
+
)
|
|
185
|
+
# Ready to search immediately
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
## Clearing the index
|
|
189
|
+
|
|
190
|
+
```python
|
|
191
|
+
index.clear() # Remove all documents
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
## API Compatibility
|
|
195
|
+
|
|
196
|
+
The API is designed to match minsearch for easy migration:
|
|
197
|
+
|
|
198
|
+
- `fit(docs)` - Index documents (only if index is empty)
|
|
199
|
+
- `add(doc)` - Add a single document
|
|
200
|
+
- `search(query, filter_dict=None, boost_dict=None, num_results=10)` - Search
|
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
# sqlitesearch
|
|
2
|
+
|
|
3
|
+
A tiny, SQLite-backed search library for small, local projects. sqlitesearch is a persistent sibling of [minsearch](https://github.com/alexeygrigorev/minsearch) - same API, but stores data on disk.
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
sqlitesearch provides
|
|
7
|
+
|
|
8
|
+
- persistent text search using SQLite FTS5
|
|
9
|
+
- vector search using LSH (random projections) with exact reranking
|
|
10
|
+
- hybrid search (TODO: explain)
|
|
11
|
+
|
|
12
|
+
It stores the index in a single SQLite file, making it perfect for applications
|
|
13
|
+
that need search functionality without running a separate search server.
|
|
14
|
+
|
|
15
|
+
## When to use
|
|
16
|
+
|
|
17
|
+
- minsearch - in-memory, for experiments and notebooks
|
|
18
|
+
- sqlitesearch - persistent (SQLite file), for pet projects and prototypes
|
|
19
|
+
- Postgres/Elasticsearch/Qdrant/etc - production workloads with high traffic
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
## Installation
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
uv add sqlitesearch
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
## Text Search
|
|
29
|
+
|
|
30
|
+
Text search uses SQLite's FTS5 (Full-Text Search) extension with BM25 ranking.
|
|
31
|
+
|
|
32
|
+
### Basic usage
|
|
33
|
+
|
|
34
|
+
```python
|
|
35
|
+
from sqlitesearch import TextSearchIndex
|
|
36
|
+
|
|
37
|
+
# Create an index
|
|
38
|
+
index = TextSearchIndex(
|
|
39
|
+
text_fields=["title", "description"],
|
|
40
|
+
keyword_fields=["category"],
|
|
41
|
+
db_path="search.db"
|
|
42
|
+
)
|
|
43
|
+
|
|
44
|
+
# Index some documents
|
|
45
|
+
documents = [
|
|
46
|
+
{"id": 1, "title": "Python Tutorial", "description": "Learn Python basics", "category": "tutorial"},
|
|
47
|
+
{"id": 2, "title": "Java Guide", "description": "Java programming guide", "category": "guide"},
|
|
48
|
+
]
|
|
49
|
+
index.fit(documents)
|
|
50
|
+
|
|
51
|
+
# Search
|
|
52
|
+
results = index.search("python programming")
|
|
53
|
+
for result in results:
|
|
54
|
+
print(result["title"], result["score"])
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
### Filtering
|
|
58
|
+
|
|
59
|
+
```python
|
|
60
|
+
# Filter by keyword fields
|
|
61
|
+
results = index.search(
|
|
62
|
+
"python",
|
|
63
|
+
filter_dict={"category": "tutorial"}
|
|
64
|
+
)
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
### Field boosting
|
|
68
|
+
|
|
69
|
+
```python
|
|
70
|
+
# Boost title matches higher than description
|
|
71
|
+
results = index.search(
|
|
72
|
+
"python",
|
|
73
|
+
boost_dict={"title": 2.0, "description": 1.0}
|
|
74
|
+
)
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
### Stemming
|
|
78
|
+
|
|
79
|
+
Enable Porter stemming for better matching of word variants:
|
|
80
|
+
|
|
81
|
+
```python
|
|
82
|
+
# With stemming, "running" matches "run", "courses" matches "course", etc.
|
|
83
|
+
index = TextSearchIndex(
|
|
84
|
+
text_fields=["title", "description"],
|
|
85
|
+
stemming=True,
|
|
86
|
+
db_path="search.db"
|
|
87
|
+
)
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
By default, stemming is disabled to match minsearch behavior.
|
|
91
|
+
|
|
92
|
+
### Adding documents
|
|
93
|
+
|
|
94
|
+
```python
|
|
95
|
+
# Add documents one by one
|
|
96
|
+
index.add({
|
|
97
|
+
"id": 3,
|
|
98
|
+
"title": "Advanced Python",
|
|
99
|
+
"description": "Deep dive into Python",
|
|
100
|
+
"category": "tutorial"
|
|
101
|
+
})
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
### Custom ID field
|
|
105
|
+
|
|
106
|
+
```python
|
|
107
|
+
index = TextSearchIndex(
|
|
108
|
+
text_fields=["title", "description"],
|
|
109
|
+
id_field="doc_id",
|
|
110
|
+
db_path="search.db"
|
|
111
|
+
)
|
|
112
|
+
|
|
113
|
+
results = index.search("python", output_ids=True)
|
|
114
|
+
# Results will include 'id' field with the doc_id value
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
## Vector Search
|
|
118
|
+
|
|
119
|
+
Vector search uses Locality-Sensitive Hashing (LSH) with random projections
|
|
120
|
+
for fast approximate nearest neighbor search, followed by exact cosine
|
|
121
|
+
similarity reranking.
|
|
122
|
+
|
|
123
|
+
### Basic usage
|
|
124
|
+
|
|
125
|
+
```python
|
|
126
|
+
import numpy as np
|
|
127
|
+
from sqlitesearch import VectorSearchIndex
|
|
128
|
+
|
|
129
|
+
# Create an index
|
|
130
|
+
index = VectorSearchIndex(
|
|
131
|
+
keyword_fields=["category"],
|
|
132
|
+
n_tables=8, # Number of hash tables (more = better recall)
|
|
133
|
+
hash_size=16, # Bits per hash (more = better precision)
|
|
134
|
+
db_path="vectors.db"
|
|
135
|
+
)
|
|
136
|
+
|
|
137
|
+
# Index vectors with documents
|
|
138
|
+
vectors = np.random.rand(100, 384) # 100 documents, 384 dimensions
|
|
139
|
+
documents = [{"category": "test"} for _ in range(100)]
|
|
140
|
+
index.fit(vectors, documents)
|
|
141
|
+
|
|
142
|
+
# Search
|
|
143
|
+
query = np.random.rand(384)
|
|
144
|
+
results = index.search(query)
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
### Filtering
|
|
148
|
+
|
|
149
|
+
```python
|
|
150
|
+
results = index.search(
|
|
151
|
+
query,
|
|
152
|
+
filter_dict={"category": "test"}
|
|
153
|
+
)
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
## Persistence
|
|
157
|
+
|
|
158
|
+
Both index types automatically persist to disk. You can reopen an existing index:
|
|
159
|
+
|
|
160
|
+
```python
|
|
161
|
+
# Open existing index
|
|
162
|
+
index = TextSearchIndex(
|
|
163
|
+
text_fields=["title", "description"],
|
|
164
|
+
db_path="search.db"
|
|
165
|
+
)
|
|
166
|
+
# Ready to search immediately
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
## Clearing the index
|
|
170
|
+
|
|
171
|
+
```python
|
|
172
|
+
index.clear() # Remove all documents
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
## API Compatibility
|
|
176
|
+
|
|
177
|
+
The API is designed to match minsearch for easy migration:
|
|
178
|
+
|
|
179
|
+
- `fit(docs)` - Index documents (only if index is empty)
|
|
180
|
+
- `add(doc)` - Add a single document
|
|
181
|
+
- `search(query, filter_dict=None, boost_dict=None, num_results=10)` - Search
|