sqlitesearch 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,38 @@
+ name: Tests
+
+ on:
+   push:
+     branches: [main]
+   pull_request:
+     branches: [main]
+   workflow_dispatch:
+
+ jobs:
+   test:
+     runs-on: ubuntu-latest
+     strategy:
+       matrix:
+         python-version: ['3.10', '3.11', '3.12', '3.13']
+
+     steps:
+       - uses: actions/checkout@v4
+
+       - name: Set up Python ${{ matrix.python-version }}
+         uses: actions/setup-python@v5
+         with:
+           python-version: ${{ matrix.python-version }}
+
+       - name: Install uv
+         run: curl -LsSf https://astral.sh/uv/install.sh | sh
+
+       - name: Install dependencies
+         run: uv sync --dev
+
+       - name: Run tests with coverage
+         run: uv run pytest --cov=sqlitesearch --cov-report=xml --cov-report=term-missing
+
+       - name: Upload coverage to Codecov
+         uses: codecov/codecov-action@v4
+         with:
+           file: ./coverage.xml
+           fail_ci_if_error: false
@@ -0,0 +1,207 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[codz]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py.cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # UV
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ #uv.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+ #poetry.toml
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
+ #pdm.lock
+ #pdm.toml
+ .pdm-python
+ .pdm-build/
+
+ # pixi
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
+ #pixi.lock
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
+ # in the .venv directory. It is recommended not to include this directory in version control.
+ .pixi
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .envrc
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
+
+ # Abstra
+ # Abstra is an AI-powered process automation framework.
+ # Ignore directories containing user credentials, local state, and settings.
+ # Learn more at https://abstra.io/docs
+ .abstra/
+
+ # Visual Studio Code
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
+ # you could uncomment the following to ignore the entire vscode folder
+ # .vscode/
+
+ # Ruff stuff:
+ .ruff_cache/
+
+ # PyPI configuration file
+ .pypirc
+
+ # Cursor
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
+ # refer to https://docs.cursor.com/context/ignore-files
+ .cursorignore
+ .cursorindexingignore
+
+ # Marimo
+ marimo/_static/
+ marimo/_lsp/
+ __marimo__/
@@ -0,0 +1,68 @@
+ ## CRITICAL: File Editing on Windows
+
+ ### ⚠️ MANDATORY: Always Use Backslashes on Windows for File Paths
+
+ When using Edit or MultiEdit tools on Windows, you MUST use backslashes (`\`) in file paths, NOT forward slashes (`/`).
+
+ ❌ WRONG - Will cause errors:
+ ```
+ Edit(file_path: "D:/repos/project/file.tsx", ...)
+ MultiEdit(file_path: "D:/repos/project/file.tsx", ...)
+ ```
+
+ ✅ CORRECT - Always works:
+ ```
+ Edit(file_path: "D:\repos\project\file.tsx", ...)
+ MultiEdit(file_path: "D:\repos\project\file.tsx", ...)
+ ```
+
+ ### ⚠️ "File has been unexpectedly modified" Error
+
+ If you get this error: **"File has been unexpectedly modified. Read it again before attempting to write it"**
+
+ **Root cause:** The file was modified after you last read it (by linter, formatter, git, or external process).
+
+ **Solution: Re-read the file immediately before editing:**
+
+ ```bash
+ # 1. Read the file again
+ Read(file_path: "path\to\file.txt")
+
+ # 2. Then immediately edit
+ Edit(file_path: "path\to\file.txt", old_string="...", new_string="...")
+ ```
+
+ **Tool requirements:**
+
+ - Edit - Must `Read` immediately before - `old_string` must match current content
+ - Write - Must `Read` once per conversation before first write
+
+ **Common triggers:**
+ - Linters/formatters running on save
+ - Git operations (checkout, merge, rebase)
+ - File watchers or build processes
+
+ **Tip:** If this happens repeatedly, consider disabling auto-formatting for files you're actively editing with Claude Code.
+
+ ### ⚠️ Use UV for Python Package Management
+
+ When installing Python packages, use `uv` instead of `pip`. See `/uv` for details.
+
+ ❌ WRONG:
+ ```bash
+ pip install djangorestframework
+ ```
+
+ ✅ CORRECT:
+ ```bash
+ cd backend-django
+ uv add djangorestframework
+ ```
+
+ Run Django commands:
+ ```bash
+ cd backend-django
+ uv run python manage.py makemigrations
+ uv run python manage.py migrate
+ uv run python manage.py test
+ ```
@@ -0,0 +1,25 @@
+ .PHONY: test setup shell coverage publish-build publish-test publish publish-clean
+
+ test:
+ 	uv run pytest
+
+ coverage:
+ 	uv run pytest --cov=sqlitesearch --cov-report=term-missing --cov-report=html
+
+ setup:
+ 	uv sync --dev
+
+ shell:
+ 	uv shell
+
+ publish-build:
+ 	uv run hatch build
+
+ publish-test:
+ 	uv run hatch publish --repo test
+
+ publish:
+ 	uv run hatch publish
+
+ publish-clean:
+ 	rm -r dist/
@@ -0,0 +1,200 @@
+ Metadata-Version: 2.4
+ Name: sqlitesearch
+ Version: 0.0.1
+ Summary: A tiny, SQLite-backed search library for small, local projects
+ Author: sqlitesearch contributors
+ License: MIT
+ Keywords: fts,hybrid-search,lsh,search,sqlite,vector-search
+ Classifier: Development Status :: 3 - Alpha
+ Classifier: Intended Audience :: Developers
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Requires-Python: >=3.10
+ Requires-Dist: numpy>=1.24.0
+ Description-Content-Type: text/markdown
+
+ # sqlitesearch
+
+ A tiny, SQLite-backed search library for small, local projects. sqlitesearch is a persistent sibling of [minsearch](https://github.com/alexeygrigorev/minsearch) - same API, but stores data on disk.
+
+
+ sqlitesearch provides:
+
+ - persistent text search using SQLite FTS5
+ - vector search using LSH (random projections) with exact reranking
+ - hybrid search (TODO: explain)
+
+ It stores the index in a single SQLite file, making it perfect for applications
+ that need search functionality without running a separate search server.
+
+ ## When to use
+
+ - minsearch - in-memory, for experiments and notebooks
+ - sqlitesearch - persistent (SQLite file), for pet projects and prototypes
+ - Postgres/Elasticsearch/Qdrant/etc. - production workloads with high traffic
+
+
+ ## Installation
+
+ ```bash
+ uv add sqlitesearch
+ ```
+
+ ## Text Search
+
+ Text search uses SQLite's FTS5 (Full-Text Search) extension with BM25 ranking.
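+
+ Under the hood this is ordinary SQLite. For reference, an FTS5 query with `bm25()` ranking looks roughly like this in the standard `sqlite3` module (the table name and schema here are illustrative, not sqlitesearch's internal layout):
+
+ ```python
+ import sqlite3
+
+ # Requires an SQLite build with FTS5 enabled (true for most Python builds).
+ conn = sqlite3.connect(":memory:")
+ conn.execute("CREATE VIRTUAL TABLE docs USING fts5(title, description)")
+ conn.execute("INSERT INTO docs VALUES ('Python Tutorial', 'Learn Python basics')")
+ conn.execute("INSERT INTO docs VALUES ('Java Guide', 'Java programming guide')")
+
+ # bm25() assigns lower scores to better matches, so sort ascending.
+ rows = conn.execute(
+     "SELECT title, bm25(docs) AS score FROM docs WHERE docs MATCH ? ORDER BY score",
+     ("python",),
+ ).fetchall()
+ print(rows)
+ ```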
+
+ ### Basic usage
+
+ ```python
+ from sqlitesearch import TextSearchIndex
+
+ # Create an index
+ index = TextSearchIndex(
+     text_fields=["title", "description"],
+     keyword_fields=["category"],
+     db_path="search.db"
+ )
+
+ # Index some documents
+ documents = [
+     {"id": 1, "title": "Python Tutorial", "description": "Learn Python basics", "category": "tutorial"},
+     {"id": 2, "title": "Java Guide", "description": "Java programming guide", "category": "guide"},
+ ]
+ index.fit(documents)
+
+ # Search
+ results = index.search("python programming")
+ for result in results:
+     print(result["title"], result["score"])
+ ```
+
+ ### Filtering
+
+ ```python
+ # Filter by keyword fields
+ results = index.search(
+     "python",
+     filter_dict={"category": "tutorial"}
+ )
+ ```
+
+ ### Field boosting
+
+ ```python
+ # Boost title matches higher than description
+ results = index.search(
+     "python",
+     boost_dict={"title": 2.0, "description": 1.0}
+ )
+ ```
+
+ ### Stemming
+
+ Enable Porter stemming for better matching of word variants:
+
+ ```python
+ # With stemming, "running" matches "run", "courses" matches "course", etc.
+ index = TextSearchIndex(
+     text_fields=["title", "description"],
+     stemming=True,
+     db_path="search.db"
+ )
+ ```
+
+ By default, stemming is disabled to match minsearch behavior.
+
+ ### Adding documents
+
+ ```python
+ # Add documents one by one
+ index.add({
+     "id": 3,
+     "title": "Advanced Python",
+     "description": "Deep dive into Python",
+     "category": "tutorial"
+ })
+ ```
+
+ ### Custom ID field
+
+ ```python
+ index = TextSearchIndex(
+     text_fields=["title", "description"],
+     id_field="doc_id",
+     db_path="search.db"
+ )
+
+ results = index.search("python", output_ids=True)
+ # Results will include 'id' field with the doc_id value
+ ```
+
+ ## Vector Search
+
+ Vector search uses Locality-Sensitive Hashing (LSH) with random projections
+ for fast approximate nearest neighbor search, followed by exact cosine
+ similarity reranking.
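+
+ To give an intuition for the approach, here is a rough NumPy-only sketch of random-projection LSH with exact reranking; it illustrates the idea, not sqlitesearch's internal implementation:
+
+ ```python
+ import numpy as np
+
+ rng = np.random.default_rng(0)
+ dim, hash_size, n_tables = 384, 16, 8
+
+ # One random projection matrix per hash table.
+ projections = [rng.normal(size=(hash_size, dim)) for _ in range(n_tables)]
+
+ def signature(vec, proj):
+     # Each random hyperplane contributes one bit: which side the vector falls on.
+     return tuple((proj @ vec > 0).astype(int))
+
+ vectors = rng.random((100, dim))
+ query = rng.random(dim)
+
+ # Candidates: vectors that share a signature with the query in at least one table.
+ candidates = {
+     i
+     for proj in projections
+     for i, v in enumerate(vectors)
+     if signature(v, proj) == signature(query, proj)
+ }
+
+ # Exact cosine-similarity rerank of the (much smaller) candidate set.
+ def cosine(a, b):
+     return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))
+
+ ranked = sorted(candidates, key=lambda i: cosine(vectors[i], query), reverse=True)
+ print(ranked[:10])
+ ```
+
+ More tables increase the chance that true neighbors collide with the query somewhere (recall), while longer signatures make each collision more selective (precision).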
+
+ ### Basic usage
+
+ ```python
+ import numpy as np
+ from sqlitesearch import VectorSearchIndex
+
+ # Create an index
+ index = VectorSearchIndex(
+     keyword_fields=["category"],
+     n_tables=8,    # Number of hash tables (more = better recall)
+     hash_size=16,  # Bits per hash (more = better precision)
+     db_path="vectors.db"
+ )
+
+ # Index vectors with documents
+ vectors = np.random.rand(100, 384)  # 100 documents, 384 dimensions
+ documents = [{"category": "test"} for _ in range(100)]
+ index.fit(vectors, documents)
+
+ # Search
+ query = np.random.rand(384)
+ results = index.search(query)
+ ```
+
+ ### Filtering
+
+ ```python
+ results = index.search(
+     query,
+     filter_dict={"category": "test"}
+ )
+ ```
+
+ ## Persistence
+
+ Both index types automatically persist to disk. You can reopen an existing index:
+
+ ```python
+ # Open existing index
+ index = TextSearchIndex(
+     text_fields=["title", "description"],
+     db_path="search.db"
+ )
+ # Ready to search immediately
+ ```
+
+ ## Clearing the index
+
+ ```python
+ index.clear()  # Remove all documents
+ ```
+
+ ## API Compatibility
+
+ The API is designed to match minsearch for easy migration:
+
+ - `fit(docs)` - Index documents (only if index is empty)
+ - `add(doc)` - Add a single document
+ - `search(query, filter_dict=None, boost_dict=None, num_results=10)` - Search
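+
+ For example, code written against minsearch's in-memory `Index` usually only needs the import and constructor swapped (a minimal sketch, assuming a typical minsearch-style setup):
+
+ ```python
+ # Before: from minsearch import Index
+ # index = Index(text_fields=["title", "description"], keyword_fields=["category"])
+
+ # After: same calls, but the index lives in a SQLite file
+ from sqlitesearch import TextSearchIndex
+
+ index = TextSearchIndex(
+     text_fields=["title", "description"],
+     keyword_fields=["category"],
+     db_path="search.db",
+ )
+
+ documents = [
+     {"id": 1, "title": "Python Tutorial", "description": "Learn Python basics", "category": "tutorial"},
+ ]
+ index.fit(documents)
+
+ results = index.search(
+     "python",
+     filter_dict={"category": "tutorial"},
+     boost_dict={"title": 2.0},
+     num_results=5,
+ )
+ ```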
@@ -0,0 +1,181 @@
+ # sqlitesearch
+
+ A tiny, SQLite-backed search library for small, local projects. sqlitesearch is a persistent sibling of [minsearch](https://github.com/alexeygrigorev/minsearch) - same API, but stores data on disk.
+
+
+ sqlitesearch provides:
+
+ - persistent text search using SQLite FTS5
+ - vector search using LSH (random projections) with exact reranking
+ - hybrid search (TODO: explain)
+
+ It stores the index in a single SQLite file, making it perfect for applications
+ that need search functionality without running a separate search server.
+
+ ## When to use
+
+ - minsearch - in-memory, for experiments and notebooks
+ - sqlitesearch - persistent (SQLite file), for pet projects and prototypes
+ - Postgres/Elasticsearch/Qdrant/etc. - production workloads with high traffic
+
+
+ ## Installation
+
+ ```bash
+ uv add sqlitesearch
+ ```
+
+ ## Text Search
+
+ Text search uses SQLite's FTS5 (Full-Text Search) extension with BM25 ranking.
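+
+ Under the hood this is ordinary SQLite. For reference, an FTS5 query with `bm25()` ranking looks roughly like this in the standard `sqlite3` module (the table name and schema here are illustrative, not sqlitesearch's internal layout):
+
+ ```python
+ import sqlite3
+
+ # Requires an SQLite build with FTS5 enabled (true for most Python builds).
+ conn = sqlite3.connect(":memory:")
+ conn.execute("CREATE VIRTUAL TABLE docs USING fts5(title, description)")
+ conn.execute("INSERT INTO docs VALUES ('Python Tutorial', 'Learn Python basics')")
+ conn.execute("INSERT INTO docs VALUES ('Java Guide', 'Java programming guide')")
+
+ # bm25() assigns lower scores to better matches, so sort ascending.
+ rows = conn.execute(
+     "SELECT title, bm25(docs) AS score FROM docs WHERE docs MATCH ? ORDER BY score",
+     ("python",),
+ ).fetchall()
+ print(rows)
+ ```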
+
+ ### Basic usage
+
+ ```python
+ from sqlitesearch import TextSearchIndex
+
+ # Create an index
+ index = TextSearchIndex(
+     text_fields=["title", "description"],
+     keyword_fields=["category"],
+     db_path="search.db"
+ )
+
+ # Index some documents
+ documents = [
+     {"id": 1, "title": "Python Tutorial", "description": "Learn Python basics", "category": "tutorial"},
+     {"id": 2, "title": "Java Guide", "description": "Java programming guide", "category": "guide"},
+ ]
+ index.fit(documents)
+
+ # Search
+ results = index.search("python programming")
+ for result in results:
+     print(result["title"], result["score"])
+ ```
+
+ ### Filtering
+
+ ```python
+ # Filter by keyword fields
+ results = index.search(
+     "python",
+     filter_dict={"category": "tutorial"}
+ )
+ ```
+
+ ### Field boosting
+
+ ```python
+ # Boost title matches higher than description
+ results = index.search(
+     "python",
+     boost_dict={"title": 2.0, "description": 1.0}
+ )
+ ```
+
+ ### Stemming
+
+ Enable Porter stemming for better matching of word variants:
+
+ ```python
+ # With stemming, "running" matches "run", "courses" matches "course", etc.
+ index = TextSearchIndex(
+     text_fields=["title", "description"],
+     stemming=True,
+     db_path="search.db"
+ )
+ ```
+
+ By default, stemming is disabled to match minsearch behavior.
+
+ ### Adding documents
+
+ ```python
+ # Add documents one by one
+ index.add({
+     "id": 3,
+     "title": "Advanced Python",
+     "description": "Deep dive into Python",
+     "category": "tutorial"
+ })
+ ```
+
+ ### Custom ID field
+
+ ```python
+ index = TextSearchIndex(
+     text_fields=["title", "description"],
+     id_field="doc_id",
+     db_path="search.db"
+ )
+
+ results = index.search("python", output_ids=True)
+ # Results will include 'id' field with the doc_id value
+ ```
+
+ ## Vector Search
+
+ Vector search uses Locality-Sensitive Hashing (LSH) with random projections
+ for fast approximate nearest neighbor search, followed by exact cosine
+ similarity reranking.
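+
+ To give an intuition for the approach, here is a rough NumPy-only sketch of random-projection LSH with exact reranking; it illustrates the idea, not sqlitesearch's internal implementation:
+
+ ```python
+ import numpy as np
+
+ rng = np.random.default_rng(0)
+ dim, hash_size, n_tables = 384, 16, 8
+
+ # One random projection matrix per hash table.
+ projections = [rng.normal(size=(hash_size, dim)) for _ in range(n_tables)]
+
+ def signature(vec, proj):
+     # Each random hyperplane contributes one bit: which side the vector falls on.
+     return tuple((proj @ vec > 0).astype(int))
+
+ vectors = rng.random((100, dim))
+ query = rng.random(dim)
+
+ # Candidates: vectors that share a signature with the query in at least one table.
+ candidates = {
+     i
+     for proj in projections
+     for i, v in enumerate(vectors)
+     if signature(v, proj) == signature(query, proj)
+ }
+
+ # Exact cosine-similarity rerank of the (much smaller) candidate set.
+ def cosine(a, b):
+     return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))
+
+ ranked = sorted(candidates, key=lambda i: cosine(vectors[i], query), reverse=True)
+ print(ranked[:10])
+ ```
+
+ More tables increase the chance that true neighbors collide with the query somewhere (recall), while longer signatures make each collision more selective (precision).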
+
+ ### Basic usage
+
+ ```python
+ import numpy as np
+ from sqlitesearch import VectorSearchIndex
+
+ # Create an index
+ index = VectorSearchIndex(
+     keyword_fields=["category"],
+     n_tables=8,    # Number of hash tables (more = better recall)
+     hash_size=16,  # Bits per hash (more = better precision)
+     db_path="vectors.db"
+ )
+
+ # Index vectors with documents
+ vectors = np.random.rand(100, 384)  # 100 documents, 384 dimensions
+ documents = [{"category": "test"} for _ in range(100)]
+ index.fit(vectors, documents)
+
+ # Search
+ query = np.random.rand(384)
+ results = index.search(query)
+ ```
+
+ ### Filtering
+
+ ```python
+ results = index.search(
+     query,
+     filter_dict={"category": "test"}
+ )
+ ```
+
+ ## Persistence
+
+ Both index types automatically persist to disk. You can reopen an existing index:
+
+ ```python
+ # Open existing index
+ index = TextSearchIndex(
+     text_fields=["title", "description"],
+     db_path="search.db"
+ )
+ # Ready to search immediately
+ ```
+
+ ## Clearing the index
+
+ ```python
+ index.clear()  # Remove all documents
+ ```
+
+ ## API Compatibility
+
+ The API is designed to match minsearch for easy migration:
+
+ - `fit(docs)` - Index documents (only if index is empty)
+ - `add(doc)` - Add a single document
+ - `search(query, filter_dict=None, boost_dict=None, num_results=10)` - Search
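+
+ For example, code written against minsearch's in-memory `Index` usually only needs the import and constructor swapped (a minimal sketch, assuming a typical minsearch-style setup):
+
+ ```python
+ # Before: from minsearch import Index
+ # index = Index(text_fields=["title", "description"], keyword_fields=["category"])
+
+ # After: same calls, but the index lives in a SQLite file
+ from sqlitesearch import TextSearchIndex
+
+ index = TextSearchIndex(
+     text_fields=["title", "description"],
+     keyword_fields=["category"],
+     db_path="search.db",
+ )
+
+ documents = [
+     {"id": 1, "title": "Python Tutorial", "description": "Learn Python basics", "category": "tutorial"},
+ ]
+ index.fit(documents)
+
+ results = index.search(
+     "python",
+     filter_dict={"category": "tutorial"},
+     boost_dict={"title": 2.0},
+     num_results=5,
+ )
+ ```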