zerosearch 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- zerosearch-0.1.0/.github/workflows/publish.yml +40 -0
- zerosearch-0.1.0/.gitignore +15 -0
- zerosearch-0.1.0/.python-version +1 -0
- zerosearch-0.1.0/Makefile +26 -0
- zerosearch-0.1.0/PKG-INFO +86 -0
- zerosearch-0.1.0/README.md +71 -0
- zerosearch-0.1.0/pyproject.toml +50 -0
- zerosearch-0.1.0/tests/__init__.py +0 -0
- zerosearch-0.1.0/tests/test_index.py +101 -0
- zerosearch-0.1.0/uv.lock +1003 -0
- zerosearch-0.1.0/zerosearch/__init__.py +6 -0
- zerosearch-0.1.0/zerosearch/__version__.py +1 -0
- zerosearch-0.1.0/zerosearch/index.py +196 -0
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
name: Publish
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags:
|
|
6
|
+
- "v*"
|
|
7
|
+
workflow_dispatch:
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
build:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
steps:
|
|
13
|
+
- uses: actions/checkout@v5
|
|
14
|
+
- uses: astral-sh/setup-uv@v6
|
|
15
|
+
- run: uv build
|
|
16
|
+
- name: Verify version matches tag
|
|
17
|
+
if: startsWith(github.ref, 'refs/tags/v')
|
|
18
|
+
run: |
|
|
19
|
+
TAG="${GITHUB_REF#refs/tags/v}"
|
|
20
|
+
if ! ls dist/ | grep -qE -- "-${TAG}(-|\.)"; then
|
|
21
|
+
echo "::error::dist/ contents do not match tag v${TAG}"
|
|
22
|
+
ls dist/
|
|
23
|
+
exit 1
|
|
24
|
+
fi
|
|
25
|
+
- uses: actions/upload-artifact@v5
|
|
26
|
+
with:
|
|
27
|
+
name: dist
|
|
28
|
+
path: dist/*
|
|
29
|
+
|
|
30
|
+
publish-pypi:
|
|
31
|
+
needs: build
|
|
32
|
+
runs-on: ubuntu-latest
|
|
33
|
+
steps:
|
|
34
|
+
- uses: actions/download-artifact@v5
|
|
35
|
+
with:
|
|
36
|
+
name: dist
|
|
37
|
+
path: dist
|
|
38
|
+
- uses: pypa/gh-action-pypi-publish@release/v1
|
|
39
|
+
with:
|
|
40
|
+
password: ${{ secrets.PYPI_API_TOKEN }}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.13
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
.PHONY: test setup shell coverage publish-build publish-clean release
|
|
2
|
+
|
|
3
|
+
test:
|
|
4
|
+
uv run pytest
|
|
5
|
+
|
|
6
|
+
setup:
|
|
7
|
+
uv sync --dev
|
|
8
|
+
|
|
9
|
+
shell:  # uv has no `shell` subcommand; open the user's shell inside the project env instead
|
|
10
|
+
uv run $${SHELL:-sh}
|
|
11
|
+
|
|
12
|
+
coverage:
|
|
13
|
+
uv run pytest --cov=zerosearch --cov-report=term-missing
|
|
14
|
+
|
|
15
|
+
publish-build:
|
|
16
|
+
uv run hatch build
|
|
17
|
+
|
|
18
|
+
publish-clean:  # -f keeps the target idempotent when dist/ does not exist
|
|
19
|
+
rm -rf dist/
|
|
20
|
+
|
|
21
|
+
# Release: tag the current version and push to trigger CI publish.
|
|
22
|
+
release:
|
|
23
|
+
@VERSION=$$(grep -E "^__version__" zerosearch/__version__.py | sed -E "s/.*['\"]([^'\"]+)['\"].*/\1/"); \
|
|
24
|
+
echo "Releasing v$$VERSION"; \
|
|
25
|
+
git tag "v$$VERSION"; \
|
|
26
|
+
git push origin "v$$VERSION"
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: zerosearch
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A tiny, zero-dependency BM25-lite in-memory text search index.
|
|
5
|
+
Project-URL: Homepage, https://github.com/alexeygrigorev/zerosearch
|
|
6
|
+
Project-URL: Repository, https://github.com/alexeygrigorev/zerosearch
|
|
7
|
+
Author-email: Alexey Grigorev <alexey.s.grigoriev@gmail.com>
|
|
8
|
+
License: WTFPL
|
|
9
|
+
Keywords: bm25,information-retrieval,minsearch,search,tf-idf,zero-dependency
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Topic :: Text Processing :: Indexing
|
|
13
|
+
Requires-Python: >=3.10
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
|
|
16
|
+
# zerosearch
|
|
17
|
+
|
|
18
|
+
A tiny, **zero-dependency** BM25-lite in-memory text search index — standard
|
|
19
|
+
library only, a single small module, and good enough to power retrieval for a
|
|
20
|
+
RAG pipeline. Designed to run anywhere Python runs, including constrained
|
|
21
|
+
environments like Cloudflare Python Workers (Pyodide) where pulling in
|
|
22
|
+
`scikit-learn`/`numpy` is not an option.
|
|
23
|
+
|
|
24
|
+
It is a spiritual cousin of [`minsearch`](https://github.com/alexeygrigorev/minsearch),
|
|
25
|
+
with the same `Index(text_fields, keyword_fields).fit(docs).search(query)` shape,
|
|
26
|
+
but reimplemented from scratch with no third-party dependencies.
|
|
27
|
+
|
|
28
|
+
## Install
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
pip install zerosearch
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
## Usage
|
|
35
|
+
|
|
36
|
+
```python
|
|
37
|
+
from zerosearch import Index
|
|
38
|
+
|
|
39
|
+
docs = [
|
|
40
|
+
{"id": "1", "title": "Docker compose basics", "text": "how to start services", "course": "de"},
|
|
41
|
+
{"id": "2", "title": "Kafka consumers", "text": "consumer groups explained", "course": "de"},
|
|
42
|
+
]
|
|
43
|
+
|
|
44
|
+
index = Index(
|
|
45
|
+
text_fields=["title", "text"],
|
|
46
|
+
keyword_fields=["id", "course"],
|
|
47
|
+
).fit(docs)
|
|
48
|
+
|
|
49
|
+
results = index.search(
|
|
50
|
+
"how do I start docker compose",
|
|
51
|
+
filter_dict={"course": "de"}, # exact-match keyword filter
|
|
52
|
+
boost_dict={"title": 3.0, "text": 1.0}, # per-field boosts
|
|
53
|
+
num_results=5,
|
|
54
|
+
)
|
|
55
|
+
for r in results:
|
|
56
|
+
print(r["score"], r["title"])
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
Each result is a shallow copy of the original document dict with an added
|
|
60
|
+
`"score"` key.
|
|
61
|
+
|
|
62
|
+
## How it works
|
|
63
|
+
|
|
64
|
+
* **Tokenizer** — lowercased word/number tokens; keeps `+ . # _ -` *inside* a
|
|
65
|
+
token so `c++`, `node.js`, `f-string` survive (a token must start with a
|
|
66
|
+
letter/digit). Drops 1-character tokens and words on a small English stop-word list
|
|
67
|
+
(both overridable).
|
|
68
|
+
* **Inverted index** — built once in `fit()`. A query only scores documents that
|
|
69
|
+
actually contain a query term, so search is fast even on large corpora.
|
|
70
|
+
* **Ranking** — BM25-lite: each query term contributes
|
|
71
|
+
`boost * idf * (term_frequency / sqrt(field_length))` per field. IDF and
|
|
72
|
+
document frequencies are computed over the filtered candidate set.
|
|
73
|
+
|
|
74
|
+
## Customizing
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
Index(
|
|
78
|
+
text_fields=["title", "text"],
|
|
79
|
+
stop_words={"the", "a", "an"}, # replace the default stop words
|
|
80
|
+
tokenizer=lambda s: s.lower().split(), # or plug in your own tokenizer
|
|
81
|
+
)
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
## License
|
|
85
|
+
|
|
86
|
+
WTFPL.
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
# zerosearch
|
|
2
|
+
|
|
3
|
+
A tiny, **zero-dependency** BM25-lite in-memory text search index — standard
|
|
4
|
+
library only, a single small module, and good enough to power retrieval for a
|
|
5
|
+
RAG pipeline. Designed to run anywhere Python runs, including constrained
|
|
6
|
+
environments like Cloudflare Python Workers (Pyodide) where pulling in
|
|
7
|
+
`scikit-learn`/`numpy` is not an option.
|
|
8
|
+
|
|
9
|
+
It is a spiritual cousin of [`minsearch`](https://github.com/alexeygrigorev/minsearch),
|
|
10
|
+
with the same `Index(text_fields, keyword_fields).fit(docs).search(query)` shape,
|
|
11
|
+
but reimplemented from scratch with no third-party dependencies.
|
|
12
|
+
|
|
13
|
+
## Install
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
pip install zerosearch
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
## Usage
|
|
20
|
+
|
|
21
|
+
```python
|
|
22
|
+
from zerosearch import Index
|
|
23
|
+
|
|
24
|
+
docs = [
|
|
25
|
+
{"id": "1", "title": "Docker compose basics", "text": "how to start services", "course": "de"},
|
|
26
|
+
{"id": "2", "title": "Kafka consumers", "text": "consumer groups explained", "course": "de"},
|
|
27
|
+
]
|
|
28
|
+
|
|
29
|
+
index = Index(
|
|
30
|
+
text_fields=["title", "text"],
|
|
31
|
+
keyword_fields=["id", "course"],
|
|
32
|
+
).fit(docs)
|
|
33
|
+
|
|
34
|
+
results = index.search(
|
|
35
|
+
"how do I start docker compose",
|
|
36
|
+
filter_dict={"course": "de"}, # exact-match keyword filter
|
|
37
|
+
boost_dict={"title": 3.0, "text": 1.0}, # per-field boosts
|
|
38
|
+
num_results=5,
|
|
39
|
+
)
|
|
40
|
+
for r in results:
|
|
41
|
+
print(r["score"], r["title"])
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
Each result is a shallow copy of the original document dict with an added
|
|
45
|
+
`"score"` key.
|
|
46
|
+
|
|
47
|
+
## How it works
|
|
48
|
+
|
|
49
|
+
* **Tokenizer** — lowercased word/number tokens; keeps `+ . # _ -` *inside* a
|
|
50
|
+
token so `c++`, `node.js`, `f-string` survive (a token must start with a
|
|
51
|
+
letter/digit). Drops 1-character tokens and words on a small English stop-word list
|
|
52
|
+
(both overridable).
|
|
53
|
+
* **Inverted index** — built once in `fit()`. A query only scores documents that
|
|
54
|
+
actually contain a query term, so search is fast even on large corpora.
|
|
55
|
+
* **Ranking** — BM25-lite: each query term contributes
|
|
56
|
+
`boost * idf * (term_frequency / sqrt(field_length))` per field. IDF and
|
|
57
|
+
document frequencies are computed over the filtered candidate set.
|
|
58
|
+
|
|
59
|
+
## Customizing
|
|
60
|
+
|
|
61
|
+
```python
|
|
62
|
+
Index(
|
|
63
|
+
text_fields=["title", "text"],
|
|
64
|
+
stop_words={"the", "a", "an"}, # replace the default stop words
|
|
65
|
+
tokenizer=lambda s: s.lower().split(), # or plug in your own tokenizer
|
|
66
|
+
)
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## License
|
|
70
|
+
|
|
71
|
+
WTFPL.
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "zerosearch"
|
|
7
|
+
description = "A tiny, zero-dependency BM25-lite in-memory text search index."
|
|
8
|
+
readme = "README.md"
|
|
9
|
+
license = {text = "WTFPL"}
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
dynamic = ["version"]
|
|
12
|
+
keywords = ["search", "bm25", "tf-idf", "information-retrieval", "minsearch", "zero-dependency"]
|
|
13
|
+
classifiers = [
|
|
14
|
+
"Programming Language :: Python :: 3",
|
|
15
|
+
"Topic :: Text Processing :: Indexing",
|
|
16
|
+
"Intended Audience :: Developers",
|
|
17
|
+
]
|
|
18
|
+
|
|
19
|
+
dependencies = [
|
|
20
|
+
# Intentionally empty: standard library only.
|
|
21
|
+
]
|
|
22
|
+
|
|
23
|
+
authors = [
|
|
24
|
+
{name = "Alexey Grigorev", email = "alexey.s.grigoriev@gmail.com"}
|
|
25
|
+
]
|
|
26
|
+
|
|
27
|
+
[project.urls]
|
|
28
|
+
Homepage = "https://github.com/alexeygrigorev/zerosearch"
|
|
29
|
+
Repository = "https://github.com/alexeygrigorev/zerosearch"
|
|
30
|
+
|
|
31
|
+
[dependency-groups]
|
|
32
|
+
dev = [
|
|
33
|
+
"hatch",
|
|
34
|
+
"pytest",
|
|
35
|
+
"pytest-cov",
|
|
36
|
+
"ruff",
|
|
37
|
+
]
|
|
38
|
+
|
|
39
|
+
[tool.hatch.build.targets.wheel]
|
|
40
|
+
packages = ["zerosearch"]
|
|
41
|
+
|
|
42
|
+
[tool.hatch.version]
|
|
43
|
+
path = "zerosearch/__version__.py"
|
|
44
|
+
|
|
45
|
+
[tool.pytest.ini_options]
|
|
46
|
+
testpaths = ["tests"]
|
|
47
|
+
|
|
48
|
+
[tool.ruff]
|
|
49
|
+
line-length = 100
|
|
50
|
+
target-version = "py310"
|
|
File without changes
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
import math
|
|
2
|
+
|
|
3
|
+
import pytest
|
|
4
|
+
|
|
5
|
+
from zerosearch import DEFAULT_STOP_WORDS, Index, tokenize
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
DOCS = [
|
|
9
|
+
{"id": "1", "title": "Docker compose basics", "text": "how to start services with docker", "course": "de"},
|
|
10
|
+
{"id": "2", "title": "Kafka consumers", "text": "consumer groups explained in kafka", "course": "de"},
|
|
11
|
+
{"id": "3", "title": "Docker networking", "text": "containers talk over a docker network", "course": "mlops"},
|
|
12
|
+
{"id": "4", "title": "Pandas joins", "text": "merge and join dataframes", "course": "ml"},
|
|
13
|
+
]
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def make_index():
|
|
17
|
+
return Index(text_fields=["title", "text"], keyword_fields=["id", "course"]).fit(DOCS)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def test_tokenize_keeps_technical_tokens():
|
|
21
|
+
# Punctuation is kept *inside* a token (a token must start with [a-z0-9]);
|
|
22
|
+
# "and" and "with" are expected to be dropped (presumably as stop words).
|
|
23
|
+
assert tokenize("Node.js and C++ with f-strings") == ["node.js", "c++", "f-strings"]
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def test_tokenize_drops_stopwords_and_single_chars():
|
|
27
|
+
assert "the" not in tokenize("the a docker")
|
|
28
|
+
assert tokenize("a I") == [] # all stop words / single char
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def test_empty_query_returns_nothing():
|
|
32
|
+
assert make_index().search("") == []
|
|
33
|
+
assert make_index().search(" ") == []
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def test_basic_ranking_finds_relevant_doc():
|
|
37
|
+
results = make_index().search("docker compose", num_results=5)
|
|
38
|
+
assert results
|
|
39
|
+
assert results[0]["id"] == "1"
|
|
40
|
+
assert all("score" in r for r in results)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def test_results_are_sorted_by_score_desc():
|
|
44
|
+
results = make_index().search("docker", num_results=5)
|
|
45
|
+
scores = [r["score"] for r in results]
|
|
46
|
+
assert scores == sorted(scores, reverse=True)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def test_keyword_filter_restricts_candidates():
|
|
50
|
+
results = make_index().search("docker", filter_dict={"course": "mlops"}, num_results=5)
|
|
51
|
+
assert [r["id"] for r in results] == ["3"]
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def test_filter_with_no_matches_returns_empty():
|
|
55
|
+
assert make_index().search("docker", filter_dict={"course": "nonexistent"}) == []
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def test_boost_changes_ranking():
|
|
59
|
+
index = make_index()
|
|
60
|
+
# Only doc 2 contains "kafka" (in both title and text), so this checks that a
|
|
61
|
+
# boost_dict is accepted and doc 2 stays on top; it cannot show a reordering.
|
|
62
|
+
results = index.search("kafka", boost_dict={"title": 5.0, "text": 1.0})
|
|
63
|
+
assert results[0]["id"] == "2"
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def test_num_results_caps_output():
|
|
67
|
+
assert len(make_index().search("docker", num_results=1)) == 1
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def test_search_does_not_mutate_source_docs():
|
|
71
|
+
index = make_index()
|
|
72
|
+
index.search("docker")
|
|
73
|
+
assert all("score" not in doc for doc in DOCS)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def test_unknown_term_returns_empty():
|
|
77
|
+
assert make_index().search("zzzznonexistentterm") == []
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def test_custom_stopwords():
|
|
81
|
+
index = Index(text_fields=["text"], stop_words={"docker"}).fit(DOCS)
|
|
82
|
+
# "docker" is now a stop word, so a docker-only query finds nothing.
|
|
83
|
+
assert index.search("docker") == []
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def test_custom_tokenizer():
|
|
87
|
+
index = Index(text_fields=["title"], tokenizer=lambda s: s.lower().split()).fit(DOCS)
|
|
88
|
+
assert index.search("kafka")[0]["id"] == "2"
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def test_idf_is_positive_and_finite():
|
|
92
|
+
index = make_index()
|
|
93
|
+
results = index.search("docker")
|
|
94
|
+
for r in results:
|
|
95
|
+
assert math.isfinite(r["score"]) and r["score"] > 0
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def test_default_stopwords_frozen():
|
|
99
|
+
assert "the" in DEFAULT_STOP_WORDS
|
|
100
|
+
with pytest.raises(AttributeError):
|
|
101
|
+
DEFAULT_STOP_WORDS.add("x") # frozenset has no add
|