zerosearch 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,40 @@
1
# Build the distributions and publish them to PyPI. Runs for every pushed
# "v*" tag and can also be started manually.
name: Publish

on:
  push:
    tags:
      - "v*"
  workflow_dispatch:

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v5
      - uses: astral-sh/setup-uv@v6
      - run: uv build
      # Only meaningful for tag pushes; a manual run has no tag to compare with.
      - name: Verify version matches tag
        if: startsWith(github.ref, 'refs/tags/v')
        run: |
          TAG="${GITHUB_REF#refs/tags/v}"
          # Escape regex metacharacters so the version is matched literally.
          TAG_RE="$(printf '%s' "$TAG" | sed 's/[.+]/\\&/g')"
          # The version must be followed by "-" (wheel tags) or by the sdist
          # suffix. Accepting any "." would let tag v0.1 pass for 0.1.0.
          if ! ls dist/ | grep -qE -- "-${TAG_RE}(-|\.tar\.gz$)"; then
            echo "::error::dist/ contents do not match tag v${TAG}"
            ls dist/
            exit 1
          fi
      - uses: actions/upload-artifact@v5
        with:
          name: dist
          path: dist/*

  publish-pypi:
    needs: build
    runs-on: ubuntu-latest
    steps:
      - uses: actions/download-artifact@v5
        with:
          name: dist
          path: dist
      - uses: pypa/gh-action-pypi-publish@release/v1
        with:
          password: ${{ secrets.PYPI_API_TOKEN }}
@@ -0,0 +1,15 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.egg-info/
5
+ .eggs/
6
+ build/
7
+ dist/
8
+
9
+ # venv / tooling
10
+ .venv/
11
+ .python-version-local
12
+ .pytest_cache/
13
+ .ruff_cache/
14
+ .coverage
15
+ htmlcov/
@@ -0,0 +1 @@
1
+ 3.13
@@ -0,0 +1,26 @@
1
# Developer tasks for zerosearch. Recipes go through uv so they run inside the
# project's managed environment.
.PHONY: test setup shell coverage publish-build publish-clean release

# Run the test suite.
test:
	uv run pytest

# Create or refresh the environment, including the dev dependency group.
setup:
	uv sync --dev

# Open an interactive shell with the project environment active.
# uv has no `shell` subcommand, so start the user's shell through `uv run`.
shell:
	uv run $${SHELL:-sh}

# Run the tests and report uncovered lines.
coverage:
	uv run pytest --cov=zerosearch --cov-report=term-missing

# Build the sdist and wheel into dist/.
publish-build:
	uv run hatch build

# -f keeps this target from failing when dist/ does not exist yet.
publish-clean:
	rm -rf dist/

# Release: tag the current version and push to trigger CI publish.
# Abort when the version cannot be read, otherwise a bare "v" tag would be
# created and pushed; only push when tagging succeeded.
release:
	@VERSION=$$(grep -E "^__version__" zerosearch/__version__.py | sed -E "s/.*['\"]([^'\"]+)['\"].*/\1/"); \
	if [ -z "$$VERSION" ]; then \
		echo "Could not read __version__ from zerosearch/__version__.py" >&2; \
		exit 1; \
	fi; \
	echo "Releasing v$$VERSION"; \
	git tag "v$$VERSION" && \
	git push origin "v$$VERSION"
@@ -0,0 +1,86 @@
1
+ Metadata-Version: 2.4
2
+ Name: zerosearch
3
+ Version: 0.1.0
4
+ Summary: A tiny, zero-dependency BM25-lite in-memory text search index.
5
+ Project-URL: Homepage, https://github.com/alexeygrigorev/zerosearch
6
+ Project-URL: Repository, https://github.com/alexeygrigorev/zerosearch
7
+ Author-email: Alexey Grigorev <alexey.s.grigoriev@gmail.com>
8
+ License: WTFPL
9
+ Keywords: bm25,information-retrieval,minsearch,search,tf-idf,zero-dependency
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Topic :: Text Processing :: Indexing
13
+ Requires-Python: >=3.10
14
+ Description-Content-Type: text/markdown
15
+
16
+ # zerosearch
17
+
18
+ A tiny, **zero-dependency** BM25-lite in-memory text search index — standard
19
+ library only, a single small module, and good enough to power retrieval for a
20
+ RAG pipeline. Designed to run anywhere Python runs, including constrained
21
+ environments like Cloudflare Python Workers (Pyodide) where pulling in
22
+ `scikit-learn`/`numpy` is not an option.
23
+
24
+ It is a spiritual cousin of [`minsearch`](https://github.com/alexeygrigorev/minsearch),
25
+ with the same `Index(text_fields, keyword_fields).fit(docs).search(query)` shape,
26
+ but reimplemented from scratch with no third-party dependencies.
27
+
28
+ ## Install
29
+
30
+ ```bash
31
+ pip install zerosearch
32
+ ```
33
+
34
+ ## Usage
35
+
36
+ ```python
37
+ from zerosearch import Index
38
+
39
+ docs = [
40
+ {"id": "1", "title": "Docker compose basics", "text": "how to start services", "course": "de"},
41
+ {"id": "2", "title": "Kafka consumers", "text": "consumer groups explained", "course": "de"},
42
+ ]
43
+
44
+ index = Index(
45
+ text_fields=["title", "text"],
46
+ keyword_fields=["id", "course"],
47
+ ).fit(docs)
48
+
49
+ results = index.search(
50
+ "how do I start docker compose",
51
+ filter_dict={"course": "de"}, # exact-match keyword filter
52
+ boost_dict={"title": 3.0, "text": 1.0}, # per-field boosts
53
+ num_results=5,
54
+ )
55
+ for r in results:
56
+ print(r["score"], r["title"])
57
+ ```
58
+
59
+ Each result is a shallow copy of the original document dict with an added
60
+ `"score"` key.
61
+
62
+ ## How it works
63
+
64
+ * **Tokenizer** — lowercased word/number tokens; keeps `+ . # _ -` *inside* a
65
+ token so `c++`, `node.js`, `f-string` survive (a token must start with a
66
+ letter/digit). Drops 1-character tokens and a small English stop-word list
67
+ (both overridable).
68
+ * **Inverted index** — built once in `fit()`. A query only scores documents that
69
+ actually contain a query term, so search is fast even on large corpora.
70
+ * **Ranking** — BM25-lite: each query term contributes
71
+ `boost * idf * (term_frequency / sqrt(field_length))` per field. IDF and
72
+ document frequencies are computed over the filtered candidate set.
73
+
74
+ ## Customizing
75
+
76
+ ```python
77
+ Index(
78
+ text_fields=["title", "text"],
79
+ stop_words={"the", "a", "an"}, # replace the default stop words
80
+ tokenizer=lambda s: s.lower().split(), # or plug in your own tokenizer
81
+ )
82
+ ```
83
+
84
+ ## License
85
+
86
+ WTFPL.
@@ -0,0 +1,71 @@
1
+ # zerosearch
2
+
3
+ A tiny, **zero-dependency** BM25-lite in-memory text search index — standard
4
+ library only, a single small module, and good enough to power retrieval for a
5
+ RAG pipeline. Designed to run anywhere Python runs, including constrained
6
+ environments like Cloudflare Python Workers (Pyodide) where pulling in
7
+ `scikit-learn`/`numpy` is not an option.
8
+
9
+ It is a spiritual cousin of [`minsearch`](https://github.com/alexeygrigorev/minsearch),
10
+ with the same `Index(text_fields, keyword_fields).fit(docs).search(query)` shape,
11
+ but reimplemented from scratch with no third-party dependencies.
12
+
13
+ ## Install
14
+
15
+ ```bash
16
+ pip install zerosearch
17
+ ```
18
+
19
+ ## Usage
20
+
21
+ ```python
22
+ from zerosearch import Index
23
+
24
+ docs = [
25
+ {"id": "1", "title": "Docker compose basics", "text": "how to start services", "course": "de"},
26
+ {"id": "2", "title": "Kafka consumers", "text": "consumer groups explained", "course": "de"},
27
+ ]
28
+
29
+ index = Index(
30
+ text_fields=["title", "text"],
31
+ keyword_fields=["id", "course"],
32
+ ).fit(docs)
33
+
34
+ results = index.search(
35
+ "how do I start docker compose",
36
+ filter_dict={"course": "de"}, # exact-match keyword filter
37
+ boost_dict={"title": 3.0, "text": 1.0}, # per-field boosts
38
+ num_results=5,
39
+ )
40
+ for r in results:
41
+ print(r["score"], r["title"])
42
+ ```
43
+
44
+ Each result is a shallow copy of the original document dict with an added
45
+ `"score"` key.
46
+
47
+ ## How it works
48
+
49
+ * **Tokenizer** — lowercased word/number tokens; keeps `+ . # _ -` *inside* a
50
+ token so `c++`, `node.js`, `f-string` survive (a token must start with a
51
+ letter/digit). Drops 1-character tokens and a small English stop-word list
52
+ (both overridable).
53
+ * **Inverted index** — built once in `fit()`. A query only scores documents that
54
+ actually contain a query term, so search is fast even on large corpora.
55
+ * **Ranking** — BM25-lite: each query term contributes
56
+ `boost * idf * (term_frequency / sqrt(field_length))` per field. IDF and
57
+ document frequencies are computed over the filtered candidate set.
58
+
59
+ ## Customizing
60
+
61
+ ```python
62
+ Index(
63
+ text_fields=["title", "text"],
64
+ stop_words={"the", "a", "an"}, # replace the default stop words
65
+ tokenizer=lambda s: s.lower().split(), # or plug in your own tokenizer
66
+ )
67
+ ```
68
+
69
+ ## License
70
+
71
+ WTFPL.
@@ -0,0 +1,50 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "zerosearch"
7
+ description = "A tiny, zero-dependency BM25-lite in-memory text search index."
8
+ readme = "README.md"
9
+ license = {text = "WTFPL"}
10
+ requires-python = ">=3.10"
11
+ dynamic = ["version"]
12
+ keywords = ["search", "bm25", "tf-idf", "information-retrieval", "minsearch", "zero-dependency"]
13
+ classifiers = [
14
+ "Programming Language :: Python :: 3",
15
+ "Topic :: Text Processing :: Indexing",
16
+ "Intended Audience :: Developers",
17
+ ]
18
+
19
+ dependencies = [
20
+ # Intentionally empty: standard library only.
21
+ ]
22
+
23
+ authors = [
24
+ {name = "Alexey Grigorev", email = "alexey.s.grigoriev@gmail.com"}
25
+ ]
26
+
27
+ [project.urls]
28
+ Homepage = "https://github.com/alexeygrigorev/zerosearch"
29
+ Repository = "https://github.com/alexeygrigorev/zerosearch"
30
+
31
+ [dependency-groups]
32
+ dev = [
33
+ "hatch",
34
+ "pytest",
35
+ "pytest-cov",
36
+ "ruff",
37
+ ]
38
+
39
+ [tool.hatch.build.targets.wheel]
40
+ packages = ["zerosearch"]
41
+
42
+ [tool.hatch.version]
43
+ path = "zerosearch/__version__.py"
44
+
45
+ [tool.pytest.ini_options]
46
+ testpaths = ["tests"]
47
+
48
+ [tool.ruff]
49
+ line-length = 100
50
+ target-version = "py310"
File without changes
@@ -0,0 +1,101 @@
1
+ import math
2
+
3
+ import pytest
4
+
5
+ from zerosearch import DEFAULT_STOP_WORDS, Index, tokenize
6
+
7
+
8
# Shared fixture corpus: four small documents spread over three courses.
DOCS = [
    {
        "id": "1",
        "title": "Docker compose basics",
        "text": "how to start services with docker",
        "course": "de",
    },
    {
        "id": "2",
        "title": "Kafka consumers",
        "text": "consumer groups explained in kafka",
        "course": "de",
    },
    {
        "id": "3",
        "title": "Docker networking",
        "text": "containers talk over a docker network",
        "course": "mlops",
    },
    {
        "id": "4",
        "title": "Pandas joins",
        "text": "merge and join dataframes",
        "course": "ml",
    },
]
14
+
15
+
16
def make_index():
    """Return an index fitted on DOCS with title/text search and id/course filters."""
    unfitted = Index(
        text_fields=["title", "text"],
        keyword_fields=["id", "course"],
    )
    return unfitted.fit(DOCS)
18
+
19
+
20
def test_tokenize_keeps_technical_tokens():
    """Technical names such as "node.js" and "c++" survive tokenization whole."""
    # Punctuation is kept *inside* a token, and the output is lowercased;
    # "and" / "with" do not appear in the expected tokens.
    expected = ["node.js", "c++", "f-strings"]
    actual = tokenize("Node.js and C++ with f-strings")
    assert actual == expected
24
+
25
+
26
def test_tokenize_drops_stopwords_and_single_chars():
    """Stop words and one-character tokens never reach the output."""
    tokens = tokenize("the a docker")
    assert "the" not in tokens
    # Every token here is a stop word or a single character, so nothing is left.
    leftovers = tokenize("a I")
    assert leftovers == []
29
+
30
+
31
def test_empty_query_returns_nothing():
    """An empty or whitespace-only query yields no results."""
    for query in ("", " "):
        # A fresh index per query, as each case is independent.
        hits = make_index().search(query)
        assert hits == []
34
+
35
+
36
def test_basic_ranking_finds_relevant_doc():
    """Doc 1, whose title contains both query terms, ranks first."""
    hits = make_index().search("docker compose", num_results=5)
    assert hits
    best = hits[0]
    assert best["id"] == "1"
    # Every returned document carries its score.
    for hit in hits:
        assert "score" in hit
41
+
42
+
43
def test_results_are_sorted_by_score_desc():
    """Results come back ordered from highest to lowest score."""
    results = make_index().search("docker", num_results=5)
    scores = [r["score"] for r in results]
    # "docker" occurs in docs 1 and 3. With fewer than two hits the ordering
    # check below would pass without verifying anything.
    assert len(scores) >= 2
    assert scores == sorted(scores, reverse=True)
47
+
48
+
49
def test_keyword_filter_restricts_candidates():
    """Only documents matching the keyword filter can be returned."""
    hits = make_index().search(
        "docker",
        filter_dict={"course": "mlops"},
        num_results=5,
    )
    # Docs 1 and 3 both mention docker, but only doc 3 is in the mlops course.
    matched_ids = [hit["id"] for hit in hits]
    assert matched_ids == ["3"]
52
+
53
+
54
def test_filter_with_no_matches_returns_empty():
    """A filter value that no document has yields an empty result list."""
    no_such_course = {"course": "nonexistent"}
    hits = make_index().search("docker", filter_dict=no_such_course)
    assert hits == []
56
+
57
+
58
def test_boost_changes_ranking():
    """A larger title boost keeps doc 2 on top and raises its score."""
    index = make_index()
    # "kafka" appears in both the title and the text of doc 2, and nowhere else.
    neutral = index.search("kafka", boost_dict={"title": 1.0, "text": 1.0})
    boosted = index.search("kafka", boost_dict={"title": 5.0, "text": 1.0})
    assert boosted[0]["id"] == "2"
    # Doc 2 is the sole match, so the top hit alone cannot show that the boost
    # was applied; the title contribution must also lift the score.
    assert boosted[0]["score"] > neutral[0]["score"]
64
+
65
+
66
def test_num_results_caps_output():
    """num_results limits how many hits are returned."""
    hits = make_index().search("docker", num_results=1)
    assert len(hits) == 1
68
+
69
+
70
def test_search_does_not_mutate_source_docs():
    """Searching must not write a "score" key back into the source dicts."""
    make_index().search("docker")
    for doc in DOCS:
        assert "score" not in doc
74
+
75
+
76
def test_unknown_term_returns_empty():
    """A term that occurs in no document produces no results."""
    hits = make_index().search("zzzznonexistentterm")
    assert hits == []
78
+
79
+
80
def test_custom_stopwords():
    """A caller-supplied stop-word set replaces the default one."""
    unfitted = Index(text_fields=["text"], stop_words={"docker"})
    index = unfitted.fit(DOCS)
    # "docker" is now a stop word, so a docker-only query finds nothing.
    hits = index.search("docker")
    assert hits == []
84
+
85
+
86
def test_custom_tokenizer():
    """A user-provided tokenizer callable is accepted and still finds doc 2."""

    def split_on_whitespace(s):
        return s.lower().split()

    unfitted = Index(text_fields=["title"], tokenizer=split_on_whitespace)
    index = unfitted.fit(DOCS)
    top_hit = index.search("kafka")[0]
    assert top_hit["id"] == "2"
89
+
90
+
91
def test_idf_is_positive_and_finite():
    """Every returned score is a finite, strictly positive number."""
    index = make_index()
    results = index.search("docker")
    # Guard against a vacuous pass: the loop below checks nothing when the
    # result list is empty.
    assert results
    for r in results:
        assert math.isfinite(r["score"])
        assert r["score"] > 0
96
+
97
+
98
def test_default_stopwords_frozen():
    """The default stop-word collection contains "the" and cannot be added to."""
    assert "the" in DEFAULT_STOP_WORDS
    # frozenset has no add(), so the attribute lookup itself raises.
    with pytest.raises(AttributeError):
        DEFAULT_STOP_WORDS.add("x")