zerosearch 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- zerosearch/__init__.py +6 -0
- zerosearch/__version__.py +1 -0
- zerosearch/index.py +196 -0
- zerosearch-0.1.0.dist-info/METADATA +86 -0
- zerosearch-0.1.0.dist-info/RECORD +6 -0
- zerosearch-0.1.0.dist-info/WHEEL +4 -0
zerosearch/__init__.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
"""zerosearch: a tiny, zero-dependency BM25-lite in-memory search index."""
|
|
2
|
+
|
|
3
|
+
# Re-export the public API so callers can write ``from zerosearch import Index``.
from zerosearch.__version__ import __version__
from zerosearch.index import DEFAULT_STOP_WORDS, TOKEN_RE, Index, tokenize

# Names exposed by ``from zerosearch import *``.
__all__ = ["Index", "tokenize", "DEFAULT_STOP_WORDS", "TOKEN_RE", "__version__"]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Package version string; imported and re-exported by ``zerosearch/__init__.py``.
__version__ = "0.1.0"
|
zerosearch/index.py
ADDED
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
"""A tiny, zero-dependency BM25-lite search index.
|
|
2
|
+
|
|
3
|
+
The whole engine is standard-library only. Documents are plain dicts. Text
|
|
4
|
+
fields are tokenized once when the index is built and kept as an inverted index,
|
|
5
|
+
so a query only scores the documents that actually contain a query term.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import math
|
|
11
|
+
import re
|
|
12
|
+
from collections import Counter
|
|
13
|
+
from typing import Any, Callable, Iterable
|
|
14
|
+
|
|
15
|
+
# Public names of this module (what ``from zerosearch.index import *`` exposes).
__all__ = ["Index", "tokenize", "DEFAULT_STOP_WORDS", "TOKEN_RE"]
|
|
16
|
+
|
|
17
|
+
# A token starts with a letter or digit and may contain ``_ + . # -`` after
# that, but a multi-character token must END on a letter, digit, ``+`` or
# ``#``. Without that last rule, sentence punctuation sticks to the word
# ("I use node.js." would yield ``node.js.``) and the document can no longer
# be found by the clean term. ``+`` and ``#`` may still end a token so that
# ``c++`` and ``c#`` survive.
TOKEN_RE = re.compile(r"[a-z0-9](?:[a-z0-9_+.#-]*[a-z0-9+#])?", re.IGNORECASE)

# Small English stop-word list used when the caller does not supply one.
DEFAULT_STOP_WORDS: frozenset[str] = frozenset(
    {
        "a", "an", "and", "are", "as", "at", "be", "by", "can", "for", "from",
        "how", "i", "in", "is", "it", "of", "on", "or", "the", "to", "with",
    }
)

# Signature of a pluggable tokenizer: takes raw text, returns a list of tokens.
Tokenizer = Callable[[str], list]


def tokenize(text: str, stop_words: Iterable[str] = DEFAULT_STOP_WORDS) -> list[str]:
    """Lowercase word/number tokens, dropping 1-char tokens and stop words.

    The token pattern keeps ``+ . # _ -`` inside a token so technical terms such
    as ``c++``, ``node.js`` and ``f-string`` survive intact (a token must start
    with a letter or digit, so a leading ``.`` in ``.env`` is dropped). Trailing
    ``.``, ``-`` and ``_`` are not part of a token, so ``"docker."`` at the end
    of a sentence yields ``docker``.

    Parameters
    ----------
    text:
        The raw text to split.
    stop_words:
        Lowercase tokens to discard. Any iterable is accepted; a ``set`` or
        ``frozenset`` is used as-is, anything else is converted to a set once.

    Returns
    -------
    list[str]
        Lowercased tokens in order of appearance, duplicates preserved.
    """
    # Avoid copying when the caller already passed a hash-based container.
    stops = stop_words if isinstance(stop_words, (set, frozenset)) else set(stop_words)
    tokens = (match.group(0).lower() for match in TOKEN_RE.finditer(text))
    return [token for token in tokens if len(token) > 1 and token not in stops]
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class Index:
    """In-memory search over a fixed list of documents.

    Parameters
    ----------
    text_fields:
        Document fields that are tokenized and ranked.
    keyword_fields:
        Document fields used for exact-match filtering (not ranked).
    stop_words:
        Tokens to ignore. Defaults to :data:`DEFAULT_STOP_WORDS`. Only the
        default tokenizer uses them; a custom ``tokenizer`` is called as-is.
    tokenizer:
        Optional ``str -> list[str]`` override. Defaults to :func:`tokenize`.

    Ranking is BM25-lite: each query term contributes
    ``boost * idf * (term_frequency / sqrt(field_length))`` per field, where the
    IDF and document frequencies are computed over the filtered candidate set.
    """

    def __init__(
        self,
        text_fields: list[str],
        keyword_fields: list[str] | None = None,
        *,
        stop_words: Iterable[str] = DEFAULT_STOP_WORDS,
        tokenizer: Tokenizer | None = None,
    ) -> None:
        self.text_fields = list(text_fields)
        self.keyword_fields = list(keyword_fields or [])
        self._stop_words = frozenset(stop_words)
        self._tokenize: Tokenizer = tokenizer or (lambda text: tokenize(text, self._stop_words))
        # Documents in the order given to ``fit``; the list position is the doc id.
        self.docs: list[dict[str, Any]] = []
        # Per document: text field -> Counter mapping token -> occurrences.
        self._field_counts: list[dict[str, Counter]] = []
        # Per document: text field -> total number of tokens in that field.
        self._field_lengths: list[dict[str, int]] = []
        # Token -> ids of the documents that contain it in any text field.
        self._postings: dict[str, set[int]] = {}
        # Keyword field -> stringified value -> ids of the documents holding it.
        self._keyword_index: dict[str, dict[str, set[int]]] = {}

    def fit(self, docs: list[dict[str, Any]]) -> "Index":
        """Build the inverted index from ``docs``. Returns ``self``.

        Any previously indexed documents are discarded. Text field values are
        converted with ``str()`` before tokenizing; a missing or ``None`` value
        counts as empty text. Keyword field values are stored under their
        ``str()`` form, with a missing field stored under ``""``.
        """
        self.docs = list(docs)
        self._field_counts = []
        self._field_lengths = []
        self._postings = {}
        self._keyword_index = {field: {} for field in self.keyword_fields}

        for doc_id, doc in enumerate(self.docs):
            counts: dict[str, Counter] = {}
            lengths: dict[str, int] = {}
            doc_terms: set[str] = set()
            for field in self.text_fields:
                raw = doc.get(field)
                # ``str(None)`` is "None", which would index the literal word
                # "none" for every document with a null field.
                text = "" if raw is None else str(raw)
                field_counts = Counter(self._tokenize(text))
                counts[field] = field_counts
                lengths[field] = sum(field_counts.values())
                doc_terms.update(field_counts)
            self._field_counts.append(counts)
            self._field_lengths.append(lengths)
            for term in doc_terms:
                self._postings.setdefault(term, set()).add(doc_id)

            for field in self.keyword_fields:
                value = str(doc.get(field, ""))
                self._keyword_index[field].setdefault(value, set()).add(doc_id)

        return self

    def search(
        self,
        query: str,
        filter_dict: dict[str, str] | None = None,
        boost_dict: dict[str, float] | None = None,
        num_results: int = 10,
    ) -> list[dict[str, Any]]:
        """Return up to ``num_results`` docs (copies, with a ``"score"`` key).

        Parameters
        ----------
        query:
            Free text; tokenized with the same tokenizer used by ``fit``.
        filter_dict:
            ``keyword field -> value`` pairs; a document must match all of them
            exactly (values are compared by their ``str()`` form).
        boost_dict:
            ``text field -> multiplier``; fields not listed use ``1.0``.
        num_results:
            Maximum number of results. Zero or a negative number yields ``[]``.

        Returns
        -------
        list[dict]
            Shallow copies of the matching documents, highest score first. Ties
            keep ascending document order. Only documents with a positive score
            are returned.
        """
        # A negative bound would otherwise slice "all but the last n" results.
        # ``None`` is left alone so callers relying on plain slice semantics
        # (no limit) keep working.
        if num_results is not None and num_results <= 0:
            return []

        query_terms = self._tokenize(query)
        if not query_terms:
            return []

        filter_dict = filter_dict or {}
        boost_dict = boost_dict or {}

        candidates = self._candidate_ids(filter_dict)
        if candidates is not None and not candidates:
            return []

        # Corpus size for the IDF is the filtered set, not the whole index.
        document_count = len(self.docs) if candidates is None else len(candidates)
        term_postings: dict[str, set[int]] = {}
        document_frequencies: dict[str, int] = {}
        docs_to_score: set[int] = set()
        for term in set(query_terms):
            postings = self._postings.get(term)
            if not postings:
                continue
            matched = postings if candidates is None else (postings & candidates)
            if not matched:
                continue
            term_postings[term] = matched
            document_frequencies[term] = len(matched)
            docs_to_score |= matched

        if not docs_to_score:
            return []

        idf = {
            term: math.log(1 + (document_count - df + 0.5) / (df + 0.5))
            for term, df in document_frequencies.items()
        }

        scored = []
        # Iterate in ascending doc id so the stable sort below breaks ties
        # deterministically.
        for doc_id in sorted(docs_to_score):
            score = self._score(doc_id, query_terms, term_postings, idf, boost_dict)
            if score > 0:
                record = dict(self.docs[doc_id])
                record["score"] = score
                scored.append(record)

        scored.sort(key=lambda record: float(record["score"]), reverse=True)
        return scored[:num_results]

    def _candidate_ids(self, filter_dict: dict[str, str]) -> set[int] | None:
        """Intersect keyword indexes for each filter. ``None`` means "all docs".

        A filter on a field that is not indexed as a keyword field, or on a
        value no document has, matches nothing and yields an empty set.
        """
        if not filter_dict:
            return None
        candidates: set[int] | None = None
        for field, value in filter_dict.items():
            matched = self._keyword_index.get(field, {}).get(str(value), set())
            # Copy on first use so the stored index set is never handed out.
            candidates = set(matched) if candidates is None else (candidates & matched)
            if not candidates:
                return set()
        return candidates

    def _score(
        self,
        doc_id: int,
        query_terms: list[str],
        term_postings: dict[str, set[int]],
        idf: dict[str, float],
        boost_dict: dict[str, float],
    ) -> float:
        """Sum ``boost * idf * tf / sqrt(field_length)`` over fields and terms.

        ``query_terms`` is the raw token list, so a term repeated in the query
        contributes once per repetition. Terms absent from ``term_postings``
        for this document are skipped.
        """
        counts = self._field_counts[doc_id]
        lengths = self._field_lengths[doc_id]
        score = 0.0
        for field in self.text_fields:
            field_length = lengths.get(field, 0)
            if not field_length:
                # Empty field: nothing to match and no length to normalise by.
                continue
            field_counts = counts[field]
            boost = float(boost_dict.get(field, 1.0))
            norm = math.sqrt(field_length)
            for term in query_terms:
                if doc_id not in term_postings.get(term, ()):
                    continue
                term_frequency = field_counts.get(term, 0)
                if term_frequency == 0:
                    continue
                score += boost * idf[term] * (term_frequency / norm)
        return score
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: zerosearch
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A tiny, zero-dependency BM25-lite in-memory text search index.
|
|
5
|
+
Project-URL: Homepage, https://github.com/alexeygrigorev/zerosearch
|
|
6
|
+
Project-URL: Repository, https://github.com/alexeygrigorev/zerosearch
|
|
7
|
+
Author-email: Alexey Grigorev <alexey.s.grigoriev@gmail.com>
|
|
8
|
+
License: WTFPL
|
|
9
|
+
Keywords: bm25,information-retrieval,minsearch,search,tf-idf,zero-dependency
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Topic :: Text Processing :: Indexing
|
|
13
|
+
Requires-Python: >=3.10
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
|
|
16
|
+
# zerosearch
|
|
17
|
+
|
|
18
|
+
A tiny, **zero-dependency** BM25-lite in-memory text search index — standard
|
|
19
|
+
library only, a single small module, and good enough to power retrieval for a
|
|
20
|
+
RAG pipeline. Designed to run anywhere Python runs, including constrained
|
|
21
|
+
environments like Cloudflare Python Workers (Pyodide) where pulling in
|
|
22
|
+
`scikit-learn`/`numpy` is not an option.
|
|
23
|
+
|
|
24
|
+
It is a spiritual cousin of [`minsearch`](https://github.com/alexeygrigorev/minsearch),
|
|
25
|
+
with the same `Index(text_fields, keyword_fields).fit(docs).search(query)` shape,
|
|
26
|
+
but reimplemented from scratch with no third-party dependencies.
|
|
27
|
+
|
|
28
|
+
## Install
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
pip install zerosearch
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
## Usage
|
|
35
|
+
|
|
36
|
+
```python
|
|
37
|
+
from zerosearch import Index
|
|
38
|
+
|
|
39
|
+
docs = [
|
|
40
|
+
{"id": "1", "title": "Docker compose basics", "text": "how to start services", "course": "de"},
|
|
41
|
+
{"id": "2", "title": "Kafka consumers", "text": "consumer groups explained", "course": "de"},
|
|
42
|
+
]
|
|
43
|
+
|
|
44
|
+
index = Index(
|
|
45
|
+
text_fields=["title", "text"],
|
|
46
|
+
keyword_fields=["id", "course"],
|
|
47
|
+
).fit(docs)
|
|
48
|
+
|
|
49
|
+
results = index.search(
|
|
50
|
+
"how do I start docker compose",
|
|
51
|
+
filter_dict={"course": "de"}, # exact-match keyword filter
|
|
52
|
+
boost_dict={"title": 3.0, "text": 1.0}, # per-field boosts
|
|
53
|
+
num_results=5,
|
|
54
|
+
)
|
|
55
|
+
for r in results:
|
|
56
|
+
print(r["score"], r["title"])
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
Each result is a shallow copy of the original document dict with an added
|
|
60
|
+
`"score"` key.
|
|
61
|
+
|
|
62
|
+
## How it works
|
|
63
|
+
|
|
64
|
+
* **Tokenizer** — lowercased word/number tokens; keeps `+ . # _ -` *inside* a
|
|
65
|
+
token so `c++`, `node.js`, `f-string` survive (a token must start with a
|
|
66
|
+
letter/digit). Drops 1-character tokens and a small English stop-word list
|
|
67
|
+
(the stop-word list is replaceable via `stop_words`; both rules can be replaced by passing a custom `tokenizer`).
|
|
68
|
+
* **Inverted index** — built once in `fit()`. A query only scores documents that
|
|
69
|
+
actually contain a query term, so search is fast even on large corpora.
|
|
70
|
+
* **Ranking** — BM25-lite: each query term contributes
|
|
71
|
+
`boost * idf * (term_frequency / sqrt(field_length))` per field. IDF and
|
|
72
|
+
document frequencies are computed over the filtered candidate set.
|
|
73
|
+
|
|
74
|
+
## Customizing
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
Index(
|
|
78
|
+
text_fields=["title", "text"],
|
|
79
|
+
stop_words={"the", "a", "an"}, # replace the default stop words
|
|
80
|
+
tokenizer=lambda s: s.lower().split(), # or plug in your own tokenizer
|
|
81
|
+
)
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
## License
|
|
85
|
+
|
|
86
|
+
WTFPL.
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
zerosearch/__init__.py,sha256=_4FimoYYVpwKajCwwCwnEF5hTM2__Em0jrhx_vZDlPg,281
|
|
2
|
+
zerosearch/__version__.py,sha256=kUR5RAFc7HCeiqdlX36dZOHkUI5wI6V_43RpEcD8b-0,22
|
|
3
|
+
zerosearch/index.py,sha256=EfxszEsyg3FbPGX2FXy17CzAAsgmcMgjqCNnF3LUcLY,7421
|
|
4
|
+
zerosearch-0.1.0.dist-info/METADATA,sha256=efhn5CNOTCjVZ_jYE6ncs8z_Nrq8aQqKGkSFdHVCqhM,2862
|
|
5
|
+
zerosearch-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
6
|
+
zerosearch-0.1.0.dist-info/RECORD,,
|