zerosearch 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
zerosearch/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ """zerosearch: a tiny, zero-dependency BM25-lite in-memory search index."""
2
+
3
+ from zerosearch.__version__ import __version__
4
+ from zerosearch.index import DEFAULT_STOP_WORDS, TOKEN_RE, Index, tokenize
5
+
6
# Names re-exported from the submodules as the package-level public API.
__all__ = ["Index", "tokenize", "DEFAULT_STOP_WORDS", "TOKEN_RE", "__version__"]
@@ -0,0 +1 @@
1
# Package version string; re-exported as ``zerosearch.__version__``.
__version__ = "0.1.0"
zerosearch/index.py ADDED
@@ -0,0 +1,196 @@
1
+ """A tiny, zero-dependency BM25-lite search index.
2
+
3
+ The whole engine is standard-library only. Documents are plain dicts. Text
4
+ fields are tokenized once when the index is built and kept as an inverted index,
5
+ so a query only scores the documents that actually contain a query term.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import math
11
+ import re
12
+ from collections import Counter
13
+ from typing import Any, Callable, Iterable
14
+
15
+ __all__ = ["Index", "tokenize", "DEFAULT_STOP_WORDS", "TOKEN_RE"]
16
+
17
+ TOKEN_RE = re.compile(r"[a-z0-9][a-z0-9_+.#-]*", re.IGNORECASE)
18
+
19
+ DEFAULT_STOP_WORDS: frozenset[str] = frozenset(
20
+ {
21
+ "a", "an", "and", "are", "as", "at", "be", "by", "can", "for", "from",
22
+ "how", "i", "in", "is", "it", "of", "on", "or", "the", "to", "with",
23
+ }
24
+ )
25
+
26
+ Tokenizer = Callable[[str], list]
27
+
28
+
29
+ def tokenize(text: str, stop_words: Iterable[str] = DEFAULT_STOP_WORDS) -> list[str]:
30
+ """Lowercase word/number tokens, dropping 1-char tokens and stop words.
31
+
32
+ The token pattern keeps ``+ . # _ -`` inside a token so technical terms such
33
+ as ``c++``, ``node.js`` and ``f-string`` survive intact (a token must start
34
+ with a letter or digit, so a leading ``.`` in ``.env`` is dropped).
35
+ """
36
+ stops = stop_words if isinstance(stop_words, (set, frozenset)) else set(stop_words)
37
+ tokens = (match.group(0).lower() for match in TOKEN_RE.finditer(text))
38
+ return [token for token in tokens if len(token) > 1 and token not in stops]
39
+
40
+
41
class Index:
    """In-memory search over a fixed list of documents.

    Parameters
    ----------
    text_fields:
        Document fields that are tokenized and ranked.
    keyword_fields:
        Document fields used for exact-match filtering (not ranked).
    stop_words:
        Tokens to ignore. Defaults to :data:`DEFAULT_STOP_WORDS`. Only used by
        the default tokenizer; it has no effect when ``tokenizer`` is given.
    tokenizer:
        Optional ``str -> list[str]`` override. Defaults to :func:`tokenize`
        called with ``stop_words``.

    Ranking is BM25-lite: each query term contributes
    ``boost * idf * (term_frequency / sqrt(field_length))`` per field, where the
    IDF and document frequencies are computed over the filtered candidate set.
    """

    def __init__(
        self,
        text_fields: list[str],
        keyword_fields: list[str] | None = None,
        *,
        stop_words: Iterable[str] = DEFAULT_STOP_WORDS,
        tokenizer: Tokenizer | None = None,
    ) -> None:
        """Store the configuration; no documents are indexed until ``fit``."""
        self.text_fields = list(text_fields)
        self.keyword_fields = list(keyword_fields or [])
        self._stop_words = frozenset(stop_words)
        # A user-supplied tokenizer replaces the default one entirely, so the
        # stop-word set is only wired into the fallback.
        self._tokenize: Tokenizer = tokenizer or (lambda text: tokenize(text, self._stop_words))
        self.docs: list[dict[str, Any]] = []
        # Per document: field name -> Counter of token occurrences.
        self._field_counts: list[dict[str, Counter]] = []
        # Per document: field name -> total number of tokens in that field.
        self._field_lengths: list[dict[str, int]] = []
        # Token -> ids of the documents containing it in any text field.
        self._postings: dict[str, set[int]] = {}
        # Keyword field -> stringified value -> ids of documents with that value.
        self._keyword_index: dict[str, dict[str, set[int]]] = {}

    def fit(self, docs: list[dict[str, Any]]) -> "Index":
        """Build the inverted index from ``docs``. Returns ``self``.

        Any previously indexed documents are discarded. Text field values are
        converted with ``str()`` before tokenizing; a missing or ``None`` value
        counts as empty text. Keyword field values are stored under their
        ``str()`` form (a missing field is stored as ``""``).
        """
        self.docs = list(docs)
        self._field_counts = []
        self._field_lengths = []
        self._postings = {}
        self._keyword_index = {field: {} for field in self.keyword_fields}

        for doc_id, doc in enumerate(self.docs):
            counts: dict[str, Counter] = {}
            lengths: dict[str, int] = {}
            doc_terms: set[str] = set()
            for field in self.text_fields:
                raw = doc.get(field)
                # str(None) is "None", which would index the bogus token
                # "none" for every document with an explicit null field.
                text = "" if raw is None else str(raw)
                field_counts = Counter(self._tokenize(text))
                counts[field] = field_counts
                lengths[field] = sum(field_counts.values())
                doc_terms.update(field_counts)
            self._field_counts.append(counts)
            self._field_lengths.append(lengths)
            for term in doc_terms:
                self._postings.setdefault(term, set()).add(doc_id)

            for field in self.keyword_fields:
                value = str(doc.get(field, ""))
                self._keyword_index[field].setdefault(value, set()).add(doc_id)

        return self

    def search(
        self,
        query: str,
        filter_dict: dict[str, str] | None = None,
        boost_dict: dict[str, float] | None = None,
        num_results: int = 10,
    ) -> list[dict[str, Any]]:
        """Return up to ``num_results`` docs (copies, with a ``"score"`` key).

        Parameters
        ----------
        query:
            Free text; tokenized with the index's tokenizer.
        filter_dict:
            ``field -> value`` exact-match filters, all of which must hold.
            Values are compared by their ``str()`` form. Filtering on a field
            that is not one of ``keyword_fields`` matches nothing.
        boost_dict:
            ``text field -> multiplier``; fields not listed use ``1.0``.
        num_results:
            Maximum number of results. A negative value is treated as ``0``.

        Returns
        -------
        list[dict]
            Shallow copies of the matching documents with an added ``"score"``
            entry, highest score first. Documents with equal scores keep the
            order in which they were passed to ``fit``. Empty when the query
            has no usable tokens or nothing matches.
        """
        # Materialize once: the terms are iterated several times below, and a
        # custom tokenizer may hand back a one-shot iterable.
        query_terms = list(self._tokenize(query))
        if not query_terms:
            return []

        filter_dict = filter_dict or {}
        boost_dict = boost_dict or {}

        candidates = self._candidate_ids(filter_dict)
        if candidates is not None and not candidates:
            return []

        # IDF statistics are relative to the filtered subset, not the corpus.
        document_count = len(self.docs) if candidates is None else len(candidates)
        term_postings: dict[str, set[int]] = {}
        document_frequencies: dict[str, int] = {}
        docs_to_score: set[int] = set()
        for term in set(query_terms):
            postings = self._postings.get(term)
            if not postings:
                continue
            matched = postings if candidates is None else (postings & candidates)
            if not matched:
                continue
            term_postings[term] = matched
            document_frequencies[term] = len(matched)
            docs_to_score |= matched

        if not docs_to_score:
            return []

        idf = {
            term: math.log(1 + (document_count - df + 0.5) / (df + 0.5))
            for term, df in document_frequencies.items()
        }

        ranked: list[tuple[int, float]] = []
        for doc_id in sorted(docs_to_score):
            score = self._score(doc_id, query_terms, term_postings, idf, boost_dict)
            if score > 0:
                ranked.append((doc_id, score))

        # list.sort is stable even with reverse=True, so ties stay in
        # ascending doc_id order.
        ranked.sort(key=lambda pair: pair[1], reverse=True)

        # A negative limit would slice from the end ("all but the last k");
        # treat it as "no results" instead.
        limit = num_results
        if limit is not None and limit < 0:
            limit = 0

        # Copy only the documents that are actually returned.
        results = []
        for doc_id, score in ranked[:limit]:
            record = dict(self.docs[doc_id])
            record["score"] = score
            results.append(record)
        return results

    def _candidate_ids(self, filter_dict: dict[str, str]) -> set[int] | None:
        """Intersect keyword indexes for each filter. ``None`` means "all docs".

        Returns an empty set as soon as the intersection becomes empty, which
        also covers filters on unknown fields or unseen values.
        """
        if not filter_dict:
            return None
        candidates: set[int] | None = None
        for field, value in filter_dict.items():
            matched = self._keyword_index.get(field, {}).get(str(value), set())
            # Copy on first use so the stored index set is never mutated or
            # handed out to callers.
            candidates = set(matched) if candidates is None else (candidates & matched)
            if not candidates:
                return set()
        return candidates

    def _score(
        self,
        doc_id: int,
        query_terms: list[str],
        term_postings: dict[str, set[int]],
        idf: dict[str, float],
        boost_dict: dict[str, float],
    ) -> float:
        """Sum ``boost * idf * tf / sqrt(field_length)`` over fields and terms.

        ``query_terms`` is iterated with duplicates, so a term repeated in the
        query contributes once per occurrence. Terms without an entry in
        ``term_postings`` for this document are skipped, which also guarantees
        that ``idf[term]`` exists for every term that is scored.
        """
        counts = self._field_counts[doc_id]
        lengths = self._field_lengths[doc_id]
        score = 0.0
        for field in self.text_fields:
            field_length = lengths.get(field, 0)
            if not field_length:
                # Empty field: nothing to match and sqrt(0) would divide by zero.
                continue
            field_counts = counts[field]
            boost = float(boost_dict.get(field, 1.0))
            norm = math.sqrt(field_length)
            for term in query_terms:
                if doc_id not in term_postings.get(term, ()):
                    continue
                term_frequency = field_counts.get(term, 0)
                if term_frequency == 0:
                    continue
                score += boost * idf[term] * (term_frequency / norm)
        return score
@@ -0,0 +1,86 @@
1
+ Metadata-Version: 2.4
2
+ Name: zerosearch
3
+ Version: 0.1.0
4
+ Summary: A tiny, zero-dependency BM25-lite in-memory text search index.
5
+ Project-URL: Homepage, https://github.com/alexeygrigorev/zerosearch
6
+ Project-URL: Repository, https://github.com/alexeygrigorev/zerosearch
7
+ Author-email: Alexey Grigorev <alexey.s.grigoriev@gmail.com>
8
+ License: WTFPL
9
+ Keywords: bm25,information-retrieval,minsearch,search,tf-idf,zero-dependency
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Topic :: Text Processing :: Indexing
13
+ Requires-Python: >=3.10
14
+ Description-Content-Type: text/markdown
15
+
16
+ # zerosearch
17
+
18
+ A tiny, **zero-dependency** BM25-lite in-memory text search index — standard
19
+ library only, a single small module, and good enough to power retrieval for a
20
+ RAG pipeline. Designed to run anywhere Python runs, including constrained
21
+ environments like Cloudflare Python Workers (Pyodide) where pulling in
22
+ `scikit-learn`/`numpy` is not an option.
23
+
24
+ It is a spiritual cousin of [`minsearch`](https://github.com/alexeygrigorev/minsearch),
25
+ with the same `Index(text_fields, keyword_fields).fit(docs).search(query)` shape,
26
+ but reimplemented from scratch with no third-party dependencies.
27
+
28
+ ## Install
29
+
30
+ ```bash
31
+ pip install zerosearch
32
+ ```
33
+
34
+ ## Usage
35
+
36
+ ```python
37
+ from zerosearch import Index
38
+
39
+ docs = [
40
+ {"id": "1", "title": "Docker compose basics", "text": "how to start services", "course": "de"},
41
+ {"id": "2", "title": "Kafka consumers", "text": "consumer groups explained", "course": "de"},
42
+ ]
43
+
44
+ index = Index(
45
+ text_fields=["title", "text"],
46
+ keyword_fields=["id", "course"],
47
+ ).fit(docs)
48
+
49
+ results = index.search(
50
+ "how do I start docker compose",
51
+ filter_dict={"course": "de"}, # exact-match keyword filter
52
+ boost_dict={"title": 3.0, "text": 1.0}, # per-field boosts
53
+ num_results=5,
54
+ )
55
+ for r in results:
56
+ print(r["score"], r["title"])
57
+ ```
58
+
59
+ Each result is a shallow copy of the original document dict with an added
60
+ `"score"` key.
61
+
62
+ ## How it works
63
+
64
+ * **Tokenizer** — lowercased word/number tokens; keeps `+ . # _ -` *inside* a
65
+ token so `c++`, `node.js`, `f-string` survive (a token must start with a
66
+ letter/digit). Drops 1-character tokens and a small English stop-word list
67
+ (both overridable).
68
+ * **Inverted index** — built once in `fit()`. A query only scores documents that
69
+ actually contain a query term, so search is fast even on large corpora.
70
+ * **Ranking** — BM25-lite: each query term contributes
71
+ `boost * idf * (term_frequency / sqrt(field_length))` per field. IDF and
72
+ document frequencies are computed over the filtered candidate set.
73
+
74
+ ## Customizing
75
+
76
+ ```python
77
+ Index(
78
+ text_fields=["title", "text"],
79
+ stop_words={"the", "a", "an"}, # replace the default stop words
80
+ tokenizer=lambda s: s.lower().split(), # or plug in your own tokenizer (stop_words is then ignored)
81
+ )
82
+ ```
83
+
84
+ ## License
85
+
86
+ WTFPL.
@@ -0,0 +1,6 @@
1
+ zerosearch/__init__.py,sha256=_4FimoYYVpwKajCwwCwnEF5hTM2__Em0jrhx_vZDlPg,281
2
+ zerosearch/__version__.py,sha256=kUR5RAFc7HCeiqdlX36dZOHkUI5wI6V_43RpEcD8b-0,22
3
+ zerosearch/index.py,sha256=EfxszEsyg3FbPGX2FXy17CzAAsgmcMgjqCNnF3LUcLY,7421
4
+ zerosearch-0.1.0.dist-info/METADATA,sha256=efhn5CNOTCjVZ_jYE6ncs8z_Nrq8aQqKGkSFdHVCqhM,2862
5
+ zerosearch-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
6
+ zerosearch-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.30.1
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any