biblicus 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +28 -0
- biblicus/__main__.py +8 -0
- biblicus/backends/__init__.py +44 -0
- biblicus/backends/base.py +65 -0
- biblicus/backends/scan.py +292 -0
- biblicus/backends/sqlite_full_text_search.py +427 -0
- biblicus/cli.py +468 -0
- biblicus/constants.py +10 -0
- biblicus/corpus.py +952 -0
- biblicus/evaluation.py +261 -0
- biblicus/frontmatter.py +92 -0
- biblicus/models.py +307 -0
- biblicus/retrieval.py +137 -0
- biblicus/sources.py +132 -0
- biblicus/time.py +18 -0
- biblicus/uris.py +64 -0
- biblicus-0.1.1.dist-info/METADATA +174 -0
- biblicus-0.1.1.dist-info/RECORD +22 -0
- biblicus-0.1.1.dist-info/WHEEL +5 -0
- biblicus-0.1.1.dist-info/entry_points.txt +2 -0
- biblicus-0.1.1.dist-info/licenses/LICENSE +21 -0
- biblicus-0.1.1.dist-info/top_level.txt +1 -0
biblicus/__init__.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Biblicus public package interface.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from .models import (
|
|
6
|
+
CorpusConfig,
|
|
7
|
+
Evidence,
|
|
8
|
+
IngestResult,
|
|
9
|
+
QueryBudget,
|
|
10
|
+
RecipeManifest,
|
|
11
|
+
RetrievalResult,
|
|
12
|
+
RetrievalRun,
|
|
13
|
+
)
|
|
14
|
+
from .corpus import Corpus
|
|
15
|
+
|
|
16
|
+
# Version string of the package; it is also listed in ``__all__`` below.
__version__ = "0.1.1"

# Names that make up the public package interface.
__all__ = [
    "__version__",
    "Corpus",
    "CorpusConfig",
    "Evidence",
    "IngestResult",
    "QueryBudget",
    "RecipeManifest",
    "RetrievalResult",
    "RetrievalRun",
]
|
biblicus/__main__.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Backend registry for Biblicus retrieval engines.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from typing import Dict, Type
|
|
8
|
+
|
|
9
|
+
from .base import RetrievalBackend
|
|
10
|
+
from .scan import ScanBackend
|
|
11
|
+
from .sqlite_full_text_search import SqliteFullTextSearchBackend
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def available_backends() -> Dict[str, Type[RetrievalBackend]]:
    """
    List every retrieval backend known to the registry.

    Each backend class is keyed by its own ``backend_id`` attribute.

    :return: Mapping of backend identifiers to backend classes.
    :rtype: dict[str, Type[RetrievalBackend]]
    """

    backend_classes = (ScanBackend, SqliteFullTextSearchBackend)
    return {backend_class.backend_id: backend_class for backend_class in backend_classes}
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def get_backend(backend_id: str) -> RetrievalBackend:
    """
    Look up a retrieval backend class by identifier and instantiate it.

    :param backend_id: Backend identifier.
    :type backend_id: str
    :return: Backend instance created with no constructor arguments.
    :rtype: RetrievalBackend
    :raises KeyError: If the backend identifier is unknown; the message lists the known identifiers.
    """

    registry = available_backends()
    if backend_id in registry:
        return registry[backend_id]()
    known = ", ".join(sorted(registry))
    raise KeyError(f"Unknown backend '{backend_id}'. Known backends: {known}")
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Backend interface for Biblicus retrieval engines.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from abc import ABC, abstractmethod
|
|
8
|
+
from typing import Dict
|
|
9
|
+
|
|
10
|
+
from ..corpus import Corpus
|
|
11
|
+
from ..models import QueryBudget, RetrievalResult, RetrievalRun
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class RetrievalBackend(ABC):
    """
    Contract that Biblicus retrieval backends implement.

    A backend exposes two operations: building (or registering) a run for a corpus,
    and answering a query against such a run.

    :ivar backend_id: Identifier string for the backend.
    :vartype backend_id: str
    """

    backend_id: str

    @abstractmethod
    def build_run(
        self,
        corpus: Corpus,
        *,
        recipe_name: str,
        config: Dict[str, object],
    ) -> RetrievalRun:
        """
        Build or register a retrieval run for the backend.

        :param corpus: Corpus to build against.
        :type corpus: Corpus
        :param recipe_name: Human name for the recipe.
        :type recipe_name: str
        :param config: Backend-specific configuration values.
        :type config: dict[str, object]
        :return: Run manifest describing the build.
        :rtype: RetrievalRun
        :raises NotImplementedError: Always, in this abstract definition.
        """

        raise NotImplementedError()

    @abstractmethod
    def query(
        self,
        corpus: Corpus,
        *,
        run: RetrievalRun,
        query_text: str,
        budget: QueryBudget,
    ) -> RetrievalResult:
        """
        Run a retrieval query against a backend.

        :param corpus: Corpus associated with the run.
        :type corpus: Corpus
        :param run: Run manifest to use for querying.
        :type run: RetrievalRun
        :param query_text: Query text to execute.
        :type query_text: str
        :param budget: Evidence selection budget.
        :type budget: QueryBudget
        :return: Retrieval results containing evidence.
        :rtype: RetrievalResult
        :raises NotImplementedError: Always, in this abstract definition.
        """

        raise NotImplementedError()
|
|
@@ -0,0 +1,292 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Naive full-scan retrieval backend.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from typing import Dict, Iterable, List, Optional, Tuple
|
|
8
|
+
|
|
9
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
10
|
+
|
|
11
|
+
from ..corpus import Corpus
|
|
12
|
+
from ..frontmatter import parse_front_matter
|
|
13
|
+
from ..models import Evidence, QueryBudget, RetrievalResult, RetrievalRun
|
|
14
|
+
from ..retrieval import apply_budget, create_recipe_manifest, create_run_manifest, hash_text
|
|
15
|
+
from ..time import utc_now_iso
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class ScanRecipeConfig(BaseModel):
    """
    Recipe settings accepted by the naive scan backend.

    The model is configured with ``extra="forbid"``.

    :ivar snippet_characters: Maximum characters to include in evidence snippets.
    :vartype snippet_characters: int
    """

    model_config = ConfigDict(extra="forbid")

    # Default is 400; the field is constrained to be at least 1.
    snippet_characters: int = Field(default=400, ge=1)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class ScanBackend:
    """
    Retrieval backend that scans catalog text at query time.

    ``build_run`` records counts only and passes an empty artifact list.

    :ivar backend_id: Backend identifier.
    :vartype backend_id: str
    """

    backend_id = "scan"

    def build_run(self, corpus: Corpus, *, recipe_name: str, config: Dict[str, object]) -> RetrievalRun:
        """
        Register a run for the scan backend without producing any artifacts.

        The configuration is validated through :class:`ScanRecipeConfig`, catalog item
        counts are recorded as run statistics, and the run is written to the corpus.

        :param corpus: Corpus to build against.
        :type corpus: Corpus
        :param recipe_name: Human-readable recipe name.
        :type recipe_name: str
        :param config: Backend-specific configuration values.
        :type config: dict[str, object]
        :return: Run manifest describing the build.
        :rtype: RetrievalRun
        """

        validated_config = ScanRecipeConfig.model_validate(config)
        catalog = corpus.load_catalog()
        recipe = create_recipe_manifest(
            backend_id=self.backend_id,
            name=recipe_name,
            config=validated_config.model_dump(),
        )
        total_items = len(catalog.items)
        total_text_items = _count_text_items(catalog.items.values())
        run = create_run_manifest(
            corpus,
            recipe=recipe,
            stats={"items": total_items, "text_items": total_text_items},
            artifact_paths=[],
        )
        corpus.write_run(run)
        return run

    def query(
        self,
        corpus: Corpus,
        *,
        run: RetrievalRun,
        query_text: str,
        budget: QueryBudget,
    ) -> RetrievalResult:
        """
        Answer a query by scanning every catalog item.

        Candidates are ordered by descending score with ties broken by item
        identifier, ranked starting at 1, and then passed through ``apply_budget``.

        :param corpus: Corpus associated with the run.
        :type corpus: Corpus
        :param run: Run manifest to use for querying.
        :type run: RetrievalRun
        :param query_text: Query text to execute.
        :type query_text: str
        :param budget: Evidence selection budget.
        :type budget: QueryBudget
        :return: Retrieval results containing evidence.
        :rtype: RetrievalResult
        """

        settings = ScanRecipeConfig.model_validate(run.recipe.config)
        catalog = corpus.load_catalog()
        candidates = _score_items(
            corpus,
            catalog.items.values(),
            _tokenize_query(query_text),
            settings.snippet_characters,
        )
        # Highest score first; the item identifier keeps the ordering deterministic.
        candidates.sort(key=lambda candidate: (-candidate.score, candidate.item_id))

        ranked: List[Evidence] = []
        for position, candidate in enumerate(candidates, start=1):
            # Replace the placeholder rank and identifiers set during scoring.
            ranked.append(
                candidate.model_copy(
                    update={
                        "rank": position,
                        "recipe_id": run.recipe.recipe_id,
                        "run_id": run.run_id,
                    }
                )
            )

        evidence = apply_budget(ranked, budget)
        return RetrievalResult(
            query_text=query_text,
            budget=budget,
            run_id=run.run_id,
            recipe_id=run.recipe.recipe_id,
            backend_id=self.backend_id,
            generated_at=utc_now_iso(),
            evidence=evidence,
            stats={"candidates": len(candidates), "returned": len(evidence)},
        )
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def _count_text_items(items: Iterable[object]) -> int:
|
|
128
|
+
"""
|
|
129
|
+
Count catalog items that represent text content.
|
|
130
|
+
|
|
131
|
+
:param items: Catalog items to inspect.
|
|
132
|
+
:type items: Iterable[object]
|
|
133
|
+
:return: Number of text items.
|
|
134
|
+
:rtype: int
|
|
135
|
+
"""
|
|
136
|
+
|
|
137
|
+
text_item_count = 0
|
|
138
|
+
for catalog_item in items:
|
|
139
|
+
media_type = getattr(catalog_item, "media_type", "")
|
|
140
|
+
if media_type == "text/markdown" or str(media_type).startswith("text/"):
|
|
141
|
+
text_item_count += 1
|
|
142
|
+
return text_item_count
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def _tokenize_query(query_text: str) -> List[str]:
|
|
146
|
+
"""
|
|
147
|
+
Tokenize a query string for naive text matching.
|
|
148
|
+
|
|
149
|
+
:param query_text: Raw query text.
|
|
150
|
+
:type query_text: str
|
|
151
|
+
:return: Lowercased non-empty tokens.
|
|
152
|
+
:rtype: list[str]
|
|
153
|
+
"""
|
|
154
|
+
|
|
155
|
+
return [token for token in query_text.lower().split() if token]
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def _load_text_from_item(corpus: Corpus, relpath: str, media_type: str) -> Optional[str]:
|
|
159
|
+
"""
|
|
160
|
+
Load a text payload from a catalog item.
|
|
161
|
+
|
|
162
|
+
:param corpus: Corpus containing the item.
|
|
163
|
+
:type corpus: Corpus
|
|
164
|
+
:param relpath: Relative path to the stored content.
|
|
165
|
+
:type relpath: str
|
|
166
|
+
:param media_type: Media type for the stored content.
|
|
167
|
+
:type media_type: str
|
|
168
|
+
:return: Text payload or None if not decodable as text.
|
|
169
|
+
:rtype: str or None
|
|
170
|
+
"""
|
|
171
|
+
|
|
172
|
+
content_path = corpus.root / relpath
|
|
173
|
+
raw_bytes = content_path.read_bytes()
|
|
174
|
+
if media_type == "text/markdown":
|
|
175
|
+
markdown_text = raw_bytes.decode("utf-8")
|
|
176
|
+
parsed_document = parse_front_matter(markdown_text)
|
|
177
|
+
return parsed_document.body
|
|
178
|
+
if media_type.startswith("text/"):
|
|
179
|
+
return raw_bytes.decode("utf-8")
|
|
180
|
+
return None
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def _find_first_match(text: str, tokens: List[str]) -> Optional[Tuple[int, int]]:
|
|
184
|
+
"""
|
|
185
|
+
Locate the earliest token match span in a text payload.
|
|
186
|
+
|
|
187
|
+
:param text: Text to scan.
|
|
188
|
+
:type text: str
|
|
189
|
+
:param tokens: Query tokens.
|
|
190
|
+
:type tokens: list[str]
|
|
191
|
+
:return: Start/end span for the earliest match, or None if no matches.
|
|
192
|
+
:rtype: tuple[int, int] or None
|
|
193
|
+
"""
|
|
194
|
+
|
|
195
|
+
lower_text = text.lower()
|
|
196
|
+
best_start: Optional[int] = None
|
|
197
|
+
best_end: Optional[int] = None
|
|
198
|
+
for token in tokens:
|
|
199
|
+
if not token:
|
|
200
|
+
continue
|
|
201
|
+
token_start = lower_text.find(token)
|
|
202
|
+
if token_start == -1:
|
|
203
|
+
continue
|
|
204
|
+
token_end = token_start + len(token)
|
|
205
|
+
if best_start is None or token_start < best_start:
|
|
206
|
+
best_start = token_start
|
|
207
|
+
best_end = token_end
|
|
208
|
+
if best_start is None or best_end is None:
|
|
209
|
+
return None
|
|
210
|
+
return best_start, best_end
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def _build_snippet(text: str, span: Optional[Tuple[int, int]], *, max_chars: int) -> str:
|
|
214
|
+
"""
|
|
215
|
+
Build a snippet around a match span, constrained by a character budget.
|
|
216
|
+
|
|
217
|
+
:param text: Source text to slice.
|
|
218
|
+
:type text: str
|
|
219
|
+
:param span: Match span to center on.
|
|
220
|
+
:type span: tuple[int, int] or None
|
|
221
|
+
:param max_chars: Maximum snippet length.
|
|
222
|
+
:type max_chars: int
|
|
223
|
+
:return: Snippet text.
|
|
224
|
+
:rtype: str
|
|
225
|
+
"""
|
|
226
|
+
|
|
227
|
+
if not text:
|
|
228
|
+
return ""
|
|
229
|
+
if span is None:
|
|
230
|
+
return text[:max_chars]
|
|
231
|
+
span_start, span_end = span
|
|
232
|
+
half_window = max_chars // 2
|
|
233
|
+
snippet_start = max(span_start - half_window, 0)
|
|
234
|
+
snippet_end = min(span_end + half_window, len(text))
|
|
235
|
+
return text[snippet_start:snippet_end]
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
def _score_items(
    corpus: Corpus,
    items: Iterable[object],
    tokens: List[str],
    snippet_characters: int,
) -> List[Evidence]:
    """
    Turn matching catalog items into evidence candidates scored by token frequency.

    An item's score is the total number of occurrences of every token in its
    lowercased text. Items with no loadable text or a score of zero are skipped.
    Candidates carry placeholder values (``rank=1``, empty ``recipe_id`` and
    ``run_id``).

    :param corpus: Corpus containing the items.
    :type corpus: Corpus
    :param items: Catalog items to score.
    :type items: Iterable[object]
    :param tokens: Query tokens to count.
    :type tokens: list[str]
    :param snippet_characters: Snippet length budget.
    :type snippet_characters: int
    :return: Evidence candidates with provisional ranks.
    :rtype: list[Evidence]
    """

    candidates: List[Evidence] = []
    for entry in items:
        entry_media_type = getattr(entry, "media_type", "")
        entry_relpath = getattr(entry, "relpath", "")
        body = _load_text_from_item(corpus, entry_relpath, entry_media_type)
        if body is None:
            continue

        searchable = body.lower()
        hit_count = 0
        for token in tokens:
            hit_count += searchable.count(token)
        if hit_count <= 0:
            continue

        match_span = _find_first_match(body, tokens)
        excerpt = _build_snippet(body, match_span, max_chars=snippet_characters)
        if match_span:
            match_start, match_end = match_span[0], match_span[1]
        else:
            match_start = match_end = None

        candidate = Evidence(
            item_id=str(getattr(entry, "id")),
            source_uri=getattr(entry, "source_uri", None),
            media_type=str(entry_media_type),
            score=float(hit_count),
            rank=1,
            text=excerpt,
            content_ref=None,
            span_start=match_start,
            span_end=match_end,
            stage="scan",
            recipe_id="",
            run_id="",
            hash=hash_text(excerpt),
        )
        candidates.append(candidate)

    return candidates
|