biblicus 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
biblicus/__init__.py ADDED
@@ -0,0 +1,28 @@
1
+ """
2
+ Biblicus public package interface.
3
+ """
4
+
5
+ from .models import (
6
+ CorpusConfig,
7
+ Evidence,
8
+ IngestResult,
9
+ QueryBudget,
10
+ RecipeManifest,
11
+ RetrievalResult,
12
+ RetrievalRun,
13
+ )
14
+ from .corpus import Corpus
15
+
16
+ __all__ = [
17
+ "__version__",
18
+ "Corpus",
19
+ "CorpusConfig",
20
+ "Evidence",
21
+ "IngestResult",
22
+ "QueryBudget",
23
+ "RecipeManifest",
24
+ "RetrievalResult",
25
+ "RetrievalRun",
26
+ ]
27
+
28
+ __version__ = "0.1.1"
biblicus/__main__.py ADDED
@@ -0,0 +1,8 @@
1
+ """
2
+ Biblicus module entry point.
3
+ """
4
+
5
+ from .cli import main
6
+
7
+ if __name__ == "__main__":
8
+ raise SystemExit(main())
@@ -0,0 +1,44 @@
1
+ """
2
+ Backend registry for Biblicus retrieval engines.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from typing import Dict, Type
8
+
9
+ from .base import RetrievalBackend
10
+ from .scan import ScanBackend
11
+ from .sqlite_full_text_search import SqliteFullTextSearchBackend
12
+
13
+
14
def available_backends() -> Dict[str, Type[RetrievalBackend]]:
    """
    List every retrieval backend class known to the registry.

    The mapping is rebuilt on each call and is keyed by the ``backend_id``
    class attribute of each backend.

    :return: Mapping of backend identifiers to backend classes.
    :rtype: dict[str, Type[RetrievalBackend]]
    """

    backend_classes = (ScanBackend, SqliteFullTextSearchBackend)
    return {backend_class.backend_id: backend_class for backend_class in backend_classes}
26
+
27
+
28
def get_backend(backend_id: str) -> RetrievalBackend:
    """
    Look up a backend class by identifier and return a new instance of it.

    :param backend_id: Backend identifier.
    :type backend_id: str
    :return: Backend instance.
    :rtype: RetrievalBackend
    :raises KeyError: If the backend identifier is unknown.
    """

    backends = available_backends()
    if backend_id not in backends:
        # List the valid identifiers so the caller can correct the request.
        known = ", ".join(sorted(backends))
        raise KeyError(f"Unknown backend '{backend_id}'. Known backends: {known}")
    return backends[backend_id]()
@@ -0,0 +1,65 @@
1
+ """
2
+ Backend interface for Biblicus retrieval engines.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from abc import ABC, abstractmethod
8
+ from typing import Dict
9
+
10
+ from ..corpus import Corpus
11
+ from ..models import QueryBudget, RetrievalResult, RetrievalRun
12
+
13
+
14
class RetrievalBackend(ABC):
    """
    Contract that every retrieval backend must fulfil.

    Concrete backends provide a ``backend_id`` and implement both
    :meth:`build_run` and :meth:`query`.

    :ivar backend_id: Identifier string for the backend.
    :vartype backend_id: str
    """

    backend_id: str

    @abstractmethod
    def build_run(
        self,
        corpus: Corpus,
        *,
        recipe_name: str,
        config: Dict[str, object],
    ) -> RetrievalRun:
        """
        Build or register a retrieval run for this backend.

        :param corpus: Corpus to build against.
        :type corpus: Corpus
        :param recipe_name: Human name for the recipe.
        :type recipe_name: str
        :param config: Backend-specific configuration values.
        :type config: dict[str, object]
        :return: Run manifest describing the build.
        :rtype: RetrievalRun
        """

        raise NotImplementedError

    @abstractmethod
    def query(
        self,
        corpus: Corpus,
        *,
        run: RetrievalRun,
        query_text: str,
        budget: QueryBudget,
    ) -> RetrievalResult:
        """
        Execute a retrieval query using a previously built run.

        :param corpus: Corpus associated with the run.
        :type corpus: Corpus
        :param run: Run manifest to use for querying.
        :type run: RetrievalRun
        :param query_text: Query text to execute.
        :type query_text: str
        :param budget: Evidence selection budget.
        :type budget: QueryBudget
        :return: Retrieval results containing evidence.
        :rtype: RetrievalResult
        """

        raise NotImplementedError
@@ -0,0 +1,292 @@
1
+ """
2
+ Naive full-scan retrieval backend.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from typing import Dict, Iterable, List, Optional, Tuple
8
+
9
+ from pydantic import BaseModel, ConfigDict, Field
10
+
11
+ from ..corpus import Corpus
12
+ from ..frontmatter import parse_front_matter
13
+ from ..models import Evidence, QueryBudget, RetrievalResult, RetrievalRun
14
+ from ..retrieval import apply_budget, create_recipe_manifest, create_run_manifest, hash_text
15
+ from ..time import utc_now_iso
16
+
17
+
18
class ScanRecipeConfig(BaseModel):
    """
    Recipe settings accepted by the naive scan backend.

    Unknown keys are rejected (``extra="forbid"``) and the snippet budget
    must be at least one character.

    :ivar snippet_characters: Maximum characters to include in evidence snippets.
    :vartype snippet_characters: int
    """

    model_config = ConfigDict(extra="forbid")

    snippet_characters: int = Field(default=400, ge=1)
29
+
30
+
31
class ScanBackend:
    """
    Retrieval backend that reads and scores every text item at query time.

    Nothing is materialized at build time; a run only records the recipe
    and a few catalog statistics.

    :ivar backend_id: Backend identifier.
    :vartype backend_id: str
    """

    backend_id = "scan"

    def build_run(
        self,
        corpus: Corpus,
        *,
        recipe_name: str,
        config: Dict[str, object],
    ) -> RetrievalRun:
        """
        Validate the recipe and record a run manifest for the scan backend.

        :param corpus: Corpus to build against.
        :type corpus: Corpus
        :param recipe_name: Human-readable recipe name.
        :type recipe_name: str
        :param config: Backend-specific configuration values.
        :type config: dict[str, object]
        :return: Run manifest describing the build.
        :rtype: RetrievalRun
        """

        settings = ScanRecipeConfig.model_validate(config)
        catalog = corpus.load_catalog()
        recipe = create_recipe_manifest(
            backend_id=self.backend_id,
            name=recipe_name,
            config=settings.model_dump(),
        )
        item_count = len(catalog.items)
        text_item_count = _count_text_items(catalog.items.values())
        run = create_run_manifest(
            corpus,
            recipe=recipe,
            stats={"items": item_count, "text_items": text_item_count},
            artifact_paths=[],
        )
        corpus.write_run(run)
        return run

    def query(
        self,
        corpus: Corpus,
        *,
        run: RetrievalRun,
        query_text: str,
        budget: QueryBudget,
    ) -> RetrievalResult:
        """
        Answer a query by scanning and scoring every catalog item.

        :param corpus: Corpus associated with the run.
        :type corpus: Corpus
        :param run: Run manifest to use for querying.
        :type run: RetrievalRun
        :param query_text: Query text to execute.
        :type query_text: str
        :param budget: Evidence selection budget.
        :type budget: QueryBudget
        :return: Retrieval results containing evidence.
        :rtype: RetrievalResult
        """

        settings = ScanRecipeConfig.model_validate(run.recipe.config)
        catalog = corpus.load_catalog()
        tokens = _tokenize_query(query_text)
        candidates = _score_items(
            corpus,
            catalog.items.values(),
            tokens,
            settings.snippet_characters,
        )

        def _ordering(candidate: Evidence) -> Tuple[float, str]:
            # Highest score first; item id breaks ties deterministically.
            return (-candidate.score, candidate.item_id)

        ordered = sorted(candidates, key=_ordering)

        recipe_id = run.recipe.recipe_id
        run_id = run.run_id
        ranked: List[Evidence] = []
        for position, candidate in enumerate(ordered, start=1):
            # Replace the provisional rank and blank identifiers set while scoring.
            ranked.append(
                candidate.model_copy(
                    update={
                        "rank": position,
                        "recipe_id": recipe_id,
                        "run_id": run_id,
                    }
                )
            )

        evidence = apply_budget(ranked, budget)
        return RetrievalResult(
            query_text=query_text,
            budget=budget,
            run_id=run_id,
            recipe_id=recipe_id,
            backend_id=self.backend_id,
            generated_at=utc_now_iso(),
            evidence=evidence,
            stats={"candidates": len(ordered), "returned": len(evidence)},
        )
125
+
126
+
127
+ def _count_text_items(items: Iterable[object]) -> int:
128
+ """
129
+ Count catalog items that represent text content.
130
+
131
+ :param items: Catalog items to inspect.
132
+ :type items: Iterable[object]
133
+ :return: Number of text items.
134
+ :rtype: int
135
+ """
136
+
137
+ text_item_count = 0
138
+ for catalog_item in items:
139
+ media_type = getattr(catalog_item, "media_type", "")
140
+ if media_type == "text/markdown" or str(media_type).startswith("text/"):
141
+ text_item_count += 1
142
+ return text_item_count
143
+
144
+
145
+ def _tokenize_query(query_text: str) -> List[str]:
146
+ """
147
+ Tokenize a query string for naive text matching.
148
+
149
+ :param query_text: Raw query text.
150
+ :type query_text: str
151
+ :return: Lowercased non-empty tokens.
152
+ :rtype: list[str]
153
+ """
154
+
155
+ return [token for token in query_text.lower().split() if token]
156
+
157
+
158
+ def _load_text_from_item(corpus: Corpus, relpath: str, media_type: str) -> Optional[str]:
159
+ """
160
+ Load a text payload from a catalog item.
161
+
162
+ :param corpus: Corpus containing the item.
163
+ :type corpus: Corpus
164
+ :param relpath: Relative path to the stored content.
165
+ :type relpath: str
166
+ :param media_type: Media type for the stored content.
167
+ :type media_type: str
168
+ :return: Text payload or None if not decodable as text.
169
+ :rtype: str or None
170
+ """
171
+
172
+ content_path = corpus.root / relpath
173
+ raw_bytes = content_path.read_bytes()
174
+ if media_type == "text/markdown":
175
+ markdown_text = raw_bytes.decode("utf-8")
176
+ parsed_document = parse_front_matter(markdown_text)
177
+ return parsed_document.body
178
+ if media_type.startswith("text/"):
179
+ return raw_bytes.decode("utf-8")
180
+ return None
181
+
182
+
183
+ def _find_first_match(text: str, tokens: List[str]) -> Optional[Tuple[int, int]]:
184
+ """
185
+ Locate the earliest token match span in a text payload.
186
+
187
+ :param text: Text to scan.
188
+ :type text: str
189
+ :param tokens: Query tokens.
190
+ :type tokens: list[str]
191
+ :return: Start/end span for the earliest match, or None if no matches.
192
+ :rtype: tuple[int, int] or None
193
+ """
194
+
195
+ lower_text = text.lower()
196
+ best_start: Optional[int] = None
197
+ best_end: Optional[int] = None
198
+ for token in tokens:
199
+ if not token:
200
+ continue
201
+ token_start = lower_text.find(token)
202
+ if token_start == -1:
203
+ continue
204
+ token_end = token_start + len(token)
205
+ if best_start is None or token_start < best_start:
206
+ best_start = token_start
207
+ best_end = token_end
208
+ if best_start is None or best_end is None:
209
+ return None
210
+ return best_start, best_end
211
+
212
+
213
+ def _build_snippet(text: str, span: Optional[Tuple[int, int]], *, max_chars: int) -> str:
214
+ """
215
+ Build a snippet around a match span, constrained by a character budget.
216
+
217
+ :param text: Source text to slice.
218
+ :type text: str
219
+ :param span: Match span to center on.
220
+ :type span: tuple[int, int] or None
221
+ :param max_chars: Maximum snippet length.
222
+ :type max_chars: int
223
+ :return: Snippet text.
224
+ :rtype: str
225
+ """
226
+
227
+ if not text:
228
+ return ""
229
+ if span is None:
230
+ return text[:max_chars]
231
+ span_start, span_end = span
232
+ half_window = max_chars // 2
233
+ snippet_start = max(span_start - half_window, 0)
234
+ snippet_end = min(span_end + half_window, len(text))
235
+ return text[snippet_start:snippet_end]
236
+
237
+
238
def _score_items(
    corpus: Corpus,
    items: Iterable[object],
    tokens: List[str],
    snippet_characters: int,
) -> List[Evidence]:
    """
    Turn matching catalog items into scored evidence candidates.

    An item's score is the total number of occurrences of all query tokens
    in its lowercased text. Items that yield no text or score zero are
    skipped. Rank, recipe id and run id are placeholders here.

    :param corpus: Corpus containing the items.
    :type corpus: Corpus
    :param items: Catalog items to score.
    :type items: Iterable[object]
    :param tokens: Query tokens to count.
    :type tokens: list[str]
    :param snippet_characters: Snippet length budget.
    :type snippet_characters: int
    :return: Evidence candidates with provisional ranks.
    :rtype: list[Evidence]
    """

    candidates: List[Evidence] = []
    for entry in items:
        kind = getattr(entry, "media_type", "")
        location = getattr(entry, "relpath", "")
        body = _load_text_from_item(corpus, location, kind)
        if body is None:
            continue

        folded = body.lower()
        hits = sum(folded.count(token) for token in tokens)
        if hits <= 0:
            continue

        span = _find_first_match(body, tokens)
        snippet = _build_snippet(body, span, max_chars=snippet_characters)
        match_start, match_end = span if span else (None, None)

        candidate = Evidence(
            item_id=str(getattr(entry, "id")),
            source_uri=getattr(entry, "source_uri", None),
            media_type=str(kind),
            score=float(hits),
            rank=1,
            text=snippet,
            content_ref=None,
            span_start=match_start,
            span_end=match_end,
            stage="scan",
            recipe_id="",
            run_id="",
            hash=hash_text(snippet),
        )
        candidates.append(candidate)

    return candidates