biblicus 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,427 @@
1
+ """
2
+ SQLite full-text search version five retrieval backend for Biblicus.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import sqlite3
8
+ from pathlib import Path
9
+ from typing import Dict, Iterable, List, Optional, Tuple
10
+
11
+ from pydantic import BaseModel, ConfigDict, Field
12
+
13
+ from ..constants import CORPUS_DIR_NAME, RUNS_DIR_NAME
14
+ from ..corpus import Corpus
15
+ from ..frontmatter import parse_front_matter
16
+ from ..models import Evidence, QueryBudget, RetrievalResult, RetrievalRun
17
+ from ..retrieval import apply_budget, create_recipe_manifest, create_run_manifest, hash_text
18
+ from ..time import utc_now_iso
19
+
20
+
21
class SqliteFullTextSearchRecipeConfig(BaseModel):
    """
    Recipe settings accepted by the SQLite full-text search backend.

    Unknown keys are rejected because the model forbids extra fields.
    ``chunk_overlap`` has to be smaller than ``chunk_size``; that relationship
    is not checked by this model and is enforced when text is chunked during
    an index build.

    :ivar chunk_size: Maximum characters per chunk (at least 1).
    :vartype chunk_size: int
    :ivar chunk_overlap: Characters shared by consecutive chunks (at least 0).
    :vartype chunk_overlap: int
    :ivar snippet_characters: Maximum characters to include in evidence snippets (at least 1).
    :vartype snippet_characters: int
    """

    model_config = ConfigDict(extra="forbid")

    # Field declaration order is preserved: it determines the key order of
    # ``model_dump()``.
    chunk_size: int = Field(ge=1, default=800)
    chunk_overlap: int = Field(ge=0, default=200)
    snippet_characters: int = Field(ge=1, default=400)
38
+
39
+
40
class SqliteFullTextSearchBackend:
    """
    Local retrieval backend built on SQLite full-text search version five.

    :ivar backend_id: Backend identifier.
    :vartype backend_id: str
    """

    backend_id = "sqlite-full-text-search"

    def build_run(self, corpus: Corpus, *, recipe_name: str, config: Dict[str, object]) -> RetrievalRun:
        """
        Index the corpus into a per-run SQLite database and record the run.

        The database is written under the corpus runs directory as
        ``<run_id>.sqlite`` and registered as the run's only artifact.

        :param corpus: Corpus to build against.
        :type corpus: Corpus
        :param recipe_name: Human-readable recipe name.
        :type recipe_name: str
        :param config: Backend-specific configuration values.
        :type config: dict[str, object]
        :return: Run manifest describing the build.
        :rtype: RetrievalRun
        """

        validated_config = SqliteFullTextSearchRecipeConfig.model_validate(config)
        catalog = corpus.load_catalog()
        recipe_manifest = create_recipe_manifest(
            backend_id=self.backend_id,
            name=recipe_name,
            config=validated_config.model_dump(),
        )
        # The manifest is created first because its run id names the index file.
        pending_run = create_run_manifest(
            corpus,
            recipe=recipe_manifest,
            stats={},
            artifact_paths=[],
        )
        index_relpath = str(
            Path(CORPUS_DIR_NAME) / RUNS_DIR_NAME / f"{pending_run.run_id}.sqlite"
        )
        index_path = corpus.root / index_relpath
        corpus.runs_dir.mkdir(parents=True, exist_ok=True)
        index_stats = _build_full_text_search_index(
            db_path=index_path,
            corpus=corpus,
            items=catalog.items.values(),
            recipe_config=validated_config,
        )
        completed_run = pending_run.model_copy(
            update={"artifact_paths": [index_relpath], "stats": index_stats}
        )
        corpus.write_run(completed_run)
        return completed_run

    def query(
        self,
        corpus: Corpus,
        *,
        run: RetrievalRun,
        query_text: str,
        budget: QueryBudget,
    ) -> RetrievalResult:
        """
        Search a previously built index and return budgeted evidence.

        Candidates are over-fetched, ordered by descending score (ties broken
        by item identifier), ranked from one, stamped with the run and recipe
        identifiers, and finally trimmed by the budget.

        :param corpus: Corpus associated with the run.
        :type corpus: Corpus
        :param run: Run manifest to use for querying.
        :type run: RetrievalRun
        :param query_text: Query text to execute.
        :type query_text: str
        :param budget: Evidence selection budget.
        :type budget: QueryBudget
        :return: Retrieval results containing evidence.
        :rtype: RetrievalResult
        """

        settings = SqliteFullTextSearchRecipeConfig.model_validate(run.recipe.config)
        index_path = _resolve_run_db_path(corpus, run)
        candidates = _query_full_text_search_index(
            db_path=index_path,
            query_text=query_text,
            limit=_candidate_limit(budget.max_total_items),
            snippet_characters=settings.snippet_characters,
        )

        def ordering_key(candidate):
            # Highest score first; the item id makes the order deterministic.
            return (-candidate.score, candidate.item_id)

        ordered = sorted(candidates, key=ordering_key)
        ranked = []
        for position, candidate in enumerate(ordered, start=1):
            stamped = candidate.model_copy(
                update={
                    "rank": position,
                    "recipe_id": run.recipe.recipe_id,
                    "run_id": run.run_id,
                }
            )
            ranked.append(stamped)
        evidence = apply_budget(ranked, budget)
        return RetrievalResult(
            query_text=query_text,
            budget=budget,
            run_id=run.run_id,
            recipe_id=run.recipe.recipe_id,
            backend_id=self.backend_id,
            generated_at=utc_now_iso(),
            evidence=evidence,
            stats={"candidates": len(ordered), "returned": len(evidence)},
        )
142
+
143
+
144
def _candidate_limit(max_total_items: int, *, multiplier: int = 5) -> int:
    """
    Expand a candidate limit beyond the requested evidence count.

    Over-fetching gives later ranking and budgeting steps more rows to choose
    from than the number of items finally returned.

    :param max_total_items: Requested evidence count.
    :type max_total_items: int
    :param multiplier: Over-fetch factor applied to the requested count.
    :type multiplier: int
    :return: Candidate limit for backend search.
    :rtype: int
    :raises ValueError: If the multiplier is smaller than one.
    """

    if multiplier < 1:
        raise ValueError("multiplier must be at least 1")
    return max_total_items * multiplier
155
+
156
+
157
def _resolve_run_db_path(corpus: Corpus, run: RetrievalRun) -> Path:
    """
    Resolve the SQLite index path for a retrieval run.

    The first artifact path of the run is interpreted relative to the corpus
    root.

    :param corpus: Corpus containing run artifacts.
    :type corpus: Corpus
    :param run: Retrieval run manifest.
    :type run: RetrievalRun
    :return: Path to the SQLite index file.
    :rtype: Path
    :raises FileNotFoundError: If the run does not have artifact paths, or the
        index file they point to does not exist.
    """

    if not run.artifact_paths:
        raise FileNotFoundError("Run has no artifact paths to query")
    db_path = corpus.root / run.artifact_paths[0]
    # sqlite3.connect() silently creates an empty database for a missing path,
    # which would surface later as a confusing "no such table" error.
    if not db_path.is_file():
        raise FileNotFoundError(f"Run index file does not exist: {db_path}")
    return db_path
173
+
174
+
175
def _ensure_full_text_search_version_five(conn: sqlite3.Connection) -> None:
    """
    Verify SQLite full-text search version five support in the current runtime.

    Support is probed by creating and immediately dropping a throwaway virtual
    table on the given connection.

    :param conn: SQLite connection to test.
    :type conn: sqlite3.Connection
    :return: None.
    :rtype: None
    :raises RuntimeError: If full-text search version five support is unavailable.
    """

    try:
        # The probe uses its own table name so that dropping it can never
        # remove a real ``chunks_full_text_search`` index on this connection.
        conn.execute(
            "CREATE VIRTUAL TABLE IF NOT EXISTS biblicus_fts5_probe USING fts5(content)"
        ).close()
        conn.execute("DROP TABLE IF EXISTS biblicus_fts5_probe").close()
    except sqlite3.OperationalError as operational_error:
        raise RuntimeError(
            "SQLite full-text search version five is required but not available in this Python build"
        ) from operational_error
196
+
197
+
198
def _create_full_text_search_schema(conn: sqlite3.Connection) -> None:
    """
    Create the chunk index table in a fresh SQLite database.

    Only ``content`` is searchable; every other column is declared
    ``UNINDEXED`` and stored purely as metadata alongside each chunk.

    :param conn: SQLite connection for schema creation.
    :type conn: sqlite3.Connection
    :return: None.
    :rtype: None
    """

    create_statement = """
        CREATE VIRTUAL TABLE chunks_full_text_search USING fts5(
            content,
            item_id UNINDEXED,
            source_uri UNINDEXED,
            media_type UNINDEXED,
            relpath UNINDEXED,
            title UNINDEXED,
            start_offset UNINDEXED,
            end_offset UNINDEXED
        )
        """
    conn.execute(create_statement)
222
+
223
+
224
def _build_full_text_search_index(
    *,
    db_path: Path,
    corpus: Corpus,
    items: Iterable[object],
    recipe_config: SqliteFullTextSearchRecipeConfig,
) -> Dict[str, int]:
    """
    Build a full-text search index from corpus items.

    Any existing database at ``db_path`` is replaced. Items whose content is
    not text are counted but not indexed. If the build fails, the partially
    written database file is removed before the error propagates.

    :param db_path: Destination SQLite database path.
    :type db_path: Path
    :param corpus: Corpus containing the items.
    :type corpus: Corpus
    :param items: Catalog items to index.
    :type items: Iterable[object]
    :param recipe_config: Chunking and snippet configuration.
    :type recipe_config: SqliteFullTextSearchRecipeConfig
    :return: Index statistics with the keys ``items``, ``text_items``,
        ``chunks`` and ``bytes``.
    :rtype: dict[str, int]
    """

    insert_statement = """
        INSERT INTO chunks_full_text_search (
            content,
            item_id,
            source_uri,
            media_type,
            relpath,
            title,
            start_offset,
            end_offset
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?)
        """

    # Always start from an empty database; missing_ok avoids the
    # exists()/unlink() race of checking first.
    db_path.unlink(missing_ok=True)
    connection = sqlite3.connect(str(db_path))
    build_succeeded = False
    try:
        _ensure_full_text_search_version_five(connection)
        _create_full_text_search_schema(connection)
        chunk_count = 0
        item_count = 0
        text_item_count = 0
        for catalog_item in items:
            item_count += 1
            media_type = getattr(catalog_item, "media_type", "")
            relpath = getattr(catalog_item, "relpath", "")
            item_text = _load_text_from_item(corpus, relpath, media_type)
            if item_text is None:
                continue
            text_item_count += 1
            title = getattr(catalog_item, "title", None)
            # These values are identical for every chunk of the item, so they
            # are computed once rather than once per chunk.
            item_id = str(getattr(catalog_item, "id"))
            source_uri = getattr(catalog_item, "source_uri", None)
            media_type_text = str(media_type)
            relpath_text = str(relpath)
            title_text = str(title) if title is not None else None
            chunk_rows = [
                (
                    chunk,
                    item_id,
                    source_uri,
                    media_type_text,
                    relpath_text,
                    title_text,
                    start_offset,
                    end_offset,
                )
                for start_offset, end_offset, chunk in _iter_chunks(
                    item_text,
                    chunk_size=recipe_config.chunk_size,
                    chunk_overlap=recipe_config.chunk_overlap,
                )
            ]
            # One batched statement per item instead of one execute per chunk.
            connection.executemany(insert_statement, chunk_rows)
            chunk_count += len(chunk_rows)
        connection.commit()
        build_succeeded = True
        return {
            "items": item_count,
            "text_items": text_item_count,
            "chunks": chunk_count,
            "bytes": db_path.stat().st_size if db_path.exists() else 0,
        }
    finally:
        connection.close()
        if not build_succeeded:
            # Do not leave a half-built index behind after a failure.
            db_path.unlink(missing_ok=True)
303
+
304
+
305
def _load_text_from_item(corpus: Corpus, relpath: str, media_type: str) -> Optional[str]:
    """
    Load text content from a catalog item.

    Content is read only for ``text/*`` media types and decoded as UTF-8.
    For ``text/markdown`` the front matter is parsed off and only the body is
    returned.

    :param corpus: Corpus containing the content.
    :type corpus: Corpus
    :param relpath: Relative path to the content.
    :type relpath: str
    :param media_type: Media type for the content.
    :type media_type: str
    :return: Text payload or None if not text.
    :rtype: str or None
    """

    # Decide from the media type before touching the file system, so that
    # non-text payloads are never read into memory just to be discarded.
    if not media_type.startswith("text/"):
        return None
    decoded_text = (corpus.root / relpath).read_bytes().decode("utf-8")
    if media_type == "text/markdown":
        return parse_front_matter(decoded_text).body
    return decoded_text
328
+
329
+
330
def _iter_chunks(text: str, *, chunk_size: int, chunk_overlap: int) -> Iterable[Tuple[int, int, str]]:
    """
    Walk the text in fixed-size windows that overlap their predecessor.

    Each window starts ``chunk_size - chunk_overlap`` characters after the
    previous one; the final window is clipped to the end of the text. Empty
    text produces no chunks. This is a generator, so the argument check runs
    when iteration starts.

    :param text: Text to chunk.
    :type text: str
    :param chunk_size: Maximum chunk size.
    :type chunk_size: int
    :param chunk_overlap: Overlap between chunks.
    :type chunk_overlap: int
    :return: Iterable of (start, end, chunk) tuples.
    :rtype: Iterable[tuple[int, int, str]]
    :raises ValueError: If the overlap is greater than or equal to the chunk size.
    """

    if not chunk_overlap < chunk_size:
        raise ValueError("chunk_overlap must be smaller than chunk_size")
    total_characters = len(text)
    stride = chunk_size - chunk_overlap
    for window_start in range(0, total_characters, stride):
        window_end = min(total_characters, window_start + chunk_size)
        yield window_start, window_end, text[window_start:window_end]
        if window_end == total_characters:
            # The tail has been emitted; later windows would only repeat it.
            break
355
+
356
+
357
def _query_full_text_search_index(
    db_path: Path,
    query_text: str,
    limit: int,
    snippet_characters: int,
) -> List[Evidence]:
    """
    Query the SQLite full-text search index for evidence candidates.

    The query text is passed to the index as a full-text ``MATCH`` expression.
    A blank query returns no candidates without opening the database. Each
    candidate carries placeholder ``rank``, ``recipe_id`` and ``run_id``
    values.

    :param db_path: SQLite database path.
    :type db_path: Path
    :param query_text: Query text to execute.
    :type query_text: str
    :param limit: Maximum number of candidates to return.
    :type limit: int
    :param snippet_characters: Snippet length budget.
    :type snippet_characters: int
    :return: Evidence candidates.
    :rtype: list[Evidence]
    """

    # An empty MATCH expression is a full-text syntax error rather than an
    # empty result, so a blank query is answered directly.
    if not query_text.strip():
        return []

    connection = sqlite3.connect(str(db_path))
    try:
        rows = connection.execute(
            """
            SELECT
                content,
                item_id,
                source_uri,
                media_type,
                start_offset,
                end_offset,
                bm25(chunks_full_text_search) AS score
            FROM chunks_full_text_search
            WHERE chunks_full_text_search MATCH ?
            ORDER BY score
            LIMIT ?
            """,
            (query_text, limit),
        ).fetchall()
    finally:
        connection.close()

    evidence_items: List[Evidence] = []
    for (
        content,
        item_id,
        source_uri,
        media_type,
        start_offset,
        end_offset,
        score,
    ) in rows:
        # The snippet is simply the leading part of the matching chunk.
        snippet_text = content[:snippet_characters]
        evidence_items.append(
            Evidence(
                item_id=str(item_id),
                source_uri=str(source_uri) if source_uri is not None else None,
                media_type=str(media_type),
                # The query orders ascending by bm25(), i.e. best match first
                # has the lowest value; negate so a higher score means better.
                score=float(-score),
                rank=1,
                text=snippet_text,
                content_ref=None,
                span_start=int(start_offset) if start_offset is not None else None,
                span_end=int(end_offset) if end_offset is not None else None,
                stage="full-text-search",
                recipe_id="",
                run_id="",
                hash=hash_text(snippet_text),
            )
        )
    return evidence_items