biblicus 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. biblicus/__init__.py +30 -0
  2. biblicus/__main__.py +8 -0
  3. biblicus/_vendor/dotyaml/__init__.py +14 -0
  4. biblicus/_vendor/dotyaml/interpolation.py +63 -0
  5. biblicus/_vendor/dotyaml/loader.py +181 -0
  6. biblicus/_vendor/dotyaml/transformer.py +135 -0
  7. biblicus/backends/__init__.py +42 -0
  8. biblicus/backends/base.py +65 -0
  9. biblicus/backends/scan.py +375 -0
  10. biblicus/backends/sqlite_full_text_search.py +487 -0
  11. biblicus/cli.py +804 -0
  12. biblicus/constants.py +12 -0
  13. biblicus/context.py +183 -0
  14. biblicus/corpus.py +1531 -0
  15. biblicus/crawl.py +186 -0
  16. biblicus/errors.py +15 -0
  17. biblicus/evaluation.py +257 -0
  18. biblicus/evidence_processing.py +201 -0
  19. biblicus/extraction.py +531 -0
  20. biblicus/extractors/__init__.py +44 -0
  21. biblicus/extractors/base.py +68 -0
  22. biblicus/extractors/metadata_text.py +106 -0
  23. biblicus/extractors/openai_stt.py +180 -0
  24. biblicus/extractors/pass_through_text.py +84 -0
  25. biblicus/extractors/pdf_text.py +100 -0
  26. biblicus/extractors/pipeline.py +105 -0
  27. biblicus/extractors/rapidocr_text.py +129 -0
  28. biblicus/extractors/select_longest_text.py +105 -0
  29. biblicus/extractors/select_text.py +100 -0
  30. biblicus/extractors/unstructured_text.py +100 -0
  31. biblicus/frontmatter.py +89 -0
  32. biblicus/hook_logging.py +180 -0
  33. biblicus/hook_manager.py +203 -0
  34. biblicus/hooks.py +261 -0
  35. biblicus/ignore.py +64 -0
  36. biblicus/knowledge_base.py +191 -0
  37. biblicus/models.py +445 -0
  38. biblicus/retrieval.py +133 -0
  39. biblicus/sources.py +212 -0
  40. biblicus/time.py +17 -0
  41. biblicus/uris.py +63 -0
  42. biblicus/user_config.py +138 -0
  43. biblicus-0.6.0.dist-info/METADATA +533 -0
  44. biblicus-0.6.0.dist-info/RECORD +48 -0
  45. biblicus-0.6.0.dist-info/WHEEL +5 -0
  46. biblicus-0.6.0.dist-info/entry_points.txt +2 -0
  47. biblicus-0.6.0.dist-info/licenses/LICENSE +21 -0
  48. biblicus-0.6.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,487 @@
1
+ """
2
+ SQLite full-text search version five retrieval backend for Biblicus.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import sqlite3
8
+ from pathlib import Path
9
+ from typing import Dict, Iterable, List, Optional, Tuple
10
+
11
+ from pydantic import BaseModel, ConfigDict, Field
12
+
13
+ from ..constants import CORPUS_DIR_NAME, RUNS_DIR_NAME
14
+ from ..corpus import Corpus
15
+ from ..frontmatter import parse_front_matter
16
+ from ..models import (
17
+ Evidence,
18
+ ExtractionRunReference,
19
+ QueryBudget,
20
+ RetrievalResult,
21
+ RetrievalRun,
22
+ parse_extraction_run_reference,
23
+ )
24
+ from ..retrieval import apply_budget, create_recipe_manifest, create_run_manifest, hash_text
25
+ from ..time import utc_now_iso
26
+
27
+
28
class SqliteFullTextSearchRecipeConfig(BaseModel):
    """
    Recipe settings accepted by the SQLite full-text search backend.

    Keys that are not declared here are rejected during validation.

    :ivar chunk_size: Upper bound on the characters stored in one indexed chunk (at least 1).
    :vartype chunk_size: int
    :ivar chunk_overlap: Characters shared by two consecutive chunks (zero or more).
    :vartype chunk_overlap: int
    :ivar snippet_characters: Upper bound on the characters copied into an evidence snippet.
    :vartype snippet_characters: int
    :ivar extraction_run: Extraction run reference written as extractor_id:run_id, when used.
    :vartype extraction_run: str or None
    """

    model_config = ConfigDict(extra="forbid")

    chunk_size: int = Field(800, ge=1)
    chunk_overlap: int = Field(200, ge=0)
    snippet_characters: int = Field(400, ge=1)
    extraction_run: Optional[str] = Field(default=None)
48
+
49
+
50
class SqliteFullTextSearchBackend:
    """
    Local retrieval backend backed by a SQLite full-text search version five index.

    :ivar backend_id: Identifier written into recipe manifests and retrieval results.
    :vartype backend_id: str
    """

    backend_id = "sqlite-full-text-search"

    def build_run(
        self, corpus: Corpus, *, recipe_name: str, config: Dict[str, object]
    ) -> RetrievalRun:
        """
        Index the corpus into a new SQLite database and record the run.

        The database is written below the corpus runs directory and named after the
        run identifier. Its relative path and the build statistics are stored on the
        run manifest before the manifest is written to the corpus.

        :param corpus: Corpus to index.
        :type corpus: Corpus
        :param recipe_name: Human-readable recipe name.
        :type recipe_name: str
        :param config: Raw backend configuration, validated against the recipe config model.
        :type config: dict[str, object]
        :return: Run manifest describing the finished build.
        :rtype: RetrievalRun
        """
        validated_config = SqliteFullTextSearchRecipeConfig.model_validate(config)
        catalog = corpus.load_catalog()
        recipe_manifest = create_recipe_manifest(
            backend_id=self.backend_id,
            name=recipe_name,
            config=validated_config.model_dump(),
        )
        pending_run = create_run_manifest(
            corpus, recipe=recipe_manifest, stats={}, artifact_paths=[]
        )
        index_filename = f"{pending_run.run_id}.sqlite"
        index_relpath = str(Path(CORPUS_DIR_NAME) / RUNS_DIR_NAME / index_filename)
        index_path = corpus.root / index_relpath
        corpus.runs_dir.mkdir(parents=True, exist_ok=True)
        extraction_reference = _resolve_extraction_reference(corpus, validated_config)
        index_stats = _build_full_text_search_index(
            db_path=index_path,
            corpus=corpus,
            items=catalog.items.values(),
            recipe_config=validated_config,
            extraction_reference=extraction_reference,
        )
        finished_run = pending_run.model_copy(
            update={"artifact_paths": [index_relpath], "stats": index_stats}
        )
        corpus.write_run(finished_run)
        return finished_run

    def query(
        self,
        corpus: Corpus,
        *,
        run: RetrievalRun,
        query_text: str,
        budget: QueryBudget,
    ) -> RetrievalResult:
        """
        Search the index built for a run and return budgeted evidence.

        Candidates are ordered by descending score, with the item identifier as the
        tie-breaker, numbered from one, and then trimmed by the budget.

        :param corpus: Corpus that owns the run artifacts.
        :type corpus: Corpus
        :param run: Run manifest whose index is searched.
        :type run: RetrievalRun
        :param query_text: Query text to execute.
        :type query_text: str
        :param budget: Evidence selection budget.
        :type budget: QueryBudget
        :return: Retrieval result holding the selected evidence and query statistics.
        :rtype: RetrievalResult
        """
        settings = SqliteFullTextSearchRecipeConfig.model_validate(run.recipe.config)
        index_path = _resolve_run_db_path(corpus, run)
        candidates = _query_full_text_search_index(
            db_path=index_path,
            query_text=query_text,
            limit=_candidate_limit(budget.max_total_items),
            snippet_characters=settings.snippet_characters,
        )

        def _ordering(candidate: Evidence) -> Tuple[float, str]:
            # Best score first; the item identifier makes ties deterministic.
            return (-candidate.score, candidate.item_id)

        ordered_candidates = sorted(candidates, key=_ordering)
        ranked: List[Evidence] = []
        for position, candidate in enumerate(ordered_candidates, start=1):
            ranked.append(
                candidate.model_copy(
                    update={
                        "rank": position,
                        "recipe_id": run.recipe.recipe_id,
                        "run_id": run.run_id,
                    }
                )
            )
        selected = apply_budget(ranked, budget)
        return RetrievalResult(
            query_text=query_text,
            budget=budget,
            run_id=run.run_id,
            recipe_id=run.recipe.recipe_id,
            backend_id=self.backend_id,
            generated_at=utc_now_iso(),
            evidence=selected,
            stats={"candidates": len(ordered_candidates), "returned": len(selected)},
        )
154
+
155
+
156
def _candidate_limit(max_total_items: int) -> int:
    """
    Compute how many candidates to request from the index for an evidence budget.

    The index is asked for five times the requested evidence count.

    :param max_total_items: Requested evidence count.
    :type max_total_items: int
    :return: Candidate limit for backend search.
    :rtype: int
    """
    oversampling_factor = 5
    return max_total_items * oversampling_factor
166
+
167
+
168
def _resolve_run_db_path(corpus: Corpus, run: RetrievalRun) -> Path:
    """
    Locate the SQLite index file that belongs to a retrieval run.

    :param corpus: Corpus containing run artifacts.
    :type corpus: Corpus
    :param run: Retrieval run manifest.
    :type run: RetrievalRun
    :return: Path to the SQLite index file.
    :rtype: Path
    :raises FileNotFoundError: If the run does not have artifact paths.
    """
    recorded_paths = run.artifact_paths
    if recorded_paths:
        # The first recorded artifact is treated as the index, relative to the corpus root.
        return corpus.root / recorded_paths[0]
    raise FileNotFoundError("Run has no artifact paths to query")
183
+
184
+
185
def _ensure_full_text_search_version_five(conn: sqlite3.Connection) -> None:
    """
    Verify SQLite full-text search version five support in the current runtime.

    The check creates and immediately drops a throwaway virtual table in the
    connection's temporary schema, under a name used for nothing else. Tables in the
    main database are never touched, so the check is safe on a connection that
    already holds a ``chunks_full_text_search`` index.

    :param conn: SQLite connection to test.
    :type conn: sqlite3.Connection
    :return: None.
    :rtype: None
    :raises RuntimeError: If full-text search version five support is unavailable.
    """
    try:
        # IF NOT EXISTS / IF EXISTS keep the probe repeatable even if an earlier
        # probe on this connection was interrupted between the two statements.
        conn.execute(
            "CREATE VIRTUAL TABLE IF NOT EXISTS temp.biblicus_fts5_probe USING fts5(content)"
        ).close()
        conn.execute("DROP TABLE IF EXISTS temp.biblicus_fts5_probe").close()
    except sqlite3.OperationalError as operational_error:
        raise RuntimeError(
            "SQLite full-text search version five is required but not available in this Python build"
        ) from operational_error
205
+
206
+
207
def _create_full_text_search_schema(conn: sqlite3.Connection) -> None:
    """
    Create the ``chunks_full_text_search`` virtual table on a fresh SQLite database.

    Only ``content`` is declared as a searchable column; every other column is
    declared UNINDEXED. The statement has no IF NOT EXISTS clause, so it fails when
    the table is already present.

    :param conn: SQLite connection for schema creation.
    :type conn: sqlite3.Connection
    :return: None.
    :rtype: None
    """
    create_statement = """
        CREATE VIRTUAL TABLE chunks_full_text_search USING fts5(
            content,
            item_id UNINDEXED,
            source_uri UNINDEXED,
            media_type UNINDEXED,
            relpath UNINDEXED,
            title UNINDEXED,
            start_offset UNINDEXED,
            end_offset UNINDEXED
        )
        """
    conn.execute(create_statement)
230
+
231
+
232
def _build_full_text_search_index(
    *,
    db_path: Path,
    corpus: Corpus,
    items: Iterable[object],
    recipe_config: SqliteFullTextSearchRecipeConfig,
    extraction_reference: Optional[ExtractionRunReference],
) -> Dict[str, int]:
    """
    Build a full-text search index from corpus items.

    Any existing file at ``db_path`` is removed first, so the index is always rebuilt
    from scratch. Items for which no text can be loaded are counted but not indexed.
    If the build fails part-way, the partially written database file is removed so
    that no orphaned index is left behind.

    :param db_path: Destination SQLite database path.
    :type db_path: Path
    :param corpus: Corpus containing the items.
    :type corpus: Corpus
    :param items: Catalog items to index.
    :type items: Iterable[object]
    :param recipe_config: Chunking and snippet configuration.
    :type recipe_config: SqliteFullTextSearchRecipeConfig
    :param extraction_reference: Optional extraction run whose text is preferred over raw content.
    :type extraction_reference: ExtractionRunReference or None
    :return: Index statistics with the keys ``items``, ``text_items``, ``chunks`` and ``bytes``.
    :rtype: dict[str, int]
    """
    insert_statement = """
        INSERT INTO chunks_full_text_search (
            content,
            item_id,
            source_uri,
            media_type,
            relpath,
            title,
            start_offset,
            end_offset
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?)
        """
    if db_path.exists():
        db_path.unlink()
    connection = sqlite3.connect(str(db_path))
    build_completed = False
    try:
        _ensure_full_text_search_version_five(connection)
        _create_full_text_search_schema(connection)
        chunk_count = 0
        item_count = 0
        text_item_count = 0
        for catalog_item in items:
            item_count += 1
            # Resolve the identifier once so the text lookup and the stored rows
            # always refer to the same item.
            item_id = str(getattr(catalog_item, "id"))
            media_type = str(getattr(catalog_item, "media_type", ""))
            relpath = str(getattr(catalog_item, "relpath", ""))
            item_text = _load_text_from_item(
                corpus,
                item_id=item_id,
                relpath=relpath,
                media_type=media_type,
                extraction_reference=extraction_reference,
            )
            if item_text is None:
                continue
            text_item_count += 1
            title = getattr(catalog_item, "title", None)
            stored_title = str(title) if title is not None else None
            source_uri = getattr(catalog_item, "source_uri", None)
            for start_offset, end_offset, chunk in _iter_chunks(
                item_text,
                chunk_size=recipe_config.chunk_size,
                chunk_overlap=recipe_config.chunk_overlap,
            ):
                connection.execute(
                    insert_statement,
                    (
                        chunk,
                        item_id,
                        source_uri,
                        media_type,
                        relpath,
                        stored_title,
                        start_offset,
                        end_offset,
                    ),
                )
                chunk_count += 1
        connection.commit()
        stats = {
            "items": item_count,
            "text_items": text_item_count,
            "chunks": chunk_count,
            "bytes": db_path.stat().st_size if db_path.exists() else 0,
        }
        build_completed = True
        return stats
    finally:
        # Close before deleting: an open database file cannot be removed on every platform.
        connection.close()
        if not build_completed and db_path.exists():
            db_path.unlink()
317
+
318
+
319
def _load_text_from_item(
    corpus: Corpus,
    *,
    item_id: str,
    relpath: str,
    media_type: str,
    extraction_reference: Optional[ExtractionRunReference],
) -> Optional[str]:
    """
    Load text content from a catalog item.

    When an extraction run is referenced and it holds non-blank text for the item,
    that text is returned. Otherwise the raw content is used: Markdown is returned
    without its front matter, other ``text/*`` media types are returned as decoded,
    and every other media type yields None without the file being read.

    :param corpus: Corpus containing the content.
    :type corpus: Corpus
    :param item_id: Item identifier.
    :type item_id: str
    :param relpath: Relative path to the content.
    :type relpath: str
    :param media_type: Media type for the content.
    :type media_type: str
    :param extraction_reference: Optional extraction run reference.
    :type extraction_reference: ExtractionRunReference or None
    :return: Text payload or None if not text.
    :rtype: str or None
    """
    if extraction_reference:
        extracted_text = corpus.read_extracted_text(
            extractor_id=extraction_reference.extractor_id,
            run_id=extraction_reference.run_id,
            item_id=item_id,
        )
        if isinstance(extracted_text, str) and extracted_text.strip():
            return extracted_text

    # Decide from the media type before touching the file, so binary payloads
    # (which can be large) are never read into memory just to be discarded.
    if not media_type.startswith("text/"):
        return None
    # Decode from bytes rather than read_text() so line endings stay untouched.
    decoded_text = (corpus.root / relpath).read_bytes().decode("utf-8")
    if media_type == "text/markdown":
        return parse_front_matter(decoded_text).body
    return decoded_text
361
+
362
+
363
def _resolve_extraction_reference(
    corpus: Corpus,
    recipe_config: SqliteFullTextSearchRecipeConfig,
) -> Optional[ExtractionRunReference]:
    """
    Turn the recipe's optional extraction run setting into a checked reference.

    :param corpus: Corpus associated with the recipe.
    :type corpus: Corpus
    :param recipe_config: Parsed backend recipe configuration.
    :type recipe_config: SqliteFullTextSearchRecipeConfig
    :return: Parsed extraction reference, or None when the recipe names no extraction run.
    :rtype: ExtractionRunReference or None
    :raises FileNotFoundError: If an extraction run is referenced but not present.
    """
    if recipe_config.extraction_run:
        parsed_reference = parse_extraction_run_reference(recipe_config.extraction_run)
        expected_directory = corpus.extraction_run_dir(
            extractor_id=parsed_reference.extractor_id,
            run_id=parsed_reference.run_id,
        )
        if expected_directory.is_dir():
            return parsed_reference
        raise FileNotFoundError(f"Missing extraction run: {parsed_reference.as_string()}")
    return None
388
+
389
+
390
def _iter_chunks(
    text: str, *, chunk_size: int, chunk_overlap: int
) -> Iterable[Tuple[int, int, str]]:
    """
    Lazily split text into fixed-size windows that overlap their predecessor.

    Each window starts ``chunk_size - chunk_overlap`` characters after the previous
    one, and iteration stops with the window that reaches the end of the text. Empty
    text produces no chunks. Because this is a generator, the argument check runs
    when iteration begins, not when the function is called.

    :param text: Text to chunk.
    :type text: str
    :param chunk_size: Maximum chunk size.
    :type chunk_size: int
    :param chunk_overlap: Overlap between chunks.
    :type chunk_overlap: int
    :return: Iterable of (start, end, chunk) tuples.
    :rtype: Iterable[tuple[int, int, str]]
    :raises ValueError: If the overlap is greater than or equal to the chunk size.
    """
    if chunk_size <= chunk_overlap:
        raise ValueError("chunk_overlap must be smaller than chunk_size")
    total_characters = len(text)
    stride = chunk_size - chunk_overlap
    for window_start in range(0, total_characters, stride):
        window_end = min(total_characters, window_start + chunk_size)
        yield window_start, window_end, text[window_start:window_end]
        if window_end == total_characters:
            # This window already reached the end; later starts would only repeat the tail.
            return
416
+
417
+
418
def _query_full_text_search_index(
    db_path: Path,
    query_text: str,
    limit: int,
    snippet_characters: int,
) -> List[Evidence]:
    """
    Run a MATCH query against the index and wrap each hit as an evidence candidate.

    Rows are fetched in ascending order of the bm25 value, and that value is stored
    negated as the evidence score. Every candidate is created with rank 1 and empty
    recipe and run identifiers.

    :param db_path: SQLite database path.
    :type db_path: Path
    :param query_text: Query text to execute.
    :type query_text: str
    :param limit: Maximum number of candidates to return.
    :type limit: int
    :param snippet_characters: Snippet length budget.
    :type snippet_characters: int
    :return: Evidence candidates.
    :rtype: list[Evidence]
    """
    match_statement = """
        SELECT
            content,
            item_id,
            source_uri,
            media_type,
            start_offset,
            end_offset,
            bm25(chunks_full_text_search) AS score
        FROM chunks_full_text_search
        WHERE chunks_full_text_search MATCH ?
        ORDER BY score
        LIMIT ?
        """

    def _as_evidence(row: Tuple[object, ...]) -> Evidence:
        content, item_id, source_uri, media_type, start_offset, end_offset, bm25_value = row
        # The snippet is the leading part of the matched chunk.
        snippet = content[:snippet_characters]
        return Evidence(
            item_id=str(item_id),
            source_uri=None if source_uri is None else str(source_uri),
            media_type=str(media_type),
            score=float(-bm25_value),
            rank=1,
            text=snippet,
            content_ref=None,
            span_start=None if start_offset is None else int(start_offset),
            span_end=None if end_offset is None else int(end_offset),
            stage="full-text-search",
            recipe_id="",
            run_id="",
            hash=hash_text(snippet),
        )

    connection = sqlite3.connect(str(db_path))
    try:
        matched_rows = connection.execute(match_statement, (query_text, limit)).fetchall()
        return [_as_evidence(row) for row in matched_rows]
    finally:
        connection.close()