biblicus 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. biblicus/__init__.py +30 -0
  2. biblicus/__main__.py +8 -0
  3. biblicus/_vendor/dotyaml/__init__.py +14 -0
  4. biblicus/_vendor/dotyaml/interpolation.py +63 -0
  5. biblicus/_vendor/dotyaml/loader.py +181 -0
  6. biblicus/_vendor/dotyaml/transformer.py +135 -0
  7. biblicus/backends/__init__.py +42 -0
  8. biblicus/backends/base.py +65 -0
  9. biblicus/backends/scan.py +375 -0
  10. biblicus/backends/sqlite_full_text_search.py +487 -0
  11. biblicus/cli.py +804 -0
  12. biblicus/constants.py +12 -0
  13. biblicus/context.py +183 -0
  14. biblicus/corpus.py +1531 -0
  15. biblicus/crawl.py +186 -0
  16. biblicus/errors.py +15 -0
  17. biblicus/evaluation.py +257 -0
  18. biblicus/evidence_processing.py +201 -0
  19. biblicus/extraction.py +531 -0
  20. biblicus/extractors/__init__.py +44 -0
  21. biblicus/extractors/base.py +68 -0
  22. biblicus/extractors/metadata_text.py +106 -0
  23. biblicus/extractors/openai_stt.py +180 -0
  24. biblicus/extractors/pass_through_text.py +84 -0
  25. biblicus/extractors/pdf_text.py +100 -0
  26. biblicus/extractors/pipeline.py +105 -0
  27. biblicus/extractors/rapidocr_text.py +129 -0
  28. biblicus/extractors/select_longest_text.py +105 -0
  29. biblicus/extractors/select_text.py +100 -0
  30. biblicus/extractors/unstructured_text.py +100 -0
  31. biblicus/frontmatter.py +89 -0
  32. biblicus/hook_logging.py +180 -0
  33. biblicus/hook_manager.py +203 -0
  34. biblicus/hooks.py +261 -0
  35. biblicus/ignore.py +64 -0
  36. biblicus/knowledge_base.py +191 -0
  37. biblicus/models.py +445 -0
  38. biblicus/retrieval.py +133 -0
  39. biblicus/sources.py +212 -0
  40. biblicus/time.py +17 -0
  41. biblicus/uris.py +63 -0
  42. biblicus/user_config.py +138 -0
  43. biblicus-0.6.0.dist-info/METADATA +533 -0
  44. biblicus-0.6.0.dist-info/RECORD +48 -0
  45. biblicus-0.6.0.dist-info/WHEEL +5 -0
  46. biblicus-0.6.0.dist-info/entry_points.txt +2 -0
  47. biblicus-0.6.0.dist-info/licenses/LICENSE +21 -0
  48. biblicus-0.6.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,375 @@
1
+ """
2
+ Naive full-scan retrieval backend.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from typing import Dict, Iterable, List, Optional, Tuple
8
+
9
+ from pydantic import BaseModel, ConfigDict, Field
10
+
11
+ from ..corpus import Corpus
12
+ from ..frontmatter import parse_front_matter
13
+ from ..models import (
14
+ Evidence,
15
+ ExtractionRunReference,
16
+ QueryBudget,
17
+ RetrievalResult,
18
+ RetrievalRun,
19
+ parse_extraction_run_reference,
20
+ )
21
+ from ..retrieval import apply_budget, create_recipe_manifest, create_run_manifest, hash_text
22
+ from ..time import utc_now_iso
23
+
24
+
25
class ScanRecipeConfig(BaseModel):
    """
    Configuration for the naive scan backend.

    Unknown keys are rejected (``extra="forbid"``) so recipe typos fail fast.

    :ivar snippet_characters: Maximum characters to include in evidence snippets.
    :vartype snippet_characters: int
    :ivar extraction_run: Optional extraction run reference in the form extractor_id:run_id.
    :vartype extraction_run: str or None
    """

    model_config = ConfigDict(extra="forbid")

    # Snippet budget must be at least one character.
    snippet_characters: int = Field(default=400, ge=1)
    extraction_run: Optional[str] = None
39
+
40
+
41
class ScanBackend:
    """
    Naive backend that scans all text items at query time.

    :ivar backend_id: Backend identifier.
    :vartype backend_id: str
    """

    backend_id = "scan"

    def build_run(
        self, corpus: Corpus, *, recipe_name: str, config: Dict[str, object]
    ) -> RetrievalRun:
        """
        Register a scan backend run (no materialization).

        :param corpus: Corpus to build against.
        :type corpus: Corpus
        :param recipe_name: Human-readable recipe name.
        :type recipe_name: str
        :param config: Backend-specific configuration values.
        :type config: dict[str, object]
        :return: Run manifest describing the build.
        :rtype: RetrievalRun
        """
        parsed_config = ScanRecipeConfig.model_validate(config)
        catalog = corpus.load_catalog()
        recipe = create_recipe_manifest(
            backend_id=self.backend_id,
            name=recipe_name,
            config=parsed_config.model_dump(),
        )
        # The scan backend builds no index; the run records counts only.
        build_stats = {
            "items": len(catalog.items),
            "text_items": _count_text_items(corpus, catalog.items.values(), parsed_config),
        }
        run = create_run_manifest(corpus, recipe=recipe, stats=build_stats, artifact_paths=[])
        corpus.write_run(run)
        return run

    def query(
        self,
        corpus: Corpus,
        *,
        run: RetrievalRun,
        query_text: str,
        budget: QueryBudget,
    ) -> RetrievalResult:
        """
        Query the corpus with a full scan.

        :param corpus: Corpus associated with the run.
        :type corpus: Corpus
        :param run: Run manifest to use for querying.
        :type run: RetrievalRun
        :param query_text: Query text to execute.
        :type query_text: str
        :param budget: Evidence selection budget.
        :type budget: QueryBudget
        :return: Retrieval results containing evidence.
        :rtype: RetrievalResult
        """
        parsed_config = ScanRecipeConfig.model_validate(run.recipe.config)
        catalog = corpus.load_catalog()
        extraction_reference = _resolve_extraction_reference(corpus, parsed_config)
        tokens = _tokenize_query(query_text)
        candidates = _score_items(
            corpus,
            catalog.items.values(),
            tokens,
            parsed_config.snippet_characters,
            extraction_reference=extraction_reference,
        )
        # Descending score, ties broken deterministically by item id.
        candidates.sort(key=lambda candidate: (-candidate.score, candidate.item_id))
        ranked: List[Evidence] = []
        for position, candidate in enumerate(candidates, start=1):
            ranked.append(
                candidate.model_copy(
                    update={
                        "rank": position,
                        "recipe_id": run.recipe.recipe_id,
                        "run_id": run.run_id,
                    }
                )
            )
        evidence = apply_budget(ranked, budget)
        return RetrievalResult(
            query_text=query_text,
            budget=budget,
            run_id=run.run_id,
            recipe_id=run.recipe.recipe_id,
            backend_id=self.backend_id,
            generated_at=utc_now_iso(),
            evidence=evidence,
            stats={"candidates": len(candidates), "returned": len(evidence)},
        )
140
+
141
+
142
def _resolve_extraction_reference(
    corpus: Corpus, recipe_config: ScanRecipeConfig
) -> Optional[ExtractionRunReference]:
    """
    Resolve an extraction run reference from a recipe config.

    :param corpus: Corpus associated with the recipe.
    :type corpus: Corpus
    :param recipe_config: Parsed scan recipe configuration.
    :type recipe_config: ScanRecipeConfig
    :return: Parsed extraction reference or None.
    :rtype: ExtractionRunReference or None
    :raises FileNotFoundError: If an extraction run is referenced but not present.
    """
    raw_reference = recipe_config.extraction_run
    if not raw_reference:
        return None
    reference = parse_extraction_run_reference(raw_reference)
    run_directory = corpus.extraction_run_dir(
        extractor_id=reference.extractor_id,
        run_id=reference.run_id,
    )
    # The run must already exist on disk; the scan backend never creates it.
    if run_directory.is_dir():
        return reference
    raise FileNotFoundError(f"Missing extraction run: {reference.as_string()}")
166
+
167
+
168
def _count_text_items(
    corpus: Corpus, items: Iterable[object], recipe_config: ScanRecipeConfig
) -> int:
    """
    Count catalog items that represent text content.

    When an extraction run is configured, extracted artifacts are treated as text.

    :param corpus: Corpus containing the items.
    :type corpus: Corpus
    :param items: Catalog items to inspect.
    :type items: Iterable[object]
    :param recipe_config: Parsed scan recipe configuration.
    :type recipe_config: ScanRecipeConfig
    :return: Number of text items.
    :rtype: int
    """
    text_item_count = 0
    extraction_reference = _resolve_extraction_reference(corpus, recipe_config)
    for catalog_item in items:
        item_id = str(getattr(catalog_item, "id", ""))
        if extraction_reference and item_id:
            extracted_text = corpus.read_extracted_text(
                extractor_id=extraction_reference.extractor_id,
                run_id=extraction_reference.run_id,
                item_id=item_id,
            )
            if isinstance(extracted_text, str) and extracted_text.strip():
                text_item_count += 1
                continue
            # Fall through: an empty/missing extraction still allows the raw
            # media type to qualify the item as text below.
        media_type = getattr(catalog_item, "media_type", "")
        # "text/markdown" already matches the "text/" prefix, so the previous
        # separate equality check was redundant and has been removed.
        if str(media_type).startswith("text/"):
            text_item_count += 1
    return text_item_count
202
+
203
+
204
+ def _tokenize_query(query_text: str) -> List[str]:
205
+ """
206
+ Tokenize a query string for naive text matching.
207
+
208
+ :param query_text: Raw query text.
209
+ :type query_text: str
210
+ :return: Lowercased non-empty tokens.
211
+ :rtype: list[str]
212
+ """
213
+ return [token for token in query_text.lower().split() if token]
214
+
215
+
216
+ def _load_text_from_item(
217
+ corpus: Corpus,
218
+ *,
219
+ item_id: str,
220
+ relpath: str,
221
+ media_type: str,
222
+ extraction_reference: Optional[ExtractionRunReference],
223
+ ) -> Optional[str]:
224
+ """
225
+ Load a text payload from a catalog item.
226
+
227
+ :param corpus: Corpus containing the item.
228
+ :type corpus: Corpus
229
+ :param item_id: Item identifier.
230
+ :type item_id: str
231
+ :param relpath: Relative path to the stored content.
232
+ :type relpath: str
233
+ :param media_type: Media type for the stored content.
234
+ :type media_type: str
235
+ :param extraction_reference: Optional extraction run reference.
236
+ :type extraction_reference: ExtractionRunReference or None
237
+ :return: Text payload or None if not decodable as text.
238
+ :rtype: str or None
239
+ """
240
+ if extraction_reference:
241
+ extracted_text = corpus.read_extracted_text(
242
+ extractor_id=extraction_reference.extractor_id,
243
+ run_id=extraction_reference.run_id,
244
+ item_id=item_id,
245
+ )
246
+ if isinstance(extracted_text, str) and extracted_text.strip():
247
+ return extracted_text
248
+
249
+ content_path = corpus.root / relpath
250
+ raw_bytes = content_path.read_bytes()
251
+ if media_type == "text/markdown":
252
+ markdown_text = raw_bytes.decode("utf-8")
253
+ parsed_document = parse_front_matter(markdown_text)
254
+ return parsed_document.body
255
+ if media_type.startswith("text/"):
256
+ return raw_bytes.decode("utf-8")
257
+ return None
258
+
259
+
260
+ def _find_first_match(text: str, tokens: List[str]) -> Optional[Tuple[int, int]]:
261
+ """
262
+ Locate the earliest token match span in a text payload.
263
+
264
+ :param text: Text to scan.
265
+ :type text: str
266
+ :param tokens: Query tokens.
267
+ :type tokens: list[str]
268
+ :return: Start/end span for the earliest match, or None if no matches.
269
+ :rtype: tuple[int, int] or None
270
+ """
271
+ lower_text = text.lower()
272
+ best_start: Optional[int] = None
273
+ best_end: Optional[int] = None
274
+ for token in tokens:
275
+ if not token:
276
+ continue
277
+ token_start = lower_text.find(token)
278
+ if token_start == -1:
279
+ continue
280
+ token_end = token_start + len(token)
281
+ if best_start is None or token_start < best_start:
282
+ best_start = token_start
283
+ best_end = token_end
284
+ if best_start is None or best_end is None:
285
+ return None
286
+ return best_start, best_end
287
+
288
+
289
+ def _build_snippet(text: str, span: Optional[Tuple[int, int]], *, max_chars: int) -> str:
290
+ """
291
+ Build a snippet around a match span, constrained by a character budget.
292
+
293
+ :param text: Source text to slice.
294
+ :type text: str
295
+ :param span: Match span to center on.
296
+ :type span: tuple[int, int] or None
297
+ :param max_chars: Maximum snippet length.
298
+ :type max_chars: int
299
+ :return: Snippet text.
300
+ :rtype: str
301
+ """
302
+ if not text:
303
+ return ""
304
+ if span is None:
305
+ return text[:max_chars]
306
+ span_start, span_end = span
307
+ half_window = max_chars // 2
308
+ snippet_start = max(span_start - half_window, 0)
309
+ snippet_end = min(span_end + half_window, len(text))
310
+ return text[snippet_start:snippet_end]
311
+
312
+
313
def _score_items(
    corpus: Corpus,
    items: Iterable[object],
    tokens: List[str],
    snippet_characters: int,
    *,
    extraction_reference: Optional[ExtractionRunReference],
) -> List[Evidence]:
    """
    Score catalog items by token frequency and return evidence candidates.

    :param corpus: Corpus containing the items.
    :type corpus: Corpus
    :param items: Catalog items to score.
    :type items: Iterable[object]
    :param tokens: Query tokens to count.
    :type tokens: list[str]
    :param snippet_characters: Snippet length budget.
    :type snippet_characters: int
    :param extraction_reference: Optional extraction run to read text from.
    :type extraction_reference: ExtractionRunReference or None
    :return: Evidence candidates with provisional ranks.
    :rtype: list[Evidence]
    """
    evidence_items: List[Evidence] = []
    for catalog_item in items:
        media_type = str(getattr(catalog_item, "media_type", ""))
        relpath = getattr(catalog_item, "relpath", "")
        item_id = str(getattr(catalog_item, "id", ""))
        item_text = _load_text_from_item(
            corpus,
            item_id=item_id,
            relpath=relpath,
            media_type=media_type,
            extraction_reference=extraction_reference,
        )
        if item_text is None:
            continue
        lower_text = item_text.lower()
        match_score = sum(lower_text.count(token) for token in tokens)
        if match_score <= 0:
            continue
        span = _find_first_match(item_text, tokens)
        snippet = _build_snippet(item_text, span, max_chars=snippet_characters)
        evidence_items.append(
            Evidence(
                # Reuse the defaulted lookup above; the previous
                # getattr(catalog_item, "id") without a default could raise
                # AttributeError on items missing the attribute.
                item_id=item_id,
                source_uri=getattr(catalog_item, "source_uri", None),
                media_type=media_type,
                score=float(match_score),
                rank=1,  # provisional; the caller assigns final ranks
                text=snippet,
                content_ref=None,
                span_start=span[0] if span else None,
                span_end=span[1] if span else None,
                stage="scan",
                recipe_id="",  # filled in by the caller from the run manifest
                run_id="",
                hash=hash_text(snippet),
            )
        )

    return evidence_items