biblicus 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (51)
  1. biblicus/__init__.py +5 -5
  2. biblicus/analysis/__init__.py +1 -1
  3. biblicus/analysis/base.py +10 -10
  4. biblicus/analysis/markov.py +78 -68
  5. biblicus/analysis/models.py +47 -47
  6. biblicus/analysis/profiling.py +58 -48
  7. biblicus/analysis/topic_modeling.py +56 -51
  8. biblicus/cli.py +224 -177
  9. biblicus/{recipes.py → configuration.py} +14 -14
  10. biblicus/constants.py +2 -2
  11. biblicus/context_engine/assembler.py +49 -19
  12. biblicus/context_engine/retrieval.py +46 -42
  13. biblicus/corpus.py +116 -108
  14. biblicus/errors.py +3 -3
  15. biblicus/evaluation.py +27 -25
  16. biblicus/extraction.py +103 -98
  17. biblicus/extraction_evaluation.py +26 -26
  18. biblicus/extractors/deepgram_stt.py +7 -7
  19. biblicus/extractors/docling_granite_text.py +11 -11
  20. biblicus/extractors/docling_smol_text.py +11 -11
  21. biblicus/extractors/markitdown_text.py +4 -4
  22. biblicus/extractors/openai_stt.py +7 -7
  23. biblicus/extractors/paddleocr_vl_text.py +20 -18
  24. biblicus/extractors/pipeline.py +8 -8
  25. biblicus/extractors/rapidocr_text.py +3 -3
  26. biblicus/extractors/unstructured_text.py +3 -3
  27. biblicus/hooks.py +4 -4
  28. biblicus/knowledge_base.py +33 -31
  29. biblicus/models.py +78 -78
  30. biblicus/retrieval.py +47 -40
  31. biblicus/retrievers/__init__.py +50 -0
  32. biblicus/retrievers/base.py +65 -0
  33. biblicus/{backends → retrievers}/embedding_index_common.py +44 -41
  34. biblicus/{backends → retrievers}/embedding_index_file.py +87 -58
  35. biblicus/{backends → retrievers}/embedding_index_inmemory.py +88 -59
  36. biblicus/retrievers/hybrid.py +301 -0
  37. biblicus/{backends → retrievers}/scan.py +83 -73
  38. biblicus/{backends → retrievers}/sqlite_full_text_search.py +115 -101
  39. biblicus/{backends → retrievers}/tf_vector.py +87 -77
  40. biblicus/text/prompts.py +16 -8
  41. biblicus/text/tool_loop.py +63 -5
  42. {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/METADATA +30 -21
  43. biblicus-1.1.0.dist-info/RECORD +91 -0
  44. biblicus/backends/__init__.py +0 -50
  45. biblicus/backends/base.py +0 -65
  46. biblicus/backends/hybrid.py +0 -292
  47. biblicus-1.0.0.dist-info/RECORD +0 -91
  48. {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/WHEEL +0 -0
  49. {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/entry_points.txt +0 -0
  50. {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/licenses/LICENSE +0 -0
  51. {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/top_level.txt +0 -0
@@ -1,5 +1,5 @@
1
1
  """
2
- SQLite full-text search version five retrieval backend for Biblicus.
2
+ SQLite full-text search version five retriever for Biblicus.
3
3
  """
4
4
 
5
5
  from __future__ import annotations
@@ -10,24 +10,29 @@ from typing import Dict, Iterable, List, Optional, Set, Tuple, Union
10
10
 
11
11
  from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
12
12
 
13
- from ..constants import CORPUS_DIR_NAME, RUNS_DIR_NAME
13
+ from ..constants import CORPUS_DIR_NAME, SNAPSHOTS_DIR_NAME
14
14
  from ..corpus import Corpus
15
15
  from ..frontmatter import parse_front_matter
16
16
  from ..models import (
17
17
  Evidence,
18
- ExtractionRunReference,
18
+ ExtractionSnapshotReference,
19
19
  QueryBudget,
20
20
  RetrievalResult,
21
- RetrievalRun,
22
- parse_extraction_run_reference,
21
+ RetrievalSnapshot,
22
+ parse_extraction_snapshot_reference,
23
+ )
24
+ from ..retrieval import (
25
+ apply_budget,
26
+ create_configuration_manifest,
27
+ create_snapshot_manifest,
28
+ hash_text,
23
29
  )
24
- from ..retrieval import apply_budget, create_recipe_manifest, create_run_manifest, hash_text
25
30
  from ..time import utc_now_iso
26
31
 
27
32
 
28
- class SqliteFullTextSearchRecipeConfig(BaseModel):
33
+ class SqliteFullTextSearchConfiguration(BaseModel):
29
34
  """
30
- Configuration for the SQLite full-text search backend.
35
+ Configuration for the SQLite full-text search retriever.
31
36
 
32
37
  :ivar chunk_size: Maximum characters per chunk.
33
38
  :vartype chunk_size: int
@@ -57,8 +62,8 @@ class SqliteFullTextSearchRecipeConfig(BaseModel):
57
62
  :vartype rerank_model: str or None
58
63
  :ivar rerank_top_k: Number of candidates to rerank.
59
64
  :vartype rerank_top_k: int
60
- :ivar extraction_run: Optional extraction run reference in the form extractor_id:run_id.
61
- :vartype extraction_run: str or None
65
+ :ivar extraction_snapshot: Optional extraction snapshot reference in the form extractor_id:snapshot_id.
66
+ :vartype extraction_snapshot: str or None
62
67
  """
63
68
 
64
69
  model_config = ConfigDict(extra="forbid")
@@ -77,7 +82,7 @@ class SqliteFullTextSearchRecipeConfig(BaseModel):
77
82
  rerank_enabled: bool = False
78
83
  rerank_model: Optional[str] = None
79
84
  rerank_top_k: int = Field(default=10, ge=1)
80
- extraction_run: Optional[str] = None
85
+ extraction_snapshot: Optional[str] = None
81
86
 
82
87
  @field_validator("stop_words")
83
88
  @classmethod
@@ -97,7 +102,7 @@ class SqliteFullTextSearchRecipeConfig(BaseModel):
97
102
  return value
98
103
 
99
104
  @model_validator(mode="after")
100
- def _validate_ngram_range(self) -> "SqliteFullTextSearchRecipeConfig":
105
+ def _validate_ngram_range(self) -> "SqliteFullTextSearchConfiguration":
101
106
  if self.ngram_min > self.ngram_max:
102
107
  raise ValueError("Invalid ngram range: ngram_min must be <= ngram_max")
103
108
  if self.rerank_enabled and not self.rerank_model:
@@ -142,69 +147,76 @@ _ENGLISH_STOP_WORDS: Set[str] = {
142
147
  }
143
148
 
144
149
 
145
- class SqliteFullTextSearchBackend:
150
+ class SqliteFullTextSearchRetriever:
146
151
  """
147
- SQLite full-text search version five backend for practical local retrieval.
152
+ SQLite full-text search version five retriever for practical local retrieval.
148
153
 
149
- :ivar backend_id: Backend identifier.
150
- :vartype backend_id: str
154
+ :ivar retriever_id: Retriever identifier.
155
+ :vartype retriever_id: str
151
156
  """
152
157
 
153
- backend_id = "sqlite-full-text-search"
158
+ retriever_id = "sqlite-full-text-search"
154
159
 
155
- def build_run(
156
- self, corpus: Corpus, *, recipe_name: str, config: Dict[str, object]
157
- ) -> RetrievalRun:
160
+ def build_snapshot(
161
+ self, corpus: Corpus, *, configuration_name: str, configuration: Dict[str, object]
162
+ ) -> RetrievalSnapshot:
158
163
  """
159
164
  Build a full-text search version five index for the corpus.
160
165
 
161
166
  :param corpus: Corpus to build against.
162
167
  :type corpus: Corpus
163
- :param recipe_name: Human-readable recipe name.
164
- :type recipe_name: str
165
- :param config: Backend-specific configuration values.
166
- :type config: dict[str, object]
167
- :return: Run manifest describing the build.
168
- :rtype: RetrievalRun
168
+ :param configuration_name: Human-readable configuration name.
169
+ :type configuration_name: str
170
+ :param configuration: Retriever-specific configuration values.
171
+ :type configuration: dict[str, object]
172
+ :return: Snapshot manifest describing the build.
173
+ :rtype: RetrievalSnapshot
169
174
  """
170
- recipe_config = SqliteFullTextSearchRecipeConfig.model_validate(config)
175
+ parsed_config = SqliteFullTextSearchConfiguration.model_validate(configuration)
171
176
  catalog = corpus.load_catalog()
172
- recipe = create_recipe_manifest(
173
- backend_id=self.backend_id,
174
- name=recipe_name,
175
- config=recipe_config.model_dump(),
177
+ configuration_manifest = create_configuration_manifest(
178
+ retriever_id=self.retriever_id,
179
+ name=configuration_name,
180
+ configuration=parsed_config.model_dump(),
181
+ )
182
+ snapshot = create_snapshot_manifest(
183
+ corpus,
184
+ configuration=configuration_manifest,
185
+ stats={},
186
+ snapshot_artifacts=[],
187
+ )
188
+ db_relpath = str(
189
+ Path(CORPUS_DIR_NAME) / SNAPSHOTS_DIR_NAME / f"{snapshot.snapshot_id}.sqlite"
176
190
  )
177
- run = create_run_manifest(corpus, recipe=recipe, stats={}, artifact_paths=[])
178
- db_relpath = str(Path(CORPUS_DIR_NAME) / RUNS_DIR_NAME / f"{run.run_id}.sqlite")
179
191
  db_path = corpus.root / db_relpath
180
- corpus.runs_dir.mkdir(parents=True, exist_ok=True)
181
- extraction_reference = _resolve_extraction_reference(corpus, recipe_config)
192
+ corpus.snapshots_dir.mkdir(parents=True, exist_ok=True)
193
+ extraction_reference = _resolve_extraction_reference(corpus, parsed_config)
182
194
  stats = _build_full_text_search_index(
183
195
  db_path=db_path,
184
196
  corpus=corpus,
185
197
  items=catalog.items.values(),
186
- recipe_config=recipe_config,
198
+ configuration=parsed_config,
187
199
  extraction_reference=extraction_reference,
188
200
  )
189
- run = run.model_copy(update={"artifact_paths": [db_relpath], "stats": stats})
190
- corpus.write_run(run)
191
- return run
201
+ snapshot = snapshot.model_copy(update={"snapshot_artifacts": [db_relpath], "stats": stats})
202
+ corpus.write_snapshot(snapshot)
203
+ return snapshot
192
204
 
193
205
  def query(
194
206
  self,
195
207
  corpus: Corpus,
196
208
  *,
197
- run: RetrievalRun,
209
+ snapshot: RetrievalSnapshot,
198
210
  query_text: str,
199
211
  budget: QueryBudget,
200
212
  ) -> RetrievalResult:
201
213
  """
202
214
  Query the SQLite full-text search index for evidence.
203
215
 
204
- :param corpus: Corpus associated with the run.
216
+ :param corpus: Corpus associated with the snapshot.
205
217
  :type corpus: Corpus
206
- :param run: Run manifest to use for querying.
207
- :type run: RetrievalRun
218
+ :param snapshot: Snapshot manifest to use for querying.
219
+ :type snapshot: RetrievalSnapshot
208
220
  :param query_text: Query text to execute.
209
221
  :type query_text: str
210
222
  :param budget: Evidence selection budget.
@@ -212,46 +224,48 @@ class SqliteFullTextSearchBackend:
212
224
  :return: Retrieval results containing evidence.
213
225
  :rtype: RetrievalResult
214
226
  """
215
- recipe_config = SqliteFullTextSearchRecipeConfig.model_validate(run.recipe.config)
227
+ parsed_config = SqliteFullTextSearchConfiguration.model_validate(
228
+ snapshot.configuration.configuration
229
+ )
216
230
  query_tokens = _tokenize_query(query_text)
217
- stop_words = _resolve_stop_words(recipe_config.stop_words)
231
+ stop_words = _resolve_stop_words(parsed_config.stop_words)
218
232
  filtered_tokens = _apply_stop_words(query_tokens, stop_words)
219
233
  if not filtered_tokens:
220
234
  return RetrievalResult(
221
235
  query_text=query_text,
222
236
  budget=budget,
223
- run_id=run.run_id,
224
- recipe_id=run.recipe.recipe_id,
225
- backend_id=self.backend_id,
237
+ snapshot_id=snapshot.snapshot_id,
238
+ configuration_id=snapshot.configuration.configuration_id,
239
+ retriever_id=snapshot.configuration.retriever_id,
226
240
  generated_at=utc_now_iso(),
227
241
  evidence=[],
228
242
  stats={"candidates": 0, "returned": 0},
229
243
  )
230
- db_path = _resolve_run_db_path(corpus, run)
244
+ db_path = _resolve_snapshot_db_path(corpus, snapshot)
231
245
  candidates = _query_full_text_search_index(
232
246
  db_path=db_path,
233
247
  query_text=" ".join(filtered_tokens),
234
248
  limit=_candidate_limit(budget.max_total_items + budget.offset),
235
- snippet_characters=recipe_config.snippet_characters,
249
+ snippet_characters=parsed_config.snippet_characters,
236
250
  )
237
251
  sorted_candidates = _rank_candidates(candidates)
238
252
  evidence = _apply_rerank_if_enabled(
239
253
  sorted_candidates,
240
254
  query_tokens=filtered_tokens,
241
- run=run,
255
+ snapshot=snapshot,
242
256
  budget=budget,
243
- rerank_enabled=recipe_config.rerank_enabled,
244
- rerank_top_k=recipe_config.rerank_top_k,
257
+ rerank_enabled=parsed_config.rerank_enabled,
258
+ rerank_top_k=parsed_config.rerank_top_k,
245
259
  )
246
260
  stats: Dict[str, object] = {"candidates": len(sorted_candidates), "returned": len(evidence)}
247
- if recipe_config.rerank_enabled:
248
- stats["reranked_candidates"] = min(len(sorted_candidates), recipe_config.rerank_top_k)
261
+ if parsed_config.rerank_enabled:
262
+ stats["reranked_candidates"] = min(len(sorted_candidates), parsed_config.rerank_top_k)
249
263
  return RetrievalResult(
250
264
  query_text=query_text,
251
265
  budget=budget,
252
- run_id=run.run_id,
253
- recipe_id=run.recipe.recipe_id,
254
- backend_id=self.backend_id,
266
+ snapshot_id=snapshot.snapshot_id,
267
+ configuration_id=snapshot.configuration.configuration_id,
268
+ retriever_id=snapshot.configuration.retriever_id,
255
269
  generated_at=utc_now_iso(),
256
270
  evidence=evidence,
257
271
  stats=stats,
@@ -264,7 +278,7 @@ def _candidate_limit(max_total_items: int) -> int:
264
278
 
265
279
  :param max_total_items: Requested evidence count.
266
280
  :type max_total_items: int
267
- :return: Candidate limit for backend search.
281
+ :return: Candidate limit for retriever search.
268
282
  :rtype: int
269
283
  """
270
284
  return max_total_items * 5
@@ -347,7 +361,7 @@ def _apply_rerank_if_enabled(
347
361
  candidates: List[Evidence],
348
362
  *,
349
363
  query_tokens: List[str],
350
- run: RetrievalRun,
364
+ snapshot: RetrievalSnapshot,
351
365
  budget: QueryBudget,
352
366
  rerank_enabled: bool,
353
367
  rerank_top_k: int,
@@ -359,8 +373,8 @@ def _apply_rerank_if_enabled(
359
373
  :type candidates: list[Evidence]
360
374
  :param query_tokens: Query tokens used for reranking.
361
375
  :type query_tokens: list[str]
362
- :param run: Retrieval run to annotate evidence with.
363
- :type run: RetrievalRun
376
+ :param snapshot: Retrieval snapshot to annotate evidence with.
377
+ :type snapshot: RetrievalSnapshot
364
378
  :param budget: Evidence selection budget.
365
379
  :type budget: QueryBudget
366
380
  :param rerank_enabled: Whether reranking is enabled.
@@ -375,8 +389,8 @@ def _apply_rerank_if_enabled(
375
389
  evidence_item.model_copy(
376
390
  update={
377
391
  "rank": index,
378
- "recipe_id": run.recipe.recipe_id,
379
- "run_id": run.run_id,
392
+ "configuration_id": snapshot.configuration.configuration_id,
393
+ "snapshot_id": snapshot.snapshot_id,
380
394
  }
381
395
  )
382
396
  for index, evidence_item in enumerate(candidates, start=1)
@@ -402,8 +416,8 @@ def _apply_rerank_if_enabled(
402
416
  evidence_item.model_copy(
403
417
  update={
404
418
  "rank": index,
405
- "recipe_id": run.recipe.recipe_id,
406
- "run_id": run.run_id,
419
+ "configuration_id": snapshot.configuration.configuration_id,
420
+ "snapshot_id": snapshot.snapshot_id,
407
421
  }
408
422
  )
409
423
  for index, evidence_item in enumerate(reranked_sorted, start=1)
@@ -411,21 +425,21 @@ def _apply_rerank_if_enabled(
411
425
  return apply_budget(ranked, budget)
412
426
 
413
427
 
414
- def _resolve_run_db_path(corpus: Corpus, run: RetrievalRun) -> Path:
428
+ def _resolve_snapshot_db_path(corpus: Corpus, snapshot: RetrievalSnapshot) -> Path:
415
429
  """
416
- Resolve the SQLite index path for a retrieval run.
430
+ Resolve the SQLite index path for a retrieval snapshot.
417
431
 
418
- :param corpus: Corpus containing run artifacts.
432
+ :param corpus: Corpus containing snapshot artifacts.
419
433
  :type corpus: Corpus
420
- :param run: Retrieval run manifest.
421
- :type run: RetrievalRun
434
+ :param snapshot: Retrieval snapshot manifest.
435
+ :type snapshot: RetrievalSnapshot
422
436
  :return: Path to the SQLite index file.
423
437
  :rtype: Path
424
- :raises FileNotFoundError: If the run does not have artifact paths.
438
+ :raises FileNotFoundError: If the snapshot does not have artifact paths.
425
439
  """
426
- if not run.artifact_paths:
427
- raise FileNotFoundError("Run has no artifact paths to query")
428
- return corpus.root / run.artifact_paths[0]
440
+ if not snapshot.snapshot_artifacts:
441
+ raise FileNotFoundError("Snapshot has no artifact paths to query")
442
+ return corpus.root / snapshot.snapshot_artifacts[0]
429
443
 
430
444
 
431
445
  def _ensure_full_text_search_version_five(conn: sqlite3.Connection) -> None:
@@ -480,8 +494,8 @@ def _build_full_text_search_index(
480
494
  db_path: Path,
481
495
  corpus: Corpus,
482
496
  items: Iterable[object],
483
- recipe_config: SqliteFullTextSearchRecipeConfig,
484
- extraction_reference: Optional[ExtractionRunReference],
497
+ configuration: SqliteFullTextSearchConfiguration,
498
+ extraction_reference: Optional[ExtractionSnapshotReference],
485
499
  ) -> Dict[str, int]:
486
500
  """
487
501
  Build a full-text search index from corpus items.
@@ -492,8 +506,8 @@ def _build_full_text_search_index(
492
506
  :type corpus: Corpus
493
507
  :param items: Catalog items to index.
494
508
  :type items: Iterable[object]
495
- :param recipe_config: Chunking and snippet configuration.
496
- :type recipe_config: SqliteFullTextSearchRecipeConfig
509
+ :param configuration: Chunking and snippet configuration.
510
+ :type configuration: SqliteFullTextSearchConfiguration
497
511
  :return: Index statistics.
498
512
  :rtype: dict[str, int]
499
513
  """
@@ -523,8 +537,8 @@ def _build_full_text_search_index(
523
537
  title = getattr(catalog_item, "title", None)
524
538
  for start_offset, end_offset, chunk in _iter_chunks(
525
539
  item_text,
526
- chunk_size=recipe_config.chunk_size,
527
- chunk_overlap=recipe_config.chunk_overlap,
540
+ chunk_size=configuration.chunk_size,
541
+ chunk_overlap=configuration.chunk_overlap,
528
542
  ):
529
543
  connection.execute(
530
544
  """
@@ -568,7 +582,7 @@ def _load_text_from_item(
568
582
  item_id: str,
569
583
  relpath: str,
570
584
  media_type: str,
571
- extraction_reference: Optional[ExtractionRunReference],
585
+ extraction_reference: Optional[ExtractionSnapshotReference],
572
586
  ) -> Optional[str]:
573
587
  """
574
588
  Load text content from a catalog item.
@@ -581,15 +595,15 @@ def _load_text_from_item(
581
595
  :type relpath: str
582
596
  :param media_type: Media type for the content.
583
597
  :type media_type: str
584
- :param extraction_reference: Optional extraction run reference.
585
- :type extraction_reference: ExtractionRunReference or None
598
+ :param extraction_reference: Optional extraction snapshot reference.
599
+ :type extraction_reference: ExtractionSnapshotReference or None
586
600
  :return: Text payload or None if not text.
587
601
  :rtype: str or None
588
602
  """
589
603
  if extraction_reference:
590
604
  extracted_text = corpus.read_extracted_text(
591
605
  extractor_id=extraction_reference.extractor_id,
592
- run_id=extraction_reference.run_id,
606
+ snapshot_id=extraction_reference.snapshot_id,
593
607
  item_id=item_id,
594
608
  )
595
609
  if isinstance(extracted_text, str) and extracted_text.strip():
@@ -608,28 +622,28 @@ def _load_text_from_item(
608
622
 
609
623
  def _resolve_extraction_reference(
610
624
  corpus: Corpus,
611
- recipe_config: SqliteFullTextSearchRecipeConfig,
612
- ) -> Optional[ExtractionRunReference]:
625
+ configuration: SqliteFullTextSearchConfiguration,
626
+ ) -> Optional[ExtractionSnapshotReference]:
613
627
  """
614
- Resolve an extraction run reference from a recipe config.
628
+ Resolve an extraction snapshot reference from a configuration.
615
629
 
616
- :param corpus: Corpus associated with the recipe.
630
+ :param corpus: Corpus associated with the configuration.
617
631
  :type corpus: Corpus
618
- :param recipe_config: Parsed backend recipe configuration.
619
- :type recipe_config: SqliteFullTextSearchRecipeConfig
632
+ :param configuration: Parsed retriever configuration.
633
+ :type configuration: SqliteFullTextSearchConfiguration
620
634
  :return: Parsed extraction reference or None.
621
- :rtype: ExtractionRunReference or None
622
- :raises FileNotFoundError: If an extraction run is referenced but not present.
635
+ :rtype: ExtractionSnapshotReference or None
636
+ :raises FileNotFoundError: If an extraction snapshot is referenced but not present.
623
637
  """
624
- if not recipe_config.extraction_run:
638
+ if not configuration.extraction_snapshot:
625
639
  return None
626
- extraction_reference = parse_extraction_run_reference(recipe_config.extraction_run)
627
- run_dir = corpus.extraction_run_dir(
640
+ extraction_reference = parse_extraction_snapshot_reference(configuration.extraction_snapshot)
641
+ snapshot_dir = corpus.extraction_snapshot_dir(
628
642
  extractor_id=extraction_reference.extractor_id,
629
- run_id=extraction_reference.run_id,
643
+ snapshot_id=extraction_reference.snapshot_id,
630
644
  )
631
- if not run_dir.is_dir():
632
- raise FileNotFoundError(f"Missing extraction run: {extraction_reference.as_string()}")
645
+ if not snapshot_dir.is_dir():
646
+ raise FileNotFoundError(f"Missing extraction snapshot: {extraction_reference.as_string()}")
633
647
  return extraction_reference
634
648
 
635
649
 
@@ -723,8 +737,8 @@ def _query_full_text_search_index(
723
737
  span_start=int(start_offset) if start_offset is not None else None,
724
738
  span_end=int(end_offset) if end_offset is not None else None,
725
739
  stage="full-text-search",
726
- recipe_id="",
727
- run_id="",
740
+ configuration_id="",
741
+ snapshot_id="",
728
742
  hash=hash_text(snippet_text),
729
743
  )
730
744
  )