biblicus-1.0.0-py3-none-any.whl → biblicus-1.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. biblicus/__init__.py +5 -5
  2. biblicus/analysis/__init__.py +1 -1
  3. biblicus/analysis/base.py +10 -10
  4. biblicus/analysis/markov.py +78 -68
  5. biblicus/analysis/models.py +47 -47
  6. biblicus/analysis/profiling.py +58 -48
  7. biblicus/analysis/topic_modeling.py +56 -51
  8. biblicus/cli.py +224 -177
  9. biblicus/{recipes.py → configuration.py} +14 -14
  10. biblicus/constants.py +2 -2
  11. biblicus/context_engine/assembler.py +49 -19
  12. biblicus/context_engine/retrieval.py +46 -42
  13. biblicus/corpus.py +116 -108
  14. biblicus/errors.py +3 -3
  15. biblicus/evaluation.py +27 -25
  16. biblicus/extraction.py +103 -98
  17. biblicus/extraction_evaluation.py +26 -26
  18. biblicus/extractors/deepgram_stt.py +7 -7
  19. biblicus/extractors/docling_granite_text.py +11 -11
  20. biblicus/extractors/docling_smol_text.py +11 -11
  21. biblicus/extractors/markitdown_text.py +4 -4
  22. biblicus/extractors/openai_stt.py +7 -7
  23. biblicus/extractors/paddleocr_vl_text.py +20 -18
  24. biblicus/extractors/pipeline.py +8 -8
  25. biblicus/extractors/rapidocr_text.py +3 -3
  26. biblicus/extractors/unstructured_text.py +3 -3
  27. biblicus/hooks.py +4 -4
  28. biblicus/knowledge_base.py +33 -31
  29. biblicus/models.py +78 -78
  30. biblicus/retrieval.py +47 -40
  31. biblicus/retrievers/__init__.py +50 -0
  32. biblicus/retrievers/base.py +65 -0
  33. biblicus/{backends → retrievers}/embedding_index_common.py +44 -41
  34. biblicus/{backends → retrievers}/embedding_index_file.py +87 -58
  35. biblicus/{backends → retrievers}/embedding_index_inmemory.py +88 -59
  36. biblicus/retrievers/hybrid.py +301 -0
  37. biblicus/{backends → retrievers}/scan.py +83 -73
  38. biblicus/{backends → retrievers}/sqlite_full_text_search.py +115 -101
  39. biblicus/{backends → retrievers}/tf_vector.py +87 -77
  40. biblicus/text/prompts.py +16 -8
  41. biblicus/text/tool_loop.py +63 -5
  42. {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/METADATA +30 -21
  43. biblicus-1.1.0.dist-info/RECORD +91 -0
  44. biblicus/backends/__init__.py +0 -50
  45. biblicus/backends/base.py +0 -65
  46. biblicus/backends/hybrid.py +0 -292
  47. biblicus-1.0.0.dist-info/RECORD +0 -91
  48. {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/WHEEL +0 -0
  49. {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/entry_points.txt +0 -0
  50. {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/licenses/LICENSE +0 -0
  51. {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/top_level.txt +0 -0
@@ -1,5 +1,5 @@
1
1
  """
2
- Naive full-scan retrieval backend.
2
+ Naive full-scan retriever.
3
3
  """
4
4
 
5
5
  from __future__ import annotations
@@ -12,87 +12,97 @@ from ..corpus import Corpus
12
12
  from ..frontmatter import parse_front_matter
13
13
  from ..models import (
14
14
  Evidence,
15
- ExtractionRunReference,
15
+ ExtractionSnapshotReference,
16
16
  QueryBudget,
17
17
  RetrievalResult,
18
- RetrievalRun,
19
- parse_extraction_run_reference,
18
+ RetrievalSnapshot,
19
+ parse_extraction_snapshot_reference,
20
+ )
21
+ from ..retrieval import (
22
+ apply_budget,
23
+ create_configuration_manifest,
24
+ create_snapshot_manifest,
25
+ hash_text,
20
26
  )
21
- from ..retrieval import apply_budget, create_recipe_manifest, create_run_manifest, hash_text
22
27
  from ..time import utc_now_iso
23
28
 
24
29
 
25
- class ScanRecipeConfig(BaseModel):
30
+ class ScanConfiguration(BaseModel):
26
31
  """
27
- Configuration for the naive scan backend.
32
+ Configuration for the naive scan retriever.
28
33
 
29
34
  :ivar snippet_characters: Maximum characters to include in evidence snippets.
30
35
  :vartype snippet_characters: int
31
- :ivar extraction_run: Optional extraction run reference in the form extractor_id:run_id.
32
- :vartype extraction_run: str or None
36
+ :ivar extraction_snapshot: Optional extraction snapshot reference in the form extractor_id:snapshot_id.
37
+ :vartype extraction_snapshot: str or None
33
38
  """
34
39
 
35
40
  model_config = ConfigDict(extra="forbid")
36
41
 
37
42
  snippet_characters: int = Field(default=400, ge=1)
38
- extraction_run: Optional[str] = None
43
+ extraction_snapshot: Optional[str] = None
39
44
 
40
45
 
41
- class ScanBackend:
46
+ class ScanRetriever:
42
47
  """
43
- Naive backend that scans all text items at query time.
48
+ Naive retriever that scans all text items at query time.
44
49
 
45
- :ivar backend_id: Backend identifier.
46
- :vartype backend_id: str
50
+ :ivar retriever_id: Retriever identifier.
51
+ :vartype retriever_id: str
47
52
  """
48
53
 
49
- backend_id = "scan"
54
+ retriever_id = "scan"
50
55
 
51
- def build_run(
52
- self, corpus: Corpus, *, recipe_name: str, config: Dict[str, object]
53
- ) -> RetrievalRun:
56
+ def build_snapshot(
57
+ self, corpus: Corpus, *, configuration_name: str, configuration: Dict[str, object]
58
+ ) -> RetrievalSnapshot:
54
59
  """
55
- Register a scan backend run (no materialization).
60
+ Register a scan retriever snapshot (no snapshot artifacts).
56
61
 
57
62
  :param corpus: Corpus to build against.
58
63
  :type corpus: Corpus
59
- :param recipe_name: Human-readable recipe name.
60
- :type recipe_name: str
61
- :param config: Backend-specific configuration values.
62
- :type config: dict[str, object]
63
- :return: Run manifest describing the build.
64
- :rtype: RetrievalRun
64
+ :param configuration_name: Human-readable configuration name.
65
+ :type configuration_name: str
66
+ :param configuration: Retriever-specific configuration values.
67
+ :type configuration: dict[str, object]
68
+ :return: Snapshot manifest describing the build.
69
+ :rtype: RetrievalSnapshot
65
70
  """
66
- recipe_config = ScanRecipeConfig.model_validate(config)
71
+ parsed_config = ScanConfiguration.model_validate(configuration)
67
72
  catalog = corpus.load_catalog()
68
- recipe = create_recipe_manifest(
69
- backend_id=self.backend_id,
70
- name=recipe_name,
71
- config=recipe_config.model_dump(),
73
+ configuration_manifest = create_configuration_manifest(
74
+ retriever_id=self.retriever_id,
75
+ name=configuration_name,
76
+ configuration=parsed_config.model_dump(),
72
77
  )
73
78
  stats = {
74
79
  "items": len(catalog.items),
75
- "text_items": _count_text_items(corpus, catalog.items.values(), recipe_config),
80
+ "text_items": _count_text_items(corpus, catalog.items.values(), parsed_config),
76
81
  }
77
- run = create_run_manifest(corpus, recipe=recipe, stats=stats, artifact_paths=[])
78
- corpus.write_run(run)
79
- return run
82
+ snapshot = create_snapshot_manifest(
83
+ corpus,
84
+ configuration=configuration_manifest,
85
+ stats=stats,
86
+ snapshot_artifacts=[],
87
+ )
88
+ corpus.write_snapshot(snapshot)
89
+ return snapshot
80
90
 
81
91
  def query(
82
92
  self,
83
93
  corpus: Corpus,
84
94
  *,
85
- run: RetrievalRun,
95
+ snapshot: RetrievalSnapshot,
86
96
  query_text: str,
87
97
  budget: QueryBudget,
88
98
  ) -> RetrievalResult:
89
99
  """
90
100
  Query the corpus with a full scan.
91
101
 
92
- :param corpus: Corpus associated with the run.
102
+ :param corpus: Corpus associated with the snapshot.
93
103
  :type corpus: Corpus
94
- :param run: Run manifest to use for querying.
95
- :type run: RetrievalRun
104
+ :param snapshot: Snapshot manifest to use for querying.
105
+ :type snapshot: RetrievalSnapshot
96
106
  :param query_text: Query text to execute.
97
107
  :type query_text: str
98
108
  :param budget: Evidence selection budget.
@@ -100,15 +110,15 @@ class ScanBackend:
100
110
  :return: Retrieval results containing evidence.
101
111
  :rtype: RetrievalResult
102
112
  """
103
- recipe_config = ScanRecipeConfig.model_validate(run.recipe.config)
113
+ parsed_config = ScanConfiguration.model_validate(snapshot.configuration.configuration)
104
114
  catalog = corpus.load_catalog()
105
- extraction_reference = _resolve_extraction_reference(corpus, recipe_config)
115
+ extraction_reference = _resolve_extraction_reference(corpus, parsed_config)
106
116
  query_tokens = _tokenize_query(query_text)
107
117
  scored_candidates = _score_items(
108
118
  corpus,
109
119
  catalog.items.values(),
110
120
  query_tokens,
111
- recipe_config.snippet_characters,
121
+ parsed_config.snippet_characters,
112
122
  extraction_reference=extraction_reference,
113
123
  )
114
124
  sorted_candidates = sorted(
@@ -119,8 +129,8 @@ class ScanBackend:
119
129
  evidence_item.model_copy(
120
130
  update={
121
131
  "rank": index,
122
- "recipe_id": run.recipe.recipe_id,
123
- "run_id": run.run_id,
132
+ "configuration_id": snapshot.configuration.configuration_id,
133
+ "snapshot_id": snapshot.snapshot_id,
124
134
  }
125
135
  )
126
136
  for index, evidence_item in enumerate(sorted_candidates, start=1)
@@ -130,9 +140,9 @@ class ScanBackend:
130
140
  return RetrievalResult(
131
141
  query_text=query_text,
132
142
  budget=budget,
133
- run_id=run.run_id,
134
- recipe_id=run.recipe.recipe_id,
135
- backend_id=self.backend_id,
143
+ snapshot_id=snapshot.snapshot_id,
144
+ configuration_id=snapshot.configuration.configuration_id,
145
+ retriever_id=snapshot.configuration.retriever_id,
136
146
  generated_at=utc_now_iso(),
137
147
  evidence=evidence,
138
148
  stats=stats,
@@ -140,56 +150,56 @@ class ScanBackend:
140
150
 
141
151
 
142
152
  def _resolve_extraction_reference(
143
- corpus: Corpus, recipe_config: ScanRecipeConfig
144
- ) -> Optional[ExtractionRunReference]:
153
+ corpus: Corpus, configuration: ScanConfiguration
154
+ ) -> Optional[ExtractionSnapshotReference]:
145
155
  """
146
- Resolve an extraction run reference from a recipe config.
156
+ Resolve an extraction snapshot reference from a configuration.
147
157
 
148
- :param corpus: Corpus associated with the recipe.
158
+ :param corpus: Corpus associated with the configuration.
149
159
  :type corpus: Corpus
150
- :param recipe_config: Parsed scan recipe configuration.
151
- :type recipe_config: ScanRecipeConfig
160
+ :param configuration: Parsed scan configuration.
161
+ :type configuration: ScanConfiguration
152
162
  :return: Parsed extraction reference or None.
153
- :rtype: ExtractionRunReference or None
154
- :raises FileNotFoundError: If an extraction run is referenced but not present.
163
+ :rtype: ExtractionSnapshotReference or None
164
+ :raises FileNotFoundError: If an extraction snapshot is referenced but not present.
155
165
  """
156
- if not recipe_config.extraction_run:
166
+ if not configuration.extraction_snapshot:
157
167
  return None
158
- extraction_reference = parse_extraction_run_reference(recipe_config.extraction_run)
159
- run_dir = corpus.extraction_run_dir(
168
+ extraction_reference = parse_extraction_snapshot_reference(configuration.extraction_snapshot)
169
+ snapshot_dir = corpus.extraction_snapshot_dir(
160
170
  extractor_id=extraction_reference.extractor_id,
161
- run_id=extraction_reference.run_id,
171
+ snapshot_id=extraction_reference.snapshot_id,
162
172
  )
163
- if not run_dir.is_dir():
164
- raise FileNotFoundError(f"Missing extraction run: {extraction_reference.as_string()}")
173
+ if not snapshot_dir.is_dir():
174
+ raise FileNotFoundError(f"Missing extraction snapshot: {extraction_reference.as_string()}")
165
175
  return extraction_reference
166
176
 
167
177
 
168
178
  def _count_text_items(
169
- corpus: Corpus, items: Iterable[object], recipe_config: ScanRecipeConfig
179
+ corpus: Corpus, items: Iterable[object], configuration: ScanConfiguration
170
180
  ) -> int:
171
181
  """
172
182
  Count catalog items that represent text content.
173
183
 
174
- When an extraction run is configured, extracted artifacts are treated as text.
184
+ When an extraction snapshot is configured, extracted artifacts are treated as text.
175
185
 
176
186
  :param corpus: Corpus containing the items.
177
187
  :type corpus: Corpus
178
188
  :param items: Catalog items to inspect.
179
189
  :type items: Iterable[object]
180
- :param recipe_config: Parsed scan recipe configuration.
181
- :type recipe_config: ScanRecipeConfig
190
+ :param configuration: Parsed scan configuration.
191
+ :type configuration: ScanConfiguration
182
192
  :return: Number of text items.
183
193
  :rtype: int
184
194
  """
185
195
  text_item_count = 0
186
- extraction_reference = _resolve_extraction_reference(corpus, recipe_config)
196
+ extraction_reference = _resolve_extraction_reference(corpus, configuration)
187
197
  for catalog_item in items:
188
198
  item_id = str(getattr(catalog_item, "id", ""))
189
199
  if extraction_reference and item_id:
190
200
  extracted_text = corpus.read_extracted_text(
191
201
  extractor_id=extraction_reference.extractor_id,
192
- run_id=extraction_reference.run_id,
202
+ snapshot_id=extraction_reference.snapshot_id,
193
203
  item_id=item_id,
194
204
  )
195
205
  if isinstance(extracted_text, str) and extracted_text.strip():
@@ -219,7 +229,7 @@ def _load_text_from_item(
219
229
  item_id: str,
220
230
  relpath: str,
221
231
  media_type: str,
222
- extraction_reference: Optional[ExtractionRunReference],
232
+ extraction_reference: Optional[ExtractionSnapshotReference],
223
233
  ) -> Optional[str]:
224
234
  """
225
235
  Load a text payload from a catalog item.
@@ -232,15 +242,15 @@ def _load_text_from_item(
232
242
  :type relpath: str
233
243
  :param media_type: Media type for the stored content.
234
244
  :type media_type: str
235
- :param extraction_reference: Optional extraction run reference.
236
- :type extraction_reference: ExtractionRunReference or None
245
+ :param extraction_reference: Optional extraction snapshot reference.
246
+ :type extraction_reference: ExtractionSnapshotReference or None
237
247
  :return: Text payload or None if not decodable as text.
238
248
  :rtype: str or None
239
249
  """
240
250
  if extraction_reference:
241
251
  extracted_text = corpus.read_extracted_text(
242
252
  extractor_id=extraction_reference.extractor_id,
243
- run_id=extraction_reference.run_id,
253
+ snapshot_id=extraction_reference.snapshot_id,
244
254
  item_id=item_id,
245
255
  )
246
256
  if isinstance(extracted_text, str) and extracted_text.strip():
@@ -316,7 +326,7 @@ def _score_items(
316
326
  tokens: List[str],
317
327
  snippet_characters: int,
318
328
  *,
319
- extraction_reference: Optional[ExtractionRunReference],
329
+ extraction_reference: Optional[ExtractionSnapshotReference],
320
330
  ) -> List[Evidence]:
321
331
  """
322
332
  Score catalog items by token frequency and return evidence candidates.
@@ -366,8 +376,8 @@ def _score_items(
366
376
  span_start=span_start,
367
377
  span_end=span_end,
368
378
  stage="scan",
369
- recipe_id="",
370
- run_id="",
379
+ configuration_id="",
380
+ snapshot_id="",
371
381
  metadata=getattr(catalog_item, "metadata", {}) or {},
372
382
  hash=hash_text(snippet),
373
383
  )