biblicus 0.9.0-py3-none-any.whl → 0.11.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,460 @@
+"""
+Deterministic term-frequency vector retrieval backend.
+"""
+
+from __future__ import annotations
+
+import math
+import re
+from typing import Dict, Iterable, List, Optional, Tuple
+
+from pydantic import BaseModel, ConfigDict, Field
+
+from ..corpus import Corpus
+from ..frontmatter import parse_front_matter
+from ..models import (
+    Evidence,
+    ExtractionRunReference,
+    QueryBudget,
+    RetrievalResult,
+    RetrievalRun,
+    parse_extraction_run_reference,
+)
+from ..retrieval import apply_budget, create_recipe_manifest, create_run_manifest, hash_text
+from ..time import utc_now_iso
+
+
+class VectorRecipeConfig(BaseModel):
+    """
+    Configuration for the vector retrieval backend.
+
+    :ivar snippet_characters: Maximum characters to include in evidence snippets.
+    :vartype snippet_characters: int
+    :ivar extraction_run: Optional extraction run reference in the form extractor_id:run_id.
+    :vartype extraction_run: str or None
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    snippet_characters: int = Field(default=400, ge=1)
+    extraction_run: Optional[str] = None
+
+
+class VectorBackend:
+    """
+    Deterministic vector backend using term-frequency cosine similarity.
+
+    :ivar backend_id: Backend identifier.
+    :vartype backend_id: str
+    """
+
+    backend_id = "vector"
+
+    def build_run(
+        self, corpus: Corpus, *, recipe_name: str, config: Dict[str, object]
+    ) -> RetrievalRun:
+        """
+        Register a vector backend run (no materialization).
+
+        :param corpus: Corpus to build against.
+        :type corpus: Corpus
+        :param recipe_name: Human-readable recipe name.
+        :type recipe_name: str
+        :param config: Backend-specific configuration values.
+        :type config: dict[str, object]
+        :return: Run manifest describing the build.
+        :rtype: RetrievalRun
+        """
+        recipe_config = VectorRecipeConfig.model_validate(config)
+        catalog = corpus.load_catalog()
+        recipe = create_recipe_manifest(
+            backend_id=self.backend_id,
+            name=recipe_name,
+            config=recipe_config.model_dump(),
+        )
+        stats = {
+            "items": len(catalog.items),
+            "text_items": _count_text_items(corpus, catalog.items.values(), recipe_config),
+        }
+        run = create_run_manifest(corpus, recipe=recipe, stats=stats, artifact_paths=[])
+        corpus.write_run(run)
+        return run
+
+    def query(
+        self,
+        corpus: Corpus,
+        *,
+        run: RetrievalRun,
+        query_text: str,
+        budget: QueryBudget,
+    ) -> RetrievalResult:
+        """
+        Query the corpus using term-frequency cosine similarity.
+
+        :param corpus: Corpus associated with the run.
+        :type corpus: Corpus
+        :param run: Run manifest to use for querying.
+        :type run: RetrievalRun
+        :param query_text: Query text to execute.
+        :type query_text: str
+        :param budget: Evidence selection budget.
+        :type budget: QueryBudget
+        :return: Retrieval results containing evidence.
+        :rtype: RetrievalResult
+        """
+        recipe_config = VectorRecipeConfig.model_validate(run.recipe.config)
+        query_tokens = _tokenize_text(query_text)
+        if not query_tokens:
+            return RetrievalResult(
+                query_text=query_text,
+                budget=budget,
+                run_id=run.run_id,
+                recipe_id=run.recipe.recipe_id,
+                backend_id=self.backend_id,
+                generated_at=utc_now_iso(),
+                evidence=[],
+                stats={"candidates": 0, "returned": 0},
+            )
+        query_vector = _term_frequencies(query_tokens)
+        query_norm = _vector_norm(query_vector)
+        catalog = corpus.load_catalog()
+        extraction_reference = _resolve_extraction_reference(corpus, recipe_config)
+        scored_candidates = _score_items(
+            corpus,
+            catalog.items.values(),
+            query_tokens=query_tokens,
+            query_vector=query_vector,
+            query_norm=query_norm,
+            snippet_characters=recipe_config.snippet_characters,
+            extraction_reference=extraction_reference,
+        )
+        sorted_candidates = sorted(
+            scored_candidates,
+            key=lambda evidence_item: (-evidence_item.score, evidence_item.item_id),
+        )
+        ranked = [
+            evidence_item.model_copy(
+                update={
+                    "rank": index,
+                    "recipe_id": run.recipe.recipe_id,
+                    "run_id": run.run_id,
+                }
+            )
+            for index, evidence_item in enumerate(sorted_candidates, start=1)
+        ]
+        evidence = apply_budget(ranked, budget)
+        stats = {"candidates": len(sorted_candidates), "returned": len(evidence)}
+        return RetrievalResult(
+            query_text=query_text,
+            budget=budget,
+            run_id=run.run_id,
+            recipe_id=run.recipe.recipe_id,
+            backend_id=self.backend_id,
+            generated_at=utc_now_iso(),
+            evidence=evidence,
+            stats=stats,
+        )
+
+
+def _resolve_extraction_reference(
+    corpus: Corpus, recipe_config: VectorRecipeConfig
+) -> Optional[ExtractionRunReference]:
+    """
+    Resolve an extraction run reference from a recipe config.
+
+    :param corpus: Corpus associated with the recipe.
+    :type corpus: Corpus
+    :param recipe_config: Parsed vector recipe configuration.
+    :type recipe_config: VectorRecipeConfig
+    :return: Parsed extraction reference or None.
+    :rtype: ExtractionRunReference or None
+    :raises FileNotFoundError: If an extraction run is referenced but not present.
+    """
+    if not recipe_config.extraction_run:
+        return None
+    extraction_reference = parse_extraction_run_reference(recipe_config.extraction_run)
+    run_dir = corpus.extraction_run_dir(
+        extractor_id=extraction_reference.extractor_id,
+        run_id=extraction_reference.run_id,
+    )
+    if not run_dir.is_dir():
+        raise FileNotFoundError(f"Missing extraction run: {extraction_reference.as_string()}")
+    return extraction_reference
+
+
+def _count_text_items(
+    corpus: Corpus, items: Iterable[object], recipe_config: VectorRecipeConfig
+) -> int:
+    """
+    Count catalog items that represent text content.
+
+    :param corpus: Corpus containing the items.
+    :type corpus: Corpus
+    :param items: Catalog items to inspect.
+    :type items: Iterable[object]
+    :param recipe_config: Parsed vector recipe configuration.
+    :type recipe_config: VectorRecipeConfig
+    :return: Number of text items.
+    :rtype: int
+    """
+    text_item_count = 0
+    extraction_reference = _resolve_extraction_reference(corpus, recipe_config)
+    for catalog_item in items:
+        item_id = str(getattr(catalog_item, "id", ""))
+        if extraction_reference and item_id:
+            extracted_text = corpus.read_extracted_text(
+                extractor_id=extraction_reference.extractor_id,
+                run_id=extraction_reference.run_id,
+                item_id=item_id,
+            )
+            if isinstance(extracted_text, str) and extracted_text.strip():
+                text_item_count += 1
+                continue
+        media_type = getattr(catalog_item, "media_type", "")
+        if media_type == "text/markdown" or str(media_type).startswith("text/"):
+            text_item_count += 1
+    return text_item_count
+
+
+def _tokenize_text(text: str) -> List[str]:
+    """
+    Tokenize text into lowercase word tokens.
+
+    :param text: Input text.
+    :type text: str
+    :return: Token list.
+    :rtype: list[str]
+    """
+    return re.findall(r"[a-z0-9]+", text.lower())
+
+
+def _term_frequencies(tokens: List[str]) -> Dict[str, float]:
+    """
+    Build term frequency weights from tokens.
+
+    :param tokens: Token list.
+    :type tokens: list[str]
+    :return: Term frequency mapping.
+    :rtype: dict[str, float]
+    """
+    frequencies: Dict[str, float] = {}
+    for token in tokens:
+        frequencies[token] = frequencies.get(token, 0.0) + 1.0
+    return frequencies
+
+
+def _vector_norm(vector: Dict[str, float]) -> float:
+    """
+    Compute the Euclidean norm of a term-frequency vector.
+
+    :param vector: Term frequency mapping.
+    :type vector: dict[str, float]
+    :return: Vector norm.
+    :rtype: float
+    """
+    return math.sqrt(sum(value * value for value in vector.values()))
+
+
+def _cosine_similarity(
+    left: Dict[str, float],
+    *,
+    left_norm: float,
+    right: Dict[str, float],
+    right_norm: float,
+) -> float:
+    """
+    Compute cosine similarity between two term-frequency vectors.
+
+    :param left: Left term-frequency vector.
+    :type left: dict[str, float]
+    :param left_norm: Precomputed left vector norm.
+    :type left_norm: float
+    :param right: Right term-frequency vector.
+    :type right: dict[str, float]
+    :param right_norm: Precomputed right vector norm.
+    :type right_norm: float
+    :return: Cosine similarity score.
+    :rtype: float
+    """
+    dot = 0.0
+    if len(left) < len(right):
+        for token, value in left.items():
+            dot += value * right.get(token, 0.0)
+    else:
+        for token, value in right.items():
+            dot += value * left.get(token, 0.0)
+    return dot / (left_norm * right_norm)
+
+
+def _load_text_from_item(
+    corpus: Corpus,
+    *,
+    item_id: str,
+    relpath: str,
+    media_type: str,
+    extraction_reference: Optional[ExtractionRunReference],
+) -> Optional[str]:
+    """
+    Load a text payload from a catalog item.
+
+    :param corpus: Corpus containing the item.
+    :type corpus: Corpus
+    :param item_id: Item identifier.
+    :type item_id: str
+    :param relpath: Relative path to the stored content.
+    :type relpath: str
+    :param media_type: Media type for the stored content.
+    :type media_type: str
+    :param extraction_reference: Optional extraction run reference.
+    :type extraction_reference: ExtractionRunReference or None
+    :return: Text payload or None if not decodable as text.
+    :rtype: str or None
+    """
+    if extraction_reference:
+        extracted_text = corpus.read_extracted_text(
+            extractor_id=extraction_reference.extractor_id,
+            run_id=extraction_reference.run_id,
+            item_id=item_id,
+        )
+        if isinstance(extracted_text, str) and extracted_text.strip():
+            return extracted_text
+
+    content_path = corpus.root / relpath
+    raw_bytes = content_path.read_bytes()
+    if media_type == "text/markdown":
+        markdown_text = raw_bytes.decode("utf-8")
+        parsed_document = parse_front_matter(markdown_text)
+        return parsed_document.body
+    if media_type.startswith("text/"):
+        return raw_bytes.decode("utf-8")
+    return None
+
+
+def _find_first_match(text: str, tokens: List[str]) -> Optional[Tuple[int, int]]:
+    """
+    Locate the earliest token match span in a text payload.
+
+    :param text: Text to scan.
+    :type text: str
+    :param tokens: Query tokens.
+    :type tokens: list[str]
+    :return: Start/end span for the earliest match, or None if no matches.
+    :rtype: tuple[int, int] or None
+    """
+    lower_text = text.lower()
+    best_start: Optional[int] = None
+    best_end: Optional[int] = None
+    for token in tokens:
+        if not token:
+            continue
+        token_start = lower_text.find(token)
+        if token_start == -1:
+            continue
+        token_end = token_start + len(token)
+        if best_start is None or token_start < best_start:
+            best_start = token_start
+            best_end = token_end
+    if best_start is None or best_end is None:
+        return None
+    return best_start, best_end
+
+
+def _build_snippet(text: str, span: Optional[Tuple[int, int]], *, max_chars: int) -> str:
+    """
+    Build a snippet around a match span, constrained by a character budget.
+
+    :param text: Source text to slice.
+    :type text: str
+    :param span: Match span to center on.
+    :type span: tuple[int, int] or None
+    :param max_chars: Maximum snippet length.
+    :type max_chars: int
+    :return: Snippet text.
+    :rtype: str
+    """
+    if not text:
+        return ""
+    if span is None:
+        return text[:max_chars]
+    span_start, span_end = span
+    half_window = max_chars // 2
+    snippet_start = max(span_start - half_window, 0)
+    snippet_end = min(span_end + half_window, len(text))
+    return text[snippet_start:snippet_end]
+
+
+def _score_items(
+    corpus: Corpus,
+    items: Iterable[object],
+    *,
+    query_tokens: List[str],
+    query_vector: Dict[str, float],
+    query_norm: float,
+    snippet_characters: int,
+    extraction_reference: Optional[ExtractionRunReference],
+) -> List[Evidence]:
+    """
+    Score catalog items and return evidence candidates.
+
+    :param corpus: Corpus containing the items.
+    :type corpus: Corpus
+    :param items: Catalog items to score.
+    :type items: Iterable[object]
+    :param query_tokens: Tokenized query text.
+    :type query_tokens: list[str]
+    :param query_vector: Query term-frequency vector.
+    :type query_vector: dict[str, float]
+    :param query_norm: Query vector norm.
+    :type query_norm: float
+    :param snippet_characters: Snippet length budget.
+    :type snippet_characters: int
+    :param extraction_reference: Optional extraction run reference.
+    :type extraction_reference: ExtractionRunReference or None
+    :return: Evidence candidates with provisional ranks.
+    :rtype: list[Evidence]
+    """
+    evidence_items: List[Evidence] = []
+    for catalog_item in items:
+        media_type = getattr(catalog_item, "media_type", "")
+        relpath = getattr(catalog_item, "relpath", "")
+        item_id = str(getattr(catalog_item, "id", ""))
+        item_text = _load_text_from_item(
+            corpus,
+            item_id=item_id,
+            relpath=relpath,
+            media_type=str(media_type),
+            extraction_reference=extraction_reference,
+        )
+        if item_text is None:
+            continue
+        tokens = _tokenize_text(item_text)
+        if not tokens:
+            continue
+        vector = _term_frequencies(tokens)
+        similarity = _cosine_similarity(
+            query_vector, left_norm=query_norm, right=vector, right_norm=_vector_norm(vector)
+        )
+        if similarity <= 0:
+            continue
+        span = _find_first_match(item_text, query_tokens)
+        snippet = _build_snippet(item_text, span, max_chars=snippet_characters)
+        span_start = span[0] if span else None
+        span_end = span[1] if span else None
+        evidence_items.append(
+            Evidence(
+                item_id=str(getattr(catalog_item, "id")),
+                source_uri=getattr(catalog_item, "source_uri", None),
+                media_type=str(media_type),
+                score=float(similarity),
+                rank=1,
+                text=snippet,
+                content_ref=None,
+                span_start=span_start,
+                span_end=span_end,
+                stage="vector",
+                recipe_id="",
+                run_id="",
+                hash=hash_text(snippet),
+            )
+        )
+    return evidence_items
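The ranking model in this new file is plain term-frequency cosine similarity with deterministic tie-breaking on item_id. The standalone sketch below is illustrative only and not part of the package; the function and variable names are local to the example, and it simply reproduces the arithmetic of _tokenize_text, _term_frequencies, _vector_norm, and _cosine_similarity above.

import math
import re
from typing import Dict, List


def tokenize(text: str) -> List[str]:
    # Same rule as the backend: lowercase alphanumeric runs.
    return re.findall(r"[a-z0-9]+", text.lower())


def term_frequencies(tokens: List[str]) -> Dict[str, float]:
    # Raw counts, as in _term_frequencies (no idf or length normalization).
    frequencies: Dict[str, float] = {}
    for token in tokens:
        frequencies[token] = frequencies.get(token, 0.0) + 1.0
    return frequencies


def cosine(left: Dict[str, float], right: Dict[str, float]) -> float:
    # Dot product over shared terms divided by the product of Euclidean norms.
    dot = sum(value * right.get(token, 0.0) for token, value in left.items())
    left_norm = math.sqrt(sum(v * v for v in left.values()))
    right_norm = math.sqrt(sum(v * v for v in right.values()))
    return dot / (left_norm * right_norm) if left_norm and right_norm else 0.0


query = term_frequencies(tokenize("deterministic vector retrieval"))
document = term_frequencies(tokenize("A deterministic backend for vector retrieval over extracted text."))
print(cosine(query, document))  # a score in (0, 1]; items scoring <= 0 are dropped by _score_items

Because scores depend only on token counts, repeating the same query over the same corpus yields the same ranking, which is what the "deterministic" label in the module docstring refers to.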
biblicus/cli.py CHANGED
@@ -563,7 +563,9 @@ def cmd_context_pack_build(arguments: argparse.Namespace) -> int:
     """
     input_text = sys.stdin.read()
     if not input_text.strip():
-        raise ValueError("Context pack build requires a retrieval result JavaScript Object Notation on standard input")
+        raise ValueError(
+            "Context pack build requires a retrieval result JavaScript Object Notation on standard input"
+        )
     retrieval_result = RetrievalResult.model_validate_json(input_text)
     join_with = bytes(arguments.join_with, "utf-8").decode("unicode_escape")
     policy = ContextPackPolicy(join_with=join_with)
@@ -685,6 +687,58 @@ def cmd_analyze_topics(arguments: argparse.Namespace) -> int:
     return 0


+def cmd_analyze_profile(arguments: argparse.Namespace) -> int:
+    """
+    Run profiling analysis for a corpus.
+
+    :param arguments: Parsed command-line interface arguments.
+    :type arguments: argparse.Namespace
+    :return: Exit code.
+    :rtype: int
+    """
+    import yaml
+
+    corpus = (
+        Corpus.open(arguments.corpus)
+        if getattr(arguments, "corpus", None)
+        else Corpus.find(Path.cwd())
+    )
+
+    recipe_data: dict[str, object] = {}
+    if arguments.recipe is not None:
+        recipe_path = Path(arguments.recipe)
+        if not recipe_path.is_file():
+            raise FileNotFoundError(f"Recipe file not found: {recipe_path}")
+        recipe_raw = yaml.safe_load(recipe_path.read_text(encoding="utf-8")) or {}
+        if not isinstance(recipe_raw, dict):
+            raise ValueError("Profiling recipe must be a mapping/object")
+        recipe_data = recipe_raw
+
+    if arguments.extraction_run:
+        extraction_run = parse_extraction_run_reference(arguments.extraction_run)
+    else:
+        extraction_run = corpus.latest_extraction_run_reference()
+        if extraction_run is None:
+            raise ValueError("Profiling analysis requires an extraction run to supply text inputs")
+        print(
+            "Warning: using latest extraction run; pass --extraction-run for reproducibility.",
+            file=sys.stderr,
+        )
+
+    backend = get_analysis_backend("profiling")
+    try:
+        output = backend.run_analysis(
+            corpus,
+            recipe_name=arguments.recipe_name,
+            config=recipe_data,
+            extraction_run=extraction_run,
+        )
+    except ValidationError as exc:
+        raise ValueError(f"Invalid profiling recipe: {exc}") from exc
+    print(output.model_dump_json(indent=2))
+    return 0
+
+
 def build_parser() -> argparse.ArgumentParser:
     """
     Build the command-line interface argument parser.
@@ -890,14 +944,20 @@ def build_parser() -> argparse.ArgumentParser:

     p_crawl = sub.add_parser("crawl", help="Crawl a website prefix into the corpus.")
     _add_common_corpus_arg(p_crawl)
-    p_crawl.add_argument("--root-url", required=True, help="Root uniform resource locator to fetch.")
+    p_crawl.add_argument(
+        "--root-url", required=True, help="Root uniform resource locator to fetch."
+    )
     p_crawl.add_argument(
         "--allowed-prefix",
         required=True,
         help="Uniform resource locator prefix that limits which links are eligible for crawl.",
     )
-    p_crawl.add_argument("--max-items", type=int, default=50, help="Maximum number of items to store.")
-    p_crawl.add_argument("--tags", default=None, help="Comma-separated tags to apply to stored items.")
+    p_crawl.add_argument(
+        "--max-items", type=int, default=50, help="Maximum number of items to store."
+    )
+    p_crawl.add_argument(
+        "--tags", default=None, help="Comma-separated tags to apply to stored items."
+    )
     p_crawl.add_argument("--tag", action="append", help="Repeatable tag to apply to stored items.")
     p_crawl.set_defaults(func=cmd_crawl)

@@ -923,6 +983,25 @@ def build_parser() -> argparse.ArgumentParser:
     )
     p_analyze_topics.set_defaults(func=cmd_analyze_topics)

+    p_analyze_profile = analyze_sub.add_parser("profile", help="Run profiling analysis.")
+    _add_common_corpus_arg(p_analyze_profile)
+    p_analyze_profile.add_argument(
+        "--recipe",
+        default=None,
+        help="Optional profiling recipe YAML file.",
+    )
+    p_analyze_profile.add_argument(
+        "--recipe-name",
+        default="default",
+        help="Human-readable recipe name.",
+    )
+    p_analyze_profile.add_argument(
+        "--extraction-run",
+        default=None,
+        help="Extraction run reference in the form extractor_id:run_id.",
+    )
+    p_analyze_profile.set_defaults(func=cmd_analyze_profile)
+
     return parser

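Since every subparser dispatches through set_defaults(func=...), the new subcommand can also be exercised programmatically via build_parser. The sketch below is a hedged illustration: it assumes the analyze group is registered under the name "analyze" and that _add_common_corpus_arg adds a --corpus option, neither of which appears in this diff, and the corpus path and extraction run reference are placeholders.

from biblicus.cli import build_parser

# Hypothetical invocation; "analyze" and "--corpus" are assumptions defined outside
# this diff, and "markitdown:run-0001" is a placeholder extractor_id:run_id reference.
parser = build_parser()
arguments = parser.parse_args(
    [
        "analyze",
        "profile",
        "--corpus",
        "./corpus",
        "--recipe-name",
        "baseline",
        "--extraction-run",
        "markitdown:run-0001",
    ]
)
exit_code = arguments.func(arguments)  # dispatches to cmd_analyze_profile and prints JSON output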
biblicus/corpus.py CHANGED
@@ -622,7 +622,9 @@ class Corpus:
         data = json.loads(manifest_path.read_text(encoding="utf-8"))
         return ExtractionRunManifest.model_validate(data)

-    def list_extraction_runs(self, *, extractor_id: Optional[str] = None) -> List[ExtractionRunListEntry]:
+    def list_extraction_runs(
+        self, *, extractor_id: Optional[str] = None
+    ) -> List[ExtractionRunListEntry]:
         """
         List extraction runs stored under the corpus.

@@ -669,7 +671,9 @@ class Corpus:
                 )
             )

-        entries.sort(key=lambda entry: (entry.created_at, entry.extractor_id, entry.run_id), reverse=True)
+        entries.sort(
+            key=lambda entry: (entry.created_at, entry.extractor_id, entry.run_id), reverse=True
+        )
         return entries

     def latest_extraction_run_reference(
@@ -1366,7 +1370,9 @@ class Corpus:
         """
         _ = filename
         item_id = str(uuid.uuid4())
-        destination_relpath = str(Path(DEFAULT_RAW_DIR) / "imports" / "crawl" / crawl_id / relative_path)
+        destination_relpath = str(
+            Path(DEFAULT_RAW_DIR) / "imports" / "crawl" / crawl_id / relative_path
+        )
         destination_path = (self.root / destination_relpath).resolve()
         destination_path.parent.mkdir(parents=True, exist_ok=True)
         destination_path.write_bytes(data)
@@ -99,7 +99,10 @@ class EvidenceRerankLongestText(EvidenceReranker):
         """
         return sorted(
             evidence,
-            key=lambda evidence_item: (-len((evidence_item.text or "").strip()), evidence_item.item_id),
+            key=lambda evidence_item: (
+                -len((evidence_item.text or "").strip()),
+                evidence_item.item_id,
+            ),
         )


@@ -198,4 +201,3 @@ def apply_evidence_filter(
     """
     evidence_filter = _EVIDENCE_FILTERS[filter_id]
     return evidence_filter.filter(query_text=query_text, evidence=evidence, config=config)
-
biblicus/extraction.py CHANGED
@@ -345,7 +345,9 @@ def build_extraction_run(
     manifest = create_extraction_run_manifest(corpus, recipe=recipe)
     run_dir = corpus.extraction_run_dir(extractor_id=extractor_id, run_id=manifest.run_id)
     if run_dir.exists():
-        return corpus.load_extraction_run_manifest(extractor_id=extractor_id, run_id=manifest.run_id)
+        return corpus.load_extraction_run_manifest(
+            extractor_id=extractor_id, run_id=manifest.run_id
+        )
     run_dir.mkdir(parents=True, exist_ok=False)

     catalog = corpus.load_catalog()
@@ -29,6 +29,7 @@ class MarkItDownExtractorConfig(BaseModel):

     enable_plugins: bool = Field(default=False)

+
 class MarkItDownExtractor(TextExtractor):
     """
     Extractor plugin backed by the `markitdown` library.
@@ -152,9 +152,7 @@ class PaddleOcrVlExtractor(TextExtractor):
             parsed_config.backend.api_provider,
             config_override=parsed_config.backend.api_key,
         )
-        text, confidence = self._extract_via_api(
-            source_path, parsed_config, api_key
-        )
+        text, confidence = self._extract_via_api(source_path, parsed_config, api_key)

         return ExtractedText(
             text=text,
biblicus/models.py CHANGED
@@ -263,6 +263,8 @@ class Evidence(BaseModel):
     :vartype span_end: int or None
     :ivar stage: Retrieval stage label (for example, scan, full-text search, rerank).
     :vartype stage: str
+    :ivar stage_scores: Optional per-stage scores for multi-stage retrieval.
+    :vartype stage_scores: dict[str, float] or None
     :ivar recipe_id: Recipe identifier used to create the run.
     :vartype recipe_id: str
     :ivar run_id: Retrieval run identifier.
@@ -283,6 +285,7 @@ class Evidence(BaseModel):
     span_start: Optional[int] = None
     span_end: Optional[int] = None
     stage: str
+    stage_scores: Optional[Dict[str, float]] = None
     recipe_id: str
     run_id: str
     hash: Optional[str] = None
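The new stage_scores field is optional and defaults to None, so existing Evidence payloads remain valid. A minimal construction sketch follows; the field values are illustrative, and the set of required fields is assumed from the Evidence constructor call in the vector backend above rather than from the full model definition.

from biblicus.models import Evidence

evidence = Evidence(
    item_id="item-0001",            # illustrative identifiers and values
    source_uri=None,
    media_type="text/markdown",
    score=0.42,
    rank=1,
    text="...matched snippet...",
    content_ref=None,
    span_start=None,
    span_end=None,
    stage="vector",
    stage_scores={"vector": 0.42},  # new optional per-stage score map
    recipe_id="recipe-0001",
    run_id="run-0001",
    hash=None,
)
print(evidence.stage_scores)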