biblicus 0.10.0__py3-none-any.whl → 0.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,460 @@
1
+ """
2
+ Deterministic term-frequency vector retrieval backend.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import math
8
+ import re
9
+ from typing import Dict, Iterable, List, Optional, Tuple
10
+
11
+ from pydantic import BaseModel, ConfigDict, Field
12
+
13
+ from ..corpus import Corpus
14
+ from ..frontmatter import parse_front_matter
15
+ from ..models import (
16
+ Evidence,
17
+ ExtractionRunReference,
18
+ QueryBudget,
19
+ RetrievalResult,
20
+ RetrievalRun,
21
+ parse_extraction_run_reference,
22
+ )
23
+ from ..retrieval import apply_budget, create_recipe_manifest, create_run_manifest, hash_text
24
+ from ..time import utc_now_iso
25
+
26
+
27
class VectorRecipeConfig(BaseModel):
    """
    Settings accepted by the vector retrieval backend.

    :ivar snippet_characters: Character budget applied to evidence snippets.
    :vartype snippet_characters: int
    :ivar extraction_run: Optional extraction run reference formatted as
        extractor_id:run_id.
    :vartype extraction_run: str or None
    """

    # Reject unknown keys so recipe typos fail loudly at validation time.
    model_config = ConfigDict(extra="forbid")

    snippet_characters: int = Field(default=400, ge=1)
    extraction_run: Optional[str] = None
41
+
42
+
43
class VectorBackend:
    """
    Deterministic retrieval backend scoring items with term-frequency cosine
    similarity.

    :ivar backend_id: Backend identifier.
    :vartype backend_id: str
    """

    backend_id = "vector"

    def build_run(
        self, corpus: Corpus, *, recipe_name: str, config: Dict[str, object]
    ) -> RetrievalRun:
        """
        Register a vector backend run (no materialization).

        :param corpus: Corpus to build against.
        :type corpus: Corpus
        :param recipe_name: Human-readable recipe name.
        :type recipe_name: str
        :param config: Backend-specific configuration values.
        :type config: dict[str, object]
        :return: Run manifest describing the build.
        :rtype: RetrievalRun
        """
        parsed_config = VectorRecipeConfig.model_validate(config)
        catalog = corpus.load_catalog()
        recipe_manifest = create_recipe_manifest(
            backend_id=self.backend_id,
            name=recipe_name,
            config=parsed_config.model_dump(),
        )
        build_stats = {
            "items": len(catalog.items),
            "text_items": _count_text_items(
                corpus, catalog.items.values(), parsed_config
            ),
        }
        run_manifest = create_run_manifest(
            corpus, recipe=recipe_manifest, stats=build_stats, artifact_paths=[]
        )
        corpus.write_run(run_manifest)
        return run_manifest

    def query(
        self,
        corpus: Corpus,
        *,
        run: RetrievalRun,
        query_text: str,
        budget: QueryBudget,
    ) -> RetrievalResult:
        """
        Query the corpus using term-frequency cosine similarity.

        :param corpus: Corpus associated with the run.
        :type corpus: Corpus
        :param run: Run manifest to use for querying.
        :type run: RetrievalRun
        :param query_text: Query text to execute.
        :type query_text: str
        :param budget: Evidence selection budget.
        :type budget: QueryBudget
        :return: Retrieval results containing evidence.
        :rtype: RetrievalResult
        """
        parsed_config = VectorRecipeConfig.model_validate(run.recipe.config)
        tokens = _tokenize_text(query_text)
        if not tokens:
            # Nothing tokenizable in the query: return an empty result shell.
            return RetrievalResult(
                query_text=query_text,
                budget=budget,
                run_id=run.run_id,
                recipe_id=run.recipe.recipe_id,
                backend_id=self.backend_id,
                generated_at=utc_now_iso(),
                evidence=[],
                stats={"candidates": 0, "returned": 0},
            )
        weights = _term_frequencies(tokens)
        weights_norm = _vector_norm(weights)
        catalog = corpus.load_catalog()
        reference = _resolve_extraction_reference(corpus, parsed_config)
        candidates = _score_items(
            corpus,
            catalog.items.values(),
            query_tokens=tokens,
            query_vector=weights,
            query_norm=weights_norm,
            snippet_characters=parsed_config.snippet_characters,
            extraction_reference=reference,
        )
        # Deterministic order: highest score first, item id as tie-breaker.
        candidates.sort(key=lambda candidate: (-candidate.score, candidate.item_id))
        ranked: List[Evidence] = []
        for position, candidate in enumerate(candidates, start=1):
            ranked.append(
                candidate.model_copy(
                    update={
                        "rank": position,
                        "recipe_id": run.recipe.recipe_id,
                        "run_id": run.run_id,
                    }
                )
            )
        selected = apply_budget(ranked, budget)
        return RetrievalResult(
            query_text=query_text,
            budget=budget,
            run_id=run.run_id,
            recipe_id=run.recipe.recipe_id,
            backend_id=self.backend_id,
            generated_at=utc_now_iso(),
            evidence=selected,
            stats={"candidates": len(candidates), "returned": len(selected)},
        )
157
+
158
+
159
+ def _resolve_extraction_reference(
160
+ corpus: Corpus, recipe_config: VectorRecipeConfig
161
+ ) -> Optional[ExtractionRunReference]:
162
+ """
163
+ Resolve an extraction run reference from a recipe config.
164
+
165
+ :param corpus: Corpus associated with the recipe.
166
+ :type corpus: Corpus
167
+ :param recipe_config: Parsed vector recipe configuration.
168
+ :type recipe_config: VectorRecipeConfig
169
+ :return: Parsed extraction reference or None.
170
+ :rtype: ExtractionRunReference or None
171
+ :raises FileNotFoundError: If an extraction run is referenced but not present.
172
+ """
173
+ if not recipe_config.extraction_run:
174
+ return None
175
+ extraction_reference = parse_extraction_run_reference(recipe_config.extraction_run)
176
+ run_dir = corpus.extraction_run_dir(
177
+ extractor_id=extraction_reference.extractor_id,
178
+ run_id=extraction_reference.run_id,
179
+ )
180
+ if not run_dir.is_dir():
181
+ raise FileNotFoundError(f"Missing extraction run: {extraction_reference.as_string()}")
182
+ return extraction_reference
183
+
184
+
185
def _count_text_items(
    corpus: Corpus, items: Iterable[object], recipe_config: VectorRecipeConfig
) -> int:
    """
    Count catalog items that represent text content.

    An item counts as text when the referenced extraction run (if any) holds
    non-empty extracted text for it, or when its media type is a ``text/*``
    type.

    :param corpus: Corpus containing the items.
    :type corpus: Corpus
    :param items: Catalog items to inspect.
    :type items: Iterable[object]
    :param recipe_config: Parsed vector recipe configuration.
    :type recipe_config: VectorRecipeConfig
    :return: Number of text items.
    :rtype: int
    """
    text_item_count = 0
    extraction_reference = _resolve_extraction_reference(corpus, recipe_config)
    for catalog_item in items:
        item_id = str(getattr(catalog_item, "id", ""))
        if extraction_reference and item_id:
            extracted_text = corpus.read_extracted_text(
                extractor_id=extraction_reference.extractor_id,
                run_id=extraction_reference.run_id,
                item_id=item_id,
            )
            if isinstance(extracted_text, str) and extracted_text.strip():
                text_item_count += 1
                continue
        media_type = getattr(catalog_item, "media_type", "")
        # "text/markdown" is already covered by the text/* prefix check; the
        # previous extra equality comparison was redundant and relied on the
        # raw (possibly non-str) attribute.
        if str(media_type).startswith("text/"):
            text_item_count += 1
    return text_item_count
217
+
218
+
219
+ def _tokenize_text(text: str) -> List[str]:
220
+ """
221
+ Tokenize text into lowercase word tokens.
222
+
223
+ :param text: Input text.
224
+ :type text: str
225
+ :return: Token list.
226
+ :rtype: list[str]
227
+ """
228
+ return re.findall(r"[a-z0-9]+", text.lower())
229
+
230
+
231
+ def _term_frequencies(tokens: List[str]) -> Dict[str, float]:
232
+ """
233
+ Build term frequency weights from tokens.
234
+
235
+ :param tokens: Token list.
236
+ :type tokens: list[str]
237
+ :return: Term frequency mapping.
238
+ :rtype: dict[str, float]
239
+ """
240
+ frequencies: Dict[str, float] = {}
241
+ for token in tokens:
242
+ frequencies[token] = frequencies.get(token, 0.0) + 1.0
243
+ return frequencies
244
+
245
+
246
+ def _vector_norm(vector: Dict[str, float]) -> float:
247
+ """
248
+ Compute the Euclidean norm of a term-frequency vector.
249
+
250
+ :param vector: Term frequency mapping.
251
+ :type vector: dict[str, float]
252
+ :return: Vector norm.
253
+ :rtype: float
254
+ """
255
+ return math.sqrt(sum(value * value for value in vector.values()))
256
+
257
+
258
+ def _cosine_similarity(
259
+ left: Dict[str, float],
260
+ *,
261
+ left_norm: float,
262
+ right: Dict[str, float],
263
+ right_norm: float,
264
+ ) -> float:
265
+ """
266
+ Compute cosine similarity between two term-frequency vectors.
267
+
268
+ :param left: Left term-frequency vector.
269
+ :type left: dict[str, float]
270
+ :param left_norm: Precomputed left vector norm.
271
+ :type left_norm: float
272
+ :param right: Right term-frequency vector.
273
+ :type right: dict[str, float]
274
+ :param right_norm: Precomputed right vector norm.
275
+ :type right_norm: float
276
+ :return: Cosine similarity score.
277
+ :rtype: float
278
+ """
279
+ dot = 0.0
280
+ if len(left) < len(right):
281
+ for token, value in left.items():
282
+ dot += value * right.get(token, 0.0)
283
+ else:
284
+ for token, value in right.items():
285
+ dot += value * left.get(token, 0.0)
286
+ return dot / (left_norm * right_norm)
287
+
288
+
289
+ def _load_text_from_item(
290
+ corpus: Corpus,
291
+ *,
292
+ item_id: str,
293
+ relpath: str,
294
+ media_type: str,
295
+ extraction_reference: Optional[ExtractionRunReference],
296
+ ) -> Optional[str]:
297
+ """
298
+ Load a text payload from a catalog item.
299
+
300
+ :param corpus: Corpus containing the item.
301
+ :type corpus: Corpus
302
+ :param item_id: Item identifier.
303
+ :type item_id: str
304
+ :param relpath: Relative path to the stored content.
305
+ :type relpath: str
306
+ :param media_type: Media type for the stored content.
307
+ :type media_type: str
308
+ :param extraction_reference: Optional extraction run reference.
309
+ :type extraction_reference: ExtractionRunReference or None
310
+ :return: Text payload or None if not decodable as text.
311
+ :rtype: str or None
312
+ """
313
+ if extraction_reference:
314
+ extracted_text = corpus.read_extracted_text(
315
+ extractor_id=extraction_reference.extractor_id,
316
+ run_id=extraction_reference.run_id,
317
+ item_id=item_id,
318
+ )
319
+ if isinstance(extracted_text, str) and extracted_text.strip():
320
+ return extracted_text
321
+
322
+ content_path = corpus.root / relpath
323
+ raw_bytes = content_path.read_bytes()
324
+ if media_type == "text/markdown":
325
+ markdown_text = raw_bytes.decode("utf-8")
326
+ parsed_document = parse_front_matter(markdown_text)
327
+ return parsed_document.body
328
+ if media_type.startswith("text/"):
329
+ return raw_bytes.decode("utf-8")
330
+ return None
331
+
332
+
333
+ def _find_first_match(text: str, tokens: List[str]) -> Optional[Tuple[int, int]]:
334
+ """
335
+ Locate the earliest token match span in a text payload.
336
+
337
+ :param text: Text to scan.
338
+ :type text: str
339
+ :param tokens: Query tokens.
340
+ :type tokens: list[str]
341
+ :return: Start/end span for the earliest match, or None if no matches.
342
+ :rtype: tuple[int, int] or None
343
+ """
344
+ lower_text = text.lower()
345
+ best_start: Optional[int] = None
346
+ best_end: Optional[int] = None
347
+ for token in tokens:
348
+ if not token:
349
+ continue
350
+ token_start = lower_text.find(token)
351
+ if token_start == -1:
352
+ continue
353
+ token_end = token_start + len(token)
354
+ if best_start is None or token_start < best_start:
355
+ best_start = token_start
356
+ best_end = token_end
357
+ if best_start is None or best_end is None:
358
+ return None
359
+ return best_start, best_end
360
+
361
+
362
+ def _build_snippet(text: str, span: Optional[Tuple[int, int]], *, max_chars: int) -> str:
363
+ """
364
+ Build a snippet around a match span, constrained by a character budget.
365
+
366
+ :param text: Source text to slice.
367
+ :type text: str
368
+ :param span: Match span to center on.
369
+ :type span: tuple[int, int] or None
370
+ :param max_chars: Maximum snippet length.
371
+ :type max_chars: int
372
+ :return: Snippet text.
373
+ :rtype: str
374
+ """
375
+ if not text:
376
+ return ""
377
+ if span is None:
378
+ return text[:max_chars]
379
+ span_start, span_end = span
380
+ half_window = max_chars // 2
381
+ snippet_start = max(span_start - half_window, 0)
382
+ snippet_end = min(span_end + half_window, len(text))
383
+ return text[snippet_start:snippet_end]
384
+
385
+
386
def _score_items(
    corpus: Corpus,
    items: Iterable[object],
    *,
    query_tokens: List[str],
    query_vector: Dict[str, float],
    query_norm: float,
    snippet_characters: int,
    extraction_reference: Optional[ExtractionRunReference],
) -> List[Evidence]:
    """
    Score catalog items and return evidence candidates.

    Items that yield no text, no tokens, or a non-positive similarity are
    skipped.

    :param corpus: Corpus containing the items.
    :type corpus: Corpus
    :param items: Catalog items to score.
    :type items: Iterable[object]
    :param query_tokens: Tokenized query text.
    :type query_tokens: list[str]
    :param query_vector: Query term-frequency vector.
    :type query_vector: dict[str, float]
    :param query_norm: Query vector norm.
    :type query_norm: float
    :param snippet_characters: Snippet length budget.
    :type snippet_characters: int
    :param extraction_reference: Optional extraction run reference.
    :type extraction_reference: ExtractionRunReference or None
    :return: Evidence candidates with provisional ranks.
    :rtype: list[Evidence]
    """
    evidence_items: List[Evidence] = []
    for catalog_item in items:
        media_type = str(getattr(catalog_item, "media_type", ""))
        relpath = getattr(catalog_item, "relpath", "")
        item_id = str(getattr(catalog_item, "id", ""))
        item_text = _load_text_from_item(
            corpus,
            item_id=item_id,
            relpath=relpath,
            media_type=media_type,
            extraction_reference=extraction_reference,
        )
        if item_text is None:
            continue
        tokens = _tokenize_text(item_text)
        if not tokens:
            continue
        item_vector = _term_frequencies(tokens)
        similarity = _cosine_similarity(
            query_vector,
            left_norm=query_norm,
            right=item_vector,
            right_norm=_vector_norm(item_vector),
        )
        if similarity <= 0:
            continue
        span = _find_first_match(item_text, query_tokens)
        snippet = _build_snippet(item_text, span, max_chars=snippet_characters)
        evidence_items.append(
            Evidence(
                # Reuse the id computed above; it was previously re-fetched
                # via getattr without a default, which could raise for
                # id-less items that had already been processed with "".
                item_id=item_id,
                source_uri=getattr(catalog_item, "source_uri", None),
                media_type=media_type,
                score=float(similarity),
                rank=1,  # provisional; the caller assigns final ranks
                text=snippet,
                content_ref=None,
                span_start=span[0] if span else None,
                span_end=span[1] if span else None,
                stage="vector",
                recipe_id="",
                run_id="",
                hash=hash_text(snippet),
            )
        )
    return evidence_items
biblicus/cli.py CHANGED
@@ -15,9 +15,11 @@ from pydantic import ValidationError
15
15
  from .analysis import get_analysis_backend
16
16
  from .backends import get_backend
17
17
  from .context import (
18
+ CharacterBudget,
18
19
  ContextPackPolicy,
19
20
  TokenBudget,
20
21
  build_context_pack,
22
+ fit_context_pack_to_character_budget,
21
23
  fit_context_pack_to_token_budget,
22
24
  )
23
25
  from .corpus import Corpus
@@ -568,7 +570,11 @@ def cmd_context_pack_build(arguments: argparse.Namespace) -> int:
568
570
  )
569
571
  retrieval_result = RetrievalResult.model_validate_json(input_text)
570
572
  join_with = bytes(arguments.join_with, "utf-8").decode("unicode_escape")
571
- policy = ContextPackPolicy(join_with=join_with)
573
+ policy = ContextPackPolicy(
574
+ join_with=join_with,
575
+ ordering=arguments.ordering,
576
+ include_metadata=arguments.include_metadata,
577
+ )
572
578
  context_pack = build_context_pack(retrieval_result, policy=policy)
573
579
  if arguments.max_tokens is not None:
574
580
  context_pack = fit_context_pack_to_token_budget(
@@ -576,6 +582,12 @@ def cmd_context_pack_build(arguments: argparse.Namespace) -> int:
576
582
  policy=policy,
577
583
  token_budget=TokenBudget(max_tokens=int(arguments.max_tokens)),
578
584
  )
585
+ if arguments.max_characters is not None:
586
+ context_pack = fit_context_pack_to_character_budget(
587
+ context_pack,
588
+ policy=policy,
589
+ character_budget=CharacterBudget(max_characters=int(arguments.max_characters)),
590
+ )
579
591
  print(
580
592
  json.dumps(
581
593
  {
@@ -921,12 +933,29 @@ def build_parser() -> argparse.ArgumentParser:
921
933
  default="\\n\\n",
922
934
  help="Separator between evidence blocks (escape sequences supported, default is two newlines).",
923
935
  )
936
+ p_context_pack_build.add_argument(
937
+ "--ordering",
938
+ choices=["rank", "score", "source"],
939
+ default="rank",
940
+ help="Evidence ordering policy (rank, score, source).",
941
+ )
942
+ p_context_pack_build.add_argument(
943
+ "--include-metadata",
944
+ action="store_true",
945
+ help="Include evidence metadata in each context pack block.",
946
+ )
924
947
  p_context_pack_build.add_argument(
925
948
  "--max-tokens",
926
949
  default=None,
927
950
  type=int,
928
951
  help="Optional token budget for the final context pack using the naive-whitespace tokenizer.",
929
952
  )
953
+ p_context_pack_build.add_argument(
954
+ "--max-characters",
955
+ default=None,
956
+ type=int,
957
+ help="Optional character budget for the final context pack.",
958
+ )
930
959
  p_context_pack_build.set_defaults(func=cmd_context_pack_build)
931
960
 
932
961
  p_eval = sub.add_parser("eval", help="Evaluate a run against a dataset.")