biblicus 0.9.0__py3-none-any.whl → 0.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +1 -1
- biblicus/_vendor/dotyaml/__init__.py +0 -1
- biblicus/_vendor/dotyaml/interpolation.py +0 -1
- biblicus/_vendor/dotyaml/loader.py +0 -1
- biblicus/_vendor/dotyaml/transformer.py +0 -1
- biblicus/analysis/__init__.py +2 -0
- biblicus/analysis/models.py +228 -5
- biblicus/analysis/profiling.py +337 -0
- biblicus/analysis/topic_modeling.py +3 -6
- biblicus/backends/__init__.py +4 -0
- biblicus/backends/hybrid.py +284 -0
- biblicus/backends/sqlite_full_text_search.py +266 -22
- biblicus/backends/vector.py +460 -0
- biblicus/cli.py +83 -4
- biblicus/corpus.py +9 -3
- biblicus/evidence_processing.py +4 -2
- biblicus/extraction.py +3 -1
- biblicus/extractors/markitdown_text.py +1 -0
- biblicus/extractors/paddleocr_vl_text.py +1 -3
- biblicus/models.py +3 -0
- biblicus/user_config.py +2 -6
- {biblicus-0.9.0.dist-info → biblicus-0.11.0.dist-info}/METADATA +13 -6
- {biblicus-0.9.0.dist-info → biblicus-0.11.0.dist-info}/RECORD +27 -24
- {biblicus-0.9.0.dist-info → biblicus-0.11.0.dist-info}/WHEEL +0 -0
- {biblicus-0.9.0.dist-info → biblicus-0.11.0.dist-info}/entry_points.txt +0 -0
- {biblicus-0.9.0.dist-info → biblicus-0.11.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-0.9.0.dist-info → biblicus-0.11.0.dist-info}/top_level.txt +0 -0
biblicus/backends/vector.py
ADDED
@@ -0,0 +1,460 @@
+"""
+Deterministic term-frequency vector retrieval backend.
+"""
+
+from __future__ import annotations
+
+import math
+import re
+from typing import Dict, Iterable, List, Optional, Tuple
+
+from pydantic import BaseModel, ConfigDict, Field
+
+from ..corpus import Corpus
+from ..frontmatter import parse_front_matter
+from ..models import (
+    Evidence,
+    ExtractionRunReference,
+    QueryBudget,
+    RetrievalResult,
+    RetrievalRun,
+    parse_extraction_run_reference,
+)
+from ..retrieval import apply_budget, create_recipe_manifest, create_run_manifest, hash_text
+from ..time import utc_now_iso
+
+
+class VectorRecipeConfig(BaseModel):
+    """
+    Configuration for the vector retrieval backend.
+
+    :ivar snippet_characters: Maximum characters to include in evidence snippets.
+    :vartype snippet_characters: int
+    :ivar extraction_run: Optional extraction run reference in the form extractor_id:run_id.
+    :vartype extraction_run: str or None
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    snippet_characters: int = Field(default=400, ge=1)
+    extraction_run: Optional[str] = None
+
+
+class VectorBackend:
+    """
+    Deterministic vector backend using term-frequency cosine similarity.
+
+    :ivar backend_id: Backend identifier.
+    :vartype backend_id: str
+    """
+
+    backend_id = "vector"
+
+    def build_run(
+        self, corpus: Corpus, *, recipe_name: str, config: Dict[str, object]
+    ) -> RetrievalRun:
+        """
+        Register a vector backend run (no materialization).
+
+        :param corpus: Corpus to build against.
+        :type corpus: Corpus
+        :param recipe_name: Human-readable recipe name.
+        :type recipe_name: str
+        :param config: Backend-specific configuration values.
+        :type config: dict[str, object]
+        :return: Run manifest describing the build.
+        :rtype: RetrievalRun
+        """
+        recipe_config = VectorRecipeConfig.model_validate(config)
+        catalog = corpus.load_catalog()
+        recipe = create_recipe_manifest(
+            backend_id=self.backend_id,
+            name=recipe_name,
+            config=recipe_config.model_dump(),
+        )
+        stats = {
+            "items": len(catalog.items),
+            "text_items": _count_text_items(corpus, catalog.items.values(), recipe_config),
+        }
+        run = create_run_manifest(corpus, recipe=recipe, stats=stats, artifact_paths=[])
+        corpus.write_run(run)
+        return run
+
+    def query(
+        self,
+        corpus: Corpus,
+        *,
+        run: RetrievalRun,
+        query_text: str,
+        budget: QueryBudget,
+    ) -> RetrievalResult:
+        """
+        Query the corpus using term-frequency cosine similarity.
+
+        :param corpus: Corpus associated with the run.
+        :type corpus: Corpus
+        :param run: Run manifest to use for querying.
+        :type run: RetrievalRun
+        :param query_text: Query text to execute.
+        :type query_text: str
+        :param budget: Evidence selection budget.
+        :type budget: QueryBudget
+        :return: Retrieval results containing evidence.
+        :rtype: RetrievalResult
+        """
+        recipe_config = VectorRecipeConfig.model_validate(run.recipe.config)
+        query_tokens = _tokenize_text(query_text)
+        if not query_tokens:
+            return RetrievalResult(
+                query_text=query_text,
+                budget=budget,
+                run_id=run.run_id,
+                recipe_id=run.recipe.recipe_id,
+                backend_id=self.backend_id,
+                generated_at=utc_now_iso(),
+                evidence=[],
+                stats={"candidates": 0, "returned": 0},
+            )
+        query_vector = _term_frequencies(query_tokens)
+        query_norm = _vector_norm(query_vector)
+        catalog = corpus.load_catalog()
+        extraction_reference = _resolve_extraction_reference(corpus, recipe_config)
+        scored_candidates = _score_items(
+            corpus,
+            catalog.items.values(),
+            query_tokens=query_tokens,
+            query_vector=query_vector,
+            query_norm=query_norm,
+            snippet_characters=recipe_config.snippet_characters,
+            extraction_reference=extraction_reference,
+        )
+        sorted_candidates = sorted(
+            scored_candidates,
+            key=lambda evidence_item: (-evidence_item.score, evidence_item.item_id),
+        )
+        ranked = [
+            evidence_item.model_copy(
+                update={
+                    "rank": index,
+                    "recipe_id": run.recipe.recipe_id,
+                    "run_id": run.run_id,
+                }
+            )
+            for index, evidence_item in enumerate(sorted_candidates, start=1)
+        ]
+        evidence = apply_budget(ranked, budget)
+        stats = {"candidates": len(sorted_candidates), "returned": len(evidence)}
+        return RetrievalResult(
+            query_text=query_text,
+            budget=budget,
+            run_id=run.run_id,
+            recipe_id=run.recipe.recipe_id,
+            backend_id=self.backend_id,
+            generated_at=utc_now_iso(),
+            evidence=evidence,
+            stats=stats,
+        )
+
+
+def _resolve_extraction_reference(
+    corpus: Corpus, recipe_config: VectorRecipeConfig
+) -> Optional[ExtractionRunReference]:
+    """
+    Resolve an extraction run reference from a recipe config.
+
+    :param corpus: Corpus associated with the recipe.
+    :type corpus: Corpus
+    :param recipe_config: Parsed vector recipe configuration.
+    :type recipe_config: VectorRecipeConfig
+    :return: Parsed extraction reference or None.
+    :rtype: ExtractionRunReference or None
+    :raises FileNotFoundError: If an extraction run is referenced but not present.
+    """
+    if not recipe_config.extraction_run:
+        return None
+    extraction_reference = parse_extraction_run_reference(recipe_config.extraction_run)
+    run_dir = corpus.extraction_run_dir(
+        extractor_id=extraction_reference.extractor_id,
+        run_id=extraction_reference.run_id,
+    )
+    if not run_dir.is_dir():
+        raise FileNotFoundError(f"Missing extraction run: {extraction_reference.as_string()}")
+    return extraction_reference
+
+
+def _count_text_items(
+    corpus: Corpus, items: Iterable[object], recipe_config: VectorRecipeConfig
+) -> int:
+    """
+    Count catalog items that represent text content.
+
+    :param corpus: Corpus containing the items.
+    :type corpus: Corpus
+    :param items: Catalog items to inspect.
+    :type items: Iterable[object]
+    :param recipe_config: Parsed vector recipe configuration.
+    :type recipe_config: VectorRecipeConfig
+    :return: Number of text items.
+    :rtype: int
+    """
+    text_item_count = 0
+    extraction_reference = _resolve_extraction_reference(corpus, recipe_config)
+    for catalog_item in items:
+        item_id = str(getattr(catalog_item, "id", ""))
+        if extraction_reference and item_id:
+            extracted_text = corpus.read_extracted_text(
+                extractor_id=extraction_reference.extractor_id,
+                run_id=extraction_reference.run_id,
+                item_id=item_id,
+            )
+            if isinstance(extracted_text, str) and extracted_text.strip():
+                text_item_count += 1
+                continue
+        media_type = getattr(catalog_item, "media_type", "")
+        if media_type == "text/markdown" or str(media_type).startswith("text/"):
+            text_item_count += 1
+    return text_item_count
+
+
+def _tokenize_text(text: str) -> List[str]:
+    """
+    Tokenize text into lowercase word tokens.
+
+    :param text: Input text.
+    :type text: str
+    :return: Token list.
+    :rtype: list[str]
+    """
+    return re.findall(r"[a-z0-9]+", text.lower())
+
+
+def _term_frequencies(tokens: List[str]) -> Dict[str, float]:
+    """
+    Build term frequency weights from tokens.
+
+    :param tokens: Token list.
+    :type tokens: list[str]
+    :return: Term frequency mapping.
+    :rtype: dict[str, float]
+    """
+    frequencies: Dict[str, float] = {}
+    for token in tokens:
+        frequencies[token] = frequencies.get(token, 0.0) + 1.0
+    return frequencies
+
+
+def _vector_norm(vector: Dict[str, float]) -> float:
+    """
+    Compute the Euclidean norm of a term-frequency vector.
+
+    :param vector: Term frequency mapping.
+    :type vector: dict[str, float]
+    :return: Vector norm.
+    :rtype: float
+    """
+    return math.sqrt(sum(value * value for value in vector.values()))
+
+
+def _cosine_similarity(
+    left: Dict[str, float],
+    *,
+    left_norm: float,
+    right: Dict[str, float],
+    right_norm: float,
+) -> float:
+    """
+    Compute cosine similarity between two term-frequency vectors.
+
+    :param left: Left term-frequency vector.
+    :type left: dict[str, float]
+    :param left_norm: Precomputed left vector norm.
+    :type left_norm: float
+    :param right: Right term-frequency vector.
+    :type right: dict[str, float]
+    :param right_norm: Precomputed right vector norm.
+    :type right_norm: float
+    :return: Cosine similarity score.
+    :rtype: float
+    """
+    dot = 0.0
+    if len(left) < len(right):
+        for token, value in left.items():
+            dot += value * right.get(token, 0.0)
+    else:
+        for token, value in right.items():
+            dot += value * left.get(token, 0.0)
+    return dot / (left_norm * right_norm)
+
+
+def _load_text_from_item(
+    corpus: Corpus,
+    *,
+    item_id: str,
+    relpath: str,
+    media_type: str,
+    extraction_reference: Optional[ExtractionRunReference],
+) -> Optional[str]:
+    """
+    Load a text payload from a catalog item.
+
+    :param corpus: Corpus containing the item.
+    :type corpus: Corpus
+    :param item_id: Item identifier.
+    :type item_id: str
+    :param relpath: Relative path to the stored content.
+    :type relpath: str
+    :param media_type: Media type for the stored content.
+    :type media_type: str
+    :param extraction_reference: Optional extraction run reference.
+    :type extraction_reference: ExtractionRunReference or None
+    :return: Text payload or None if not decodable as text.
+    :rtype: str or None
+    """
+    if extraction_reference:
+        extracted_text = corpus.read_extracted_text(
+            extractor_id=extraction_reference.extractor_id,
+            run_id=extraction_reference.run_id,
+            item_id=item_id,
+        )
+        if isinstance(extracted_text, str) and extracted_text.strip():
+            return extracted_text
+
+    content_path = corpus.root / relpath
+    raw_bytes = content_path.read_bytes()
+    if media_type == "text/markdown":
+        markdown_text = raw_bytes.decode("utf-8")
+        parsed_document = parse_front_matter(markdown_text)
+        return parsed_document.body
+    if media_type.startswith("text/"):
+        return raw_bytes.decode("utf-8")
+    return None
+
+
+def _find_first_match(text: str, tokens: List[str]) -> Optional[Tuple[int, int]]:
+    """
+    Locate the earliest token match span in a text payload.
+
+    :param text: Text to scan.
+    :type text: str
+    :param tokens: Query tokens.
+    :type tokens: list[str]
+    :return: Start/end span for the earliest match, or None if no matches.
+    :rtype: tuple[int, int] or None
+    """
+    lower_text = text.lower()
+    best_start: Optional[int] = None
+    best_end: Optional[int] = None
+    for token in tokens:
+        if not token:
+            continue
+        token_start = lower_text.find(token)
+        if token_start == -1:
+            continue
+        token_end = token_start + len(token)
+        if best_start is None or token_start < best_start:
+            best_start = token_start
+            best_end = token_end
+    if best_start is None or best_end is None:
+        return None
+    return best_start, best_end
+
+
+def _build_snippet(text: str, span: Optional[Tuple[int, int]], *, max_chars: int) -> str:
+    """
+    Build a snippet around a match span, constrained by a character budget.
+
+    :param text: Source text to slice.
+    :type text: str
+    :param span: Match span to center on.
+    :type span: tuple[int, int] or None
+    :param max_chars: Maximum snippet length.
+    :type max_chars: int
+    :return: Snippet text.
+    :rtype: str
+    """
+    if not text:
+        return ""
+    if span is None:
+        return text[:max_chars]
+    span_start, span_end = span
+    half_window = max_chars // 2
+    snippet_start = max(span_start - half_window, 0)
+    snippet_end = min(span_end + half_window, len(text))
+    return text[snippet_start:snippet_end]
+
+
+def _score_items(
+    corpus: Corpus,
+    items: Iterable[object],
+    *,
+    query_tokens: List[str],
+    query_vector: Dict[str, float],
+    query_norm: float,
+    snippet_characters: int,
+    extraction_reference: Optional[ExtractionRunReference],
+) -> List[Evidence]:
+    """
+    Score catalog items and return evidence candidates.
+
+    :param corpus: Corpus containing the items.
+    :type corpus: Corpus
+    :param items: Catalog items to score.
+    :type items: Iterable[object]
+    :param query_tokens: Tokenized query text.
+    :type query_tokens: list[str]
+    :param query_vector: Query term-frequency vector.
+    :type query_vector: dict[str, float]
+    :param query_norm: Query vector norm.
+    :type query_norm: float
+    :param snippet_characters: Snippet length budget.
+    :type snippet_characters: int
+    :param extraction_reference: Optional extraction run reference.
+    :type extraction_reference: ExtractionRunReference or None
+    :return: Evidence candidates with provisional ranks.
+    :rtype: list[Evidence]
+    """
+    evidence_items: List[Evidence] = []
+    for catalog_item in items:
+        media_type = getattr(catalog_item, "media_type", "")
+        relpath = getattr(catalog_item, "relpath", "")
+        item_id = str(getattr(catalog_item, "id", ""))
+        item_text = _load_text_from_item(
+            corpus,
+            item_id=item_id,
+            relpath=relpath,
+            media_type=str(media_type),
+            extraction_reference=extraction_reference,
+        )
+        if item_text is None:
+            continue
+        tokens = _tokenize_text(item_text)
+        if not tokens:
+            continue
+        vector = _term_frequencies(tokens)
+        similarity = _cosine_similarity(
+            query_vector, left_norm=query_norm, right=vector, right_norm=_vector_norm(vector)
+        )
+        if similarity <= 0:
+            continue
+        span = _find_first_match(item_text, query_tokens)
+        snippet = _build_snippet(item_text, span, max_chars=snippet_characters)
+        span_start = span[0] if span else None
+        span_end = span[1] if span else None
+        evidence_items.append(
+            Evidence(
+                item_id=str(getattr(catalog_item, "id")),
+                source_uri=getattr(catalog_item, "source_uri", None),
+                media_type=str(media_type),
+                score=float(similarity),
+                rank=1,
+                text=snippet,
+                content_ref=None,
+                span_start=span_start,
+                span_end=span_end,
+                stage="vector",
+                recipe_id="",
+                run_id="",
+                hash=hash_text(snippet),
+            )
+        )
+    return evidence_items
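The backend scores documents with plain term-frequency cosine similarity rather than learned embeddings, which is what keeps retrieval deterministic. As a rough standalone illustration, the sketch below re-implements the same math as _tokenize_text, _term_frequencies, _vector_norm, and _cosine_similarity above; it is not part of the wheel.

import math
import re
from typing import Dict, List

def tokenize(text: str) -> List[str]:
    # Same lowercase word tokenization as _tokenize_text above.
    return re.findall(r"[a-z0-9]+", text.lower())

def term_frequencies(tokens: List[str]) -> Dict[str, float]:
    # Raw term counts as floats, matching _term_frequencies.
    frequencies: Dict[str, float] = {}
    for token in tokens:
        frequencies[token] = frequencies.get(token, 0.0) + 1.0
    return frequencies

def cosine(left: Dict[str, float], right: Dict[str, float]) -> float:
    # Dot product over shared terms divided by the Euclidean norms.
    dot = sum(value * right.get(token, 0.0) for token, value in left.items())
    left_norm = math.sqrt(sum(v * v for v in left.values()))
    right_norm = math.sqrt(sum(v * v for v in right.values()))
    return dot / (left_norm * right_norm)

query = term_frequencies(tokenize("vector retrieval backend"))
document = term_frequencies(tokenize("A deterministic vector retrieval backend for a corpus."))
print(cosine(query, document))  # prints about 0.55; higher means closer term overlap

Items whose similarity is zero or negative are skipped before ranking, which is why a query with no overlapping terms returns no evidence at all.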
biblicus/cli.py
CHANGED
@@ -563,7 +563,9 @@ def cmd_context_pack_build(arguments: argparse.Namespace) -> int:
     """
     input_text = sys.stdin.read()
     if not input_text.strip():
-        raise ValueError(
+        raise ValueError(
+            "Context pack build requires a retrieval result JavaScript Object Notation on standard input"
+        )
     retrieval_result = RetrievalResult.model_validate_json(input_text)
     join_with = bytes(arguments.join_with, "utf-8").decode("unicode_escape")
     policy = ContextPackPolicy(join_with=join_with)
@@ -685,6 +687,58 @@ def cmd_analyze_topics(arguments: argparse.Namespace) -> int:
     return 0


+def cmd_analyze_profile(arguments: argparse.Namespace) -> int:
+    """
+    Run profiling analysis for a corpus.
+
+    :param arguments: Parsed command-line interface arguments.
+    :type arguments: argparse.Namespace
+    :return: Exit code.
+    :rtype: int
+    """
+    import yaml
+
+    corpus = (
+        Corpus.open(arguments.corpus)
+        if getattr(arguments, "corpus", None)
+        else Corpus.find(Path.cwd())
+    )
+
+    recipe_data: dict[str, object] = {}
+    if arguments.recipe is not None:
+        recipe_path = Path(arguments.recipe)
+        if not recipe_path.is_file():
+            raise FileNotFoundError(f"Recipe file not found: {recipe_path}")
+        recipe_raw = yaml.safe_load(recipe_path.read_text(encoding="utf-8")) or {}
+        if not isinstance(recipe_raw, dict):
+            raise ValueError("Profiling recipe must be a mapping/object")
+        recipe_data = recipe_raw
+
+    if arguments.extraction_run:
+        extraction_run = parse_extraction_run_reference(arguments.extraction_run)
+    else:
+        extraction_run = corpus.latest_extraction_run_reference()
+        if extraction_run is None:
+            raise ValueError("Profiling analysis requires an extraction run to supply text inputs")
+        print(
+            "Warning: using latest extraction run; pass --extraction-run for reproducibility.",
+            file=sys.stderr,
+        )
+
+    backend = get_analysis_backend("profiling")
+    try:
+        output = backend.run_analysis(
+            corpus,
+            recipe_name=arguments.recipe_name,
+            config=recipe_data,
+            extraction_run=extraction_run,
+        )
+    except ValidationError as exc:
+        raise ValueError(f"Invalid profiling recipe: {exc}") from exc
+    print(output.model_dump_json(indent=2))
+    return 0
+
+
 def build_parser() -> argparse.ArgumentParser:
     """
     Build the command-line interface argument parser.
@@ -890,14 +944,20 @@ def build_parser() -> argparse.ArgumentParser:

     p_crawl = sub.add_parser("crawl", help="Crawl a website prefix into the corpus.")
     _add_common_corpus_arg(p_crawl)
-    p_crawl.add_argument(
+    p_crawl.add_argument(
+        "--root-url", required=True, help="Root uniform resource locator to fetch."
+    )
     p_crawl.add_argument(
         "--allowed-prefix",
         required=True,
         help="Uniform resource locator prefix that limits which links are eligible for crawl.",
     )
-    p_crawl.add_argument(
-
+    p_crawl.add_argument(
+        "--max-items", type=int, default=50, help="Maximum number of items to store."
+    )
+    p_crawl.add_argument(
+        "--tags", default=None, help="Comma-separated tags to apply to stored items."
+    )
     p_crawl.add_argument("--tag", action="append", help="Repeatable tag to apply to stored items.")
     p_crawl.set_defaults(func=cmd_crawl)

@@ -923,6 +983,25 @@ def build_parser() -> argparse.ArgumentParser:
     )
     p_analyze_topics.set_defaults(func=cmd_analyze_topics)

+    p_analyze_profile = analyze_sub.add_parser("profile", help="Run profiling analysis.")
+    _add_common_corpus_arg(p_analyze_profile)
+    p_analyze_profile.add_argument(
+        "--recipe",
+        default=None,
+        help="Optional profiling recipe YAML file.",
+    )
+    p_analyze_profile.add_argument(
+        "--recipe-name",
+        default="default",
+        help="Human-readable recipe name.",
+    )
+    p_analyze_profile.add_argument(
+        "--extraction-run",
+        default=None,
+        help="Extraction run reference in the form extractor_id:run_id.",
+    )
+    p_analyze_profile.set_defaults(func=cmd_analyze_profile)
+
     return parser


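Taken together, the new cmd_analyze_profile handler and the parser additions expose profiling through the existing analyze command group. A minimal sketch of driving it programmatically follows; it assumes the group is registered under the name "analyze" and that the package imports as biblicus, and the recipe path and extraction run reference below are placeholders rather than values from this release.

from biblicus.cli import build_parser

# Parse an "analyze profile" invocation and dispatch it the way the entry point would.
parser = build_parser()
arguments = parser.parse_args(
    ["analyze", "profile", "--recipe", "profiling.yaml", "--extraction-run", "markitdown:run-1"]
)
exit_code = arguments.func(arguments)  # dispatches to cmd_analyze_profile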
biblicus/corpus.py
CHANGED
@@ -622,7 +622,9 @@ class Corpus:
         data = json.loads(manifest_path.read_text(encoding="utf-8"))
         return ExtractionRunManifest.model_validate(data)

-    def list_extraction_runs(
+    def list_extraction_runs(
+        self, *, extractor_id: Optional[str] = None
+    ) -> List[ExtractionRunListEntry]:
         """
         List extraction runs stored under the corpus.

@@ -669,7 +671,9 @@ class Corpus:
                 )
             )

-        entries.sort(
+        entries.sort(
+            key=lambda entry: (entry.created_at, entry.extractor_id, entry.run_id), reverse=True
+        )
         return entries

     def latest_extraction_run_reference(
@@ -1366,7 +1370,9 @@ class Corpus:
         """
         _ = filename
         item_id = str(uuid.uuid4())
-        destination_relpath = str(
+        destination_relpath = str(
+            Path(DEFAULT_RAW_DIR) / "imports" / "crawl" / crawl_id / relative_path
+        )
         destination_path = (self.root / destination_relpath).resolve()
         destination_path.parent.mkdir(parents=True, exist_ok=True)
         destination_path.write_bytes(data)
biblicus/evidence_processing.py
CHANGED
@@ -99,7 +99,10 @@ class EvidenceRerankLongestText(EvidenceReranker):
         """
         return sorted(
             evidence,
-            key=lambda evidence_item: (
+            key=lambda evidence_item: (
+                -len((evidence_item.text or "").strip()),
+                evidence_item.item_id,
+            ),
         )

@@ -198,4 +201,3 @@ def apply_evidence_filter(
     """
     evidence_filter = _EVIDENCE_FILTERS[filter_id]
     return evidence_filter.filter(query_text=query_text, evidence=evidence, config=config)
-
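The reranker now breaks length ties on item_id, so evidence items with equally long snippets come back in a stable order across runs. A tiny illustration of the same sort key applied to stand-in tuples rather than Evidence objects:

# Stand-in records of (item_id, text); the real reranker sorts Evidence models.
evidence = [("b", "same length"), ("a", "same length"), ("c", "a longer snippet here")]
ranked = sorted(evidence, key=lambda pair: (-len(pair[1].strip()), pair[0]))
print([item_id for item_id, _ in ranked])  # ['c', 'a', 'b']: longest first, ties broken by item_id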
biblicus/extraction.py
CHANGED
@@ -345,7 +345,9 @@ def build_extraction_run(
     manifest = create_extraction_run_manifest(corpus, recipe=recipe)
     run_dir = corpus.extraction_run_dir(extractor_id=extractor_id, run_id=manifest.run_id)
     if run_dir.exists():
-        return corpus.load_extraction_run_manifest(
+        return corpus.load_extraction_run_manifest(
+            extractor_id=extractor_id, run_id=manifest.run_id
+        )
     run_dir.mkdir(parents=True, exist_ok=False)

     catalog = corpus.load_catalog()
biblicus/extractors/paddleocr_vl_text.py
CHANGED
@@ -152,9 +152,7 @@ class PaddleOcrVlExtractor(TextExtractor):
             parsed_config.backend.api_provider,
             config_override=parsed_config.backend.api_key,
         )
-        text, confidence = self._extract_via_api(
-            source_path, parsed_config, api_key
-        )
+        text, confidence = self._extract_via_api(source_path, parsed_config, api_key)

         return ExtractedText(
             text=text,
biblicus/models.py
CHANGED
@@ -263,6 +263,8 @@ class Evidence(BaseModel):
     :vartype span_end: int or None
     :ivar stage: Retrieval stage label (for example, scan, full-text search, rerank).
     :vartype stage: str
+    :ivar stage_scores: Optional per-stage scores for multi-stage retrieval.
+    :vartype stage_scores: dict[str, float] or None
     :ivar recipe_id: Recipe identifier used to create the run.
     :vartype recipe_id: str
     :ivar run_id: Retrieval run identifier.
@@ -283,6 +285,7 @@ class Evidence(BaseModel):
     span_start: Optional[int] = None
     span_end: Optional[int] = None
     stage: str
+    stage_scores: Optional[Dict[str, float]] = None
     recipe_id: str
     run_id: str
     hash: Optional[str] = None
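The new stage_scores field gives multi-stage pipelines (for example the hybrid backend added in this release) a place to keep each stage's raw score alongside the fused score. A hedged sketch of attaching it, using the same pydantic model_copy pattern that appears in vector.py; the stage names and numbers are illustrative, not values produced by the library:

# evidence_item is an existing Evidence instance; stage names and values are made up.
fused_evidence = evidence_item.model_copy(
    update={
        "score": 0.73,
        "stage_scores": {"vector": 0.55, "full_text_search": 4.2},
    }
)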
|