biblicus 0.10.0__py3-none-any.whl → 0.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +1 -1
- biblicus/analysis/profiling.py +1 -1
- biblicus/backends/__init__.py +4 -0
- biblicus/backends/hybrid.py +284 -0
- biblicus/backends/sqlite_full_text_search.py +264 -18
- biblicus/backends/vector.py +460 -0
- biblicus/cli.py +30 -1
- biblicus/context.py +138 -4
- biblicus/models.py +3 -0
- {biblicus-0.10.0.dist-info → biblicus-0.12.0.dist-info}/METADATA +7 -1
- {biblicus-0.10.0.dist-info → biblicus-0.12.0.dist-info}/RECORD +15 -13
- {biblicus-0.10.0.dist-info → biblicus-0.12.0.dist-info}/WHEEL +0 -0
- {biblicus-0.10.0.dist-info → biblicus-0.12.0.dist-info}/entry_points.txt +0 -0
- {biblicus-0.10.0.dist-info → biblicus-0.12.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-0.10.0.dist-info → biblicus-0.12.0.dist-info}/top_level.txt +0 -0
biblicus/context.py
CHANGED
|
@@ -8,11 +8,11 @@ stable contract while context formatting remains an explicit policy surface.
|
|
|
8
8
|
|
|
9
9
|
from __future__ import annotations
|
|
10
10
|
|
|
11
|
-
from typing import List, Optional
|
|
11
|
+
from typing import Dict, List, Literal, Optional
|
|
12
12
|
|
|
13
13
|
from pydantic import BaseModel, ConfigDict, Field
|
|
14
14
|
|
|
15
|
-
from .models import RetrievalResult
|
|
15
|
+
from .models import Evidence, RetrievalResult
|
|
16
16
|
|
|
17
17
|
|
|
18
18
|
class ContextPackPolicy(BaseModel):
|
|
@@ -21,11 +21,17 @@ class ContextPackPolicy(BaseModel):
|
|
|
21
21
|
|
|
22
22
|
:ivar join_with: Separator inserted between evidence text blocks.
|
|
23
23
|
:vartype join_with: str
|
|
24
|
+
:ivar ordering: Evidence ordering policy (rank, score, or source).
|
|
25
|
+
:vartype ordering: str
|
|
26
|
+
:ivar include_metadata: Whether to include evidence metadata lines in each block.
|
|
27
|
+
:vartype include_metadata: bool
|
|
24
28
|
"""
|
|
25
29
|
|
|
26
30
|
model_config = ConfigDict(extra="forbid")
|
|
27
31
|
|
|
28
32
|
join_with: str = Field(default="\n\n")
|
|
33
|
+
ordering: Literal["rank", "score", "source"] = Field(default="rank")
|
|
34
|
+
include_metadata: bool = Field(default=False)
|
|
29
35
|
|
|
30
36
|
|
|
31
37
|
class ContextPack(BaseModel):
|
|
@@ -55,12 +61,15 @@ class ContextPackBlock(BaseModel):
|
|
|
55
61
|
:vartype evidence_item_id: str
|
|
56
62
|
:ivar text: Text included in this block.
|
|
57
63
|
:vartype text: str
|
|
64
|
+
:ivar metadata: Optional metadata included with the block.
|
|
65
|
+
:vartype metadata: dict[str, object] or None
|
|
58
66
|
"""
|
|
59
67
|
|
|
60
68
|
model_config = ConfigDict(extra="forbid")
|
|
61
69
|
|
|
62
70
|
evidence_item_id: str = Field(min_length=1)
|
|
63
71
|
text: str = Field(min_length=1)
|
|
72
|
+
metadata: Optional[Dict[str, object]] = None
|
|
64
73
|
|
|
65
74
|
|
|
66
75
|
class TokenCounter(BaseModel):
|
|
@@ -92,6 +101,19 @@ class TokenBudget(BaseModel):
|
|
|
92
101
|
max_tokens: int = Field(ge=1)
|
|
93
102
|
|
|
94
103
|
|
|
104
|
+
class CharacterBudget(BaseModel):
    """
    Upper bound on the size of a context pack, measured in characters.

    :ivar max_characters: Largest number of characters allowed in the final context pack text
        (must be at least 1).
    :vartype max_characters: int
    """

    model_config = ConfigDict(extra="forbid")

    max_characters: int = Field(ge=1)
|
|
115
|
+
|
|
116
|
+
|
|
95
117
|
def build_context_pack(result: RetrievalResult, *, policy: ContextPackPolicy) -> ContextPack:
|
|
96
118
|
"""
|
|
97
119
|
Build a context pack from a retrieval result using an explicit policy.
|
|
@@ -104,14 +126,20 @@ def build_context_pack(result: RetrievalResult, *, policy: ContextPackPolicy) ->
|
|
|
104
126
|
:rtype: ContextPack
|
|
105
127
|
"""
|
|
106
128
|
selected_blocks: List[ContextPackBlock] = []
|
|
107
|
-
for evidence in result.evidence:
|
|
129
|
+
for evidence in _order_evidence(result.evidence, policy=policy):
|
|
108
130
|
if not isinstance(evidence.text, str):
|
|
109
131
|
continue
|
|
110
132
|
trimmed_text = evidence.text.strip()
|
|
111
133
|
if not trimmed_text:
|
|
112
134
|
continue
|
|
135
|
+
metadata = _metadata_for_evidence(evidence) if policy.include_metadata else None
|
|
136
|
+
block_text = _format_block_text(trimmed_text, metadata=metadata)
|
|
113
137
|
selected_blocks.append(
|
|
114
|
-
ContextPackBlock(
|
|
138
|
+
ContextPackBlock(
|
|
139
|
+
evidence_item_id=evidence.item_id,
|
|
140
|
+
text=block_text,
|
|
141
|
+
metadata=metadata,
|
|
142
|
+
)
|
|
115
143
|
)
|
|
116
144
|
|
|
117
145
|
return ContextPack(
|
|
@@ -181,3 +209,109 @@ def fit_context_pack_to_token_budget(
|
|
|
181
209
|
remaining_blocks = remaining_blocks[:-1]
|
|
182
210
|
|
|
183
211
|
return ContextPack(text="", evidence_count=0, blocks=[])
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def fit_context_pack_to_character_budget(
    context_pack: ContextPack,
    *,
    policy: ContextPackPolicy,
    character_budget: CharacterBudget,
) -> ContextPack:
    """
    Fit a context pack to a character budget by dropping trailing blocks.

    The longest leading run of blocks whose joined text (including the policy
    separator between blocks) fits within the budget is kept. Block text is
    never truncated; a block is either kept whole or dropped. When not even
    the first block fits, an empty context pack is returned.

    :param context_pack: Context pack to fit.
    :type context_pack: ContextPack
    :param policy: Policy controlling how blocks are joined into text.
    :type policy: ContextPackPolicy
    :param character_budget: Character budget to enforce.
    :type character_budget: CharacterBudget
    :return: Fitted context pack.
    :rtype: ContextPack
    """
    all_blocks: List[ContextPackBlock] = list(context_pack.blocks)
    separator = policy.join_with
    separator_length = len(separator)
    max_characters = character_budget.max_characters

    # The joined length never shrinks as blocks are appended, so a single
    # forward scan over block lengths finds the same cut-off as repeatedly
    # re-joining the text and dropping the last block, without rebuilding
    # the full string on every step.
    kept_count = 0
    running_length = 0
    for index, block in enumerate(all_blocks):
        # A separator is only inserted between blocks, never before the first.
        added_length = len(block.text) + (separator_length if index else 0)
        if running_length + added_length > max_characters:
            break
        running_length += added_length
        kept_count = index + 1

    if kept_count == 0:
        return ContextPack(text="", evidence_count=0, blocks=[])

    kept_blocks = all_blocks[:kept_count]
    return ContextPack(
        text=separator.join([block.text for block in kept_blocks]),
        evidence_count=len(kept_blocks),
        blocks=kept_blocks,
    )
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
def _order_evidence(
    evidence: List[Evidence],
    *,
    policy: ContextPackPolicy,
) -> List[Evidence]:
    """
    Return a sorted copy of the evidence list as dictated by the policy ordering.

    ``rank`` sorts by ascending rank, ``score`` by descending score, and
    ``source`` by source URI (falling back to the item identifier when the
    source URI is empty) and then by descending score. The item identifier is
    always the final tie-breaker. The input list is left untouched.

    :param evidence: Evidence list to order.
    :type evidence: list[Evidence]
    :param policy: Context pack policy.
    :type policy: ContextPackPolicy
    :return: Ordered evidence list.
    :rtype: list[Evidence]
    :raises ValueError: If the policy ordering is not rank, score, or source.
    """
    mode = policy.ordering

    if mode == "rank":

        def sort_key(entry):
            return (entry.rank, entry.item_id)

    elif mode == "score":

        def sort_key(entry):
            return (-entry.score, entry.item_id)

    elif mode == "source":

        def sort_key(entry):
            return (
                entry.source_uri or entry.item_id,
                -entry.score,
                entry.item_id,
            )

    else:
        raise ValueError(f"Unknown context pack ordering: {policy.ordering}")

    ordered = list(evidence)
    ordered.sort(key=sort_key)
    return ordered
|
|
277
|
+
|
|
278
|
+
|
|
279
|
+
def _metadata_for_evidence(evidence: Evidence) -> Dict[str, object]:
    """
    Collect the descriptive fields of an evidence item for a context pack block.

    The mapping carries the item identifier, source URI, score, and stage, in
    that order. An empty or missing source URI is reported as the string
    ``"none"``.

    :param evidence: Evidence item to describe.
    :type evidence: Evidence
    :return: Metadata mapping.
    :rtype: dict[str, object]
    """
    source_uri = evidence.source_uri if evidence.source_uri else "none"
    metadata: Dict[str, object] = dict(
        item_id=evidence.item_id,
        source_uri=source_uri,
        score=evidence.score,
        stage=evidence.stage,
    )
    return metadata
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
def _format_block_text(text: str, *, metadata: Optional[Dict[str, object]]) -> str:
    """
    Format a context pack block text with optional metadata.

    When metadata is provided, one ``key: value`` line is emitted per entry,
    in the mapping's own order, followed by the evidence text on a new line.
    Rendering every entry keeps the header aligned with whatever mapping the
    caller supplies instead of repeating a fixed key list here.

    :param text: Evidence text.
    :type text: str
    :param metadata: Optional metadata mapping.
    :type metadata: dict[str, object] or None
    :return: Formatted block text.
    :rtype: str
    """
    # None and an empty mapping both mean "no header": return the text as-is.
    if not metadata:
        return text
    metadata_lines = "\n".join(f"{key}: {value}" for key, value in metadata.items())
    return f"{metadata_lines}\n{text}"
|
biblicus/models.py
CHANGED
|
@@ -263,6 +263,8 @@ class Evidence(BaseModel):
|
|
|
263
263
|
:vartype span_end: int or None
|
|
264
264
|
:ivar stage: Retrieval stage label (for example, scan, full-text search, rerank).
|
|
265
265
|
:vartype stage: str
|
|
266
|
+
:ivar stage_scores: Optional per-stage scores for multi-stage retrieval.
|
|
267
|
+
:vartype stage_scores: dict[str, float] or None
|
|
266
268
|
:ivar recipe_id: Recipe identifier used to create the run.
|
|
267
269
|
:vartype recipe_id: str
|
|
268
270
|
:ivar run_id: Retrieval run identifier.
|
|
@@ -283,6 +285,7 @@ class Evidence(BaseModel):
|
|
|
283
285
|
span_start: Optional[int] = None
|
|
284
286
|
span_end: Optional[int] = None
|
|
285
287
|
stage: str
|
|
288
|
+
stage_scores: Optional[Dict[str, float]] = None
|
|
286
289
|
recipe_id: str
|
|
287
290
|
run_id: str
|
|
288
291
|
hash: Optional[str] = None
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: biblicus
|
|
3
|
-
Version: 0.10.0
|
|
3
|
+
Version: 0.12.0
|
|
4
4
|
Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
|
|
5
5
|
License: MIT
|
|
6
6
|
Requires-Python: >=3.9
|
|
@@ -493,6 +493,12 @@ Two backends are included.
|
|
|
493
493
|
|
|
494
494
|
For detailed documentation including configuration options, performance characteristics, and usage examples, see the [Backend Reference][backend-reference].
|
|
495
495
|
|
|
496
|
+
## Retrieval documentation
|
|
497
|
+
|
|
498
|
+
For the retrieval pipeline overview and run artifacts, see `docs/RETRIEVAL.md`. For retrieval quality upgrades
|
|
499
|
+
(tuned lexical baseline, reranking, hybrid retrieval), see `docs/RETRIEVAL_QUALITY.md`. For evaluation workflows
|
|
500
|
+
and dataset formats, see `docs/RETRIEVAL_EVALUATION.md`.
|
|
501
|
+
|
|
496
502
|
## Extraction backends
|
|
497
503
|
|
|
498
504
|
These extractors are built in. Optional ones require extra dependencies. See [text extraction documentation][text-extraction] for details.
|
|
@@ -1,8 +1,8 @@
|
|
|
1
|
-
biblicus/__init__.py,sha256=
|
|
1
|
+
biblicus/__init__.py,sha256=okAXmTSud_hQzaGEURDqX95I66SlcvTERCrWbUZA5ko,496
|
|
2
2
|
biblicus/__main__.py,sha256=ipfkUoTlocVnrQDM69C7TeBqQxmHVeiWMRaT3G9rtnk,117
|
|
3
|
-
biblicus/cli.py,sha256=
|
|
3
|
+
biblicus/cli.py,sha256=bZV-ZxeWskRL4CFCGzyVpcaFC8KOb0xmxx3bnMqP-1I,36118
|
|
4
4
|
biblicus/constants.py,sha256=-JaHI3Dngte2drawx93cGWxFVobbgIuaVhmjUJpf4GI,333
|
|
5
|
-
biblicus/context.py,sha256=
|
|
5
|
+
biblicus/context.py,sha256=U7qkOwMdqNgYnqaC9hgQY0kv0R-6qcjV6bhXQl2WUkE,10215
|
|
6
6
|
biblicus/corpus.py,sha256=qSDnYJXhWlF2p_BbFLl6xtI53lIIPxwyKLLGLC432Sg,55612
|
|
7
7
|
biblicus/crawl.py,sha256=n8rXBMnziBK9vtKQQCXYOpBzqsPCswj2PzVJUb370KY,6250
|
|
8
8
|
biblicus/errors.py,sha256=uMajd5DvgnJ_-jq5sbeom1GV8DPUc-kojBaECFi6CsY,467
|
|
@@ -16,7 +16,7 @@ biblicus/hooks.py,sha256=OHQOmOi7rUcQqYWVeod4oPe8nVLepD7F_SlN7O_-BsE,7863
|
|
|
16
16
|
biblicus/ignore.py,sha256=fyjt34E6tWNNrm1FseOhgH2MgryyVBQVzxhKL5s4aio,1800
|
|
17
17
|
biblicus/inference.py,sha256=_k00AIPoXD2lruiTB-JUagtY4f_WKcdzA3axwiq1tck,3512
|
|
18
18
|
biblicus/knowledge_base.py,sha256=JmlJw8WD_fgstuq1PyWVzU9kzvVzyv7_xOvhS70xwUw,6654
|
|
19
|
-
biblicus/models.py,sha256=
|
|
19
|
+
biblicus/models.py,sha256=r28O6cg3d1bjJnKqpLieVLTgtXTfzb_60wMORvVuDN0,15846
|
|
20
20
|
biblicus/retrieval.py,sha256=A1SI4WK5cX-WbtN6FJ0QQxqlEOtQhddLrL0LZIuoTC4,4180
|
|
21
21
|
biblicus/sources.py,sha256=EFy8-rQNLsyzz-98mH-z8gEHMYbqigcNFKLaR92KfDE,7241
|
|
22
22
|
biblicus/time.py,sha256=3BSKOSo7R10K-0Dzrbdtl3fh5_yShTYqfdlKvvdkx7M,485
|
|
@@ -30,13 +30,15 @@ biblicus/analysis/__init__.py,sha256=Z4Wb4d-EoUuGHkcfRm9ILuZ8vr9FBqRxC0u1i6Fp_0w
|
|
|
30
30
|
biblicus/analysis/base.py,sha256=gB4ilvyMpiWU1m_ydy2dIHGP96ZFIFvVUL9iVDZKPJM,1265
|
|
31
31
|
biblicus/analysis/llm.py,sha256=VjkZDKauHCDfj-TP-bTbI6a9WAXEIDe8bEiwErPx9xc,3309
|
|
32
32
|
biblicus/analysis/models.py,sha256=LuR52w27JRzV-Mr-WAOduZrBOCTrp5uYkMc46QHTRrI,27300
|
|
33
|
-
biblicus/analysis/profiling.py,sha256=
|
|
33
|
+
biblicus/analysis/profiling.py,sha256=v2B4Tn9WiXRRP_wIADBPRQVKkMc92KXCas7OBa7n0LU,10670
|
|
34
34
|
biblicus/analysis/schema.py,sha256=MCiAQJmijVk8iM8rOUYbzyaDwsMR-Oo86iZU5NCbDMM,435
|
|
35
35
|
biblicus/analysis/topic_modeling.py,sha256=ZGXvm2MyU6plxz2FE1RQU-3bra6QZ-t8EJj8kG1TW0M,19438
|
|
36
|
-
biblicus/backends/__init__.py,sha256=
|
|
36
|
+
biblicus/backends/__init__.py,sha256=3HJY0oMm8pFFVGC4Z-dlPRHhIPVDdUzsa4IMjKP_9dI,1378
|
|
37
37
|
biblicus/backends/base.py,sha256=Erfj9dXg0nkRKnEcNjHR9_0Ddb2B1NvbmRksVm_g1dU,1776
|
|
38
|
+
biblicus/backends/hybrid.py,sha256=CXh6QrlE0RsTJjSlZRdtomLlILfkglBDQG3YVa8RpFU,10589
|
|
38
39
|
biblicus/backends/scan.py,sha256=hdNnQWqi5IH6j95w30BZHxLJ0W9PTaOkqfWJuxCCEMI,12478
|
|
39
|
-
biblicus/backends/sqlite_full_text_search.py,sha256=
|
|
40
|
+
biblicus/backends/sqlite_full_text_search.py,sha256=VAn4fDdfiaS1Rn6zHlYz3E10_3vMU9P94QU8cL0l8Mk,24466
|
|
41
|
+
biblicus/backends/vector.py,sha256=3RdxSBPb1kOX4Sfd4d1qXFW9ecuiRvGpOHadLCbeh1g,15183
|
|
40
42
|
biblicus/extractors/__init__.py,sha256=ci3oldbdQZ8meAfHccM48CqQtZsPSRg3HkPrBSZF15M,2673
|
|
41
43
|
biblicus/extractors/base.py,sha256=ka-nz_1zHPr4TS9sU4JfOoY-PJh7lbHPBOEBrbQFGSc,2171
|
|
42
44
|
biblicus/extractors/deepgram_stt.py,sha256=VI71i4lbE-EFHcvpNcCPRpT8z7A5IuaSrT1UaPyZ8UY,6323
|
|
@@ -55,9 +57,9 @@ biblicus/extractors/select_override.py,sha256=gSpffFmn1ux9pGtFvHD5Uu_LO8TmmJC4L_
|
|
|
55
57
|
biblicus/extractors/select_smart_override.py,sha256=-sLMnNoeXbCB3dO9zflQq324eHuLbd6hpveSwduXP-U,6763
|
|
56
58
|
biblicus/extractors/select_text.py,sha256=w0ATmDy3tWWbOObzW87jGZuHbgXllUhotX5XyySLs-o,3395
|
|
57
59
|
biblicus/extractors/unstructured_text.py,sha256=l2S_wD_htu7ZHoJQNQtP-kGlEgOeKV_w2IzAC93lePE,3564
|
|
58
|
-
biblicus-0.
|
|
59
|
-
biblicus-0.
|
|
60
|
-
biblicus-0.
|
|
61
|
-
biblicus-0.
|
|
62
|
-
biblicus-0.
|
|
63
|
-
biblicus-0.
|
|
60
|
+
biblicus-0.12.0.dist-info/licenses/LICENSE,sha256=lw44GXFG_Q0fS8m5VoEvv_xtdBXK26pBcbSPUCXee_Q,1078
|
|
61
|
+
biblicus-0.12.0.dist-info/METADATA,sha256=fhWcCcczfuLn2mZ_Moqe2zMKJ1-Q7KxZtR_x9YaiFO8,27765
|
|
62
|
+
biblicus-0.12.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
63
|
+
biblicus-0.12.0.dist-info/entry_points.txt,sha256=BZmO4H8Uz00fyi1RAFryOCGfZgX7eHWkY2NE-G54U5A,47
|
|
64
|
+
biblicus-0.12.0.dist-info/top_level.txt,sha256=sUD_XVZwDxZ29-FBv1MknTGh4mgDXznGuP28KJY_WKc,9
|
|
65
|
+
biblicus-0.12.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|