biblicus 0.10.0__py3-none-any.whl → 0.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
biblicus/context.py CHANGED
@@ -8,11 +8,11 @@ stable contract while context formatting remains an explicit policy surface.
8
8
 
9
9
  from __future__ import annotations
10
10
 
11
- from typing import List, Optional
11
+ from typing import Dict, List, Literal, Optional
12
12
 
13
13
  from pydantic import BaseModel, ConfigDict, Field
14
14
 
15
- from .models import RetrievalResult
15
+ from .models import Evidence, RetrievalResult
16
16
 
17
17
 
18
18
  class ContextPackPolicy(BaseModel):
@@ -21,11 +21,17 @@ class ContextPackPolicy(BaseModel):
21
21
 
22
22
  :ivar join_with: Separator inserted between evidence text blocks.
23
23
  :vartype join_with: str
24
+ :ivar ordering: Evidence ordering policy (rank, score, or source).
25
+ :vartype ordering: str
26
+ :ivar include_metadata: Whether to include evidence metadata lines in each block.
27
+ :vartype include_metadata: bool
24
28
  """
25
29
 
26
30
  model_config = ConfigDict(extra="forbid")
27
31
 
28
32
  join_with: str = Field(default="\n\n")
33
+ ordering: Literal["rank", "score", "source"] = Field(default="rank")
34
+ include_metadata: bool = Field(default=False)
29
35
 
30
36
 
31
37
  class ContextPack(BaseModel):
@@ -55,12 +61,15 @@ class ContextPackBlock(BaseModel):
55
61
  :vartype evidence_item_id: str
56
62
  :ivar text: Text included in this block.
57
63
  :vartype text: str
64
+ :ivar metadata: Optional metadata included with the block.
65
+ :vartype metadata: dict[str, object] or None
58
66
  """
59
67
 
60
68
  model_config = ConfigDict(extra="forbid")
61
69
 
62
70
  evidence_item_id: str = Field(min_length=1)
63
71
  text: str = Field(min_length=1)
72
+ metadata: Optional[Dict[str, object]] = None
64
73
 
65
74
 
66
75
  class TokenCounter(BaseModel):
@@ -92,6 +101,19 @@ class TokenBudget(BaseModel):
92
101
  max_tokens: int = Field(ge=1)
93
102
 
94
103
 
104
+ class CharacterBudget(BaseModel):
105
+ """
106
+ Character budget for a context pack.
107
+
108
+ :ivar max_characters: Maximum characters permitted for the final context pack text.
109
+ :vartype max_characters: int
110
+ """
111
+
112
+ model_config = ConfigDict(extra="forbid")
113
+
114
+ max_characters: int = Field(ge=1)
115
+
116
+
95
117
  def build_context_pack(result: RetrievalResult, *, policy: ContextPackPolicy) -> ContextPack:
96
118
  """
97
119
  Build a context pack from a retrieval result using an explicit policy.
@@ -104,14 +126,20 @@ def build_context_pack(result: RetrievalResult, *, policy: ContextPackPolicy) ->
104
126
  :rtype: ContextPack
105
127
  """
106
128
  selected_blocks: List[ContextPackBlock] = []
107
- for evidence in result.evidence:
129
+ for evidence in _order_evidence(result.evidence, policy=policy):
108
130
  if not isinstance(evidence.text, str):
109
131
  continue
110
132
  trimmed_text = evidence.text.strip()
111
133
  if not trimmed_text:
112
134
  continue
135
+ metadata = _metadata_for_evidence(evidence) if policy.include_metadata else None
136
+ block_text = _format_block_text(trimmed_text, metadata=metadata)
113
137
  selected_blocks.append(
114
- ContextPackBlock(evidence_item_id=evidence.item_id, text=trimmed_text)
138
+ ContextPackBlock(
139
+ evidence_item_id=evidence.item_id,
140
+ text=block_text,
141
+ metadata=metadata,
142
+ )
115
143
  )
116
144
 
117
145
  return ContextPack(
@@ -181,3 +209,109 @@ def fit_context_pack_to_token_budget(
181
209
  remaining_blocks = remaining_blocks[:-1]
182
210
 
183
211
  return ContextPack(text="", evidence_count=0, blocks=[])
212
+
213
+
214
+ def fit_context_pack_to_character_budget(
215
+ context_pack: ContextPack,
216
+ *,
217
+ policy: ContextPackPolicy,
218
+ character_budget: CharacterBudget,
219
+ ) -> ContextPack:
220
+ """
221
+ Fit a context pack to a character budget by dropping trailing blocks.
222
+
223
+ :param context_pack: Context pack to fit.
224
+ :type context_pack: ContextPack
225
+ :param policy: Policy controlling how blocks are joined into text.
226
+ :type policy: ContextPackPolicy
227
+ :param character_budget: Character budget to enforce.
228
+ :type character_budget: CharacterBudget
229
+ :return: Fitted context pack.
230
+ :rtype: ContextPack
231
+ """
232
+ remaining_blocks: List[ContextPackBlock] = list(context_pack.blocks)
233
+ max_characters = character_budget.max_characters
234
+
235
+ while remaining_blocks:
236
+ candidate_text = policy.join_with.join([block.text for block in remaining_blocks])
237
+ if len(candidate_text) <= max_characters:
238
+ return ContextPack(
239
+ text=candidate_text,
240
+ evidence_count=len(remaining_blocks),
241
+ blocks=remaining_blocks,
242
+ )
243
+ remaining_blocks = remaining_blocks[:-1]
244
+
245
+ return ContextPack(text="", evidence_count=0, blocks=[])
246
+
247
+
248
+ def _order_evidence(
249
+ evidence: List[Evidence],
250
+ *,
251
+ policy: ContextPackPolicy,
252
+ ) -> List[Evidence]:
253
+ """
254
+ Order evidence items according to the context pack policy.
255
+
256
+ :param evidence: Evidence list to order.
257
+ :type evidence: list[Evidence]
258
+ :param policy: Context pack policy.
259
+ :type policy: ContextPackPolicy
260
+ :return: Ordered evidence list.
261
+ :rtype: list[Evidence]
262
+ """
263
+ if policy.ordering == "rank":
264
+ return sorted(evidence, key=lambda item: (item.rank, item.item_id))
265
+ if policy.ordering == "score":
266
+ return sorted(evidence, key=lambda item: (-item.score, item.item_id))
267
+ if policy.ordering == "source":
268
+ return sorted(
269
+ evidence,
270
+ key=lambda item: (
271
+ item.source_uri or item.item_id,
272
+ -item.score,
273
+ item.item_id,
274
+ ),
275
+ )
276
+ raise ValueError(f"Unknown context pack ordering: {policy.ordering}")
277
+
278
+
279
+ def _metadata_for_evidence(evidence: Evidence) -> Dict[str, object]:
280
+ """
281
+ Build metadata for a context pack block.
282
+
283
+ :param evidence: Evidence item to describe.
284
+ :type evidence: Evidence
285
+ :return: Metadata mapping.
286
+ :rtype: dict[str, object]
287
+ """
288
+ return {
289
+ "item_id": evidence.item_id,
290
+ "source_uri": evidence.source_uri or "none",
291
+ "score": evidence.score,
292
+ "stage": evidence.stage,
293
+ }
294
+
295
+
296
+ def _format_block_text(text: str, *, metadata: Optional[Dict[str, object]]) -> str:
297
+ """
298
+ Format a context pack block text with optional metadata.
299
+
300
+ :param text: Evidence text.
301
+ :type text: str
302
+ :param metadata: Optional metadata mapping.
303
+ :type metadata: dict[str, object] or None
304
+ :return: Formatted block text.
305
+ :rtype: str
306
+ """
307
+ if not metadata:
308
+ return text
309
+ metadata_lines = "\n".join(
310
+ [
311
+ f"item_id: {metadata['item_id']}",
312
+ f"source_uri: {metadata['source_uri']}",
313
+ f"score: {metadata['score']}",
314
+ f"stage: {metadata['stage']}",
315
+ ]
316
+ )
317
+ return f"{metadata_lines}\n{text}"
biblicus/models.py CHANGED
@@ -263,6 +263,8 @@ class Evidence(BaseModel):
263
263
  :vartype span_end: int or None
264
264
  :ivar stage: Retrieval stage label (for example, scan, full-text search, rerank).
265
265
  :vartype stage: str
266
+ :ivar stage_scores: Optional per-stage scores for multi-stage retrieval.
267
+ :vartype stage_scores: dict[str, float] or None
266
268
  :ivar recipe_id: Recipe identifier used to create the run.
267
269
  :vartype recipe_id: str
268
270
  :ivar run_id: Retrieval run identifier.
@@ -283,6 +285,7 @@ class Evidence(BaseModel):
283
285
  span_start: Optional[int] = None
284
286
  span_end: Optional[int] = None
285
287
  stage: str
288
+ stage_scores: Optional[Dict[str, float]] = None
286
289
  recipe_id: str
287
290
  run_id: str
288
291
  hash: Optional[str] = None
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: biblicus
3
- Version: 0.10.0
3
+ Version: 0.12.0
4
4
  Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
5
5
  License: MIT
6
6
  Requires-Python: >=3.9
@@ -493,6 +493,12 @@ Two backends are included.
493
493
 
494
494
  For detailed documentation including configuration options, performance characteristics, and usage examples, see the [Backend Reference][backend-reference].
495
495
 
496
+ ## Retrieval documentation
497
+
498
+ For the retrieval pipeline overview and run artifacts, see `docs/RETRIEVAL.md`. For retrieval quality upgrades
499
+ (tuned lexical baseline, reranking, hybrid retrieval), see `docs/RETRIEVAL_QUALITY.md`. For evaluation workflows
500
+ and dataset formats, see `docs/RETRIEVAL_EVALUATION.md`.
501
+
496
502
  ## Extraction backends
497
503
 
498
504
  These extractors are built in. Optional ones require extra dependencies. See [text extraction documentation][text-extraction] for details.
@@ -1,8 +1,8 @@
1
- biblicus/__init__.py,sha256=BejOPHIlCnT74pu9fNuLm14HsmWjGqCIwpfD9hDOqSo,496
1
+ biblicus/__init__.py,sha256=okAXmTSud_hQzaGEURDqX95I66SlcvTERCrWbUZA5ko,496
2
2
  biblicus/__main__.py,sha256=ipfkUoTlocVnrQDM69C7TeBqQxmHVeiWMRaT3G9rtnk,117
3
- biblicus/cli.py,sha256=aH3plnednnYgcPnSoYQf200nboKc6N-tuc3FuLPQEcU,35132
3
+ biblicus/cli.py,sha256=bZV-ZxeWskRL4CFCGzyVpcaFC8KOb0xmxx3bnMqP-1I,36118
4
4
  biblicus/constants.py,sha256=-JaHI3Dngte2drawx93cGWxFVobbgIuaVhmjUJpf4GI,333
5
- biblicus/context.py,sha256=qnT9CH7_ldoPcg-rxnUOtRhheOmpDAbF8uqhf8OdjC4,5832
5
+ biblicus/context.py,sha256=U7qkOwMdqNgYnqaC9hgQY0kv0R-6qcjV6bhXQl2WUkE,10215
6
6
  biblicus/corpus.py,sha256=qSDnYJXhWlF2p_BbFLl6xtI53lIIPxwyKLLGLC432Sg,55612
7
7
  biblicus/crawl.py,sha256=n8rXBMnziBK9vtKQQCXYOpBzqsPCswj2PzVJUb370KY,6250
8
8
  biblicus/errors.py,sha256=uMajd5DvgnJ_-jq5sbeom1GV8DPUc-kojBaECFi6CsY,467
@@ -16,7 +16,7 @@ biblicus/hooks.py,sha256=OHQOmOi7rUcQqYWVeod4oPe8nVLepD7F_SlN7O_-BsE,7863
16
16
  biblicus/ignore.py,sha256=fyjt34E6tWNNrm1FseOhgH2MgryyVBQVzxhKL5s4aio,1800
17
17
  biblicus/inference.py,sha256=_k00AIPoXD2lruiTB-JUagtY4f_WKcdzA3axwiq1tck,3512
18
18
  biblicus/knowledge_base.py,sha256=JmlJw8WD_fgstuq1PyWVzU9kzvVzyv7_xOvhS70xwUw,6654
19
- biblicus/models.py,sha256=vlvPP7AOZGtnHSq47-s9YW-fqLwjgYR6NBcSfeC8YKk,15665
19
+ biblicus/models.py,sha256=r28O6cg3d1bjJnKqpLieVLTgtXTfzb_60wMORvVuDN0,15846
20
20
  biblicus/retrieval.py,sha256=A1SI4WK5cX-WbtN6FJ0QQxqlEOtQhddLrL0LZIuoTC4,4180
21
21
  biblicus/sources.py,sha256=EFy8-rQNLsyzz-98mH-z8gEHMYbqigcNFKLaR92KfDE,7241
22
22
  biblicus/time.py,sha256=3BSKOSo7R10K-0Dzrbdtl3fh5_yShTYqfdlKvvdkx7M,485
@@ -30,13 +30,15 @@ biblicus/analysis/__init__.py,sha256=Z4Wb4d-EoUuGHkcfRm9ILuZ8vr9FBqRxC0u1i6Fp_0w
30
30
  biblicus/analysis/base.py,sha256=gB4ilvyMpiWU1m_ydy2dIHGP96ZFIFvVUL9iVDZKPJM,1265
31
31
  biblicus/analysis/llm.py,sha256=VjkZDKauHCDfj-TP-bTbI6a9WAXEIDe8bEiwErPx9xc,3309
32
32
  biblicus/analysis/models.py,sha256=LuR52w27JRzV-Mr-WAOduZrBOCTrp5uYkMc46QHTRrI,27300
33
- biblicus/analysis/profiling.py,sha256=z4w14LVJrTEXcQ3PBNwwb_61KuuwQgXw4-EiAaxOQ4Y,10672
33
+ biblicus/analysis/profiling.py,sha256=v2B4Tn9WiXRRP_wIADBPRQVKkMc92KXCas7OBa7n0LU,10670
34
34
  biblicus/analysis/schema.py,sha256=MCiAQJmijVk8iM8rOUYbzyaDwsMR-Oo86iZU5NCbDMM,435
35
35
  biblicus/analysis/topic_modeling.py,sha256=ZGXvm2MyU6plxz2FE1RQU-3bra6QZ-t8EJj8kG1TW0M,19438
36
- biblicus/backends/__init__.py,sha256=wLXIumV51l6ZIKzjoKKeU7AgIxGOryG7T7ls3a_Fv98,1212
36
+ biblicus/backends/__init__.py,sha256=3HJY0oMm8pFFVGC4Z-dlPRHhIPVDdUzsa4IMjKP_9dI,1378
37
37
  biblicus/backends/base.py,sha256=Erfj9dXg0nkRKnEcNjHR9_0Ddb2B1NvbmRksVm_g1dU,1776
38
+ biblicus/backends/hybrid.py,sha256=CXh6QrlE0RsTJjSlZRdtomLlILfkglBDQG3YVa8RpFU,10589
38
39
  biblicus/backends/scan.py,sha256=hdNnQWqi5IH6j95w30BZHxLJ0W9PTaOkqfWJuxCCEMI,12478
39
- biblicus/backends/sqlite_full_text_search.py,sha256=XFuIbEHYWMD9JkjgRZcgYH3kP3b4hRnJ3PwP8rSFjUU,16502
40
+ biblicus/backends/sqlite_full_text_search.py,sha256=VAn4fDdfiaS1Rn6zHlYz3E10_3vMU9P94QU8cL0l8Mk,24466
41
+ biblicus/backends/vector.py,sha256=3RdxSBPb1kOX4Sfd4d1qXFW9ecuiRvGpOHadLCbeh1g,15183
40
42
  biblicus/extractors/__init__.py,sha256=ci3oldbdQZ8meAfHccM48CqQtZsPSRg3HkPrBSZF15M,2673
41
43
  biblicus/extractors/base.py,sha256=ka-nz_1zHPr4TS9sU4JfOoY-PJh7lbHPBOEBrbQFGSc,2171
42
44
  biblicus/extractors/deepgram_stt.py,sha256=VI71i4lbE-EFHcvpNcCPRpT8z7A5IuaSrT1UaPyZ8UY,6323
@@ -55,9 +57,9 @@ biblicus/extractors/select_override.py,sha256=gSpffFmn1ux9pGtFvHD5Uu_LO8TmmJC4L_
55
57
  biblicus/extractors/select_smart_override.py,sha256=-sLMnNoeXbCB3dO9zflQq324eHuLbd6hpveSwduXP-U,6763
56
58
  biblicus/extractors/select_text.py,sha256=w0ATmDy3tWWbOObzW87jGZuHbgXllUhotX5XyySLs-o,3395
57
59
  biblicus/extractors/unstructured_text.py,sha256=l2S_wD_htu7ZHoJQNQtP-kGlEgOeKV_w2IzAC93lePE,3564
58
- biblicus-0.10.0.dist-info/licenses/LICENSE,sha256=lw44GXFG_Q0fS8m5VoEvv_xtdBXK26pBcbSPUCXee_Q,1078
59
- biblicus-0.10.0.dist-info/METADATA,sha256=xZ7scJLdlKHRtm0EU5Ravq5ih2mS2KNfMbbLXNqZ8Ek,27455
60
- biblicus-0.10.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
61
- biblicus-0.10.0.dist-info/entry_points.txt,sha256=BZmO4H8Uz00fyi1RAFryOCGfZgX7eHWkY2NE-G54U5A,47
62
- biblicus-0.10.0.dist-info/top_level.txt,sha256=sUD_XVZwDxZ29-FBv1MknTGh4mgDXznGuP28KJY_WKc,9
63
- biblicus-0.10.0.dist-info/RECORD,,
60
+ biblicus-0.12.0.dist-info/licenses/LICENSE,sha256=lw44GXFG_Q0fS8m5VoEvv_xtdBXK26pBcbSPUCXee_Q,1078
61
+ biblicus-0.12.0.dist-info/METADATA,sha256=fhWcCcczfuLn2mZ_Moqe2zMKJ1-Q7KxZtR_x9YaiFO8,27765
62
+ biblicus-0.12.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
63
+ biblicus-0.12.0.dist-info/entry_points.txt,sha256=BZmO4H8Uz00fyi1RAFryOCGfZgX7eHWkY2NE-G54U5A,47
64
+ biblicus-0.12.0.dist-info/top_level.txt,sha256=sUD_XVZwDxZ29-FBv1MknTGh4mgDXznGuP28KJY_WKc,9
65
+ biblicus-0.12.0.dist-info/RECORD,,