biblicus 0.11.0__py3-none-any.whl → 0.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
biblicus/__init__.py CHANGED
@@ -27,4 +27,4 @@ __all__ = [
27
27
  "RetrievalRun",
28
28
  ]
29
29
 
30
- __version__ = "0.11.0"
30
+ __version__ = "0.12.0"
biblicus/cli.py CHANGED
@@ -15,9 +15,11 @@ from pydantic import ValidationError
15
15
  from .analysis import get_analysis_backend
16
16
  from .backends import get_backend
17
17
  from .context import (
18
+ CharacterBudget,
18
19
  ContextPackPolicy,
19
20
  TokenBudget,
20
21
  build_context_pack,
22
+ fit_context_pack_to_character_budget,
21
23
  fit_context_pack_to_token_budget,
22
24
  )
23
25
  from .corpus import Corpus
@@ -568,7 +570,11 @@ def cmd_context_pack_build(arguments: argparse.Namespace) -> int:
568
570
  )
569
571
  retrieval_result = RetrievalResult.model_validate_json(input_text)
570
572
  join_with = bytes(arguments.join_with, "utf-8").decode("unicode_escape")
571
- policy = ContextPackPolicy(join_with=join_with)
573
+ policy = ContextPackPolicy(
574
+ join_with=join_with,
575
+ ordering=arguments.ordering,
576
+ include_metadata=arguments.include_metadata,
577
+ )
572
578
  context_pack = build_context_pack(retrieval_result, policy=policy)
573
579
  if arguments.max_tokens is not None:
574
580
  context_pack = fit_context_pack_to_token_budget(
@@ -576,6 +582,12 @@ def cmd_context_pack_build(arguments: argparse.Namespace) -> int:
576
582
  policy=policy,
577
583
  token_budget=TokenBudget(max_tokens=int(arguments.max_tokens)),
578
584
  )
585
+ if arguments.max_characters is not None:
586
+ context_pack = fit_context_pack_to_character_budget(
587
+ context_pack,
588
+ policy=policy,
589
+ character_budget=CharacterBudget(max_characters=int(arguments.max_characters)),
590
+ )
579
591
  print(
580
592
  json.dumps(
581
593
  {
@@ -921,12 +933,29 @@ def build_parser() -> argparse.ArgumentParser:
921
933
  default="\\n\\n",
922
934
  help="Separator between evidence blocks (escape sequences supported, default is two newlines).",
923
935
  )
936
+ p_context_pack_build.add_argument(
937
+ "--ordering",
938
+ choices=["rank", "score", "source"],
939
+ default="rank",
940
+ help="Evidence ordering policy (rank, score, source).",
941
+ )
942
+ p_context_pack_build.add_argument(
943
+ "--include-metadata",
944
+ action="store_true",
945
+ help="Include evidence metadata in each context pack block.",
946
+ )
924
947
  p_context_pack_build.add_argument(
925
948
  "--max-tokens",
926
949
  default=None,
927
950
  type=int,
928
951
  help="Optional token budget for the final context pack using the naive-whitespace tokenizer.",
929
952
  )
953
+ p_context_pack_build.add_argument(
954
+ "--max-characters",
955
+ default=None,
956
+ type=int,
957
+ help="Optional character budget for the final context pack.",
958
+ )
930
959
  p_context_pack_build.set_defaults(func=cmd_context_pack_build)
931
960
 
932
961
  p_eval = sub.add_parser("eval", help="Evaluate a run against a dataset.")
biblicus/context.py CHANGED
@@ -8,11 +8,11 @@ stable contract while context formatting remains an explicit policy surface.
8
8
 
9
9
  from __future__ import annotations
10
10
 
11
- from typing import List, Optional
11
+ from typing import Dict, List, Literal, Optional
12
12
 
13
13
  from pydantic import BaseModel, ConfigDict, Field
14
14
 
15
- from .models import RetrievalResult
15
+ from .models import Evidence, RetrievalResult
16
16
 
17
17
 
18
18
  class ContextPackPolicy(BaseModel):
@@ -21,11 +21,17 @@ class ContextPackPolicy(BaseModel):
21
21
 
22
22
  :ivar join_with: Separator inserted between evidence text blocks.
23
23
  :vartype join_with: str
24
+ :ivar ordering: Evidence ordering policy (rank, score, or source).
25
+ :vartype ordering: str
26
+ :ivar include_metadata: Whether to include evidence metadata lines in each block.
27
+ :vartype include_metadata: bool
24
28
  """
25
29
 
26
30
  model_config = ConfigDict(extra="forbid")
27
31
 
28
32
  join_with: str = Field(default="\n\n")
33
+ ordering: Literal["rank", "score", "source"] = Field(default="rank")
34
+ include_metadata: bool = Field(default=False)
29
35
 
30
36
 
31
37
  class ContextPack(BaseModel):
@@ -55,12 +61,15 @@ class ContextPackBlock(BaseModel):
55
61
  :vartype evidence_item_id: str
56
62
  :ivar text: Text included in this block.
57
63
  :vartype text: str
64
+ :ivar metadata: Optional metadata included with the block.
65
+ :vartype metadata: dict[str, object] or None
58
66
  """
59
67
 
60
68
  model_config = ConfigDict(extra="forbid")
61
69
 
62
70
  evidence_item_id: str = Field(min_length=1)
63
71
  text: str = Field(min_length=1)
72
+ metadata: Optional[Dict[str, object]] = None
64
73
 
65
74
 
66
75
  class TokenCounter(BaseModel):
@@ -92,6 +101,19 @@ class TokenBudget(BaseModel):
92
101
  max_tokens: int = Field(ge=1)
93
102
 
94
103
 
104
class CharacterBudget(BaseModel):
    """
    Upper bound on the character length of a context pack.

    Unknown fields are rejected because the model is configured with ``extra="forbid"``.

    :ivar max_characters: Largest number of characters the final context pack text may
        contain. The field is required and must be at least 1.
    :vartype max_characters: int
    """

    model_config = ConfigDict(extra="forbid")

    # Ellipsis marks the field as required, which is also what omitting a default does.
    max_characters: int = Field(..., ge=1)
115
+
116
+
95
117
  def build_context_pack(result: RetrievalResult, *, policy: ContextPackPolicy) -> ContextPack:
96
118
  """
97
119
  Build a context pack from a retrieval result using an explicit policy.
@@ -104,14 +126,20 @@ def build_context_pack(result: RetrievalResult, *, policy: ContextPackPolicy) ->
104
126
  :rtype: ContextPack
105
127
  """
106
128
  selected_blocks: List[ContextPackBlock] = []
107
- for evidence in result.evidence:
129
+ for evidence in _order_evidence(result.evidence, policy=policy):
108
130
  if not isinstance(evidence.text, str):
109
131
  continue
110
132
  trimmed_text = evidence.text.strip()
111
133
  if not trimmed_text:
112
134
  continue
135
+ metadata = _metadata_for_evidence(evidence) if policy.include_metadata else None
136
+ block_text = _format_block_text(trimmed_text, metadata=metadata)
113
137
  selected_blocks.append(
114
- ContextPackBlock(evidence_item_id=evidence.item_id, text=trimmed_text)
138
+ ContextPackBlock(
139
+ evidence_item_id=evidence.item_id,
140
+ text=block_text,
141
+ metadata=metadata,
142
+ )
115
143
  )
116
144
 
117
145
  return ContextPack(
@@ -181,3 +209,109 @@ def fit_context_pack_to_token_budget(
181
209
  remaining_blocks = remaining_blocks[:-1]
182
210
 
183
211
  return ContextPack(text="", evidence_count=0, blocks=[])
212
+
213
+
214
def fit_context_pack_to_character_budget(
    context_pack: ContextPack,
    *,
    policy: ContextPackPolicy,
    character_budget: CharacterBudget,
) -> ContextPack:
    """
    Fit a context pack to a character budget by dropping trailing blocks.

    The longest leading run of blocks whose joined text (block texts separated by
    ``policy.join_with``) is no longer than ``character_budget.max_characters`` is kept.
    Blocks are never truncated: a block is either kept whole or dropped. When not even
    the first block fits, or the pack has no blocks, an empty context pack is returned.

    The joined length is tracked incrementally in one forward pass instead of
    re-joining all remaining blocks after every dropped block.

    :param context_pack: Context pack to fit.
    :type context_pack: ContextPack
    :param policy: Policy controlling how blocks are joined into text.
    :type policy: ContextPackPolicy
    :param character_budget: Character budget to enforce.
    :type character_budget: CharacterBudget
    :return: Fitted context pack.
    :rtype: ContextPack
    """
    max_characters = character_budget.max_characters
    separator_length = len(policy.join_with)

    kept_count = 0
    joined_length = 0
    for index, block in enumerate(context_pack.blocks):
        # A separator precedes every block except the first one.
        added_length = len(block.text) + (separator_length if index else 0)
        # The joined length never shrinks as blocks are added, so once one block
        # overflows the budget every longer prefix overflows as well.
        if joined_length + added_length > max_characters:
            break
        joined_length += added_length
        kept_count += 1

    if kept_count == 0:
        return ContextPack(text="", evidence_count=0, blocks=[])

    kept_blocks = list(context_pack.blocks[:kept_count])
    return ContextPack(
        text=policy.join_with.join(block.text for block in kept_blocks),
        evidence_count=kept_count,
        blocks=kept_blocks,
    )
246
+
247
+
248
+ def _order_evidence(
249
+ evidence: List[Evidence],
250
+ *,
251
+ policy: ContextPackPolicy,
252
+ ) -> List[Evidence]:
253
+ """
254
+ Order evidence items according to the context pack policy.
255
+
256
+ :param evidence: Evidence list to order.
257
+ :type evidence: list[Evidence]
258
+ :param policy: Context pack policy.
259
+ :type policy: ContextPackPolicy
260
+ :return: Ordered evidence list.
261
+ :rtype: list[Evidence]
262
+ """
263
+ if policy.ordering == "rank":
264
+ return sorted(evidence, key=lambda item: (item.rank, item.item_id))
265
+ if policy.ordering == "score":
266
+ return sorted(evidence, key=lambda item: (-item.score, item.item_id))
267
+ if policy.ordering == "source":
268
+ return sorted(
269
+ evidence,
270
+ key=lambda item: (
271
+ item.source_uri or item.item_id,
272
+ -item.score,
273
+ item.item_id,
274
+ ),
275
+ )
276
+ raise ValueError(f"Unknown context pack ordering: {policy.ordering}")
277
+
278
+
279
+ def _metadata_for_evidence(evidence: Evidence) -> Dict[str, object]:
280
+ """
281
+ Build metadata for a context pack block.
282
+
283
+ :param evidence: Evidence item to describe.
284
+ :type evidence: Evidence
285
+ :return: Metadata mapping.
286
+ :rtype: dict[str, object]
287
+ """
288
+ return {
289
+ "item_id": evidence.item_id,
290
+ "source_uri": evidence.source_uri or "none",
291
+ "score": evidence.score,
292
+ "stage": evidence.stage,
293
+ }
294
+
295
+
296
+ def _format_block_text(text: str, *, metadata: Optional[Dict[str, object]]) -> str:
297
+ """
298
+ Format a context pack block text with optional metadata.
299
+
300
+ :param text: Evidence text.
301
+ :type text: str
302
+ :param metadata: Optional metadata mapping.
303
+ :type metadata: dict[str, object] or None
304
+ :return: Formatted block text.
305
+ :rtype: str
306
+ """
307
+ if not metadata:
308
+ return text
309
+ metadata_lines = "\n".join(
310
+ [
311
+ f"item_id: {metadata['item_id']}",
312
+ f"source_uri: {metadata['source_uri']}",
313
+ f"score: {metadata['score']}",
314
+ f"stage: {metadata['stage']}",
315
+ ]
316
+ )
317
+ return f"{metadata_lines}\n{text}"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: biblicus
3
- Version: 0.11.0
3
+ Version: 0.12.0
4
4
  Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
5
5
  License: MIT
6
6
  Requires-Python: >=3.9
@@ -1,8 +1,8 @@
1
- biblicus/__init__.py,sha256=sT0PFc3DRGFRcN7Zx4Yooc8OzmLvaj1-ZjbvFHce8lU,496
1
+ biblicus/__init__.py,sha256=okAXmTSud_hQzaGEURDqX95I66SlcvTERCrWbUZA5ko,496
2
2
  biblicus/__main__.py,sha256=ipfkUoTlocVnrQDM69C7TeBqQxmHVeiWMRaT3G9rtnk,117
3
- biblicus/cli.py,sha256=aH3plnednnYgcPnSoYQf200nboKc6N-tuc3FuLPQEcU,35132
3
+ biblicus/cli.py,sha256=bZV-ZxeWskRL4CFCGzyVpcaFC8KOb0xmxx3bnMqP-1I,36118
4
4
  biblicus/constants.py,sha256=-JaHI3Dngte2drawx93cGWxFVobbgIuaVhmjUJpf4GI,333
5
- biblicus/context.py,sha256=qnT9CH7_ldoPcg-rxnUOtRhheOmpDAbF8uqhf8OdjC4,5832
5
+ biblicus/context.py,sha256=U7qkOwMdqNgYnqaC9hgQY0kv0R-6qcjV6bhXQl2WUkE,10215
6
6
  biblicus/corpus.py,sha256=qSDnYJXhWlF2p_BbFLl6xtI53lIIPxwyKLLGLC432Sg,55612
7
7
  biblicus/crawl.py,sha256=n8rXBMnziBK9vtKQQCXYOpBzqsPCswj2PzVJUb370KY,6250
8
8
  biblicus/errors.py,sha256=uMajd5DvgnJ_-jq5sbeom1GV8DPUc-kojBaECFi6CsY,467
@@ -57,9 +57,9 @@ biblicus/extractors/select_override.py,sha256=gSpffFmn1ux9pGtFvHD5Uu_LO8TmmJC4L_
57
57
  biblicus/extractors/select_smart_override.py,sha256=-sLMnNoeXbCB3dO9zflQq324eHuLbd6hpveSwduXP-U,6763
58
58
  biblicus/extractors/select_text.py,sha256=w0ATmDy3tWWbOObzW87jGZuHbgXllUhotX5XyySLs-o,3395
59
59
  biblicus/extractors/unstructured_text.py,sha256=l2S_wD_htu7ZHoJQNQtP-kGlEgOeKV_w2IzAC93lePE,3564
60
- biblicus-0.11.0.dist-info/licenses/LICENSE,sha256=lw44GXFG_Q0fS8m5VoEvv_xtdBXK26pBcbSPUCXee_Q,1078
61
- biblicus-0.11.0.dist-info/METADATA,sha256=zrJESYGfGLu7Iq1I--GPIkEY9gXDb9szBIuenlWor7I,27765
62
- biblicus-0.11.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
63
- biblicus-0.11.0.dist-info/entry_points.txt,sha256=BZmO4H8Uz00fyi1RAFryOCGfZgX7eHWkY2NE-G54U5A,47
64
- biblicus-0.11.0.dist-info/top_level.txt,sha256=sUD_XVZwDxZ29-FBv1MknTGh4mgDXznGuP28KJY_WKc,9
65
- biblicus-0.11.0.dist-info/RECORD,,
60
+ biblicus-0.12.0.dist-info/licenses/LICENSE,sha256=lw44GXFG_Q0fS8m5VoEvv_xtdBXK26pBcbSPUCXee_Q,1078
61
+ biblicus-0.12.0.dist-info/METADATA,sha256=fhWcCcczfuLn2mZ_Moqe2zMKJ1-Q7KxZtR_x9YaiFO8,27765
62
+ biblicus-0.12.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
63
+ biblicus-0.12.0.dist-info/entry_points.txt,sha256=BZmO4H8Uz00fyi1RAFryOCGfZgX7eHWkY2NE-G54U5A,47
64
+ biblicus-0.12.0.dist-info/top_level.txt,sha256=sUD_XVZwDxZ29-FBv1MknTGh4mgDXznGuP28KJY_WKc,9
65
+ biblicus-0.12.0.dist-info/RECORD,,