biblicus 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +30 -0
- biblicus/__main__.py +8 -0
- biblicus/_vendor/dotyaml/__init__.py +14 -0
- biblicus/_vendor/dotyaml/interpolation.py +63 -0
- biblicus/_vendor/dotyaml/loader.py +181 -0
- biblicus/_vendor/dotyaml/transformer.py +135 -0
- biblicus/backends/__init__.py +42 -0
- biblicus/backends/base.py +65 -0
- biblicus/backends/scan.py +375 -0
- biblicus/backends/sqlite_full_text_search.py +487 -0
- biblicus/cli.py +804 -0
- biblicus/constants.py +12 -0
- biblicus/context.py +183 -0
- biblicus/corpus.py +1531 -0
- biblicus/crawl.py +186 -0
- biblicus/errors.py +15 -0
- biblicus/evaluation.py +257 -0
- biblicus/evidence_processing.py +201 -0
- biblicus/extraction.py +531 -0
- biblicus/extractors/__init__.py +44 -0
- biblicus/extractors/base.py +68 -0
- biblicus/extractors/metadata_text.py +106 -0
- biblicus/extractors/openai_stt.py +180 -0
- biblicus/extractors/pass_through_text.py +84 -0
- biblicus/extractors/pdf_text.py +100 -0
- biblicus/extractors/pipeline.py +105 -0
- biblicus/extractors/rapidocr_text.py +129 -0
- biblicus/extractors/select_longest_text.py +105 -0
- biblicus/extractors/select_text.py +100 -0
- biblicus/extractors/unstructured_text.py +100 -0
- biblicus/frontmatter.py +89 -0
- biblicus/hook_logging.py +180 -0
- biblicus/hook_manager.py +203 -0
- biblicus/hooks.py +261 -0
- biblicus/ignore.py +64 -0
- biblicus/knowledge_base.py +191 -0
- biblicus/models.py +445 -0
- biblicus/retrieval.py +133 -0
- biblicus/sources.py +212 -0
- biblicus/time.py +17 -0
- biblicus/uris.py +63 -0
- biblicus/user_config.py +138 -0
- biblicus-0.6.0.dist-info/METADATA +533 -0
- biblicus-0.6.0.dist-info/RECORD +48 -0
- biblicus-0.6.0.dist-info/WHEEL +5 -0
- biblicus-0.6.0.dist-info/entry_points.txt +2 -0
- biblicus-0.6.0.dist-info/licenses/LICENSE +21 -0
- biblicus-0.6.0.dist-info/top_level.txt +1 -0
biblicus/constants.py
ADDED
@@ -0,0 +1,12 @@
+"""
+Shared constants for Biblicus.
+"""
+
+SCHEMA_VERSION = 2
+DATASET_SCHEMA_VERSION = 1
+CORPUS_DIR_NAME = ".biblicus"
+DEFAULT_RAW_DIR = "raw"
+SIDECAR_SUFFIX = ".biblicus.yml"
+RUNS_DIR_NAME = "runs"
+EXTRACTION_RUNS_DIR_NAME = "extraction"
+HOOK_LOGS_DIR_NAME = "hook_logs"
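For orientation, here is a small sketch of how these constants could compose into on-disk paths. The nesting shown (runs/extraction and hook_logs under the .biblicus directory) is an assumption made for illustration from the constant names, not behavior documented in this file.

from pathlib import Path

from biblicus.constants import (
    CORPUS_DIR_NAME,
    EXTRACTION_RUNS_DIR_NAME,
    HOOK_LOGS_DIR_NAME,
    RUNS_DIR_NAME,
)

# Hypothetical layout: assumes extraction runs and hook logs live under the
# corpus directory; the exact nesting is a guess based on the constant names.
corpus_dir = Path("my-corpus") / CORPUS_DIR_NAME
extraction_runs_dir = corpus_dir / RUNS_DIR_NAME / EXTRACTION_RUNS_DIR_NAME
hook_logs_dir = corpus_dir / HOOK_LOGS_DIR_NAME

print(extraction_runs_dir)  # my-corpus/.biblicus/runs/extraction
print(hook_logs_dir)        # my-corpus/.biblicus/hook_logs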
biblicus/context.py
ADDED
@@ -0,0 +1,183 @@
+"""
+Context pack building for Biblicus.
+
+A context pack is the text that your application sends to a large language model.
+Biblicus produces a context pack from structured retrieval results so that evidence remains a
+stable contract while context formatting remains an explicit policy surface.
+"""
+
+from __future__ import annotations
+
+from typing import List, Optional
+
+from pydantic import BaseModel, ConfigDict, Field
+
+from .models import RetrievalResult
+
+
+class ContextPackPolicy(BaseModel):
+    """
+    Policy that controls how evidence becomes context pack text.
+
+    :ivar join_with: Separator inserted between evidence text blocks.
+    :vartype join_with: str
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    join_with: str = Field(default="\n\n")
+
+
+class ContextPack(BaseModel):
+    """
+    Context pack derived from retrieval evidence.
+
+    :ivar text: Context pack text suitable for inclusion in a model call.
+    :vartype text: str
+    :ivar evidence_count: Number of evidence blocks included in the context pack.
+    :vartype evidence_count: int
+    :ivar blocks: Structured blocks that produced the context pack.
+    :vartype blocks: list[ContextPackBlock]
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    text: str
+    evidence_count: int = Field(ge=0)
+    blocks: List["ContextPackBlock"] = Field(default_factory=list)
+
+
+class ContextPackBlock(BaseModel):
+    """
+    A single context pack block derived from one evidence item.
+
+    :ivar evidence_item_id: Item identifier that produced this block.
+    :vartype evidence_item_id: str
+    :ivar text: Text included in this block.
+    :vartype text: str
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    evidence_item_id: str = Field(min_length=1)
+    text: str = Field(min_length=1)
+
+
+class TokenCounter(BaseModel):
+    """
+    Token counter configuration for token budget fitting.
+
+    This is a lightweight model wrapper so token fitting remains explicit and testable even when
+    the underlying tokenizer is provided by an optional dependency.
+
+    :ivar tokenizer_id: Tokenizer identifier (for example, naive-whitespace).
+    :vartype tokenizer_id: str
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    tokenizer_id: str = Field(default="naive-whitespace", min_length=1)
+
+
+class TokenBudget(BaseModel):
+    """
+    Token budget for a context pack.
+
+    :ivar max_tokens: Maximum tokens permitted for the final context pack text.
+    :vartype max_tokens: int
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    max_tokens: int = Field(ge=1)
+
+
+def build_context_pack(result: RetrievalResult, *, policy: ContextPackPolicy) -> ContextPack:
+    """
+    Build a context pack from a retrieval result using an explicit policy.
+
+    :param result: Retrieval result containing ranked evidence.
+    :type result: RetrievalResult
+    :param policy: Policy controlling how evidence text is joined.
+    :type policy: ContextPackPolicy
+    :return: Context pack containing concatenated evidence text.
+    :rtype: ContextPack
+    """
+    selected_blocks: List[ContextPackBlock] = []
+    for evidence in result.evidence:
+        if not isinstance(evidence.text, str):
+            continue
+        trimmed_text = evidence.text.strip()
+        if not trimmed_text:
+            continue
+        selected_blocks.append(
+            ContextPackBlock(evidence_item_id=evidence.item_id, text=trimmed_text)
+        )
+
+    return ContextPack(
+        text=policy.join_with.join([block.text for block in selected_blocks]),
+        evidence_count=len(selected_blocks),
+        blocks=selected_blocks,
+    )
+
+
+def count_tokens(text: str, *, tokenizer_id: str) -> int:
+    """
+    Count tokens in a text using a tokenizer identifier.
+
+    The default tokenizer is naive-whitespace, which counts whitespace-separated tokens.
+
+    :param text: Text payload to count.
+    :type text: str
+    :param tokenizer_id: Tokenizer identifier.
+    :type tokenizer_id: str
+    :return: Token count.
+    :rtype: int
+    :raises KeyError: If the tokenizer identifier is unknown.
+    """
+    tokenizers = {
+        "naive-whitespace": lambda value: len([token for token in value.split() if token]),
+    }
+    tokenizer = tokenizers[tokenizer_id]
+    return int(tokenizer(text))
+
+
+def fit_context_pack_to_token_budget(
+    context_pack: ContextPack,
+    *,
+    policy: ContextPackPolicy,
+    token_budget: TokenBudget,
+    token_counter: Optional[TokenCounter] = None,
+) -> ContextPack:
+    """
+    Fit a context pack to a token budget by dropping trailing blocks.
+
+    This function is deterministic. It never rewrites block text. It only removes blocks from the
+    end of the block list until the token budget is met.
+
+    :param context_pack: Context pack to fit.
+    :type context_pack: ContextPack
+    :param policy: Policy controlling how blocks are joined into text.
+    :type policy: ContextPackPolicy
+    :param token_budget: Token budget to enforce.
+    :type token_budget: TokenBudget
+    :param token_counter: Optional token counter configuration.
+    :type token_counter: TokenCounter or None
+    :return: Fitted context pack.
+    :rtype: ContextPack
+    """
+    token_counter = token_counter or TokenCounter()
+    remaining_blocks: List[ContextPackBlock] = list(context_pack.blocks)
+
+    while remaining_blocks:
+        candidate_text = policy.join_with.join([block.text for block in remaining_blocks])
+        candidate_tokens = count_tokens(candidate_text, tokenizer_id=token_counter.tokenizer_id)
+        if candidate_tokens <= token_budget.max_tokens:
+            return ContextPack(
+                text=candidate_text,
+                evidence_count=len(remaining_blocks),
+                blocks=remaining_blocks,
+            )
+        remaining_blocks = remaining_blocks[:-1]
+
+    return ContextPack(text="", evidence_count=0, blocks=[])
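A short usage sketch of the models and functions above. Because build_context_pack requires a RetrievalResult from biblicus.models (not shown in this diff), the sketch constructs a ContextPack directly from blocks and then fits it to a budget; the item identifiers and evidence texts are invented for illustration.

from biblicus.context import (
    ContextPack,
    ContextPackBlock,
    ContextPackPolicy,
    TokenBudget,
    fit_context_pack_to_token_budget,
)

policy = ContextPackPolicy()  # joins blocks with a blank line by default
blocks = [
    ContextPackBlock(evidence_item_id="item-1", text="First evidence passage."),
    ContextPackBlock(evidence_item_id="item-2", text="Second, longer evidence passage."),
]
pack = ContextPack(
    text=policy.join_with.join(block.text for block in blocks),
    evidence_count=len(blocks),
    blocks=blocks,
)

# Trailing blocks are dropped until the naive whitespace token count fits.
fitted = fit_context_pack_to_token_budget(
    pack, policy=policy, token_budget=TokenBudget(max_tokens=4)
)
print(fitted.evidence_count)  # 1 -- only the first block fits within 4 tokens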