biblicus 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +1 -1
- biblicus/cli.py +90 -1
- biblicus/context.py +183 -0
- biblicus/evidence_processing.py +201 -0
- {biblicus-0.4.0.dist-info → biblicus-0.5.0.dist-info}/METADATA +247 -102
- {biblicus-0.4.0.dist-info → biblicus-0.5.0.dist-info}/RECORD +10 -8
- {biblicus-0.4.0.dist-info → biblicus-0.5.0.dist-info}/WHEEL +0 -0
- {biblicus-0.4.0.dist-info → biblicus-0.5.0.dist-info}/entry_points.txt +0 -0
- {biblicus-0.4.0.dist-info → biblicus-0.5.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-0.4.0.dist-info → biblicus-0.5.0.dist-info}/top_level.txt +0 -0
biblicus/__init__.py
CHANGED
biblicus/cli.py
CHANGED
|
@@ -13,12 +13,19 @@ from typing import Dict, List, Optional
|
|
|
13
13
|
from pydantic import ValidationError
|
|
14
14
|
|
|
15
15
|
from .backends import get_backend
|
|
16
|
+
from .context import (
|
|
17
|
+
ContextPackPolicy,
|
|
18
|
+
TokenBudget,
|
|
19
|
+
build_context_pack,
|
|
20
|
+
fit_context_pack_to_token_budget,
|
|
21
|
+
)
|
|
16
22
|
from .corpus import Corpus
|
|
17
23
|
from .crawl import CrawlRequest, crawl_into_corpus
|
|
18
24
|
from .errors import ExtractionRunFatalError
|
|
19
25
|
from .evaluation import evaluate_run, load_dataset
|
|
26
|
+
from .evidence_processing import apply_evidence_filter, apply_evidence_reranker
|
|
20
27
|
from .extraction import build_extraction_run
|
|
21
|
-
from .models import QueryBudget, parse_extraction_run_reference
|
|
28
|
+
from .models import QueryBudget, RetrievalResult, parse_extraction_run_reference
|
|
22
29
|
from .uris import corpus_ref_to_path
|
|
23
30
|
|
|
24
31
|
|
|
@@ -449,10 +456,62 @@ def cmd_query(arguments: argparse.Namespace) -> int:
|
|
|
449
456
|
query_text = arguments.query if arguments.query is not None else sys.stdin.read()
|
|
450
457
|
budget = _budget_from_args(arguments)
|
|
451
458
|
result = backend.query(corpus, run=run, query_text=query_text, budget=budget)
|
|
459
|
+
processed_evidence = result.evidence
|
|
460
|
+
if getattr(arguments, "reranker_id", None):
|
|
461
|
+
processed_evidence = apply_evidence_reranker(
|
|
462
|
+
reranker_id=arguments.reranker_id,
|
|
463
|
+
query_text=result.query_text,
|
|
464
|
+
evidence=processed_evidence,
|
|
465
|
+
)
|
|
466
|
+
if getattr(arguments, "minimum_score", None) is not None:
|
|
467
|
+
processed_evidence = apply_evidence_filter(
|
|
468
|
+
filter_id="filter-minimum-score",
|
|
469
|
+
query_text=result.query_text,
|
|
470
|
+
evidence=processed_evidence,
|
|
471
|
+
config={"minimum_score": float(arguments.minimum_score)},
|
|
472
|
+
)
|
|
473
|
+
if processed_evidence is not result.evidence:
|
|
474
|
+
result = result.model_copy(update={"evidence": processed_evidence})
|
|
452
475
|
print(result.model_dump_json(indent=2))
|
|
453
476
|
return 0
|
|
454
477
|
|
|
455
478
|
|
|
479
|
+
def cmd_context_pack_build(arguments: argparse.Namespace) -> int:
    """
    Build a context pack from a retrieval result.

    The retrieval result is read from standard input as JavaScript Object Notation. The policy and
    the resulting context pack are printed to standard output as a single JavaScript Object
    Notation document.

    :param arguments: Parsed command-line interface arguments.
    :type arguments: argparse.Namespace
    :return: Exit code.
    :rtype: int
    :raises ValueError: If standard input is empty or contains only whitespace.
    """
    input_text = sys.stdin.read()
    if not input_text.strip():
        raise ValueError(
            "Context pack build requires a retrieval result JavaScript Object Notation on standard input"
        )
    retrieval_result = RetrievalResult.model_validate_json(input_text)

    # Interpret escape sequences such as "\n" that were typed literally on the command line.
    # Encoding to Latin-1 with backslashreplace turns every character outside Latin-1 into an
    # escape sequence first, so non-ASCII separators survive the unicode_escape decode. Going
    # through UTF-8 bytes instead would decode each byte as Latin-1 and corrupt those characters.
    join_with = arguments.join_with.encode("latin-1", "backslashreplace").decode("unicode_escape")

    policy = ContextPackPolicy(join_with=join_with)
    context_pack = build_context_pack(retrieval_result, policy=policy)
    if arguments.max_tokens is not None:
        # Trailing blocks are dropped until the joined text fits the requested budget.
        context_pack = fit_context_pack_to_token_budget(
            context_pack,
            policy=policy,
            token_budget=TokenBudget(max_tokens=int(arguments.max_tokens)),
        )
    print(
        json.dumps(
            {
                "policy": policy.model_dump(),
                "context_pack": context_pack.model_dump(),
            },
            indent=2,
        )
    )
    return 0
|
|
513
|
+
|
|
514
|
+
|
|
456
515
|
def cmd_eval(arguments: argparse.Namespace) -> int:
|
|
457
516
|
"""
|
|
458
517
|
Evaluate a retrieval run against a dataset.
|
|
@@ -657,8 +716,38 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
657
716
|
p_query.add_argument("--max-total-items", type=int, default=5)
|
|
658
717
|
p_query.add_argument("--max-total-characters", type=int, default=2000)
|
|
659
718
|
p_query.add_argument("--max-items-per-source", type=int, default=5)
|
|
719
|
+
p_query.add_argument(
|
|
720
|
+
"--reranker-id",
|
|
721
|
+
default=None,
|
|
722
|
+
help="Optional reranker identifier to apply after retrieval (for example: rerank-longest-text).",
|
|
723
|
+
)
|
|
724
|
+
p_query.add_argument(
|
|
725
|
+
"--minimum-score",
|
|
726
|
+
type=float,
|
|
727
|
+
default=None,
|
|
728
|
+
help="Optional minimum score threshold to filter evidence after retrieval.",
|
|
729
|
+
)
|
|
660
730
|
p_query.set_defaults(func=cmd_query)
|
|
661
731
|
|
|
732
|
+
p_context_pack = sub.add_parser("context-pack", help="Build context pack text from evidence.")
|
|
733
|
+
context_pack_sub = p_context_pack.add_subparsers(dest="context_pack_command", required=True)
|
|
734
|
+
|
|
735
|
+
p_context_pack_build = context_pack_sub.add_parser(
|
|
736
|
+
"build", help="Build a context pack from a retrieval result JavaScript Object Notation."
|
|
737
|
+
)
|
|
738
|
+
p_context_pack_build.add_argument(
|
|
739
|
+
"--join-with",
|
|
740
|
+
default="\\n\\n",
|
|
741
|
+
help="Separator between evidence blocks (escape sequences supported, default is two newlines).",
|
|
742
|
+
)
|
|
743
|
+
p_context_pack_build.add_argument(
|
|
744
|
+
"--max-tokens",
|
|
745
|
+
default=None,
|
|
746
|
+
type=int,
|
|
747
|
+
help="Optional token budget for the final context pack using the naive-whitespace tokenizer.",
|
|
748
|
+
)
|
|
749
|
+
p_context_pack_build.set_defaults(func=cmd_context_pack_build)
|
|
750
|
+
|
|
662
751
|
p_eval = sub.add_parser("eval", help="Evaluate a run against a dataset.")
|
|
663
752
|
_add_common_corpus_arg(p_eval)
|
|
664
753
|
p_eval.add_argument("--run", default=None, help="Run identifier (defaults to latest run).")
|
biblicus/context.py
ADDED
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Context pack building for Biblicus.
|
|
3
|
+
|
|
4
|
+
A context pack is the text that your application sends to a large language model.
|
|
5
|
+
Biblicus produces a context pack from structured retrieval results so that evidence remains a
|
|
6
|
+
stable contract while context formatting remains an explicit policy surface.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from typing import List, Optional
|
|
12
|
+
|
|
13
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
14
|
+
|
|
15
|
+
from .models import RetrievalResult
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class ContextPackPolicy(BaseModel):
    """
    Formatting policy applied when evidence is turned into context pack text.

    Unknown fields are rejected.

    :ivar join_with: Separator placed between consecutive evidence text blocks.
    :vartype join_with: str
    """

    model_config = ConfigDict(extra="forbid")

    join_with: str = "\n\n"
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class ContextPack(BaseModel):
    """
    Text bundle assembled from retrieval evidence.

    Unknown fields are rejected.

    :ivar text: Joined context pack text, ready to be included in a model call.
    :vartype text: str
    :ivar evidence_count: How many evidence blocks contributed to the text (never negative).
    :vartype evidence_count: int
    :ivar blocks: The structured blocks the text was assembled from.
    :vartype blocks: list[ContextPackBlock]
    """

    model_config = ConfigDict(extra="forbid")

    text: str
    evidence_count: int = Field(..., ge=0)
    blocks: List["ContextPackBlock"] = Field(default_factory=list)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class ContextPackBlock(BaseModel):
    """
    One block of a context pack, produced from a single evidence item.

    Unknown fields are rejected and both fields must be non-empty.

    :ivar evidence_item_id: Identifier of the item the block was produced from.
    :vartype evidence_item_id: str
    :ivar text: Text carried by the block.
    :vartype text: str
    """

    model_config = ConfigDict(extra="forbid")

    evidence_item_id: str = Field(..., min_length=1)
    text: str = Field(..., min_length=1)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class TokenCounter(BaseModel):
    """
    Configuration that selects the tokenizer used for token budget fitting.

    Wrapping the identifier in a model keeps token fitting explicit and testable, even when the
    tokenizer itself comes from an optional dependency.

    :ivar tokenizer_id: Non-empty tokenizer identifier (for example, naive-whitespace).
    :vartype tokenizer_id: str
    """

    model_config = ConfigDict(extra="forbid")

    tokenizer_id: str = Field("naive-whitespace", min_length=1)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
class TokenBudget(BaseModel):
    """
    Upper bound on the size of a context pack, measured in tokens.

    Unknown fields are rejected.

    :ivar max_tokens: Largest token count allowed for the final context pack text (at least 1).
    :vartype max_tokens: int
    """

    model_config = ConfigDict(extra="forbid")

    max_tokens: int = Field(..., ge=1)
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def build_context_pack(result: RetrievalResult, *, policy: ContextPackPolicy) -> ContextPack:
    """
    Assemble a context pack from the evidence of a retrieval result.

    Evidence whose text is not a string, or is empty after stripping surrounding whitespace, is
    skipped. The remaining texts are stripped and joined in evidence order.

    :param result: Retrieval result containing ranked evidence.
    :type result: RetrievalResult
    :param policy: Policy supplying the separator used to join evidence text.
    :type policy: ContextPackPolicy
    :return: Context pack with the joined text and the blocks it was built from.
    :rtype: ContextPack
    """
    blocks: List[ContextPackBlock] = []
    for item in result.evidence:
        if not isinstance(item.text, str):
            continue
        stripped = item.text.strip()
        if stripped:
            blocks.append(ContextPackBlock(evidence_item_id=item.item_id, text=stripped))

    joined_text = policy.join_with.join(block.text for block in blocks)
    return ContextPack(text=joined_text, evidence_count=len(blocks), blocks=blocks)
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def count_tokens(text: str, *, tokenizer_id: str) -> int:
    """
    Return the number of tokens in a text for the tokenizer named by an identifier.

    The only tokenizer defined here is naive-whitespace, which counts whitespace-separated tokens.

    :param text: Text payload to count.
    :type text: str
    :param tokenizer_id: Tokenizer identifier.
    :type tokenizer_id: str
    :return: Token count.
    :rtype: int
    :raises KeyError: If the tokenizer identifier is unknown.
    """

    def _naive_whitespace(value: str) -> int:
        # str.split() without arguments never yields empty strings, so no filtering is needed.
        return len(value.split())

    tokenizers = {"naive-whitespace": _naive_whitespace}
    count = tokenizers[tokenizer_id]
    return int(count(text))
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def fit_context_pack_to_token_budget(
    context_pack: ContextPack,
    *,
    policy: ContextPackPolicy,
    token_budget: TokenBudget,
    token_counter: Optional[TokenCounter] = None,
) -> ContextPack:
    """
    Shrink a context pack until its joined text fits within a token budget.

    Fitting is deterministic and never rewrites block text. Blocks are only removed from the end
    of the block list, one at a time, until the joined text is within budget. If not even the
    first block fits, an empty context pack is returned.

    :param context_pack: Context pack to fit.
    :type context_pack: ContextPack
    :param policy: Policy supplying the separator used to join blocks into text.
    :type policy: ContextPackPolicy
    :param token_budget: Token budget to enforce.
    :type token_budget: TokenBudget
    :param token_counter: Optional token counter configuration; a default one is used if omitted.
    :type token_counter: TokenCounter or None
    :return: Fitted context pack.
    :rtype: ContextPack
    """
    counter = token_counter if token_counter else TokenCounter()
    all_blocks: List[ContextPackBlock] = list(context_pack.blocks)

    # Try the longest prefix first and shorten it by one block per attempt.
    for kept_count in range(len(all_blocks), 0, -1):
        kept_blocks = all_blocks[:kept_count]
        joined_text = policy.join_with.join([block.text for block in kept_blocks])
        used_tokens = count_tokens(joined_text, tokenizer_id=counter.tokenizer_id)
        if used_tokens <= token_budget.max_tokens:
            return ContextPack(text=joined_text, evidence_count=kept_count, blocks=kept_blocks)

    return ContextPack(text="", evidence_count=0, blocks=[])
|
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Evidence processing stages for Biblicus.
|
|
3
|
+
|
|
4
|
+
Retrieval backends return ranked evidence. Additional stages can be applied without changing the
|
|
5
|
+
backend implementation:
|
|
6
|
+
|
|
7
|
+
- Rerank: reorder evidence.
|
|
8
|
+
- Filter: remove evidence.
|
|
9
|
+
|
|
10
|
+
These stages are explicit so they can be configured, tested, and evaluated independently from the
|
|
11
|
+
retrieval backend.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
from abc import ABC, abstractmethod
|
|
17
|
+
from typing import Any, Dict, List
|
|
18
|
+
|
|
19
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
20
|
+
|
|
21
|
+
from .models import Evidence
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class EvidenceReranker(ABC):
    """
    Abstract interface for stages that reorder evidence.

    :param reranker_id: Stable identifier for this reranker implementation.
    :type reranker_id: str
    """

    reranker_id: str

    @abstractmethod
    def rerank(self, *, query_text: str, evidence: List[Evidence]) -> List[Evidence]:
        """
        Return the evidence for a query in a new order.

        :param query_text: Query text associated with the evidence.
        :type query_text: str
        :param evidence: Evidence objects to rerank.
        :type evidence: list[Evidence]
        :return: Reranked evidence list.
        :rtype: list[Evidence]
        """
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class EvidenceFilter(ABC):
    """
    Abstract interface for stages that remove evidence.

    :param filter_id: Stable identifier for this filter implementation.
    :type filter_id: str
    """

    filter_id: str

    @abstractmethod
    def filter(
        self, *, query_text: str, evidence: List[Evidence], config: Dict[str, Any]
    ) -> List[Evidence]:
        """
        Return the subset of evidence that should be kept for a query.

        :param query_text: Query text associated with the evidence.
        :type query_text: str
        :param evidence: Evidence objects to filter.
        :type evidence: list[Evidence]
        :param config: Filter-specific configuration values.
        :type config: dict[str, Any]
        :return: Filtered evidence list.
        :rtype: list[Evidence]
        """
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
class EvidenceRerankLongestText(EvidenceReranker):
    """
    Reranker that places evidence with longer text first.

    The ordering is deterministic. It is useful when a downstream context pack is limited by a
    character or token budget and longer evidence is preferred.

    :ivar reranker_id: Stable reranker identifier.
    :vartype reranker_id: str
    """

    reranker_id = "rerank-longest-text"

    def rerank(self, *, query_text: str, evidence: List[Evidence]) -> List[Evidence]:
        """
        Order evidence by descending length of its stripped text.

        Missing text counts as empty. Items with equal length are ordered by item identifier.

        :param query_text: Query text associated with the evidence.
        :type query_text: str
        :param evidence: Evidence objects to rerank.
        :type evidence: list[Evidence]
        :return: Evidence list ordered by text length.
        :rtype: list[Evidence]
        """

        def _length_then_identifier(candidate):
            stripped_length = len((candidate.text or "").strip())
            # Negating the length makes an ascending sort put the longest text first.
            return (-stripped_length, candidate.item_id)

        return sorted(evidence, key=_length_then_identifier)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
class EvidenceFilterMinimumScoreConfig(BaseModel):
    """
    Settings accepted by the minimum score evidence filter.

    Unknown fields are rejected and the threshold must not be negative.

    :ivar minimum_score: Evidence with score below this threshold is removed.
    :vartype minimum_score: float
    """

    model_config = ConfigDict(extra="forbid")

    minimum_score: float = Field(..., ge=0.0)
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
class EvidenceFilterMinimumScore(EvidenceFilter):
    """
    Filter that drops evidence scoring below a minimum threshold.

    :ivar filter_id: Stable filter identifier.
    :vartype filter_id: str
    """

    filter_id = "filter-minimum-score"

    def filter(
        self, *, query_text: str, evidence: List[Evidence], config: Dict[str, Any]
    ) -> List[Evidence]:
        """
        Keep only evidence whose score reaches the configured threshold.

        The configuration is validated before any evidence is examined. Input order is preserved.

        :param query_text: Query text associated with the evidence.
        :type query_text: str
        :param evidence: Evidence objects to filter.
        :type evidence: list[Evidence]
        :param config: Filter configuration values.
        :type config: dict[str, Any]
        :return: Evidence list with low-score items removed.
        :rtype: list[Evidence]
        """
        threshold = EvidenceFilterMinimumScoreConfig.model_validate(config).minimum_score
        kept: List[Evidence] = []
        for candidate in evidence:
            if float(candidate.score) < threshold:
                continue
            kept.append(candidate)
        return kept
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
# Registry of reranker instances, keyed by their stable identifier.
_EVIDENCE_RERANKERS: Dict[str, EvidenceReranker] = {
    reranker.reranker_id: reranker for reranker in (EvidenceRerankLongestText(),)
}

# Registry of filter instances, keyed by their stable identifier.
_EVIDENCE_FILTERS: Dict[str, EvidenceFilter] = {
    evidence_filter.filter_id: evidence_filter
    for evidence_filter in (EvidenceFilterMinimumScore(),)
}
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def apply_evidence_reranker(
    *, reranker_id: str, query_text: str, evidence: List[Evidence]
) -> List[Evidence]:
    """
    Look up a reranker by identifier and run it over the evidence.

    :param reranker_id: Reranker identifier.
    :type reranker_id: str
    :param query_text: Query text associated with the evidence.
    :type query_text: str
    :param evidence: Evidence objects to rerank.
    :type evidence: list[Evidence]
    :return: Reranked evidence list.
    :rtype: list[Evidence]
    :raises KeyError: If the reranker identifier is unknown.
    """
    return _EVIDENCE_RERANKERS[reranker_id].rerank(query_text=query_text, evidence=evidence)
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def apply_evidence_filter(
    *, filter_id: str, query_text: str, evidence: List[Evidence], config: Dict[str, Any]
) -> List[Evidence]:
    """
    Look up a filter by identifier and run it over the evidence.

    :param filter_id: Filter identifier.
    :type filter_id: str
    :param query_text: Query text associated with the evidence.
    :type query_text: str
    :param evidence: Evidence objects to filter.
    :type evidence: list[Evidence]
    :param config: Filter-specific configuration values.
    :type config: dict[str, Any]
    :return: Filtered evidence list.
    :rtype: list[Evidence]
    :raises KeyError: If the filter identifier is unknown.
    """
    return _EVIDENCE_FILTERS[filter_id].filter(
        query_text=query_text, evidence=evidence, config=config
    )
|
|
201
|
+
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: biblicus
|
|
3
|
-
Version: 0.4.0
|
|
3
|
+
Version: 0.5.0
|
|
4
4
|
Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
|
|
5
5
|
License: MIT
|
|
6
6
|
Requires-Python: >=3.9
|
|
@@ -41,11 +41,11 @@ The first practical problem is not retrieval. It is collection and care. You nee
|
|
|
41
41
|
|
|
42
42
|
This library gives you a corpus, which is a normal folder on disk. It stores each ingested item as a file, with optional metadata stored next to it. You can open and inspect the raw files directly. Any derived catalog or index can be rebuilt from the raw corpus.
|
|
43
43
|
|
|
44
|
-
It can be used alongside
|
|
44
|
+
It can be used alongside LangGraph, Tactus, Pydantic AI, any agent framework, or your own setup. Use it from Python or from the command line interface.
|
|
45
45
|
|
|
46
46
|
See [retrieval augmented generation overview] for a short introduction to the idea.
|
|
47
47
|
|
|
48
|
-
## A
|
|
48
|
+
## A simple mental model
|
|
49
49
|
|
|
50
50
|
Think in three stages.
|
|
51
51
|
|
|
@@ -63,94 +63,30 @@ If you learn a few project words, the rest of the system becomes predictable.
|
|
|
63
63
|
- Run is a recorded retrieval build for a corpus.
|
|
64
64
|
- Evidence is what retrieval returns, with identifiers and source information.
|
|
65
65
|
|
|
66
|
-
##
|
|
66
|
+
## Where it fits in an assistant
|
|
67
67
|
|
|
68
|
-
|
|
69
|
-
Extraction is introduced here as a separate stage so you can swap extraction approaches without changing the raw corpus.
|
|
70
|
-
The legend shows what the block styles mean.
|
|
71
|
-
Your code is where you decide how to turn evidence into context and how to call a model.
|
|
68
|
+
Biblicus does not answer user questions. It is not a language model. It helps your assistant answer them by retrieving relevant material and returning it as structured evidence. Your code decides how to turn evidence into a context pack for the model call, which is then passed to a model you choose.
|
|
72
69
|
|
|
73
|
-
|
|
74
|
-
%%{init: {"flowchart": {"useMaxWidth": true, "nodeSpacing": 18, "rankSpacing": 22}}}%%
|
|
75
|
-
flowchart LR
|
|
76
|
-
subgraph Legend[Legend]
|
|
77
|
-
direction LR
|
|
78
|
-
LegendArtifact[Stored artifact or evidence]
|
|
79
|
-
LegendStep[Step]
|
|
80
|
-
LegendArtifact --- LegendStep
|
|
81
|
-
end
|
|
82
|
-
|
|
83
|
-
subgraph Main[" "]
|
|
84
|
-
direction TB
|
|
70
|
+
In a coding assistant, retrieval is often triggered by what the user is doing right now. For example: you are about to propose a user interface change, so you retrieve the user's stated preferences, then you include that as context for the model call.
|
|
85
71
|
|
|
86
|
-
|
|
87
|
-
direction TB
|
|
88
|
-
Source[Source items] --> Ingest[Ingest]
|
|
89
|
-
Ingest --> Raw[Raw item files]
|
|
90
|
-
Raw --> Catalog[Catalog file]
|
|
91
|
-
end
|
|
92
|
-
|
|
93
|
-
subgraph PluggableExtractionPipeline[Pluggable: extraction pipeline]
|
|
94
|
-
direction TB
|
|
95
|
-
Catalog --> Extract[Extract pipeline]
|
|
96
|
-
Extract --> ExtractedText[Extracted text artifacts]
|
|
97
|
-
ExtractedText --> ExtractionRun[Extraction run manifest]
|
|
98
|
-
end
|
|
99
|
-
|
|
100
|
-
subgraph PluggableRetrievalBackend[Pluggable: retrieval backend]
|
|
101
|
-
direction LR
|
|
102
|
-
|
|
103
|
-
subgraph BackendIngestionIndexing[Ingestion and indexing]
|
|
104
|
-
direction TB
|
|
105
|
-
ExtractionRun --> Build[Build run]
|
|
106
|
-
Build --> BackendIndex[Backend index]
|
|
107
|
-
BackendIndex --> Run[Run manifest]
|
|
108
|
-
end
|
|
72
|
+
This diagram shows two sequential Biblicus calls. They are shown separately to make the boundaries explicit: retrieval returns evidence, and context pack building consumes evidence.
|
|
109
73
|
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
style PluggableExtractionPipeline fill:#ffffff,stroke:#5e35b1,stroke-dasharray:6 3,stroke-width:2px,color:#111111
|
|
127
|
-
style PluggableRetrievalBackend fill:#ffffff,stroke:#1e88e5,stroke-dasharray:6 3,stroke-width:2px,color:#111111
|
|
128
|
-
style YourCode fill:#ffffff,stroke:#d81b60,stroke-width:2px,color:#111111
|
|
129
|
-
style BackendIngestionIndexing fill:#ffffff,stroke:#cfd8dc,color:#111111
|
|
130
|
-
style BackendRetrievalGeneration fill:#ffffff,stroke:#cfd8dc,color:#111111
|
|
131
|
-
|
|
132
|
-
style Raw fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
133
|
-
style Catalog fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
134
|
-
style ExtractedText fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
135
|
-
style ExtractionRun fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
136
|
-
style BackendIndex fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
137
|
-
style Run fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
138
|
-
style Evidence fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
139
|
-
style Context fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
140
|
-
style Answer fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
141
|
-
style Source fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
142
|
-
|
|
143
|
-
style Ingest fill:#eceff1,stroke:#90a4ae,color:#111111
|
|
144
|
-
style Extract fill:#eceff1,stroke:#90a4ae,color:#111111
|
|
145
|
-
style Build fill:#eceff1,stroke:#90a4ae,color:#111111
|
|
146
|
-
style Query fill:#eceff1,stroke:#90a4ae,color:#111111
|
|
147
|
-
style Model fill:#eceff1,stroke:#90a4ae,color:#111111
|
|
148
|
-
end
|
|
149
|
-
|
|
150
|
-
style Legend fill:#ffffff,stroke:#ffffff,color:#111111
|
|
151
|
-
style Main fill:#ffffff,stroke:#ffffff,color:#111111
|
|
152
|
-
style LegendArtifact fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
153
|
-
style LegendStep fill:#eceff1,stroke:#90a4ae,color:#111111
|
|
74
|
+
```mermaid
|
|
75
|
+
%%{init: {"theme": "base", "themeVariables": {"primaryColor": "#f3e5f5", "primaryTextColor": "#111111", "primaryBorderColor": "#8e24aa", "lineColor": "#90a4ae", "secondaryColor": "#eceff1", "tertiaryColor": "#ffffff", "noteBkgColor": "#ffffff", "noteTextColor": "#111111", "actorBkg": "#f3e5f5", "actorBorder": "#8e24aa", "actorTextColor": "#111111"}}}%%
|
|
76
|
+
sequenceDiagram
|
|
77
|
+
participant User
|
|
78
|
+
participant App as Your assistant code
|
|
79
|
+
participant Bib as Biblicus
|
|
80
|
+
participant LLM as Large language model
|
|
81
|
+
|
|
82
|
+
User->>App: request
|
|
83
|
+
App->>Bib: query retrieval
|
|
84
|
+
Bib-->>App: retrieval result evidence JSON
|
|
85
|
+
App->>Bib: build context pack from evidence
|
|
86
|
+
Bib-->>App: context pack text
|
|
87
|
+
App->>LLM: context pack plus prompt
|
|
88
|
+
LLM-->>App: response draft
|
|
89
|
+
App-->>User: response
|
|
154
90
|
```
|
|
155
91
|
|
|
156
92
|
## Practical value
|
|
@@ -217,6 +153,216 @@ biblicus crawl --corpus corpora/example \\
|
|
|
217
153
|
--tag crawled
|
|
218
154
|
```
|
|
219
155
|
|
|
156
|
+
## End-to-end example: evidence to assistant context
|
|
157
|
+
|
|
158
|
+
The command-line interface returns JavaScript Object Notation by default. This makes it easy to use Biblicus in scripts and to treat retrieval as a deterministic, testable step.
|
|
159
|
+
|
|
160
|
+
Start with a few short “memories” from a chat system. Each memory is stored as a normal item in the corpus.
|
|
161
|
+
|
|
162
|
+
```python
|
|
163
|
+
from biblicus.backends import get_backend
|
|
164
|
+
from biblicus.context import ContextPackPolicy, TokenBudget, build_context_pack, fit_context_pack_to_token_budget
|
|
165
|
+
from biblicus.corpus import Corpus
|
|
166
|
+
from biblicus.models import QueryBudget
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
corpus = Corpus.init("corpora/story")
|
|
170
|
+
|
|
171
|
+
notes = [
|
|
172
|
+
("User name", "The user's name is Tactus Maximus."),
|
|
173
|
+
("Button style preference", "Primary button style preference: the user's favorite color is magenta."),
|
|
174
|
+
("Style preference", "The user prefers concise answers."),
|
|
175
|
+
("Language preference", "The user dislikes idioms and abbreviations."),
|
|
176
|
+
("Engineering preference", "The user likes code that is over-documented and behavior-driven."),
|
|
177
|
+
]
|
|
178
|
+
for note_title, note_text in notes:
|
|
179
|
+
corpus.ingest_note(note_text, title=note_title, tags=["memory"])
|
|
180
|
+
|
|
181
|
+
backend = get_backend("scan")
|
|
182
|
+
run = backend.build_run(corpus, recipe_name="Story demo", config={})
|
|
183
|
+
budget = QueryBudget(max_total_items=5, max_total_characters=2000, max_items_per_source=None)
|
|
184
|
+
result = backend.query(
|
|
185
|
+
corpus,
|
|
186
|
+
run=run,
|
|
187
|
+
query_text="Primary button style preference",
|
|
188
|
+
budget=budget,
|
|
189
|
+
)
|
|
190
|
+
|
|
191
|
+
policy = ContextPackPolicy(join_with="\n\n")
|
|
192
|
+
context_pack = build_context_pack(result, policy=policy)
|
|
193
|
+
context_pack = fit_context_pack_to_token_budget(
|
|
194
|
+
context_pack,
|
|
195
|
+
policy=policy,
|
|
196
|
+
token_budget=TokenBudget(max_tokens=60),
|
|
197
|
+
)
|
|
198
|
+
print(context_pack.text)
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
If you want a runnable version of this story, use the script at `scripts/readme_end_to_end_demo.py`.
|
|
202
|
+
|
|
203
|
+
If you prefer the command-line interface, here is the same flow in compressed form:
|
|
204
|
+
|
|
205
|
+
```
|
|
206
|
+
biblicus init corpora/story
|
|
207
|
+
biblicus ingest --corpus corpora/story --stdin --title "User name" --tag memory <<< "The user's name is Tactus Maximus."
|
|
208
|
+
biblicus ingest --corpus corpora/story --stdin --title "Button style preference" --tag memory <<< "Primary button style preference: the user's favorite color is magenta."
|
|
209
|
+
biblicus ingest --corpus corpora/story --stdin --title "Style preference" --tag memory <<< "The user prefers concise answers."
|
|
210
|
+
biblicus ingest --corpus corpora/story --stdin --title "Language preference" --tag memory <<< "The user dislikes idioms and abbreviations."
|
|
211
|
+
biblicus ingest --corpus corpora/story --stdin --title "Engineering preference" --tag memory <<< "The user likes code that is over-documented and behavior-driven."
|
|
212
|
+
biblicus build --corpus corpora/story --backend scan
|
|
213
|
+
biblicus query --corpus corpora/story --query "Primary button style preference"
|
|
214
|
+
```
|
|
215
|
+
|
|
216
|
+
Example output from the `biblicus query` command:
|
|
217
|
+
|
|
218
|
+
```json
|
|
219
|
+
{
|
|
220
|
+
"query_text": "Primary button style preference",
|
|
221
|
+
"budget": {
|
|
222
|
+
"max_total_items": 5,
|
|
223
|
+
"max_total_characters": 2000,
|
|
224
|
+
"max_items_per_source": null
|
|
225
|
+
},
|
|
226
|
+
"run_id": "RUN_ID",
|
|
227
|
+
"recipe_id": "RECIPE_ID",
|
|
228
|
+
"backend_id": "scan",
|
|
229
|
+
"generated_at": "2026-01-29T00:00:00.000000Z",
|
|
230
|
+
"evidence": [
|
|
231
|
+
{
|
|
232
|
+
"item_id": "ITEM_ID",
|
|
233
|
+
"source_uri": "text",
|
|
234
|
+
"media_type": "text/markdown",
|
|
235
|
+
"score": 1.0,
|
|
236
|
+
"rank": 1,
|
|
237
|
+
"text": "Primary button style preference: the user's favorite color is magenta.",
|
|
238
|
+
"content_ref": null,
|
|
239
|
+
"span_start": null,
|
|
240
|
+
"span_end": null,
|
|
241
|
+
"stage": "scan",
|
|
242
|
+
"recipe_id": "RECIPE_ID",
|
|
243
|
+
"run_id": "RUN_ID",
|
|
244
|
+
"hash": null
|
|
245
|
+
}
|
|
246
|
+
],
|
|
247
|
+
"stats": {}
|
|
248
|
+
}
|
|
249
|
+
```
|
|
250
|
+
|
|
251
|
+
Evidence is the output contract. Your code decides how to convert evidence into assistant context.
|
|
252
|
+
|
|
253
|
+
### Turn evidence into a context pack
|
|
254
|
+
|
|
255
|
+
A context pack is a readable text block you send to a model. There is no single correct format. Treat it as a policy surface you can iterate on.
|
|
256
|
+
|
|
257
|
+
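Here is a minimal example that builds a context pack from evidence. It reuses `result`, the retrieval result returned by `backend.query` in the end-to-end example above: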
|
|
258
|
+
|
|
259
|
+
```python
|
|
260
|
+
from biblicus.context import ContextPackPolicy, build_context_pack
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
policy = ContextPackPolicy(
|
|
264
|
+
join_with="\n\n",
|
|
265
|
+
)
|
|
266
|
+
context_pack = build_context_pack(result, policy=policy)
|
|
267
|
+
print(context_pack.text)
|
|
268
|
+
```
|
|
269
|
+
|
|
270
|
+
Example context pack output:
|
|
271
|
+
|
|
272
|
+
```text
|
|
273
|
+
Primary button style preference: the user's favorite color is magenta.
|
|
274
|
+
```
|
|
275
|
+
|
|
276
|
+
You can also build a context pack from the command-line interface by piping the retrieval result:
|
|
277
|
+
|
|
278
|
+
```
|
|
279
|
+
biblicus query --corpus corpora/story --query "Primary button style preference" \
|
|
280
|
+
| biblicus context-pack build
|
|
281
|
+
```
|
|
282
|
+
|
|
283
|
+
Most production systems also apply a budget when building context. If you want a precise token budget, the budgeting logic needs a specific tokenizer and should be treated as its own stage.
|
|
284
|
+
|
|
285
|
+
## Pipeline diagram
|
|
286
|
+
|
|
287
|
+
This diagram shows how a corpus becomes evidence for your assistant. Your code decides how to turn evidence into context and how to call a model.
|
|
288
|
+
|
|
289
|
+
```mermaid
|
|
290
|
+
%%{init: {"theme": "base", "themeVariables": {"primaryColor": "#f3e5f5", "primaryTextColor": "#111111", "primaryBorderColor": "#8e24aa", "lineColor": "#90a4ae", "secondaryColor": "#eceff1", "tertiaryColor": "#ffffff"}, "flowchart": {"useMaxWidth": true, "nodeSpacing": 18, "rankSpacing": 22}}}%%
|
|
291
|
+
flowchart TB
|
|
292
|
+
subgraph Legend[Legend]
|
|
293
|
+
direction LR
|
|
294
|
+
LegendArtifact[Stored artifact or evidence]
|
|
295
|
+
LegendStep[Step]
|
|
296
|
+
LegendArtifact --- LegendStep
|
|
297
|
+
end
|
|
298
|
+
|
|
299
|
+
subgraph Main[" "]
|
|
300
|
+
direction TB
|
|
301
|
+
|
|
302
|
+
subgraph Pipeline[" "]
|
|
303
|
+
direction TB
|
|
304
|
+
|
|
305
|
+
subgraph RowStable[Stable core]
|
|
306
|
+
direction TB
|
|
307
|
+
Source[Source items] --> Ingest[Ingest] --> Raw[Raw item files] --> Catalog[Catalog file]
|
|
308
|
+
end
|
|
309
|
+
|
|
310
|
+
subgraph RowExtraction[Pluggable: extraction pipeline]
|
|
311
|
+
direction TB
|
|
312
|
+
Catalog --> Extract[Extract pipeline] --> ExtractedText[Extracted text artifacts] --> ExtractionRun[Extraction run manifest]
|
|
313
|
+
end
|
|
314
|
+
|
|
315
|
+
subgraph RowRetrieval[Pluggable: retrieval backend]
|
|
316
|
+
direction TB
|
|
317
|
+
ExtractionRun --> Build[Build run] --> BackendIndex[Backend index] --> Run[Run manifest] --> Retrieve[Retrieve] --> Rerank[Rerank optional] --> Filter[Filter optional] --> Evidence[Evidence]
|
|
318
|
+
end
|
|
319
|
+
|
|
320
|
+
subgraph RowContext[Context]
|
|
321
|
+
direction TB
|
|
322
|
+
Evidence --> ContextPack[Context pack] --> FitTokens[Fit tokens optional] --> Context[Assistant context]
|
|
323
|
+
end
|
|
324
|
+
|
|
325
|
+
subgraph RowYourCode[Your code]
|
|
326
|
+
direction TB
|
|
327
|
+
Context --> Model[Large language model call] --> Answer[Answer]
|
|
328
|
+
end
|
|
329
|
+
end
|
|
330
|
+
|
|
331
|
+
style RowStable fill:#ffffff,stroke:#8e24aa,stroke-width:2px,color:#111111
|
|
332
|
+
style RowExtraction fill:#ffffff,stroke:#5e35b1,stroke-dasharray:6 3,stroke-width:2px,color:#111111
|
|
333
|
+
style RowRetrieval fill:#ffffff,stroke:#1e88e5,stroke-dasharray:6 3,stroke-width:2px,color:#111111
|
|
334
|
+
style RowContext fill:#ffffff,stroke:#7b1fa2,stroke-width:2px,color:#111111
|
|
335
|
+
style RowYourCode fill:#ffffff,stroke:#d81b60,stroke-width:2px,color:#111111
|
|
336
|
+
|
|
337
|
+
style Raw fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
338
|
+
style Catalog fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
339
|
+
style ExtractedText fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
340
|
+
style ExtractionRun fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
341
|
+
style BackendIndex fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
342
|
+
style Run fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
343
|
+
style Evidence fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
344
|
+
style ContextPack fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
345
|
+
style Context fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
346
|
+
style Answer fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
347
|
+
style Source fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
348
|
+
|
|
349
|
+
style Ingest fill:#eceff1,stroke:#90a4ae,color:#111111
|
|
350
|
+
style Extract fill:#eceff1,stroke:#90a4ae,color:#111111
|
|
351
|
+
style Build fill:#eceff1,stroke:#90a4ae,color:#111111
|
|
352
|
+
style Retrieve fill:#eceff1,stroke:#90a4ae,color:#111111
|
|
353
|
+
style Rerank fill:#eceff1,stroke:#90a4ae,color:#111111
|
|
354
|
+
style Filter fill:#eceff1,stroke:#90a4ae,color:#111111
|
|
355
|
+
style FitTokens fill:#eceff1,stroke:#90a4ae,color:#111111
|
|
356
|
+
style Model fill:#eceff1,stroke:#90a4ae,color:#111111
|
|
357
|
+
end
|
|
358
|
+
|
|
359
|
+
style Legend fill:#ffffff,stroke:#ffffff,color:#111111
|
|
360
|
+
style Main fill:#ffffff,stroke:#ffffff,color:#111111
|
|
361
|
+
style Pipeline fill:#ffffff,stroke:#ffffff,color:#111111
|
|
362
|
+
style LegendArtifact fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
363
|
+
style LegendStep fill:#eceff1,stroke:#90a4ae,color:#111111
|
|
364
|
+
```
|
|
365
|
+
|
|
220
366
|
## Python usage
|
|
221
367
|
|
|
222
368
|
From Python, the same flow is available through the Corpus class and backend interfaces. The public surface area is small on purpose.
|
|
@@ -229,30 +375,28 @@ From Python, the same flow is available through the Corpus class and backend int
|
|
|
229
375
|
- Query a run with `backend.query`.
|
|
230
376
|
- Evaluate with `evaluate_run`.
|
|
231
377
|
|
|
232
|
-
## How it fits into an assistant
|
|
233
|
-
|
|
234
|
-
In an assistant system, retrieval usually produces context for a model call. This library treats evidence as the primary output so you can decide how to use it.
|
|
235
|
-
|
|
236
|
-
- Use a corpus as the source of truth for raw items.
|
|
237
|
-
- Use a backend run to build any derived artifacts needed for retrieval.
|
|
238
|
-
- Use queries to obtain evidence objects.
|
|
239
|
-
- Convert evidence into the format your framework expects, such as message content, tool output, or citations.
|
|
240
|
-
|
|
241
378
|
## Learn more
|
|
242
379
|
|
|
243
380
|
Full documentation is published on GitHub Pages: https://anthusai.github.io/Biblicus/
|
|
244
381
|
|
|
245
|
-
The documents below
|
|
382
|
+
The documents below follow the pipeline from raw items to model context:
|
|
246
383
|
|
|
247
|
-
- [Architecture][architecture]
|
|
248
|
-
- [Roadmap][roadmap]
|
|
249
|
-
- [Feature index][feature-index]
|
|
250
384
|
- [Corpus][corpus]
|
|
251
385
|
- [Text extraction][text-extraction]
|
|
252
|
-
- [User configuration][user-configuration]
|
|
253
386
|
- [Backends][backends]
|
|
387
|
+
- [Context packs][context-packs]
|
|
388
|
+
- [Testing and evaluation][testing]
|
|
389
|
+
|
|
390
|
+
Reference:
|
|
391
|
+
|
|
254
392
|
- [Demos][demos]
|
|
255
|
-
- [
|
|
393
|
+
- [User configuration][user-configuration]
|
|
394
|
+
|
|
395
|
+
Design and implementation map:
|
|
396
|
+
|
|
397
|
+
- [Feature index][feature-index]
|
|
398
|
+
- [Roadmap][roadmap]
|
|
399
|
+
- [Architecture][architecture]
|
|
256
400
|
|
|
257
401
|
## Metadata and catalog
|
|
258
402
|
|
|
@@ -344,6 +488,7 @@ License terms are in `LICENSE`.
|
|
|
344
488
|
[text-extraction]: docs/EXTRACTION.md
|
|
345
489
|
[user-configuration]: docs/USER_CONFIGURATION.md
|
|
346
490
|
[backends]: docs/BACKENDS.md
|
|
491
|
+
[context-packs]: docs/CONTEXT_PACK.md
|
|
347
492
|
[demos]: docs/DEMOS.md
|
|
348
493
|
[testing]: docs/TESTING.md
|
|
349
494
|
|
|
@@ -1,11 +1,13 @@
|
|
|
1
|
-
biblicus/__init__.py,sha256=
|
|
1
|
+
biblicus/__init__.py,sha256=9YH3nGunYPrO2wrwwya94mgHqWXnGOiIwDCB1THgGqo,432
|
|
2
2
|
biblicus/__main__.py,sha256=ipfkUoTlocVnrQDM69C7TeBqQxmHVeiWMRaT3G9rtnk,117
|
|
3
|
-
biblicus/cli.py,sha256=
|
|
3
|
+
biblicus/cli.py,sha256=hBau464XNdSGdWeOCE2Q7dm0P8I4sR0W-NgVT0wPmh4,27724
|
|
4
4
|
biblicus/constants.py,sha256=R6fZDoLVMCwgKvTaxEx7G0CstwHGaUTlW9MsmNLDZ44,269
|
|
5
|
+
biblicus/context.py,sha256=qnT9CH7_ldoPcg-rxnUOtRhheOmpDAbF8uqhf8OdjC4,5832
|
|
5
6
|
biblicus/corpus.py,sha256=gF1RNl6fdz7wplzpHEIkEBkhYxHgKTKguBR_kD9IgUw,54109
|
|
6
7
|
biblicus/crawl.py,sha256=n8rXBMnziBK9vtKQQCXYOpBzqsPCswj2PzVJUb370KY,6250
|
|
7
8
|
biblicus/errors.py,sha256=uMajd5DvgnJ_-jq5sbeom1GV8DPUc-kojBaECFi6CsY,467
|
|
8
9
|
biblicus/evaluation.py,sha256=5xWpb-8f49Osh9aHzo1ab3AXOmls3Imc5rdnEC0pN-8,8143
|
|
10
|
+
biblicus/evidence_processing.py,sha256=EMv1AkV_Eufk-poBz9nRR1dZgC-QewvI-NrULBUGVGA,6074
|
|
9
11
|
biblicus/extraction.py,sha256=VEjBjIpaBboftGgEcpDj7z7um41e5uDZpP_7acQg7fw,19448
|
|
10
12
|
biblicus/frontmatter.py,sha256=JOGjIDzbbOkebQw2RzA-3WDVMAMtJta2INjS4e7-LMg,2463
|
|
11
13
|
biblicus/hook_logging.py,sha256=IMvde-JhVWrx9tNz3eDJ1CY_rr5Sj7DZ2YNomYCZbz0,5366
|
|
@@ -37,9 +39,9 @@ biblicus/extractors/rapidocr_text.py,sha256=OMAuZealLSSTFVVmBalT-AFJy2pEpHyyvpuW
|
|
|
37
39
|
biblicus/extractors/select_longest_text.py,sha256=wRveXAfYLdj7CpGuo4RoD7zE6SIfylRCbv40z2azO0k,3702
|
|
38
40
|
biblicus/extractors/select_text.py,sha256=w0ATmDy3tWWbOObzW87jGZuHbgXllUhotX5XyySLs-o,3395
|
|
39
41
|
biblicus/extractors/unstructured_text.py,sha256=l2S_wD_htu7ZHoJQNQtP-kGlEgOeKV_w2IzAC93lePE,3564
|
|
40
|
-
biblicus-0.
|
|
41
|
-
biblicus-0.
|
|
42
|
-
biblicus-0.
|
|
43
|
-
biblicus-0.
|
|
44
|
-
biblicus-0.
|
|
45
|
-
biblicus-0.
|
|
42
|
+
biblicus-0.5.0.dist-info/licenses/LICENSE,sha256=lw44GXFG_Q0fS8m5VoEvv_xtdBXK26pBcbSPUCXee_Q,1078
|
|
43
|
+
biblicus-0.5.0.dist-info/METADATA,sha256=SHMtWua4egS09DGjX-YZviQOXojtkVvgrisgPmnlSnk,19666
|
|
44
|
+
biblicus-0.5.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
45
|
+
biblicus-0.5.0.dist-info/entry_points.txt,sha256=BZmO4H8Uz00fyi1RAFryOCGfZgX7eHWkY2NE-G54U5A,47
|
|
46
|
+
biblicus-0.5.0.dist-info/top_level.txt,sha256=sUD_XVZwDxZ29-FBv1MknTGh4mgDXznGuP28KJY_WKc,9
|
|
47
|
+
biblicus-0.5.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|