biblicus 0.15.1__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +21 -1
- biblicus/analysis/markov.py +35 -3
- biblicus/backends/__init__.py +6 -2
- biblicus/backends/embedding_index_common.py +334 -0
- biblicus/backends/embedding_index_file.py +272 -0
- biblicus/backends/embedding_index_inmemory.py +270 -0
- biblicus/backends/hybrid.py +8 -5
- biblicus/backends/scan.py +1 -0
- biblicus/backends/sqlite_full_text_search.py +1 -1
- biblicus/backends/{vector.py → tf_vector.py} +28 -35
- biblicus/chunking.py +396 -0
- biblicus/cli.py +75 -25
- biblicus/context.py +27 -12
- biblicus/context_engine/__init__.py +53 -0
- biblicus/context_engine/assembler.py +1060 -0
- biblicus/context_engine/compaction.py +110 -0
- biblicus/context_engine/models.py +423 -0
- biblicus/context_engine/retrieval.py +129 -0
- biblicus/corpus.py +117 -16
- biblicus/embedding_providers.py +122 -0
- biblicus/errors.py +24 -0
- biblicus/frontmatter.py +2 -0
- biblicus/knowledge_base.py +1 -1
- biblicus/models.py +15 -3
- biblicus/retrieval.py +7 -2
- biblicus/sources.py +46 -11
- biblicus/text/link.py +6 -0
- biblicus/text/prompts.py +2 -0
- {biblicus-0.15.1.dist-info → biblicus-1.0.0.dist-info}/METADATA +4 -3
- {biblicus-0.15.1.dist-info → biblicus-1.0.0.dist-info}/RECORD +34 -24
- {biblicus-0.15.1.dist-info → biblicus-1.0.0.dist-info}/WHEEL +0 -0
- {biblicus-0.15.1.dist-info → biblicus-1.0.0.dist-info}/entry_points.txt +0 -0
- {biblicus-0.15.1.dist-info → biblicus-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-0.15.1.dist-info → biblicus-1.0.0.dist-info}/top_level.txt +0 -0
biblicus/cli.py
CHANGED
|
@@ -8,7 +8,7 @@ import argparse
|
|
|
8
8
|
import json
|
|
9
9
|
import sys
|
|
10
10
|
from pathlib import Path
|
|
11
|
-
from typing import Dict, List, Optional
|
|
11
|
+
from typing import Dict, Iterable, List, Optional
|
|
12
12
|
|
|
13
13
|
from pydantic import ValidationError
|
|
14
14
|
|
|
@@ -24,7 +24,7 @@ from .context import (
|
|
|
24
24
|
)
|
|
25
25
|
from .corpus import Corpus
|
|
26
26
|
from .crawl import CrawlRequest, crawl_into_corpus
|
|
27
|
-
from .errors import ExtractionRunFatalError
|
|
27
|
+
from .errors import ExtractionRunFatalError, IngestCollisionError
|
|
28
28
|
from .evaluation import evaluate_run, load_dataset
|
|
29
29
|
from .evidence_processing import apply_evidence_filter, apply_evidence_reranker
|
|
30
30
|
from .extraction import build_extraction_run
|
|
@@ -117,18 +117,28 @@ def cmd_ingest(arguments: argparse.Namespace) -> int:
|
|
|
117
117
|
|
|
118
118
|
results = []
|
|
119
119
|
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
120
|
+
try:
|
|
121
|
+
if arguments.note is not None or arguments.stdin:
|
|
122
|
+
text = arguments.note if arguments.note is not None else sys.stdin.read()
|
|
123
|
+
ingest_result = corpus.ingest_note(
|
|
124
|
+
text,
|
|
125
|
+
title=arguments.title,
|
|
126
|
+
tags=tags,
|
|
127
|
+
source_uri=None if arguments.stdin else None,
|
|
128
|
+
)
|
|
129
|
+
results.append(ingest_result)
|
|
130
|
+
|
|
131
|
+
for source_path in arguments.files or []:
|
|
132
|
+
results.append(corpus.ingest_source(source_path, tags=tags))
|
|
133
|
+
except IngestCollisionError as error:
|
|
134
|
+
print(
|
|
135
|
+
"Ingest failed: source already ingested\n"
|
|
136
|
+
f"source_uri: {error.source_uri}\n"
|
|
137
|
+
f"existing_item_id: {error.existing_item_id}\n"
|
|
138
|
+
f"existing_relpath: {error.existing_relpath}",
|
|
139
|
+
file=sys.stderr,
|
|
127
140
|
)
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
for source_path in arguments.files or []:
|
|
131
|
-
results.append(corpus.ingest_source(source_path, tags=tags))
|
|
141
|
+
return 3
|
|
132
142
|
|
|
133
143
|
if not results:
|
|
134
144
|
print("Nothing to ingest: provide file paths, --note, or --stdin", file=sys.stderr)
|
|
@@ -239,15 +249,23 @@ def cmd_purge(arguments: argparse.Namespace) -> int:
|
|
|
239
249
|
return 0
|
|
240
250
|
|
|
241
251
|
|
|
242
|
-
def _parse_config_pairs(pairs: Optional[List[str]]) -> Dict[str, object]:
|
|
252
|
+
def _parse_config_pairs(pairs: Optional[Iterable[str]]) -> Dict[str, object]:
|
|
243
253
|
"""
|
|
244
|
-
Parse
|
|
254
|
+
Parse key=value pairs into a configuration mapping.
|
|
245
255
|
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
256
|
+
This is used by a few command-line options that accept repeated key=value items.
|
|
257
|
+
Values are coerced to useful types in a predictable way:
|
|
258
|
+
|
|
259
|
+
- JSON objects/arrays (leading ``{`` or ``[``) are parsed as JSON.
|
|
260
|
+
- Whole numbers are parsed as integers.
|
|
261
|
+
- Other numeric forms are parsed as floats.
|
|
262
|
+
- Everything else remains a string.
|
|
263
|
+
|
|
264
|
+
:param pairs: Iterable of key=value strings.
|
|
265
|
+
:type pairs: Iterable[str] or None
|
|
266
|
+
:return: Parsed configuration mapping.
|
|
249
267
|
:rtype: dict[str, object]
|
|
250
|
-
:raises ValueError: If any entry is not key=value.
|
|
268
|
+
:raises ValueError: If any entry is not a key=value pair or values are invalid.
|
|
251
269
|
"""
|
|
252
270
|
config: Dict[str, object] = {}
|
|
253
271
|
for item in pairs or []:
|
|
@@ -257,8 +275,14 @@ def _parse_config_pairs(pairs: Optional[List[str]]) -> Dict[str, object]:
|
|
|
257
275
|
key = key.strip()
|
|
258
276
|
if not key:
|
|
259
277
|
raise ValueError("Config keys must be non-empty")
|
|
278
|
+
raw = raw.strip()
|
|
260
279
|
value: object = raw
|
|
261
|
-
if raw.
|
|
280
|
+
if raw.startswith("{") or raw.startswith("["):
|
|
281
|
+
try:
|
|
282
|
+
value = json.loads(raw)
|
|
283
|
+
except json.JSONDecodeError as exc:
|
|
284
|
+
raise ValueError(f"Config value must be valid JSON for key {key!r}") from exc
|
|
285
|
+
elif raw.isdigit():
|
|
262
286
|
value = int(raw)
|
|
263
287
|
else:
|
|
264
288
|
try:
|
|
@@ -359,7 +383,8 @@ def _budget_from_args(arguments: argparse.Namespace) -> QueryBudget:
|
|
|
359
383
|
"""
|
|
360
384
|
return QueryBudget(
|
|
361
385
|
max_total_items=arguments.max_total_items,
|
|
362
|
-
|
|
386
|
+
offset=getattr(arguments, "offset", 0),
|
|
387
|
+
maximum_total_characters=arguments.maximum_total_characters,
|
|
363
388
|
max_items_per_source=arguments.max_items_per_source,
|
|
364
389
|
)
|
|
365
390
|
|
|
@@ -373,13 +398,26 @@ def cmd_build(arguments: argparse.Namespace) -> int:
|
|
|
373
398
|
:return: Exit code.
|
|
374
399
|
:rtype: int
|
|
375
400
|
"""
|
|
401
|
+
from .recipes import apply_dotted_overrides, load_recipe_view, parse_dotted_overrides
|
|
402
|
+
|
|
376
403
|
corpus = (
|
|
377
404
|
Corpus.open(arguments.corpus)
|
|
378
405
|
if getattr(arguments, "corpus", None)
|
|
379
406
|
else Corpus.find(Path.cwd())
|
|
380
407
|
)
|
|
381
408
|
backend = get_backend(arguments.backend)
|
|
382
|
-
|
|
409
|
+
|
|
410
|
+
base_config: Dict[str, object] = {}
|
|
411
|
+
if getattr(arguments, "recipe", None):
|
|
412
|
+
base_config = load_recipe_view(
|
|
413
|
+
arguments.recipe,
|
|
414
|
+
recipe_label="Recipe file",
|
|
415
|
+
mapping_error_message="Retrieval build recipe must be a mapping/object",
|
|
416
|
+
)
|
|
417
|
+
|
|
418
|
+
overrides = parse_dotted_overrides(arguments.config)
|
|
419
|
+
config = apply_dotted_overrides(base_config, overrides)
|
|
420
|
+
|
|
383
421
|
run = backend.build_run(corpus, recipe_name=arguments.recipe_name, config=config)
|
|
384
422
|
print(run.model_dump_json(indent=2))
|
|
385
423
|
return 0
|
|
@@ -947,11 +985,17 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
947
985
|
help="Backend identifier (for example, scan, sqlite-full-text-search).",
|
|
948
986
|
)
|
|
949
987
|
p_build.add_argument("--recipe-name", default="default", help="Human-readable recipe name.")
|
|
988
|
+
p_build.add_argument(
|
|
989
|
+
"--recipe",
|
|
990
|
+
default=None,
|
|
991
|
+
action="append",
|
|
992
|
+
help="Path to YAML recipe file (repeatable). If provided, recipes are composed in precedence order.",
|
|
993
|
+
)
|
|
950
994
|
p_build.add_argument(
|
|
951
995
|
"--config",
|
|
952
996
|
action="append",
|
|
953
997
|
default=None,
|
|
954
|
-
help="Backend config as key=value (repeatable).",
|
|
998
|
+
help="Backend config override as key=value (repeatable). Dotted keys create nested config mappings.",
|
|
955
999
|
)
|
|
956
1000
|
p_build.set_defaults(func=cmd_build)
|
|
957
1001
|
|
|
@@ -1030,8 +1074,14 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
1030
1074
|
p_query.add_argument("--run", default=None, help="Run identifier (defaults to latest run).")
|
|
1031
1075
|
p_query.add_argument("--backend", default=None, help="Validate backend identifier.")
|
|
1032
1076
|
p_query.add_argument("--query", default=None, help="Query text (defaults to standard input).")
|
|
1077
|
+
p_query.add_argument(
|
|
1078
|
+
"--offset",
|
|
1079
|
+
type=int,
|
|
1080
|
+
default=0,
|
|
1081
|
+
help="Skip this many ranked candidates before selecting evidence (pagination).",
|
|
1082
|
+
)
|
|
1033
1083
|
p_query.add_argument("--max-total-items", type=int, default=5)
|
|
1034
|
-
p_query.add_argument("--
|
|
1084
|
+
p_query.add_argument("--maximum-total-characters", type=int, default=2000)
|
|
1035
1085
|
p_query.add_argument("--max-items-per-source", type=int, default=5)
|
|
1036
1086
|
p_query.add_argument(
|
|
1037
1087
|
"--reranker-id",
|
|
@@ -1091,7 +1141,7 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
1091
1141
|
help="Path to dataset JavaScript Object Notation file.",
|
|
1092
1142
|
)
|
|
1093
1143
|
p_eval.add_argument("--max-total-items", type=int, default=5)
|
|
1094
|
-
p_eval.add_argument("--
|
|
1144
|
+
p_eval.add_argument("--maximum-total-characters", type=int, default=2000)
|
|
1095
1145
|
p_eval.add_argument("--max-items-per-source", type=int, default=5)
|
|
1096
1146
|
p_eval.set_defaults(func=cmd_eval)
|
|
1097
1147
|
|
biblicus/context.py
CHANGED
|
@@ -25,6 +25,8 @@ class ContextPackPolicy(BaseModel):
|
|
|
25
25
|
:vartype ordering: str
|
|
26
26
|
:ivar include_metadata: Whether to include evidence metadata lines in each block.
|
|
27
27
|
:vartype include_metadata: bool
|
|
28
|
+
:ivar metadata_fields: Optional evidence metadata fields to include.
|
|
29
|
+
:vartype metadata_fields: list[str] or None
|
|
28
30
|
"""
|
|
29
31
|
|
|
30
32
|
model_config = ConfigDict(extra="forbid")
|
|
@@ -32,6 +34,7 @@ class ContextPackPolicy(BaseModel):
|
|
|
32
34
|
join_with: str = Field(default="\n\n")
|
|
33
35
|
ordering: str = Field(default="rank", min_length=1)
|
|
34
36
|
include_metadata: bool = Field(default=False)
|
|
37
|
+
metadata_fields: Optional[List[str]] = None
|
|
35
38
|
|
|
36
39
|
|
|
37
40
|
class ContextPack(BaseModel):
|
|
@@ -132,7 +135,9 @@ def build_context_pack(result: RetrievalResult, *, policy: ContextPackPolicy) ->
|
|
|
132
135
|
trimmed_text = evidence.text.strip()
|
|
133
136
|
if not trimmed_text:
|
|
134
137
|
continue
|
|
135
|
-
metadata =
|
|
138
|
+
metadata = (
|
|
139
|
+
_metadata_for_evidence(evidence, policy=policy) if policy.include_metadata else None
|
|
140
|
+
)
|
|
136
141
|
block_text = _format_block_text(trimmed_text, metadata=metadata)
|
|
137
142
|
selected_blocks.append(
|
|
138
143
|
ContextPackBlock(
|
|
@@ -276,7 +281,11 @@ def _order_evidence(
|
|
|
276
281
|
raise ValueError(f"Unknown context pack ordering: {policy.ordering}")
|
|
277
282
|
|
|
278
283
|
|
|
279
|
-
def _metadata_for_evidence(evidence: Evidence) -> Dict[str, object]:
|
|
284
|
+
def _metadata_for_evidence(
|
|
285
|
+
evidence: Evidence,
|
|
286
|
+
*,
|
|
287
|
+
policy: ContextPackPolicy,
|
|
288
|
+
) -> Dict[str, object]:
|
|
280
289
|
"""
|
|
281
290
|
Build metadata for a context pack block.
|
|
282
291
|
|
|
@@ -285,12 +294,19 @@ def _metadata_for_evidence(evidence: Evidence) -> Dict[str, object]:
|
|
|
285
294
|
:return: Metadata mapping.
|
|
286
295
|
:rtype: dict[str, object]
|
|
287
296
|
"""
|
|
288
|
-
|
|
297
|
+
metadata = {
|
|
289
298
|
"item_id": evidence.item_id,
|
|
290
299
|
"source_uri": evidence.source_uri or "none",
|
|
291
300
|
"score": evidence.score,
|
|
292
301
|
"stage": evidence.stage,
|
|
293
302
|
}
|
|
303
|
+
extra = evidence.metadata or {}
|
|
304
|
+
if policy.metadata_fields is not None:
|
|
305
|
+
extra = {key: extra.get(key) for key in policy.metadata_fields if key in extra}
|
|
306
|
+
for key, value in extra.items():
|
|
307
|
+
if key not in metadata:
|
|
308
|
+
metadata[key] = value
|
|
309
|
+
return metadata
|
|
294
310
|
|
|
295
311
|
|
|
296
312
|
def _format_block_text(text: str, *, metadata: Optional[Dict[str, object]]) -> str:
|
|
@@ -306,12 +322,11 @@ def _format_block_text(text: str, *, metadata: Optional[Dict[str, object]]) -> s
|
|
|
306
322
|
"""
|
|
307
323
|
if not metadata:
|
|
308
324
|
return text
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
return f"{metadata_lines}\n{text}"
|
|
325
|
+
ordered_keys = ["item_id", "source_uri", "score", "stage"]
|
|
326
|
+
metadata_lines = [f"{key}: {metadata[key]}" for key in ordered_keys if key in metadata]
|
|
327
|
+
for key in sorted(metadata.keys()):
|
|
328
|
+
if key in ordered_keys:
|
|
329
|
+
continue
|
|
330
|
+
metadata_lines.append(f"{key}: {metadata[key]}")
|
|
331
|
+
metadata_text = "\n".join(metadata_lines)
|
|
332
|
+
return f"{metadata_text}\n{text}"
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Public interface for the Biblicus Context Engine.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from .assembler import ContextAssembler, ContextAssemblyResult
|
|
6
|
+
from .compaction import BaseCompactor, CompactionRequest, SummaryCompactor, TruncateCompactor
|
|
7
|
+
from .models import (
|
|
8
|
+
AssistantMessageSpec,
|
|
9
|
+
CompactorDeclaration,
|
|
10
|
+
ContextBudgetSpec,
|
|
11
|
+
ContextDeclaration,
|
|
12
|
+
ContextExpansionSpec,
|
|
13
|
+
ContextInsertSpec,
|
|
14
|
+
ContextMessageSpec,
|
|
15
|
+
ContextPackBudgetSpec,
|
|
16
|
+
ContextPackSpec,
|
|
17
|
+
ContextPolicySpec,
|
|
18
|
+
ContextRetrieverRequest,
|
|
19
|
+
ContextTemplateSpec,
|
|
20
|
+
CorpusDeclaration,
|
|
21
|
+
HistoryInsertSpec,
|
|
22
|
+
RetrieverDeclaration,
|
|
23
|
+
SystemMessageSpec,
|
|
24
|
+
UserMessageSpec,
|
|
25
|
+
)
|
|
26
|
+
from .retrieval import retrieve_context_pack
|
|
27
|
+
|
|
28
|
+
__all__ = [
|
|
29
|
+
"ContextAssembler",
|
|
30
|
+
"ContextAssemblyResult",
|
|
31
|
+
"BaseCompactor",
|
|
32
|
+
"CompactionRequest",
|
|
33
|
+
"SummaryCompactor",
|
|
34
|
+
"TruncateCompactor",
|
|
35
|
+
"ContextBudgetSpec",
|
|
36
|
+
"ContextDeclaration",
|
|
37
|
+
"ContextExpansionSpec",
|
|
38
|
+
"ContextInsertSpec",
|
|
39
|
+
"ContextMessageSpec",
|
|
40
|
+
"ContextPackBudgetSpec",
|
|
41
|
+
"ContextPackSpec",
|
|
42
|
+
"ContextPolicySpec",
|
|
43
|
+
"ContextRetrieverRequest",
|
|
44
|
+
"ContextTemplateSpec",
|
|
45
|
+
"CorpusDeclaration",
|
|
46
|
+
"RetrieverDeclaration",
|
|
47
|
+
"CompactorDeclaration",
|
|
48
|
+
"HistoryInsertSpec",
|
|
49
|
+
"SystemMessageSpec",
|
|
50
|
+
"UserMessageSpec",
|
|
51
|
+
"AssistantMessageSpec",
|
|
52
|
+
"retrieve_context_pack",
|
|
53
|
+
]
|