biblicus 0.16.0__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +21 -1
- biblicus/backends/embedding_index_common.py +36 -3
- biblicus/backends/embedding_index_file.py +11 -5
- biblicus/backends/embedding_index_inmemory.py +14 -12
- biblicus/backends/hybrid.py +4 -3
- biblicus/backends/scan.py +1 -0
- biblicus/backends/tf_vector.py +17 -24
- biblicus/cli.py +25 -15
- biblicus/context.py +27 -12
- biblicus/context_engine/__init__.py +53 -0
- biblicus/context_engine/assembler.py +1060 -0
- biblicus/context_engine/compaction.py +110 -0
- biblicus/context_engine/models.py +423 -0
- biblicus/context_engine/retrieval.py +129 -0
- biblicus/corpus.py +117 -16
- biblicus/errors.py +24 -0
- biblicus/knowledge_base.py +1 -1
- biblicus/models.py +6 -3
- biblicus/retrieval.py +2 -2
- biblicus/sources.py +46 -11
- biblicus/text/link.py +6 -0
- biblicus/text/prompts.py +2 -0
- {biblicus-0.16.0.dist-info → biblicus-1.0.0.dist-info}/METADATA +3 -3
- {biblicus-0.16.0.dist-info → biblicus-1.0.0.dist-info}/RECORD +28 -23
- {biblicus-0.16.0.dist-info → biblicus-1.0.0.dist-info}/WHEEL +0 -0
- {biblicus-0.16.0.dist-info → biblicus-1.0.0.dist-info}/entry_points.txt +0 -0
- {biblicus-0.16.0.dist-info → biblicus-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-0.16.0.dist-info → biblicus-1.0.0.dist-info}/top_level.txt +0 -0
biblicus/__init__.py
CHANGED
|
@@ -2,6 +2,17 @@
|
|
|
2
2
|
Biblicus public package interface.
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
|
+
from .context_engine import (
|
|
6
|
+
ContextAssembler,
|
|
7
|
+
ContextBudgetSpec,
|
|
8
|
+
ContextDeclaration,
|
|
9
|
+
ContextExpansionSpec,
|
|
10
|
+
ContextPackBudgetSpec,
|
|
11
|
+
ContextPackSpec,
|
|
12
|
+
ContextPolicySpec,
|
|
13
|
+
ContextRetrieverRequest,
|
|
14
|
+
retrieve_context_pack,
|
|
15
|
+
)
|
|
5
16
|
from .corpus import Corpus
|
|
6
17
|
from .knowledge_base import KnowledgeBase
|
|
7
18
|
from .models import (
|
|
@@ -16,6 +27,15 @@ from .models import (
|
|
|
16
27
|
|
|
17
28
|
__all__ = [
|
|
18
29
|
"__version__",
|
|
30
|
+
"ContextAssembler",
|
|
31
|
+
"ContextBudgetSpec",
|
|
32
|
+
"ContextDeclaration",
|
|
33
|
+
"ContextExpansionSpec",
|
|
34
|
+
"ContextPackBudgetSpec",
|
|
35
|
+
"ContextPackSpec",
|
|
36
|
+
"ContextPolicySpec",
|
|
37
|
+
"ContextRetrieverRequest",
|
|
38
|
+
"retrieve_context_pack",
|
|
19
39
|
"Corpus",
|
|
20
40
|
"CorpusConfig",
|
|
21
41
|
"Evidence",
|
|
@@ -27,4 +47,4 @@ __all__ = [
|
|
|
27
47
|
"RetrievalRun",
|
|
28
48
|
]
|
|
29
49
|
|
|
30
|
-
__version__ = "0.16.0"
|
|
50
|
+
__version__ = "1.0.0"
|
|
@@ -47,8 +47,6 @@ class EmbeddingIndexRecipeConfig(BaseModel):
|
|
|
47
47
|
"""
|
|
48
48
|
Configuration for embedding-index retrieval backends.
|
|
49
49
|
|
|
50
|
-
:ivar snippet_characters: Maximum characters to include in evidence snippets.
|
|
51
|
-
:vartype snippet_characters: int
|
|
52
50
|
:ivar extraction_run: Optional extraction run reference in the form extractor_id:run_id.
|
|
53
51
|
:vartype extraction_run: str or None
|
|
54
52
|
:ivar chunker: Chunker configuration.
|
|
@@ -57,17 +55,52 @@ class EmbeddingIndexRecipeConfig(BaseModel):
|
|
|
57
55
|
:vartype tokenizer: biblicus.chunking.TokenizerConfig or None
|
|
58
56
|
:ivar embedding_provider: Embedding provider configuration.
|
|
59
57
|
:vartype embedding_provider: biblicus.embedding_providers.EmbeddingProviderConfig
|
|
58
|
+
:ivar snippet_characters: Optional maximum character count for returned evidence text.
|
|
59
|
+
:vartype snippet_characters: int or None
|
|
60
|
+
:ivar maximum_cache_total_items: Optional maximum number of vectors cached per scan batch.
|
|
61
|
+
:vartype maximum_cache_total_items: int or None
|
|
62
|
+
:ivar maximum_cache_total_characters: Optional maximum characters cached per scan batch.
|
|
63
|
+
:vartype maximum_cache_total_characters: int or None
|
|
60
64
|
"""
|
|
61
65
|
|
|
62
66
|
model_config = ConfigDict(extra="forbid")
|
|
63
67
|
|
|
64
|
-
snippet_characters: int = Field(default=
|
|
68
|
+
snippet_characters: Optional[int] = Field(default=None, ge=1)
|
|
69
|
+
maximum_cache_total_items: Optional[int] = Field(default=None, ge=1)
|
|
70
|
+
maximum_cache_total_characters: Optional[int] = Field(default=None, ge=1)
|
|
65
71
|
extraction_run: Optional[str] = None
|
|
66
72
|
chunker: ChunkerConfig = Field(default_factory=lambda: ChunkerConfig(chunker_id="paragraph"))
|
|
67
73
|
tokenizer: Optional[TokenizerConfig] = None
|
|
68
74
|
embedding_provider: EmbeddingProviderConfig
|
|
69
75
|
|
|
70
76
|
|
|
77
|
+
def _extract_span_text(text: Optional[str], span: Tuple[int, int]) -> Optional[str]:
|
|
78
|
+
if not isinstance(text, str):
|
|
79
|
+
return None
|
|
80
|
+
span_start, span_end = span
|
|
81
|
+
if span_start < 0 or span_end <= span_start:
|
|
82
|
+
return text
|
|
83
|
+
return text[span_start:span_end]
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _build_snippet(
|
|
87
|
+
text: Optional[str], span: Tuple[int, int], max_chars: Optional[int]
|
|
88
|
+
) -> Optional[str]:
|
|
89
|
+
if not isinstance(text, str):
|
|
90
|
+
return None
|
|
91
|
+
if max_chars is None:
|
|
92
|
+
return _extract_span_text(text, span)
|
|
93
|
+
if max_chars <= 0:
|
|
94
|
+
return ""
|
|
95
|
+
span_start, span_end = span
|
|
96
|
+
if span_start < 0 or span_end <= span_start:
|
|
97
|
+
return text[:max_chars]
|
|
98
|
+
half_window = max_chars // 2
|
|
99
|
+
snippet_start = max(span_start - half_window, 0)
|
|
100
|
+
snippet_end = min(span_end + half_window, len(text))
|
|
101
|
+
return text[snippet_start:snippet_end]
|
|
102
|
+
|
|
103
|
+
|
|
71
104
|
def resolve_extraction_reference(
|
|
72
105
|
corpus: Corpus, recipe_config: EmbeddingIndexRecipeConfig
|
|
73
106
|
) -> Optional[ExtractionRunReference]:
|
|
@@ -16,6 +16,8 @@ from ..time import utc_now_iso
|
|
|
16
16
|
from .embedding_index_common import (
|
|
17
17
|
ChunkRecord,
|
|
18
18
|
EmbeddingIndexRecipeConfig,
|
|
19
|
+
_build_snippet,
|
|
20
|
+
_extract_span_text,
|
|
19
21
|
artifact_paths_for_run,
|
|
20
22
|
chunks_to_records,
|
|
21
23
|
collect_chunks,
|
|
@@ -26,7 +28,6 @@ from .embedding_index_common import (
|
|
|
26
28
|
write_chunks_jsonl,
|
|
27
29
|
write_embeddings,
|
|
28
30
|
)
|
|
29
|
-
from .scan import _build_snippet
|
|
30
31
|
|
|
31
32
|
|
|
32
33
|
class EmbeddingIndexFileBackend:
|
|
@@ -132,10 +133,12 @@ class EmbeddingIndexFileBackend:
|
|
|
132
133
|
if query_embedding.shape[0] != 1:
|
|
133
134
|
raise ValueError("Embedding provider returned an invalid query embedding shape")
|
|
134
135
|
|
|
136
|
+
batch_rows = recipe_config.maximum_cache_total_items or 4096
|
|
135
137
|
candidates = _top_indices_batched(
|
|
136
138
|
embeddings=embeddings,
|
|
137
139
|
query_vector=query_embedding[0],
|
|
138
140
|
limit=_candidate_limit(budget.max_total_items + budget.offset),
|
|
141
|
+
batch_rows=batch_rows,
|
|
139
142
|
)
|
|
140
143
|
evidence_items = _build_evidence(
|
|
141
144
|
corpus,
|
|
@@ -222,9 +225,11 @@ def _build_evidence(
|
|
|
222
225
|
media_type=str(getattr(catalog_item, "media_type")),
|
|
223
226
|
extraction_reference=extraction_reference,
|
|
224
227
|
)
|
|
225
|
-
|
|
226
|
-
text, (record.span_start, record.span_end),
|
|
228
|
+
span_text = _build_snippet(
|
|
229
|
+
text, (record.span_start, record.span_end), recipe_config.snippet_characters
|
|
227
230
|
)
|
|
231
|
+
if span_text is None:
|
|
232
|
+
span_text = _extract_span_text(text, (record.span_start, record.span_end))
|
|
228
233
|
score = float(cosine_similarity_scores(embeddings[idx : idx + 1], query_vector)[0])
|
|
229
234
|
evidence_items.append(
|
|
230
235
|
Evidence(
|
|
@@ -233,7 +238,7 @@ def _build_evidence(
|
|
|
233
238
|
media_type=str(getattr(catalog_item, "media_type")),
|
|
234
239
|
score=score,
|
|
235
240
|
rank=1,
|
|
236
|
-
text=
|
|
241
|
+
text=span_text,
|
|
237
242
|
content_ref=None,
|
|
238
243
|
span_start=record.span_start,
|
|
239
244
|
span_end=record.span_end,
|
|
@@ -241,7 +246,8 @@ def _build_evidence(
|
|
|
241
246
|
stage_scores=None,
|
|
242
247
|
recipe_id=run.recipe.recipe_id,
|
|
243
248
|
run_id=run.run_id,
|
|
244
|
-
|
|
249
|
+
metadata=getattr(catalog_item, "metadata", {}) or {},
|
|
250
|
+
hash=hash_text(span_text or ""),
|
|
245
251
|
)
|
|
246
252
|
)
|
|
247
253
|
return evidence_items
|
|
@@ -16,6 +16,8 @@ from ..time import utc_now_iso
|
|
|
16
16
|
from .embedding_index_common import (
|
|
17
17
|
ChunkRecord,
|
|
18
18
|
EmbeddingIndexRecipeConfig,
|
|
19
|
+
_build_snippet,
|
|
20
|
+
_extract_span_text,
|
|
19
21
|
artifact_paths_for_run,
|
|
20
22
|
chunks_to_records,
|
|
21
23
|
collect_chunks,
|
|
@@ -26,20 +28,19 @@ from .embedding_index_common import (
|
|
|
26
28
|
write_chunks_jsonl,
|
|
27
29
|
write_embeddings,
|
|
28
30
|
)
|
|
29
|
-
from .scan import _build_snippet
|
|
30
31
|
|
|
31
32
|
|
|
32
33
|
class EmbeddingIndexInMemoryRecipeConfig(EmbeddingIndexRecipeConfig):
|
|
33
34
|
"""
|
|
34
35
|
Configuration for embedding-index-inmemory retrieval.
|
|
35
36
|
|
|
36
|
-
:ivar
|
|
37
|
-
:vartype
|
|
37
|
+
:ivar maximum_cache_total_items: Maximum chunks allowed for in-memory query loading.
|
|
38
|
+
:vartype maximum_cache_total_items: int
|
|
38
39
|
"""
|
|
39
40
|
|
|
40
41
|
model_config = ConfigDict(extra="forbid")
|
|
41
42
|
|
|
42
|
-
|
|
43
|
+
maximum_cache_total_items: int = Field(default=25000, ge=1)
|
|
43
44
|
|
|
44
45
|
|
|
45
46
|
class EmbeddingIndexInMemoryBackend:
|
|
@@ -66,10 +67,10 @@ class EmbeddingIndexInMemoryBackend:
|
|
|
66
67
|
"""
|
|
67
68
|
recipe_config = EmbeddingIndexInMemoryRecipeConfig.model_validate(config)
|
|
68
69
|
chunks, text_items = collect_chunks(corpus, recipe_config=recipe_config)
|
|
69
|
-
if len(chunks) > recipe_config.
|
|
70
|
+
if len(chunks) > recipe_config.maximum_cache_total_items:
|
|
70
71
|
raise ValueError(
|
|
71
|
-
"embedding-index-inmemory exceeded
|
|
72
|
-
"Use embedding-index-file or increase
|
|
72
|
+
"embedding-index-inmemory exceeded maximum_cache_total_items. "
|
|
73
|
+
"Use embedding-index-file or increase maximum_cache_total_items."
|
|
73
74
|
)
|
|
74
75
|
|
|
75
76
|
provider = recipe_config.embedding_provider.build_provider()
|
|
@@ -225,9 +226,9 @@ def _build_evidence(
|
|
|
225
226
|
media_type=media_type,
|
|
226
227
|
extraction_reference=extraction_reference,
|
|
227
228
|
)
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
229
|
+
span_text = _build_snippet(text, (span_start, span_end), recipe_config.snippet_characters)
|
|
230
|
+
if span_text is None:
|
|
231
|
+
span_text = _extract_span_text(text, (span_start, span_end))
|
|
231
232
|
evidence_items.append(
|
|
232
233
|
Evidence(
|
|
233
234
|
item_id=item_id,
|
|
@@ -235,7 +236,7 @@ def _build_evidence(
|
|
|
235
236
|
media_type=media_type,
|
|
236
237
|
score=float(scores[idx]),
|
|
237
238
|
rank=1,
|
|
238
|
-
text=
|
|
239
|
+
text=span_text,
|
|
239
240
|
content_ref=None,
|
|
240
241
|
span_start=span_start,
|
|
241
242
|
span_end=span_end,
|
|
@@ -243,7 +244,8 @@ def _build_evidence(
|
|
|
243
244
|
stage_scores=None,
|
|
244
245
|
recipe_id=run.recipe.recipe_id,
|
|
245
246
|
run_id=run.run_id,
|
|
246
|
-
|
|
247
|
+
metadata=getattr(catalog_item, "metadata", {}) or {},
|
|
248
|
+
hash=hash_text(span_text or ""),
|
|
247
249
|
)
|
|
248
250
|
)
|
|
249
251
|
return evidence_items
|
biblicus/backends/hybrid.py
CHANGED
|
@@ -217,9 +217,9 @@ def _expand_component_budget(budget: QueryBudget, *, multiplier: int = 5) -> Que
|
|
|
217
217
|
:return: Expanded budget for component backends.
|
|
218
218
|
:rtype: QueryBudget
|
|
219
219
|
"""
|
|
220
|
-
|
|
220
|
+
maximum_total_characters = budget.maximum_total_characters
|
|
221
221
|
expanded_characters = (
|
|
222
|
-
|
|
222
|
+
maximum_total_characters * multiplier if maximum_total_characters is not None else None
|
|
223
223
|
)
|
|
224
224
|
expanded_max_items_per_source = (
|
|
225
225
|
budget.max_items_per_source * multiplier
|
|
@@ -230,7 +230,7 @@ def _expand_component_budget(budget: QueryBudget, *, multiplier: int = 5) -> Que
|
|
|
230
230
|
return QueryBudget(
|
|
231
231
|
max_total_items=requested_items * multiplier,
|
|
232
232
|
offset=0,
|
|
233
|
-
|
|
233
|
+
maximum_total_characters=expanded_characters,
|
|
234
234
|
max_items_per_source=expanded_max_items_per_source,
|
|
235
235
|
)
|
|
236
236
|
|
|
@@ -285,6 +285,7 @@ def _fuse_evidence(
|
|
|
285
285
|
stage_scores={"lexical": lexical_score, "embedding": embedding_score},
|
|
286
286
|
recipe_id="",
|
|
287
287
|
run_id="",
|
|
288
|
+
metadata=base_evidence.metadata,
|
|
288
289
|
hash=base_evidence.hash,
|
|
289
290
|
)
|
|
290
291
|
)
|
biblicus/backends/scan.py
CHANGED
biblicus/backends/tf_vector.py
CHANGED
|
@@ -8,7 +8,7 @@ import math
|
|
|
8
8
|
import re
|
|
9
9
|
from typing import Dict, Iterable, List, Optional, Tuple
|
|
10
10
|
|
|
11
|
-
from pydantic import BaseModel, ConfigDict, Field
|
|
11
|
+
from pydantic import BaseModel, ConfigDict
|
|
12
12
|
|
|
13
13
|
from ..corpus import Corpus
|
|
14
14
|
from ..frontmatter import parse_front_matter
|
|
@@ -28,16 +28,16 @@ class TfVectorRecipeConfig(BaseModel):
|
|
|
28
28
|
"""
|
|
29
29
|
Configuration for the term-frequency vector retrieval backend.
|
|
30
30
|
|
|
31
|
-
:ivar snippet_characters: Maximum characters to include in evidence snippets.
|
|
32
|
-
:vartype snippet_characters: int
|
|
33
31
|
:ivar extraction_run: Optional extraction run reference in the form extractor_id:run_id.
|
|
34
32
|
:vartype extraction_run: str or None
|
|
33
|
+
:ivar snippet_characters: Optional maximum character count for returned evidence text.
|
|
34
|
+
:vartype snippet_characters: int or None
|
|
35
35
|
"""
|
|
36
36
|
|
|
37
37
|
model_config = ConfigDict(extra="forbid")
|
|
38
38
|
|
|
39
|
-
snippet_characters: int = Field(default=400, ge=1)
|
|
40
39
|
extraction_run: Optional[str] = None
|
|
40
|
+
snippet_characters: Optional[int] = None
|
|
41
41
|
|
|
42
42
|
|
|
43
43
|
class TfVectorBackend:
|
|
@@ -125,8 +125,8 @@ class TfVectorBackend:
|
|
|
125
125
|
query_tokens=query_tokens,
|
|
126
126
|
query_vector=query_vector,
|
|
127
127
|
query_norm=query_norm,
|
|
128
|
-
snippet_characters=recipe_config.snippet_characters,
|
|
129
128
|
extraction_reference=extraction_reference,
|
|
129
|
+
snippet_characters=recipe_config.snippet_characters,
|
|
130
130
|
)
|
|
131
131
|
sorted_candidates = sorted(
|
|
132
132
|
scored_candidates,
|
|
@@ -359,21 +359,13 @@ def _find_first_match(text: str, tokens: List[str]) -> Optional[Tuple[int, int]]
|
|
|
359
359
|
return best_start, best_end
|
|
360
360
|
|
|
361
361
|
|
|
362
|
-
def _build_snippet(text: str, span: Optional[Tuple[int, int]], *, max_chars: int) -> str:
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
:param text: Source text to slice.
|
|
367
|
-
:type text: str
|
|
368
|
-
:param span: Match span to center on.
|
|
369
|
-
:type span: tuple[int, int] or None
|
|
370
|
-
:param max_chars: Maximum snippet length.
|
|
371
|
-
:type max_chars: int
|
|
372
|
-
:return: Snippet text.
|
|
373
|
-
:rtype: str
|
|
374
|
-
"""
|
|
362
|
+
def _build_snippet(text: str, span: Optional[Tuple[int, int]], *, max_chars: Optional[int]) -> str:
|
|
363
|
+
if max_chars is None:
|
|
364
|
+
return text
|
|
375
365
|
if not text:
|
|
376
366
|
return ""
|
|
367
|
+
if max_chars <= 0:
|
|
368
|
+
return ""
|
|
377
369
|
if span is None:
|
|
378
370
|
return text[:max_chars]
|
|
379
371
|
span_start, span_end = span
|
|
@@ -390,8 +382,8 @@ def _score_items(
|
|
|
390
382
|
query_tokens: List[str],
|
|
391
383
|
query_vector: Dict[str, float],
|
|
392
384
|
query_norm: float,
|
|
393
|
-
snippet_characters: int,
|
|
394
385
|
extraction_reference: Optional[ExtractionRunReference],
|
|
386
|
+
snippet_characters: Optional[int],
|
|
395
387
|
) -> List[Evidence]:
|
|
396
388
|
"""
|
|
397
389
|
Score catalog items and return evidence candidates.
|
|
@@ -406,10 +398,10 @@ def _score_items(
|
|
|
406
398
|
:type query_vector: dict[str, float]
|
|
407
399
|
:param query_norm: Query vector norm.
|
|
408
400
|
:type query_norm: float
|
|
409
|
-
:param snippet_characters: Snippet length budget.
|
|
410
|
-
:type snippet_characters: int
|
|
411
401
|
:param extraction_reference: Optional extraction run reference.
|
|
412
402
|
:type extraction_reference: ExtractionRunReference or None
|
|
403
|
+
:param snippet_characters: Optional maximum character count for returned evidence text.
|
|
404
|
+
:type snippet_characters: int or None
|
|
413
405
|
:return: Evidence candidates with provisional ranks.
|
|
414
406
|
:rtype: list[Evidence]
|
|
415
407
|
"""
|
|
@@ -437,9 +429,9 @@ def _score_items(
|
|
|
437
429
|
if similarity <= 0:
|
|
438
430
|
continue
|
|
439
431
|
span = _find_first_match(item_text, query_tokens)
|
|
440
|
-
snippet = _build_snippet(item_text, span, max_chars=snippet_characters)
|
|
441
432
|
span_start = span[0] if span else None
|
|
442
433
|
span_end = span[1] if span else None
|
|
434
|
+
evidence_text = _build_snippet(item_text, span, max_chars=snippet_characters)
|
|
443
435
|
evidence_items.append(
|
|
444
436
|
Evidence(
|
|
445
437
|
item_id=str(getattr(catalog_item, "id")),
|
|
@@ -447,14 +439,15 @@ def _score_items(
|
|
|
447
439
|
media_type=str(media_type),
|
|
448
440
|
score=float(similarity),
|
|
449
441
|
rank=1,
|
|
450
|
-
text=
|
|
442
|
+
text=evidence_text,
|
|
451
443
|
content_ref=None,
|
|
452
444
|
span_start=span_start,
|
|
453
445
|
span_end=span_end,
|
|
454
446
|
stage="tf-vector",
|
|
455
447
|
recipe_id="",
|
|
456
448
|
run_id="",
|
|
457
|
-
|
|
449
|
+
metadata=getattr(catalog_item, "metadata", {}) or {},
|
|
450
|
+
hash=hash_text(evidence_text or ""),
|
|
458
451
|
)
|
|
459
452
|
)
|
|
460
453
|
return evidence_items
|
biblicus/cli.py
CHANGED
|
@@ -24,7 +24,7 @@ from .context import (
|
|
|
24
24
|
)
|
|
25
25
|
from .corpus import Corpus
|
|
26
26
|
from .crawl import CrawlRequest, crawl_into_corpus
|
|
27
|
-
from .errors import ExtractionRunFatalError
|
|
27
|
+
from .errors import ExtractionRunFatalError, IngestCollisionError
|
|
28
28
|
from .evaluation import evaluate_run, load_dataset
|
|
29
29
|
from .evidence_processing import apply_evidence_filter, apply_evidence_reranker
|
|
30
30
|
from .extraction import build_extraction_run
|
|
@@ -117,18 +117,28 @@ def cmd_ingest(arguments: argparse.Namespace) -> int:
|
|
|
117
117
|
|
|
118
118
|
results = []
|
|
119
119
|
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
120
|
+
try:
|
|
121
|
+
if arguments.note is not None or arguments.stdin:
|
|
122
|
+
text = arguments.note if arguments.note is not None else sys.stdin.read()
|
|
123
|
+
ingest_result = corpus.ingest_note(
|
|
124
|
+
text,
|
|
125
|
+
title=arguments.title,
|
|
126
|
+
tags=tags,
|
|
127
|
+
source_uri=None if arguments.stdin else None,
|
|
128
|
+
)
|
|
129
|
+
results.append(ingest_result)
|
|
130
|
+
|
|
131
|
+
for source_path in arguments.files or []:
|
|
132
|
+
results.append(corpus.ingest_source(source_path, tags=tags))
|
|
133
|
+
except IngestCollisionError as error:
|
|
134
|
+
print(
|
|
135
|
+
"Ingest failed: source already ingested\n"
|
|
136
|
+
f"source_uri: {error.source_uri}\n"
|
|
137
|
+
f"existing_item_id: {error.existing_item_id}\n"
|
|
138
|
+
f"existing_relpath: {error.existing_relpath}",
|
|
139
|
+
file=sys.stderr,
|
|
127
140
|
)
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
for source_path in arguments.files or []:
|
|
131
|
-
results.append(corpus.ingest_source(source_path, tags=tags))
|
|
141
|
+
return 3
|
|
132
142
|
|
|
133
143
|
if not results:
|
|
134
144
|
print("Nothing to ingest: provide file paths, --note, or --stdin", file=sys.stderr)
|
|
@@ -374,7 +384,7 @@ def _budget_from_args(arguments: argparse.Namespace) -> QueryBudget:
|
|
|
374
384
|
return QueryBudget(
|
|
375
385
|
max_total_items=arguments.max_total_items,
|
|
376
386
|
offset=getattr(arguments, "offset", 0),
|
|
377
|
-
|
|
387
|
+
maximum_total_characters=arguments.maximum_total_characters,
|
|
378
388
|
max_items_per_source=arguments.max_items_per_source,
|
|
379
389
|
)
|
|
380
390
|
|
|
@@ -1071,7 +1081,7 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
1071
1081
|
help="Skip this many ranked candidates before selecting evidence (pagination).",
|
|
1072
1082
|
)
|
|
1073
1083
|
p_query.add_argument("--max-total-items", type=int, default=5)
|
|
1074
|
-
p_query.add_argument("--
|
|
1084
|
+
p_query.add_argument("--maximum-total-characters", type=int, default=2000)
|
|
1075
1085
|
p_query.add_argument("--max-items-per-source", type=int, default=5)
|
|
1076
1086
|
p_query.add_argument(
|
|
1077
1087
|
"--reranker-id",
|
|
@@ -1131,7 +1141,7 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
1131
1141
|
help="Path to dataset JavaScript Object Notation file.",
|
|
1132
1142
|
)
|
|
1133
1143
|
p_eval.add_argument("--max-total-items", type=int, default=5)
|
|
1134
|
-
p_eval.add_argument("--
|
|
1144
|
+
p_eval.add_argument("--maximum-total-characters", type=int, default=2000)
|
|
1135
1145
|
p_eval.add_argument("--max-items-per-source", type=int, default=5)
|
|
1136
1146
|
p_eval.set_defaults(func=cmd_eval)
|
|
1137
1147
|
|
biblicus/context.py
CHANGED
|
@@ -25,6 +25,8 @@ class ContextPackPolicy(BaseModel):
|
|
|
25
25
|
:vartype ordering: str
|
|
26
26
|
:ivar include_metadata: Whether to include evidence metadata lines in each block.
|
|
27
27
|
:vartype include_metadata: bool
|
|
28
|
+
:ivar metadata_fields: Optional evidence metadata fields to include.
|
|
29
|
+
:vartype metadata_fields: list[str] or None
|
|
28
30
|
"""
|
|
29
31
|
|
|
30
32
|
model_config = ConfigDict(extra="forbid")
|
|
@@ -32,6 +34,7 @@ class ContextPackPolicy(BaseModel):
|
|
|
32
34
|
join_with: str = Field(default="\n\n")
|
|
33
35
|
ordering: str = Field(default="rank", min_length=1)
|
|
34
36
|
include_metadata: bool = Field(default=False)
|
|
37
|
+
metadata_fields: Optional[List[str]] = None
|
|
35
38
|
|
|
36
39
|
|
|
37
40
|
class ContextPack(BaseModel):
|
|
@@ -132,7 +135,9 @@ def build_context_pack(result: RetrievalResult, *, policy: ContextPackPolicy) ->
|
|
|
132
135
|
trimmed_text = evidence.text.strip()
|
|
133
136
|
if not trimmed_text:
|
|
134
137
|
continue
|
|
135
|
-
metadata =
|
|
138
|
+
metadata = (
|
|
139
|
+
_metadata_for_evidence(evidence, policy=policy) if policy.include_metadata else None
|
|
140
|
+
)
|
|
136
141
|
block_text = _format_block_text(trimmed_text, metadata=metadata)
|
|
137
142
|
selected_blocks.append(
|
|
138
143
|
ContextPackBlock(
|
|
@@ -276,7 +281,11 @@ def _order_evidence(
|
|
|
276
281
|
raise ValueError(f"Unknown context pack ordering: {policy.ordering}")
|
|
277
282
|
|
|
278
283
|
|
|
279
|
-
def _metadata_for_evidence(evidence: Evidence) -> Dict[str, object]:
|
|
284
|
+
def _metadata_for_evidence(
|
|
285
|
+
evidence: Evidence,
|
|
286
|
+
*,
|
|
287
|
+
policy: ContextPackPolicy,
|
|
288
|
+
) -> Dict[str, object]:
|
|
280
289
|
"""
|
|
281
290
|
Build metadata for a context pack block.
|
|
282
291
|
|
|
@@ -285,12 +294,19 @@ def _metadata_for_evidence(evidence: Evidence) -> Dict[str, object]:
|
|
|
285
294
|
:return: Metadata mapping.
|
|
286
295
|
:rtype: dict[str, object]
|
|
287
296
|
"""
|
|
288
|
-
|
|
297
|
+
metadata = {
|
|
289
298
|
"item_id": evidence.item_id,
|
|
290
299
|
"source_uri": evidence.source_uri or "none",
|
|
291
300
|
"score": evidence.score,
|
|
292
301
|
"stage": evidence.stage,
|
|
293
302
|
}
|
|
303
|
+
extra = evidence.metadata or {}
|
|
304
|
+
if policy.metadata_fields is not None:
|
|
305
|
+
extra = {key: extra.get(key) for key in policy.metadata_fields if key in extra}
|
|
306
|
+
for key, value in extra.items():
|
|
307
|
+
if key not in metadata:
|
|
308
|
+
metadata[key] = value
|
|
309
|
+
return metadata
|
|
294
310
|
|
|
295
311
|
|
|
296
312
|
def _format_block_text(text: str, *, metadata: Optional[Dict[str, object]]) -> str:
|
|
@@ -306,12 +322,11 @@ def _format_block_text(text: str, *, metadata: Optional[Dict[str, object]]) -> s
|
|
|
306
322
|
"""
|
|
307
323
|
if not metadata:
|
|
308
324
|
return text
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
return f"{metadata_lines}\n{text}"
|
|
325
|
+
ordered_keys = ["item_id", "source_uri", "score", "stage"]
|
|
326
|
+
metadata_lines = [f"{key}: {metadata[key]}" for key in ordered_keys if key in metadata]
|
|
327
|
+
for key in sorted(metadata.keys()):
|
|
328
|
+
if key in ordered_keys:
|
|
329
|
+
continue
|
|
330
|
+
metadata_lines.append(f"{key}: {metadata[key]}")
|
|
331
|
+
metadata_text = "\n".join(metadata_lines)
|
|
332
|
+
return f"{metadata_text}\n{text}"
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Public interface for the Biblicus Context Engine.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from .assembler import ContextAssembler, ContextAssemblyResult
|
|
6
|
+
from .compaction import BaseCompactor, CompactionRequest, SummaryCompactor, TruncateCompactor
|
|
7
|
+
from .models import (
|
|
8
|
+
AssistantMessageSpec,
|
|
9
|
+
CompactorDeclaration,
|
|
10
|
+
ContextBudgetSpec,
|
|
11
|
+
ContextDeclaration,
|
|
12
|
+
ContextExpansionSpec,
|
|
13
|
+
ContextInsertSpec,
|
|
14
|
+
ContextMessageSpec,
|
|
15
|
+
ContextPackBudgetSpec,
|
|
16
|
+
ContextPackSpec,
|
|
17
|
+
ContextPolicySpec,
|
|
18
|
+
ContextRetrieverRequest,
|
|
19
|
+
ContextTemplateSpec,
|
|
20
|
+
CorpusDeclaration,
|
|
21
|
+
HistoryInsertSpec,
|
|
22
|
+
RetrieverDeclaration,
|
|
23
|
+
SystemMessageSpec,
|
|
24
|
+
UserMessageSpec,
|
|
25
|
+
)
|
|
26
|
+
from .retrieval import retrieve_context_pack
|
|
27
|
+
|
|
28
|
+
__all__ = [
|
|
29
|
+
"ContextAssembler",
|
|
30
|
+
"ContextAssemblyResult",
|
|
31
|
+
"BaseCompactor",
|
|
32
|
+
"CompactionRequest",
|
|
33
|
+
"SummaryCompactor",
|
|
34
|
+
"TruncateCompactor",
|
|
35
|
+
"ContextBudgetSpec",
|
|
36
|
+
"ContextDeclaration",
|
|
37
|
+
"ContextExpansionSpec",
|
|
38
|
+
"ContextInsertSpec",
|
|
39
|
+
"ContextMessageSpec",
|
|
40
|
+
"ContextPackBudgetSpec",
|
|
41
|
+
"ContextPackSpec",
|
|
42
|
+
"ContextPolicySpec",
|
|
43
|
+
"ContextRetrieverRequest",
|
|
44
|
+
"ContextTemplateSpec",
|
|
45
|
+
"CorpusDeclaration",
|
|
46
|
+
"RetrieverDeclaration",
|
|
47
|
+
"CompactorDeclaration",
|
|
48
|
+
"HistoryInsertSpec",
|
|
49
|
+
"SystemMessageSpec",
|
|
50
|
+
"UserMessageSpec",
|
|
51
|
+
"AssistantMessageSpec",
|
|
52
|
+
"retrieve_context_pack",
|
|
53
|
+
]
|