biblicus 0.15.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. biblicus/__init__.py +21 -1
  2. biblicus/analysis/markov.py +35 -3
  3. biblicus/backends/__init__.py +6 -2
  4. biblicus/backends/embedding_index_common.py +334 -0
  5. biblicus/backends/embedding_index_file.py +272 -0
  6. biblicus/backends/embedding_index_inmemory.py +270 -0
  7. biblicus/backends/hybrid.py +8 -5
  8. biblicus/backends/scan.py +1 -0
  9. biblicus/backends/sqlite_full_text_search.py +1 -1
  10. biblicus/backends/{vector.py → tf_vector.py} +28 -35
  11. biblicus/chunking.py +396 -0
  12. biblicus/cli.py +75 -25
  13. biblicus/context.py +27 -12
  14. biblicus/context_engine/__init__.py +53 -0
  15. biblicus/context_engine/assembler.py +1060 -0
  16. biblicus/context_engine/compaction.py +110 -0
  17. biblicus/context_engine/models.py +423 -0
  18. biblicus/context_engine/retrieval.py +129 -0
  19. biblicus/corpus.py +117 -16
  20. biblicus/embedding_providers.py +122 -0
  21. biblicus/errors.py +24 -0
  22. biblicus/frontmatter.py +2 -0
  23. biblicus/knowledge_base.py +1 -1
  24. biblicus/models.py +15 -3
  25. biblicus/retrieval.py +7 -2
  26. biblicus/sources.py +46 -11
  27. biblicus/text/link.py +6 -0
  28. biblicus/text/prompts.py +2 -0
  29. {biblicus-0.15.1.dist-info → biblicus-1.0.0.dist-info}/METADATA +4 -3
  30. {biblicus-0.15.1.dist-info → biblicus-1.0.0.dist-info}/RECORD +34 -24
  31. {biblicus-0.15.1.dist-info → biblicus-1.0.0.dist-info}/WHEEL +0 -0
  32. {biblicus-0.15.1.dist-info → biblicus-1.0.0.dist-info}/entry_points.txt +0 -0
  33. {biblicus-0.15.1.dist-info → biblicus-1.0.0.dist-info}/licenses/LICENSE +0 -0
  34. {biblicus-0.15.1.dist-info → biblicus-1.0.0.dist-info}/top_level.txt +0 -0
biblicus/cli.py CHANGED
@@ -8,7 +8,7 @@ import argparse
8
8
  import json
9
9
  import sys
10
10
  from pathlib import Path
11
- from typing import Dict, List, Optional
11
+ from typing import Dict, Iterable, List, Optional
12
12
 
13
13
  from pydantic import ValidationError
14
14
 
@@ -24,7 +24,7 @@ from .context import (
24
24
  )
25
25
  from .corpus import Corpus
26
26
  from .crawl import CrawlRequest, crawl_into_corpus
27
- from .errors import ExtractionRunFatalError
27
+ from .errors import ExtractionRunFatalError, IngestCollisionError
28
28
  from .evaluation import evaluate_run, load_dataset
29
29
  from .evidence_processing import apply_evidence_filter, apply_evidence_reranker
30
30
  from .extraction import build_extraction_run
@@ -117,18 +117,28 @@ def cmd_ingest(arguments: argparse.Namespace) -> int:
117
117
 
118
118
  results = []
119
119
 
120
- if arguments.note is not None or arguments.stdin:
121
- text = arguments.note if arguments.note is not None else sys.stdin.read()
122
- ingest_result = corpus.ingest_note(
123
- text,
124
- title=arguments.title,
125
- tags=tags,
126
- source_uri="stdin" if arguments.stdin else "text",
120
+ try:
121
+ if arguments.note is not None or arguments.stdin:
122
+ text = arguments.note if arguments.note is not None else sys.stdin.read()
123
+ ingest_result = corpus.ingest_note(
124
+ text,
125
+ title=arguments.title,
126
+ tags=tags,
127
+ source_uri=None if arguments.stdin else None,
128
+ )
129
+ results.append(ingest_result)
130
+
131
+ for source_path in arguments.files or []:
132
+ results.append(corpus.ingest_source(source_path, tags=tags))
133
+ except IngestCollisionError as error:
134
+ print(
135
+ "Ingest failed: source already ingested\n"
136
+ f"source_uri: {error.source_uri}\n"
137
+ f"existing_item_id: {error.existing_item_id}\n"
138
+ f"existing_relpath: {error.existing_relpath}",
139
+ file=sys.stderr,
127
140
  )
128
- results.append(ingest_result)
129
-
130
- for source_path in arguments.files or []:
131
- results.append(corpus.ingest_source(source_path, tags=tags))
141
+ return 3
132
142
 
133
143
  if not results:
134
144
  print("Nothing to ingest: provide file paths, --note, or --stdin", file=sys.stderr)
@@ -239,15 +249,23 @@ def cmd_purge(arguments: argparse.Namespace) -> int:
239
249
  return 0
240
250
 
241
251
 
242
- def _parse_config_pairs(pairs: Optional[List[str]]) -> Dict[str, object]:
252
+ def _parse_config_pairs(pairs: Optional[Iterable[str]]) -> Dict[str, object]:
243
253
  """
244
- Parse repeated key=value config pairs.
254
+ Parse key=value pairs into a configuration mapping.
245
255
 
246
- :param pairs: Config pairs supplied via the command-line interface.
247
- :type pairs: list[str] or None
248
- :return: Parsed config mapping.
256
+ This is used by a few command-line options that accept repeated key=value items.
257
+ Values are coerced to useful types in a predictable way:
258
+
259
+ - JSON objects/arrays (leading ``{`` or ``[``) are parsed as JSON.
260
+ - Whole numbers are parsed as integers.
261
+ - Other numeric forms are parsed as floats.
262
+ - Everything else remains a string.
263
+
264
+ :param pairs: Iterable of key=value strings.
265
+ :type pairs: Iterable[str] or None
266
+ :return: Parsed configuration mapping.
249
267
  :rtype: dict[str, object]
250
- :raises ValueError: If any entry is not key=value.
268
+ :raises ValueError: If any entry is not a key=value pair or values are invalid.
251
269
  """
252
270
  config: Dict[str, object] = {}
253
271
  for item in pairs or []:
@@ -257,8 +275,14 @@ def _parse_config_pairs(pairs: Optional[List[str]]) -> Dict[str, object]:
257
275
  key = key.strip()
258
276
  if not key:
259
277
  raise ValueError("Config keys must be non-empty")
278
+ raw = raw.strip()
260
279
  value: object = raw
261
- if raw.isdigit():
280
+ if raw.startswith("{") or raw.startswith("["):
281
+ try:
282
+ value = json.loads(raw)
283
+ except json.JSONDecodeError as exc:
284
+ raise ValueError(f"Config value must be valid JSON for key {key!r}") from exc
285
+ elif raw.isdigit():
262
286
  value = int(raw)
263
287
  else:
264
288
  try:
@@ -359,7 +383,8 @@ def _budget_from_args(arguments: argparse.Namespace) -> QueryBudget:
359
383
  """
360
384
  return QueryBudget(
361
385
  max_total_items=arguments.max_total_items,
362
- max_total_characters=arguments.max_total_characters,
386
+ offset=getattr(arguments, "offset", 0),
387
+ maximum_total_characters=arguments.maximum_total_characters,
363
388
  max_items_per_source=arguments.max_items_per_source,
364
389
  )
365
390
 
@@ -373,13 +398,26 @@ def cmd_build(arguments: argparse.Namespace) -> int:
373
398
  :return: Exit code.
374
399
  :rtype: int
375
400
  """
401
+ from .recipes import apply_dotted_overrides, load_recipe_view, parse_dotted_overrides
402
+
376
403
  corpus = (
377
404
  Corpus.open(arguments.corpus)
378
405
  if getattr(arguments, "corpus", None)
379
406
  else Corpus.find(Path.cwd())
380
407
  )
381
408
  backend = get_backend(arguments.backend)
382
- config = _parse_config_pairs(arguments.config)
409
+
410
+ base_config: Dict[str, object] = {}
411
+ if getattr(arguments, "recipe", None):
412
+ base_config = load_recipe_view(
413
+ arguments.recipe,
414
+ recipe_label="Recipe file",
415
+ mapping_error_message="Retrieval build recipe must be a mapping/object",
416
+ )
417
+
418
+ overrides = parse_dotted_overrides(arguments.config)
419
+ config = apply_dotted_overrides(base_config, overrides)
420
+
383
421
  run = backend.build_run(corpus, recipe_name=arguments.recipe_name, config=config)
384
422
  print(run.model_dump_json(indent=2))
385
423
  return 0
@@ -947,11 +985,17 @@ def build_parser() -> argparse.ArgumentParser:
947
985
  help="Backend identifier (for example, scan, sqlite-full-text-search).",
948
986
  )
949
987
  p_build.add_argument("--recipe-name", default="default", help="Human-readable recipe name.")
988
+ p_build.add_argument(
989
+ "--recipe",
990
+ default=None,
991
+ action="append",
992
+ help="Path to YAML recipe file (repeatable). If provided, recipes are composed in precedence order.",
993
+ )
950
994
  p_build.add_argument(
951
995
  "--config",
952
996
  action="append",
953
997
  default=None,
954
- help="Backend config as key=value (repeatable).",
998
+ help="Backend config override as key=value (repeatable). Dotted keys create nested config mappings.",
955
999
  )
956
1000
  p_build.set_defaults(func=cmd_build)
957
1001
 
@@ -1030,8 +1074,14 @@ def build_parser() -> argparse.ArgumentParser:
1030
1074
  p_query.add_argument("--run", default=None, help="Run identifier (defaults to latest run).")
1031
1075
  p_query.add_argument("--backend", default=None, help="Validate backend identifier.")
1032
1076
  p_query.add_argument("--query", default=None, help="Query text (defaults to standard input).")
1077
+ p_query.add_argument(
1078
+ "--offset",
1079
+ type=int,
1080
+ default=0,
1081
+ help="Skip this many ranked candidates before selecting evidence (pagination).",
1082
+ )
1033
1083
  p_query.add_argument("--max-total-items", type=int, default=5)
1034
- p_query.add_argument("--max-total-characters", type=int, default=2000)
1084
+ p_query.add_argument("--maximum-total-characters", type=int, default=2000)
1035
1085
  p_query.add_argument("--max-items-per-source", type=int, default=5)
1036
1086
  p_query.add_argument(
1037
1087
  "--reranker-id",
@@ -1091,7 +1141,7 @@ def build_parser() -> argparse.ArgumentParser:
1091
1141
  help="Path to dataset JavaScript Object Notation file.",
1092
1142
  )
1093
1143
  p_eval.add_argument("--max-total-items", type=int, default=5)
1094
- p_eval.add_argument("--max-total-characters", type=int, default=2000)
1144
+ p_eval.add_argument("--maximum-total-characters", type=int, default=2000)
1095
1145
  p_eval.add_argument("--max-items-per-source", type=int, default=5)
1096
1146
  p_eval.set_defaults(func=cmd_eval)
1097
1147
 
biblicus/context.py CHANGED
@@ -25,6 +25,8 @@ class ContextPackPolicy(BaseModel):
25
25
  :vartype ordering: str
26
26
  :ivar include_metadata: Whether to include evidence metadata lines in each block.
27
27
  :vartype include_metadata: bool
28
+ :ivar metadata_fields: Optional evidence metadata fields to include.
29
+ :vartype metadata_fields: list[str] or None
28
30
  """
29
31
 
30
32
  model_config = ConfigDict(extra="forbid")
@@ -32,6 +34,7 @@ class ContextPackPolicy(BaseModel):
32
34
  join_with: str = Field(default="\n\n")
33
35
  ordering: str = Field(default="rank", min_length=1)
34
36
  include_metadata: bool = Field(default=False)
37
+ metadata_fields: Optional[List[str]] = None
35
38
 
36
39
 
37
40
  class ContextPack(BaseModel):
@@ -132,7 +135,9 @@ def build_context_pack(result: RetrievalResult, *, policy: ContextPackPolicy) ->
132
135
  trimmed_text = evidence.text.strip()
133
136
  if not trimmed_text:
134
137
  continue
135
- metadata = _metadata_for_evidence(evidence) if policy.include_metadata else None
138
+ metadata = (
139
+ _metadata_for_evidence(evidence, policy=policy) if policy.include_metadata else None
140
+ )
136
141
  block_text = _format_block_text(trimmed_text, metadata=metadata)
137
142
  selected_blocks.append(
138
143
  ContextPackBlock(
@@ -276,7 +281,11 @@ def _order_evidence(
276
281
  raise ValueError(f"Unknown context pack ordering: {policy.ordering}")
277
282
 
278
283
 
279
- def _metadata_for_evidence(evidence: Evidence) -> Dict[str, object]:
284
+ def _metadata_for_evidence(
285
+ evidence: Evidence,
286
+ *,
287
+ policy: ContextPackPolicy,
288
+ ) -> Dict[str, object]:
280
289
  """
281
290
  Build metadata for a context pack block.
282
291
 
@@ -285,12 +294,19 @@ def _metadata_for_evidence(evidence: Evidence) -> Dict[str, object]:
285
294
  :return: Metadata mapping.
286
295
  :rtype: dict[str, object]
287
296
  """
288
- return {
297
+ metadata = {
289
298
  "item_id": evidence.item_id,
290
299
  "source_uri": evidence.source_uri or "none",
291
300
  "score": evidence.score,
292
301
  "stage": evidence.stage,
293
302
  }
303
+ extra = evidence.metadata or {}
304
+ if policy.metadata_fields is not None:
305
+ extra = {key: extra.get(key) for key in policy.metadata_fields if key in extra}
306
+ for key, value in extra.items():
307
+ if key not in metadata:
308
+ metadata[key] = value
309
+ return metadata
294
310
 
295
311
 
296
312
  def _format_block_text(text: str, *, metadata: Optional[Dict[str, object]]) -> str:
@@ -306,12 +322,11 @@ def _format_block_text(text: str, *, metadata: Optional[Dict[str, object]]) -> s
306
322
  """
307
323
  if not metadata:
308
324
  return text
309
- metadata_lines = "\n".join(
310
- [
311
- f"item_id: {metadata['item_id']}",
312
- f"source_uri: {metadata['source_uri']}",
313
- f"score: {metadata['score']}",
314
- f"stage: {metadata['stage']}",
315
- ]
316
- )
317
- return f"{metadata_lines}\n{text}"
325
+ ordered_keys = ["item_id", "source_uri", "score", "stage"]
326
+ metadata_lines = [f"{key}: {metadata[key]}" for key in ordered_keys if key in metadata]
327
+ for key in sorted(metadata.keys()):
328
+ if key in ordered_keys:
329
+ continue
330
+ metadata_lines.append(f"{key}: {metadata[key]}")
331
+ metadata_text = "\n".join(metadata_lines)
332
+ return f"{metadata_text}\n{text}"
@@ -0,0 +1,53 @@
1
+ """
2
+ Public interface for the Biblicus Context Engine.
3
+ """
4
+
5
+ from .assembler import ContextAssembler, ContextAssemblyResult
6
+ from .compaction import BaseCompactor, CompactionRequest, SummaryCompactor, TruncateCompactor
7
+ from .models import (
8
+ AssistantMessageSpec,
9
+ CompactorDeclaration,
10
+ ContextBudgetSpec,
11
+ ContextDeclaration,
12
+ ContextExpansionSpec,
13
+ ContextInsertSpec,
14
+ ContextMessageSpec,
15
+ ContextPackBudgetSpec,
16
+ ContextPackSpec,
17
+ ContextPolicySpec,
18
+ ContextRetrieverRequest,
19
+ ContextTemplateSpec,
20
+ CorpusDeclaration,
21
+ HistoryInsertSpec,
22
+ RetrieverDeclaration,
23
+ SystemMessageSpec,
24
+ UserMessageSpec,
25
+ )
26
+ from .retrieval import retrieve_context_pack
27
+
28
+ __all__ = [
29
+ "ContextAssembler",
30
+ "ContextAssemblyResult",
31
+ "BaseCompactor",
32
+ "CompactionRequest",
33
+ "SummaryCompactor",
34
+ "TruncateCompactor",
35
+ "ContextBudgetSpec",
36
+ "ContextDeclaration",
37
+ "ContextExpansionSpec",
38
+ "ContextInsertSpec",
39
+ "ContextMessageSpec",
40
+ "ContextPackBudgetSpec",
41
+ "ContextPackSpec",
42
+ "ContextPolicySpec",
43
+ "ContextRetrieverRequest",
44
+ "ContextTemplateSpec",
45
+ "CorpusDeclaration",
46
+ "RetrieverDeclaration",
47
+ "CompactorDeclaration",
48
+ "HistoryInsertSpec",
49
+ "SystemMessageSpec",
50
+ "UserMessageSpec",
51
+ "AssistantMessageSpec",
52
+ "retrieve_context_pack",
53
+ ]