biblicus 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. biblicus/__init__.py +5 -5
  2. biblicus/analysis/__init__.py +1 -1
  3. biblicus/analysis/base.py +10 -10
  4. biblicus/analysis/markov.py +78 -68
  5. biblicus/analysis/models.py +47 -47
  6. biblicus/analysis/profiling.py +58 -48
  7. biblicus/analysis/topic_modeling.py +56 -51
  8. biblicus/cli.py +224 -177
  9. biblicus/{recipes.py → configuration.py} +14 -14
  10. biblicus/constants.py +2 -2
  11. biblicus/context_engine/assembler.py +49 -19
  12. biblicus/context_engine/retrieval.py +46 -42
  13. biblicus/corpus.py +116 -108
  14. biblicus/errors.py +3 -3
  15. biblicus/evaluation.py +27 -25
  16. biblicus/extraction.py +103 -98
  17. biblicus/extraction_evaluation.py +26 -26
  18. biblicus/extractors/deepgram_stt.py +7 -7
  19. biblicus/extractors/docling_granite_text.py +11 -11
  20. biblicus/extractors/docling_smol_text.py +11 -11
  21. biblicus/extractors/markitdown_text.py +4 -4
  22. biblicus/extractors/openai_stt.py +7 -7
  23. biblicus/extractors/paddleocr_vl_text.py +20 -18
  24. biblicus/extractors/pipeline.py +8 -8
  25. biblicus/extractors/rapidocr_text.py +3 -3
  26. biblicus/extractors/unstructured_text.py +3 -3
  27. biblicus/hooks.py +4 -4
  28. biblicus/knowledge_base.py +33 -31
  29. biblicus/models.py +78 -78
  30. biblicus/retrieval.py +47 -40
  31. biblicus/retrievers/__init__.py +50 -0
  32. biblicus/retrievers/base.py +65 -0
  33. biblicus/{backends → retrievers}/embedding_index_common.py +44 -41
  34. biblicus/{backends → retrievers}/embedding_index_file.py +87 -58
  35. biblicus/{backends → retrievers}/embedding_index_inmemory.py +88 -59
  36. biblicus/retrievers/hybrid.py +301 -0
  37. biblicus/{backends → retrievers}/scan.py +83 -73
  38. biblicus/{backends → retrievers}/sqlite_full_text_search.py +115 -101
  39. biblicus/{backends → retrievers}/tf_vector.py +87 -77
  40. biblicus/text/prompts.py +16 -8
  41. biblicus/text/tool_loop.py +63 -5
  42. {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/METADATA +30 -21
  43. biblicus-1.1.0.dist-info/RECORD +91 -0
  44. biblicus/backends/__init__.py +0 -50
  45. biblicus/backends/base.py +0 -65
  46. biblicus/backends/hybrid.py +0 -292
  47. biblicus-1.0.0.dist-info/RECORD +0 -91
  48. {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/WHEEL +0 -0
  49. {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/entry_points.txt +0 -0
  50. {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/licenses/LICENSE +0 -0
  51. {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/top_level.txt +0 -0
@@ -1,5 +1,5 @@
1
1
  """
2
- Recipe loading utilities for Biblicus.
2
+ Configuration loading utilities for Biblicus.
3
3
  """
4
4
 
5
5
  from __future__ import annotations
@@ -103,34 +103,34 @@ def apply_dotted_overrides(
103
103
  return updated
104
104
 
105
105
 
106
- def load_recipe_view(
107
- recipe_paths: Iterable[str],
106
+ def load_configuration_view(
107
+ configuration_paths: Iterable[str],
108
108
  *,
109
- recipe_label: str = "Recipe",
109
+ configuration_label: str = "Configuration",
110
110
  mapping_error_message: Optional[str] = None,
111
111
  ) -> Dict[str, object]:
112
112
  """
113
- Load a composed recipe view from one or more YAML files.
113
+ Load a composed configuration view from one or more YAML files.
114
114
 
115
- :param recipe_paths: Iterable of recipe file paths in precedence order.
116
- :type recipe_paths: Iterable[str]
117
- :param recipe_label: Label used in error messages (for example: "Recipe file").
118
- :type recipe_label: str
115
+ :param configuration_paths: Iterable of configuration file paths in precedence order.
116
+ :type configuration_paths: Iterable[str]
117
+ :param configuration_label: Label used in error messages (for example: "Configuration file").
118
+ :type configuration_label: str
119
119
  :return: Composed configuration view.
120
120
  :rtype: dict[str, object]
121
- :raises FileNotFoundError: If any recipe file is missing.
122
- :raises ValueError: If any recipe file is not a mapping/object.
121
+ :raises FileNotFoundError: If any configuration file is missing.
122
+ :raises ValueError: If any configuration file is not a mapping/object.
123
123
  """
124
124
  from biblicus._vendor.dotyaml import load_yaml_view
125
125
 
126
- paths: List[str] = [str(path) for path in recipe_paths]
126
+ paths: List[str] = [str(path) for path in configuration_paths]
127
127
  for raw in paths:
128
128
  candidate = Path(raw)
129
129
  if not candidate.is_file():
130
- raise FileNotFoundError(f"{recipe_label} not found: {candidate}")
130
+ raise FileNotFoundError(f"{configuration_label} not found: {candidate}")
131
131
  try:
132
132
  view = load_yaml_view(paths)
133
133
  except ValueError as exc:
134
- message = mapping_error_message or f"{recipe_label} must be a mapping/object"
134
+ message = mapping_error_message or f"{configuration_label} must be a mapping/object"
135
135
  raise ValueError(message) from exc
136
136
  return view
biblicus/constants.py CHANGED
@@ -9,7 +9,7 @@ ANALYSIS_SCHEMA_VERSION = 1
9
9
  CORPUS_DIR_NAME = ".biblicus"
10
10
  DEFAULT_RAW_DIR = "raw"
11
11
  SIDECAR_SUFFIX = ".biblicus.yml"
12
- RUNS_DIR_NAME = "runs"
13
- EXTRACTION_RUNS_DIR_NAME = "extraction"
12
+ SNAPSHOTS_DIR_NAME = "snapshots"
13
+ EXTRACTION_SNAPSHOTS_DIR_NAME = "extraction"
14
14
  ANALYSIS_RUNS_DIR_NAME = "analysis"
15
15
  HOOK_LOGS_DIR_NAME = "hook_logs"
@@ -403,13 +403,15 @@ class ContextAssembler:
403
403
  maximum_items_per_source = None
404
404
  include_metadata = False
405
405
  metadata_fields = None
406
- backend_id = None
406
+ retriever_id = None
407
407
  corpus_root = None
408
- run_id = None
409
- recipe_name = None
410
- recipe_config = None
408
+ snapshot_id = None
409
+ configuration_name = None
410
+ configuration = None
411
411
  corpus_name = getattr(retriever_spec, "corpus", None)
412
412
  join_with = "\n\n"
413
+ pipeline_config = None
414
+ query_config = None
413
415
 
414
416
  if isinstance(config, dict):
415
417
  split = config.get("split", split)
@@ -424,13 +426,26 @@ class ContextAssembler:
424
426
  )
425
427
  include_metadata = config.get("include_metadata", include_metadata)
426
428
  metadata_fields = config.get("metadata_fields", metadata_fields)
427
- backend_id = config.get("backend_id", backend_id)
428
- run_id = config.get("run_id", run_id)
429
- recipe_name = config.get("recipe_name", recipe_name)
430
- recipe_config = config.get("recipe_config", config.get("recipe", recipe_config))
429
+ retriever_id = config.get("retriever_id", retriever_id)
430
+ snapshot_id = config.get("snapshot_id", snapshot_id)
431
+ configuration_name = config.get("configuration_name", configuration_name)
432
+ if isinstance(config.get("configuration"), dict):
433
+ configuration = config.get("configuration")
431
434
  corpus_name = config.get("corpus", corpus_name)
432
435
  join_with = config.get("join_with", join_with)
433
-
436
+ if isinstance(configuration, dict):
437
+ pipeline_config = configuration.get("pipeline")
438
+ if not isinstance(pipeline_config, dict) and isinstance(config.get("pipeline"), dict):
439
+ pipeline_config = config.get("pipeline")
440
+ if isinstance(pipeline_config, dict):
441
+ if isinstance(pipeline_config.get("query"), dict):
442
+ query_config = pipeline_config.get("query") or {}
443
+ if configuration is None and isinstance(pipeline_config.get("index"), dict):
444
+ configuration = pipeline_config.get("index") or {}
445
+ if configuration is None and isinstance(config.get("index"), dict):
446
+ configuration = config.get("index") or {}
447
+ if configuration is None and isinstance(pipeline_config, dict):
448
+ configuration = pipeline_config.get("index") or {}
434
449
  if corpus_name and corpus_name in self._corpus_registry:
435
450
  corpus_spec = self._corpus_registry[corpus_name]
436
451
  corpus_config = corpus_spec.config if hasattr(corpus_spec, "config") else {}
@@ -442,17 +457,32 @@ class ContextAssembler:
442
457
  maximum_cache_total_characters = corpus_config.get(
443
458
  "maximum_cache_total_characters", maximum_cache_total_characters
444
459
  )
445
- backend_id = corpus_config.get("backend_id", backend_id)
446
460
  corpus_root = corpus_config.get(
447
461
  "corpus_root",
448
462
  corpus_config.get("root", corpus_root),
449
463
  )
450
- run_id = corpus_config.get("run_id", run_id)
451
- recipe_name = corpus_config.get("recipe_name", recipe_name)
452
- recipe_config = corpus_config.get(
453
- "recipe_config",
454
- corpus_config.get("recipe", recipe_config),
464
+ if query_config:
465
+ if "limit" in query_config:
466
+ limit = query_config.get("limit", limit)
467
+ if "offset" in query_config:
468
+ offset = query_config.get("offset", offset)
469
+ if "maximum_total_characters" in query_config:
470
+ maximum_total_characters = query_config.get(
471
+ "maximum_total_characters", maximum_total_characters
472
+ )
473
+ if "maximum_items_per_source" in query_config:
474
+ maximum_items_per_source = query_config.get(
475
+ "maximum_items_per_source",
476
+ maximum_items_per_source,
455
477
  )
478
+ if "max_items_per_source" in query_config and maximum_items_per_source is None:
479
+ maximum_items_per_source = query_config.get("max_items_per_source")
480
+ if "include_metadata" in query_config:
481
+ include_metadata = query_config.get("include_metadata", include_metadata)
482
+ if "metadata_fields" in query_config:
483
+ metadata_fields = query_config.get("metadata_fields", metadata_fields)
484
+ if "join_with" in query_config:
485
+ join_with = query_config.get("join_with", join_with)
456
486
 
457
487
  allocated_tokens = self._allocate_pack_budget(pack_budget, policy, weight)
458
488
  if allocated_tokens is not None:
@@ -486,11 +516,11 @@ class ContextAssembler:
486
516
  "maximum_items_per_source": maximum_items_per_source,
487
517
  "include_metadata": include_metadata,
488
518
  "metadata_fields": metadata_fields,
489
- "backend_id": backend_id,
519
+ "retriever_id": retriever_id,
490
520
  "corpus_root": corpus_root,
491
- "run_id": run_id,
492
- "recipe_name": recipe_name,
493
- "recipe_config": recipe_config,
521
+ "snapshot_id": snapshot_id,
522
+ "configuration_name": configuration_name,
523
+ "configuration": configuration,
494
524
  },
495
525
  )
496
526
  context_pack = self._retrieve_with_expansion(
@@ -6,7 +6,6 @@ from __future__ import annotations
6
6
 
7
7
  from typing import Any, Optional
8
8
 
9
- from biblicus.backends import get_backend
10
9
  from biblicus.context import (
11
10
  ContextPack,
12
11
  ContextPackPolicy,
@@ -15,67 +14,72 @@ from biblicus.context import (
15
14
  fit_context_pack_to_token_budget,
16
15
  )
17
16
  from biblicus.corpus import Corpus
18
- from biblicus.models import QueryBudget, RetrievalRun
17
+ from biblicus.models import QueryBudget, RetrievalSnapshot
18
+ from biblicus.retrievers import get_retriever
19
19
 
20
20
  from .models import ContextRetrieverRequest
21
21
 
22
22
 
23
- def _resolve_run(
23
+ def _resolve_snapshot(
24
24
  corpus: Corpus,
25
25
  *,
26
- backend_id: str,
27
- run_id: Optional[str],
28
- recipe_name: Optional[str],
29
- recipe_config: Optional[dict[str, Any]],
30
- ) -> RetrievalRun:
31
- if run_id:
32
- return corpus.load_run(run_id)
26
+ retriever_id: str,
27
+ snapshot_id: Optional[str],
28
+ configuration_name: Optional[str],
29
+ configuration: Optional[dict[str, Any]],
30
+ ) -> RetrievalSnapshot:
31
+ if snapshot_id:
32
+ return corpus.load_snapshot(snapshot_id)
33
33
 
34
- latest_run_id = corpus.latest_run_id
35
- if latest_run_id:
36
- candidate = corpus.load_run(latest_run_id)
37
- if candidate.recipe.backend_id == backend_id:
34
+ latest_snapshot_id = corpus.latest_snapshot_id
35
+ if latest_snapshot_id:
36
+ candidate = corpus.load_snapshot(latest_snapshot_id)
37
+ if candidate.configuration.retriever_id == retriever_id:
38
38
  return candidate
39
39
 
40
- if recipe_config is None:
40
+ if configuration is None:
41
41
  raise ValueError(
42
- "No retrieval run available for the requested backend. "
43
- "Provide run_id or recipe_config to build one."
42
+ "No retrieval snapshot available for the requested retriever. "
43
+ "Provide snapshot_id or configuration to build one."
44
44
  )
45
45
 
46
- backend = get_backend(backend_id)
47
- resolved_name = recipe_name or f"Context pack ({backend_id})"
48
- return backend.build_run(corpus, recipe_name=resolved_name, config=recipe_config)
46
+ retriever = get_retriever(retriever_id)
47
+ resolved_name = configuration_name or f"Context pack ({retriever_id})"
48
+ return retriever.build_snapshot(
49
+ corpus,
50
+ configuration_name=resolved_name,
51
+ configuration=configuration,
52
+ )
49
53
 
50
54
 
51
55
  def retrieve_context_pack(
52
56
  *,
53
57
  request: ContextRetrieverRequest,
54
58
  corpus: Corpus,
55
- backend_id: str,
56
- run_id: Optional[str] = None,
57
- recipe_name: Optional[str] = None,
58
- recipe_config: Optional[dict[str, Any]] = None,
59
+ retriever_id: str,
60
+ snapshot_id: Optional[str] = None,
61
+ configuration_name: Optional[str] = None,
62
+ configuration: Optional[dict[str, Any]] = None,
59
63
  join_with: str = "\n\n",
60
64
  max_items_per_source: Optional[int] = None,
61
65
  include_metadata: bool = False,
62
66
  metadata_fields: Optional[list[str]] = None,
63
67
  ) -> ContextPack:
64
68
  """
65
- Retrieve a context pack using a Biblicus backend.
69
+ Retrieve a context pack using a Biblicus retriever.
66
70
 
67
71
  :param request: Context retrieval request.
68
72
  :type request: biblicus.context_engine.ContextRetrieverRequest
69
73
  :param corpus: Corpus instance to query.
70
74
  :type corpus: biblicus.corpus.Corpus
71
- :param backend_id: Retrieval backend identifier.
72
- :type backend_id: str
73
- :param run_id: Optional retrieval run identifier.
74
- :type run_id: str or None
75
- :param recipe_name: Optional recipe name for run builds.
76
- :type recipe_name: str or None
77
- :param recipe_config: Optional backend recipe configuration.
78
- :type recipe_config: dict[str, Any] or None
75
+ :param retriever_id: Retrieval retriever identifier.
76
+ :type retriever_id: str
77
+ :param snapshot_id: Optional retrieval snapshot identifier.
78
+ :type snapshot_id: str or None
79
+ :param configuration_name: Optional configuration name for snapshot builds.
80
+ :type configuration_name: str or None
81
+ :param configuration: Optional retriever configuration.
82
+ :type configuration: dict[str, Any] or None
79
83
  :param join_with: Separator between context pack blocks.
80
84
  :type join_with: str
81
85
  :param max_items_per_source: Optional cap per source.
@@ -86,14 +90,14 @@ def retrieve_context_pack(
86
90
  :type metadata_fields: list[str] or None
87
91
  :return: Context pack derived from retrieval results.
88
92
  :rtype: biblicus.context.ContextPack
89
- :raises ValueError: If no compatible retrieval run is available.
93
+ :raises ValueError: If no compatible retrieval snapshot is available.
90
94
  """
91
- run = _resolve_run(
95
+ snapshot = _resolve_snapshot(
92
96
  corpus,
93
- backend_id=backend_id,
94
- run_id=run_id,
95
- recipe_name=recipe_name,
96
- recipe_config=recipe_config,
97
+ retriever_id=retriever_id,
98
+ snapshot_id=snapshot_id,
99
+ configuration_name=configuration_name,
100
+ configuration=configuration,
97
101
  )
98
102
 
99
103
  maximum_total_characters = request.maximum_total_characters
@@ -106,10 +110,10 @@ def retrieve_context_pack(
106
110
  maximum_total_characters=maximum_total_characters,
107
111
  max_items_per_source=max_items_per_source,
108
112
  )
109
- backend = get_backend(backend_id)
110
- result = backend.query(
113
+ retriever = get_retriever(retriever_id)
114
+ result = retriever.query(
111
115
  corpus,
112
- run=run,
116
+ snapshot=snapshot,
113
117
  query_text=request.query,
114
118
  budget=budget,
115
119
  )