biblicus 1.0.0__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. biblicus/__init__.py +5 -5
  2. biblicus/analysis/__init__.py +1 -1
  3. biblicus/analysis/base.py +10 -10
  4. biblicus/analysis/markov.py +78 -68
  5. biblicus/analysis/models.py +47 -47
  6. biblicus/analysis/profiling.py +58 -48
  7. biblicus/analysis/topic_modeling.py +56 -51
  8. biblicus/cli.py +224 -177
  9. biblicus/{recipes.py → configuration.py} +14 -14
  10. biblicus/constants.py +2 -2
  11. biblicus/context_engine/assembler.py +49 -19
  12. biblicus/context_engine/retrieval.py +46 -42
  13. biblicus/corpus.py +116 -108
  14. biblicus/errors.py +3 -3
  15. biblicus/evaluation.py +27 -25
  16. biblicus/extraction.py +103 -98
  17. biblicus/extraction_evaluation.py +26 -26
  18. biblicus/extractors/deepgram_stt.py +7 -7
  19. biblicus/extractors/docling_granite_text.py +11 -11
  20. biblicus/extractors/docling_smol_text.py +11 -11
  21. biblicus/extractors/markitdown_text.py +4 -4
  22. biblicus/extractors/openai_stt.py +7 -7
  23. biblicus/extractors/paddleocr_vl_text.py +20 -18
  24. biblicus/extractors/pipeline.py +8 -8
  25. biblicus/extractors/rapidocr_text.py +3 -3
  26. biblicus/extractors/unstructured_text.py +3 -3
  27. biblicus/hooks.py +4 -4
  28. biblicus/knowledge_base.py +33 -31
  29. biblicus/models.py +78 -78
  30. biblicus/retrieval.py +47 -40
  31. biblicus/retrievers/__init__.py +50 -0
  32. biblicus/retrievers/base.py +65 -0
  33. biblicus/{backends → retrievers}/embedding_index_common.py +44 -41
  34. biblicus/{backends → retrievers}/embedding_index_file.py +87 -58
  35. biblicus/{backends → retrievers}/embedding_index_inmemory.py +88 -59
  36. biblicus/retrievers/hybrid.py +301 -0
  37. biblicus/{backends → retrievers}/scan.py +83 -73
  38. biblicus/{backends → retrievers}/sqlite_full_text_search.py +115 -101
  39. biblicus/{backends → retrievers}/tf_vector.py +87 -77
  40. biblicus/text/prompts.py +16 -8
  41. biblicus/text/tool_loop.py +63 -5
  42. {biblicus-1.0.0.dist-info → biblicus-1.1.1.dist-info}/METADATA +52 -43
  43. biblicus-1.1.1.dist-info/RECORD +91 -0
  44. biblicus/backends/__init__.py +0 -50
  45. biblicus/backends/base.py +0 -65
  46. biblicus/backends/hybrid.py +0 -292
  47. biblicus-1.0.0.dist-info/RECORD +0 -91
  48. {biblicus-1.0.0.dist-info → biblicus-1.1.1.dist-info}/WHEEL +0 -0
  49. {biblicus-1.0.0.dist-info → biblicus-1.1.1.dist-info}/entry_points.txt +0 -0
  50. {biblicus-1.0.0.dist-info → biblicus-1.1.1.dist-info}/licenses/LICENSE +0 -0
  51. {biblicus-1.0.0.dist-info → biblicus-1.1.1.dist-info}/top_level.txt +0 -0
@@ -12,7 +12,7 @@ from typing import Any, Dict, List, Optional
12
12
  from pydantic import BaseModel, ConfigDict, Field
13
13
 
14
14
  from ..corpus import Corpus
15
- from ..errors import ExtractionRunFatalError
15
+ from ..errors import ExtractionSnapshotFatalError
16
16
  from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
17
17
  from .base import TextExtractor
18
18
 
@@ -40,14 +40,14 @@ class DoclingSmolExtractorConfig(BaseModel):
40
40
 
41
41
  :ivar output_format: Output format for extracted content (markdown, text, or html).
42
42
  :vartype output_format: str
43
- :ivar backend: Inference backend (mlx or transformers).
44
- :vartype backend: str
43
+ :ivar retriever: Inference retriever (mlx or transformers).
44
+ :vartype retriever: str
45
45
  """
46
46
 
47
- model_config = ConfigDict(extra="forbid")
47
+ model_config = ConfigDict(extra="forbid", populate_by_name=True)
48
48
 
49
49
  output_format: str = Field(default="markdown", pattern="^(markdown|text|html)$")
50
- backend: str = Field(default="mlx", pattern="^(mlx|transformers)$")
50
+ retriever: str = Field(default="mlx", pattern="^(mlx|transformers)$", alias="backend")
51
51
 
52
52
 
53
53
  class DoclingSmolExtractor(TextExtractor):
@@ -71,7 +71,7 @@ class DoclingSmolExtractor(TextExtractor):
71
71
  :type config: dict[str, Any]
72
72
  :return: Parsed config.
73
73
  :rtype: DoclingSmolExtractorConfig
74
- :raises ExtractionRunFatalError: If the optional dependency is not installed.
74
+ :raises ExtractionSnapshotFatalError: If the optional dependency is not installed.
75
75
  """
76
76
  parsed = DoclingSmolExtractorConfig.model_validate(config)
77
77
 
@@ -82,19 +82,19 @@ class DoclingSmolExtractor(TextExtractor):
82
82
  vlm_model_specs,
83
83
  )
84
84
  except ImportError as import_error:
85
- raise ExtractionRunFatalError(
85
+ raise ExtractionSnapshotFatalError(
86
86
  "DoclingSmol extractor requires an optional dependency. "
87
87
  'Install it with pip install "biblicus[docling]".'
88
88
  ) from import_error
89
89
 
90
- if parsed.backend == "mlx":
90
+ if parsed.retriever == "mlx":
91
91
  try:
92
92
  from docling.pipeline_options import vlm_model_specs
93
93
 
94
94
  _ = vlm_model_specs.SMOLDOCLING_MLX
95
95
  except (ImportError, AttributeError) as exc:
96
- raise ExtractionRunFatalError(
97
- "DoclingSmol extractor with MLX backend requires MLX support. "
96
+ raise ExtractionSnapshotFatalError(
97
+ "DoclingSmol extractor with MLX retriever requires MLX support. "
98
98
  'Install it with pip install "biblicus[docling-mlx]".'
99
99
  ) from exc
100
100
 
@@ -167,7 +167,7 @@ class DoclingSmolExtractor(TextExtractor):
167
167
  from docling.format_options import InputFormat, PdfFormatOption
168
168
  from docling.pipeline_options import VlmPipelineOptions, vlm_model_specs
169
169
 
170
- if config.backend == "mlx":
170
+ if config.retriever == "mlx":
171
171
  vlm_options = vlm_model_specs.SMOLDOCLING_MLX
172
172
  else:
173
173
  vlm_options = vlm_model_specs.SMOLDOCLING_TRANSFORMERS
@@ -12,7 +12,7 @@ from typing import Any, Dict, List, Optional
12
12
  from pydantic import BaseModel, ConfigDict, Field
13
13
 
14
14
  from ..corpus import Corpus
15
- from ..errors import ExtractionRunFatalError
15
+ from ..errors import ExtractionSnapshotFatalError
16
16
  from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
17
17
  from .base import TextExtractor
18
18
 
@@ -52,18 +52,18 @@ class MarkItDownExtractor(TextExtractor):
52
52
  :type config: dict[str, Any]
53
53
  :return: Parsed config.
54
54
  :rtype: MarkItDownExtractorConfig
55
- :raises ExtractionRunFatalError: If the optional dependency is not installed.
55
+ :raises ExtractionSnapshotFatalError: If the optional dependency is not installed.
56
56
  """
57
57
  try:
58
58
  import markitdown
59
59
  from markitdown import MarkItDown # noqa: F401
60
60
  except ImportError as import_error:
61
- raise ExtractionRunFatalError(
61
+ raise ExtractionSnapshotFatalError(
62
62
  "MarkItDown extractor requires an optional dependency. "
63
63
  'Install it with pip install "biblicus[markitdown]".'
64
64
  ) from import_error
65
65
  if sys.version_info < (3, 10) and not getattr(markitdown, "__biblicus_fake__", False):
66
- raise ExtractionRunFatalError(
66
+ raise ExtractionSnapshotFatalError(
67
67
  "MarkItDown requires Python 3.10 or higher. "
68
68
  "Upgrade your interpreter or use a compatible extractor."
69
69
  )
@@ -11,7 +11,7 @@ from typing import Any, Dict, List, Optional
11
11
  from pydantic import BaseModel, ConfigDict, Field, model_validator
12
12
 
13
13
  from ..corpus import Corpus
14
- from ..errors import ExtractionRunFatalError
14
+ from ..errors import ExtractionSnapshotFatalError
15
15
  from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
16
16
  from ..user_config import resolve_openai_api_key
17
17
  from .base import TextExtractor
@@ -74,19 +74,19 @@ class OpenAiSpeechToTextExtractor(TextExtractor):
74
74
  :type config: dict[str, Any]
75
75
  :return: Parsed configuration model.
76
76
  :rtype: OpenAiSpeechToTextExtractorConfig
77
- :raises ExtractionRunFatalError: If the optional dependency or required environment is missing.
77
+ :raises ExtractionSnapshotFatalError: If the optional dependency or required environment is missing.
78
78
  """
79
79
  try:
80
80
  from openai import OpenAI # noqa: F401
81
81
  except ImportError as import_error:
82
- raise ExtractionRunFatalError(
82
+ raise ExtractionSnapshotFatalError(
83
83
  "OpenAI speech to text extractor requires an optional dependency. "
84
84
  'Install it with pip install "biblicus[openai]".'
85
85
  ) from import_error
86
86
 
87
87
  api_key = resolve_openai_api_key()
88
88
  if api_key is None:
89
- raise ExtractionRunFatalError(
89
+ raise ExtractionSnapshotFatalError(
90
90
  "OpenAI speech to text extractor requires an OpenAI API key. "
91
91
  "Set OPENAI_API_KEY or configure it in ~/.biblicus/config.yml or ./.biblicus/config.yml under "
92
92
  "openai.api_key."
@@ -115,7 +115,7 @@ class OpenAiSpeechToTextExtractor(TextExtractor):
115
115
  :type previous_extractions: list[biblicus.models.ExtractionStepOutput]
116
116
  :return: Extracted text payload, or None when the item is not audio.
117
117
  :rtype: ExtractedText or None
118
- :raises ExtractionRunFatalError: If the optional dependency or required configuration is missing.
118
+ :raises ExtractionSnapshotFatalError: If the optional dependency or required configuration is missing.
119
119
  """
120
120
  _ = previous_extractions
121
121
  if not item.media_type.startswith("audio/"):
@@ -129,7 +129,7 @@ class OpenAiSpeechToTextExtractor(TextExtractor):
129
129
 
130
130
  api_key = resolve_openai_api_key()
131
131
  if api_key is None:
132
- raise ExtractionRunFatalError(
132
+ raise ExtractionSnapshotFatalError(
133
133
  "OpenAI speech to text extractor requires an OpenAI API key. "
134
134
  "Set OPENAI_API_KEY or configure it in ~/.biblicus/config.yml or ./.biblicus/config.yml under "
135
135
  "openai.api_key."
@@ -138,7 +138,7 @@ class OpenAiSpeechToTextExtractor(TextExtractor):
138
138
  try:
139
139
  from openai import OpenAI
140
140
  except ImportError as import_error:
141
- raise ExtractionRunFatalError(
141
+ raise ExtractionSnapshotFatalError(
142
142
  "OpenAI speech to text extractor requires an optional dependency. "
143
143
  'Install it with pip install "biblicus[openai]".'
144
144
  ) from import_error
@@ -16,7 +16,7 @@ from typing import Any, ClassVar, Dict, List, Optional, Tuple
16
16
  from pydantic import BaseModel, ConfigDict, Field
17
17
 
18
18
  from ..corpus import Corpus
19
- from ..errors import ExtractionRunFatalError
19
+ from ..errors import ExtractionSnapshotFatalError
20
20
  from ..inference import ApiProvider, InferenceBackendConfig, InferenceBackendMode, resolve_api_key
21
21
  from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
22
22
  from .base import TextExtractor
@@ -26,8 +26,8 @@ class PaddleOcrVlExtractorConfig(BaseModel):
26
26
  """
27
27
  Configuration for the PaddleOCR-VL extractor.
28
28
 
29
- :ivar backend: Inference backend configuration for local or application programming interface execution.
30
- :vartype backend: InferenceBackendConfig
29
+ :ivar retriever: Inference retriever configuration for local or application programming interface execution.
30
+ :vartype retriever: InferenceBackendConfig
31
31
  :ivar min_confidence: Minimum confidence threshold for including text.
32
32
  :vartype min_confidence: float
33
33
  :ivar joiner: String used to join recognized text lines.
@@ -38,9 +38,11 @@ class PaddleOcrVlExtractorConfig(BaseModel):
38
38
  :vartype lang: str
39
39
  """
40
40
 
41
- model_config = ConfigDict(extra="forbid")
41
+ model_config = ConfigDict(extra="forbid", populate_by_name=True)
42
42
 
43
- backend: InferenceBackendConfig = Field(default_factory=InferenceBackendConfig)
43
+ retriever: InferenceBackendConfig = Field(
44
+ default_factory=InferenceBackendConfig, alias="backend"
45
+ )
44
46
  min_confidence: float = Field(default=0.5, ge=0.0, le=1.0)
45
47
  joiner: str = Field(default="\n")
46
48
  use_angle_cls: bool = Field(default=True)
@@ -70,7 +72,7 @@ class PaddleOcrVlExtractor(TextExtractor):
70
72
  :type config: dict[str, Any]
71
73
  :return: Parsed configuration model.
72
74
  :rtype: PaddleOcrVlExtractorConfig
73
- :raises ExtractionRunFatalError: If required dependencies are missing.
75
+ :raises ExtractionSnapshotFatalError: If required dependencies are missing.
74
76
  """
75
77
  import json
76
78
 
@@ -86,26 +88,26 @@ class PaddleOcrVlExtractor(TextExtractor):
86
88
 
87
89
  parsed = PaddleOcrVlExtractorConfig.model_validate(parsed_config)
88
90
 
89
- if parsed.backend.mode == InferenceBackendMode.LOCAL:
91
+ if parsed.retriever.mode == InferenceBackendMode.LOCAL:
90
92
  try:
91
93
  from paddleocr import PaddleOCR # noqa: F401
92
94
  except ImportError as import_error:
93
- raise ExtractionRunFatalError(
95
+ raise ExtractionSnapshotFatalError(
94
96
  "PaddleOCR-VL extractor (local mode) requires paddleocr. "
95
97
  'Install it with pip install "biblicus[paddleocr]".'
96
98
  ) from import_error
97
99
  else:
98
100
  # api_provider is guaranteed to be set by InferenceBackendConfig validator
99
101
  api_key = resolve_api_key(
100
- parsed.backend.api_provider,
101
- config_override=parsed.backend.api_key,
102
+ parsed.retriever.api_provider,
103
+ config_override=parsed.retriever.api_key,
102
104
  )
103
105
  if api_key is None:
104
- provider_name = parsed.backend.api_provider.value.upper()
105
- raise ExtractionRunFatalError(
106
+ provider_name = parsed.retriever.api_provider.value.upper()
107
+ raise ExtractionSnapshotFatalError(
106
108
  f"PaddleOCR-VL extractor (API mode) requires an API key for {provider_name}. "
107
109
  f"Set {provider_name}_API_KEY environment variable or configure "
108
- f"{parsed.backend.api_provider.value} in user config."
110
+ f"{parsed.retriever.api_provider.value} in user config."
109
111
  )
110
112
 
111
113
  return parsed
@@ -145,12 +147,12 @@ class PaddleOcrVlExtractor(TextExtractor):
145
147
 
146
148
  source_path = corpus.root / item.relpath
147
149
 
148
- if parsed_config.backend.mode == InferenceBackendMode.LOCAL:
150
+ if parsed_config.retriever.mode == InferenceBackendMode.LOCAL:
149
151
  text, confidence = self._extract_local(source_path, parsed_config)
150
152
  else:
151
153
  api_key = resolve_api_key(
152
- parsed_config.backend.api_provider,
153
- config_override=parsed_config.backend.api_key,
154
+ parsed_config.retriever.api_provider,
155
+ config_override=parsed_config.retriever.api_key,
154
156
  )
155
157
  text, confidence = self._extract_via_api(source_path, parsed_config, api_key)
156
158
 
@@ -228,7 +230,7 @@ class PaddleOcrVlExtractor(TextExtractor):
228
230
  :return: Tuple of extracted text and confidence score.
229
231
  :rtype: tuple[str, float or None]
230
232
  """
231
- if config.backend.api_provider == ApiProvider.HUGGINGFACE:
233
+ if config.retriever.api_provider == ApiProvider.HUGGINGFACE:
232
234
  return self._extract_via_huggingface_api(source_path, config, api_key)
233
235
  else:
234
236
  return "", None
@@ -257,7 +259,7 @@ class PaddleOcrVlExtractor(TextExtractor):
257
259
 
258
260
  headers = {"Authorization": f"Bearer {api_key}"}
259
261
 
260
- model_id = config.backend.model_id or "PaddlePaddle/PaddleOCR-VL"
262
+ model_id = config.retriever.model_id or "PaddlePaddle/PaddleOCR-VL"
261
263
  api_url = f"https://api-inference.huggingface.co/models/{model_id}"
262
264
  response = requests.post(
263
265
  api_url,
@@ -9,7 +9,7 @@ from typing import Any, Dict, List, Optional
9
9
  from pydantic import BaseModel, ConfigDict, Field, model_validator
10
10
 
11
11
  from ..corpus import Corpus
12
- from ..errors import ExtractionRunFatalError
12
+ from ..errors import ExtractionSnapshotFatalError
13
13
  from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
14
14
  from .base import TextExtractor
15
15
 
@@ -20,14 +20,14 @@ class PipelineStepSpec(BaseModel):
20
20
 
21
21
  :ivar extractor_id: Extractor plugin identifier.
22
22
  :vartype extractor_id: str
23
- :ivar config: Extractor configuration mapping.
24
- :vartype config: dict[str, Any]
23
+ :ivar configuration: Extractor configuration mapping.
24
+ :vartype configuration: dict[str, Any]
25
25
  """
26
26
 
27
- model_config = ConfigDict(extra="forbid")
27
+ model_config = ConfigDict(extra="forbid", populate_by_name=True)
28
28
 
29
29
  extractor_id: str = Field(min_length=1)
30
- config: Dict[str, Any] = Field(default_factory=dict)
30
+ configuration: Dict[str, Any] = Field(default_factory=dict, alias="config")
31
31
 
32
32
 
33
33
  class PipelineExtractorConfig(BaseModel):
@@ -92,7 +92,7 @@ class PipelineExtractor(TextExtractor):
92
92
  :type config: PipelineExtractorConfig
93
93
  :param previous_extractions: Prior step outputs for this item within the pipeline.
94
94
  :type previous_extractions: list[biblicus.models.ExtractionStepOutput]
95
- :raises ExtractionRunFatalError: Always, because the pipeline is executed by the runner.
95
+ :raises ExtractionSnapshotFatalError: Always, because the pipeline is executed by the runner.
96
96
  :return: None.
97
97
  :rtype: None
98
98
  """
@@ -100,6 +100,6 @@ class PipelineExtractor(TextExtractor):
100
100
  _ = item
101
101
  _ = config
102
102
  _ = previous_extractions
103
- raise ExtractionRunFatalError(
104
- "Pipeline extractor must be executed by the extraction runner."
103
+ raise ExtractionSnapshotFatalError(
104
+ "Pipeline extractor must be executed by the extraction snapshotner."
105
105
  )
@@ -12,7 +12,7 @@ from typing import Any, Dict, List, Optional
12
12
  from pydantic import BaseModel, ConfigDict, Field
13
13
 
14
14
  from ..corpus import Corpus
15
- from ..errors import ExtractionRunFatalError
15
+ from ..errors import ExtractionSnapshotFatalError
16
16
  from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
17
17
  from .base import TextExtractor
18
18
 
@@ -54,12 +54,12 @@ class RapidOcrExtractor(TextExtractor):
54
54
  :type config: dict[str, Any]
55
55
  :return: Parsed configuration model.
56
56
  :rtype: RapidOcrExtractorConfig
57
- :raises ExtractionRunFatalError: If the optional dependency is missing.
57
+ :raises ExtractionSnapshotFatalError: If the optional dependency is missing.
58
58
  """
59
59
  try:
60
60
  from rapidocr_onnxruntime import RapidOCR # noqa: F401
61
61
  except ImportError as import_error:
62
- raise ExtractionRunFatalError(
62
+ raise ExtractionSnapshotFatalError(
63
63
  "RapidOCR extractor requires an optional dependency. "
64
64
  'Install it with pip install "biblicus[ocr]".'
65
65
  ) from import_error
@@ -11,7 +11,7 @@ from typing import Any, Dict, List, Optional
11
11
  from pydantic import BaseModel, ConfigDict
12
12
 
13
13
  from ..corpus import Corpus
14
- from ..errors import ExtractionRunFatalError
14
+ from ..errors import ExtractionSnapshotFatalError
15
15
  from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
16
16
  from .base import TextExtractor
17
17
 
@@ -48,12 +48,12 @@ class UnstructuredExtractor(TextExtractor):
48
48
  :type config: dict[str, Any]
49
49
  :return: Parsed config.
50
50
  :rtype: UnstructuredExtractorConfig
51
- :raises ExtractionRunFatalError: If the optional dependency is not installed.
51
+ :raises ExtractionSnapshotFatalError: If the optional dependency is not installed.
52
52
  """
53
53
  try:
54
54
  from unstructured.partition.auto import partition # noqa: F401
55
55
  except ImportError as import_error:
56
- raise ExtractionRunFatalError(
56
+ raise ExtractionSnapshotFatalError(
57
57
  "Unstructured extractor requires an optional dependency. "
58
58
  'Install it with pip install "biblicus[unstructured]".'
59
59
  ) from import_error
biblicus/hooks.py CHANGED
@@ -18,8 +18,8 @@ class HookPoint(str, Enum):
18
18
  :cvar after_ingest: Called after an item is ingested and indexed.
19
19
  :cvar before_reindex: Called before a catalog rebuild starts.
20
20
  :cvar after_reindex: Called after a catalog rebuild completes.
21
- :cvar before_build_run: Called before a backend run build starts.
22
- :cvar after_build_run: Called after a backend run build completes.
21
+ :cvar before_build_snapshot: Called before a retriever snapshot build starts.
22
+ :cvar after_build_snapshot: Called after a retriever snapshot build completes.
23
23
  :cvar before_query: Called before a query is executed.
24
24
  :cvar after_query: Called after a query completes.
25
25
  :cvar before_evaluate_run: Called before an evaluation starts.
@@ -30,8 +30,8 @@ class HookPoint(str, Enum):
30
30
  after_ingest = "after_ingest"
31
31
  before_reindex = "before_reindex"
32
32
  after_reindex = "after_reindex"
33
- before_build_run = "before_build_run"
34
- after_build_run = "after_build_run"
33
+ before_build_snapshot = "before_build_snapshot"
34
+ after_build_snapshot = "after_build_snapshot"
35
35
  before_query = "before_query"
36
36
  after_query = "after_query"
37
37
  before_evaluate_run = "before_evaluate_run"
@@ -11,7 +11,6 @@ from typing import List, Optional, Sequence
11
11
 
12
12
  from pydantic import BaseModel, ConfigDict, Field
13
13
 
14
- from .backends import get_backend
15
14
  from .context import (
16
15
  ContextPack,
17
16
  ContextPackPolicy,
@@ -20,17 +19,18 @@ from .context import (
20
19
  fit_context_pack_to_token_budget,
21
20
  )
22
21
  from .corpus import Corpus
23
- from .models import QueryBudget, RetrievalResult, RetrievalRun
22
+ from .models import QueryBudget, RetrievalResult, RetrievalSnapshot
23
+ from .retrievers import get_retriever
24
24
 
25
25
 
26
26
  class KnowledgeBaseDefaults(BaseModel):
27
27
  """
28
28
  Default configuration for a knowledge base workflow.
29
29
 
30
- :ivar backend_id: Backend identifier to use for retrieval.
31
- :vartype backend_id: str
32
- :ivar recipe_name: Human-readable retrieval recipe name.
33
- :vartype recipe_name: str
30
+ :ivar retriever_id: Retriever identifier to use for retrieval.
31
+ :vartype retriever_id: str
32
+ :ivar configuration_name: Human-readable retrieval configuration name.
33
+ :vartype configuration_name: str
34
34
  :ivar query_budget: Default query budget to apply to retrieval.
35
35
  :vartype query_budget: QueryBudget
36
36
  :ivar tags: Tags to apply when importing the folder.
@@ -39,8 +39,8 @@ class KnowledgeBaseDefaults(BaseModel):
39
39
 
40
40
  model_config = ConfigDict(extra="forbid")
41
41
 
42
- backend_id: str = Field(default="scan", min_length=1)
43
- recipe_name: str = Field(default="Knowledge base", min_length=1)
42
+ retriever_id: str = Field(default="scan", min_length=1)
43
+ configuration_name: str = Field(default="Knowledge base", min_length=1)
44
44
  query_budget: QueryBudget = Field(
45
45
  default_factory=lambda: QueryBudget(
46
46
  max_total_items=5,
@@ -58,17 +58,17 @@ class KnowledgeBase:
58
58
 
59
59
  :ivar corpus: Corpus instance that stores the ingested items.
60
60
  :vartype corpus: Corpus
61
- :ivar backend_id: Backend identifier used for retrieval.
62
- :vartype backend_id: str
63
- :ivar run: Retrieval run manifest associated with the knowledge base.
64
- :vartype run: RetrievalRun
61
+ :ivar retriever_id: Retriever identifier used for retrieval.
62
+ :vartype retriever_id: str
63
+ :ivar snapshot: Retrieval snapshot manifest associated with the knowledge base.
64
+ :vartype snapshot: RetrievalSnapshot
65
65
  :ivar defaults: Default configuration used for this knowledge base.
66
66
  :vartype defaults: KnowledgeBaseDefaults
67
67
  """
68
68
 
69
69
  corpus: Corpus
70
- backend_id: str
71
- run: RetrievalRun
70
+ retriever_id: str
71
+ snapshot: RetrievalSnapshot
72
72
  defaults: KnowledgeBaseDefaults
73
73
  _temp_dir: Optional[TemporaryDirectory]
74
74
 
@@ -77,8 +77,8 @@ class KnowledgeBase:
77
77
  cls,
78
78
  folder: str | Path,
79
79
  *,
80
- backend_id: Optional[str] = None,
81
- recipe_name: Optional[str] = None,
80
+ retriever_id: Optional[str] = None,
81
+ configuration_name: Optional[str] = None,
82
82
  query_budget: Optional[QueryBudget] = None,
83
83
  tags: Optional[Sequence[str]] = None,
84
84
  corpus_root: Optional[str | Path] = None,
@@ -88,10 +88,10 @@ class KnowledgeBase:
88
88
 
89
89
  :param folder: Folder containing source files.
90
90
  :type folder: str or Path
91
- :param backend_id: Optional backend identifier override.
92
- :type backend_id: str or None
93
- :param recipe_name: Optional recipe name override.
94
- :type recipe_name: str or None
91
+ :param retriever_id: Optional retriever identifier override.
92
+ :type retriever_id: str or None
93
+ :param configuration_name: Optional configuration name override.
94
+ :type configuration_name: str or None
95
95
  :param query_budget: Optional query budget override.
96
96
  :type query_budget: QueryBudget or None
97
97
  :param tags: Optional tags to apply during import.
@@ -110,8 +110,8 @@ class KnowledgeBase:
110
110
  raise NotADirectoryError(f"Knowledge base folder is not a directory: {source_root}")
111
111
 
112
112
  defaults = KnowledgeBaseDefaults()
113
- resolved_backend_id = backend_id or defaults.backend_id
114
- resolved_recipe_name = recipe_name or defaults.recipe_name
113
+ resolved_retriever_id = retriever_id or defaults.retriever_id
114
+ resolved_configuration_name = configuration_name or defaults.configuration_name
115
115
  resolved_query_budget = query_budget or defaults.query_budget
116
116
  resolved_tags = list(tags) if tags is not None else defaults.tags
117
117
 
@@ -125,16 +125,18 @@ class KnowledgeBase:
125
125
  corpus = Corpus.init(corpus_root_path)
126
126
  corpus.import_tree(source_root, tags=resolved_tags)
127
127
 
128
- backend = get_backend(resolved_backend_id)
129
- run = backend.build_run(corpus, recipe_name=resolved_recipe_name, config={})
128
+ retriever = get_retriever(resolved_retriever_id)
129
+ snapshot = retriever.build_snapshot(
130
+ corpus, configuration_name=resolved_configuration_name, configuration={}
131
+ )
130
132
 
131
133
  return cls(
132
134
  corpus=corpus,
133
- backend_id=resolved_backend_id,
134
- run=run,
135
+ retriever_id=resolved_retriever_id,
136
+ snapshot=snapshot,
135
137
  defaults=KnowledgeBaseDefaults(
136
- backend_id=resolved_backend_id,
137
- recipe_name=resolved_recipe_name,
138
+ retriever_id=resolved_retriever_id,
139
+ configuration_name=resolved_configuration_name,
138
140
  query_budget=resolved_query_budget,
139
141
  tags=resolved_tags,
140
142
  ),
@@ -152,11 +154,11 @@ class KnowledgeBase:
152
154
  :return: Retrieval result containing evidence.
153
155
  :rtype: RetrievalResult
154
156
  """
155
- backend = get_backend(self.backend_id)
157
+ retriever = get_retriever(self.retriever_id)
156
158
  resolved_budget = budget or self.defaults.query_budget
157
- return backend.query(
159
+ return retriever.query(
158
160
  self.corpus,
159
- run=self.run,
161
+ snapshot=self.snapshot,
160
162
  query_text=query_text,
161
163
  budget=resolved_budget,
162
164
  )