biblicus 0.1.1__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +2 -2
- biblicus/_vendor/dotyaml/__init__.py +14 -0
- biblicus/_vendor/dotyaml/interpolation.py +63 -0
- biblicus/_vendor/dotyaml/loader.py +181 -0
- biblicus/_vendor/dotyaml/transformer.py +135 -0
- biblicus/backends/__init__.py +0 -2
- biblicus/backends/base.py +3 -3
- biblicus/backends/scan.py +96 -13
- biblicus/backends/sqlite_full_text_search.py +74 -14
- biblicus/cli.py +126 -19
- biblicus/constants.py +2 -0
- biblicus/corpus.py +455 -45
- biblicus/errors.py +15 -0
- biblicus/evaluation.py +4 -8
- biblicus/extraction.py +529 -0
- biblicus/extractors/__init__.py +44 -0
- biblicus/extractors/base.py +68 -0
- biblicus/extractors/metadata_text.py +106 -0
- biblicus/extractors/openai_stt.py +180 -0
- biblicus/extractors/pass_through_text.py +84 -0
- biblicus/extractors/pdf_text.py +100 -0
- biblicus/extractors/pipeline.py +105 -0
- biblicus/extractors/rapidocr_text.py +129 -0
- biblicus/extractors/select_longest_text.py +105 -0
- biblicus/extractors/select_text.py +100 -0
- biblicus/extractors/unstructured_text.py +100 -0
- biblicus/frontmatter.py +0 -3
- biblicus/hook_logging.py +180 -0
- biblicus/hook_manager.py +203 -0
- biblicus/hooks.py +261 -0
- biblicus/ignore.py +64 -0
- biblicus/models.py +107 -0
- biblicus/retrieval.py +0 -4
- biblicus/sources.py +85 -5
- biblicus/time.py +0 -1
- biblicus/uris.py +3 -4
- biblicus/user_config.py +138 -0
- biblicus-0.3.0.dist-info/METADATA +336 -0
- biblicus-0.3.0.dist-info/RECORD +44 -0
- biblicus-0.1.1.dist-info/METADATA +0 -174
- biblicus-0.1.1.dist-info/RECORD +0 -22
- {biblicus-0.1.1.dist-info → biblicus-0.3.0.dist-info}/WHEEL +0 -0
- {biblicus-0.1.1.dist-info → biblicus-0.3.0.dist-info}/entry_points.txt +0 -0
- {biblicus-0.1.1.dist-info → biblicus-0.3.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-0.1.1.dist-info → biblicus-0.3.0.dist-info}/top_level.txt +0 -0
biblicus/__init__.py
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
Biblicus public package interface.
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
|
+
from .corpus import Corpus
|
|
5
6
|
from .models import (
|
|
6
7
|
CorpusConfig,
|
|
7
8
|
Evidence,
|
|
@@ -11,7 +12,6 @@ from .models import (
|
|
|
11
12
|
RetrievalResult,
|
|
12
13
|
RetrievalRun,
|
|
13
14
|
)
|
|
14
|
-
from .corpus import Corpus
|
|
15
15
|
|
|
16
16
|
__all__ = [
|
|
17
17
|
"__version__",
|
|
@@ -25,4 +25,4 @@ __all__ = [
|
|
|
25
25
|
"RetrievalRun",
|
|
26
26
|
]
|
|
27
27
|
|
|
28
|
-
__version__ = "0.1.1"
|
|
28
|
+
__version__ = "0.3.0"
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Vendored dotyaml utilities.
|
|
3
|
+
|
|
4
|
+
This package vendors the minimal pieces of the `dotyaml` project that Biblicus uses for
|
|
5
|
+
loading and interpolating YAML configuration files.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from .interpolation import interpolate_env_vars
|
|
11
|
+
from .loader import ConfigLoader, load_config
|
|
12
|
+
|
|
13
|
+
__all__ = ["ConfigLoader", "interpolate_env_vars", "load_config"]
|
|
14
|
+
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Environment variable interpolation functionality for dotyaml.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import os
|
|
8
|
+
import re
|
|
9
|
+
from typing import Any, Dict, Union
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def interpolate_env_vars(data: Union[str, Dict[str, Any], Any]) -> Union[str, Dict[str, Any], Any]:
    """
    Expand environment variable placeholders throughout a parsed YAML structure.

    Placeholders use the Jinja-like forms ``{{ ENV_VAR_NAME }}`` and
    ``{{ ENV_VAR_NAME|default_value }}``. Dictionaries and lists are rebuilt with
    every value or item processed recursively (dictionary keys are left as they
    are), strings are handed to ``_interpolate_string``, and any other value is
    returned unchanged.

    :param data: Data structure to interpolate (string, dict, list, etc).
    :type data: str or dict[str, Any] or Any
    :return: Data structure with environment variables interpolated.
    :rtype: str or dict[str, Any] or Any
    """
    if isinstance(data, dict):
        return {name: interpolate_env_vars(child) for name, child in data.items()}
    if isinstance(data, list):
        return list(map(interpolate_env_vars, data))
    if isinstance(data, str):
        return _interpolate_string(data)
    # Scalars such as numbers, booleans and None carry no placeholders.
    return data
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _interpolate_string(text: str) -> str:
|
|
34
|
+
"""
|
|
35
|
+
Interpolate environment variables in a string using Jinja-like syntax.
|
|
36
|
+
|
|
37
|
+
Supports:
|
|
38
|
+
- ``{{ ENV_VAR }}`` required environment variable.
|
|
39
|
+
- ``{{ ENV_VAR|default_value }}`` environment variable with default.
|
|
40
|
+
|
|
41
|
+
:param text: String to interpolate.
|
|
42
|
+
:type text: str
|
|
43
|
+
:return: String with environment variables interpolated.
|
|
44
|
+
:rtype: str
|
|
45
|
+
:raises ValueError: If a required environment variable is not found.
|
|
46
|
+
"""
|
|
47
|
+
|
|
48
|
+
pattern = r"\{\{\s*([A-Z_][A-Z0-9_]*)\s*(?:\|\s*([^}]*?))?\s*\}\}"
|
|
49
|
+
|
|
50
|
+
def replace_match(match): # type: ignore[no-untyped-def]
|
|
51
|
+
env_var = match.group(1)
|
|
52
|
+
default_value = match.group(2)
|
|
53
|
+
|
|
54
|
+
env_value = os.getenv(env_var)
|
|
55
|
+
|
|
56
|
+
if env_value is not None:
|
|
57
|
+
return env_value
|
|
58
|
+
if default_value is not None:
|
|
59
|
+
return default_value.strip()
|
|
60
|
+
raise ValueError(f"Required environment variable '{env_var}' not found")
|
|
61
|
+
|
|
62
|
+
return re.sub(pattern, replace_match, text)
|
|
63
|
+
|
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Core loading functionality for dotyaml.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import os
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any, Dict, Optional, Union
|
|
10
|
+
|
|
11
|
+
import yaml
|
|
12
|
+
|
|
13
|
+
try:
|
|
14
|
+
from dotenv import load_dotenv
|
|
15
|
+
|
|
16
|
+
DOTENV_AVAILABLE = True
|
|
17
|
+
except ImportError:
|
|
18
|
+
DOTENV_AVAILABLE = False
|
|
19
|
+
|
|
20
|
+
from .interpolation import interpolate_env_vars
|
|
21
|
+
from .transformer import flatten_dict, unflatten_env_vars
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def load_config(
    yaml_path: Optional[Union[str, Path]] = None,
    prefix: str = "",
    override: bool = False,
    dotenv_path: Optional[Union[str, Path]] = ".env",
    load_dotenv_first: bool = True,
) -> Dict[str, str]:
    """
    Load a YAML configuration file and export its values as environment variables.

    When dotenv support is available and requested, the first existing ``.env``
    candidate is loaded before the YAML file is read. An absolute ``dotenv_path``
    is the only candidate; a relative one is looked up in the current working
    directory first and then next to the YAML file.

    The YAML content is interpolated, flattened into environment-style keys and
    written to ``os.environ``. Variables that already exist are only replaced
    when ``override`` is true.

    :param yaml_path: Path to YAML configuration file. When None or missing, nothing is exported.
    :type yaml_path: str or Path or None
    :param prefix: Prefix for environment variable names (for example, ``APP``).
    :type prefix: str
    :param override: Whether to override existing environment variables.
    :type override: bool
    :param dotenv_path: Optional ``.env`` file path to load first.
    :type dotenv_path: str or Path or None
    :param load_dotenv_first: Whether to load ``.env`` before YAML.
    :type load_dotenv_first: bool
    :return: Mapping of flattened keys to their effective values (newly set values, or the
        pre-existing environment value when it was kept).
    :rtype: dict[str, str]
    """
    if load_dotenv_first and DOTENV_AVAILABLE and dotenv_path:
        if Path(dotenv_path).is_absolute():
            candidates = [Path(dotenv_path)]
        else:
            candidates = [Path.cwd() / dotenv_path]
            if yaml_path:
                candidates.append(Path(yaml_path).parent / dotenv_path)
        # Only the first candidate that exists is loaded.
        first_existing = next((candidate for candidate in candidates if candidate.exists()), None)
        if first_existing is not None:
            load_dotenv(first_existing)

    effective: Dict[str, str] = {}
    if not yaml_path or not Path(yaml_path).exists():
        return effective

    with open(yaml_path, "r", encoding="utf-8") as handle:
        parsed = yaml.safe_load(handle)
    if not parsed:
        return effective

    flattened = flatten_dict(interpolate_env_vars(parsed), prefix)
    for name, text in flattened.items():
        if override or name not in os.environ:
            os.environ[name] = text
            effective[name] = text
        else:
            # Keep the value already present in the environment and report it.
            effective[name] = os.environ[name]
    return effective
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
class ConfigLoader:
    """
    Configuration loader that can read YAML files or environment variables.

    :ivar prefix: Prefix used for environment variable names.
    :vartype prefix: str
    :ivar schema: Optional schema mapping; stored but not used by the methods of this class.
    :vartype schema: dict[str, Any] or None
    :ivar dotenv_path: Optional ``.env`` file path to load.
    :vartype dotenv_path: str or Path or None
    :ivar load_dotenv_first: Whether ``.env`` files are loaded before other sources.
    :vartype load_dotenv_first: bool
    """

    def __init__(
        self,
        prefix: str = "",
        schema: Optional[Dict[str, Any]] = None,
        dotenv_path: Optional[Union[str, Path]] = ".env",
        load_dotenv_first: bool = True,
    ):
        """
        Store the loader settings and load the ``.env`` file when enabled.

        :param prefix: Prefix for environment variable names.
        :type prefix: str
        :param schema: Optional schema mapping.
        :type schema: dict[str, Any] or None
        :param dotenv_path: Optional ``.env`` file path to load first.
        :type dotenv_path: str or Path or None
        :param load_dotenv_first: Whether to load ``.env`` on construction and before YAML.
        :type load_dotenv_first: bool
        """
        self.prefix = prefix
        self.schema = schema
        self.dotenv_path = dotenv_path
        self.load_dotenv_first = load_dotenv_first

        self._load_first_dotenv()

    def _load_first_dotenv(self, extra_directory: Optional[Path] = None) -> None:
        """
        Load the first existing ``.env`` candidate, if dotenv loading is enabled.

        An absolute ``dotenv_path`` is the only candidate. A relative one is looked
        up in the current working directory and then, when given, in
        ``extra_directory``.

        :param extra_directory: Additional directory to search for a relative ``.env`` path.
        :type extra_directory: Path or None
        :return: None.
        :rtype: None
        """
        if not (self.load_dotenv_first and DOTENV_AVAILABLE and self.dotenv_path):
            return

        if Path(self.dotenv_path).is_absolute():
            candidates = [Path(self.dotenv_path)]
        else:
            candidates = [Path.cwd() / self.dotenv_path]
            if extra_directory is not None:
                candidates.append(extra_directory / self.dotenv_path)

        for candidate in candidates:
            if candidate.exists():
                load_dotenv(candidate)
                return

    def load_from_yaml(self, yaml_path: Union[str, Path]) -> Dict[str, Any]:
        """
        Load configuration from a YAML file with environment variable interpolation.

        A missing file yields an empty mapping. Otherwise the ``.env`` lookup is
        repeated (also searching next to the YAML file) before the file is parsed.

        :param yaml_path: YAML configuration file path.
        :type yaml_path: str or Path
        :return: Parsed YAML data.
        :rtype: dict[str, Any]
        """
        source = Path(yaml_path)
        if not source.exists():
            return {}

        self._load_first_dotenv(source.parent)

        with open(yaml_path, "r", encoding="utf-8") as handle:
            parsed = yaml.safe_load(handle)

        if not parsed:
            # Empty documents (None, {}, "", ...) are normalised to an empty mapping.
            return {}
        return interpolate_env_vars(parsed) or {}

    def load_from_env(self) -> Dict[str, Any]:
        """
        Load configuration from environment variables.

        :return: Nested configuration dictionary.
        :rtype: dict[str, Any]
        """
        return unflatten_env_vars(dict(os.environ), self.prefix)

    def set_env_vars(self, config: Dict[str, Any], override: bool = False) -> None:
        """
        Set environment variables from a configuration dictionary.

        :param config: Configuration mapping.
        :type config: dict[str, Any]
        :param override: Whether to override existing environment variables.
        :type override: bool
        :return: None.
        :rtype: None
        """
        for name, text in flatten_dict(config, self.prefix).items():
            if not override and name in os.environ:
                continue
            os.environ[name] = text
|
|
181
|
+
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
"""
|
|
2
|
+
YAML to environment variable transformation utilities for dotyaml.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import json
|
|
8
|
+
from typing import Any, Dict
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def flatten_dict(data: Dict[str, Any], prefix: str = "", separator: str = "_") -> Dict[str, str]:
    """
    Flatten a nested dictionary into environment-variable style keys.

    Each key is upper-cased and joined to ``prefix`` with ``separator``; dashes
    and dots in the resulting name are replaced by underscores. Nested
    dictionaries are flattened recursively, all other values are converted with
    ``convert_value_to_string``.

    Keys that are not strings (YAML allows integer or boolean keys, for example
    ``1:`` or ``yes:``) are converted with ``str`` before upper-casing instead
    of raising ``AttributeError``.

    :param data: Nested dictionary to flatten.
    :type data: dict[str, Any]
    :param prefix: Prefix to add to all keys.
    :type prefix: str
    :param separator: Separator between key parts.
    :type separator: str
    :return: Flattened mapping with string values.
    :rtype: dict[str, str]
    """
    result: Dict[str, str] = {}

    for key, value in data.items():
        # str() keeps string keys unchanged and makes non-string keys usable.
        key_text = str(key).upper()
        full_key = f"{prefix}{separator}{key_text}" if prefix else key_text

        clean_key = full_key.replace("-", "_").replace(".", "_")

        if isinstance(value, dict):
            result.update(flatten_dict(value, clean_key, separator))
        else:
            result[clean_key] = convert_value_to_string(value)

    return result
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def convert_value_to_string(value: Any) -> str:
    """
    Render a Python value as the text stored in an environment variable.

    ``None`` becomes an empty string, booleans become ``true``/``false``, strings
    are returned unchanged, lists and tuples are joined with commas (each item
    converted recursively), dictionaries are serialised as JSON, and everything
    else (including numbers) goes through ``str``.

    :param value: Value to convert.
    :type value: Any
    :return: String representation suitable for environment variables.
    :rtype: str
    """
    if value is None:
        return ""
    # bool must be tested explicitly so that True/False do not render as "True"/"False".
    if isinstance(value, bool):
        return str(value).lower()
    if isinstance(value, str):
        return value
    if isinstance(value, (list, tuple)):
        return ",".join(map(convert_value_to_string, value))
    if isinstance(value, dict):
        return json.dumps(value)
    return str(value)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def unflatten_env_vars(env_vars: Dict[str, str], prefix: str = "") -> Dict[str, Any]:
    """
    Convert flat environment variables back to nested dictionary structure.

    When ``prefix`` is given, only variables starting with ``<prefix>_`` are
    considered and the prefix is removed. The remaining name is lower-cased and
    split on underscores; every part but the last becomes a nested dictionary
    level and the last part holds the value converted with
    ``convert_string_to_value``.

    Variables whose paths collide (for example ``A=1`` together with ``A_B=2``)
    no longer raise ``TypeError``: the nested mapping takes precedence over the
    scalar, regardless of the order in which the variables are seen.

    :param env_vars: Mapping of environment variables.
    :type env_vars: dict[str, str]
    :param prefix: Optional prefix to filter by.
    :type prefix: str
    :return: Nested dictionary structure.
    :rtype: dict[str, Any]
    """
    result: Dict[str, Any] = {}
    marker = f"{prefix}_" if prefix else ""

    for key, value in env_vars.items():
        if marker and not key.startswith(marker):
            continue

        parts = key[len(marker):].lower().split("_")

        current: Dict[str, Any] = result
        for part in parts[:-1]:
            child = current.get(part)
            if not isinstance(child, dict):
                # Either missing or a scalar set by a shorter variable name;
                # a nested level is needed here, so it replaces the scalar.
                child = {}
                current[part] = child
            current = child

        final_key = parts[-1]
        if isinstance(current.get(final_key), dict):
            # A longer variable already created a nested level under this name;
            # keep it rather than discarding its contents.
            continue
        current[final_key] = convert_string_to_value(value)

    return result
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def convert_string_to_value(value: str) -> Any:
    """
    Convert a string environment variable back to an appropriate Python type.

    Conversion rules, applied in order:

    - an empty string becomes ``None``;
    - ``true``/``false`` (any case) become booleans;
    - digit-only strings, optionally with a leading minus sign, become ``int``;
    - strings made only of digits, dots and dashes become ``float`` when
      parseable and are otherwise returned unchanged;
    - strings that look like a JSON object or array are parsed as JSON, so
      that commas inside them are not mistaken for list separators;
    - strings containing commas become lists whose items are converted
      recursively;
    - anything else is parsed as JSON when possible and returned unchanged
      when not.

    :param value: String value from an environment variable.
    :type value: str
    :return: Converted Python value.
    :rtype: Any
    """
    if value == "":
        return None

    lowered = value.lower()
    if lowered == "true":
        return True
    if lowered == "false":
        return False

    # Integers, including negative ones. str.isdigit() accepts characters such
    # as superscript digits that int() rejects, so the conversion is guarded.
    unsigned = value[1:] if value.startswith("-") else value
    if unsigned.isdigit():
        try:
            return int(value)
        except ValueError:
            return value

    if value.replace(".", "").replace("-", "").isdigit():
        try:
            return float(value)
        except ValueError:
            return value

    # JSON containers are tried before comma splitting; invalid JSON falls
    # through to the remaining rules.
    if value.lstrip().startswith(("{", "[")):
        try:
            return json.loads(value)
        except ValueError:
            pass

    if "," in value:
        items = [item.strip() for item in value.split(",")]
        return [convert_string_to_value(item) for item in items]

    try:
        return json.loads(value)
    except (json.JSONDecodeError, ValueError):
        return value
|
|
135
|
+
|
biblicus/backends/__init__.py
CHANGED
|
@@ -18,7 +18,6 @@ def available_backends() -> Dict[str, Type[RetrievalBackend]]:
|
|
|
18
18
|
:return: Mapping of backend identifiers to backend classes.
|
|
19
19
|
:rtype: dict[str, Type[RetrievalBackend]]
|
|
20
20
|
"""
|
|
21
|
-
|
|
22
21
|
return {
|
|
23
22
|
ScanBackend.backend_id: ScanBackend,
|
|
24
23
|
SqliteFullTextSearchBackend.backend_id: SqliteFullTextSearchBackend,
|
|
@@ -35,7 +34,6 @@ def get_backend(backend_id: str) -> RetrievalBackend:
|
|
|
35
34
|
:rtype: RetrievalBackend
|
|
36
35
|
:raises KeyError: If the backend identifier is unknown.
|
|
37
36
|
"""
|
|
38
|
-
|
|
39
37
|
registry = available_backends()
|
|
40
38
|
backend_class = registry.get(backend_id)
|
|
41
39
|
if backend_class is None:
|
biblicus/backends/base.py
CHANGED
|
@@ -22,7 +22,9 @@ class RetrievalBackend(ABC):
|
|
|
22
22
|
backend_id: str
|
|
23
23
|
|
|
24
24
|
@abstractmethod
|
|
25
|
-
def build_run(
|
|
25
|
+
def build_run(
|
|
26
|
+
self, corpus: Corpus, *, recipe_name: str, config: Dict[str, object]
|
|
27
|
+
) -> RetrievalRun:
|
|
26
28
|
"""
|
|
27
29
|
Build or register a retrieval run for the backend.
|
|
28
30
|
|
|
@@ -35,7 +37,6 @@ class RetrievalBackend(ABC):
|
|
|
35
37
|
:return: Run manifest describing the build.
|
|
36
38
|
:rtype: RetrievalRun
|
|
37
39
|
"""
|
|
38
|
-
|
|
39
40
|
raise NotImplementedError
|
|
40
41
|
|
|
41
42
|
@abstractmethod
|
|
@@ -61,5 +62,4 @@ class RetrievalBackend(ABC):
|
|
|
61
62
|
:return: Retrieval results containing evidence.
|
|
62
63
|
:rtype: RetrievalResult
|
|
63
64
|
"""
|
|
64
|
-
|
|
65
65
|
raise NotImplementedError
|
biblicus/backends/scan.py
CHANGED
|
@@ -10,7 +10,14 @@ from pydantic import BaseModel, ConfigDict, Field
|
|
|
10
10
|
|
|
11
11
|
from ..corpus import Corpus
|
|
12
12
|
from ..frontmatter import parse_front_matter
|
|
13
|
-
from ..models import
|
|
13
|
+
from ..models import (
|
|
14
|
+
Evidence,
|
|
15
|
+
ExtractionRunReference,
|
|
16
|
+
QueryBudget,
|
|
17
|
+
RetrievalResult,
|
|
18
|
+
RetrievalRun,
|
|
19
|
+
parse_extraction_run_reference,
|
|
20
|
+
)
|
|
14
21
|
from ..retrieval import apply_budget, create_recipe_manifest, create_run_manifest, hash_text
|
|
15
22
|
from ..time import utc_now_iso
|
|
16
23
|
|
|
@@ -21,11 +28,14 @@ class ScanRecipeConfig(BaseModel):
|
|
|
21
28
|
|
|
22
29
|
:ivar snippet_characters: Maximum characters to include in evidence snippets.
|
|
23
30
|
:vartype snippet_characters: int
|
|
31
|
+
:ivar extraction_run: Optional extraction run reference in the form extractor_id:run_id.
|
|
32
|
+
:vartype extraction_run: str or None
|
|
24
33
|
"""
|
|
25
34
|
|
|
26
35
|
model_config = ConfigDict(extra="forbid")
|
|
27
36
|
|
|
28
37
|
snippet_characters: int = Field(default=400, ge=1)
|
|
38
|
+
extraction_run: Optional[str] = None
|
|
29
39
|
|
|
30
40
|
|
|
31
41
|
class ScanBackend:
|
|
@@ -38,7 +48,9 @@ class ScanBackend:
|
|
|
38
48
|
|
|
39
49
|
backend_id = "scan"
|
|
40
50
|
|
|
41
|
-
def build_run(
|
|
51
|
+
def build_run(
|
|
52
|
+
self, corpus: Corpus, *, recipe_name: str, config: Dict[str, object]
|
|
53
|
+
) -> RetrievalRun:
|
|
42
54
|
"""
|
|
43
55
|
Register a scan backend run (no materialization).
|
|
44
56
|
|
|
@@ -51,7 +63,6 @@ class ScanBackend:
|
|
|
51
63
|
:return: Run manifest describing the build.
|
|
52
64
|
:rtype: RetrievalRun
|
|
53
65
|
"""
|
|
54
|
-
|
|
55
66
|
recipe_config = ScanRecipeConfig.model_validate(config)
|
|
56
67
|
catalog = corpus.load_catalog()
|
|
57
68
|
recipe = create_recipe_manifest(
|
|
@@ -59,7 +70,10 @@ class ScanBackend:
|
|
|
59
70
|
name=recipe_name,
|
|
60
71
|
config=recipe_config.model_dump(),
|
|
61
72
|
)
|
|
62
|
-
stats = {
|
|
73
|
+
stats = {
|
|
74
|
+
"items": len(catalog.items),
|
|
75
|
+
"text_items": _count_text_items(corpus, catalog.items.values(), recipe_config),
|
|
76
|
+
}
|
|
63
77
|
run = create_run_manifest(corpus, recipe=recipe, stats=stats, artifact_paths=[])
|
|
64
78
|
corpus.write_run(run)
|
|
65
79
|
return run
|
|
@@ -86,15 +100,16 @@ class ScanBackend:
|
|
|
86
100
|
:return: Retrieval results containing evidence.
|
|
87
101
|
:rtype: RetrievalResult
|
|
88
102
|
"""
|
|
89
|
-
|
|
90
103
|
recipe_config = ScanRecipeConfig.model_validate(run.recipe.config)
|
|
91
104
|
catalog = corpus.load_catalog()
|
|
105
|
+
extraction_reference = _resolve_extraction_reference(corpus, recipe_config)
|
|
92
106
|
query_tokens = _tokenize_query(query_text)
|
|
93
107
|
scored_candidates = _score_items(
|
|
94
108
|
corpus,
|
|
95
109
|
catalog.items.values(),
|
|
96
110
|
query_tokens,
|
|
97
111
|
recipe_config.snippet_characters,
|
|
112
|
+
extraction_reference=extraction_reference,
|
|
98
113
|
)
|
|
99
114
|
sorted_candidates = sorted(
|
|
100
115
|
scored_candidates,
|
|
@@ -124,18 +139,62 @@ class ScanBackend:
|
|
|
124
139
|
)
|
|
125
140
|
|
|
126
141
|
|
|
127
|
-
def
|
|
142
|
+
def _resolve_extraction_reference(
|
|
143
|
+
corpus: Corpus, recipe_config: ScanRecipeConfig
|
|
144
|
+
) -> Optional[ExtractionRunReference]:
|
|
145
|
+
"""
|
|
146
|
+
Resolve an extraction run reference from a recipe config.
|
|
147
|
+
|
|
148
|
+
:param corpus: Corpus associated with the recipe.
|
|
149
|
+
:type corpus: Corpus
|
|
150
|
+
:param recipe_config: Parsed scan recipe configuration.
|
|
151
|
+
:type recipe_config: ScanRecipeConfig
|
|
152
|
+
:return: Parsed extraction reference or None.
|
|
153
|
+
:rtype: ExtractionRunReference or None
|
|
154
|
+
:raises FileNotFoundError: If an extraction run is referenced but not present.
|
|
155
|
+
"""
|
|
156
|
+
if not recipe_config.extraction_run:
|
|
157
|
+
return None
|
|
158
|
+
extraction_reference = parse_extraction_run_reference(recipe_config.extraction_run)
|
|
159
|
+
run_dir = corpus.extraction_run_dir(
|
|
160
|
+
extractor_id=extraction_reference.extractor_id,
|
|
161
|
+
run_id=extraction_reference.run_id,
|
|
162
|
+
)
|
|
163
|
+
if not run_dir.is_dir():
|
|
164
|
+
raise FileNotFoundError(f"Missing extraction run: {extraction_reference.as_string()}")
|
|
165
|
+
return extraction_reference
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def _count_text_items(
|
|
169
|
+
corpus: Corpus, items: Iterable[object], recipe_config: ScanRecipeConfig
|
|
170
|
+
) -> int:
|
|
128
171
|
"""
|
|
129
172
|
Count catalog items that represent text content.
|
|
130
173
|
|
|
174
|
+
When an extraction run is configured, extracted artifacts are treated as text.
|
|
175
|
+
|
|
176
|
+
:param corpus: Corpus containing the items.
|
|
177
|
+
:type corpus: Corpus
|
|
131
178
|
:param items: Catalog items to inspect.
|
|
132
179
|
:type items: Iterable[object]
|
|
180
|
+
:param recipe_config: Parsed scan recipe configuration.
|
|
181
|
+
:type recipe_config: ScanRecipeConfig
|
|
133
182
|
:return: Number of text items.
|
|
134
183
|
:rtype: int
|
|
135
184
|
"""
|
|
136
|
-
|
|
137
185
|
text_item_count = 0
|
|
186
|
+
extraction_reference = _resolve_extraction_reference(corpus, recipe_config)
|
|
138
187
|
for catalog_item in items:
|
|
188
|
+
item_id = str(getattr(catalog_item, "id", ""))
|
|
189
|
+
if extraction_reference and item_id:
|
|
190
|
+
extracted_text = corpus.read_extracted_text(
|
|
191
|
+
extractor_id=extraction_reference.extractor_id,
|
|
192
|
+
run_id=extraction_reference.run_id,
|
|
193
|
+
item_id=item_id,
|
|
194
|
+
)
|
|
195
|
+
if isinstance(extracted_text, str) and extracted_text.strip():
|
|
196
|
+
text_item_count += 1
|
|
197
|
+
continue
|
|
139
198
|
media_type = getattr(catalog_item, "media_type", "")
|
|
140
199
|
if media_type == "text/markdown" or str(media_type).startswith("text/"):
|
|
141
200
|
text_item_count += 1
|
|
@@ -151,23 +210,41 @@ def _tokenize_query(query_text: str) -> List[str]:
|
|
|
151
210
|
:return: Lowercased non-empty tokens.
|
|
152
211
|
:rtype: list[str]
|
|
153
212
|
"""
|
|
154
|
-
|
|
155
213
|
return [token for token in query_text.lower().split() if token]
|
|
156
214
|
|
|
157
215
|
|
|
158
|
-
def _load_text_from_item(
|
|
216
|
+
def _load_text_from_item(
|
|
217
|
+
corpus: Corpus,
|
|
218
|
+
*,
|
|
219
|
+
item_id: str,
|
|
220
|
+
relpath: str,
|
|
221
|
+
media_type: str,
|
|
222
|
+
extraction_reference: Optional[ExtractionRunReference],
|
|
223
|
+
) -> Optional[str]:
|
|
159
224
|
"""
|
|
160
225
|
Load a text payload from a catalog item.
|
|
161
226
|
|
|
162
227
|
:param corpus: Corpus containing the item.
|
|
163
228
|
:type corpus: Corpus
|
|
229
|
+
:param item_id: Item identifier.
|
|
230
|
+
:type item_id: str
|
|
164
231
|
:param relpath: Relative path to the stored content.
|
|
165
232
|
:type relpath: str
|
|
166
233
|
:param media_type: Media type for the stored content.
|
|
167
234
|
:type media_type: str
|
|
235
|
+
:param extraction_reference: Optional extraction run reference.
|
|
236
|
+
:type extraction_reference: ExtractionRunReference or None
|
|
168
237
|
:return: Text payload or None if not decodable as text.
|
|
169
238
|
:rtype: str or None
|
|
170
239
|
"""
|
|
240
|
+
if extraction_reference:
|
|
241
|
+
extracted_text = corpus.read_extracted_text(
|
|
242
|
+
extractor_id=extraction_reference.extractor_id,
|
|
243
|
+
run_id=extraction_reference.run_id,
|
|
244
|
+
item_id=item_id,
|
|
245
|
+
)
|
|
246
|
+
if isinstance(extracted_text, str) and extracted_text.strip():
|
|
247
|
+
return extracted_text
|
|
171
248
|
|
|
172
249
|
content_path = corpus.root / relpath
|
|
173
250
|
raw_bytes = content_path.read_bytes()
|
|
@@ -191,7 +268,6 @@ def _find_first_match(text: str, tokens: List[str]) -> Optional[Tuple[int, int]]
|
|
|
191
268
|
:return: Start/end span for the earliest match, or None if no matches.
|
|
192
269
|
:rtype: tuple[int, int] or None
|
|
193
270
|
"""
|
|
194
|
-
|
|
195
271
|
lower_text = text.lower()
|
|
196
272
|
best_start: Optional[int] = None
|
|
197
273
|
best_end: Optional[int] = None
|
|
@@ -223,7 +299,6 @@ def _build_snippet(text: str, span: Optional[Tuple[int, int]], *, max_chars: int
|
|
|
223
299
|
:return: Snippet text.
|
|
224
300
|
:rtype: str
|
|
225
301
|
"""
|
|
226
|
-
|
|
227
302
|
if not text:
|
|
228
303
|
return ""
|
|
229
304
|
if span is None:
|
|
@@ -240,6 +315,8 @@ def _score_items(
|
|
|
240
315
|
items: Iterable[object],
|
|
241
316
|
tokens: List[str],
|
|
242
317
|
snippet_characters: int,
|
|
318
|
+
*,
|
|
319
|
+
extraction_reference: Optional[ExtractionRunReference],
|
|
243
320
|
) -> List[Evidence]:
|
|
244
321
|
"""
|
|
245
322
|
Score catalog items by token frequency and return evidence candidates.
|
|
@@ -255,12 +332,18 @@ def _score_items(
|
|
|
255
332
|
:return: Evidence candidates with provisional ranks.
|
|
256
333
|
:rtype: list[Evidence]
|
|
257
334
|
"""
|
|
258
|
-
|
|
259
335
|
evidence_items: List[Evidence] = []
|
|
260
336
|
for catalog_item in items:
|
|
261
337
|
media_type = getattr(catalog_item, "media_type", "")
|
|
262
338
|
relpath = getattr(catalog_item, "relpath", "")
|
|
263
|
-
|
|
339
|
+
item_id = str(getattr(catalog_item, "id", ""))
|
|
340
|
+
item_text = _load_text_from_item(
|
|
341
|
+
corpus,
|
|
342
|
+
item_id=item_id,
|
|
343
|
+
relpath=relpath,
|
|
344
|
+
media_type=str(media_type),
|
|
345
|
+
extraction_reference=extraction_reference,
|
|
346
|
+
)
|
|
264
347
|
if item_text is None:
|
|
265
348
|
continue
|
|
266
349
|
lower_text = item_text.lower()
|