biblicus 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. biblicus/__init__.py +30 -0
  2. biblicus/__main__.py +8 -0
  3. biblicus/_vendor/dotyaml/__init__.py +14 -0
  4. biblicus/_vendor/dotyaml/interpolation.py +63 -0
  5. biblicus/_vendor/dotyaml/loader.py +181 -0
  6. biblicus/_vendor/dotyaml/transformer.py +135 -0
  7. biblicus/backends/__init__.py +42 -0
  8. biblicus/backends/base.py +65 -0
  9. biblicus/backends/scan.py +375 -0
  10. biblicus/backends/sqlite_full_text_search.py +487 -0
  11. biblicus/cli.py +804 -0
  12. biblicus/constants.py +12 -0
  13. biblicus/context.py +183 -0
  14. biblicus/corpus.py +1531 -0
  15. biblicus/crawl.py +186 -0
  16. biblicus/errors.py +15 -0
  17. biblicus/evaluation.py +257 -0
  18. biblicus/evidence_processing.py +201 -0
  19. biblicus/extraction.py +531 -0
  20. biblicus/extractors/__init__.py +44 -0
  21. biblicus/extractors/base.py +68 -0
  22. biblicus/extractors/metadata_text.py +106 -0
  23. biblicus/extractors/openai_stt.py +180 -0
  24. biblicus/extractors/pass_through_text.py +84 -0
  25. biblicus/extractors/pdf_text.py +100 -0
  26. biblicus/extractors/pipeline.py +105 -0
  27. biblicus/extractors/rapidocr_text.py +129 -0
  28. biblicus/extractors/select_longest_text.py +105 -0
  29. biblicus/extractors/select_text.py +100 -0
  30. biblicus/extractors/unstructured_text.py +100 -0
  31. biblicus/frontmatter.py +89 -0
  32. biblicus/hook_logging.py +180 -0
  33. biblicus/hook_manager.py +203 -0
  34. biblicus/hooks.py +261 -0
  35. biblicus/ignore.py +64 -0
  36. biblicus/knowledge_base.py +191 -0
  37. biblicus/models.py +445 -0
  38. biblicus/retrieval.py +133 -0
  39. biblicus/sources.py +212 -0
  40. biblicus/time.py +17 -0
  41. biblicus/uris.py +63 -0
  42. biblicus/user_config.py +138 -0
  43. biblicus-0.6.0.dist-info/METADATA +533 -0
  44. biblicus-0.6.0.dist-info/RECORD +48 -0
  45. biblicus-0.6.0.dist-info/WHEEL +5 -0
  46. biblicus-0.6.0.dist-info/entry_points.txt +2 -0
  47. biblicus-0.6.0.dist-info/licenses/LICENSE +21 -0
  48. biblicus-0.6.0.dist-info/top_level.txt +1 -0
biblicus/retrieval.py ADDED
@@ -0,0 +1,133 @@
1
+ """
2
+ Shared retrieval helpers for Biblicus backends.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import hashlib
8
+ import json
9
+ from typing import Any, Dict, Iterable, List, Optional
10
+
11
+ from .corpus import Corpus
12
+ from .models import Evidence, QueryBudget, RecipeManifest, RetrievalRun
13
+ from .time import utc_now_iso
14
+
15
+
16
def create_recipe_manifest(
    *,
    backend_id: str,
    name: str,
    config: Dict[str, Any],
    description: Optional[str] = None,
) -> RecipeManifest:
    """
    Build a recipe manifest whose identifier is derived from the backend and its configuration.

    The identifier is the Secure Hash Algorithm 256 hex digest of ``"<backend_id>:<config json>"``,
    where the configuration is serialized with sorted keys and compact separators. The name and
    description do not take part in the identifier; the creation timestamp is the current time.

    :param backend_id: Backend identifier for the recipe.
    :type backend_id: str
    :param name: Human-readable recipe name.
    :type name: str
    :param config: Backend-specific configuration values (must be serializable by ``json.dumps``).
    :type config: dict[str, Any]
    :param description: Optional recipe description.
    :type description: str or None
    :return: Recipe manifest with a configuration-derived identifier.
    :rtype: RecipeManifest
    """
    # Sorted keys and fixed separators make equal configurations serialize identically.
    canonical_config = json.dumps(config, sort_keys=True, separators=(",", ":"))
    digest = hashlib.sha256(f"{backend_id}:{canonical_config}".encode("utf-8"))
    return RecipeManifest(
        recipe_id=digest.hexdigest(),
        backend_id=backend_id,
        name=name,
        created_at=utc_now_iso(),
        config=config,
        description=description,
    )
48
+
49
+
50
def create_run_manifest(
    corpus: Corpus,
    *,
    recipe: RecipeManifest,
    stats: Dict[str, Any],
    artifact_paths: Optional[List[str]] = None,
) -> RetrievalRun:
    """
    Build a retrieval run manifest that records the catalog state it was created against.

    The run identifier is the Secure Hash Algorithm 256 hex digest of
    ``"<recipe_id>:<created_at>"``, so it differs between runs of the same recipe.

    :param corpus: Corpus whose catalog is loaded to describe the run.
    :type corpus: Corpus
    :param recipe: Recipe manifest for the run.
    :type recipe: RecipeManifest
    :param stats: Backend-specific run statistics.
    :type stats: dict[str, Any]
    :param artifact_paths: Optional relative paths to materialized artifacts.
    :type artifact_paths: list[str] or None
    :return: Run manifest.
    :rtype: RetrievalRun
    """
    snapshot = corpus.load_catalog()
    timestamp = utc_now_iso()
    run_seed = f"{recipe.recipe_id}:{timestamp}"
    # Always hand the manifest its own list, never the caller's object.
    copied_paths = list(artifact_paths) if artifact_paths else []
    return RetrievalRun(
        run_id=hashlib.sha256(run_seed.encode("utf-8")).hexdigest(),
        recipe=recipe,
        corpus_uri=snapshot.corpus_uri,
        catalog_generated_at=snapshot.generated_at,
        created_at=timestamp,
        artifact_paths=copied_paths,
        stats=stats,
    )
83
+
84
+
85
def hash_text(text: str) -> str:
    """
    Compute a provenance hash for a text payload.

    The text is encoded as UTF-8 before hashing.

    :param text: Text to hash.
    :type text: str
    :return: Secure Hash Algorithm 256 hex digest of the encoded text.
    :rtype: str
    """
    hasher = hashlib.sha256()
    hasher.update(text.encode("utf-8"))
    return hasher.hexdigest()
95
+
96
+
97
def apply_budget(evidence: Iterable[Evidence], budget: QueryBudget) -> List[Evidence]:
    """
    Filter ranked evidence down to what a query budget allows and renumber the survivors.

    Candidates are visited in the order given. Iteration stops once ``max_total_items`` have been
    kept. A candidate that would exceed the per-source limit or the total character limit is
    skipped, and later candidates are still considered. Items are grouped by ``source_uri``,
    falling back to ``item_id`` when the source uniform resource identifier is empty.

    :param evidence: Ranked evidence iterable (highest score first).
    :type evidence: Iterable[Evidence]
    :param budget: Budget constraints to enforce.
    :type budget: QueryBudget
    :return: Copies of the kept evidence with ``rank`` renumbered from 1.
    :rtype: list[Evidence]
    """
    kept: List[Evidence] = []
    kept_per_source: Dict[str, int] = {}
    characters_used = 0

    for candidate in evidence:
        if len(kept) >= budget.max_total_items:
            break

        group_key = candidate.source_uri or candidate.item_id
        already_kept = kept_per_source.get(group_key, 0)
        source_cap = budget.max_items_per_source
        if source_cap is not None and already_kept >= source_cap:
            continue

        # Missing text counts as zero characters.
        size = len(candidate.text or "")
        character_cap = budget.max_total_characters
        if character_cap is not None and characters_used + size > character_cap:
            continue

        kept.append(candidate)
        kept_per_source[group_key] = already_kept + 1
        characters_used += size

    # Ranks are reassigned on copies so the caller's evidence objects stay untouched.
    reranked: List[Evidence] = []
    for position, item in enumerate(kept, start=1):
        reranked.append(item.model_copy(update={"rank": position}))
    return reranked
biblicus/sources.py ADDED
@@ -0,0 +1,212 @@
1
+ """
2
+ Source loading helpers for Biblicus ingestion.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import mimetypes
8
+ from dataclasses import dataclass
9
+ from pathlib import Path
10
+ from typing import Optional
11
+ from urllib.parse import unquote, urlparse
12
+ from urllib.request import Request, urlopen
13
+
14
+
15
def _looks_like_uri(value: str) -> bool:
    """
    Check whether a string resembles a uniform resource identifier.

    A value qualifies when it contains ``://`` and the text before the first ``://`` is either a
    Python identifier (the historical rule, kept for backward compatibility) or a scheme in the
    RFC 3986 shape: an ASCII letter followed by ASCII letters, digits, ``+``, ``-`` or ``.``.
    The second rule makes schemes such as ``git+ssh`` count as uniform resource identifiers, so
    callers do not mistake them for local filesystem paths.

    :param value: Candidate string.
    :type value: str
    :return: True if the string has a uniform resource identifier scheme prefix.
    :rtype: bool
    """
    scheme, separator, _ = value.partition("://")
    if not separator:
        return False
    if scheme.isidentifier():
        return True
    if not scheme or not scheme.isascii() or not scheme[0].isalpha():
        return False
    return all(character.isalnum() or character in "+-." for character in scheme)
25
+
26
+
27
def _filename_from_url_path(path: str) -> str:
    """
    Derive a filename from a uniform resource locator path.

    The path is percent-decoded and its final component is used. When there is no final component,
    or when it is a directory reference (``.`` or ``..``) rather than a usable filename, the
    fallback name ``download`` is returned.

    :param path: Uniform resource locator path component.
    :type path: str
    :return: Filename or a fallback name.
    :rtype: str
    """
    filename = Path(unquote(path)).name
    # A path ending in ".." yields ".." as its final component; that names a directory, not a file.
    if filename in {"", ".", ".."}:
        return "download"
    return filename
38
+
39
+
40
def _media_type_from_filename(name: str) -> str:
    """
    Guess a media type from a filename using the standard ``mimetypes`` table.

    :param name: Filename to inspect.
    :type name: str
    :return: Guessed media type, or ``application/octet-stream`` when no guess is available.
    :rtype: str
    """
    guessed = mimetypes.guess_type(name)[0]
    if guessed:
        return guessed
    return "application/octet-stream"
51
+
52
+
53
def _sniff_media_type_from_bytes(data: bytes) -> Optional[str]:
    """
    Detect a media type from the first 32 bytes of a payload.

    Only a small set of formats is recognized: PDF, PNG, JPEG, WAVE, MPEG audio, Ogg and HTML.
    Checks run in that order and the first match wins.

    :param data: Raw bytes to inspect.
    :type data: bytes
    :return: Detected media type or None.
    :rtype: str or None
    """
    head = data[:32]

    # Formats identified purely by a fixed leading signature.
    leading_signatures = (
        (b"%PDF-", "application/pdf"),
        (b"\x89PNG\r\n\x1a\n", "image/png"),
        (b"\xff\xd8\xff", "image/jpeg"),
    )
    for signature, detected in leading_signatures:
        if head[: len(signature)] == signature:
            return detected

    # RIFF container whose form type (bytes 8-11) is WAVE.
    if head[:4] == b"RIFF" and head[8:12] == b"WAVE":
        return "audio/x-wav"

    # Either an ID3 tag, or 0xFF followed by a byte whose top three bits are all set.
    has_id3_tag = head[:3] == b"ID3"
    if has_id3_tag or (len(head) >= 2 and head[0] == 0xFF and (head[1] & 0xE0) == 0xE0):
        return "audio/mpeg"

    if head[:4] == b"OggS":
        return "audio/ogg"

    # HTML is matched case-insensitively, ignoring leading whitespace.
    markup = head.lstrip().lower()
    if markup.startswith((b"<!doctype html", b"<html")):
        return "text/html"

    return None
82
+
83
+
84
def _normalize_media_type(*, filename: str, media_type: str) -> str:
    """
    Rewrite media types that upstream sources commonly mislabel.

    At present this covers one case: a payload reported as ``application/ogg`` or
    ``application/x-ogg`` whose filename ends in ``.ogg``, ``.oga`` or ``.ogx`` (compared
    case-insensitively) is reported as ``audio/ogg``, so the media type agrees with the extension
    users recognize. Every other combination is returned unchanged.

    :param filename: Filename associated with the payload.
    :type filename: str
    :param media_type: Media type reported or guessed for the payload.
    :type media_type: str
    :return: Normalized media type.
    :rtype: str
    """
    extension = Path(filename).suffix.lower()
    reported_as_ogg_container = media_type in {"application/ogg", "application/x-ogg"}
    has_ogg_extension = extension in {".ogg", ".oga", ".ogx"}
    if reported_as_ogg_container and has_ogg_extension:
        return "audio/ogg"
    return media_type
107
+
108
+
109
def _ensure_extension_for_media_type(filename: str, media_type: str) -> str:
    """
    Append an extension matching the media type when the filename has none.

    A filename that already has a suffix is returned as is. ``audio/ogg`` always maps to
    ``.ogg``; other media types are looked up with ``mimetypes.guess_extension``. When no
    extension can be determined the filename is returned unchanged.

    :param filename: Filename candidate.
    :type filename: str
    :param media_type: Media type to target.
    :type media_type: str
    :return: Filename with extension when one could be determined.
    :rtype: str
    """
    if Path(filename).suffix:
        return filename

    extension = ".ogg" if media_type == "audio/ogg" else (mimetypes.guess_extension(media_type) or "")
    if not extension:
        return filename
    return filename + extension
127
+
128
+
129
@dataclass(frozen=True)
class SourcePayload:
    """
    Immutable result of loading a source for ingestion.

    :ivar data: Raw bytes read from the source.
    :vartype data: bytes
    :ivar filename: Suggested filename for the payload.
    :vartype filename: str
    :ivar media_type: Internet Assigned Numbers Authority media type for the payload.
    :vartype media_type: str
    :ivar source_uri: Source uniform resource identifier used to load the payload.
    :vartype source_uri: str
    """

    # Payload content exactly as read.
    data: bytes
    # Name to use when the payload is stored.
    filename: str
    # Media type describing the payload.
    media_type: str
    # Where the payload came from.
    source_uri: str
148
+
149
+
150
def load_source(source: str | Path, *, source_uri: Optional[str] = None) -> SourcePayload:
    """
    Load bytes from a source reference.

    Three kinds of reference are handled:

    * a ``Path`` (or a string that does not look like a uniform resource identifier) is resolved
      and read from the local filesystem;
    * a ``file://`` uniform resource identifier with an empty or ``localhost`` host is converted
      to a path and loaded the same way;
    * an ``http://`` or ``https://`` uniform resource locator is fetched with a 30 second timeout.

    For downloads the media type comes from the ``Content-Type`` header (parameters removed and
    lowercased, because media types are case-insensitive), falling back to a guess from the
    filename. A generic ``application/octet-stream`` is refined by sniffing the leading bytes.
    Filenames ending in ``.md`` or ``.markdown`` are always reported as ``text/markdown``.

    :param source: File path or uniform resource locator to load.
    :type source: str or Path
    :param source_uri: Optional override for the source uniform resource identifier.
    :type source_uri: str or None
    :return: Source payload with bytes and metadata.
    :rtype: SourcePayload
    :raises ValueError: If a file:// uniform resource identifier has a non-local host.
    :raises NotImplementedError: If the uniform resource identifier scheme is unsupported.
    """
    if isinstance(source, Path):
        path = source.resolve()
        media_type = _media_type_from_filename(path.name)
        if path.suffix.lower() in {".md", ".markdown"}:
            media_type = "text/markdown"
        return SourcePayload(
            data=path.read_bytes(),
            filename=path.name,
            media_type=media_type,
            source_uri=source_uri or path.as_uri(),
        )

    if _looks_like_uri(source):
        parsed = urlparse(source)
        if parsed.scheme == "file":
            if parsed.netloc not in ("", "localhost"):
                raise ValueError(
                    f"Unsupported file uniform resource identifier host: {parsed.netloc!r}"
                )
            path = Path(unquote(parsed.path)).resolve()
            # Keep the caller's original reference as the recorded source unless overridden.
            return load_source(path, source_uri=source_uri or source)

        if parsed.scheme in {"http", "https"}:
            request = Request(source, headers={"User-Agent": "biblicus/0"})
            with urlopen(request, timeout=30) as response:
                response_bytes = response.read()
                # Drop parameters such as charset, and lowercase so the comparisons below
                # (against lowercase literals) also match "Application/Octet-Stream" and similar.
                content_type = (
                    response.headers.get("Content-Type", "").split(";", 1)[0].strip().lower()
                )
            filename = _filename_from_url_path(parsed.path)
            media_type = content_type or _media_type_from_filename(filename)
            if media_type == "application/octet-stream":
                sniffed = _sniff_media_type_from_bytes(response_bytes)
                if sniffed:
                    media_type = sniffed
            filename = _ensure_extension_for_media_type(filename, media_type)
            media_type = _normalize_media_type(filename=filename, media_type=media_type)
            if Path(filename).suffix.lower() in {".md", ".markdown"}:
                media_type = "text/markdown"
            return SourcePayload(
                data=response_bytes,
                filename=filename,
                media_type=media_type,
                source_uri=source_uri or source,
            )

        raise NotImplementedError(
            f"Unsupported source uniform resource identifier scheme: {parsed.scheme}://"
        )

    path = Path(source).resolve()
    return load_source(path, source_uri=source_uri)
biblicus/time.py ADDED
@@ -0,0 +1,17 @@
1
+ """
2
+ Time utilities for Biblicus.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from datetime import datetime, timezone
8
+
9
+
10
def utc_now_iso() -> str:
    """
    Format the current Coordinated Universal Time as an International Organization for Standardization 8601 string.

    The timestamp is timezone-aware and always carries microsecond precision.

    :return: Current Coordinated Universal Time timestamp in International Organization for Standardization 8601 format.
    :rtype: str
    """
    moment = datetime.now(tz=timezone.utc)
    return moment.isoformat(timespec="microseconds")
biblicus/uris.py ADDED
@@ -0,0 +1,63 @@
1
+ """
2
+ Uniform resource identifier and path helpers for Biblicus corpora.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from pathlib import Path
8
+ from typing import Union
9
+ from urllib.parse import unquote, urlparse
10
+
11
+
12
def _looks_like_uri(value: str) -> bool:
    """
    Check whether a string resembles a uniform resource identifier.

    A value qualifies when it contains ``://`` and the text before the first ``://`` is either a
    Python identifier (the historical rule, kept for backward compatibility) or a scheme in the
    RFC 3986 shape: an ASCII letter followed by ASCII letters, digits, ``+``, ``-`` or ``.``.
    The second rule makes schemes such as ``s3+https`` count as uniform resource identifiers, so
    callers can reject them explicitly and do not resolve them as local filesystem paths.

    :param value: Candidate string.
    :type value: str
    :return: True if the string has a uniform resource identifier scheme prefix.
    :rtype: bool
    """
    scheme, separator, _ = value.partition("://")
    if not separator:
        return False
    if scheme.isidentifier():
        return True
    if not scheme or not scheme.isascii() or not scheme[0].isalpha():
        return False
    return all(character.isalnum() or character in "+-." for character in scheme)
22
+
23
+
24
def corpus_ref_to_path(ref: Union[str, Path]) -> Path:
    """
    Turn a corpus reference into a resolved filesystem path.

    ``Path`` objects and plain path strings are resolved directly. Strings that look like uniform
    resource identifiers must use the ``file`` scheme with an empty or ``localhost`` host; their
    percent-decoded path is resolved.

    :param ref: Filesystem path or file:// uniform resource identifier.
    :type ref: str or Path
    :return: Resolved filesystem path.
    :rtype: Path
    :raises NotImplementedError: If a non-file uniform resource identifier scheme is used.
    :raises ValueError: If a file:// uniform resource identifier has a non-local host.
    """
    if isinstance(ref, Path):
        return ref.resolve()

    if not _looks_like_uri(ref):
        return Path(ref).resolve()

    parts = urlparse(ref)
    scheme = parts.scheme
    if scheme != "file":
        raise NotImplementedError(
            "Only file:// corpus uniform resource identifiers are supported in version zero "
            f"(got {scheme}://)"
        )

    host = parts.netloc
    if host not in ("", "localhost"):
        raise ValueError(f"Unsupported file uniform resource identifier host: {host!r}")

    return Path(unquote(parts.path)).resolve()
52
+
53
+
54
def normalize_corpus_uri(ref: Union[str, Path]) -> str:
    """
    Express a corpus reference as a file:// uniform resource identifier.

    The reference is first converted to a resolved filesystem path, so the same restrictions on
    schemes and hosts apply.

    :param ref: Filesystem path or file:// uniform resource identifier.
    :type ref: str or Path
    :return: Canonical file:// uniform resource identifier.
    :rtype: str
    """
    resolved_path = corpus_ref_to_path(ref)
    return resolved_path.as_uri()
biblicus/user_config.py ADDED
@@ -0,0 +1,138 @@
1
+ """
2
+ User configuration file loading for Biblicus.
3
+
4
+ User configuration is intended for small, local settings such as credentials for optional
5
+ integrations. It is separate from corpus configuration.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import os
11
+ from pathlib import Path
12
+ from typing import Any, Dict, Optional
13
+
14
+ from pydantic import BaseModel, ConfigDict, Field
15
+
16
+ from ._vendor.dotyaml import ConfigLoader
17
+
18
+
19
class OpenAiUserConfig(BaseModel):
    """
    Settings for the OpenAI integration.

    The model is declared with ``extra="forbid"``, so keys other than the ones listed here are
    not accepted.

    :ivar api_key: OpenAI API key used for authenticated requests; must not be empty.
    :vartype api_key: str
    """

    model_config = ConfigDict(extra="forbid")

    # Required; the minimum length of 1 excludes an empty string.
    api_key: str = Field(min_length=1)
30
+
31
+
32
class BiblicusUserConfig(BaseModel):
    """
    Top-level user configuration for Biblicus after parsing.

    The model is declared with ``extra="forbid"``, so keys other than the ones listed here are
    not accepted.

    :ivar openai: Optional OpenAI configuration; None when the section is absent.
    :vartype openai: OpenAiUserConfig or None
    """

    model_config = ConfigDict(extra="forbid")

    # Absent section means the OpenAI integration is not configured.
    openai: Optional[OpenAiUserConfig] = None
43
+
44
+
45
def default_user_config_paths(
    *, cwd: Optional[Path] = None, home: Optional[Path] = None
) -> list[Path]:
    """
    List the locations searched for user configuration files, in merge order.

    The returned order is:

    1. Home configuration: ``~/.biblicus/config.yml``
    2. Local configuration: ``./.biblicus/config.yml``

    The local file is listed last so that it overrides the home file when both exist.

    :param cwd: Optional working directory to use instead of the process current directory.
    :type cwd: Path or None
    :param home: Optional home directory to use instead of the current user's home directory.
    :type home: Path or None
    :return: Ordered list of configuration file paths.
    :rtype: list[Path]
    """
    relative_config = Path(".biblicus") / "config.yml"
    home_directory = (home if home else Path.home()).expanduser()
    working_directory = cwd if cwd else Path.cwd()
    return [base / relative_config for base in (home_directory, working_directory)]
71
+
72
+
73
def _deep_merge(base: Dict[str, Any], override: Dict[str, Any]) -> Dict[str, Any]:
    """
    Merge two nested mappings into a new dictionary without modifying either input.

    Where both sides hold a dictionary under the same key, the two are merged recursively. In
    every other case the value from ``override`` replaces the value from ``base``.

    :param base: Mapping providing the starting values.
    :type base: dict[str, Any]
    :param override: Mapping whose values take precedence.
    :type override: dict[str, Any]
    :return: New merged mapping.
    :rtype: dict[str, Any]
    """
    result: Dict[str, Any] = dict(base)
    for name, incoming in override.items():
        current = result.get(name)
        both_are_mappings = isinstance(current, dict) and isinstance(incoming, dict)
        result[name] = _deep_merge(current, incoming) if both_are_mappings else incoming
    return result
81
+
82
+
83
def _load_dotyaml_data(path: Path) -> Dict[str, Any]:
    """
    Read a configuration file through the vendored dotyaml loader.

    The loader is created with an empty prefix and with ``load_dotenv_first`` disabled. A result
    that is not a dictionary is replaced by an empty mapping.

    :param path: Configuration file path.
    :type path: Path
    :return: Parsed YAML data mapping, or an empty mapping when the result is not a dictionary.
    :rtype: dict[str, Any]
    """
    parsed = ConfigLoader(prefix="", load_dotenv_first=False).load_from_yaml(path)
    if isinstance(parsed, dict):
        return parsed
    return {}
95
+
96
+
97
def load_user_config(*, paths: Optional[list[Path]] = None) -> BiblicusUserConfig:
    """
    Load user configuration from known locations.

    This function merges multiple configuration files in order. Later files override earlier files.
    Paths that do not point to an existing file are skipped.

    :param paths: Optional explicit search paths. When omitted (``None``), the default paths are
        used. An explicit empty list means "read no files" and yields an empty configuration.
    :type paths: list[Path] or None
    :return: Parsed user configuration. When no files exist, the configuration is empty.
    :rtype: BiblicusUserConfig
    :raises ValueError: If an existing configuration file is not parseable or the merged data
        fails model validation (assumes the loader and validation errors derive from
        ``ValueError`` — TODO confirm).
    """
    # Compare against None, not truthiness: an empty list is a deliberate request to read
    # nothing and must not silently fall back to the home and working-directory files.
    search_paths = default_user_config_paths() if paths is None else paths
    merged_data: Dict[str, Any] = {}

    for path in search_paths:
        if not path.is_file():
            continue
        loaded = _load_dotyaml_data(path)
        merged_data = _deep_merge(merged_data, loaded)

    return BiblicusUserConfig.model_validate(merged_data)
119
+
120
+
121
def resolve_openai_api_key(*, config: Optional[BiblicusUserConfig] = None) -> Optional[str]:
    """
    Find the OpenAI API key to use.

    A non-empty ``OPENAI_API_KEY`` environment variable wins. Otherwise the key is taken from the
    supplied user configuration, which is loaded from disk when none is supplied.

    :param config: Optional pre-loaded user configuration.
    :type config: BiblicusUserConfig or None
    :return: API key string, or None when no key is available.
    :rtype: str or None
    """
    from_environment = os.environ.get("OPENAI_API_KEY")
    if from_environment:
        return from_environment

    # Only touch the configuration files when the environment did not provide a key.
    settings = config if config else load_user_config()
    openai_settings = settings.openai
    return None if openai_settings is None else openai_settings.api_key