biblicus 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. biblicus/__init__.py +30 -0
  2. biblicus/__main__.py +8 -0
  3. biblicus/_vendor/dotyaml/__init__.py +14 -0
  4. biblicus/_vendor/dotyaml/interpolation.py +63 -0
  5. biblicus/_vendor/dotyaml/loader.py +181 -0
  6. biblicus/_vendor/dotyaml/transformer.py +135 -0
  7. biblicus/backends/__init__.py +42 -0
  8. biblicus/backends/base.py +65 -0
  9. biblicus/backends/scan.py +375 -0
  10. biblicus/backends/sqlite_full_text_search.py +487 -0
  11. biblicus/cli.py +804 -0
  12. biblicus/constants.py +12 -0
  13. biblicus/context.py +183 -0
  14. biblicus/corpus.py +1531 -0
  15. biblicus/crawl.py +186 -0
  16. biblicus/errors.py +15 -0
  17. biblicus/evaluation.py +257 -0
  18. biblicus/evidence_processing.py +201 -0
  19. biblicus/extraction.py +531 -0
  20. biblicus/extractors/__init__.py +44 -0
  21. biblicus/extractors/base.py +68 -0
  22. biblicus/extractors/metadata_text.py +106 -0
  23. biblicus/extractors/openai_stt.py +180 -0
  24. biblicus/extractors/pass_through_text.py +84 -0
  25. biblicus/extractors/pdf_text.py +100 -0
  26. biblicus/extractors/pipeline.py +105 -0
  27. biblicus/extractors/rapidocr_text.py +129 -0
  28. biblicus/extractors/select_longest_text.py +105 -0
  29. biblicus/extractors/select_text.py +100 -0
  30. biblicus/extractors/unstructured_text.py +100 -0
  31. biblicus/frontmatter.py +89 -0
  32. biblicus/hook_logging.py +180 -0
  33. biblicus/hook_manager.py +203 -0
  34. biblicus/hooks.py +261 -0
  35. biblicus/ignore.py +64 -0
  36. biblicus/knowledge_base.py +191 -0
  37. biblicus/models.py +445 -0
  38. biblicus/retrieval.py +133 -0
  39. biblicus/sources.py +212 -0
  40. biblicus/time.py +17 -0
  41. biblicus/uris.py +63 -0
  42. biblicus/user_config.py +138 -0
  43. biblicus-0.6.0.dist-info/METADATA +533 -0
  44. biblicus-0.6.0.dist-info/RECORD +48 -0
  45. biblicus-0.6.0.dist-info/WHEEL +5 -0
  46. biblicus-0.6.0.dist-info/entry_points.txt +2 -0
  47. biblicus-0.6.0.dist-info/licenses/LICENSE +21 -0
  48. biblicus-0.6.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,203 @@
1
+ """
2
+ Hook manager for executing configured lifecycle hooks.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from pathlib import Path
8
+ from typing import Any, Dict, Iterable, List, Optional
9
+
10
+ from pydantic import BaseModel
11
+
12
+ from .constants import CORPUS_DIR_NAME, HOOK_LOGS_DIR_NAME
13
+ from .hook_logging import HookLogger, new_operation_id
14
+ from .hooks import (
15
+ HookContext,
16
+ HookPoint,
17
+ HookSpec,
18
+ IngestHookContext,
19
+ IngestMutation,
20
+ LifecycleHook,
21
+ build_builtin_hook,
22
+ )
23
+ from .time import utc_now_iso
24
+
25
+
26
+ class HookManager:
27
+ """
28
+ Hook manager that executes configured hooks and records execution.
29
+
30
+ :ivar corpus_uri: Canonical uniform resource identifier for the corpus.
31
+ :vartype corpus_uri: str
32
+ :ivar log_dir: Directory where hook logs are recorded.
33
+ :vartype log_dir: object
34
+ :ivar operation_id: Identifier for this hook execution session.
35
+ :vartype operation_id: str
36
+ """
37
+
38
+ def __init__(
39
+ self,
40
+ *,
41
+ corpus_uri: str,
42
+ log_dir: Path,
43
+ hooks: Iterable[LifecycleHook],
44
+ operation_id: Optional[str] = None,
45
+ ):
46
+ """
47
+ Initialize a hook manager.
48
+
49
+ :param corpus_uri: Canonical uniform resource identifier for the corpus.
50
+ :type corpus_uri: str
51
+ :param log_dir: Directory where hook logs are written.
52
+ :type log_dir: object
53
+ :param hooks: Hook instances to execute.
54
+ :type hooks: Iterable[LifecycleHook]
55
+ :param operation_id: Optional operation identifier override.
56
+ :type operation_id: str or None
57
+ """
58
+ self.corpus_uri = corpus_uri
59
+ self.log_dir = log_dir
60
+ self.operation_id = operation_id or new_operation_id()
61
+ self._hooks = list(hooks)
62
+ self._logger = HookLogger(log_dir=self.log_dir, operation_id=self.operation_id)
63
+
64
+ @classmethod
65
+ def from_config(
66
+ cls, *, corpus_root: Path, corpus_uri: str, hook_specs: Iterable[HookSpec]
67
+ ) -> "HookManager":
68
+ """
69
+ Build a hook manager from config data.
70
+
71
+ :param corpus_root: Corpus root directory.
72
+ :type corpus_root: Path
73
+ :param corpus_uri: Canonical uniform resource identifier for the corpus.
74
+ :type corpus_uri: str
75
+ :param hook_specs: Hook specifications loaded from config.
76
+ :type hook_specs: Iterable[HookSpec]
77
+ :return: Hook manager.
78
+ :rtype: HookManager
79
+ :raises KeyError: If a hook identifier is unknown.
80
+ """
81
+ log_dir = corpus_root / CORPUS_DIR_NAME / HOOK_LOGS_DIR_NAME
82
+ hooks: List[LifecycleHook] = []
83
+
84
+ for spec in hook_specs:
85
+ hooks.append(build_builtin_hook(spec))
86
+
87
+ return cls(corpus_uri=corpus_uri, log_dir=log_dir, hooks=hooks)
88
+
89
+ def run_ingest_hooks(
90
+ self,
91
+ *,
92
+ hook_point: HookPoint,
93
+ filename: Optional[str],
94
+ media_type: str,
95
+ title: Optional[str],
96
+ tags: List[str],
97
+ metadata: Dict[str, Any],
98
+ source_uri: str,
99
+ item_id: Optional[str] = None,
100
+ relpath: Optional[str] = None,
101
+ ) -> IngestMutation:
102
+ """
103
+ Run ingestion hooks for a hook point.
104
+
105
+ :param hook_point: Hook point to execute.
106
+ :type hook_point: HookPoint
107
+ :param filename: Suggested filename.
108
+ :type filename: str or None
109
+ :param media_type: Media type for the item.
110
+ :type media_type: str
111
+ :param title: Optional title.
112
+ :type title: str or None
113
+ :param tags: Tags associated with the item.
114
+ :type tags: list[str]
115
+ :param metadata: Metadata mapping.
116
+ :type metadata: dict[str, Any]
117
+ :param source_uri: Source uniform resource identifier.
118
+ :type source_uri: str
119
+ :param item_id: Optional item identifier.
120
+ :type item_id: str or None
121
+ :param relpath: Optional relative path.
122
+ :type relpath: str or None
123
+ :return: Combined ingestion mutation result.
124
+ :rtype: IngestMutation
125
+ :raises ValueError: If ingestion is denied by a hook.
126
+ """
127
+ context = IngestHookContext(
128
+ hook_point=hook_point,
129
+ operation_id=self.operation_id,
130
+ corpus_uri=self.corpus_uri,
131
+ created_at=utc_now_iso(),
132
+ filename=filename,
133
+ media_type=media_type,
134
+ title=title,
135
+ tags=list(tags),
136
+ metadata=dict(metadata),
137
+ source_uri=source_uri,
138
+ item_id=item_id,
139
+ relpath=relpath,
140
+ )
141
+
142
+ combined = IngestMutation()
143
+ for hook in self._hooks_for_point(hook_point):
144
+ result_dict = self._run_single(hook=hook, context=context)
145
+ mutation = IngestMutation.model_validate(result_dict)
146
+ if mutation.deny:
147
+ self._logger.record(
148
+ hook_point=hook_point,
149
+ hook_id=hook.hook_id,
150
+ status="denied",
151
+ message=mutation.deny_reason or mutation.message,
152
+ item_id=item_id,
153
+ relpath=relpath,
154
+ source_uri=source_uri,
155
+ details={"add_tags": mutation.add_tags},
156
+ )
157
+ raise ValueError(mutation.deny_reason or "Ingest denied")
158
+ if mutation.add_tags:
159
+ combined.add_tags.extend(mutation.add_tags)
160
+ self._logger.record(
161
+ hook_point=hook_point,
162
+ hook_id=hook.hook_id,
163
+ status="ok",
164
+ message=mutation.message,
165
+ item_id=item_id,
166
+ relpath=relpath,
167
+ source_uri=source_uri,
168
+ details={"add_tags": mutation.add_tags},
169
+ )
170
+
171
+ deduplicated_tags: List[str] = []
172
+ for tag in combined.add_tags:
173
+ if tag not in deduplicated_tags:
174
+ deduplicated_tags.append(tag)
175
+ combined.add_tags = deduplicated_tags
176
+ return combined
177
+
178
+ def _hooks_for_point(self, hook_point: HookPoint) -> List[LifecycleHook]:
179
+ eligible: List[LifecycleHook] = []
180
+ for hook in self._hooks:
181
+ if hook_point in list(getattr(hook, "hook_points", [])):
182
+ eligible.append(hook)
183
+ return eligible
184
+
185
+ def _run_single(self, *, hook: LifecycleHook, context: HookContext) -> Dict[str, Any]:
186
+ """
187
+ Run a single hook with error capture.
188
+
189
+ :param hook: Hook to execute.
190
+ :type hook: LifecycleHook
191
+ :param context: Hook context.
192
+ :type context: HookContext
193
+ :return: Hook result mapping.
194
+ :rtype: dict[str, Any]
195
+ :raises ValueError: If a hook raises an exception.
196
+ """
197
+ try:
198
+ result = hook.run(context)
199
+ except Exception as exc:
200
+ raise ValueError(f"Hook {hook.hook_id!r} failed: {exc}") from exc
201
+ if isinstance(result, BaseModel):
202
+ return result.model_dump()
203
+ raise ValueError(f"Hook {hook.hook_id!r} returned a non-Pydantic result")
biblicus/hooks.py ADDED
@@ -0,0 +1,261 @@
1
+ """
2
+ Lifecycle hook interfaces and built-in hook implementations.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from enum import Enum
8
+ from typing import Any, Dict, List, Optional, Sequence
9
+
10
+ from pydantic import BaseModel, ConfigDict, Field
11
+
12
+
13
+ class HookPoint(str, Enum):
14
+ """
15
+ Canonical lifecycle hook points for corpus operations.
16
+
17
+ :cvar before_ingest: Called before an item is ingested.
18
+ :cvar after_ingest: Called after an item is ingested and indexed.
19
+ :cvar before_reindex: Called before a catalog rebuild starts.
20
+ :cvar after_reindex: Called after a catalog rebuild completes.
21
+ :cvar before_build_run: Called before a backend run build starts.
22
+ :cvar after_build_run: Called after a backend run build completes.
23
+ :cvar before_query: Called before a query is executed.
24
+ :cvar after_query: Called after a query completes.
25
+ :cvar before_evaluate_run: Called before an evaluation starts.
26
+ :cvar after_evaluate_run: Called after an evaluation completes.
27
+ """
28
+
29
+ before_ingest = "before_ingest"
30
+ after_ingest = "after_ingest"
31
+ before_reindex = "before_reindex"
32
+ after_reindex = "after_reindex"
33
+ before_build_run = "before_build_run"
34
+ after_build_run = "after_build_run"
35
+ before_query = "before_query"
36
+ after_query = "after_query"
37
+ before_evaluate_run = "before_evaluate_run"
38
+ after_evaluate_run = "after_evaluate_run"
39
+
40
+
41
+ class HookSpec(BaseModel):
42
+ """
43
+ On-disk hook specification stored in a corpus config.
44
+
45
+ :ivar hook_id: Identifier used to locate a hook implementation.
46
+ :vartype hook_id: str
47
+ :ivar hook_points: Hook points where the hook executes.
48
+ :vartype hook_points: list[HookPoint]
49
+ :ivar config: Hook-specific configuration values.
50
+ :vartype config: dict[str, Any]
51
+ """
52
+
53
+ model_config = ConfigDict(extra="forbid")
54
+
55
+ hook_id: str = Field(min_length=1)
56
+ hook_points: List[HookPoint] = Field(default_factory=list)
57
+ config: Dict[str, Any] = Field(default_factory=dict)
58
+
59
+
60
+ class HookContext(BaseModel):
61
+ """
62
+ Base context passed to hooks.
63
+
64
+ :ivar hook_point: Hook point currently executing.
65
+ :vartype hook_point: HookPoint
66
+ :ivar operation_id: Identifier for the enclosing command or call.
67
+ :vartype operation_id: str
68
+ :ivar corpus_uri: Canonical uniform resource identifier for the corpus.
69
+ :vartype corpus_uri: str
70
+ :ivar created_at: International Organization for Standardization 8601 timestamp when the context was created.
71
+ :vartype created_at: str
72
+ """
73
+
74
+ model_config = ConfigDict(extra="forbid")
75
+
76
+ hook_point: HookPoint
77
+ operation_id: str
78
+ corpus_uri: str
79
+ created_at: str
80
+
81
+
82
+ class IngestHookContext(HookContext):
83
+ """
84
+ Hook context for ingestion hooks.
85
+
86
+ :ivar filename: Suggested filename for the item.
87
+ :vartype filename: str or None
88
+ :ivar media_type: Media type for the item.
89
+ :vartype media_type: str
90
+ :ivar title: Optional title associated with the item.
91
+ :vartype title: str or None
92
+ :ivar tags: Tags associated with the item.
93
+ :vartype tags: list[str]
94
+ :ivar metadata: Metadata mapping associated with the item.
95
+ :vartype metadata: dict[str, Any]
96
+ :ivar source_uri: Source uniform resource identifier.
97
+ :vartype source_uri: str
98
+ :ivar item_id: Item identifier when available.
99
+ :vartype item_id: str or None
100
+ :ivar relpath: Relative path to stored raw bytes when available.
101
+ :vartype relpath: str or None
102
+ """
103
+
104
+ filename: Optional[str] = None
105
+ media_type: str
106
+ title: Optional[str] = None
107
+ tags: List[str] = Field(default_factory=list)
108
+ metadata: Dict[str, Any] = Field(default_factory=dict)
109
+ source_uri: str
110
+ item_id: Optional[str] = None
111
+ relpath: Optional[str] = None
112
+
113
+
114
+ class HookResult(BaseModel):
115
+ """
116
+ Base hook result with optional message fields.
117
+
118
+ :ivar message: Optional human-readable message.
119
+ :vartype message: str or None
120
+ """
121
+
122
+ model_config = ConfigDict(extra="forbid")
123
+
124
+ message: Optional[str] = None
125
+
126
+
127
+ class IngestMutation(HookResult):
128
+ """
129
+ Hook result describing ingestion mutations.
130
+
131
+ :ivar deny: Whether ingest should be denied.
132
+ :vartype deny: bool
133
+ :ivar deny_reason: Optional reason for denial.
134
+ :vartype deny_reason: str or None
135
+ :ivar add_tags: Tags to add.
136
+ :vartype add_tags: list[str]
137
+ """
138
+
139
+ deny: bool = False
140
+ deny_reason: Optional[str] = None
141
+ add_tags: List[str] = Field(default_factory=list)
142
+
143
+
144
+ class LifecycleHook:
145
+ """
146
+ Base class for a lifecycle hook implementation.
147
+
148
+ :param context: Validated hook context.
149
+ :type context: HookContext
150
+ :return: Hook result. Concrete hook points may require a more specific result type.
151
+ :rtype: HookResult
152
+ """
153
+
154
+ hook_id: str
155
+ hook_points: Sequence[HookPoint]
156
+
157
+ def run(self, context: HookContext) -> HookResult:
158
+ """
159
+ Execute the hook.
160
+
161
+ :param context: Hook context.
162
+ :type context: HookContext
163
+ :return: Hook result.
164
+ :rtype: HookResult
165
+ :raises NotImplementedError: If the hook does not implement run.
166
+ """
167
+ _ = context
168
+ raise NotImplementedError("LifecycleHook.run must be implemented by concrete hooks")
169
+
170
+
171
+ class AddTagsHook:
172
+ """
173
+ Built-in hook that adds tags during ingestion.
174
+
175
+ :ivar hook_id: Hook identifier.
176
+ :vartype hook_id: str
177
+ :ivar hook_points: Hook points where the hook applies.
178
+ :vartype hook_points: list[HookPoint]
179
+ :ivar tags: Tags to add.
180
+ :vartype tags: list[str]
181
+ """
182
+
183
+ hook_id = "add-tags"
184
+
185
+ def __init__(self, *, hook_points: Sequence[HookPoint], tags: Sequence[str]):
186
+ """
187
+ Initialize the add-tags hook.
188
+
189
+ :param hook_points: Hook points where the hook runs.
190
+ :type hook_points: Sequence[HookPoint]
191
+ :param tags: Tags to add.
192
+ :type tags: Sequence[str]
193
+ """
194
+ self.hook_points = list(hook_points)
195
+ self.tags = [t.strip() for t in tags if isinstance(t, str) and t.strip()]
196
+
197
+ def run(self, context: HookContext) -> HookResult:
198
+ """
199
+ Run the hook.
200
+
201
+ :param context: Hook context.
202
+ :type context: HookContext
203
+ :return: Ingest mutation result.
204
+ :rtype: HookResult
205
+ """
206
+ _ = context
207
+ return IngestMutation(add_tags=list(self.tags))
208
+
209
+
210
+ class DenyAllHook:
211
+ """
212
+ Built-in hook that denies every ingest.
213
+
214
+ :ivar hook_id: Hook identifier.
215
+ :vartype hook_id: str
216
+ :ivar hook_points: Hook points where the hook applies.
217
+ :vartype hook_points: list[HookPoint]
218
+ """
219
+
220
+ hook_id = "deny-all"
221
+
222
+ def __init__(self, *, hook_points: Sequence[HookPoint]):
223
+ """
224
+ Initialize the deny-all hook.
225
+
226
+ :param hook_points: Hook points where the hook runs.
227
+ :type hook_points: Sequence[HookPoint]
228
+ """
229
+ self.hook_points = list(hook_points)
230
+
231
+ def run(self, context: HookContext) -> HookResult:
232
+ """
233
+ Run the hook.
234
+
235
+ :param context: Hook context.
236
+ :type context: HookContext
237
+ :return: Ingest denial result.
238
+ :rtype: HookResult
239
+ """
240
+ _ = context
241
+ return IngestMutation(deny=True, deny_reason="Ingest denied by deny-all hook")
242
+
243
+
244
+ def build_builtin_hook(spec: HookSpec) -> LifecycleHook:
245
+ """
246
+ Build a built-in hook from a hook specification.
247
+
248
+ :param spec: Hook specification.
249
+ :type spec: HookSpec
250
+ :return: Hook instance.
251
+ :rtype: LifecycleHook
252
+ :raises KeyError: If the hook identifier is unknown.
253
+ """
254
+ if spec.hook_id == AddTagsHook.hook_id:
255
+ tags = spec.config.get("tags") or []
256
+ return AddTagsHook(
257
+ hook_points=spec.hook_points, tags=tags if isinstance(tags, list) else []
258
+ )
259
+ if spec.hook_id == DenyAllHook.hook_id:
260
+ return DenyAllHook(hook_points=spec.hook_points)
261
+ raise KeyError(f"Unknown hook_id {spec.hook_id!r}")
biblicus/ignore.py ADDED
@@ -0,0 +1,64 @@
1
+ """
2
+ Corpus ignore rules for bulk import and crawling.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import fnmatch
8
+ from pathlib import Path
9
+ from typing import List
10
+
11
+ from pydantic import BaseModel, ConfigDict, Field
12
+
13
+
14
+ class CorpusIgnoreSpec(BaseModel):
15
+ """
16
+ Parsed corpus ignore patterns.
17
+
18
+ Patterns are matched against a forward-slash relative path string.
19
+
20
+ :ivar patterns: Glob patterns to ignore.
21
+ :vartype patterns: list[str]
22
+ """
23
+
24
+ model_config = ConfigDict(extra="forbid")
25
+
26
+ patterns: List[str] = Field(default_factory=list)
27
+
28
+ def matches(self, relpath: str) -> bool:
29
+ """
30
+ Return True if the relative path matches any ignore pattern.
31
+
32
+ :param relpath: Forward-slash relative path.
33
+ :type relpath: str
34
+ :return: True if the path should be ignored.
35
+ :rtype: bool
36
+ """
37
+ normalized = relpath.replace("\\", "/").lstrip("/")
38
+ return any(fnmatch.fnmatch(normalized, pattern) for pattern in self.patterns)
39
+
40
+
41
+ def load_corpus_ignore_spec(corpus_root: Path) -> CorpusIgnoreSpec:
42
+ """
43
+ Load ignore patterns from the corpus ignore file, if present.
44
+
45
+ The ignore file is stored at the corpus root as `.biblicusignore`.
46
+
47
+ :param corpus_root: Corpus root directory.
48
+ :type corpus_root: Path
49
+ :return: Parsed ignore specification.
50
+ :rtype: CorpusIgnoreSpec
51
+ """
52
+ ignore_path = corpus_root / ".biblicusignore"
53
+ if not ignore_path.is_file():
54
+ return CorpusIgnoreSpec(patterns=[])
55
+
56
+ patterns: List[str] = []
57
+ for raw_line in ignore_path.read_text(encoding="utf-8").splitlines():
58
+ line = raw_line.strip()
59
+ if not line:
60
+ continue
61
+ if line.startswith("#"):
62
+ continue
63
+ patterns.append(line)
64
+ return CorpusIgnoreSpec(patterns=patterns)