biblicus 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +30 -0
- biblicus/__main__.py +8 -0
- biblicus/_vendor/dotyaml/__init__.py +14 -0
- biblicus/_vendor/dotyaml/interpolation.py +63 -0
- biblicus/_vendor/dotyaml/loader.py +181 -0
- biblicus/_vendor/dotyaml/transformer.py +135 -0
- biblicus/backends/__init__.py +42 -0
- biblicus/backends/base.py +65 -0
- biblicus/backends/scan.py +375 -0
- biblicus/backends/sqlite_full_text_search.py +487 -0
- biblicus/cli.py +804 -0
- biblicus/constants.py +12 -0
- biblicus/context.py +183 -0
- biblicus/corpus.py +1531 -0
- biblicus/crawl.py +186 -0
- biblicus/errors.py +15 -0
- biblicus/evaluation.py +257 -0
- biblicus/evidence_processing.py +201 -0
- biblicus/extraction.py +531 -0
- biblicus/extractors/__init__.py +44 -0
- biblicus/extractors/base.py +68 -0
- biblicus/extractors/metadata_text.py +106 -0
- biblicus/extractors/openai_stt.py +180 -0
- biblicus/extractors/pass_through_text.py +84 -0
- biblicus/extractors/pdf_text.py +100 -0
- biblicus/extractors/pipeline.py +105 -0
- biblicus/extractors/rapidocr_text.py +129 -0
- biblicus/extractors/select_longest_text.py +105 -0
- biblicus/extractors/select_text.py +100 -0
- biblicus/extractors/unstructured_text.py +100 -0
- biblicus/frontmatter.py +89 -0
- biblicus/hook_logging.py +180 -0
- biblicus/hook_manager.py +203 -0
- biblicus/hooks.py +261 -0
- biblicus/ignore.py +64 -0
- biblicus/knowledge_base.py +191 -0
- biblicus/models.py +445 -0
- biblicus/retrieval.py +133 -0
- biblicus/sources.py +212 -0
- biblicus/time.py +17 -0
- biblicus/uris.py +63 -0
- biblicus/user_config.py +138 -0
- biblicus-0.6.0.dist-info/METADATA +533 -0
- biblicus-0.6.0.dist-info/RECORD +48 -0
- biblicus-0.6.0.dist-info/WHEEL +5 -0
- biblicus-0.6.0.dist-info/entry_points.txt +2 -0
- biblicus-0.6.0.dist-info/licenses/LICENSE +21 -0
- biblicus-0.6.0.dist-info/top_level.txt +1 -0
biblicus/hook_manager.py
ADDED
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Hook manager for executing configured lifecycle hooks.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any, Dict, Iterable, List, Optional
|
|
9
|
+
|
|
10
|
+
from pydantic import BaseModel
|
|
11
|
+
|
|
12
|
+
from .constants import CORPUS_DIR_NAME, HOOK_LOGS_DIR_NAME
|
|
13
|
+
from .hook_logging import HookLogger, new_operation_id
|
|
14
|
+
from .hooks import (
|
|
15
|
+
HookContext,
|
|
16
|
+
HookPoint,
|
|
17
|
+
HookSpec,
|
|
18
|
+
IngestHookContext,
|
|
19
|
+
IngestMutation,
|
|
20
|
+
LifecycleHook,
|
|
21
|
+
build_builtin_hook,
|
|
22
|
+
)
|
|
23
|
+
from .time import utc_now_iso
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class HookManager:
    """
    Hook manager that executes configured hooks and records execution.

    :ivar corpus_uri: Canonical uniform resource identifier for the corpus.
    :vartype corpus_uri: str
    :ivar log_dir: Directory where hook logs are recorded.
    :vartype log_dir: Path
    :ivar operation_id: Identifier for this hook execution session.
    :vartype operation_id: str
    """

    def __init__(
        self,
        *,
        corpus_uri: str,
        log_dir: Path,
        hooks: Iterable[LifecycleHook],
        operation_id: Optional[str] = None,
    ):
        """
        Initialize a hook manager.

        :param corpus_uri: Canonical uniform resource identifier for the corpus.
        :type corpus_uri: str
        :param log_dir: Directory where hook logs are written.
        :type log_dir: Path
        :param hooks: Hook instances to execute.
        :type hooks: Iterable[LifecycleHook]
        :param operation_id: Optional operation identifier override. A new
            identifier is generated when it is omitted or empty.
        :type operation_id: str or None
        """
        self.corpus_uri = corpus_uri
        self.log_dir = log_dir
        self.operation_id = operation_id or new_operation_id()
        # Materialize the iterable once so it can be traversed for every hook point.
        self._hooks = list(hooks)
        self._logger = HookLogger(log_dir=self.log_dir, operation_id=self.operation_id)

    @classmethod
    def from_config(
        cls, *, corpus_root: Path, corpus_uri: str, hook_specs: Iterable[HookSpec]
    ) -> "HookManager":
        """
        Build a hook manager from config data.

        Hook logs are placed under the corpus directory inside ``corpus_root``.

        :param corpus_root: Corpus root directory.
        :type corpus_root: Path
        :param corpus_uri: Canonical uniform resource identifier for the corpus.
        :type corpus_uri: str
        :param hook_specs: Hook specifications loaded from config.
        :type hook_specs: Iterable[HookSpec]
        :return: Hook manager.
        :rtype: HookManager
        :raises KeyError: If a hook identifier is unknown.
        """
        log_dir = corpus_root / CORPUS_DIR_NAME / HOOK_LOGS_DIR_NAME
        hooks: List[LifecycleHook] = [build_builtin_hook(spec) for spec in hook_specs]
        return cls(corpus_uri=corpus_uri, log_dir=log_dir, hooks=hooks)

    def run_ingest_hooks(
        self,
        *,
        hook_point: HookPoint,
        filename: Optional[str],
        media_type: str,
        title: Optional[str],
        tags: List[str],
        metadata: Dict[str, Any],
        source_uri: str,
        item_id: Optional[str] = None,
        relpath: Optional[str] = None,
    ) -> IngestMutation:
        """
        Run ingestion hooks for a hook point.

        Every hook registered for ``hook_point`` runs in configuration order and
        each outcome is recorded in the hook log with status ``"ok"`` or
        ``"denied"``. The first hook that denies the ingest stops processing.
        The returned mutation only carries the merged ``add_tags``; duplicates
        are removed while keeping the first occurrence of each tag.

        :param hook_point: Hook point to execute.
        :type hook_point: HookPoint
        :param filename: Suggested filename.
        :type filename: str or None
        :param media_type: Media type for the item.
        :type media_type: str
        :param title: Optional title.
        :type title: str or None
        :param tags: Tags associated with the item.
        :type tags: list[str]
        :param metadata: Metadata mapping.
        :type metadata: dict[str, Any]
        :param source_uri: Source uniform resource identifier.
        :type source_uri: str
        :param item_id: Optional item identifier.
        :type item_id: str or None
        :param relpath: Optional relative path.
        :type relpath: str or None
        :return: Combined ingestion mutation result.
        :rtype: IngestMutation
        :raises ValueError: If ingestion is denied by a hook, a hook raises an
            exception, or a hook returns a non-Pydantic result.
        """
        context = IngestHookContext(
            hook_point=hook_point,
            operation_id=self.operation_id,
            corpus_uri=self.corpus_uri,
            created_at=utc_now_iso(),
            filename=filename,
            media_type=media_type,
            title=title,
            # Shallow copies: the context never aliases the caller's containers.
            tags=list(tags),
            metadata=dict(metadata),
            source_uri=source_uri,
            item_id=item_id,
            relpath=relpath,
        )

        combined = IngestMutation()
        for hook in self._hooks_for_point(hook_point):
            result_dict = self._run_single(hook=hook, context=context)
            mutation = IngestMutation.model_validate(result_dict)
            if mutation.deny:
                # Record the denial before raising so the log keeps the reason.
                self._logger.record(
                    hook_point=hook_point,
                    hook_id=hook.hook_id,
                    status="denied",
                    message=mutation.deny_reason or mutation.message,
                    item_id=item_id,
                    relpath=relpath,
                    source_uri=source_uri,
                    details={"add_tags": mutation.add_tags},
                )
                raise ValueError(mutation.deny_reason or "Ingest denied")
            if mutation.add_tags:
                combined.add_tags.extend(mutation.add_tags)
            self._logger.record(
                hook_point=hook_point,
                hook_id=hook.hook_id,
                status="ok",
                message=mutation.message,
                item_id=item_id,
                relpath=relpath,
                source_uri=source_uri,
                details={"add_tags": mutation.add_tags},
            )

        # dict preserves insertion order, so this keeps the first occurrence of
        # every tag without rescanning a list for each candidate.
        combined.add_tags = list(dict.fromkeys(combined.add_tags))
        return combined

    def _hooks_for_point(self, hook_point: HookPoint) -> List[LifecycleHook]:
        """
        Select the hooks registered for a hook point.

        Hooks without a ``hook_points`` attribute are treated as registered for
        no hook point.

        :param hook_point: Hook point to filter by.
        :type hook_point: HookPoint
        :return: Matching hooks in their configured order.
        :rtype: list[LifecycleHook]
        """
        return [hook for hook in self._hooks if hook_point in getattr(hook, "hook_points", ())]

    def _run_single(self, *, hook: LifecycleHook, context: HookContext) -> Dict[str, Any]:
        """
        Run a single hook with error capture.

        :param hook: Hook to execute.
        :type hook: LifecycleHook
        :param context: Hook context.
        :type context: HookContext
        :return: Hook result mapping.
        :rtype: dict[str, Any]
        :raises ValueError: If a hook raises an exception or returns a result
            that is not a Pydantic model.
        """
        try:
            result = hook.run(context)
        except Exception as exc:
            # Normalize any hook failure to ValueError, keeping the original as the cause.
            raise ValueError(f"Hook {hook.hook_id!r} failed: {exc}") from exc
        if isinstance(result, BaseModel):
            return result.model_dump()
        raise ValueError(f"Hook {hook.hook_id!r} returned a non-Pydantic result")
|
biblicus/hooks.py
ADDED
|
@@ -0,0 +1,261 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Lifecycle hook interfaces and built-in hook implementations.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from enum import Enum
|
|
8
|
+
from typing import Any, Dict, List, Optional, Sequence
|
|
9
|
+
|
|
10
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class HookPoint(str, Enum):
    """
    Named lifecycle moments at which corpus operations can invoke hooks.

    Each member is also a plain string whose value equals its name.

    :cvar before_ingest: Fired before an item is ingested.
    :cvar after_ingest: Fired after an item is ingested and indexed.
    :cvar before_reindex: Fired before a catalog rebuild starts.
    :cvar after_reindex: Fired after a catalog rebuild completes.
    :cvar before_build_run: Fired before a backend run build starts.
    :cvar after_build_run: Fired after a backend run build completes.
    :cvar before_query: Fired before a query is executed.
    :cvar after_query: Fired after a query completes.
    :cvar before_evaluate_run: Fired before an evaluation starts.
    :cvar after_evaluate_run: Fired after an evaluation completes.
    """

    # Ingestion
    before_ingest = "before_ingest"
    after_ingest = "after_ingest"

    # Catalog rebuild
    before_reindex = "before_reindex"
    after_reindex = "after_reindex"

    # Backend run build
    before_build_run = "before_build_run"
    after_build_run = "after_build_run"

    # Query
    before_query = "before_query"
    after_query = "after_query"

    # Evaluation
    before_evaluate_run = "before_evaluate_run"
    after_evaluate_run = "after_evaluate_run"
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class HookSpec(BaseModel):
    """
    Hook specification as persisted in a corpus config.

    Unknown keys are rejected during validation.

    :ivar hook_id: Non-empty identifier used to locate a hook implementation.
    :vartype hook_id: str
    :ivar hook_points: Hook points where the hook executes; empty by default.
    :vartype hook_points: list[HookPoint]
    :ivar config: Hook-specific configuration values; empty by default.
    :vartype config: dict[str, Any]
    """

    model_config = ConfigDict(extra="forbid")

    hook_id: str = Field(min_length=1)
    hook_points: List[HookPoint] = Field(default_factory=list)
    config: Dict[str, Any] = Field(default_factory=dict)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class HookContext(BaseModel):
    """
    Fields shared by every context object handed to a hook.

    All four fields are required and unknown keys are rejected.

    :ivar hook_point: Hook point currently executing.
    :vartype hook_point: HookPoint
    :ivar operation_id: Identifier for the enclosing command or call.
    :vartype operation_id: str
    :ivar corpus_uri: Canonical uniform resource identifier for the corpus.
    :vartype corpus_uri: str
    :ivar created_at: International Organization for Standardization 8601
        timestamp taken when the context was created.
    :vartype created_at: str
    """

    model_config = ConfigDict(extra="forbid")

    hook_point: HookPoint
    operation_id: str
    corpus_uri: str
    created_at: str
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
class IngestHookContext(HookContext):
    """
    Context handed to hooks that run around item ingestion.

    Extends the base hook context with a description of the item. Only
    ``media_type`` and ``source_uri`` are required in addition to the base
    fields.

    :ivar filename: Suggested filename for the item.
    :vartype filename: str or None
    :ivar media_type: Media type for the item.
    :vartype media_type: str
    :ivar title: Optional title associated with the item.
    :vartype title: str or None
    :ivar tags: Tags associated with the item; empty by default.
    :vartype tags: list[str]
    :ivar metadata: Metadata mapping associated with the item; empty by default.
    :vartype metadata: dict[str, Any]
    :ivar source_uri: Source uniform resource identifier.
    :vartype source_uri: str
    :ivar item_id: Item identifier when available.
    :vartype item_id: str or None
    :ivar relpath: Relative path to stored raw bytes when available.
    :vartype relpath: str or None
    """

    filename: Optional[str] = None
    media_type: str
    title: Optional[str] = None
    tags: List[str] = Field(default_factory=list)
    metadata: Dict[str, Any] = Field(default_factory=dict)
    source_uri: str
    item_id: Optional[str] = None
    relpath: Optional[str] = None
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
class HookResult(BaseModel):
    """
    Common base for values returned by hooks.

    Unknown keys are rejected during validation.

    :ivar message: Optional human-readable message.
    :vartype message: str or None
    """

    model_config = ConfigDict(extra="forbid")

    message: Optional[str] = None
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
class IngestMutation(HookResult):
    """
    Result of an ingestion hook: either a denial or a set of changes.

    The defaults describe a hook that allows the ingest and changes nothing.

    :ivar deny: Whether ingest should be denied.
    :vartype deny: bool
    :ivar deny_reason: Optional reason for denial.
    :vartype deny_reason: str or None
    :ivar add_tags: Tags to add; empty by default.
    :vartype add_tags: list[str]
    """

    deny: bool = False
    deny_reason: Optional[str] = None
    add_tags: List[str] = Field(default_factory=list)
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
class LifecycleHook:
    """
    Interface that lifecycle hook implementations follow.

    Concrete hooks provide the two attributes below and override ``run``.

    :ivar hook_id: Identifier of the hook implementation.
    :vartype hook_id: str
    :ivar hook_points: Hook points where the hook applies.
    :vartype hook_points: Sequence[HookPoint]
    """

    hook_id: str
    hook_points: Sequence[HookPoint]

    def run(self, context: HookContext) -> HookResult:
        """
        Execute the hook.

        The base implementation never returns; it only signals that a concrete
        hook forgot to override this method.

        :param context: Hook context.
        :type context: HookContext
        :return: Hook result. Concrete hook points may require a more specific result type.
        :rtype: HookResult
        :raises NotImplementedError: If the hook does not implement run.
        """
        # The argument is part of the interface but unused by the base class.
        del context
        raise NotImplementedError("LifecycleHook.run must be implemented by concrete hooks")
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
class AddTagsHook(LifecycleHook):
    """
    Built-in hook that adds tags during ingestion.

    Derives from ``LifecycleHook`` so that instances satisfy the
    ``LifecycleHook`` annotations used by the hook factory and hook manager.

    :ivar hook_id: Hook identifier.
    :vartype hook_id: str
    :ivar hook_points: Hook points where the hook applies.
    :vartype hook_points: list[HookPoint]
    :ivar tags: Tags to add.
    :vartype tags: list[str]
    """

    hook_id = "add-tags"

    def __init__(self, *, hook_points: Sequence[HookPoint], tags: Sequence[str]):
        """
        Initialize the add-tags hook.

        Tags are stripped of surrounding whitespace; entries that are not
        strings or that are empty after stripping are dropped.

        :param hook_points: Hook points where the hook runs.
        :type hook_points: Sequence[HookPoint]
        :param tags: Tags to add.
        :type tags: Sequence[str]
        """
        self.hook_points = list(hook_points)
        # Strip each tag once, then discard the ones that ended up empty.
        stripped = (tag.strip() for tag in tags if isinstance(tag, str))
        self.tags = [tag for tag in stripped if tag]

    def run(self, context: HookContext) -> HookResult:
        """
        Run the hook.

        The context is not inspected; the configured tags are always returned.

        :param context: Hook context.
        :type context: HookContext
        :return: Ingest mutation result carrying a copy of the configured tags.
        :rtype: HookResult
        """
        _ = context
        # Hand out a copy so callers cannot mutate the hook's own tag list.
        return IngestMutation(add_tags=list(self.tags))
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
class DenyAllHook(LifecycleHook):
    """
    Built-in hook that denies every ingest.

    Derives from ``LifecycleHook`` so that instances satisfy the
    ``LifecycleHook`` annotations used by the hook factory and hook manager.

    :ivar hook_id: Hook identifier.
    :vartype hook_id: str
    :ivar hook_points: Hook points where the hook applies.
    :vartype hook_points: list[HookPoint]
    """

    hook_id = "deny-all"

    def __init__(self, *, hook_points: Sequence[HookPoint]):
        """
        Initialize the deny-all hook.

        :param hook_points: Hook points where the hook runs.
        :type hook_points: Sequence[HookPoint]
        """
        self.hook_points = list(hook_points)

    def run(self, context: HookContext) -> HookResult:
        """
        Run the hook.

        The context is not inspected; a denial is always returned.

        :param context: Hook context.
        :type context: HookContext
        :return: Ingest denial result.
        :rtype: HookResult
        """
        _ = context
        return IngestMutation(deny=True, deny_reason="Ingest denied by deny-all hook")
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
def build_builtin_hook(spec: HookSpec) -> LifecycleHook:
    """
    Instantiate the built-in hook named by a hook specification.

    For the add-tags hook, the ``tags`` config entry is used only when it is a
    list; a missing, falsy, or non-list value results in no tags.

    :param spec: Hook specification.
    :type spec: HookSpec
    :return: Hook instance.
    :rtype: LifecycleHook
    :raises KeyError: If the hook identifier is unknown.
    """
    requested = spec.hook_id

    if requested == DenyAllHook.hook_id:
        return DenyAllHook(hook_points=spec.hook_points)

    if requested != AddTagsHook.hook_id:
        raise KeyError(f"Unknown hook_id {spec.hook_id!r}")

    configured = spec.config.get("tags") or []
    if not isinstance(configured, list):
        configured = []
    return AddTagsHook(hook_points=spec.hook_points, tags=configured)
|
biblicus/ignore.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Corpus ignore rules for bulk import and crawling.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import fnmatch
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import List
|
|
10
|
+
|
|
11
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class CorpusIgnoreSpec(BaseModel):
    """
    Set of glob patterns naming paths to skip.

    Patterns are matched against a forward-slash relative path string. Unknown
    keys are rejected during validation.

    :ivar patterns: Glob patterns to ignore; empty by default.
    :vartype patterns: list[str]
    """

    model_config = ConfigDict(extra="forbid")

    patterns: List[str] = Field(default_factory=list)

    def matches(self, relpath: str) -> bool:
        """
        Tell whether a relative path is covered by any ignore pattern.

        Backslashes are converted to forward slashes and leading slashes are
        removed before the path is compared with each pattern.

        :param relpath: Forward-slash relative path.
        :type relpath: str
        :return: True if the path should be ignored.
        :rtype: bool
        """
        candidate = relpath.replace("\\", "/").lstrip("/")
        for pattern in self.patterns:
            if fnmatch.fnmatch(candidate, pattern):
                return True
        return False
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def load_corpus_ignore_spec(corpus_root: Path) -> CorpusIgnoreSpec:
    """
    Load ignore patterns from the corpus ignore file, if present.

    The ignore file is stored at the corpus root as `.biblicusignore`. Each
    line is stripped of surrounding whitespace; blank lines and lines starting
    with ``#`` are skipped, and every remaining line becomes one pattern.

    :param corpus_root: Corpus root directory.
    :type corpus_root: Path
    :return: Parsed ignore specification; empty when the file does not exist.
    :rtype: CorpusIgnoreSpec
    """
    ignore_path = corpus_root / ".biblicusignore"
    if not ignore_path.is_file():
        return CorpusIgnoreSpec(patterns=[])

    # "utf-8-sig" reads plain UTF-8 unchanged but drops a leading byte-order
    # mark. Without this the mark stays attached to the first line, so a
    # leading comment is taken for a pattern or the first pattern never matches.
    text = ignore_path.read_text(encoding="utf-8-sig")

    patterns: List[str] = []
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue
        patterns.append(line)
    return CorpusIgnoreSpec(patterns=patterns)
|