biblicus 0.1.1__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +1 -1
- biblicus/backends/scan.py +81 -4
- biblicus/backends/sqlite_full_text_search.py +63 -2
- biblicus/cli.py +123 -0
- biblicus/constants.py +2 -0
- biblicus/corpus.py +431 -2
- biblicus/extraction.py +330 -0
- biblicus/extractors/__init__.py +33 -0
- biblicus/extractors/base.py +61 -0
- biblicus/extractors/cascade.py +101 -0
- biblicus/extractors/metadata_text.py +98 -0
- biblicus/extractors/pass_through_text.py +74 -0
- biblicus/hook_logging.py +185 -0
- biblicus/hook_manager.py +205 -0
- biblicus/hooks.py +265 -0
- biblicus/ignore.py +67 -0
- biblicus/models.py +20 -0
- biblicus/sources.py +45 -0
- {biblicus-0.1.1.dist-info → biblicus-0.2.0.dist-info}/METADATA +101 -1
- biblicus-0.2.0.dist-info/RECORD +32 -0
- biblicus-0.1.1.dist-info/RECORD +0 -22
- {biblicus-0.1.1.dist-info → biblicus-0.2.0.dist-info}/WHEEL +0 -0
- {biblicus-0.1.1.dist-info → biblicus-0.2.0.dist-info}/entry_points.txt +0 -0
- {biblicus-0.1.1.dist-info → biblicus-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-0.1.1.dist-info → biblicus-0.2.0.dist-info}/top_level.txt +0 -0
biblicus/hook_logging.py
ADDED
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Structured hook execution logging.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import json
|
|
8
|
+
import uuid
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Any, Dict, Optional
|
|
11
|
+
from urllib.parse import urlparse, urlunparse
|
|
12
|
+
|
|
13
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
14
|
+
|
|
15
|
+
from .hooks import HookPoint
|
|
16
|
+
from .time import utc_now_iso
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def new_operation_id() -> str:
    """
    Generate a fresh identifier used to group hook log records.

    The identifier is the canonical string form of a random version 4
    universally unique identifier.

    :return: Newly generated operation identifier.
    :rtype: str
    """

    identifier = uuid.uuid4()
    return str(identifier)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def redact_source_uri(source_uri: str) -> str:
    """
    Redact sensitive components from a source uniform resource identifier.

    Only the user information part of the network location (``user:password@``)
    is removed. Scheme, host, port, path, params, query, and fragment are
    preserved. Values without a scheme (for example plain relative paths) are
    returned unchanged.

    :param source_uri: Source uniform resource identifier.
    :type source_uri: str
    :return: Redacted source uniform resource identifier.
    :rtype: str
    """

    parsed = urlparse(source_uri)
    if not parsed.scheme:
        # Not a uniform resource identifier with a scheme; nothing to redact.
        return source_uri

    # Split on the LAST "@": the user information itself may contain "@"
    # (for example "user:p@ss@host"), and splitting on the first one would
    # leak the tail of the password into the supposedly redacted value.
    host_and_port = parsed.netloc.rpartition("@")[2]

    return urlunparse(
        (
            parsed.scheme,
            host_and_port,
            parsed.path,
            parsed.params,
            parsed.query,
            parsed.fragment,
        )
    )
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class HookLogEntry(BaseModel):
    """
    One structured record describing a single hook execution.

    The model is configured with ``extra="forbid"``, so unknown fields are
    rejected. Field declaration order is the key order of the dumped record.

    :ivar operation_id: Identifier of the enclosing command or call.
    :vartype operation_id: str
    :ivar hook_point: Hook point that executed.
    :vartype hook_point: HookPoint
    :ivar hook_id: Identifier of the hook implementation.
    :vartype hook_id: str
    :ivar recorded_at: International Organization for Standardization 8601 timestamp of record creation.
    :vartype recorded_at: str
    :ivar status: Non-empty execution status string.
    :vartype status: str
    :ivar message: Optional description of the execution result.
    :vartype message: str or None
    :ivar item_id: Optional identifier of the affected item.
    :vartype item_id: str or None
    :ivar relpath: Optional relative path associated with the item.
    :vartype relpath: str or None
    :ivar source_uri: Optional redacted source uniform resource identifier.
    :vartype source_uri: str or None
    :ivar details: Optional structured details about changes; defaults to an empty mapping.
    :vartype details: dict[str, Any]
    """

    model_config = ConfigDict(extra="forbid")

    # What ran, when, and how it ended.
    operation_id: str
    hook_point: HookPoint
    hook_id: str
    recorded_at: str
    status: str = Field(min_length=1)

    # Optional context about the outcome and the affected item.
    message: Optional[str] = None
    item_id: Optional[str] = None
    relpath: Optional[str] = None
    source_uri: Optional[str] = None
    details: Dict[str, Any] = Field(default_factory=dict)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
class HookLogger:
    """
    Append-only writer of hook execution records in JSON lines format.

    Every operation gets its own file inside the log directory, named after
    the operation identifier.

    :ivar log_dir: Directory where log files are written.
    :vartype log_dir: Path
    :ivar operation_id: Operation identifier for grouping records.
    :vartype operation_id: str
    """

    def __init__(self, *, log_dir: Path, operation_id: str):
        """
        Store the destination directory and the operation identifier.

        Nothing is created on disk here; the directory is created when the
        first record is written.

        :param log_dir: Log directory to write into.
        :type log_dir: Path
        :param operation_id: Operation identifier for grouping records.
        :type operation_id: str
        """

        self.log_dir = log_dir
        self.operation_id = operation_id

    @property
    def path(self) -> Path:
        """
        Compute the log file path for this operation.

        :return: ``<log_dir>/<operation_id>.jsonl``.
        :rtype: Path
        """

        file_name = f"{self.operation_id}.jsonl"
        return self.log_dir / file_name

    def record(
        self,
        *,
        hook_point: HookPoint,
        hook_id: str,
        status: str,
        message: Optional[str] = None,
        item_id: Optional[str] = None,
        relpath: Optional[str] = None,
        source_uri: Optional[str] = None,
        details: Optional[Dict[str, Any]] = None,
    ) -> None:
        """
        Validate one hook execution record and append it to the log file.

        The log directory is created if missing. The source uniform resource
        identifier, when given, is redacted before it is stored.

        :param hook_point: Hook point that executed.
        :type hook_point: HookPoint
        :param hook_id: Hook identifier.
        :type hook_id: str
        :param status: Status string such as ok, denied, or error.
        :type status: str
        :param message: Optional message describing results.
        :type message: str or None
        :param item_id: Optional item identifier.
        :type item_id: str or None
        :param relpath: Optional relative path for the item.
        :type relpath: str or None
        :param source_uri: Optional source uniform resource identifier.
        :type source_uri: str or None
        :param details: Optional structured details; copied before storing.
        :type details: dict[str, Any] or None
        :return: None.
        :rtype: None
        """

        self.log_dir.mkdir(parents=True, exist_ok=True)

        # Empty or missing source identifiers are stored as None.
        redacted_uri = redact_source_uri(source_uri) if source_uri else None
        details_copy = dict(details or {})

        entry = HookLogEntry(
            operation_id=self.operation_id,
            hook_point=hook_point,
            hook_id=hook_id,
            recorded_at=utc_now_iso(),
            status=status,
            message=message,
            item_id=item_id,
            relpath=relpath,
            source_uri=redacted_uri,
            details=details_copy,
        )

        serialized = json.dumps(entry.model_dump(), sort_keys=False)
        with self.path.open("a", encoding="utf-8") as log_file:
            log_file.write(serialized + "\n")
|
biblicus/hook_manager.py
ADDED
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Hook manager for executing configured lifecycle hooks.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any, Dict, Iterable, List, Optional
|
|
9
|
+
|
|
10
|
+
from pydantic import BaseModel
|
|
11
|
+
|
|
12
|
+
from .constants import CORPUS_DIR_NAME, HOOK_LOGS_DIR_NAME
|
|
13
|
+
from .hook_logging import HookLogger, new_operation_id
|
|
14
|
+
from .hooks import (
|
|
15
|
+
HookContext,
|
|
16
|
+
HookPoint,
|
|
17
|
+
HookSpec,
|
|
18
|
+
IngestHookContext,
|
|
19
|
+
IngestMutation,
|
|
20
|
+
LifecycleHook,
|
|
21
|
+
build_builtin_hook,
|
|
22
|
+
)
|
|
23
|
+
from .time import utc_now_iso
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class HookManager:
    """
    Runs the configured lifecycle hooks and logs every execution.

    :ivar corpus_uri: Canonical uniform resource identifier for the corpus.
    :vartype corpus_uri: str
    :ivar log_dir: Directory where hook logs are recorded.
    :vartype log_dir: Path
    :ivar operation_id: Identifier for this hook execution session.
    :vartype operation_id: str
    """

    def __init__(
        self,
        *,
        corpus_uri: str,
        log_dir: Path,
        hooks: Iterable[LifecycleHook],
        operation_id: Optional[str] = None,
    ):
        """
        Set up the manager and its logger.

        :param corpus_uri: Canonical uniform resource identifier for the corpus.
        :type corpus_uri: str
        :param log_dir: Directory where hook logs are written.
        :type log_dir: Path
        :param hooks: Hook instances to execute; materialized into a list.
        :type hooks: Iterable[LifecycleHook]
        :param operation_id: Optional operation identifier override; a new one is generated when omitted or empty.
        :type operation_id: str or None
        """

        self.corpus_uri = corpus_uri
        self.log_dir = log_dir
        self.operation_id = operation_id or new_operation_id()
        self._hooks = list(hooks)
        self._logger = HookLogger(log_dir=self.log_dir, operation_id=self.operation_id)

    @classmethod
    def from_config(cls, *, corpus_root: Path, corpus_uri: str, hook_specs: Iterable[HookSpec]) -> "HookManager":
        """
        Create a hook manager from hook specifications.

        Logs are placed under the hook logs directory inside the corpus
        directory of ``corpus_root``.

        :param corpus_root: Corpus root directory.
        :type corpus_root: Path
        :param corpus_uri: Canonical uniform resource identifier for the corpus.
        :type corpus_uri: str
        :param hook_specs: Hook specifications loaded from config.
        :type hook_specs: Iterable[HookSpec]
        :return: Hook manager.
        :rtype: HookManager
        :raises KeyError: If a hook identifier is unknown.
        """

        built_hooks: List[LifecycleHook] = [build_builtin_hook(spec) for spec in hook_specs]
        return cls(
            corpus_uri=corpus_uri,
            log_dir=corpus_root / CORPUS_DIR_NAME / HOOK_LOGS_DIR_NAME,
            hooks=built_hooks,
        )

    def run_ingest_hooks(
        self,
        *,
        hook_point: HookPoint,
        filename: Optional[str],
        media_type: str,
        title: Optional[str],
        tags: List[str],
        metadata: Dict[str, Any],
        source_uri: str,
        item_id: Optional[str] = None,
        relpath: Optional[str] = None,
    ) -> IngestMutation:
        """
        Execute every hook registered for an ingestion hook point.

        All hooks receive the same context, built once from the arguments.
        Each execution is logged. The first hook that denies the ingest stops
        processing. Tags added by the hooks are merged in execution order with
        duplicates removed.

        :param hook_point: Hook point to execute.
        :type hook_point: HookPoint
        :param filename: Suggested filename.
        :type filename: str or None
        :param media_type: Media type for the item.
        :type media_type: str
        :param title: Optional title.
        :type title: str or None
        :param tags: Tags associated with the item.
        :type tags: list[str]
        :param metadata: Metadata mapping.
        :type metadata: dict[str, Any]
        :param source_uri: Source uniform resource identifier.
        :type source_uri: str
        :param item_id: Optional item identifier.
        :type item_id: str or None
        :param relpath: Optional relative path.
        :type relpath: str or None
        :return: Combined ingestion mutation result.
        :rtype: IngestMutation
        :raises ValueError: If ingestion is denied by a hook, or a hook fails.
        """

        # Copy the mutable inputs so the context does not alias caller-owned containers.
        context = IngestHookContext(
            hook_point=hook_point,
            operation_id=self.operation_id,
            corpus_uri=self.corpus_uri,
            created_at=utc_now_iso(),
            filename=filename,
            media_type=media_type,
            title=title,
            tags=list(tags),
            metadata=dict(metadata),
            source_uri=source_uri,
            item_id=item_id,
            relpath=relpath,
        )

        combined = IngestMutation()
        for hook in self._hooks_for_point(hook_point):
            mutation = IngestMutation.model_validate(self._run_single(hook=hook, context=context))

            if mutation.deny:
                status = "denied"
                log_message = mutation.deny_reason or mutation.message
            else:
                status = "ok"
                log_message = mutation.message
                combined.add_tags.extend(mutation.add_tags)

            self._logger.record(
                hook_point=hook_point,
                hook_id=hook.hook_id,
                status=status,
                message=log_message,
                item_id=item_id,
                relpath=relpath,
                source_uri=source_uri,
                details={"add_tags": mutation.add_tags},
            )

            if mutation.deny:
                # The denial is logged above before it is reported to the caller.
                raise ValueError(mutation.deny_reason or "Ingest denied")

        # dict.fromkeys keeps the first occurrence of every tag in order.
        combined.add_tags = list(dict.fromkeys(combined.add_tags))
        return combined

    def _hooks_for_point(self, hook_point: HookPoint) -> List[LifecycleHook]:
        """
        Select the hooks registered for a hook point, in configuration order.

        Hooks without a ``hook_points`` attribute are never selected.

        :param hook_point: Hook point to filter by.
        :type hook_point: HookPoint
        :return: Hooks that declare the hook point.
        :rtype: list[LifecycleHook]
        """

        return [hook for hook in self._hooks if hook_point in tuple(getattr(hook, "hook_points", ()))]

    def _run_single(self, *, hook: LifecycleHook, context: HookContext) -> Dict[str, Any]:
        """
        Execute one hook and convert its result into a plain mapping.

        :param hook: Hook to execute.
        :type hook: LifecycleHook
        :param context: Hook context.
        :type context: HookContext
        :return: Hook result mapping.
        :rtype: dict[str, Any]
        :raises ValueError: If the hook raises an exception or returns something that is not a Pydantic model.
        """

        try:
            outcome = hook.run(context)
        except Exception as exc:
            raise ValueError(f"Hook {hook.hook_id!r} failed: {exc}") from exc

        if not isinstance(outcome, BaseModel):
            raise ValueError(f"Hook {hook.hook_id!r} returned a non-Pydantic result")
        return outcome.model_dump()
|
biblicus/hooks.py
ADDED
|
@@ -0,0 +1,265 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Lifecycle hook interfaces and built-in hook implementations.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from enum import Enum
|
|
8
|
+
from typing import Any, Dict, List, Optional, Sequence
|
|
9
|
+
|
|
10
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class HookPoint(str, Enum):
    """
    Named lifecycle moments at which hooks may run.

    Members are strings; each member's value equals its name.

    :cvar before_ingest: Called before an item is ingested.
    :cvar after_ingest: Called after an item is ingested and indexed.
    :cvar before_reindex: Called before a catalog rebuild starts.
    :cvar after_reindex: Called after a catalog rebuild completes.
    :cvar before_build_run: Called before a backend run build starts.
    :cvar after_build_run: Called after a backend run build completes.
    :cvar before_query: Called before a query is executed.
    :cvar after_query: Called after a query completes.
    :cvar before_evaluate_run: Called before an evaluation starts.
    :cvar after_evaluate_run: Called after an evaluation completes.
    """

    # Ingestion
    before_ingest = "before_ingest"
    after_ingest = "after_ingest"

    # Catalog rebuild
    before_reindex = "before_reindex"
    after_reindex = "after_reindex"

    # Backend run build
    before_build_run = "before_build_run"
    after_build_run = "after_build_run"

    # Query
    before_query = "before_query"
    after_query = "after_query"

    # Evaluation
    before_evaluate_run = "before_evaluate_run"
    after_evaluate_run = "after_evaluate_run"
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class HookSpec(BaseModel):
    """
    Declarative description of one hook as stored in a corpus config.

    Unknown fields are rejected (``extra="forbid"``).

    :ivar hook_id: Non-empty identifier used to locate a hook implementation.
    :vartype hook_id: str
    :ivar hook_points: Hook points where the hook executes; empty by default.
    :vartype hook_points: list[HookPoint]
    :ivar config: Hook-specific configuration values; empty by default.
    :vartype config: dict[str, Any]
    """

    model_config = ConfigDict(extra="forbid")

    # Which implementation to build.
    hook_id: str = Field(min_length=1)

    # When it runs and how it is configured.
    hook_points: List[HookPoint] = Field(default_factory=list)
    config: Dict[str, Any] = Field(default_factory=dict)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class HookContext(BaseModel):
    """
    Information common to every hook invocation.

    Unknown fields are rejected (``extra="forbid"``).

    :ivar hook_point: Hook point currently executing.
    :vartype hook_point: HookPoint
    :ivar operation_id: Identifier for the enclosing command or call.
    :vartype operation_id: str
    :ivar corpus_uri: Canonical uniform resource identifier for the corpus.
    :vartype corpus_uri: str
    :ivar created_at: International Organization for Standardization 8601 timestamp when the context was created.
    :vartype created_at: str
    """

    model_config = ConfigDict(extra="forbid")

    # All four fields are required; none has a default.
    hook_point: HookPoint
    operation_id: str
    corpus_uri: str
    created_at: str
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
class IngestHookContext(HookContext):
    """
    Context handed to hooks that run around item ingestion.

    Extends the base hook context with a description of the item.

    :ivar filename: Suggested filename for the item.
    :vartype filename: str or None
    :ivar media_type: Media type for the item (required).
    :vartype media_type: str
    :ivar title: Optional title associated with the item.
    :vartype title: str or None
    :ivar tags: Tags associated with the item; empty by default.
    :vartype tags: list[str]
    :ivar metadata: Metadata mapping associated with the item; empty by default.
    :vartype metadata: dict[str, Any]
    :ivar source_uri: Source uniform resource identifier (required).
    :vartype source_uri: str
    :ivar item_id: Item identifier when available.
    :vartype item_id: str or None
    :ivar relpath: Relative path to stored raw bytes when available.
    :vartype relpath: str or None
    """

    # Description of the item.
    filename: Optional[str] = None
    media_type: str
    title: Optional[str] = None
    tags: List[str] = Field(default_factory=list)
    metadata: Dict[str, Any] = Field(default_factory=dict)

    # Where the item came from and, when known, where it is stored.
    source_uri: str
    item_id: Optional[str] = None
    relpath: Optional[str] = None
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
class HookResult(BaseModel):
    """
    Minimal result every hook can return.

    Unknown fields are rejected (``extra="forbid"``).

    :ivar message: Optional human-readable message.
    :vartype message: str or None
    """

    model_config = ConfigDict(extra="forbid")

    # Free-form note from the hook; absent by default.
    message: Optional[str] = None
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
class IngestMutation(HookResult):
    """
    Result of an ingestion hook: an optional denial plus tags to add.

    With all defaults the result neither denies the ingest nor adds tags.

    :ivar deny: Whether ingest should be denied.
    :vartype deny: bool
    :ivar deny_reason: Optional reason for denial.
    :vartype deny_reason: str or None
    :ivar add_tags: Tags to add; empty by default.
    :vartype add_tags: list[str]
    """

    # Denial decision and its optional explanation.
    deny: bool = False
    deny_reason: Optional[str] = None

    # Tags the hook wants attached to the item.
    add_tags: List[str] = Field(default_factory=list)
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
class LifecycleHook:
    """
    Interface that lifecycle hook implementations follow.

    Implementations expose a ``hook_id`` and the ``hook_points`` they apply to,
    and implement :meth:`run`.

    :ivar hook_id: Hook identifier.
    :vartype hook_id: str
    :ivar hook_points: Hook points where the hook applies.
    :vartype hook_points: Sequence[HookPoint]
    """

    hook_id: str
    hook_points: Sequence[HookPoint]

    def run(self, context: HookContext) -> HookResult:
        """
        Execute the hook; the base implementation always raises.

        :param context: Hook context.
        :type context: HookContext
        :return: Hook result. Concrete hook points may require a more specific result type.
        :rtype: HookResult
        :raises NotImplementedError: If the hook does not implement run.
        """

        del context  # intentionally unused in the base class
        raise NotImplementedError("LifecycleHook.run must be implemented by concrete hooks")
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
class AddTagsHook:
    """
    Built-in hook that contributes a fixed set of tags during ingestion.

    :ivar hook_id: Hook identifier (``"add-tags"``).
    :vartype hook_id: str
    :ivar hook_points: Hook points where the hook applies.
    :vartype hook_points: list[HookPoint]
    :ivar tags: Cleaned tags to add.
    :vartype tags: list[str]
    """

    hook_id = "add-tags"

    def __init__(self, *, hook_points: Sequence[HookPoint], tags: Sequence[str]):
        """
        Store the hook points and a cleaned copy of the tags.

        Non-string entries and entries that are empty after stripping
        surrounding whitespace are dropped; the rest are stored stripped.

        :param hook_points: Hook points where the hook runs.
        :type hook_points: Sequence[HookPoint]
        :param tags: Tags to add.
        :type tags: Sequence[str]
        """

        self.hook_points = list(hook_points)

        cleaned: List[str] = []
        for candidate in tags:
            if not isinstance(candidate, str):
                continue
            stripped = candidate.strip()
            if stripped:
                cleaned.append(stripped)
        self.tags = cleaned

    def run(self, context: HookContext) -> HookResult:
        """
        Produce an ingest mutation that adds the configured tags.

        The context is not inspected.

        :param context: Hook context.
        :type context: HookContext
        :return: Ingest mutation result carrying a copy of the tags.
        :rtype: HookResult
        """

        del context  # the result does not depend on the context
        tags_copy = list(self.tags)
        return IngestMutation(add_tags=tags_copy)
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
class DenyAllHook:
    """
    Built-in hook that rejects every ingest it is asked about.

    :ivar hook_id: Hook identifier (``"deny-all"``).
    :vartype hook_id: str
    :ivar hook_points: Hook points where the hook applies.
    :vartype hook_points: list[HookPoint]
    """

    hook_id = "deny-all"

    def __init__(self, *, hook_points: Sequence[HookPoint]):
        """
        Store a list copy of the hook points.

        :param hook_points: Hook points where the hook runs.
        :type hook_points: Sequence[HookPoint]
        """

        self.hook_points = [*hook_points]

    def run(self, context: HookContext) -> HookResult:
        """
        Produce an ingest mutation that denies the ingest.

        The context is not inspected.

        :param context: Hook context.
        :type context: HookContext
        :return: Ingest denial result with a fixed reason.
        :rtype: HookResult
        """

        del context  # the result does not depend on the context
        denial = IngestMutation(deny=True, deny_reason="Ingest denied by deny-all hook")
        return denial
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
def build_builtin_hook(spec: HookSpec) -> LifecycleHook:
    """
    Instantiate the built-in hook named by a hook specification.

    For the add-tags hook, tags are read from the ``"tags"`` config key. A
    missing, empty, or non-list value is treated as no tags.

    :param spec: Hook specification.
    :type spec: HookSpec
    :return: Hook instance.
    :rtype: LifecycleHook
    :raises KeyError: If the hook identifier is unknown.
    """

    requested_id = spec.hook_id

    if requested_id == AddTagsHook.hook_id:
        configured_tags = spec.config.get("tags") or []
        if not isinstance(configured_tags, list):
            configured_tags = []
        return AddTagsHook(hook_points=spec.hook_points, tags=configured_tags)

    if requested_id == DenyAllHook.hook_id:
        return DenyAllHook(hook_points=spec.hook_points)

    raise KeyError(f"Unknown hook_id {spec.hook_id!r}")
|