biblicus 0.1.1__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,185 @@
1
+ """
2
+ Structured hook execution logging.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import json
8
+ import uuid
9
+ from pathlib import Path
10
+ from typing import Any, Dict, Optional
11
+ from urllib.parse import urlparse, urlunparse
12
+
13
+ from pydantic import BaseModel, ConfigDict, Field
14
+
15
+ from .hooks import HookPoint
16
+ from .time import utc_now_iso
17
+
18
+
19
def new_operation_id() -> str:
    """
    Generate a fresh identifier used to group hook log records.

    :return: Random version 4 universally unique identifier in its text form.
    :rtype: str
    """

    identifier = uuid.uuid4()
    return str(identifier)
28
+
29
+
30
def redact_source_uri(source_uri: str) -> str:
    """
    Redact sensitive components from a source uniform resource identifier.

    Only the user information part of the network location (``user:password@``)
    is removed. Path, params, query, and fragment are passed back through
    ``urlunparse`` as parsed. A value without a scheme is returned unchanged.

    :param source_uri: Source uniform resource identifier.
    :type source_uri: str
    :return: Redacted source uniform resource identifier.
    :rtype: str
    """

    parsed = urlparse(source_uri)
    if not parsed.scheme:
        return source_uri

    # The host starts after the LAST "@": a password may itself contain "@",
    # and cutting at the first one would leave the tail of the password in
    # the supposedly redacted value. With no "@" present, rpartition yields
    # the network location untouched.
    host_only = parsed.netloc.rpartition("@")[2]

    return urlunparse(parsed._replace(netloc=host_only))
59
+
60
+
61
class HookLogEntry(BaseModel):
    """
    One structured record describing a single hook execution.

    Unknown fields are rejected (``extra="forbid"``).

    :ivar operation_id: Identifier of the enclosing command or call.
    :vartype operation_id: str
    :ivar hook_point: Hook point at which the hook executed.
    :vartype hook_point: HookPoint
    :ivar hook_id: Identifier of the hook implementation.
    :vartype hook_id: str
    :ivar recorded_at: International Organization for Standardization 8601 timestamp of record creation.
    :vartype recorded_at: str
    :ivar status: Non-empty execution status string.
    :vartype status: str
    :ivar message: Optional message describing the execution result.
    :vartype message: str or None
    :ivar item_id: Optional item identifier.
    :vartype item_id: str or None
    :ivar relpath: Optional relative path associated with an item.
    :vartype relpath: str or None
    :ivar source_uri: Optional redacted source uniform resource identifier.
    :vartype source_uri: str or None
    :ivar details: Optional structured details about changes; defaults to an empty mapping.
    :vartype details: dict[str, Any]
    """

    model_config = ConfigDict(extra="forbid")

    # Required identification of the execution.
    operation_id: str
    hook_point: HookPoint
    hook_id: str
    recorded_at: str
    status: str = Field(min_length=1)

    # Optional descriptive fields.
    message: Optional[str] = None
    item_id: Optional[str] = None
    relpath: Optional[str] = None
    source_uri: Optional[str] = None
    details: Dict[str, Any] = Field(default_factory=dict)
99
+
100
+
101
class HookLogger:
    """
    Append-only writer of hook execution records in JSON lines format.

    Records for one operation go to ``<log_dir>/<operation_id>.jsonl``.

    :ivar log_dir: Directory where log files are written.
    :vartype log_dir: Path
    :ivar operation_id: Operation identifier for grouping records.
    :vartype operation_id: str
    """

    def __init__(self, *, log_dir: Path, operation_id: str):
        """
        Store the target directory and the operation identifier.

        Nothing is created on disk here; the directory is created on the
        first call to :meth:`record`.

        :param log_dir: Log directory to write into.
        :type log_dir: Path
        :param operation_id: Operation identifier for grouping records.
        :type operation_id: str
        """

        self.operation_id = operation_id
        self.log_dir = log_dir

    @property
    def path(self) -> Path:
        """
        Compute the log file path for this operation.

        :return: ``log_dir`` joined with ``<operation_id>.jsonl``.
        :rtype: Path
        """

        filename = f"{self.operation_id}.jsonl"
        return self.log_dir / filename

    def record(
        self,
        *,
        hook_point: HookPoint,
        hook_id: str,
        status: str,
        message: Optional[str] = None,
        item_id: Optional[str] = None,
        relpath: Optional[str] = None,
        source_uri: Optional[str] = None,
        details: Optional[Dict[str, Any]] = None,
    ) -> None:
        """
        Validate one hook log record and append it as a single JSON line.

        The log directory is created when missing. A non-empty ``source_uri``
        is passed through ``redact_source_uri`` before being stored.

        :param hook_point: Hook point that executed.
        :type hook_point: HookPoint
        :param hook_id: Hook identifier.
        :type hook_id: str
        :param status: Status string such as ok, denied, or error.
        :type status: str
        :param message: Optional message describing results.
        :type message: str or None
        :param item_id: Optional item identifier.
        :type item_id: str or None
        :param relpath: Optional relative path for the item.
        :type relpath: str or None
        :param source_uri: Optional source uniform resource identifier.
        :type source_uri: str or None
        :param details: Optional structured details; copied before storing.
        :type details: dict[str, Any] or None
        :return: None.
        :rtype: None
        """

        self.log_dir.mkdir(parents=True, exist_ok=True)

        fields: Dict[str, Any] = {
            "operation_id": self.operation_id,
            "hook_point": hook_point,
            "hook_id": hook_id,
            "recorded_at": utc_now_iso(),
            "status": status,
            "message": message,
            "item_id": item_id,
            "relpath": relpath,
            "source_uri": redact_source_uri(source_uri) if source_uri else None,
            "details": dict(details or {}),
        }
        payload = HookLogEntry(**fields).model_dump()

        # Serialize before opening the file so a serialization failure does
        # not touch the log.
        serialized = json.dumps(payload, sort_keys=False)
        with self.path.open("a", encoding="utf-8") as handle:
            handle.write(serialized + "\n")
@@ -0,0 +1,205 @@
1
+ """
2
+ Hook manager for executing configured lifecycle hooks.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from pathlib import Path
8
+ from typing import Any, Dict, Iterable, List, Optional
9
+
10
+ from pydantic import BaseModel
11
+
12
+ from .constants import CORPUS_DIR_NAME, HOOK_LOGS_DIR_NAME
13
+ from .hook_logging import HookLogger, new_operation_id
14
+ from .hooks import (
15
+ HookContext,
16
+ HookPoint,
17
+ HookSpec,
18
+ IngestHookContext,
19
+ IngestMutation,
20
+ LifecycleHook,
21
+ build_builtin_hook,
22
+ )
23
+ from .time import utc_now_iso
24
+
25
+
26
class HookManager:
    """
    Hook manager that executes configured hooks and records execution.

    :ivar corpus_uri: Canonical uniform resource identifier for the corpus.
    :vartype corpus_uri: str
    :ivar log_dir: Directory where hook logs are recorded.
    :vartype log_dir: Path
    :ivar operation_id: Identifier for this hook execution session.
    :vartype operation_id: str
    """

    def __init__(
        self,
        *,
        corpus_uri: str,
        log_dir: Path,
        hooks: Iterable[LifecycleHook],
        operation_id: Optional[str] = None,
    ):
        """
        Initialize a hook manager.

        :param corpus_uri: Canonical uniform resource identifier for the corpus.
        :type corpus_uri: str
        :param log_dir: Directory where hook logs are written.
        :type log_dir: Path
        :param hooks: Hook instances to execute; materialized into a list.
        :type hooks: Iterable[LifecycleHook]
        :param operation_id: Optional operation identifier override. A new
            identifier is generated when this is omitted or empty.
        :type operation_id: str or None
        """

        self.corpus_uri = corpus_uri
        self.log_dir = log_dir
        self.operation_id = operation_id or new_operation_id()
        self._hooks = list(hooks)
        self._logger = HookLogger(log_dir=self.log_dir, operation_id=self.operation_id)

    @classmethod
    def from_config(cls, *, corpus_root: Path, corpus_uri: str, hook_specs: Iterable[HookSpec]) -> "HookManager":
        """
        Build a hook manager from config data.

        Logs are placed under ``corpus_root / CORPUS_DIR_NAME / HOOK_LOGS_DIR_NAME``.

        :param corpus_root: Corpus root directory.
        :type corpus_root: Path
        :param corpus_uri: Canonical uniform resource identifier for the corpus.
        :type corpus_uri: str
        :param hook_specs: Hook specifications loaded from config.
        :type hook_specs: Iterable[HookSpec]
        :return: Hook manager.
        :rtype: HookManager
        :raises KeyError: If a hook identifier is unknown.
        """

        log_dir = corpus_root / CORPUS_DIR_NAME / HOOK_LOGS_DIR_NAME
        hooks: List[LifecycleHook] = [build_builtin_hook(spec) for spec in hook_specs]
        return cls(corpus_uri=corpus_uri, log_dir=log_dir, hooks=hooks)

    def run_ingest_hooks(
        self,
        *,
        hook_point: HookPoint,
        filename: Optional[str],
        media_type: str,
        title: Optional[str],
        tags: List[str],
        metadata: Dict[str, Any],
        source_uri: str,
        item_id: Optional[str] = None,
        relpath: Optional[str] = None,
    ) -> IngestMutation:
        """
        Run ingestion hooks for a hook point.

        Every eligible hook receives the same context, built once before the
        first hook runs. Each execution is written to the hook log with status
        ``ok`` or ``denied``. The first denial is logged and stops processing.

        :param hook_point: Hook point to execute.
        :type hook_point: HookPoint
        :param filename: Suggested filename.
        :type filename: str or None
        :param media_type: Media type for the item.
        :type media_type: str
        :param title: Optional title.
        :type title: str or None
        :param tags: Tags associated with the item.
        :type tags: list[str]
        :param metadata: Metadata mapping.
        :type metadata: dict[str, Any]
        :param source_uri: Source uniform resource identifier.
        :type source_uri: str
        :param item_id: Optional item identifier.
        :type item_id: str or None
        :param relpath: Optional relative path.
        :type relpath: str or None
        :return: Combined ingestion mutation result whose ``add_tags`` holds the
            tags from all hooks, de-duplicated in first-seen order.
        :rtype: IngestMutation
        :raises ValueError: If ingestion is denied by a hook, or a hook fails
            or returns a non-Pydantic result.
        """

        # Copy the caller's list and mapping so the context never aliases them.
        context = IngestHookContext(
            hook_point=hook_point,
            operation_id=self.operation_id,
            corpus_uri=self.corpus_uri,
            created_at=utc_now_iso(),
            filename=filename,
            media_type=media_type,
            title=title,
            tags=list(tags),
            metadata=dict(metadata),
            source_uri=source_uri,
            item_id=item_id,
            relpath=relpath,
        )

        combined = IngestMutation()
        for hook in self._hooks_for_point(hook_point):
            result_dict = self._run_single(hook=hook, context=context)
            mutation = IngestMutation.model_validate(result_dict)
            if mutation.deny:
                self._logger.record(
                    hook_point=hook_point,
                    hook_id=hook.hook_id,
                    status="denied",
                    message=mutation.deny_reason or mutation.message,
                    item_id=item_id,
                    relpath=relpath,
                    source_uri=source_uri,
                    details={"add_tags": mutation.add_tags},
                )
                raise ValueError(mutation.deny_reason or "Ingest denied")
            combined.add_tags.extend(mutation.add_tags)
            self._logger.record(
                hook_point=hook_point,
                hook_id=hook.hook_id,
                status="ok",
                message=mutation.message,
                item_id=item_id,
                relpath=relpath,
                source_uri=source_uri,
                details={"add_tags": mutation.add_tags},
            )

        # dict.fromkeys keeps the first occurrence of each tag in insertion
        # order, giving an order-preserving de-duplication in one pass.
        combined.add_tags = list(dict.fromkeys(combined.add_tags))
        return combined

    def _hooks_for_point(self, hook_point: HookPoint) -> List[LifecycleHook]:
        """
        Select the configured hooks that apply to a hook point.

        Hooks without a ``hook_points`` attribute are treated as applying to
        no hook point. Configuration order is preserved.

        :param hook_point: Hook point to match.
        :type hook_point: HookPoint
        :return: Hooks whose ``hook_points`` contain ``hook_point``.
        :rtype: list[LifecycleHook]
        """

        return [hook for hook in self._hooks if hook_point in getattr(hook, "hook_points", [])]

    def _run_single(self, *, hook: LifecycleHook, context: HookContext) -> Dict[str, Any]:
        """
        Run a single hook with error capture.

        :param hook: Hook to execute.
        :type hook: LifecycleHook
        :param context: Hook context.
        :type context: HookContext
        :return: Hook result mapping produced by ``model_dump``.
        :rtype: dict[str, Any]
        :raises ValueError: If a hook raises an exception or returns something
            that is not a Pydantic model.
        """

        try:
            result = hook.run(context)
        except Exception as exc:
            # Surface every hook failure as one exception type, keeping the
            # original exception as the cause.
            raise ValueError(f"Hook {hook.hook_id!r} failed: {exc}") from exc
        if not isinstance(result, BaseModel):
            raise ValueError(f"Hook {hook.hook_id!r} returned a non-Pydantic result")
        return result.model_dump()
biblicus/hooks.py ADDED
@@ -0,0 +1,265 @@
1
+ """
2
+ Lifecycle hook interfaces and built-in hook implementations.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from enum import Enum
8
+ from typing import Any, Dict, List, Optional, Sequence
9
+
10
+ from pydantic import BaseModel, ConfigDict, Field
11
+
12
+
13
class HookPoint(str, Enum):
    """
    Canonical lifecycle hook points for corpus operations.

    Each member is also a ``str`` whose value equals its name.

    :cvar before_ingest: Runs before an item is ingested.
    :cvar after_ingest: Runs after an item is ingested and indexed.
    :cvar before_reindex: Runs before a catalog rebuild starts.
    :cvar after_reindex: Runs after a catalog rebuild completes.
    :cvar before_build_run: Runs before a backend run build starts.
    :cvar after_build_run: Runs after a backend run build completes.
    :cvar before_query: Runs before a query is executed.
    :cvar after_query: Runs after a query completes.
    :cvar before_evaluate_run: Runs before an evaluation starts.
    :cvar after_evaluate_run: Runs after an evaluation completes.
    """

    # Ingestion
    before_ingest = "before_ingest"
    after_ingest = "after_ingest"

    # Catalog rebuild
    before_reindex = "before_reindex"
    after_reindex = "after_reindex"

    # Backend run build
    before_build_run = "before_build_run"
    after_build_run = "after_build_run"

    # Query
    before_query = "before_query"
    after_query = "after_query"

    # Evaluation
    before_evaluate_run = "before_evaluate_run"
    after_evaluate_run = "after_evaluate_run"
39
+
40
+
41
class HookSpec(BaseModel):
    """
    Hook specification as stored on disk in a corpus config.

    Unknown fields are rejected (``extra="forbid"``).

    :ivar hook_id: Non-empty identifier used to locate a hook implementation.
    :vartype hook_id: str
    :ivar hook_points: Hook points where the hook executes; empty by default.
    :vartype hook_points: list[HookPoint]
    :ivar config: Hook-specific configuration values; empty by default.
    :vartype config: dict[str, Any]
    """

    model_config = ConfigDict(extra="forbid")

    hook_id: str = Field(min_length=1)
    hook_points: List[HookPoint] = Field(default_factory=list)
    config: Dict[str, Any] = Field(default_factory=dict)
58
+
59
+
60
class HookContext(BaseModel):
    """
    Common context handed to every hook.

    Unknown fields are rejected (``extra="forbid"``).

    :ivar hook_point: Hook point currently executing.
    :vartype hook_point: HookPoint
    :ivar operation_id: Identifier of the enclosing command or call.
    :vartype operation_id: str
    :ivar corpus_uri: Canonical uniform resource identifier for the corpus.
    :vartype corpus_uri: str
    :ivar created_at: International Organization for Standardization 8601 timestamp of context creation.
    :vartype created_at: str
    """

    model_config = ConfigDict(extra="forbid")

    hook_point: HookPoint
    operation_id: str
    corpus_uri: str
    created_at: str
80
+
81
+
82
class IngestHookContext(HookContext):
    """
    Context for ingestion hooks, extending the base hook context.

    :ivar filename: Suggested filename for the item.
    :vartype filename: str or None
    :ivar media_type: Media type for the item.
    :vartype media_type: str
    :ivar title: Optional title associated with the item.
    :vartype title: str or None
    :ivar tags: Tags associated with the item; empty by default.
    :vartype tags: list[str]
    :ivar metadata: Metadata mapping associated with the item; empty by default.
    :vartype metadata: dict[str, Any]
    :ivar source_uri: Source uniform resource identifier.
    :vartype source_uri: str
    :ivar item_id: Item identifier when available.
    :vartype item_id: str or None
    :ivar relpath: Relative path to stored raw bytes when available.
    :vartype relpath: str or None
    """

    # Description of the item being ingested.
    filename: Optional[str] = None
    media_type: str
    title: Optional[str] = None
    tags: List[str] = Field(default_factory=list)
    metadata: Dict[str, Any] = Field(default_factory=dict)
    source_uri: str

    # Storage details, present only when available.
    item_id: Optional[str] = None
    relpath: Optional[str] = None
112
+
113
+
114
class HookResult(BaseModel):
    """
    Base result returned by a hook.

    Unknown fields are rejected (``extra="forbid"``).

    :ivar message: Optional human-readable message.
    :vartype message: str or None
    """

    model_config = ConfigDict(extra="forbid")

    message: Optional[str] = None
125
+
126
+
127
class IngestMutation(HookResult):
    """
    Hook result describing how an ingestion should be changed or rejected.

    :ivar deny: Whether ingest should be denied; ``False`` by default.
    :vartype deny: bool
    :ivar deny_reason: Optional reason for denial.
    :vartype deny_reason: str or None
    :ivar add_tags: Tags to add; empty by default.
    :vartype add_tags: list[str]
    """

    # Denial
    deny: bool = False
    deny_reason: Optional[str] = None

    # Additions
    add_tags: List[str] = Field(default_factory=list)
142
+
143
+
144
class LifecycleHook:
    """
    Base class for lifecycle hook implementations.

    Concrete hooks supply ``hook_id`` and ``hook_points`` and override :meth:`run`.

    :ivar hook_id: Hook implementation identifier.
    :vartype hook_id: str
    :ivar hook_points: Hook points where the hook applies.
    :vartype hook_points: Sequence[HookPoint]
    """

    hook_id: str
    hook_points: Sequence[HookPoint]

    def run(self, context: HookContext) -> HookResult:
        """
        Execute the hook.

        The base implementation ignores ``context`` and always raises.

        :param context: Validated hook context.
        :type context: HookContext
        :return: Hook result. Concrete hook points may require a more specific result type.
        :rtype: HookResult
        :raises NotImplementedError: If the hook does not implement run.
        """

        _ = context
        reason = "LifecycleHook.run must be implemented by concrete hooks"
        raise NotImplementedError(reason)
170
+
171
+
172
class AddTagsHook:
    """
    Built-in hook that contributes a fixed set of tags during ingestion.

    :ivar hook_id: Hook identifier, ``"add-tags"``.
    :vartype hook_id: str
    :ivar hook_points: Hook points where the hook applies.
    :vartype hook_points: list[HookPoint]
    :ivar tags: Cleaned tags to add.
    :vartype tags: list[str]
    """

    hook_id = "add-tags"

    def __init__(self, *, hook_points: Sequence[HookPoint], tags: Sequence[str]):
        """
        Store the hook points and the cleaned tag list.

        Non-string entries and entries that are empty after stripping
        whitespace are dropped. Remaining tags are stored stripped, in input
        order; duplicates are kept.

        :param hook_points: Hook points where the hook runs.
        :type hook_points: Sequence[HookPoint]
        :param tags: Tags to add.
        :type tags: Sequence[str]
        """

        self.hook_points = list(hook_points)

        cleaned: List[str] = []
        for candidate in tags:
            if not isinstance(candidate, str):
                continue
            stripped = candidate.strip()
            if stripped:
                cleaned.append(stripped)
        self.tags = cleaned

    def run(self, context: HookContext) -> HookResult:
        """
        Produce a mutation that adds the configured tags.

        The context is not inspected.

        :param context: Hook context.
        :type context: HookContext
        :return: Ingest mutation carrying a copy of the configured tags.
        :rtype: HookResult
        """

        _ = context
        tags_to_add = list(self.tags)
        return IngestMutation(add_tags=tags_to_add)
211
+
212
+
213
class DenyAllHook:
    """
    Built-in hook that rejects every ingest it is run for.

    :ivar hook_id: Hook identifier, ``"deny-all"``.
    :vartype hook_id: str
    :ivar hook_points: Hook points where the hook applies.
    :vartype hook_points: list[HookPoint]
    """

    hook_id = "deny-all"

    def __init__(self, *, hook_points: Sequence[HookPoint]):
        """
        Store a list copy of the hook points.

        :param hook_points: Hook points where the hook runs.
        :type hook_points: Sequence[HookPoint]
        """

        self.hook_points = [*hook_points]

    def run(self, context: HookContext) -> HookResult:
        """
        Produce a denial result.

        The context is not inspected.

        :param context: Hook context.
        :type context: HookContext
        :return: Ingest mutation with ``deny`` set and a fixed denial reason.
        :rtype: HookResult
        """

        _ = context
        denial = IngestMutation(deny=True, deny_reason="Ingest denied by deny-all hook")
        return denial
247
+
248
+
249
def build_builtin_hook(spec: HookSpec) -> LifecycleHook:
    """
    Instantiate the built-in hook named by a hook specification.

    For the add-tags hook, tags come from ``spec.config["tags"]``. A missing,
    falsy, or non-list value results in no tags.

    :param spec: Hook specification.
    :type spec: HookSpec
    :return: Hook instance.
    :rtype: LifecycleHook
    :raises KeyError: If the hook identifier is unknown.
    """

    requested = spec.hook_id

    if requested == DenyAllHook.hook_id:
        return DenyAllHook(hook_points=spec.hook_points)

    if requested != AddTagsHook.hook_id:
        raise KeyError(f"Unknown hook_id {spec.hook_id!r}")

    configured = spec.config.get("tags") or []
    if not isinstance(configured, list):
        configured = []
    return AddTagsHook(hook_points=spec.hook_points, tags=configured)