biblicus-0.6.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +30 -0
- biblicus/__main__.py +8 -0
- biblicus/_vendor/dotyaml/__init__.py +14 -0
- biblicus/_vendor/dotyaml/interpolation.py +63 -0
- biblicus/_vendor/dotyaml/loader.py +181 -0
- biblicus/_vendor/dotyaml/transformer.py +135 -0
- biblicus/backends/__init__.py +42 -0
- biblicus/backends/base.py +65 -0
- biblicus/backends/scan.py +375 -0
- biblicus/backends/sqlite_full_text_search.py +487 -0
- biblicus/cli.py +804 -0
- biblicus/constants.py +12 -0
- biblicus/context.py +183 -0
- biblicus/corpus.py +1531 -0
- biblicus/crawl.py +186 -0
- biblicus/errors.py +15 -0
- biblicus/evaluation.py +257 -0
- biblicus/evidence_processing.py +201 -0
- biblicus/extraction.py +531 -0
- biblicus/extractors/__init__.py +44 -0
- biblicus/extractors/base.py +68 -0
- biblicus/extractors/metadata_text.py +106 -0
- biblicus/extractors/openai_stt.py +180 -0
- biblicus/extractors/pass_through_text.py +84 -0
- biblicus/extractors/pdf_text.py +100 -0
- biblicus/extractors/pipeline.py +105 -0
- biblicus/extractors/rapidocr_text.py +129 -0
- biblicus/extractors/select_longest_text.py +105 -0
- biblicus/extractors/select_text.py +100 -0
- biblicus/extractors/unstructured_text.py +100 -0
- biblicus/frontmatter.py +89 -0
- biblicus/hook_logging.py +180 -0
- biblicus/hook_manager.py +203 -0
- biblicus/hooks.py +261 -0
- biblicus/ignore.py +64 -0
- biblicus/knowledge_base.py +191 -0
- biblicus/models.py +445 -0
- biblicus/retrieval.py +133 -0
- biblicus/sources.py +212 -0
- biblicus/time.py +17 -0
- biblicus/uris.py +63 -0
- biblicus/user_config.py +138 -0
- biblicus-0.6.0.dist-info/METADATA +533 -0
- biblicus-0.6.0.dist-info/RECORD +48 -0
- biblicus-0.6.0.dist-info/WHEEL +5 -0
- biblicus-0.6.0.dist-info/entry_points.txt +2 -0
- biblicus-0.6.0.dist-info/licenses/LICENSE +21 -0
- biblicus-0.6.0.dist-info/top_level.txt +1 -0
biblicus/corpus.py
ADDED
@@ -0,0 +1,1531 @@
"""
Corpus storage and ingestion for Biblicus.
"""

from __future__ import annotations

import hashlib
import json
import mimetypes
import shutil
import uuid
from pathlib import Path
from typing import Any, Dict, List, Optional, Sequence

import yaml
from pydantic import ValidationError

from .constants import (
    CORPUS_DIR_NAME,
    DEFAULT_RAW_DIR,
    EXTRACTION_RUNS_DIR_NAME,
    RUNS_DIR_NAME,
    SCHEMA_VERSION,
    SIDECAR_SUFFIX,
)
from .frontmatter import parse_front_matter, render_front_matter
from .hook_manager import HookManager
from .hooks import HookPoint
from .ignore import load_corpus_ignore_spec
from .models import (
    CatalogItem,
    CorpusCatalog,
    CorpusConfig,
    ExtractionRunListEntry,
    IngestResult,
    RetrievalRun,
)
from .sources import load_source
from .time import utc_now_iso
from .uris import corpus_ref_to_path, normalize_corpus_uri


def _sha256_bytes(data: bytes) -> str:
    """
    Compute a Secure Hash Algorithm 256 digest for byte content.

    :param data: Input bytes.
    :type data: bytes
    :return: Secure Hash Algorithm 256 hex digest.
    :rtype: str
    """
    return hashlib.sha256(data).hexdigest()


def _write_stream_and_hash(
    stream, destination_path: Path, *, chunk_size: int = 1024 * 1024
) -> Dict[str, object]:
    """
    Write a binary stream to disk while computing a digest.

    :param stream: Binary stream to read from.
    :type stream: object
    :param destination_path: Destination path to write to.
    :type destination_path: Path
    :param chunk_size: Chunk size for reads.
    :type chunk_size: int
    :return: Mapping containing sha256 and bytes_written.
    :rtype: dict[str, object]
    :raises OSError: If the destination cannot be written.
    """
    hasher = hashlib.sha256()
    bytes_written = 0
    with destination_path.open("wb") as destination_handle:
        while True:
            chunk = stream.read(chunk_size)
            if not chunk:
                break
            hasher.update(chunk)
            destination_handle.write(chunk)
            bytes_written += len(chunk)
    return {"sha256": hasher.hexdigest(), "bytes_written": bytes_written}


def _sanitize_filename(name: str) -> str:
    """
    Sanitize a filename into a portable, filesystem-friendly form.

    :param name: Raw filename.
    :type name: str
    :return: Sanitized filename.
    :rtype: str
    """
    allowed_characters = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-._() ")
    sanitized_name = "".join(
        (character if character in allowed_characters else "_") for character in name
    ).strip()
    return sanitized_name or "file"


def _preferred_extension_for_media_type(media_type: str) -> Optional[str]:
    """
    Return a preferred filename extension for a media type.

    :param media_type: Internet Assigned Numbers Authority media type.
    :type media_type: str
    :return: Preferred extension or None.
    :rtype: str or None
    """
    media_type_overrides = {
        "image/jpeg": ".jpg",
        "audio/ogg": ".ogg",
    }
    if media_type in media_type_overrides:
        return media_type_overrides[media_type]
    return mimetypes.guess_extension(media_type)


def _ensure_filename_extension(filename: str, *, media_type: str) -> str:
    """
    Ensure a usable filename extension for a media type.

    :param filename: Raw filename.
    :type filename: str
    :param media_type: Internet Assigned Numbers Authority media type.
    :type media_type: str
    :return: Filename with a compatible extension.
    :rtype: str
    """
    raw_name = filename.strip()

    if media_type == "text/markdown":
        if raw_name.lower().endswith((".md", ".markdown")):
            return raw_name
        return raw_name + ".md"

    if Path(raw_name).suffix:
        return raw_name

    ext = _preferred_extension_for_media_type(media_type)
    if not ext:
        return raw_name
    return raw_name + ext


def _merge_tags(explicit: Sequence[str], from_frontmatter: Any) -> List[str]:
    """
    Merge tags from explicit input and front matter values.

    :param explicit: Explicit tags provided by callers.
    :type explicit: Sequence[str]
    :param from_frontmatter: Tags from front matter.
    :type from_frontmatter: Any
    :return: Deduplicated tag list preserving order.
    :rtype: list[str]
    """
    merged_tags: List[str] = []

    for explicit_tag in explicit:
        cleaned_tag = explicit_tag.strip()
        if cleaned_tag:
            merged_tags.append(cleaned_tag)

    if isinstance(from_frontmatter, str):
        merged_tags.append(from_frontmatter)
    elif isinstance(from_frontmatter, list):
        for item in from_frontmatter:
            if isinstance(item, str) and item.strip():
                merged_tags.append(item.strip())

    seen_tags = set()
    deduplicated_tags: List[str] = []
    for tag_value in merged_tags:
        if tag_value not in seen_tags:
            seen_tags.add(tag_value)
            deduplicated_tags.append(tag_value)
    return deduplicated_tags


def _sidecar_path_for(content_path: Path) -> Path:
    """
    Compute the sidecar metadata path for a content file.

    :param content_path: Path to the content file.
    :type content_path: Path
    :return: Sidecar path.
    :rtype: Path
    """
    return content_path.with_name(content_path.name + SIDECAR_SUFFIX)


def _load_sidecar(content_path: Path) -> Dict[str, Any]:
    """
    Load sidecar metadata for a content file.

    :param content_path: Path to the content file.
    :type content_path: Path
    :return: Parsed sidecar metadata.
    :rtype: dict[str, Any]
    :raises ValueError: If the sidecar content is not a mapping.
    """
    path = _sidecar_path_for(content_path)
    if not path.is_file():
        return {}
    data = yaml.safe_load(path.read_text(encoding="utf-8")) or {}
    if not isinstance(data, dict):
        raise ValueError(f"Sidecar metadata must be a mapping/object: {path}")
    return dict(data)


def _write_sidecar(content_path: Path, metadata: Dict[str, Any]) -> None:
    """
    Write a sidecar metadata file.

    :param content_path: Path to the content file.
    :type content_path: Path
    :param metadata: Metadata to serialize.
    :type metadata: dict[str, Any]
    :return: None.
    :rtype: None
    """
    path = _sidecar_path_for(content_path)
    text = yaml.safe_dump(
        metadata,
        sort_keys=False,
        allow_unicode=True,
        default_flow_style=False,
    ).strip()
    path.write_text(text + "\n", encoding="utf-8")


def _ensure_biblicus_block(
    metadata: Dict[str, Any], *, item_id: str, source_uri: str
) -> Dict[str, Any]:
    """
    Ensure the biblicus metadata block exists and is populated.

    :param metadata: Existing metadata.
    :type metadata: dict[str, Any]
    :param item_id: Item identifier to store.
    :type item_id: str
    :param source_uri: Source uniform resource identifier to store.
    :type source_uri: str
    :return: Updated metadata mapping.
    :rtype: dict[str, Any]
    """
    updated_metadata = dict(metadata)
    existing_biblicus = updated_metadata.get("biblicus")
    if not isinstance(existing_biblicus, dict):
        existing_biblicus = {}
    biblicus_block = dict(existing_biblicus)
    biblicus_block["id"] = item_id
    biblicus_block["source"] = source_uri
    updated_metadata["biblicus"] = biblicus_block
    return updated_metadata


def _parse_uuid_prefix(filename: str) -> Optional[str]:
    """
    Extract a universally unique identifier prefix from a filename, if present.

    :param filename: Filename to inspect.
    :type filename: str
    :return: Universally unique identifier string or None.
    :rtype: str or None
    """
    if len(filename) < 36:
        return None
    prefix = filename[:36]
    try:
        return str(uuid.UUID(prefix))
    except ValueError:
        return None


def _merge_metadata(front: Dict[str, Any], side: Dict[str, Any]) -> Dict[str, Any]:
    """
    Merge front matter and sidecar metadata.

    :param front: Front matter metadata.
    :type front: dict[str, Any]
    :param side: Sidecar metadata.
    :type side: dict[str, Any]
    :return: Merged metadata.
    :rtype: dict[str, Any]
    """
    merged_metadata: Dict[str, Any] = dict(front)

    front_biblicus = merged_metadata.get("biblicus")
    sidecar_biblicus = side.get("biblicus")
    if isinstance(front_biblicus, dict) or isinstance(sidecar_biblicus, dict):
        merged_biblicus: Dict[str, Any] = {}
        if isinstance(front_biblicus, dict):
            merged_biblicus.update(front_biblicus)
        if isinstance(sidecar_biblicus, dict):
            merged_biblicus.update(sidecar_biblicus)
        merged_metadata["biblicus"] = merged_biblicus

    merged_tags = _merge_tags(_merge_tags([], front.get("tags")), side.get("tags"))
    if merged_tags:
        merged_metadata["tags"] = merged_tags

    for metadata_key, metadata_value in side.items():
        if metadata_key in {"biblicus", "tags"}:
            continue
        merged_metadata[metadata_key] = metadata_value

    return merged_metadata


class Corpus:
    """
    Local corpus manager for Biblicus.

    :ivar root: Corpus root directory.
    :vartype root: Path
    :ivar meta_dir: Metadata directory under the corpus root.
    :vartype meta_dir: Path
    :ivar raw_dir: Raw item directory under the corpus root.
    :vartype raw_dir: Path
    :ivar config: Parsed corpus config, if present.
    :vartype config: CorpusConfig or None
    """

    def __init__(self, root: Path):
        """
        Initialize a corpus wrapper around a filesystem path.

        :param root: Corpus root directory.
        :type root: Path
        """
        self.root = root
        self.meta_dir = self.root / CORPUS_DIR_NAME
        self.raw_dir = self.root / DEFAULT_RAW_DIR
        self.config = self._load_config()
        self._hooks = self._load_hooks()

    @property
    def uri(self) -> str:
        """
        Return the canonical uniform resource identifier for the corpus root.

        :return: Corpus uniform resource identifier.
        :rtype: str
        """
        return self.root.as_uri()

    def _load_config(self) -> Optional[CorpusConfig]:
        """
        Load the corpus config if it exists.

        :return: Parsed corpus config or None.
        :rtype: CorpusConfig or None
        :raises ValueError: If the config schema is invalid.
        """
        path = self.meta_dir / "config.json"
        if not path.is_file():
            return None
        data = json.loads(path.read_text(encoding="utf-8"))
        try:
            return CorpusConfig.model_validate(data)
        except ValidationError as exc:
            has_hook_error = any(
                isinstance(error.get("loc"), tuple)
                and error.get("loc")
                and error.get("loc")[0] == "hooks"
                for error in exc.errors()
            )
            if has_hook_error:
                raise ValueError(f"Invalid hook specification: {exc}") from exc
            raise ValueError(f"Invalid corpus config: {exc}") from exc

    def _load_hooks(self) -> Optional[HookManager]:
        """
        Load the hook manager from config if hooks are configured.

        :return: Hook manager or None.
        :rtype: HookManager or None
        :raises ValueError: If hook specifications are invalid.
        """
        if self.config is None or not self.config.hooks:
            return None
        return HookManager.from_config(
            corpus_root=self.root,
            corpus_uri=self.uri,
            hook_specs=self.config.hooks,
        )

    @classmethod
    def find(cls, start: Path) -> "Corpus":
        """
        Locate a corpus by searching upward from a path.

        :param start: Starting path to search.
        :type start: Path
        :return: Located corpus instance.
        :rtype: Corpus
        :raises FileNotFoundError: If no corpus config is found.
        """
        start = start.resolve()
        for candidate in [start, *start.parents]:
            if (candidate / CORPUS_DIR_NAME / "config.json").is_file():
                return cls(candidate)
        raise FileNotFoundError(
            f"Not a Biblicus corpus (no {CORPUS_DIR_NAME}/config.json found from {start})"
        )

    @classmethod
    def open(cls, ref: str | Path) -> "Corpus":
        """
        Open a corpus from a path or uniform resource identifier reference.

        :param ref: Filesystem path or file:// uniform resource identifier.
        :type ref: str or Path
        :return: Opened corpus instance.
        :rtype: Corpus
        """
        return cls.find(corpus_ref_to_path(ref))

    @classmethod
    def init(cls, root: Path, *, force: bool = False) -> "Corpus":
        """
        Initialize a new corpus on disk.

        :param root: Corpus root directory.
        :type root: Path
        :param force: Whether to overwrite existing config.
        :type force: bool
        :return: Initialized corpus instance.
        :rtype: Corpus
        :raises FileExistsError: If the corpus already exists and force is False.
        """
        root = root.resolve()
        corpus = cls(root)

        corpus.meta_dir.mkdir(parents=True, exist_ok=True)
        corpus.raw_dir.mkdir(parents=True, exist_ok=True)

        config_path = corpus.meta_dir / "config.json"
        if config_path.exists() and not force:
            raise FileExistsError(f"Corpus already exists at {root}")

        config = CorpusConfig(
            schema_version=SCHEMA_VERSION,
            created_at=utc_now_iso(),
            corpus_uri=normalize_corpus_uri(root),
            raw_dir=DEFAULT_RAW_DIR,
        )
        config_path.write_text(config.model_dump_json(indent=2) + "\n", encoding="utf-8")

        corpus._init_catalog()
        return corpus

    @property
    def catalog_path(self) -> Path:
        """
        Return the path to the corpus catalog file.

        :return: Catalog file path.
        :rtype: Path
        """
        return self.meta_dir / "catalog.json"

    def _init_catalog(self) -> None:
        """
        Initialize the catalog if it does not already exist.

        :return: None.
        :rtype: None
        """
        if self.catalog_path.exists():
            return
        catalog = CorpusCatalog(
            schema_version=SCHEMA_VERSION,
            generated_at=utc_now_iso(),
            corpus_uri=normalize_corpus_uri(self.root),
            raw_dir=DEFAULT_RAW_DIR,
            latest_run_id=None,
            items={},
            order=[],
        )
        self._write_catalog(catalog)

    def _load_catalog(self) -> CorpusCatalog:
        """
        Read and validate the corpus catalog file.

        :return: Parsed corpus catalog.
        :rtype: CorpusCatalog
        :raises FileNotFoundError: If the catalog file does not exist.
        :raises ValueError: If the catalog schema is invalid.
        """
        if not self.catalog_path.is_file():
            raise FileNotFoundError(f"Missing corpus catalog: {self.catalog_path}")
        catalog_data = json.loads(self.catalog_path.read_text(encoding="utf-8"))
        return CorpusCatalog.model_validate(catalog_data)

    def load_catalog(self) -> CorpusCatalog:
        """
        Load the current corpus catalog.

        :return: Parsed corpus catalog.
        :rtype: CorpusCatalog
        :raises FileNotFoundError: If the catalog file does not exist.
        :raises ValueError: If the catalog schema is invalid.
        """
        return self._load_catalog()

    def _write_catalog(self, catalog: CorpusCatalog) -> None:
        """
        Atomically write a corpus catalog to disk.

        :param catalog: Catalog to persist.
        :type catalog: CorpusCatalog
        :return: None.
        :rtype: None
        """
        temp_path = self.catalog_path.with_suffix(".json.tmp")
        temp_path.write_text(catalog.model_dump_json(indent=2) + "\n", encoding="utf-8")
        temp_path.replace(self.catalog_path)

    @property
    def runs_dir(self) -> Path:
        """
        Location of retrieval run manifests.

        :return: Path to the runs directory.
        :rtype: Path
        """
        return self.meta_dir / RUNS_DIR_NAME

    @property
    def extraction_runs_dir(self) -> Path:
        """
        Location of extraction run artifacts.

        :return: Path to the extraction runs directory.
        :rtype: Path
        """
        return self.runs_dir / EXTRACTION_RUNS_DIR_NAME

    def extraction_run_dir(self, *, extractor_id: str, run_id: str) -> Path:
        """
        Resolve an extraction run directory.

        :param extractor_id: Extractor plugin identifier.
        :type extractor_id: str
        :param run_id: Extraction run identifier.
        :type run_id: str
        :return: Extraction run directory.
        :rtype: Path
        """
        return self.extraction_runs_dir / extractor_id / run_id

    def read_extracted_text(self, *, extractor_id: str, run_id: str, item_id: str) -> Optional[str]:
        """
        Read extracted text for an item from an extraction run, when present.

        :param extractor_id: Extractor plugin identifier.
        :type extractor_id: str
        :param run_id: Extraction run identifier.
        :type run_id: str
        :param item_id: Item identifier.
        :type item_id: str
        :return: Extracted text or None if the artifact does not exist.
        :rtype: str or None
        :raises OSError: If the file exists but cannot be read.
        """
        path = (
            self.extraction_run_dir(extractor_id=extractor_id, run_id=run_id)
            / "text"
            / f"{item_id}.txt"
        )
        if not path.is_file():
            return None
        return path.read_text(encoding="utf-8")

    def load_extraction_run_manifest(self, *, extractor_id: str, run_id: str):
        """
        Load an extraction run manifest from the corpus.

        :param extractor_id: Extractor plugin identifier.
        :type extractor_id: str
        :param run_id: Extraction run identifier.
        :type run_id: str
        :return: Parsed extraction run manifest.
        :rtype: biblicus.extraction.ExtractionRunManifest
        :raises FileNotFoundError: If the manifest file does not exist.
        :raises ValueError: If the manifest data is invalid.
        """
        from .extraction import ExtractionRunManifest

        manifest_path = (
            self.extraction_run_dir(extractor_id=extractor_id, run_id=run_id) / "manifest.json"
        )
        if not manifest_path.is_file():
            raise FileNotFoundError(f"Missing extraction run manifest: {manifest_path}")
        data = json.loads(manifest_path.read_text(encoding="utf-8"))
        return ExtractionRunManifest.model_validate(data)

    def list_extraction_runs(self, *, extractor_id: Optional[str] = None) -> List[ExtractionRunListEntry]:
        """
        List extraction runs stored under the corpus.

        :param extractor_id: Optional extractor identifier filter.
        :type extractor_id: str or None
        :return: Summary list entries for each run.
        :rtype: list[biblicus.models.ExtractionRunListEntry]
        """
        runs_root = self.extraction_runs_dir
        if not runs_root.is_dir():
            return []

        extractor_dirs: List[Path]
        if extractor_id is None:
            extractor_dirs = [path for path in sorted(runs_root.iterdir()) if path.is_dir()]
        else:
            extractor_path = runs_root / extractor_id
            extractor_dirs = [extractor_path] if extractor_path.is_dir() else []

        entries: List[ExtractionRunListEntry] = []
        for extractor_dir in extractor_dirs:
            for run_dir in sorted(extractor_dir.iterdir()):
                if not run_dir.is_dir():
                    continue
                manifest_path = run_dir / "manifest.json"
                if not manifest_path.is_file():
                    continue
                try:
                    manifest = self.load_extraction_run_manifest(
                        extractor_id=extractor_dir.name,
                        run_id=run_dir.name,
                    )
                except (FileNotFoundError, ValueError):
                    continue
                entries.append(
                    ExtractionRunListEntry(
                        extractor_id=extractor_dir.name,
                        run_id=run_dir.name,
                        recipe_id=manifest.recipe.recipe_id,
                        recipe_name=manifest.recipe.name,
                        catalog_generated_at=manifest.catalog_generated_at,
                        created_at=manifest.created_at,
                        stats=dict(manifest.stats),
                    )
                )

        entries.sort(key=lambda entry: (entry.created_at, entry.extractor_id, entry.run_id), reverse=True)
        return entries

    def delete_extraction_run(self, *, extractor_id: str, run_id: str) -> None:
        """
        Delete an extraction run directory and its derived artifacts.

        :param extractor_id: Extractor plugin identifier.
        :type extractor_id: str
        :param run_id: Extraction run identifier.
        :type run_id: str
        :return: None.
        :rtype: None
        :raises FileNotFoundError: If the extraction run directory does not exist.
        """
        run_dir = self.extraction_run_dir(extractor_id=extractor_id, run_id=run_id)
        if not run_dir.is_dir():
            raise FileNotFoundError(f"Missing extraction run directory: {run_dir}")
        shutil.rmtree(run_dir)

    def _ensure_runs_dir(self) -> None:
        """
        Ensure the retrieval runs directory exists.

        :return: None.
        :rtype: None
        """
        self.runs_dir.mkdir(parents=True, exist_ok=True)

    def write_run(self, run: RetrievalRun) -> None:
        """
        Persist a retrieval run manifest and update the catalog pointer.

        :param run: Run manifest to persist.
        :type run: RetrievalRun
        :return: None.
        :rtype: None
        """
        self._ensure_runs_dir()
        path = self.runs_dir / f"{run.run_id}.json"
        path.write_text(run.model_dump_json(indent=2) + "\n", encoding="utf-8")
        catalog = self._load_catalog()
        catalog.latest_run_id = run.run_id
        catalog.generated_at = utc_now_iso()
        self._write_catalog(catalog)

    def load_run(self, run_id: str) -> RetrievalRun:
        """
        Load a retrieval run manifest by identifier.

        :param run_id: Run identifier.
        :type run_id: str
        :return: Parsed run manifest.
        :rtype: RetrievalRun
        :raises FileNotFoundError: If the run manifest does not exist.
        """
        path = self.runs_dir / f"{run_id}.json"
        if not path.is_file():
            raise FileNotFoundError(f"Missing run manifest: {path}")
        data = json.loads(path.read_text(encoding="utf-8"))
        return RetrievalRun.model_validate(data)

    @property
    def latest_run_id(self) -> Optional[str]:
        """
        Latest retrieval run identifier recorded in the catalog.

        :return: Latest run identifier or None.
        :rtype: str or None
        """
        return self._load_catalog().latest_run_id

    def _upsert_catalog_item(self, item: CatalogItem) -> None:
        """
        Upsert a catalog item and reset the latest run pointer.

        :param item: Catalog item to insert or update.
        :type item: CatalogItem
        :return: None.
        :rtype: None
        """
        self._init_catalog()
        catalog = self._load_catalog()
        catalog.items[item.id] = item

        ordered_ids = [item_id for item_id in catalog.order if item_id != item.id]
        ordered_ids.insert(0, item.id)
        catalog.order = ordered_ids
        catalog.generated_at = utc_now_iso()
        catalog.latest_run_id = None

        self._write_catalog(catalog)

    def ingest_item(
        self,
        data: bytes,
        *,
        filename: Optional[str] = None,
        media_type: str = "application/octet-stream",
        title: Optional[str] = None,
        tags: Sequence[str] = (),
        metadata: Optional[Dict[str, Any]] = None,
        source_uri: str = "unknown",
    ) -> IngestResult:
        """
        Ingest a single raw item into the corpus.

        This is the modality-neutral primitive: callers provide bytes + a media type.
        Higher-level conveniences (ingest_note, ingest_source, and related methods) build on top.

        :param data: Raw item bytes.
        :type data: bytes
        :param filename: Optional filename for the stored item.
        :type filename: str or None
        :param media_type: Internet Assigned Numbers Authority media type for the item.
        :type media_type: str
        :param title: Optional title metadata.
        :type title: str or None
        :param tags: Tags to associate with the item.
        :type tags: Sequence[str]
        :param metadata: Optional metadata mapping.
        :type metadata: dict[str, Any] or None
        :param source_uri: Source uniform resource identifier for provenance.
        :type source_uri: str
        :return: Ingestion result summary.
        :rtype: IngestResult
        :raises ValueError: If markdown is not Unicode Transformation Format 8.
        """
        item_id = str(uuid.uuid4())
        safe_filename = _sanitize_filename(filename) if filename else ""

        if safe_filename:
            safe_filename = _ensure_filename_extension(safe_filename, media_type=media_type)

        if media_type == "text/markdown":
            output_name = f"{item_id}--{safe_filename}" if safe_filename else f"{item_id}.md"
        else:
            if safe_filename:
                output_name = f"{item_id}--{safe_filename}"
            else:
                extension = _preferred_extension_for_media_type(media_type) or ""
                output_name = f"{item_id}{extension}" if extension else f"{item_id}"

        relpath = str(Path(DEFAULT_RAW_DIR) / output_name)
        output_path = self.root / relpath

        resolved_title = title.strip() if isinstance(title, str) and title.strip() else None
        resolved_tags = list(tags)
        metadata_input: Dict[str, Any] = dict(metadata or {})
        if resolved_title and "title" not in metadata_input:
            metadata_input["title"] = resolved_title
        if resolved_tags and "tags" not in metadata_input:
            metadata_input["tags"] = list(resolved_tags)

        if self._hooks is not None:
            mutation = self._hooks.run_ingest_hooks(
                hook_point=HookPoint.before_ingest,
                filename=filename,
                media_type=media_type,
                title=resolved_title,
                tags=list(resolved_tags),
                metadata=dict(metadata_input),
                source_uri=source_uri,
            )
            if mutation.add_tags:
                for tag in mutation.add_tags:
                    if tag not in resolved_tags:
                        resolved_tags.append(tag)

        frontmatter: Dict[str, Any] = {}

        if media_type == "text/markdown":
            try:
                markdown_text = data.decode("utf-8")
            except UnicodeDecodeError as decode_error:
                raise ValueError(
                    "Markdown must be Unicode Transformation Format 8"
                ) from decode_error

            parsed_document = parse_front_matter(markdown_text)
            frontmatter = dict(parsed_document.metadata)

            merged_tags = _merge_tags(resolved_tags, frontmatter.get("tags"))
            if merged_tags:
                frontmatter["tags"] = merged_tags
                resolved_tags = merged_tags

            if resolved_title and not (
                isinstance(frontmatter.get("title"), str) and frontmatter.get("title").strip()
            ):
                frontmatter["title"] = resolved_title

            title_value = frontmatter.get("title")
            if isinstance(title_value, str) and title_value.strip():
                resolved_title = title_value.strip()

            frontmatter = _ensure_biblicus_block(
                frontmatter, item_id=item_id, source_uri=source_uri
            )
            rendered_document = render_front_matter(frontmatter, parsed_document.body)
            data_to_write = rendered_document.encode("utf-8")
        else:
            data_to_write = data

        sha256_digest = _sha256_bytes(data_to_write)
        output_path.write_bytes(data_to_write)

        if media_type != "text/markdown":
            sidecar: Dict[str, Any] = {}
            sidecar["media_type"] = media_type
            if resolved_tags:
                sidecar["tags"] = resolved_tags
            if metadata_input:
                for metadata_key, metadata_value in metadata_input.items():
                    if metadata_key in {"tags", "biblicus"}:
                        continue
                    sidecar[metadata_key] = metadata_value
            sidecar["biblicus"] = {"id": item_id, "source": source_uri}
            _write_sidecar(output_path, sidecar)
            frontmatter = sidecar

        if self._hooks is not None:
            mutation = self._hooks.run_ingest_hooks(
                hook_point=HookPoint.after_ingest,
                filename=filename,
                media_type=media_type,
                title=resolved_title,
                tags=list(resolved_tags),
                metadata=dict(metadata_input),
                source_uri=source_uri,
                item_id=item_id,
                relpath=relpath,
            )
            if mutation.add_tags:
                updated_tags = list(resolved_tags)
                for tag in mutation.add_tags:
                    if tag not in updated_tags:
                        updated_tags.append(tag)
                resolved_tags = updated_tags
                sidecar_metadata = _load_sidecar(output_path)
                sidecar_metadata["tags"] = resolved_tags
                if media_type != "text/markdown":
                    sidecar_metadata["media_type"] = media_type
                sidecar_metadata["biblicus"] = {"id": item_id, "source": source_uri}
                _write_sidecar(output_path, sidecar_metadata)
                frontmatter = _merge_metadata(
                    frontmatter if isinstance(frontmatter, dict) else {}, sidecar_metadata
                )

        created_at = utc_now_iso()
        item_record = CatalogItem(
            id=item_id,
            relpath=relpath,
            sha256=sha256_digest,
            bytes=len(data_to_write),
            media_type=media_type,
            title=resolved_title,
            tags=list(resolved_tags),
            metadata=dict(frontmatter or {}),
            created_at=created_at,
            source_uri=source_uri,
        )
        self._upsert_catalog_item(item_record)

        return IngestResult(item_id=item_id, relpath=relpath, sha256=sha256_digest)

    def ingest_item_stream(
        self,
        stream,
        *,
        filename: Optional[str] = None,
        media_type: str = "application/octet-stream",
        tags: Sequence[str] = (),
        metadata: Optional[Dict[str, Any]] = None,
        source_uri: str = "unknown",
    ) -> IngestResult:
        """
        Ingest a binary item from a readable stream.

        This method is intended for large non-markdown items. It writes bytes to disk incrementally
        while computing a checksum.

        :param stream: Readable binary stream.
        :type stream: object
        :param filename: Optional filename for the stored item.
        :type filename: str or None
        :param media_type: Internet Assigned Numbers Authority media type for the item.
        :type media_type: str
        :param tags: Tags to associate with the item.
        :type tags: Sequence[str]
        :param metadata: Optional metadata mapping.
        :type metadata: dict[str, Any] or None
        :param source_uri: Source uniform resource identifier for provenance.
        :type source_uri: str
        :return: Ingestion result summary.
        :rtype: IngestResult
        :raises ValueError: If the media_type is text/markdown.
        """
        if media_type == "text/markdown":
            raise ValueError("Stream ingestion is not supported for Markdown")

        item_id = str(uuid.uuid4())
        safe_filename = _sanitize_filename(filename) if filename else ""
        if safe_filename:
            safe_filename = _ensure_filename_extension(safe_filename, media_type=media_type)

        if safe_filename:
            output_name = f"{item_id}--{safe_filename}"
        else:
            extension = _preferred_extension_for_media_type(media_type) or ""
            output_name = f"{item_id}{extension}" if extension else f"{item_id}"

        relpath = str(Path(DEFAULT_RAW_DIR) / output_name)
        output_path = self.root / relpath

        resolved_tags = list(tags)
        metadata_input: Dict[str, Any] = dict(metadata or {})
        if resolved_tags and "tags" not in metadata_input:
            metadata_input["tags"] = list(resolved_tags)

        if self._hooks is not None:
            mutation = self._hooks.run_ingest_hooks(
                hook_point=HookPoint.before_ingest,
                filename=filename,
                media_type=media_type,
                title=None,
                tags=list(resolved_tags),
                metadata=dict(metadata_input),
                source_uri=source_uri,
            )
            if mutation.add_tags:
                for tag in mutation.add_tags:
                    if tag not in resolved_tags:
                        resolved_tags.append(tag)

        write_result = _write_stream_and_hash(stream, output_path)
        sha256_digest = str(write_result["sha256"])
        bytes_written = int(write_result["bytes_written"])

        sidecar: Dict[str, Any] = {}
        sidecar["media_type"] = media_type
        if resolved_tags:
            sidecar["tags"] = resolved_tags
        if metadata_input:
            for metadata_key, metadata_value in metadata_input.items():
                if metadata_key in {"tags", "biblicus"}:
                    continue
                sidecar[metadata_key] = metadata_value
        sidecar["biblicus"] = {"id": item_id, "source": source_uri}
        _write_sidecar(output_path, sidecar)

        if self._hooks is not None:
            mutation = self._hooks.run_ingest_hooks(
                hook_point=HookPoint.after_ingest,
                filename=filename,
                media_type=media_type,
                title=None,
                tags=list(resolved_tags),
                metadata=dict(metadata_input),
                source_uri=source_uri,
                item_id=item_id,
                relpath=relpath,
            )
            if mutation.add_tags:
                updated_tags = list(resolved_tags)
                for tag in mutation.add_tags:
                    if tag not in updated_tags:
                        updated_tags.append(tag)
                resolved_tags = updated_tags
                sidecar["tags"] = resolved_tags
                _write_sidecar(output_path, sidecar)

        created_at = utc_now_iso()
        item_record = CatalogItem(
            id=item_id,
            relpath=relpath,
            sha256=sha256_digest,
            bytes=bytes_written,
            media_type=media_type,
            title=None,
            tags=list(resolved_tags),
            metadata=dict(sidecar or {}),
            created_at=created_at,
            source_uri=source_uri,
        )
        self._upsert_catalog_item(item_record)

        return IngestResult(item_id=item_id, relpath=relpath, sha256=sha256_digest)

    def ingest_note(
        self,
        text: str,
        *,
        title: Optional[str] = None,
        tags: Sequence[str] = (),
        source_uri: str = "text",
    ) -> IngestResult:
        """
        Ingest a text note as Markdown.

        :param text: Note content.
        :type text: str
        :param title: Optional title metadata.
        :type title: str or None
        :param tags: Tags to associate with the note.
        :type tags: Sequence[str]
        :param source_uri: Source uniform resource identifier for provenance.
        :type source_uri: str
        :return: Ingestion result summary.
        :rtype: IngestResult
        """
        data = text.encode("utf-8")
        return self.ingest_item(
            data,
            filename=None,
            media_type="text/markdown",
            title=title,
            tags=tags,
            metadata=None,
            source_uri=source_uri,
        )

    def ingest_source(
        self,
        source: str | Path,
        *,
        tags: Sequence[str] = (),
        source_uri: Optional[str] = None,
    ) -> IngestResult:
        """
        Ingest a file path or uniform resource locator source.

        :param source: File path or uniform resource locator.
        :type source: str or Path
        :param tags: Tags to associate with the item.
        :type tags: Sequence[str]
        :param source_uri: Optional override for the source uniform resource identifier.
        :type source_uri: str or None
        :return: Ingestion result summary.
        :rtype: IngestResult
        """
        candidate_path = Path(source) if isinstance(source, str) and "://" not in source else None
        if isinstance(source, Path) or (candidate_path is not None and candidate_path.exists()):
            path = source if isinstance(source, Path) else candidate_path
            assert isinstance(path, Path)
            path = path.resolve()
            filename = path.name
            media_type, _ = mimetypes.guess_type(filename)
            media_type = media_type or "application/octet-stream"
            if path.suffix.lower() in {".md", ".markdown"}:
                media_type = "text/markdown"
            if media_type == "text/markdown":
                return self.ingest_item(
                    path.read_bytes(),
                    filename=filename,
                    media_type=media_type,
                    title=None,
                    tags=tags,
                    metadata=None,
                    source_uri=source_uri or path.as_uri(),
                )
            with path.open("rb") as handle:
                return self.ingest_item_stream(
                    handle,
                    filename=filename,
                    media_type=media_type,
                    tags=tags,
                    metadata=None,
                    source_uri=source_uri or path.as_uri(),
                )

        payload = load_source(source, source_uri=source_uri)
        return self.ingest_item(
            payload.data,
            filename=payload.filename,
            media_type=payload.media_type,
            title=None,
            tags=tags,
            metadata=None,
            source_uri=payload.source_uri,
        )

    def import_tree(self, source_root: Path, *, tags: Sequence[str] = ()) -> Dict[str, int]:
        """
        Import a folder tree into the corpus, preserving relative paths and provenance.

        Imported content is stored under the raw directory in a dedicated import namespace so that
        operators can inspect and back up imported content as a structured tree.

        :param source_root: Root directory of the folder tree to import.
        :type source_root: Path
        :param tags: Tags to associate with imported items.
        :type tags: Sequence[str]
        :return: Import statistics.
        :rtype: dict[str, int]
        :raises FileNotFoundError: If the source_root does not exist.
        :raises ValueError: If a markdown file cannot be decoded as Unicode Transformation Format 8.
        """
        source_root = source_root.resolve()
        if not source_root.is_dir():
            raise FileNotFoundError(f"Import source root does not exist: {source_root}")

        ignore_spec = load_corpus_ignore_spec(self.root)
        import_id = str(uuid.uuid4())
        stats = {"scanned": 0, "ignored": 0, "imported": 0}

        for source_path in sorted(source_root.rglob("*")):
            if not source_path.is_file():
                continue
            relative_source_path = source_path.relative_to(source_root).as_posix()
            stats["scanned"] += 1
            if ignore_spec.matches(relative_source_path):
                stats["ignored"] += 1
                continue
            self._import_file(
                source_path=source_path,
                import_id=import_id,
                relative_source_path=relative_source_path,
                tags=tags,
            )
            stats["imported"] += 1

        return stats

    def _import_file(
        self,
        *,
        source_path: Path,
        import_id: str,
        relative_source_path: str,
        tags: Sequence[str],
    ) -> None:
        """
        Import a single file into the corpus under an import namespace.

        :param source_path: Source file path to import.
        :type source_path: Path
        :param import_id: Import identifier.
        :type import_id: str
        :param relative_source_path: Relative path within the imported tree.
        :type relative_source_path: str
        :param tags: Tags to apply.
        :type tags: Sequence[str]
        :return: None.
        :rtype: None
        :raises ValueError: If a markdown file cannot be decoded as Unicode Transformation Format 8.
        """
        item_id = str(uuid.uuid4())
        destination_relpath = str(
            Path(DEFAULT_RAW_DIR) / "imports" / import_id / relative_source_path
        )
        destination_path = (self.root / destination_relpath).resolve()
        destination_path.parent.mkdir(parents=True, exist_ok=True)

        raw_bytes = source_path.read_bytes()
        sha256_digest = _sha256_bytes(raw_bytes)

        media_type, _ = mimetypes.guess_type(source_path.name)
        media_type = media_type or "application/octet-stream"
        if source_path.suffix.lower() in {".md", ".markdown"}:
            media_type = "text/markdown"

        title: Optional[str] = None
        frontmatter_metadata: Dict[str, Any] = {}
        if media_type == "text/markdown":
            try:
                text = raw_bytes.decode("utf-8")
            except UnicodeDecodeError as decode_error:
                raise ValueError(
                    f"Markdown file must be Unicode Transformation Format 8: {relative_source_path}"
                ) from decode_error
            parsed_document = parse_front_matter(text)
            frontmatter_metadata = dict(parsed_document.metadata)
            title_value = frontmatter_metadata.get("title")
            if isinstance(title_value, str) and title_value.strip():
                title = title_value.strip()

        destination_path.write_bytes(raw_bytes)

        sidecar: Dict[str, Any] = {}
        if tags:
            sidecar["tags"] = [t.strip() for t in tags if isinstance(t, str) and t.strip()]
        if media_type != "text/markdown":
            sidecar["media_type"] = media_type
        sidecar["biblicus"] = {"id": item_id, "source": source_path.as_uri()}
        _write_sidecar(destination_path, sidecar)

        merged_metadata = _merge_metadata(frontmatter_metadata, sidecar)
        resolved_tags = _merge_tags([], merged_metadata.get("tags"))

        item_record = CatalogItem(
            id=item_id,
            relpath=destination_relpath,
            sha256=sha256_digest,
            bytes=len(raw_bytes),
            media_type=media_type,
            title=title,
            tags=list(resolved_tags),
            metadata=dict(merged_metadata or {}),
            created_at=utc_now_iso(),
            source_uri=source_path.as_uri(),
        )
        self._upsert_catalog_item(item_record)

    def list_items(self, *, limit: int = 50) -> List[CatalogItem]:
        """
        List items from the catalog.

        :param limit: Maximum number of items to return.
        :type limit: int
        :return: Catalog items ordered by recency.
        :rtype: list[CatalogItem]
        """
        catalog = self._load_catalog()
        ordered_ids = catalog.order[:limit] if catalog.order else list(catalog.items.keys())[:limit]
        collected_items: List[CatalogItem] = []
        for item_id in ordered_ids:
            item = catalog.items.get(item_id)
            if item is not None:
                collected_items.append(item)
        return collected_items

    def get_item(self, item_id: str) -> CatalogItem:
        """
        Fetch a catalog item by identifier.

        :param item_id: Item identifier.
        :type item_id: str
        :return: Catalog item.
        :rtype: CatalogItem
        :raises KeyError: If the item identifier is unknown.
        """
        catalog = self._load_catalog()
        item = catalog.items.get(item_id)
        if item is None:
            raise KeyError(f"Unknown item identifier: {item_id}")
        return item

    def create_crawl_id(self) -> str:
        """
        Create a new crawl identifier.

        :return: Crawl identifier.
        :rtype: str
        """
        return str(uuid.uuid4())

    def ingest_crawled_payload(
        self,
        *,
        crawl_id: str,
        relative_path: str,
        data: bytes,
        filename: str,
        media_type: str,
        source_uri: str,
        tags: Sequence[str],
    ) -> None:
        """
        Ingest a crawled payload under a crawl import namespace.

        :param crawl_id: Crawl identifier used to group crawled artifacts.
        :type crawl_id: str
        :param relative_path: Relative path within the crawl prefix.
        :type relative_path: str
        :param data: Raw payload bytes.
        :type data: bytes
        :param filename: Suggested filename from the payload metadata.
        :type filename: str
        :param media_type: Internet Assigned Numbers Authority media type.
        :type media_type: str
        :param source_uri: Source uniform resource identifier (typically an http or https uniform resource locator).
        :type source_uri: str
        :param tags: Tags to attach to the stored item.
        :type tags: Sequence[str]
        :return: None.
        :rtype: None
        """
        _ = filename
        item_id = str(uuid.uuid4())
        destination_relpath = str(Path(DEFAULT_RAW_DIR) / "imports" / "crawl" / crawl_id / relative_path)
        destination_path = (self.root / destination_relpath).resolve()
        destination_path.parent.mkdir(parents=True, exist_ok=True)
        destination_path.write_bytes(data)

        sha256_digest = _sha256_bytes(data)

        sidecar: Dict[str, Any] = {}
        sidecar["tags"] = [t.strip() for t in tags if isinstance(t, str) and t.strip()]
        sidecar["media_type"] = media_type
        sidecar["biblicus"] = {"id": item_id, "source": source_uri}
        _write_sidecar(destination_path, sidecar)

        merged_metadata = _merge_metadata({}, sidecar)
        resolved_tags = _merge_tags([], merged_metadata.get("tags"))

        item_record = CatalogItem(
            id=item_id,
            relpath=destination_relpath,
            sha256=sha256_digest,
            bytes=len(data),
            media_type=media_type,
            title=None,
            tags=list(resolved_tags),
            metadata=dict(merged_metadata or {}),
            created_at=utc_now_iso(),
            source_uri=source_uri,
        )
        self._upsert_catalog_item(item_record)

    def reindex(self) -> Dict[str, int]:
        """
        Rebuild/refresh the corpus catalog from the current on-disk corpus contents.

        This is the core "mutable corpus with re-indexing" loop: edit raw files or sidecars,
        then reindex to refresh the derived catalog.

        :return: Reindex statistics.
        :rtype: dict[str, int]
        :raises ValueError: If a markdown file cannot be decoded as Unicode Transformation Format 8.
        """
        self._init_catalog()
        existing_catalog = self._load_catalog()
        stats = {"scanned": 0, "skipped": 0, "inserted": 0, "updated": 0}

        content_files = [
            content_path
            for content_path in self.raw_dir.rglob("*")
            if content_path.is_file() and not content_path.name.endswith(SIDECAR_SUFFIX)
        ]

        new_items: Dict[str, CatalogItem] = {}

        for content_path in content_files:
            stats["scanned"] += 1
            relpath = str(content_path.relative_to(self.root))
            data = content_path.read_bytes()
            sha256 = _sha256_bytes(data)

            media_type, _ = mimetypes.guess_type(content_path.name)
            media_type = media_type or "application/octet-stream"

            sidecar = _load_sidecar(content_path)

            frontmatter: Dict[str, Any] = {}
            if content_path.suffix.lower() in {".md", ".markdown"}:
                try:
                    text = data.decode("utf-8")
                except UnicodeDecodeError as decode_error:
                    raise ValueError(
                        f"Markdown file must be Unicode Transformation Format 8: {relpath}"
                    ) from decode_error
                parsed_document = parse_front_matter(text)
                frontmatter = parsed_document.metadata
                media_type = "text/markdown"

            merged_metadata = _merge_metadata(frontmatter, sidecar)

            if media_type != "text/markdown":
                media_type_override = merged_metadata.get("media_type")
                if isinstance(media_type_override, str) and media_type_override.strip():
                    media_type = media_type_override.strip()

            item_id: Optional[str] = None
            biblicus_block = merged_metadata.get("biblicus")
            if isinstance(biblicus_block, dict):
                biblicus_id = biblicus_block.get("id")
                if isinstance(biblicus_id, str):
                    try:
                        item_id = str(uuid.UUID(biblicus_id))
                    except ValueError:
                        item_id = None

            if item_id is None:
                item_id = _parse_uuid_prefix(content_path.name)

            if item_id is None:
                stats["skipped"] += 1
                continue

            title: Optional[str] = None
            title_value = merged_metadata.get("title")
            if isinstance(title_value, str) and title_value.strip():
                title = title_value.strip()

            resolved_tags = _merge_tags([], merged_metadata.get("tags"))

            source_uri: Optional[str] = None
            if isinstance(biblicus_block, dict):
                source_value = biblicus_block.get("source")
                if isinstance(source_value, str) and source_value.strip():
                    source_uri = source_value.strip()

            previous_item = existing_catalog.items.get(item_id)
            created_at = previous_item.created_at if previous_item is not None else utc_now_iso()
            source_uri = source_uri or (
                previous_item.source_uri if previous_item is not None else None
            )

            if previous_item is None:
                stats["inserted"] += 1
            else:
                stats["updated"] += 1

            new_items[item_id] = CatalogItem(
                id=item_id,
                relpath=relpath,
                sha256=sha256,
                bytes=len(data),
                media_type=media_type,
                title=title,
                tags=list(resolved_tags),
                metadata=dict(merged_metadata or {}),
                created_at=created_at,
                source_uri=source_uri,
            )

        order = sorted(
            new_items.keys(),
            key=lambda item_id: (new_items[item_id].created_at, item_id),
            reverse=True,
        )

        catalog = CorpusCatalog(
            schema_version=SCHEMA_VERSION,
            generated_at=utc_now_iso(),
            corpus_uri=normalize_corpus_uri(self.root),
            raw_dir=DEFAULT_RAW_DIR,
            latest_run_id=None,
            items=new_items,
            order=order,
        )
        self._write_catalog(catalog)

        return stats

    @property
    def name(self) -> str:
        """
        Return the corpus name (directory basename).

        :return: Corpus name.
        :rtype: str
        """
        return self.root.name

    def purge(self, *, confirm: str) -> None:
        """
        Delete all ingested items and derived files, preserving corpus identity/config.

        :param confirm: Confirmation string matching the corpus name.
        :type confirm: str
        :return: None.
        :rtype: None
        :raises ValueError: If the confirmation does not match.
        """
        expected = self.name
        if confirm != expected:
            raise ValueError(
                f"Confirmation mismatch: pass --confirm {expected!r} to purge this corpus"
            )

        if self.raw_dir.exists():
            shutil.rmtree(self.raw_dir)
        self.raw_dir.mkdir(parents=True, exist_ok=True)

        for path in self.meta_dir.iterdir():
            if path.name == "config.json":
                continue
            if path.is_dir():
                shutil.rmtree(path)
            else:
                path.unlink()
        self._init_catalog()
        self._write_catalog(
            CorpusCatalog(
                schema_version=SCHEMA_VERSION,
                generated_at=utc_now_iso(),
                corpus_uri=normalize_corpus_uri(self.root),
                raw_dir=DEFAULT_RAW_DIR,
                latest_run_id=None,
                items={},
                order=[],
            )
        )
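
For context, a minimal usage sketch of the Corpus API added in this file, based only on the method signatures shown above; the directory names, note text, file names, and tags below are illustrative, not part of the package.

# Sketch: driving the Corpus API from biblicus/corpus.py (illustrative values).
from pathlib import Path

from biblicus.corpus import Corpus

# Create a new corpus on disk; raises FileExistsError if one already exists
# and force is not set.
corpus = Corpus.init(Path("./my-corpus"))

# Ingest a Markdown note; front matter tags and title are merged into the catalog.
result = corpus.ingest_note(
    "# Reading list\n\nStart with the hooks module.",
    title="Reading list",
    tags=["notes"],
)
print(result.item_id, result.relpath, result.sha256)

# Ingest an arbitrary file; non-Markdown items get a YAML sidecar for metadata.
corpus.ingest_source(Path("./scan.pdf"), tags=["paper"])

# After editing raw files or sidecars by hand, rebuild the derived catalog.
stats = corpus.reindex()
print(stats)  # keys: scanned, skipped, inserted, updated

# Reopen later from any path inside the corpus and list recent items.
reopened = Corpus.open("./my-corpus")
for item in reopened.list_items(limit=10):
    print(item.id, item.media_type, item.title)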