biblicus 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. biblicus/__init__.py +30 -0
  2. biblicus/__main__.py +8 -0
  3. biblicus/_vendor/dotyaml/__init__.py +14 -0
  4. biblicus/_vendor/dotyaml/interpolation.py +63 -0
  5. biblicus/_vendor/dotyaml/loader.py +181 -0
  6. biblicus/_vendor/dotyaml/transformer.py +135 -0
  7. biblicus/backends/__init__.py +42 -0
  8. biblicus/backends/base.py +65 -0
  9. biblicus/backends/scan.py +375 -0
  10. biblicus/backends/sqlite_full_text_search.py +487 -0
  11. biblicus/cli.py +804 -0
  12. biblicus/constants.py +12 -0
  13. biblicus/context.py +183 -0
  14. biblicus/corpus.py +1531 -0
  15. biblicus/crawl.py +186 -0
  16. biblicus/errors.py +15 -0
  17. biblicus/evaluation.py +257 -0
  18. biblicus/evidence_processing.py +201 -0
  19. biblicus/extraction.py +531 -0
  20. biblicus/extractors/__init__.py +44 -0
  21. biblicus/extractors/base.py +68 -0
  22. biblicus/extractors/metadata_text.py +106 -0
  23. biblicus/extractors/openai_stt.py +180 -0
  24. biblicus/extractors/pass_through_text.py +84 -0
  25. biblicus/extractors/pdf_text.py +100 -0
  26. biblicus/extractors/pipeline.py +105 -0
  27. biblicus/extractors/rapidocr_text.py +129 -0
  28. biblicus/extractors/select_longest_text.py +105 -0
  29. biblicus/extractors/select_text.py +100 -0
  30. biblicus/extractors/unstructured_text.py +100 -0
  31. biblicus/frontmatter.py +89 -0
  32. biblicus/hook_logging.py +180 -0
  33. biblicus/hook_manager.py +203 -0
  34. biblicus/hooks.py +261 -0
  35. biblicus/ignore.py +64 -0
  36. biblicus/knowledge_base.py +191 -0
  37. biblicus/models.py +445 -0
  38. biblicus/retrieval.py +133 -0
  39. biblicus/sources.py +212 -0
  40. biblicus/time.py +17 -0
  41. biblicus/uris.py +63 -0
  42. biblicus/user_config.py +138 -0
  43. biblicus-0.6.0.dist-info/METADATA +533 -0
  44. biblicus-0.6.0.dist-info/RECORD +48 -0
  45. biblicus-0.6.0.dist-info/WHEEL +5 -0
  46. biblicus-0.6.0.dist-info/entry_points.txt +2 -0
  47. biblicus-0.6.0.dist-info/licenses/LICENSE +21 -0
  48. biblicus-0.6.0.dist-info/top_level.txt +1 -0
biblicus/corpus.py ADDED
@@ -0,0 +1,1531 @@
"""
Corpus storage and ingestion for Biblicus.
"""

from __future__ import annotations

import hashlib
import json
import mimetypes
import shutil
import uuid
from pathlib import Path
from typing import Any, Dict, List, Optional, Sequence

import yaml
from pydantic import ValidationError

from .constants import (
    CORPUS_DIR_NAME,
    DEFAULT_RAW_DIR,
    EXTRACTION_RUNS_DIR_NAME,
    RUNS_DIR_NAME,
    SCHEMA_VERSION,
    SIDECAR_SUFFIX,
)
from .frontmatter import parse_front_matter, render_front_matter
from .hook_manager import HookManager
from .hooks import HookPoint
from .ignore import load_corpus_ignore_spec
from .models import (
    CatalogItem,
    CorpusCatalog,
    CorpusConfig,
    ExtractionRunListEntry,
    IngestResult,
    RetrievalRun,
)
from .sources import load_source
from .time import utc_now_iso
from .uris import corpus_ref_to_path, normalize_corpus_uri


def _sha256_bytes(data: bytes) -> str:
    """
    Compute a Secure Hash Algorithm 256 digest for byte content.

    :param data: Input bytes.
    :type data: bytes
    :return: Secure Hash Algorithm 256 hex digest.
    :rtype: str
    """
    return hashlib.sha256(data).hexdigest()


def _write_stream_and_hash(
    stream, destination_path: Path, *, chunk_size: int = 1024 * 1024
) -> Dict[str, object]:
    """
    Write a binary stream to disk while computing a digest.

    :param stream: Binary stream to read from.
    :type stream: object
    :param destination_path: Destination path to write to.
    :type destination_path: Path
    :param chunk_size: Chunk size for reads.
    :type chunk_size: int
    :return: Mapping containing sha256 and bytes_written.
    :rtype: dict[str, object]
    :raises OSError: If the destination cannot be written.
    """
    hasher = hashlib.sha256()
    bytes_written = 0
    with destination_path.open("wb") as destination_handle:
        while True:
            chunk = stream.read(chunk_size)
            if not chunk:
                break
            hasher.update(chunk)
            destination_handle.write(chunk)
            bytes_written += len(chunk)
    return {"sha256": hasher.hexdigest(), "bytes_written": bytes_written}


def _sanitize_filename(name: str) -> str:
    """
    Sanitize a filename into a portable, filesystem-friendly form.

    :param name: Raw filename.
    :type name: str
    :return: Sanitized filename.
    :rtype: str
    """
    allowed_characters = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-._() ")
    sanitized_name = "".join(
        (character if character in allowed_characters else "_") for character in name
    ).strip()
    return sanitized_name or "file"


def _preferred_extension_for_media_type(media_type: str) -> Optional[str]:
    """
    Return a preferred filename extension for a media type.

    :param media_type: Internet Assigned Numbers Authority media type.
    :type media_type: str
    :return: Preferred extension or None.
    :rtype: str or None
    """
    media_type_overrides = {
        "image/jpeg": ".jpg",
        "audio/ogg": ".ogg",
    }
    if media_type in media_type_overrides:
        return media_type_overrides[media_type]
    return mimetypes.guess_extension(media_type)


def _ensure_filename_extension(filename: str, *, media_type: str) -> str:
    """
    Ensure a usable filename extension for a media type.

    :param filename: Raw filename.
    :type filename: str
    :param media_type: Internet Assigned Numbers Authority media type.
    :type media_type: str
    :return: Filename with a compatible extension.
    :rtype: str
    """
    raw_name = filename.strip()

    if media_type == "text/markdown":
        if raw_name.lower().endswith((".md", ".markdown")):
            return raw_name
        return raw_name + ".md"

    if Path(raw_name).suffix:
        return raw_name

    ext = _preferred_extension_for_media_type(media_type)
    if not ext:
        return raw_name
    return raw_name + ext


def _merge_tags(explicit: Sequence[str], from_frontmatter: Any) -> List[str]:
    """
    Merge tags from explicit input and front matter values.

    :param explicit: Explicit tags provided by callers.
    :type explicit: Sequence[str]
    :param from_frontmatter: Tags from front matter.
    :type from_frontmatter: Any
    :return: Deduplicated tag list preserving order.
    :rtype: list[str]
    """
    merged_tags: List[str] = []

    for explicit_tag in explicit:
        cleaned_tag = explicit_tag.strip()
        if cleaned_tag:
            merged_tags.append(cleaned_tag)

    if isinstance(from_frontmatter, str):
        merged_tags.append(from_frontmatter)
    elif isinstance(from_frontmatter, list):
        for item in from_frontmatter:
            if isinstance(item, str) and item.strip():
                merged_tags.append(item.strip())

    seen_tags = set()
    deduplicated_tags: List[str] = []
    for tag_value in merged_tags:
        if tag_value not in seen_tags:
            seen_tags.add(tag_value)
            deduplicated_tags.append(tag_value)
    return deduplicated_tags


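# Editor's illustration, not part of the released corpus.py: _merge_tags keeps
# first-seen order and accepts either a string or a list from front matter, so
#
#     _merge_tags(["a", "b"], ["b", "c", "a"])  # -> ["a", "b", "c"]
#     _merge_tags([], "solo")                   # -> ["solo"]
#
# Explicit tags therefore always come first in the merged ordering.

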
def _sidecar_path_for(content_path: Path) -> Path:
    """
    Compute the sidecar metadata path for a content file.

    :param content_path: Path to the content file.
    :type content_path: Path
    :return: Sidecar path.
    :rtype: Path
    """
    return content_path.with_name(content_path.name + SIDECAR_SUFFIX)


def _load_sidecar(content_path: Path) -> Dict[str, Any]:
    """
    Load sidecar metadata for a content file.

    :param content_path: Path to the content file.
    :type content_path: Path
    :return: Parsed sidecar metadata.
    :rtype: dict[str, Any]
    :raises ValueError: If the sidecar content is not a mapping.
    """
    path = _sidecar_path_for(content_path)
    if not path.is_file():
        return {}
    data = yaml.safe_load(path.read_text(encoding="utf-8")) or {}
    if not isinstance(data, dict):
        raise ValueError(f"Sidecar metadata must be a mapping/object: {path}")
    return dict(data)


def _write_sidecar(content_path: Path, metadata: Dict[str, Any]) -> None:
    """
    Write a sidecar metadata file.

    :param content_path: Path to the content file.
    :type content_path: Path
    :param metadata: Metadata to serialize.
    :type metadata: dict[str, Any]
    :return: None.
    :rtype: None
    """
    path = _sidecar_path_for(content_path)
    text = yaml.safe_dump(
        metadata,
        sort_keys=False,
        allow_unicode=True,
        default_flow_style=False,
    ).strip()
    path.write_text(text + "\n", encoding="utf-8")


def _ensure_biblicus_block(
    metadata: Dict[str, Any], *, item_id: str, source_uri: str
) -> Dict[str, Any]:
    """
    Ensure the biblicus metadata block exists and is populated.

    :param metadata: Existing metadata.
    :type metadata: dict[str, Any]
    :param item_id: Item identifier to store.
    :type item_id: str
    :param source_uri: Source uniform resource identifier to store.
    :type source_uri: str
    :return: Updated metadata mapping.
    :rtype: dict[str, Any]
    """
    updated_metadata = dict(metadata)
    existing_biblicus = updated_metadata.get("biblicus")
    if not isinstance(existing_biblicus, dict):
        existing_biblicus = {}
    biblicus_block = dict(existing_biblicus)
    biblicus_block["id"] = item_id
    biblicus_block["source"] = source_uri
    updated_metadata["biblicus"] = biblicus_block
    return updated_metadata


def _parse_uuid_prefix(filename: str) -> Optional[str]:
    """
    Extract a universally unique identifier prefix from a filename, if present.

    :param filename: Filename to inspect.
    :type filename: str
    :return: Universally unique identifier string or None.
    :rtype: str or None
    """
    if len(filename) < 36:
        return None
    prefix = filename[:36]
    try:
        return str(uuid.UUID(prefix))
    except ValueError:
        return None


def _merge_metadata(front: Dict[str, Any], side: Dict[str, Any]) -> Dict[str, Any]:
    """
    Merge front matter and sidecar metadata.

    :param front: Front matter metadata.
    :type front: dict[str, Any]
    :param side: Sidecar metadata.
    :type side: dict[str, Any]
    :return: Merged metadata.
    :rtype: dict[str, Any]
    """
    merged_metadata: Dict[str, Any] = dict(front)

    front_biblicus = merged_metadata.get("biblicus")
    sidecar_biblicus = side.get("biblicus")
    if isinstance(front_biblicus, dict) or isinstance(sidecar_biblicus, dict):
        merged_biblicus: Dict[str, Any] = {}
        if isinstance(front_biblicus, dict):
            merged_biblicus.update(front_biblicus)
        if isinstance(sidecar_biblicus, dict):
            merged_biblicus.update(sidecar_biblicus)
        merged_metadata["biblicus"] = merged_biblicus

    merged_tags = _merge_tags(_merge_tags([], front.get("tags")), side.get("tags"))
    if merged_tags:
        merged_metadata["tags"] = merged_tags

    for metadata_key, metadata_value in side.items():
        if metadata_key in {"biblicus", "tags"}:
            continue
        merged_metadata[metadata_key] = metadata_value

    return merged_metadata


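# Editor's illustration, not part of the released corpus.py: in _merge_metadata
# the sidecar wins key by key, while "biblicus" blocks are merged and "tags"
# are combined via _merge_tags:
#
#     _merge_metadata({"title": "A", "tags": ["x"]}, {"title": "B", "tags": ["y"]})
#     # -> {"title": "B", "tags": ["x", "y"]}

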
310
+ class Corpus:
311
+ """
312
+ Local corpus manager for Biblicus.
313
+
314
+ :ivar root: Corpus root directory.
315
+ :vartype root: Path
316
+ :ivar meta_dir: Metadata directory under the corpus root.
317
+ :vartype meta_dir: Path
318
+ :ivar raw_dir: Raw item directory under the corpus root.
319
+ :vartype raw_dir: Path
320
+ :ivar config: Parsed corpus config, if present.
321
+ :vartype config: CorpusConfig or None
322
+ """
323
+
324
+ def __init__(self, root: Path):
325
+ """
326
+ Initialize a corpus wrapper around a filesystem path.
327
+
328
+ :param root: Corpus root directory.
329
+ :type root: Path
330
+ """
331
+ self.root = root
332
+ self.meta_dir = self.root / CORPUS_DIR_NAME
333
+ self.raw_dir = self.root / DEFAULT_RAW_DIR
334
+ self.config = self._load_config()
335
+ self._hooks = self._load_hooks()
336
+
337
+ @property
338
+ def uri(self) -> str:
339
+ """
340
+ Return the canonical uniform resource identifier for the corpus root.
341
+
342
+ :return: Corpus uniform resource identifier.
343
+ :rtype: str
344
+ """
345
+ return self.root.as_uri()
346
+
347
+ def _load_config(self) -> Optional[CorpusConfig]:
348
+ """
349
+ Load the corpus config if it exists.
350
+
351
+ :return: Parsed corpus config or None.
352
+ :rtype: CorpusConfig or None
353
+ :raises ValueError: If the config schema is invalid.
354
+ """
355
+ path = self.meta_dir / "config.json"
356
+ if not path.is_file():
357
+ return None
358
+ data = json.loads(path.read_text(encoding="utf-8"))
359
+ try:
360
+ return CorpusConfig.model_validate(data)
361
+ except ValidationError as exc:
362
+ has_hook_error = any(
363
+ isinstance(error.get("loc"), tuple)
364
+ and error.get("loc")
365
+ and error.get("loc")[0] == "hooks"
366
+ for error in exc.errors()
367
+ )
368
+ if has_hook_error:
369
+ raise ValueError(f"Invalid hook specification: {exc}") from exc
370
+ raise ValueError(f"Invalid corpus config: {exc}") from exc
371
+
372
+ def _load_hooks(self) -> Optional[HookManager]:
373
+ """
374
+ Load the hook manager from config if hooks are configured.
375
+
376
+ :return: Hook manager or None.
377
+ :rtype: HookManager or None
378
+ :raises ValueError: If hook specifications are invalid.
379
+ """
380
+ if self.config is None or not self.config.hooks:
381
+ return None
382
+ return HookManager.from_config(
383
+ corpus_root=self.root,
384
+ corpus_uri=self.uri,
385
+ hook_specs=self.config.hooks,
386
+ )
387
+
388
+ @classmethod
389
+ def find(cls, start: Path) -> "Corpus":
390
+ """
391
+ Locate a corpus by searching upward from a path.
392
+
393
+ :param start: Starting path to search.
394
+ :type start: Path
395
+ :return: Located corpus instance.
396
+ :rtype: Corpus
397
+ :raises FileNotFoundError: If no corpus config is found.
398
+ """
399
+ start = start.resolve()
400
+ for candidate in [start, *start.parents]:
401
+ if (candidate / CORPUS_DIR_NAME / "config.json").is_file():
402
+ return cls(candidate)
403
+ raise FileNotFoundError(
404
+ f"Not a Biblicus corpus (no {CORPUS_DIR_NAME}/config.json found from {start})"
405
+ )
406
+
407
+ @classmethod
408
+ def open(cls, ref: str | Path) -> "Corpus":
409
+ """
410
+ Open a corpus from a path or uniform resource identifier reference.
411
+
412
+ :param ref: Filesystem path or file:// uniform resource identifier.
413
+ :type ref: str or Path
414
+ :return: Opened corpus instance.
415
+ :rtype: Corpus
416
+ """
417
+ return cls.find(corpus_ref_to_path(ref))
418
+
419
+ @classmethod
420
+ def init(cls, root: Path, *, force: bool = False) -> "Corpus":
421
+ """
422
+ Initialize a new corpus on disk.
423
+
424
+ :param root: Corpus root directory.
425
+ :type root: Path
426
+ :param force: Whether to overwrite existing config.
427
+ :type force: bool
428
+ :return: Initialized corpus instance.
429
+ :rtype: Corpus
430
+ :raises FileExistsError: If the corpus already exists and force is False.
431
+ """
432
+ root = root.resolve()
433
+ corpus = cls(root)
434
+
435
+ corpus.meta_dir.mkdir(parents=True, exist_ok=True)
436
+ corpus.raw_dir.mkdir(parents=True, exist_ok=True)
437
+
438
+ config_path = corpus.meta_dir / "config.json"
439
+ if config_path.exists() and not force:
440
+ raise FileExistsError(f"Corpus already exists at {root}")
441
+
442
+ config = CorpusConfig(
443
+ schema_version=SCHEMA_VERSION,
444
+ created_at=utc_now_iso(),
445
+ corpus_uri=normalize_corpus_uri(root),
446
+ raw_dir=DEFAULT_RAW_DIR,
447
+ )
448
+ config_path.write_text(config.model_dump_json(indent=2) + "\n", encoding="utf-8")
449
+
450
+ corpus._init_catalog()
451
+ return corpus
452
+
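    # Usage sketch (editor's note, not part of the released corpus.py): the
    # corpus lifecycle, assuming an empty writable directory; the path is made up.
    #
    #     from pathlib import Path
    #     from biblicus.corpus import Corpus
    #
    #     corpus = Corpus.init(Path("/tmp/demo-corpus"))  # writes config.json + catalog.json
    #     same = Corpus.open("/tmp/demo-corpus")          # resolves upward via find()
    #     assert same.uri == corpus.uri
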
    @property
    def catalog_path(self) -> Path:
        """
        Return the path to the corpus catalog file.

        :return: Catalog file path.
        :rtype: Path
        """
        return self.meta_dir / "catalog.json"

    def _init_catalog(self) -> None:
        """
        Initialize the catalog if it does not already exist.

        :return: None.
        :rtype: None
        """
        if self.catalog_path.exists():
            return
        catalog = CorpusCatalog(
            schema_version=SCHEMA_VERSION,
            generated_at=utc_now_iso(),
            corpus_uri=normalize_corpus_uri(self.root),
            raw_dir=DEFAULT_RAW_DIR,
            latest_run_id=None,
            items={},
            order=[],
        )
        self._write_catalog(catalog)

    def _load_catalog(self) -> CorpusCatalog:
        """
        Read and validate the corpus catalog file.

        :return: Parsed corpus catalog.
        :rtype: CorpusCatalog
        :raises FileNotFoundError: If the catalog file does not exist.
        :raises ValueError: If the catalog schema is invalid.
        """
        if not self.catalog_path.is_file():
            raise FileNotFoundError(f"Missing corpus catalog: {self.catalog_path}")
        catalog_data = json.loads(self.catalog_path.read_text(encoding="utf-8"))
        return CorpusCatalog.model_validate(catalog_data)

    def load_catalog(self) -> CorpusCatalog:
        """
        Load the current corpus catalog.

        :return: Parsed corpus catalog.
        :rtype: CorpusCatalog
        :raises FileNotFoundError: If the catalog file does not exist.
        :raises ValueError: If the catalog schema is invalid.
        """
        return self._load_catalog()

    def _write_catalog(self, catalog: CorpusCatalog) -> None:
        """
        Atomically write a corpus catalog to disk.

        :param catalog: Catalog to persist.
        :type catalog: CorpusCatalog
        :return: None.
        :rtype: None
        """
        temp_path = self.catalog_path.with_suffix(".json.tmp")
        temp_path.write_text(catalog.model_dump_json(indent=2) + "\n", encoding="utf-8")
        temp_path.replace(self.catalog_path)

    @property
    def runs_dir(self) -> Path:
        """
        Location of retrieval run manifests.

        :return: Path to the runs directory.
        :rtype: Path
        """
        return self.meta_dir / RUNS_DIR_NAME

    @property
    def extraction_runs_dir(self) -> Path:
        """
        Location of extraction run artifacts.

        :return: Path to the extraction runs directory.
        :rtype: Path
        """
        return self.runs_dir / EXTRACTION_RUNS_DIR_NAME

    def extraction_run_dir(self, *, extractor_id: str, run_id: str) -> Path:
        """
        Resolve an extraction run directory.

        :param extractor_id: Extractor plugin identifier.
        :type extractor_id: str
        :param run_id: Extraction run identifier.
        :type run_id: str
        :return: Extraction run directory.
        :rtype: Path
        """
        return self.extraction_runs_dir / extractor_id / run_id

    def read_extracted_text(self, *, extractor_id: str, run_id: str, item_id: str) -> Optional[str]:
        """
        Read extracted text for an item from an extraction run, when present.

        :param extractor_id: Extractor plugin identifier.
        :type extractor_id: str
        :param run_id: Extraction run identifier.
        :type run_id: str
        :param item_id: Item identifier.
        :type item_id: str
        :return: Extracted text or None if the artifact does not exist.
        :rtype: str or None
        :raises OSError: If the file exists but cannot be read.
        """
        path = (
            self.extraction_run_dir(extractor_id=extractor_id, run_id=run_id)
            / "text"
            / f"{item_id}.txt"
        )
        if not path.is_file():
            return None
        return path.read_text(encoding="utf-8")

    def load_extraction_run_manifest(self, *, extractor_id: str, run_id: str):
        """
        Load an extraction run manifest from the corpus.

        :param extractor_id: Extractor plugin identifier.
        :type extractor_id: str
        :param run_id: Extraction run identifier.
        :type run_id: str
        :return: Parsed extraction run manifest.
        :rtype: biblicus.extraction.ExtractionRunManifest
        :raises FileNotFoundError: If the manifest file does not exist.
        :raises ValueError: If the manifest data is invalid.
        """
        from .extraction import ExtractionRunManifest

        manifest_path = (
            self.extraction_run_dir(extractor_id=extractor_id, run_id=run_id) / "manifest.json"
        )
        if not manifest_path.is_file():
            raise FileNotFoundError(f"Missing extraction run manifest: {manifest_path}")
        data = json.loads(manifest_path.read_text(encoding="utf-8"))
        return ExtractionRunManifest.model_validate(data)

    def list_extraction_runs(self, *, extractor_id: Optional[str] = None) -> List[ExtractionRunListEntry]:
        """
        List extraction runs stored under the corpus.

        :param extractor_id: Optional extractor identifier filter.
        :type extractor_id: str or None
        :return: Summary list entries for each run.
        :rtype: list[biblicus.models.ExtractionRunListEntry]
        """
        runs_root = self.extraction_runs_dir
        if not runs_root.is_dir():
            return []

        extractor_dirs: List[Path]
        if extractor_id is None:
            extractor_dirs = [path for path in sorted(runs_root.iterdir()) if path.is_dir()]
        else:
            extractor_path = runs_root / extractor_id
            extractor_dirs = [extractor_path] if extractor_path.is_dir() else []

        entries: List[ExtractionRunListEntry] = []
        for extractor_dir in extractor_dirs:
            for run_dir in sorted(extractor_dir.iterdir()):
                if not run_dir.is_dir():
                    continue
                manifest_path = run_dir / "manifest.json"
                if not manifest_path.is_file():
                    continue
                try:
                    manifest = self.load_extraction_run_manifest(
                        extractor_id=extractor_dir.name,
                        run_id=run_dir.name,
                    )
                except (FileNotFoundError, ValueError):
                    continue
                entries.append(
                    ExtractionRunListEntry(
                        extractor_id=extractor_dir.name,
                        run_id=run_dir.name,
                        recipe_id=manifest.recipe.recipe_id,
                        recipe_name=manifest.recipe.name,
                        catalog_generated_at=manifest.catalog_generated_at,
                        created_at=manifest.created_at,
                        stats=dict(manifest.stats),
                    )
                )

        entries.sort(key=lambda entry: (entry.created_at, entry.extractor_id, entry.run_id), reverse=True)
        return entries

    def delete_extraction_run(self, *, extractor_id: str, run_id: str) -> None:
        """
        Delete an extraction run directory and its derived artifacts.

        :param extractor_id: Extractor plugin identifier.
        :type extractor_id: str
        :param run_id: Extraction run identifier.
        :type run_id: str
        :return: None.
        :rtype: None
        :raises FileNotFoundError: If the extraction run directory does not exist.
        """
        run_dir = self.extraction_run_dir(extractor_id=extractor_id, run_id=run_id)
        if not run_dir.is_dir():
            raise FileNotFoundError(f"Missing extraction run directory: {run_dir}")
        shutil.rmtree(run_dir)

    def _ensure_runs_dir(self) -> None:
        """
        Ensure the retrieval runs directory exists.

        :return: None.
        :rtype: None
        """
        self.runs_dir.mkdir(parents=True, exist_ok=True)

    def write_run(self, run: RetrievalRun) -> None:
        """
        Persist a retrieval run manifest and update the catalog pointer.

        :param run: Run manifest to persist.
        :type run: RetrievalRun
        :return: None.
        :rtype: None
        """
        self._ensure_runs_dir()
        path = self.runs_dir / f"{run.run_id}.json"
        path.write_text(run.model_dump_json(indent=2) + "\n", encoding="utf-8")
        catalog = self._load_catalog()
        catalog.latest_run_id = run.run_id
        catalog.generated_at = utc_now_iso()
        self._write_catalog(catalog)

    def load_run(self, run_id: str) -> RetrievalRun:
        """
        Load a retrieval run manifest by identifier.

        :param run_id: Run identifier.
        :type run_id: str
        :return: Parsed run manifest.
        :rtype: RetrievalRun
        :raises FileNotFoundError: If the run manifest does not exist.
        """
        path = self.runs_dir / f"{run_id}.json"
        if not path.is_file():
            raise FileNotFoundError(f"Missing run manifest: {path}")
        data = json.loads(path.read_text(encoding="utf-8"))
        return RetrievalRun.model_validate(data)

    @property
    def latest_run_id(self) -> Optional[str]:
        """
        Latest retrieval run identifier recorded in the catalog.

        :return: Latest run identifier or None.
        :rtype: str or None
        """
        return self._load_catalog().latest_run_id

    def _upsert_catalog_item(self, item: CatalogItem) -> None:
        """
        Upsert a catalog item and reset the latest run pointer.

        :param item: Catalog item to insert or update.
        :type item: CatalogItem
        :return: None.
        :rtype: None
        """
        self._init_catalog()
        catalog = self._load_catalog()
        catalog.items[item.id] = item

        ordered_ids = [item_id for item_id in catalog.order if item_id != item.id]
        ordered_ids.insert(0, item.id)
        catalog.order = ordered_ids
        catalog.generated_at = utc_now_iso()
        catalog.latest_run_id = None

        self._write_catalog(catalog)

    def ingest_item(
        self,
        data: bytes,
        *,
        filename: Optional[str] = None,
        media_type: str = "application/octet-stream",
        title: Optional[str] = None,
        tags: Sequence[str] = (),
        metadata: Optional[Dict[str, Any]] = None,
        source_uri: str = "unknown",
    ) -> IngestResult:
        """
        Ingest a single raw item into the corpus.

        This is the modality-neutral primitive: callers provide bytes + a media type.
        Higher-level conveniences (ingest_note, ingest_source, and related methods) build on top.

        :param data: Raw item bytes.
        :type data: bytes
        :param filename: Optional filename for the stored item.
        :type filename: str or None
        :param media_type: Internet Assigned Numbers Authority media type for the item.
        :type media_type: str
        :param title: Optional title metadata.
        :type title: str or None
        :param tags: Tags to associate with the item.
        :type tags: Sequence[str]
        :param metadata: Optional metadata mapping.
        :type metadata: dict[str, Any] or None
        :param source_uri: Source uniform resource identifier for provenance.
        :type source_uri: str
        :return: Ingestion result summary.
        :rtype: IngestResult
        :raises ValueError: If markdown is not Unicode Transformation Format 8.
        """
        item_id = str(uuid.uuid4())
        safe_filename = _sanitize_filename(filename) if filename else ""

        if safe_filename:
            safe_filename = _ensure_filename_extension(safe_filename, media_type=media_type)

        if media_type == "text/markdown":
            output_name = f"{item_id}--{safe_filename}" if safe_filename else f"{item_id}.md"
        else:
            if safe_filename:
                output_name = f"{item_id}--{safe_filename}"
            else:
                extension = _preferred_extension_for_media_type(media_type) or ""
                output_name = f"{item_id}{extension}" if extension else f"{item_id}"

        relpath = str(Path(DEFAULT_RAW_DIR) / output_name)
        output_path = self.root / relpath

        resolved_title = title.strip() if isinstance(title, str) and title.strip() else None
        resolved_tags = list(tags)
        metadata_input: Dict[str, Any] = dict(metadata or {})
        if resolved_title and "title" not in metadata_input:
            metadata_input["title"] = resolved_title
        if resolved_tags and "tags" not in metadata_input:
            metadata_input["tags"] = list(resolved_tags)

        if self._hooks is not None:
            mutation = self._hooks.run_ingest_hooks(
                hook_point=HookPoint.before_ingest,
                filename=filename,
                media_type=media_type,
                title=resolved_title,
                tags=list(resolved_tags),
                metadata=dict(metadata_input),
                source_uri=source_uri,
            )
            if mutation.add_tags:
                for tag in mutation.add_tags:
                    if tag not in resolved_tags:
                        resolved_tags.append(tag)

        frontmatter: Dict[str, Any] = {}

        if media_type == "text/markdown":
            try:
                markdown_text = data.decode("utf-8")
            except UnicodeDecodeError as decode_error:
                raise ValueError(
                    "Markdown must be Unicode Transformation Format 8"
                ) from decode_error

            parsed_document = parse_front_matter(markdown_text)
            frontmatter = dict(parsed_document.metadata)

            merged_tags = _merge_tags(resolved_tags, frontmatter.get("tags"))
            if merged_tags:
                frontmatter["tags"] = merged_tags
                resolved_tags = merged_tags

            if resolved_title and not (
                isinstance(frontmatter.get("title"), str) and frontmatter.get("title").strip()
            ):
                frontmatter["title"] = resolved_title

            title_value = frontmatter.get("title")
            if isinstance(title_value, str) and title_value.strip():
                resolved_title = title_value.strip()

            frontmatter = _ensure_biblicus_block(
                frontmatter, item_id=item_id, source_uri=source_uri
            )
            rendered_document = render_front_matter(frontmatter, parsed_document.body)
            data_to_write = rendered_document.encode("utf-8")
        else:
            data_to_write = data

        sha256_digest = _sha256_bytes(data_to_write)
        output_path.write_bytes(data_to_write)

        if media_type != "text/markdown":
            sidecar: Dict[str, Any] = {}
            sidecar["media_type"] = media_type
            if resolved_tags:
                sidecar["tags"] = resolved_tags
            if metadata_input:
                for metadata_key, metadata_value in metadata_input.items():
                    if metadata_key in {"tags", "biblicus"}:
                        continue
                    sidecar[metadata_key] = metadata_value
            sidecar["biblicus"] = {"id": item_id, "source": source_uri}
            _write_sidecar(output_path, sidecar)
            frontmatter = sidecar

        if self._hooks is not None:
            mutation = self._hooks.run_ingest_hooks(
                hook_point=HookPoint.after_ingest,
                filename=filename,
                media_type=media_type,
                title=resolved_title,
                tags=list(resolved_tags),
                metadata=dict(metadata_input),
                source_uri=source_uri,
                item_id=item_id,
                relpath=relpath,
            )
            if mutation.add_tags:
                updated_tags = list(resolved_tags)
                for tag in mutation.add_tags:
                    if tag not in updated_tags:
                        updated_tags.append(tag)
                resolved_tags = updated_tags
                sidecar_metadata = _load_sidecar(output_path)
                sidecar_metadata["tags"] = resolved_tags
                if media_type != "text/markdown":
                    sidecar_metadata["media_type"] = media_type
                sidecar_metadata["biblicus"] = {"id": item_id, "source": source_uri}
                _write_sidecar(output_path, sidecar_metadata)
                frontmatter = _merge_metadata(
                    frontmatter if isinstance(frontmatter, dict) else {}, sidecar_metadata
                )

        created_at = utc_now_iso()
        item_record = CatalogItem(
            id=item_id,
            relpath=relpath,
            sha256=sha256_digest,
            bytes=len(data_to_write),
            media_type=media_type,
            title=resolved_title,
            tags=list(resolved_tags),
            metadata=dict(frontmatter or {}),
            created_at=created_at,
            source_uri=source_uri,
        )
        self._upsert_catalog_item(item_record)

        return IngestResult(item_id=item_id, relpath=relpath, sha256=sha256_digest)

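    # Usage sketch (editor's note, not part of the released corpus.py): a
    # markdown ingest through the byte-level primitive, with `corpus` as in the
    # lifecycle sketch above; the values are made up.
    #
    #     result = corpus.ingest_item(
    #         b"# Meeting notes\n",
    #         filename="meeting",
    #         media_type="text/markdown",
    #         title="Meeting notes",
    #         tags=["meetings"],
    #         source_uri="text",
    #     )
    #     # result.relpath lands in the raw directory as <uuid>--meeting.md, with
    #     # the biblicus id/source block rewritten into the front matter.
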
    def ingest_item_stream(
        self,
        stream,
        *,
        filename: Optional[str] = None,
        media_type: str = "application/octet-stream",
        tags: Sequence[str] = (),
        metadata: Optional[Dict[str, Any]] = None,
        source_uri: str = "unknown",
    ) -> IngestResult:
        """
        Ingest a binary item from a readable stream.

        This method is intended for large non-markdown items. It writes bytes to disk incrementally
        while computing a checksum.

        :param stream: Readable binary stream.
        :type stream: object
        :param filename: Optional filename for the stored item.
        :type filename: str or None
        :param media_type: Internet Assigned Numbers Authority media type for the item.
        :type media_type: str
        :param tags: Tags to associate with the item.
        :type tags: Sequence[str]
        :param metadata: Optional metadata mapping.
        :type metadata: dict[str, Any] or None
        :param source_uri: Source uniform resource identifier for provenance.
        :type source_uri: str
        :return: Ingestion result summary.
        :rtype: IngestResult
        :raises ValueError: If the media_type is text/markdown.
        """
        if media_type == "text/markdown":
            raise ValueError("Stream ingestion is not supported for Markdown")

        item_id = str(uuid.uuid4())
        safe_filename = _sanitize_filename(filename) if filename else ""
        if safe_filename:
            safe_filename = _ensure_filename_extension(safe_filename, media_type=media_type)

        if safe_filename:
            output_name = f"{item_id}--{safe_filename}"
        else:
            extension = _preferred_extension_for_media_type(media_type) or ""
            output_name = f"{item_id}{extension}" if extension else f"{item_id}"

        relpath = str(Path(DEFAULT_RAW_DIR) / output_name)
        output_path = self.root / relpath

        resolved_tags = list(tags)
        metadata_input: Dict[str, Any] = dict(metadata or {})
        if resolved_tags and "tags" not in metadata_input:
            metadata_input["tags"] = list(resolved_tags)

        if self._hooks is not None:
            mutation = self._hooks.run_ingest_hooks(
                hook_point=HookPoint.before_ingest,
                filename=filename,
                media_type=media_type,
                title=None,
                tags=list(resolved_tags),
                metadata=dict(metadata_input),
                source_uri=source_uri,
            )
            if mutation.add_tags:
                for tag in mutation.add_tags:
                    if tag not in resolved_tags:
                        resolved_tags.append(tag)

        write_result = _write_stream_and_hash(stream, output_path)
        sha256_digest = str(write_result["sha256"])
        bytes_written = int(write_result["bytes_written"])

        sidecar: Dict[str, Any] = {}
        sidecar["media_type"] = media_type
        if resolved_tags:
            sidecar["tags"] = resolved_tags
        if metadata_input:
            for metadata_key, metadata_value in metadata_input.items():
                if metadata_key in {"tags", "biblicus"}:
                    continue
                sidecar[metadata_key] = metadata_value
        sidecar["biblicus"] = {"id": item_id, "source": source_uri}
        _write_sidecar(output_path, sidecar)

        if self._hooks is not None:
            mutation = self._hooks.run_ingest_hooks(
                hook_point=HookPoint.after_ingest,
                filename=filename,
                media_type=media_type,
                title=None,
                tags=list(resolved_tags),
                metadata=dict(metadata_input),
                source_uri=source_uri,
                item_id=item_id,
                relpath=relpath,
            )
            if mutation.add_tags:
                updated_tags = list(resolved_tags)
                for tag in mutation.add_tags:
                    if tag not in updated_tags:
                        updated_tags.append(tag)
                resolved_tags = updated_tags
                sidecar["tags"] = resolved_tags
                _write_sidecar(output_path, sidecar)

        created_at = utc_now_iso()
        item_record = CatalogItem(
            id=item_id,
            relpath=relpath,
            sha256=sha256_digest,
            bytes=bytes_written,
            media_type=media_type,
            title=None,
            tags=list(resolved_tags),
            metadata=dict(sidecar or {}),
            created_at=created_at,
            source_uri=source_uri,
        )
        self._upsert_catalog_item(item_record)

        return IngestResult(item_id=item_id, relpath=relpath, sha256=sha256_digest)

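    # Usage sketch (editor's note, not part of the released corpus.py): stream
    # ingestion keeps memory flat for large binaries; markdown is rejected
    # because its front matter must be rewritten in memory. Paths are made up.
    #
    #     with open("/data/talk.ogg", "rb") as handle:
    #         result = corpus.ingest_item_stream(
    #             handle,
    #             filename="talk.ogg",
    #             media_type="audio/ogg",
    #             tags=["audio"],
    #             source_uri="file:///data/talk.ogg",
    #         )
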
    def ingest_note(
        self,
        text: str,
        *,
        title: Optional[str] = None,
        tags: Sequence[str] = (),
        source_uri: str = "text",
    ) -> IngestResult:
        """
        Ingest a text note as Markdown.

        :param text: Note content.
        :type text: str
        :param title: Optional title metadata.
        :type title: str or None
        :param tags: Tags to associate with the note.
        :type tags: Sequence[str]
        :param source_uri: Source uniform resource identifier for provenance.
        :type source_uri: str
        :return: Ingestion result summary.
        :rtype: IngestResult
        """
        data = text.encode("utf-8")
        return self.ingest_item(
            data,
            filename=None,
            media_type="text/markdown",
            title=title,
            tags=tags,
            metadata=None,
            source_uri=source_uri,
        )

    def ingest_source(
        self,
        source: str | Path,
        *,
        tags: Sequence[str] = (),
        source_uri: Optional[str] = None,
    ) -> IngestResult:
        """
        Ingest a file path or uniform resource locator source.

        :param source: File path or uniform resource locator.
        :type source: str or Path
        :param tags: Tags to associate with the item.
        :type tags: Sequence[str]
        :param source_uri: Optional override for the source uniform resource identifier.
        :type source_uri: str or None
        :return: Ingestion result summary.
        :rtype: IngestResult
        """
        candidate_path = Path(source) if isinstance(source, str) and "://" not in source else None
        if isinstance(source, Path) or (candidate_path is not None and candidate_path.exists()):
            path = source if isinstance(source, Path) else candidate_path
            assert isinstance(path, Path)
            path = path.resolve()
            filename = path.name
            media_type, _ = mimetypes.guess_type(filename)
            media_type = media_type or "application/octet-stream"
            if path.suffix.lower() in {".md", ".markdown"}:
                media_type = "text/markdown"
            if media_type == "text/markdown":
                return self.ingest_item(
                    path.read_bytes(),
                    filename=filename,
                    media_type=media_type,
                    title=None,
                    tags=tags,
                    metadata=None,
                    source_uri=source_uri or path.as_uri(),
                )
            with path.open("rb") as handle:
                return self.ingest_item_stream(
                    handle,
                    filename=filename,
                    media_type=media_type,
                    tags=tags,
                    metadata=None,
                    source_uri=source_uri or path.as_uri(),
                )

        payload = load_source(source, source_uri=source_uri)
        return self.ingest_item(
            payload.data,
            filename=payload.filename,
            media_type=payload.media_type,
            title=None,
            tags=tags,
            metadata=None,
            source_uri=payload.source_uri,
        )

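    # Usage sketch (editor's note, not part of the released corpus.py): the two
    # conveniences above wrap the primitives; the sources shown are made up.
    #
    #     corpus.ingest_note("remember the milk", title="Todo", tags=["todo"])
    #     corpus.ingest_source(Path("/data/report.pdf"))         # streamed binary
    #     corpus.ingest_source("https://example.com/page.html")  # via load_source
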
    def import_tree(self, source_root: Path, *, tags: Sequence[str] = ()) -> Dict[str, int]:
        """
        Import a folder tree into the corpus, preserving relative paths and provenance.

        Imported content is stored under the raw directory in a dedicated import namespace so that
        operators can inspect and back up imported content as a structured tree.

        :param source_root: Root directory of the folder tree to import.
        :type source_root: Path
        :param tags: Tags to associate with imported items.
        :type tags: Sequence[str]
        :return: Import statistics.
        :rtype: dict[str, int]
        :raises FileNotFoundError: If the source_root does not exist.
        :raises ValueError: If a markdown file cannot be decoded as Unicode Transformation Format 8.
        """
        source_root = source_root.resolve()
        if not source_root.is_dir():
            raise FileNotFoundError(f"Import source root does not exist: {source_root}")

        ignore_spec = load_corpus_ignore_spec(self.root)
        import_id = str(uuid.uuid4())
        stats = {"scanned": 0, "ignored": 0, "imported": 0}

        for source_path in sorted(source_root.rglob("*")):
            if not source_path.is_file():
                continue
            relative_source_path = source_path.relative_to(source_root).as_posix()
            stats["scanned"] += 1
            if ignore_spec.matches(relative_source_path):
                stats["ignored"] += 1
                continue
            self._import_file(
                source_path=source_path,
                import_id=import_id,
                relative_source_path=relative_source_path,
                tags=tags,
            )
            stats["imported"] += 1

        return stats

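    # Usage sketch (editor's note, not part of the released corpus.py): a tree
    # import lands under <raw_dir>/imports/<import_id>/... and reports counters;
    # the numbers below are illustrative only.
    #
    #     stats = corpus.import_tree(Path("/data/notes"), tags=["imported"])
    #     # e.g. {"scanned": 12, "ignored": 2, "imported": 10}
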
    def _import_file(
        self,
        *,
        source_path: Path,
        import_id: str,
        relative_source_path: str,
        tags: Sequence[str],
    ) -> None:
        """
        Import a single file into the corpus under an import namespace.

        :param source_path: Source file path to import.
        :type source_path: Path
        :param import_id: Import identifier.
        :type import_id: str
        :param relative_source_path: Relative path within the imported tree.
        :type relative_source_path: str
        :param tags: Tags to apply.
        :type tags: Sequence[str]
        :return: None.
        :rtype: None
        :raises ValueError: If a markdown file cannot be decoded as Unicode Transformation Format 8.
        """
        item_id = str(uuid.uuid4())
        destination_relpath = str(
            Path(DEFAULT_RAW_DIR) / "imports" / import_id / relative_source_path
        )
        destination_path = (self.root / destination_relpath).resolve()
        destination_path.parent.mkdir(parents=True, exist_ok=True)

        raw_bytes = source_path.read_bytes()
        sha256_digest = _sha256_bytes(raw_bytes)

        media_type, _ = mimetypes.guess_type(source_path.name)
        media_type = media_type or "application/octet-stream"
        if source_path.suffix.lower() in {".md", ".markdown"}:
            media_type = "text/markdown"

        title: Optional[str] = None
        frontmatter_metadata: Dict[str, Any] = {}
        if media_type == "text/markdown":
            try:
                text = raw_bytes.decode("utf-8")
            except UnicodeDecodeError as decode_error:
                raise ValueError(
                    f"Markdown file must be Unicode Transformation Format 8: {relative_source_path}"
                ) from decode_error
            parsed_document = parse_front_matter(text)
            frontmatter_metadata = dict(parsed_document.metadata)
            title_value = frontmatter_metadata.get("title")
            if isinstance(title_value, str) and title_value.strip():
                title = title_value.strip()

        destination_path.write_bytes(raw_bytes)

        sidecar: Dict[str, Any] = {}
        if tags:
            sidecar["tags"] = [t.strip() for t in tags if isinstance(t, str) and t.strip()]
        if media_type != "text/markdown":
            sidecar["media_type"] = media_type
        sidecar["biblicus"] = {"id": item_id, "source": source_path.as_uri()}
        _write_sidecar(destination_path, sidecar)

        merged_metadata = _merge_metadata(frontmatter_metadata, sidecar)
        resolved_tags = _merge_tags([], merged_metadata.get("tags"))

        item_record = CatalogItem(
            id=item_id,
            relpath=destination_relpath,
            sha256=sha256_digest,
            bytes=len(raw_bytes),
            media_type=media_type,
            title=title,
            tags=list(resolved_tags),
            metadata=dict(merged_metadata or {}),
            created_at=utc_now_iso(),
            source_uri=source_path.as_uri(),
        )
        self._upsert_catalog_item(item_record)

    def list_items(self, *, limit: int = 50) -> List[CatalogItem]:
        """
        List items from the catalog.

        :param limit: Maximum number of items to return.
        :type limit: int
        :return: Catalog items ordered by recency.
        :rtype: list[CatalogItem]
        """
        catalog = self._load_catalog()
        ordered_ids = catalog.order[:limit] if catalog.order else list(catalog.items.keys())[:limit]
        collected_items: List[CatalogItem] = []
        for item_id in ordered_ids:
            item = catalog.items.get(item_id)
            if item is not None:
                collected_items.append(item)
        return collected_items

    def get_item(self, item_id: str) -> CatalogItem:
        """
        Fetch a catalog item by identifier.

        :param item_id: Item identifier.
        :type item_id: str
        :return: Catalog item.
        :rtype: CatalogItem
        :raises KeyError: If the item identifier is unknown.
        """
        catalog = self._load_catalog()
        item = catalog.items.get(item_id)
        if item is None:
            raise KeyError(f"Unknown item identifier: {item_id}")
        return item

    def create_crawl_id(self) -> str:
        """
        Create a new crawl identifier.

        :return: Crawl identifier.
        :rtype: str
        """
        return str(uuid.uuid4())

    def ingest_crawled_payload(
        self,
        *,
        crawl_id: str,
        relative_path: str,
        data: bytes,
        filename: str,
        media_type: str,
        source_uri: str,
        tags: Sequence[str],
    ) -> None:
        """
        Ingest a crawled payload under a crawl import namespace.

        :param crawl_id: Crawl identifier used to group crawled artifacts.
        :type crawl_id: str
        :param relative_path: Relative path within the crawl prefix.
        :type relative_path: str
        :param data: Raw payload bytes.
        :type data: bytes
        :param filename: Suggested filename from the payload metadata.
        :type filename: str
        :param media_type: Internet Assigned Numbers Authority media type.
        :type media_type: str
        :param source_uri: Source uniform resource identifier (typically an http or https uniform resource locator).
        :type source_uri: str
        :param tags: Tags to attach to the stored item.
        :type tags: Sequence[str]
        :return: None.
        :rtype: None
        """
        _ = filename
        item_id = str(uuid.uuid4())
        destination_relpath = str(Path(DEFAULT_RAW_DIR) / "imports" / "crawl" / crawl_id / relative_path)
        destination_path = (self.root / destination_relpath).resolve()
        destination_path.parent.mkdir(parents=True, exist_ok=True)
        destination_path.write_bytes(data)

        sha256_digest = _sha256_bytes(data)

        sidecar: Dict[str, Any] = {}
        sidecar["tags"] = [t.strip() for t in tags if isinstance(t, str) and t.strip()]
        sidecar["media_type"] = media_type
        sidecar["biblicus"] = {"id": item_id, "source": source_uri}
        _write_sidecar(destination_path, sidecar)

        merged_metadata = _merge_metadata({}, sidecar)
        resolved_tags = _merge_tags([], merged_metadata.get("tags"))

        item_record = CatalogItem(
            id=item_id,
            relpath=destination_relpath,
            sha256=sha256_digest,
            bytes=len(data),
            media_type=media_type,
            title=None,
            tags=list(resolved_tags),
            metadata=dict(merged_metadata or {}),
            created_at=utc_now_iso(),
            source_uri=source_uri,
        )
        self._upsert_catalog_item(item_record)

    def reindex(self) -> Dict[str, int]:
        """
        Rebuild/refresh the corpus catalog from the current on-disk corpus contents.

        This is the core "mutable corpus with re-indexing" loop: edit raw files or sidecars,
        then reindex to refresh the derived catalog.

        :return: Reindex statistics.
        :rtype: dict[str, int]
        :raises ValueError: If a markdown file cannot be decoded as Unicode Transformation Format 8.
        """
        self._init_catalog()
        existing_catalog = self._load_catalog()
        stats = {"scanned": 0, "skipped": 0, "inserted": 0, "updated": 0}

        content_files = [
            content_path
            for content_path in self.raw_dir.rglob("*")
            if content_path.is_file() and not content_path.name.endswith(SIDECAR_SUFFIX)
        ]

        new_items: Dict[str, CatalogItem] = {}

        for content_path in content_files:
            stats["scanned"] += 1
            relpath = str(content_path.relative_to(self.root))
            data = content_path.read_bytes()
            sha256 = _sha256_bytes(data)

            media_type, _ = mimetypes.guess_type(content_path.name)
            media_type = media_type or "application/octet-stream"

            sidecar = _load_sidecar(content_path)

            frontmatter: Dict[str, Any] = {}
            if content_path.suffix.lower() in {".md", ".markdown"}:
                try:
                    text = data.decode("utf-8")
                except UnicodeDecodeError as decode_error:
                    raise ValueError(
                        f"Markdown file must be Unicode Transformation Format 8: {relpath}"
                    ) from decode_error
                parsed_document = parse_front_matter(text)
                frontmatter = parsed_document.metadata
                media_type = "text/markdown"

            merged_metadata = _merge_metadata(frontmatter, sidecar)

            if media_type != "text/markdown":
                media_type_override = merged_metadata.get("media_type")
                if isinstance(media_type_override, str) and media_type_override.strip():
                    media_type = media_type_override.strip()

            item_id: Optional[str] = None
            biblicus_block = merged_metadata.get("biblicus")
            if isinstance(biblicus_block, dict):
                biblicus_id = biblicus_block.get("id")
                if isinstance(biblicus_id, str):
                    try:
                        item_id = str(uuid.UUID(biblicus_id))
                    except ValueError:
                        item_id = None

            if item_id is None:
                item_id = _parse_uuid_prefix(content_path.name)

            if item_id is None:
                stats["skipped"] += 1
                continue

            title: Optional[str] = None
            title_value = merged_metadata.get("title")
            if isinstance(title_value, str) and title_value.strip():
                title = title_value.strip()

            resolved_tags = _merge_tags([], merged_metadata.get("tags"))

            source_uri: Optional[str] = None
            if isinstance(biblicus_block, dict):
                source_value = biblicus_block.get("source")
                if isinstance(source_value, str) and source_value.strip():
                    source_uri = source_value.strip()

            previous_item = existing_catalog.items.get(item_id)
            created_at = previous_item.created_at if previous_item is not None else utc_now_iso()
            source_uri = source_uri or (
                previous_item.source_uri if previous_item is not None else None
            )

            if previous_item is None:
                stats["inserted"] += 1
            else:
                stats["updated"] += 1

            new_items[item_id] = CatalogItem(
                id=item_id,
                relpath=relpath,
                sha256=sha256,
                bytes=len(data),
                media_type=media_type,
                title=title,
                tags=list(resolved_tags),
                metadata=dict(merged_metadata or {}),
                created_at=created_at,
                source_uri=source_uri,
            )

        order = sorted(
            new_items.keys(),
            key=lambda item_id: (new_items[item_id].created_at, item_id),
            reverse=True,
        )

        catalog = CorpusCatalog(
            schema_version=SCHEMA_VERSION,
            generated_at=utc_now_iso(),
            corpus_uri=normalize_corpus_uri(self.root),
            raw_dir=DEFAULT_RAW_DIR,
            latest_run_id=None,
            items=new_items,
            order=order,
        )
        self._write_catalog(catalog)

        return stats

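    # Usage sketch (editor's note, not part of the released corpus.py): the
    # edit-then-reindex loop. Edit an existing raw file (whose name carries its
    # item identifier) or its sidecar, then rebuild the derived catalog;
    # counters below are illustrative only.
    #
    #     # ...edit a file under corpus.raw_dir or its sidecar on disk...
    #     stats = corpus.reindex()
    #     # e.g. {"scanned": 10, "skipped": 0, "inserted": 1, "updated": 9}
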
    @property
    def name(self) -> str:
        """
        Return the corpus name (directory basename).

        :return: Corpus name.
        :rtype: str
        """
        return self.root.name

    def purge(self, *, confirm: str) -> None:
        """
        Delete all ingested items and derived files, preserving corpus identity/config.

        :param confirm: Confirmation string matching the corpus name.
        :type confirm: str
        :return: None.
        :rtype: None
        :raises ValueError: If the confirmation does not match.
        """
        expected = self.name
        if confirm != expected:
            raise ValueError(
                f"Confirmation mismatch: pass --confirm {expected!r} to purge this corpus"
            )

        if self.raw_dir.exists():
            shutil.rmtree(self.raw_dir)
        self.raw_dir.mkdir(parents=True, exist_ok=True)

        for path in self.meta_dir.iterdir():
            if path.name == "config.json":
                continue
            if path.is_dir():
                shutil.rmtree(path)
            else:
                path.unlink()
        self._init_catalog()
        self._write_catalog(
            CorpusCatalog(
                schema_version=SCHEMA_VERSION,
                generated_at=utc_now_iso(),
                corpus_uri=normalize_corpus_uri(self.root),
                raw_dir=DEFAULT_RAW_DIR,
                latest_run_id=None,
                items={},
                order=[],
            )
        )
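

# Usage sketch (editor's note, not part of the released corpus.py): purge
# demands the corpus directory name as the confirmation string; raw items and
# derived metadata are removed while config.json is preserved.
#
#     corpus.purge(confirm=corpus.name)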