biblicus 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
biblicus/corpus.py ADDED
@@ -0,0 +1,952 @@
1
+ """
2
+ Corpus storage and ingestion for Biblicus.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import hashlib
8
+ import json
9
+ import mimetypes
10
+ import shutil
11
+ import uuid
12
+ from pathlib import Path
13
+ from typing import Any, Dict, List, Optional, Sequence
14
+
15
+ import yaml
16
+
17
+ from .constants import CORPUS_DIR_NAME, DEFAULT_RAW_DIR, RUNS_DIR_NAME, SCHEMA_VERSION, SIDECAR_SUFFIX
18
+ from .frontmatter import parse_front_matter, render_front_matter
19
+ from .models import CatalogItem, CorpusCatalog, CorpusConfig, IngestResult, RetrievalRun
20
+ from .sources import load_source
21
+ from .time import utc_now_iso
22
+ from .uris import normalize_corpus_uri, corpus_ref_to_path
23
+
24
+
25
+ def _sha256_bytes(data: bytes) -> str:
26
+ """
27
+ Compute a Secure Hash Algorithm 256 digest for byte content.
28
+
29
+ :param data: Input bytes.
30
+ :type data: bytes
31
+ :return: Secure Hash Algorithm 256 hex digest.
32
+ :rtype: str
33
+ """
34
+
35
+ return hashlib.sha256(data).hexdigest()
36
+
37
+
38
+ def _sanitize_filename(name: str) -> str:
39
+ """
40
+ Sanitize a filename into a portable, filesystem-friendly form.
41
+
42
+ :param name: Raw filename.
43
+ :type name: str
44
+ :return: Sanitized filename.
45
+ :rtype: str
46
+ """
47
+
48
+ allowed_characters = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-._() ")
49
+ sanitized_name = "".join(
50
+ (character if character in allowed_characters else "_") for character in name
51
+ ).strip()
52
+ return sanitized_name or "file"
53
+
54
+
55
+ def _preferred_extension_for_media_type(media_type: str) -> Optional[str]:
56
+ """
57
+ Return a preferred filename extension for a media type.
58
+
59
+ :param media_type: Internet Assigned Numbers Authority media type.
60
+ :type media_type: str
61
+ :return: Preferred extension or None.
62
+ :rtype: str or None
63
+ """
64
+
65
+ media_type_overrides = {
66
+ "image/jpeg": ".jpg",
67
+ }
68
+ if media_type in media_type_overrides:
69
+ return media_type_overrides[media_type]
70
+ return mimetypes.guess_extension(media_type)
71
+
72
+
73
+ def _ensure_filename_extension(filename: str, *, media_type: str) -> str:
74
+ """
75
+ Ensure a usable filename extension for a media type.
76
+
77
+ :param filename: Raw filename.
78
+ :type filename: str
79
+ :param media_type: Internet Assigned Numbers Authority media type.
80
+ :type media_type: str
81
+ :return: Filename with a compatible extension.
82
+ :rtype: str
83
+ """
84
+
85
+ raw_name = filename.strip()
86
+
87
+ if media_type == "text/markdown":
88
+ if raw_name.lower().endswith((".md", ".markdown")):
89
+ return raw_name
90
+ return raw_name + ".md"
91
+
92
+ ext = _preferred_extension_for_media_type(media_type)
93
+ if not ext:
94
+ return raw_name
95
+ if raw_name.lower().endswith(ext.lower()):
96
+ return raw_name
97
+ return raw_name + ext
98
+
99
+
100
+ def _merge_tags(explicit: Sequence[str], from_frontmatter: Any) -> List[str]:
101
+ """
102
+ Merge tags from explicit input and front matter values.
103
+
104
+ :param explicit: Explicit tags provided by callers.
105
+ :type explicit: Sequence[str]
106
+ :param from_frontmatter: Tags from front matter.
107
+ :type from_frontmatter: Any
108
+ :return: Deduplicated tag list preserving order.
109
+ :rtype: list[str]
110
+ """
111
+
112
+ merged_tags: List[str] = []
113
+
114
+ for explicit_tag in explicit:
115
+ cleaned_tag = explicit_tag.strip()
116
+ if cleaned_tag:
117
+ merged_tags.append(cleaned_tag)
118
+
119
+ if isinstance(from_frontmatter, str):
120
+ merged_tags.append(from_frontmatter)
121
+ elif isinstance(from_frontmatter, list):
122
+ for item in from_frontmatter:
123
+ if isinstance(item, str) and item.strip():
124
+ merged_tags.append(item.strip())
125
+
126
+ seen_tags = set()
127
+ deduplicated_tags: List[str] = []
128
+ for tag_value in merged_tags:
129
+ if tag_value not in seen_tags:
130
+ seen_tags.add(tag_value)
131
+ deduplicated_tags.append(tag_value)
132
+ return deduplicated_tags
133
+
134
+
135
def _sidecar_path_for(content_path: Path) -> Path:
    """
    Return the sidecar metadata path that accompanies a content file.

    :param content_path: Path to the content file.
    :type content_path: Path
    :return: Sidecar path (content filename plus the sidecar suffix).
    :rtype: Path
    """

    sidecar_name = content_path.name + SIDECAR_SUFFIX
    return content_path.with_name(sidecar_name)
146
+
147
+
148
def _load_sidecar(content_path: Path) -> Dict[str, Any]:
    """
    Load sidecar metadata for a content file.

    :param content_path: Path to the content file.
    :type content_path: Path
    :return: Parsed sidecar metadata (empty when no sidecar file exists).
    :rtype: dict[str, Any]
    :raises ValueError: If the sidecar content is not a mapping.
    """

    sidecar_path = _sidecar_path_for(content_path)
    if not sidecar_path.is_file():
        return {}
    raw_text = sidecar_path.read_text(encoding="utf-8")
    loaded = yaml.safe_load(raw_text) or {}
    if not isinstance(loaded, dict):
        raise ValueError(f"Sidecar metadata must be a mapping/object: {sidecar_path}")
    return dict(loaded)
166
+
167
+
168
def _write_sidecar(content_path: Path, metadata: Dict[str, Any]) -> None:
    """
    Serialize metadata as YAML into the sidecar file next to a content file.

    :param content_path: Path to the content file.
    :type content_path: Path
    :param metadata: Metadata to serialize.
    :type metadata: dict[str, Any]
    :return: None.
    :rtype: None
    """

    sidecar_path = _sidecar_path_for(content_path)
    serialized = yaml.safe_dump(
        metadata,
        sort_keys=False,
        allow_unicode=True,
        default_flow_style=False,
    )
    sidecar_path.write_text(serialized.strip() + "\n", encoding="utf-8")
187
+
188
+
189
+ def _ensure_biblicus_block(metadata: Dict[str, Any], *, item_id: str, source_uri: str) -> Dict[str, Any]:
190
+ """
191
+ Ensure the biblicus metadata block exists and is populated.
192
+
193
+ :param metadata: Existing metadata.
194
+ :type metadata: dict[str, Any]
195
+ :param item_id: Item identifier to store.
196
+ :type item_id: str
197
+ :param source_uri: Source uniform resource identifier to store.
198
+ :type source_uri: str
199
+ :return: Updated metadata mapping.
200
+ :rtype: dict[str, Any]
201
+ """
202
+ updated_metadata = dict(metadata)
203
+ existing_biblicus = updated_metadata.get("biblicus")
204
+ if not isinstance(existing_biblicus, dict):
205
+ existing_biblicus = {}
206
+ biblicus_block = dict(existing_biblicus)
207
+ biblicus_block["id"] = item_id
208
+ biblicus_block["source"] = source_uri
209
+ updated_metadata["biblicus"] = biblicus_block
210
+ return updated_metadata
211
+
212
+
213
+ def _parse_uuid_prefix(filename: str) -> Optional[str]:
214
+ """
215
+ Extract a universally unique identifier prefix from a filename, if present.
216
+
217
+ :param filename: Filename to inspect.
218
+ :type filename: str
219
+ :return: Universally unique identifier string or None.
220
+ :rtype: str or None
221
+ """
222
+ if len(filename) < 36:
223
+ return None
224
+ prefix = filename[:36]
225
+ try:
226
+ return str(uuid.UUID(prefix))
227
+ except ValueError:
228
+ return None
229
+
230
+
231
def _merge_metadata(front: Dict[str, Any], side: Dict[str, Any]) -> Dict[str, Any]:
    """
    Merge front matter and sidecar metadata, with sidecar values taking
    precedence.

    The ``biblicus`` blocks are shallow-merged, ``tags`` from both sources are
    concatenated and deduplicated, and every other sidecar key overwrites its
    front matter counterpart.

    :param front: Front matter metadata.
    :type front: dict[str, Any]
    :param side: Sidecar metadata.
    :type side: dict[str, Any]
    :return: Merged metadata.
    :rtype: dict[str, Any]
    """

    merged: Dict[str, Any] = dict(front)

    biblicus_blocks = [
        candidate
        for candidate in (front.get("biblicus"), side.get("biblicus"))
        if isinstance(candidate, dict)
    ]
    if biblicus_blocks:
        combined_block: Dict[str, Any] = {}
        for block in biblicus_blocks:
            combined_block.update(block)
        merged["biblicus"] = combined_block

    combined_tags = _merge_tags(_merge_tags([], front.get("tags")), side.get("tags"))
    if combined_tags:
        merged["tags"] = combined_tags

    for key, value in side.items():
        if key not in {"biblicus", "tags"}:
            merged[key] = value

    return merged
264
+
265
+
266
class Corpus:
    """
    Local corpus manager for Biblicus.

    :ivar root: Corpus root directory.
    :vartype root: Path
    :ivar meta_dir: Metadata directory under the corpus root.
    :vartype meta_dir: Path
    :ivar raw_dir: Raw item directory under the corpus root.
    :vartype raw_dir: Path
    :ivar config: Parsed corpus config, if present.
    :vartype config: CorpusConfig or None
    """

    def __init__(self, root: Path):
        """
        Wrap a filesystem directory as a corpus.

        :param root: Corpus root directory.
        :type root: Path
        """

        self.root = root
        self.meta_dir = root / CORPUS_DIR_NAME
        self.raw_dir = root / DEFAULT_RAW_DIR
        # Config may legitimately be absent until init() has been run.
        self.config = self._load_config()
292
+
293
+ @property
294
+ def uri(self) -> str:
295
+ """
296
+ Return the canonical uniform resource identifier for the corpus root.
297
+
298
+ :return: Corpus uniform resource identifier.
299
+ :rtype: str
300
+ """
301
+
302
+ return self.root.as_uri()
303
+
304
+ def _load_config(self) -> Optional[CorpusConfig]:
305
+ """
306
+ Load the corpus config if it exists.
307
+
308
+ :return: Parsed corpus config or None.
309
+ :rtype: CorpusConfig or None
310
+ :raises ValueError: If the config schema is invalid.
311
+ """
312
+
313
+ path = self.meta_dir / "config.json"
314
+ if not path.is_file():
315
+ return None
316
+ data = json.loads(path.read_text(encoding="utf-8"))
317
+ return CorpusConfig.model_validate(data)
318
+
319
+ @classmethod
320
+ def find(cls, start: Path) -> "Corpus":
321
+ """
322
+ Locate a corpus by searching upward from a path.
323
+
324
+ :param start: Starting path to search.
325
+ :type start: Path
326
+ :return: Located corpus instance.
327
+ :rtype: Corpus
328
+ :raises FileNotFoundError: If no corpus config is found.
329
+ """
330
+
331
+ start = start.resolve()
332
+ for candidate in [start, *start.parents]:
333
+ if (candidate / CORPUS_DIR_NAME / "config.json").is_file():
334
+ return cls(candidate)
335
+ raise FileNotFoundError(
336
+ f"Not a Biblicus corpus (no {CORPUS_DIR_NAME}/config.json found from {start})"
337
+ )
338
+
339
+ @classmethod
340
+ def open(cls, ref: str | Path) -> "Corpus":
341
+ """
342
+ Open a corpus from a path or uniform resource identifier reference.
343
+
344
+ :param ref: Filesystem path or file:// uniform resource identifier.
345
+ :type ref: str or Path
346
+ :return: Opened corpus instance.
347
+ :rtype: Corpus
348
+ """
349
+
350
+ return cls.find(corpus_ref_to_path(ref))
351
+
352
+ @classmethod
353
+ def init(cls, root: Path, *, force: bool = False) -> "Corpus":
354
+ """
355
+ Initialize a new corpus on disk.
356
+
357
+ :param root: Corpus root directory.
358
+ :type root: Path
359
+ :param force: Whether to overwrite existing config.
360
+ :type force: bool
361
+ :return: Initialized corpus instance.
362
+ :rtype: Corpus
363
+ :raises FileExistsError: If the corpus already exists and force is False.
364
+ """
365
+
366
+ root = root.resolve()
367
+ corpus = cls(root)
368
+
369
+ corpus.meta_dir.mkdir(parents=True, exist_ok=True)
370
+ corpus.raw_dir.mkdir(parents=True, exist_ok=True)
371
+
372
+ config_path = corpus.meta_dir / "config.json"
373
+ if config_path.exists() and not force:
374
+ raise FileExistsError(f"Corpus already exists at {root}")
375
+
376
+ config = CorpusConfig(
377
+ schema_version=SCHEMA_VERSION,
378
+ created_at=utc_now_iso(),
379
+ corpus_uri=normalize_corpus_uri(root),
380
+ raw_dir=DEFAULT_RAW_DIR,
381
+ )
382
+ config_path.write_text(config.model_dump_json(indent=2) + "\n", encoding="utf-8")
383
+
384
+ corpus._init_catalog()
385
+ return corpus
386
+
387
+ @property
388
+ def catalog_path(self) -> Path:
389
+ """
390
+ Return the path to the corpus catalog file.
391
+
392
+ :return: Catalog file path.
393
+ :rtype: Path
394
+ """
395
+
396
+ return self.meta_dir / "catalog.json"
397
+
398
+ def _init_catalog(self) -> None:
399
+ """
400
+ Initialize the catalog if it does not already exist.
401
+
402
+ :return: None.
403
+ :rtype: None
404
+ """
405
+
406
+ if self.catalog_path.exists():
407
+ return
408
+ catalog = CorpusCatalog(
409
+ schema_version=SCHEMA_VERSION,
410
+ generated_at=utc_now_iso(),
411
+ corpus_uri=normalize_corpus_uri(self.root),
412
+ raw_dir=DEFAULT_RAW_DIR,
413
+ latest_run_id=None,
414
+ items={},
415
+ order=[],
416
+ )
417
+ self._write_catalog(catalog)
418
+
419
+ def _load_catalog(self) -> CorpusCatalog:
420
+ """
421
+ Read and validate the corpus catalog file.
422
+
423
+ :return: Parsed corpus catalog.
424
+ :rtype: CorpusCatalog
425
+ :raises FileNotFoundError: If the catalog file does not exist.
426
+ :raises ValueError: If the catalog schema is invalid.
427
+ """
428
+
429
+ if not self.catalog_path.is_file():
430
+ raise FileNotFoundError(f"Missing corpus catalog: {self.catalog_path}")
431
+ catalog_data = json.loads(self.catalog_path.read_text(encoding="utf-8"))
432
+ return CorpusCatalog.model_validate(catalog_data)
433
+
434
+ def load_catalog(self) -> CorpusCatalog:
435
+ """
436
+ Load the current corpus catalog.
437
+
438
+ :return: Parsed corpus catalog.
439
+ :rtype: CorpusCatalog
440
+ :raises FileNotFoundError: If the catalog file does not exist.
441
+ :raises ValueError: If the catalog schema is invalid.
442
+ """
443
+
444
+ return self._load_catalog()
445
+
446
+ def _write_catalog(self, catalog: CorpusCatalog) -> None:
447
+ """
448
+ Atomically write a corpus catalog to disk.
449
+
450
+ :param catalog: Catalog to persist.
451
+ :type catalog: CorpusCatalog
452
+ :return: None.
453
+ :rtype: None
454
+ """
455
+
456
+ temp_path = self.catalog_path.with_suffix(".json.tmp")
457
+ temp_path.write_text(catalog.model_dump_json(indent=2) + "\n", encoding="utf-8")
458
+ temp_path.replace(self.catalog_path)
459
+
460
+ @property
461
+ def runs_dir(self) -> Path:
462
+ """
463
+ Location of retrieval run manifests.
464
+
465
+ :return: Path to the runs directory.
466
+ :rtype: Path
467
+ """
468
+
469
+ return self.meta_dir / RUNS_DIR_NAME
470
+
471
+ def _ensure_runs_dir(self) -> None:
472
+ """
473
+ Ensure the retrieval runs directory exists.
474
+
475
+ :return: None.
476
+ :rtype: None
477
+ """
478
+
479
+ self.runs_dir.mkdir(parents=True, exist_ok=True)
480
+
481
+ def write_run(self, run: RetrievalRun) -> None:
482
+ """
483
+ Persist a retrieval run manifest and update the catalog pointer.
484
+
485
+ :param run: Run manifest to persist.
486
+ :type run: RetrievalRun
487
+ :return: None.
488
+ :rtype: None
489
+ """
490
+
491
+ self._ensure_runs_dir()
492
+ path = self.runs_dir / f"{run.run_id}.json"
493
+ path.write_text(run.model_dump_json(indent=2) + "\n", encoding="utf-8")
494
+ catalog = self._load_catalog()
495
+ catalog.latest_run_id = run.run_id
496
+ catalog.generated_at = utc_now_iso()
497
+ self._write_catalog(catalog)
498
+
499
+ def load_run(self, run_id: str) -> RetrievalRun:
500
+ """
501
+ Load a retrieval run manifest by identifier.
502
+
503
+ :param run_id: Run identifier.
504
+ :type run_id: str
505
+ :return: Parsed run manifest.
506
+ :rtype: RetrievalRun
507
+ :raises FileNotFoundError: If the run manifest does not exist.
508
+ """
509
+
510
+ path = self.runs_dir / f"{run_id}.json"
511
+ if not path.is_file():
512
+ raise FileNotFoundError(f"Missing run manifest: {path}")
513
+ data = json.loads(path.read_text(encoding="utf-8"))
514
+ return RetrievalRun.model_validate(data)
515
+
516
+ @property
517
+ def latest_run_id(self) -> Optional[str]:
518
+ """
519
+ Latest retrieval run identifier recorded in the catalog.
520
+
521
+ :return: Latest run identifier or None.
522
+ :rtype: str or None
523
+ """
524
+
525
+ return self._load_catalog().latest_run_id
526
+
527
+ def _upsert_catalog_item(self, item: CatalogItem) -> None:
528
+ """
529
+ Upsert a catalog item and reset the latest run pointer.
530
+
531
+ :param item: Catalog item to insert or update.
532
+ :type item: CatalogItem
533
+ :return: None.
534
+ :rtype: None
535
+ """
536
+
537
+ self._init_catalog()
538
+ catalog = self._load_catalog()
539
+ catalog.items[item.id] = item
540
+
541
+ ordered_ids = [item_id for item_id in catalog.order if item_id != item.id]
542
+ ordered_ids.insert(0, item.id)
543
+ catalog.order = ordered_ids
544
+ catalog.generated_at = utc_now_iso()
545
+ catalog.latest_run_id = None
546
+
547
+ self._write_catalog(catalog)
548
+
549
    def ingest_item(
        self,
        data: bytes,
        *,
        filename: Optional[str] = None,
        media_type: str = "application/octet-stream",
        title: Optional[str] = None,
        tags: Sequence[str] = (),
        metadata: Optional[Dict[str, Any]] = None,
        source_uri: str = "unknown",
    ) -> IngestResult:
        """
        Ingest a single raw item into the corpus.

        This is the modality-neutral primitive: callers provide bytes + a media type.
        Higher-level conveniences (ingest_note, ingest_source, and related methods) build on top.

        :param data: Raw item bytes.
        :type data: bytes
        :param filename: Optional filename for the stored item.
        :type filename: str or None
        :param media_type: Internet Assigned Numbers Authority media type for the item.
        :type media_type: str
        :param title: Optional title metadata.
        :type title: str or None
        :param tags: Tags to associate with the item.
        :type tags: Sequence[str]
        :param metadata: Optional metadata mapping.
        :type metadata: dict[str, Any] or None
        :param source_uri: Source uniform resource identifier for provenance.
        :type source_uri: str
        :return: Ingestion result summary.
        :rtype: IngestResult
        :raises ValueError: If markdown is not Unicode Transformation Format 8.
        """

        item_id = str(uuid.uuid4())
        safe_filename = _sanitize_filename(filename) if filename else ""

        if safe_filename:
            safe_filename = _ensure_filename_extension(safe_filename, media_type=media_type)

        # The stored filename always begins with the item UUID so reindex()
        # can recover the identifier from the filename alone.
        if media_type == "text/markdown":
            output_name = f"{item_id}--{safe_filename}" if safe_filename else f"{item_id}.md"
        else:
            if safe_filename:
                output_name = f"{item_id}--{safe_filename}"
            else:
                extension = _preferred_extension_for_media_type(media_type) or ""
                output_name = f"{item_id}{extension}" if extension else f"{item_id}"

        relpath = str(Path(DEFAULT_RAW_DIR) / output_name)
        output_path = self.root / relpath

        resolved_title = title.strip() if isinstance(title, str) and title.strip() else None
        resolved_tags = list(tags)
        metadata_input: Dict[str, Any] = dict(metadata or {})
        if resolved_title and "title" not in metadata_input:
            metadata_input["title"] = resolved_title
        if resolved_tags and "tags" not in metadata_input:
            metadata_input["tags"] = list(resolved_tags)

        frontmatter: Dict[str, Any] = {}

        if media_type == "text/markdown":
            # Markdown metadata is embedded in the document's front matter.
            try:
                markdown_text = data.decode("utf-8")
            except UnicodeDecodeError as decode_error:
                raise ValueError("Markdown must be Unicode Transformation Format 8") from decode_error

            parsed_document = parse_front_matter(markdown_text)
            frontmatter = dict(parsed_document.metadata)

            merged_tags = _merge_tags(resolved_tags, frontmatter.get("tags"))
            if merged_tags:
                frontmatter["tags"] = merged_tags
                resolved_tags = merged_tags

            # An explicit title only fills in when the front matter lacks one;
            # front matter then wins as the resolved title below.
            if resolved_title and not (
                isinstance(frontmatter.get("title"), str) and frontmatter.get("title").strip()
            ):
                frontmatter["title"] = resolved_title

            title_value = frontmatter.get("title")
            if isinstance(title_value, str) and title_value.strip():
                resolved_title = title_value.strip()

            frontmatter = _ensure_biblicus_block(frontmatter, item_id=item_id, source_uri=source_uri)
            # Re-render so the stored document embeds the updated front matter.
            rendered_document = render_front_matter(frontmatter, parsed_document.body)
            data_to_write = rendered_document.encode("utf-8")
        else:
            data_to_write = data

        # The digest covers what is actually written (post front matter
        # rewrite), not the caller-supplied bytes.
        sha256_digest = _sha256_bytes(data_to_write)
        output_path.write_bytes(data_to_write)

        if media_type != "text/markdown":
            # Non-markdown items carry their metadata in a YAML sidecar file.
            sidecar: Dict[str, Any] = {}
            sidecar["media_type"] = media_type
            if resolved_tags:
                sidecar["tags"] = resolved_tags
            if metadata_input:
                for metadata_key, metadata_value in metadata_input.items():
                    if metadata_key in {"tags", "biblicus"}:
                        continue
                    sidecar[metadata_key] = metadata_value
            sidecar["biblicus"] = {"id": item_id, "source": source_uri}
            _write_sidecar(output_path, sidecar)
            frontmatter = sidecar

        created_at = utc_now_iso()
        item_record = CatalogItem(
            id=item_id,
            relpath=relpath,
            sha256=sha256_digest,
            bytes=len(data_to_write),
            media_type=media_type,
            title=resolved_title,
            tags=list(resolved_tags),
            metadata=dict(frontmatter or {}),
            created_at=created_at,
            source_uri=source_uri,
        )
        self._upsert_catalog_item(item_record)

        return IngestResult(item_id=item_id, relpath=relpath, sha256=sha256_digest)
675
+
676
+ def ingest_note(
677
+ self,
678
+ text: str,
679
+ *,
680
+ title: Optional[str] = None,
681
+ tags: Sequence[str] = (),
682
+ source_uri: str = "text",
683
+ ) -> IngestResult:
684
+ """
685
+ Ingest a text note as Markdown.
686
+
687
+ :param text: Note content.
688
+ :type text: str
689
+ :param title: Optional title metadata.
690
+ :type title: str or None
691
+ :param tags: Tags to associate with the note.
692
+ :type tags: Sequence[str]
693
+ :param source_uri: Source uniform resource identifier for provenance.
694
+ :type source_uri: str
695
+ :return: Ingestion result summary.
696
+ :rtype: IngestResult
697
+ """
698
+
699
+ data = text.encode("utf-8")
700
+ return self.ingest_item(
701
+ data,
702
+ filename=None,
703
+ media_type="text/markdown",
704
+ title=title,
705
+ tags=tags,
706
+ metadata=None,
707
+ source_uri=source_uri,
708
+ )
709
+
710
+ def ingest_source(
711
+ self,
712
+ source: str | Path,
713
+ *,
714
+ tags: Sequence[str] = (),
715
+ source_uri: Optional[str] = None,
716
+ ) -> IngestResult:
717
+ """
718
+ Ingest a file path or uniform resource locator source.
719
+
720
+ :param source: File path or uniform resource locator.
721
+ :type source: str or Path
722
+ :param tags: Tags to associate with the item.
723
+ :type tags: Sequence[str]
724
+ :param source_uri: Optional override for the source uniform resource identifier.
725
+ :type source_uri: str or None
726
+ :return: Ingestion result summary.
727
+ :rtype: IngestResult
728
+ """
729
+
730
+ payload = load_source(source, source_uri=source_uri)
731
+ return self.ingest_item(
732
+ payload.data,
733
+ filename=payload.filename,
734
+ media_type=payload.media_type,
735
+ title=None,
736
+ tags=tags,
737
+ metadata=None,
738
+ source_uri=payload.source_uri,
739
+ )
740
+
741
+ def list_items(self, *, limit: int = 50) -> List[CatalogItem]:
742
+ """
743
+ List items from the catalog.
744
+
745
+ :param limit: Maximum number of items to return.
746
+ :type limit: int
747
+ :return: Catalog items ordered by recency.
748
+ :rtype: list[CatalogItem]
749
+ """
750
+
751
+ catalog = self._load_catalog()
752
+ ordered_ids = (
753
+ catalog.order[:limit] if catalog.order else list(catalog.items.keys())[:limit]
754
+ )
755
+ collected_items: List[CatalogItem] = []
756
+ for item_id in ordered_ids:
757
+ item = catalog.items.get(item_id)
758
+ if item is not None:
759
+ collected_items.append(item)
760
+ return collected_items
761
+
762
+ def get_item(self, item_id: str) -> CatalogItem:
763
+ """
764
+ Fetch a catalog item by identifier.
765
+
766
+ :param item_id: Item identifier.
767
+ :type item_id: str
768
+ :return: Catalog item.
769
+ :rtype: CatalogItem
770
+ :raises KeyError: If the item identifier is unknown.
771
+ """
772
+
773
+ catalog = self._load_catalog()
774
+ item = catalog.items.get(item_id)
775
+ if item is None:
776
+ raise KeyError(f"Unknown item identifier: {item_id}")
777
+ return item
778
+
779
    def reindex(self) -> Dict[str, int]:
        """
        Rebuild/refresh the corpus catalog from the current on-disk corpus contents.

        This is the core "mutable corpus with re-indexing" loop: edit raw files or sidecars,
        then reindex to refresh the derived catalog.

        :return: Reindex statistics.
        :rtype: dict[str, int]
        :raises ValueError: If a markdown file cannot be decoded as Unicode Transformation Format 8.
        """

        self._init_catalog()
        existing_catalog = self._load_catalog()
        stats = {"scanned": 0, "skipped": 0, "inserted": 0, "updated": 0}

        # Sidecar files are metadata, not content; exclude them from the scan.
        content_files = [
            content_path
            for content_path in self.raw_dir.rglob("*")
            if content_path.is_file() and not content_path.name.endswith(SIDECAR_SUFFIX)
        ]

        new_items: Dict[str, CatalogItem] = {}

        for content_path in content_files:
            stats["scanned"] += 1
            relpath = str(content_path.relative_to(self.root))
            data = content_path.read_bytes()
            sha256 = _sha256_bytes(data)

            media_type, _ = mimetypes.guess_type(content_path.name)
            media_type = media_type or "application/octet-stream"

            sidecar = _load_sidecar(content_path)

            frontmatter: Dict[str, Any] = {}
            if content_path.suffix.lower() in {".md", ".markdown"}:
                try:
                    text = data.decode("utf-8")
                except UnicodeDecodeError as decode_error:
                    raise ValueError(
                        f"Markdown file must be Unicode Transformation Format 8: {relpath}"
                    ) from decode_error
                parsed_document = parse_front_matter(text)
                frontmatter = parsed_document.metadata
                media_type = "text/markdown"

            merged_metadata = _merge_metadata(frontmatter, sidecar)

            # Sidecar metadata may override the guessed media type, but never
            # for markdown.
            if media_type != "text/markdown":
                media_type_override = merged_metadata.get("media_type")
                if isinstance(media_type_override, str) and media_type_override.strip():
                    media_type = media_type_override.strip()

            # Item identity: prefer the biblicus metadata block, then fall
            # back to the UUID prefix baked into the stored filename.
            item_id: Optional[str] = None
            biblicus_block = merged_metadata.get("biblicus")
            if isinstance(biblicus_block, dict):
                biblicus_id = biblicus_block.get("id")
                if isinstance(biblicus_id, str):
                    try:
                        item_id = str(uuid.UUID(biblicus_id))
                    except ValueError:
                        item_id = None

            if item_id is None:
                item_id = _parse_uuid_prefix(content_path.name)

            if item_id is None:
                # Files without a recoverable identifier are left out of the catalog.
                stats["skipped"] += 1
                continue

            title: Optional[str] = None
            title_value = merged_metadata.get("title")
            if isinstance(title_value, str) and title_value.strip():
                title = title_value.strip()

            resolved_tags = _merge_tags([], merged_metadata.get("tags"))

            source_uri: Optional[str] = None
            if isinstance(biblicus_block, dict):
                source_value = biblicus_block.get("source")
                if isinstance(source_value, str) and source_value.strip():
                    source_uri = source_value.strip()

            # Preserve creation time and provenance from a previous catalog
            # entry when the item was already known.
            previous_item = existing_catalog.items.get(item_id)
            created_at = previous_item.created_at if previous_item is not None else utc_now_iso()
            source_uri = source_uri or (previous_item.source_uri if previous_item is not None else None)

            if previous_item is None:
                stats["inserted"] += 1
            else:
                stats["updated"] += 1

            new_items[item_id] = CatalogItem(
                id=item_id,
                relpath=relpath,
                sha256=sha256,
                bytes=len(data),
                media_type=media_type,
                title=title,
                tags=list(resolved_tags),
                metadata=dict(merged_metadata or {}),
                created_at=created_at,
                source_uri=source_uri,
            )

        # Newest first; the identifier breaks timestamp ties deterministically.
        order = sorted(
            new_items.keys(),
            key=lambda item_id: (new_items[item_id].created_at, item_id),
            reverse=True,
        )

        catalog = CorpusCatalog(
            schema_version=SCHEMA_VERSION,
            generated_at=utc_now_iso(),
            corpus_uri=normalize_corpus_uri(self.root),
            raw_dir=DEFAULT_RAW_DIR,
            latest_run_id=None,
            items=new_items,
            order=order,
        )
        self._write_catalog(catalog)

        return stats
903
+
904
+ @property
905
+ def name(self) -> str:
906
+ """
907
+ Return the corpus name (directory basename).
908
+
909
+ :return: Corpus name.
910
+ :rtype: str
911
+ """
912
+
913
+ return self.root.name
914
+
915
+ def purge(self, *, confirm: str) -> None:
916
+ """
917
+ Delete all ingested items and derived files, preserving corpus identity/config.
918
+
919
+ :param confirm: Confirmation string matching the corpus name.
920
+ :type confirm: str
921
+ :return: None.
922
+ :rtype: None
923
+ :raises ValueError: If the confirmation does not match.
924
+ """
925
+
926
+ expected = self.name
927
+ if confirm != expected:
928
+ raise ValueError(f"Confirmation mismatch: pass --confirm {expected!r} to purge this corpus")
929
+
930
+ if self.raw_dir.exists():
931
+ shutil.rmtree(self.raw_dir)
932
+ self.raw_dir.mkdir(parents=True, exist_ok=True)
933
+
934
+ for path in self.meta_dir.iterdir():
935
+ if path.name == "config.json":
936
+ continue
937
+ if path.is_dir():
938
+ shutil.rmtree(path)
939
+ else:
940
+ path.unlink()
941
+ self._init_catalog()
942
+ self._write_catalog(
943
+ CorpusCatalog(
944
+ schema_version=SCHEMA_VERSION,
945
+ generated_at=utc_now_iso(),
946
+ corpus_uri=normalize_corpus_uri(self.root),
947
+ raw_dir=DEFAULT_RAW_DIR,
948
+ latest_run_id=None,
949
+ items={},
950
+ order=[],
951
+ )
952
+ )