biblicus 0.1.1__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. biblicus/__init__.py +2 -2
  2. biblicus/_vendor/dotyaml/__init__.py +14 -0
  3. biblicus/_vendor/dotyaml/interpolation.py +63 -0
  4. biblicus/_vendor/dotyaml/loader.py +181 -0
  5. biblicus/_vendor/dotyaml/transformer.py +135 -0
  6. biblicus/backends/__init__.py +0 -2
  7. biblicus/backends/base.py +3 -3
  8. biblicus/backends/scan.py +96 -13
  9. biblicus/backends/sqlite_full_text_search.py +74 -14
  10. biblicus/cli.py +126 -19
  11. biblicus/constants.py +2 -0
  12. biblicus/corpus.py +455 -45
  13. biblicus/errors.py +15 -0
  14. biblicus/evaluation.py +4 -8
  15. biblicus/extraction.py +529 -0
  16. biblicus/extractors/__init__.py +44 -0
  17. biblicus/extractors/base.py +68 -0
  18. biblicus/extractors/metadata_text.py +106 -0
  19. biblicus/extractors/openai_stt.py +180 -0
  20. biblicus/extractors/pass_through_text.py +84 -0
  21. biblicus/extractors/pdf_text.py +100 -0
  22. biblicus/extractors/pipeline.py +105 -0
  23. biblicus/extractors/rapidocr_text.py +129 -0
  24. biblicus/extractors/select_longest_text.py +105 -0
  25. biblicus/extractors/select_text.py +100 -0
  26. biblicus/extractors/unstructured_text.py +100 -0
  27. biblicus/frontmatter.py +0 -3
  28. biblicus/hook_logging.py +180 -0
  29. biblicus/hook_manager.py +203 -0
  30. biblicus/hooks.py +261 -0
  31. biblicus/ignore.py +64 -0
  32. biblicus/models.py +107 -0
  33. biblicus/retrieval.py +0 -4
  34. biblicus/sources.py +85 -5
  35. biblicus/time.py +0 -1
  36. biblicus/uris.py +3 -4
  37. biblicus/user_config.py +138 -0
  38. biblicus-0.3.0.dist-info/METADATA +336 -0
  39. biblicus-0.3.0.dist-info/RECORD +44 -0
  40. biblicus-0.1.1.dist-info/METADATA +0 -174
  41. biblicus-0.1.1.dist-info/RECORD +0 -22
  42. {biblicus-0.1.1.dist-info → biblicus-0.3.0.dist-info}/WHEEL +0 -0
  43. {biblicus-0.1.1.dist-info → biblicus-0.3.0.dist-info}/entry_points.txt +0 -0
  44. {biblicus-0.1.1.dist-info → biblicus-0.3.0.dist-info}/licenses/LICENSE +0 -0
  45. {biblicus-0.1.1.dist-info → biblicus-0.3.0.dist-info}/top_level.txt +0 -0
biblicus/corpus.py CHANGED
@@ -13,13 +13,24 @@ from pathlib import Path
13
13
  from typing import Any, Dict, List, Optional, Sequence
14
14
 
15
15
  import yaml
16
-
17
- from .constants import CORPUS_DIR_NAME, DEFAULT_RAW_DIR, RUNS_DIR_NAME, SCHEMA_VERSION, SIDECAR_SUFFIX
16
+ from pydantic import ValidationError
17
+
18
+ from .constants import (
19
+ CORPUS_DIR_NAME,
20
+ DEFAULT_RAW_DIR,
21
+ EXTRACTION_RUNS_DIR_NAME,
22
+ RUNS_DIR_NAME,
23
+ SCHEMA_VERSION,
24
+ SIDECAR_SUFFIX,
25
+ )
18
26
  from .frontmatter import parse_front_matter, render_front_matter
27
+ from .hook_manager import HookManager
28
+ from .hooks import HookPoint
29
+ from .ignore import load_corpus_ignore_spec
19
30
  from .models import CatalogItem, CorpusCatalog, CorpusConfig, IngestResult, RetrievalRun
20
31
  from .sources import load_source
21
32
  from .time import utc_now_iso
22
- from .uris import normalize_corpus_uri, corpus_ref_to_path
33
+ from .uris import corpus_ref_to_path, normalize_corpus_uri
23
34
 
24
35
 
25
36
  def _sha256_bytes(data: bytes) -> str:
@@ -31,10 +42,38 @@ def _sha256_bytes(data: bytes) -> str:
31
42
  :return: Secure Hash Algorithm 256 hex digest.
32
43
  :rtype: str
33
44
  """
34
-
35
45
  return hashlib.sha256(data).hexdigest()
36
46
 
37
47
 
48
+ def _write_stream_and_hash(
49
+ stream, destination_path: Path, *, chunk_size: int = 1024 * 1024
50
+ ) -> Dict[str, object]:
51
+ """
52
+ Write a binary stream to disk while computing a digest.
53
+
54
+ :param stream: Binary stream to read from.
55
+ :type stream: object
56
+ :param destination_path: Destination path to write to.
57
+ :type destination_path: Path
58
+ :param chunk_size: Chunk size for reads.
59
+ :type chunk_size: int
60
+ :return: Mapping containing sha256 and bytes_written.
61
+ :rtype: dict[str, object]
62
+ :raises OSError: If the destination cannot be written.
63
+ """
64
+ hasher = hashlib.sha256()
65
+ bytes_written = 0
66
+ with destination_path.open("wb") as destination_handle:
67
+ while True:
68
+ chunk = stream.read(chunk_size)
69
+ if not chunk:
70
+ break
71
+ hasher.update(chunk)
72
+ destination_handle.write(chunk)
73
+ bytes_written += len(chunk)
74
+ return {"sha256": hasher.hexdigest(), "bytes_written": bytes_written}
75
+
76
+
38
77
  def _sanitize_filename(name: str) -> str:
39
78
  """
40
79
  Sanitize a filename into a portable, filesystem-friendly form.
@@ -44,7 +83,6 @@ def _sanitize_filename(name: str) -> str:
44
83
  :return: Sanitized filename.
45
84
  :rtype: str
46
85
  """
47
-
48
86
  allowed_characters = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-._() ")
49
87
  sanitized_name = "".join(
50
88
  (character if character in allowed_characters else "_") for character in name
@@ -61,9 +99,9 @@ def _preferred_extension_for_media_type(media_type: str) -> Optional[str]:
61
99
  :return: Preferred extension or None.
62
100
  :rtype: str or None
63
101
  """
64
-
65
102
  media_type_overrides = {
66
103
  "image/jpeg": ".jpg",
104
+ "audio/ogg": ".ogg",
67
105
  }
68
106
  if media_type in media_type_overrides:
69
107
  return media_type_overrides[media_type]
@@ -81,7 +119,6 @@ def _ensure_filename_extension(filename: str, *, media_type: str) -> str:
81
119
  :return: Filename with a compatible extension.
82
120
  :rtype: str
83
121
  """
84
-
85
122
  raw_name = filename.strip()
86
123
 
87
124
  if media_type == "text/markdown":
@@ -89,11 +126,12 @@ def _ensure_filename_extension(filename: str, *, media_type: str) -> str:
89
126
  return raw_name
90
127
  return raw_name + ".md"
91
128
 
129
+ if Path(raw_name).suffix:
130
+ return raw_name
131
+
92
132
  ext = _preferred_extension_for_media_type(media_type)
93
133
  if not ext:
94
134
  return raw_name
95
- if raw_name.lower().endswith(ext.lower()):
96
- return raw_name
97
135
  return raw_name + ext
98
136
 
99
137
 
@@ -108,7 +146,6 @@ def _merge_tags(explicit: Sequence[str], from_frontmatter: Any) -> List[str]:
108
146
  :return: Deduplicated tag list preserving order.
109
147
  :rtype: list[str]
110
148
  """
111
-
112
149
  merged_tags: List[str] = []
113
150
 
114
151
  for explicit_tag in explicit:
@@ -141,7 +178,6 @@ def _sidecar_path_for(content_path: Path) -> Path:
141
178
  :return: Sidecar path.
142
179
  :rtype: Path
143
180
  """
144
-
145
181
  return content_path.with_name(content_path.name + SIDECAR_SUFFIX)
146
182
 
147
183
 
@@ -155,7 +191,6 @@ def _load_sidecar(content_path: Path) -> Dict[str, Any]:
155
191
  :rtype: dict[str, Any]
156
192
  :raises ValueError: If the sidecar content is not a mapping.
157
193
  """
158
-
159
194
  path = _sidecar_path_for(content_path)
160
195
  if not path.is_file():
161
196
  return {}
@@ -186,7 +221,9 @@ def _write_sidecar(content_path: Path, metadata: Dict[str, Any]) -> None:
186
221
  path.write_text(text + "\n", encoding="utf-8")
187
222
 
188
223
 
189
- def _ensure_biblicus_block(metadata: Dict[str, Any], *, item_id: str, source_uri: str) -> Dict[str, Any]:
224
+ def _ensure_biblicus_block(
225
+ metadata: Dict[str, Any], *, item_id: str, source_uri: str
226
+ ) -> Dict[str, Any]:
190
227
  """
191
228
  Ensure the biblicus metadata block exists and is populated.
192
229
 
@@ -284,11 +321,11 @@ class Corpus:
284
321
  :param root: Corpus root directory.
285
322
  :type root: Path
286
323
  """
287
-
288
324
  self.root = root
289
325
  self.meta_dir = self.root / CORPUS_DIR_NAME
290
326
  self.raw_dir = self.root / DEFAULT_RAW_DIR
291
327
  self.config = self._load_config()
328
+ self._hooks = self._load_hooks()
292
329
 
293
330
  @property
294
331
  def uri(self) -> str:
@@ -298,7 +335,6 @@ class Corpus:
298
335
  :return: Corpus uniform resource identifier.
299
336
  :rtype: str
300
337
  """
301
-
302
338
  return self.root.as_uri()
303
339
 
304
340
  def _load_config(self) -> Optional[CorpusConfig]:
@@ -309,12 +345,38 @@ class Corpus:
309
345
  :rtype: CorpusConfig or None
310
346
  :raises ValueError: If the config schema is invalid.
311
347
  """
312
-
313
348
  path = self.meta_dir / "config.json"
314
349
  if not path.is_file():
315
350
  return None
316
351
  data = json.loads(path.read_text(encoding="utf-8"))
317
- return CorpusConfig.model_validate(data)
352
+ try:
353
+ return CorpusConfig.model_validate(data)
354
+ except ValidationError as exc:
355
+ has_hook_error = any(
356
+ isinstance(error.get("loc"), tuple)
357
+ and error.get("loc")
358
+ and error.get("loc")[0] == "hooks"
359
+ for error in exc.errors()
360
+ )
361
+ if has_hook_error:
362
+ raise ValueError(f"Invalid hook specification: {exc}") from exc
363
+ raise ValueError(f"Invalid corpus config: {exc}") from exc
364
+
365
def _load_hooks(self) -> Optional[HookManager]:
    """
    Build the hook manager described by the corpus config.

    :return: Hook manager built from the configured hook specifications, or None when the
        corpus has no config or the config lists no hooks.
    :rtype: HookManager or None
    :raises ValueError: If hook specifications are invalid.
    """
    config = self.config
    hook_specs = config.hooks if config is not None else None
    if not hook_specs:
        return None
    return HookManager.from_config(
        corpus_root=self.root,
        corpus_uri=self.uri,
        hook_specs=hook_specs,
    )
318
380
 
319
381
  @classmethod
320
382
  def find(cls, start: Path) -> "Corpus":
@@ -327,7 +389,6 @@ class Corpus:
327
389
  :rtype: Corpus
328
390
  :raises FileNotFoundError: If no corpus config is found.
329
391
  """
330
-
331
392
  start = start.resolve()
332
393
  for candidate in [start, *start.parents]:
333
394
  if (candidate / CORPUS_DIR_NAME / "config.json").is_file():
@@ -346,7 +407,6 @@ class Corpus:
346
407
  :return: Opened corpus instance.
347
408
  :rtype: Corpus
348
409
  """
349
-
350
410
  return cls.find(corpus_ref_to_path(ref))
351
411
 
352
412
  @classmethod
@@ -362,7 +422,6 @@ class Corpus:
362
422
  :rtype: Corpus
363
423
  :raises FileExistsError: If the corpus already exists and force is False.
364
424
  """
365
-
366
425
  root = root.resolve()
367
426
  corpus = cls(root)
368
427
 
@@ -392,7 +451,6 @@ class Corpus:
392
451
  :return: Catalog file path.
393
452
  :rtype: Path
394
453
  """
395
-
396
454
  return self.meta_dir / "catalog.json"
397
455
 
398
456
  def _init_catalog(self) -> None:
@@ -402,7 +460,6 @@ class Corpus:
402
460
  :return: None.
403
461
  :rtype: None
404
462
  """
405
-
406
463
  if self.catalog_path.exists():
407
464
  return
408
465
  catalog = CorpusCatalog(
@@ -425,7 +482,6 @@ class Corpus:
425
482
  :raises FileNotFoundError: If the catalog file does not exist.
426
483
  :raises ValueError: If the catalog schema is invalid.
427
484
  """
428
-
429
485
  if not self.catalog_path.is_file():
430
486
  raise FileNotFoundError(f"Missing corpus catalog: {self.catalog_path}")
431
487
  catalog_data = json.loads(self.catalog_path.read_text(encoding="utf-8"))
@@ -440,7 +496,6 @@ class Corpus:
440
496
  :raises FileNotFoundError: If the catalog file does not exist.
441
497
  :raises ValueError: If the catalog schema is invalid.
442
498
  """
443
-
444
499
  return self._load_catalog()
445
500
 
446
501
  def _write_catalog(self, catalog: CorpusCatalog) -> None:
@@ -452,7 +507,6 @@ class Corpus:
452
507
  :return: None.
453
508
  :rtype: None
454
509
  """
455
-
456
510
  temp_path = self.catalog_path.with_suffix(".json.tmp")
457
511
  temp_path.write_text(catalog.model_dump_json(indent=2) + "\n", encoding="utf-8")
458
512
  temp_path.replace(self.catalog_path)
@@ -465,9 +519,54 @@ class Corpus:
465
519
  :return: Path to the runs directory.
466
520
  :rtype: Path
467
521
  """
468
-
469
522
  return self.meta_dir / RUNS_DIR_NAME
470
523
 
524
@property
def extraction_runs_dir(self) -> Path:
    """
    Directory that holds extraction run artifacts.

    The directory is a child of ``runs_dir`` named by ``EXTRACTION_RUNS_DIR_NAME``.

    :return: Path to the extraction runs directory.
    :rtype: Path
    """
    return self.runs_dir.joinpath(EXTRACTION_RUNS_DIR_NAME)
533
+
534
def extraction_run_dir(self, *, extractor_id: str, run_id: str) -> Path:
    """
    Compute the directory of one extraction run.

    The result is ``extraction_runs_dir / extractor_id / run_id``; nothing is created on disk.

    :param extractor_id: Extractor plugin identifier.
    :type extractor_id: str
    :param run_id: Extraction run identifier.
    :type run_id: str
    :return: Extraction run directory.
    :rtype: Path
    """
    extractor_root = self.extraction_runs_dir.joinpath(extractor_id)
    return extractor_root.joinpath(run_id)
546
+
547
def read_extracted_text(self, *, extractor_id: str, run_id: str, item_id: str) -> Optional[str]:
    """
    Return the text an extraction run produced for an item, if it exists.

    The artifact is looked up at ``<extraction run dir>/text/<item_id>.txt`` and decoded as
    UTF-8.

    :param extractor_id: Extractor plugin identifier.
    :type extractor_id: str
    :param run_id: Extraction run identifier.
    :type run_id: str
    :param item_id: Item identifier.
    :type item_id: str
    :return: Extracted text or None if the artifact does not exist.
    :rtype: str or None
    :raises OSError: If the file exists but cannot be read.
    """
    run_dir = self.extraction_run_dir(extractor_id=extractor_id, run_id=run_id)
    text_path = run_dir.joinpath("text", f"{item_id}.txt")
    if text_path.is_file():
        return text_path.read_text(encoding="utf-8")
    # A missing artifact (or a non-file at that path) is reported as "no text".
    return None
569
+
471
570
  def _ensure_runs_dir(self) -> None:
472
571
  """
473
572
  Ensure the retrieval runs directory exists.
@@ -475,7 +574,6 @@ class Corpus:
475
574
  :return: None.
476
575
  :rtype: None
477
576
  """
478
-
479
577
  self.runs_dir.mkdir(parents=True, exist_ok=True)
480
578
 
481
579
  def write_run(self, run: RetrievalRun) -> None:
@@ -487,7 +585,6 @@ class Corpus:
487
585
  :return: None.
488
586
  :rtype: None
489
587
  """
490
-
491
588
  self._ensure_runs_dir()
492
589
  path = self.runs_dir / f"{run.run_id}.json"
493
590
  path.write_text(run.model_dump_json(indent=2) + "\n", encoding="utf-8")
@@ -506,7 +603,6 @@ class Corpus:
506
603
  :rtype: RetrievalRun
507
604
  :raises FileNotFoundError: If the run manifest does not exist.
508
605
  """
509
-
510
606
  path = self.runs_dir / f"{run_id}.json"
511
607
  if not path.is_file():
512
608
  raise FileNotFoundError(f"Missing run manifest: {path}")
@@ -521,7 +617,6 @@ class Corpus:
521
617
  :return: Latest run identifier or None.
522
618
  :rtype: str or None
523
619
  """
524
-
525
620
  return self._load_catalog().latest_run_id
526
621
 
527
622
  def _upsert_catalog_item(self, item: CatalogItem) -> None:
@@ -533,7 +628,6 @@ class Corpus:
533
628
  :return: None.
534
629
  :rtype: None
535
630
  """
536
-
537
631
  self._init_catalog()
538
632
  catalog = self._load_catalog()
539
633
  catalog.items[item.id] = item
@@ -581,7 +675,6 @@ class Corpus:
581
675
  :rtype: IngestResult
582
676
  :raises ValueError: If markdown is not Unicode Transformation Format 8.
583
677
  """
584
-
585
678
  item_id = str(uuid.uuid4())
586
679
  safe_filename = _sanitize_filename(filename) if filename else ""
587
680
 
@@ -608,13 +701,30 @@ class Corpus:
608
701
  if resolved_tags and "tags" not in metadata_input:
609
702
  metadata_input["tags"] = list(resolved_tags)
610
703
 
704
+ if self._hooks is not None:
705
+ mutation = self._hooks.run_ingest_hooks(
706
+ hook_point=HookPoint.before_ingest,
707
+ filename=filename,
708
+ media_type=media_type,
709
+ title=resolved_title,
710
+ tags=list(resolved_tags),
711
+ metadata=dict(metadata_input),
712
+ source_uri=source_uri,
713
+ )
714
+ if mutation.add_tags:
715
+ for tag in mutation.add_tags:
716
+ if tag not in resolved_tags:
717
+ resolved_tags.append(tag)
718
+
611
719
  frontmatter: Dict[str, Any] = {}
612
720
 
613
721
  if media_type == "text/markdown":
614
722
  try:
615
723
  markdown_text = data.decode("utf-8")
616
724
  except UnicodeDecodeError as decode_error:
617
- raise ValueError("Markdown must be Unicode Transformation Format 8") from decode_error
725
+ raise ValueError(
726
+ "Markdown must be Unicode Transformation Format 8"
727
+ ) from decode_error
618
728
 
619
729
  parsed_document = parse_front_matter(markdown_text)
620
730
  frontmatter = dict(parsed_document.metadata)
@@ -633,7 +743,9 @@ class Corpus:
633
743
  if isinstance(title_value, str) and title_value.strip():
634
744
  resolved_title = title_value.strip()
635
745
 
636
- frontmatter = _ensure_biblicus_block(frontmatter, item_id=item_id, source_uri=source_uri)
746
+ frontmatter = _ensure_biblicus_block(
747
+ frontmatter, item_id=item_id, source_uri=source_uri
748
+ )
637
749
  rendered_document = render_front_matter(frontmatter, parsed_document.body)
638
750
  data_to_write = rendered_document.encode("utf-8")
639
751
  else:
@@ -656,6 +768,34 @@ class Corpus:
656
768
  _write_sidecar(output_path, sidecar)
657
769
  frontmatter = sidecar
658
770
 
771
+ if self._hooks is not None:
772
+ mutation = self._hooks.run_ingest_hooks(
773
+ hook_point=HookPoint.after_ingest,
774
+ filename=filename,
775
+ media_type=media_type,
776
+ title=resolved_title,
777
+ tags=list(resolved_tags),
778
+ metadata=dict(metadata_input),
779
+ source_uri=source_uri,
780
+ item_id=item_id,
781
+ relpath=relpath,
782
+ )
783
+ if mutation.add_tags:
784
+ updated_tags = list(resolved_tags)
785
+ for tag in mutation.add_tags:
786
+ if tag not in updated_tags:
787
+ updated_tags.append(tag)
788
+ resolved_tags = updated_tags
789
+ sidecar_metadata = _load_sidecar(output_path)
790
+ sidecar_metadata["tags"] = resolved_tags
791
+ if media_type != "text/markdown":
792
+ sidecar_metadata["media_type"] = media_type
793
+ sidecar_metadata["biblicus"] = {"id": item_id, "source": source_uri}
794
+ _write_sidecar(output_path, sidecar_metadata)
795
+ frontmatter = _merge_metadata(
796
+ frontmatter if isinstance(frontmatter, dict) else {}, sidecar_metadata
797
+ )
798
+
659
799
  created_at = utc_now_iso()
660
800
  item_record = CatalogItem(
661
801
  id=item_id,
@@ -673,6 +813,129 @@ class Corpus:
673
813
 
674
814
  return IngestResult(item_id=item_id, relpath=relpath, sha256=sha256_digest)
675
815
 
816
def ingest_item_stream(
    self,
    stream,
    *,
    filename: Optional[str] = None,
    media_type: str = "application/octet-stream",
    tags: Sequence[str] = (),
    metadata: Optional[Dict[str, Any]] = None,
    source_uri: str = "unknown",
) -> IngestResult:
    """
    Ingest a non-markdown binary item by streaming it into the raw directory.

    The payload is handed to ``_write_stream_and_hash``, which writes it to disk and reports
    the digest and byte count. A sidecar is written next to the stored file with the media
    type, tags, caller metadata (except the ``tags`` and ``biblicus`` keys), and the biblicus
    provenance block. When hooks are configured, the before-ingest and after-ingest hooks run
    and any tags they add are merged in without duplicates; tags added after ingest cause the
    sidecar to be rewritten. Finally the item is upserted into the catalog.

    :param stream: Readable binary stream.
    :type stream: object
    :param filename: Optional filename for the stored item.
    :type filename: str or None
    :param media_type: Internet Assigned Numbers Authority media type for the item.
    :type media_type: str
    :param tags: Tags to associate with the item.
    :type tags: Sequence[str]
    :param metadata: Optional metadata mapping.
    :type metadata: dict[str, Any] or None
    :param source_uri: Source uniform resource identifier for provenance.
    :type source_uri: str
    :return: Ingestion result summary.
    :rtype: IngestResult
    :raises ValueError: If the media_type is text/markdown.
    """
    if media_type == "text/markdown":
        raise ValueError("Stream ingestion is not supported for Markdown")

    def with_hook_tags(current, additions):
        """Return a copy of ``current`` extended by the not-yet-present ``additions``."""
        combined = list(current)
        for extra_tag in additions:
            if extra_tag not in combined:
                combined.append(extra_tag)
        return combined

    item_id = str(uuid.uuid4())

    # Stored name: "<id>--<sanitized filename>" when a usable filename exists, otherwise the
    # bare id plus the preferred extension for the media type (if there is one).
    stored_name = _sanitize_filename(filename) if filename else ""
    if stored_name:
        stored_name = _ensure_filename_extension(stored_name, media_type=media_type)
    if stored_name:
        output_name = f"{item_id}--{stored_name}"
    else:
        output_name = item_id + (_preferred_extension_for_media_type(media_type) or "")

    relpath = str(Path(DEFAULT_RAW_DIR) / output_name)
    output_path = self.root / relpath

    resolved_tags = list(tags)
    metadata_input: Dict[str, Any] = dict(metadata or {})
    if resolved_tags:
        # Hooks see the explicit tags through the metadata unless the caller supplied its own.
        metadata_input.setdefault("tags", list(resolved_tags))

    if self._hooks is not None:
        before_mutation = self._hooks.run_ingest_hooks(
            hook_point=HookPoint.before_ingest,
            filename=filename,
            media_type=media_type,
            title=None,
            tags=list(resolved_tags),
            metadata=dict(metadata_input),
            source_uri=source_uri,
        )
        if before_mutation.add_tags:
            resolved_tags = with_hook_tags(resolved_tags, before_mutation.add_tags)

    write_result = _write_stream_and_hash(stream, output_path)
    sha256_digest = str(write_result["sha256"])
    bytes_written = int(write_result["bytes_written"])

    reserved_keys = {"tags", "biblicus"}
    sidecar: Dict[str, Any] = {"media_type": media_type}
    if resolved_tags:
        sidecar["tags"] = resolved_tags
    sidecar.update(
        (metadata_key, metadata_value)
        for metadata_key, metadata_value in metadata_input.items()
        if metadata_key not in reserved_keys
    )
    sidecar["biblicus"] = {"id": item_id, "source": source_uri}
    _write_sidecar(output_path, sidecar)

    if self._hooks is not None:
        after_mutation = self._hooks.run_ingest_hooks(
            hook_point=HookPoint.after_ingest,
            filename=filename,
            media_type=media_type,
            title=None,
            tags=list(resolved_tags),
            metadata=dict(metadata_input),
            source_uri=source_uri,
            item_id=item_id,
            relpath=relpath,
        )
        if after_mutation.add_tags:
            # The sidecar is already on disk, so it is rewritten with the extended tag list.
            resolved_tags = with_hook_tags(resolved_tags, after_mutation.add_tags)
            sidecar["tags"] = resolved_tags
            _write_sidecar(output_path, sidecar)

    created_at = utc_now_iso()
    self._upsert_catalog_item(
        CatalogItem(
            id=item_id,
            relpath=relpath,
            sha256=sha256_digest,
            bytes=bytes_written,
            media_type=media_type,
            title=None,
            tags=list(resolved_tags),
            metadata=dict(sidecar or {}),
            created_at=created_at,
            source_uri=source_uri,
        )
    )

    return IngestResult(item_id=item_id, relpath=relpath, sha256=sha256_digest)
938
+
676
939
  def ingest_note(
677
940
  self,
678
941
  text: str,
@@ -695,7 +958,6 @@ class Corpus:
695
958
  :return: Ingestion result summary.
696
959
  :rtype: IngestResult
697
960
  """
698
-
699
961
  data = text.encode("utf-8")
700
962
  return self.ingest_item(
701
963
  data,
@@ -726,6 +988,35 @@ class Corpus:
726
988
  :return: Ingestion result summary.
727
989
  :rtype: IngestResult
728
990
  """
991
+ candidate_path = Path(source) if isinstance(source, str) and "://" not in source else None
992
+ if isinstance(source, Path) or (candidate_path is not None and candidate_path.exists()):
993
+ path = source if isinstance(source, Path) else candidate_path
994
+ assert isinstance(path, Path)
995
+ path = path.resolve()
996
+ filename = path.name
997
+ media_type, _ = mimetypes.guess_type(filename)
998
+ media_type = media_type or "application/octet-stream"
999
+ if path.suffix.lower() in {".md", ".markdown"}:
1000
+ media_type = "text/markdown"
1001
+ if media_type == "text/markdown":
1002
+ return self.ingest_item(
1003
+ path.read_bytes(),
1004
+ filename=filename,
1005
+ media_type=media_type,
1006
+ title=None,
1007
+ tags=tags,
1008
+ metadata=None,
1009
+ source_uri=source_uri or path.as_uri(),
1010
+ )
1011
+ with path.open("rb") as handle:
1012
+ return self.ingest_item_stream(
1013
+ handle,
1014
+ filename=filename,
1015
+ media_type=media_type,
1016
+ tags=tags,
1017
+ metadata=None,
1018
+ source_uri=source_uri or path.as_uri(),
1019
+ )
729
1020
 
730
1021
  payload = load_source(source, source_uri=source_uri)
731
1022
  return self.ingest_item(
@@ -738,6 +1029,128 @@ class Corpus:
738
1029
  source_uri=payload.source_uri,
739
1030
  )
740
1031
 
1032
def import_tree(self, source_root: Path, *, tags: Sequence[str] = ()) -> Dict[str, int]:
    """
    Import every file of a folder tree into the corpus.

    Files are visited in sorted path order. Each file whose path relative to the source root
    matches the corpus ignore spec is counted as ignored; every other file is passed to
    ``_import_file`` together with one import identifier shared by the whole call.

    :param source_root: Root directory of the folder tree to import.
    :type source_root: Path
    :param tags: Tags to associate with imported items.
    :type tags: Sequence[str]
    :return: Import statistics with the keys ``scanned``, ``ignored`` and ``imported``.
    :rtype: dict[str, int]
    :raises FileNotFoundError: If the source_root does not exist.
    :raises ValueError: If a markdown file cannot be decoded as Unicode Transformation Format 8.
    """
    source_root = source_root.resolve()
    if not source_root.is_dir():
        raise FileNotFoundError(f"Import source root does not exist: {source_root}")

    ignore_spec = load_corpus_ignore_spec(self.root)
    import_id = str(uuid.uuid4())

    scanned = 0
    ignored = 0
    imported = 0
    for candidate in sorted(source_root.rglob("*")):
        if candidate.is_file():
            # Ignore patterns are matched against the POSIX-style path relative to the root.
            relative_source_path = candidate.relative_to(source_root).as_posix()
            scanned += 1
            if ignore_spec.matches(relative_source_path):
                ignored += 1
            else:
                self._import_file(
                    source_path=candidate,
                    import_id=import_id,
                    relative_source_path=relative_source_path,
                    tags=tags,
                )
                imported += 1

    return {"scanned": scanned, "ignored": ignored, "imported": imported}
1073
+
1074
def _import_file(
    self,
    *,
    source_path: Path,
    import_id: str,
    relative_source_path: str,
    tags: Sequence[str],
) -> None:
    """
    Copy one file into the import namespace and register it in the catalog.

    The bytes are stored unchanged at ``<raw dir>/imports/<import_id>/<relative_source_path>``
    under the corpus root, a sidecar is written next to the copy, and a catalog item is
    upserted. For markdown files the front matter is parsed: it is merged with the sidecar to
    form the catalog metadata and may supply the title.

    :param source_path: Source file path to import.
    :type source_path: Path
    :param import_id: Import identifier.
    :type import_id: str
    :param relative_source_path: Relative path within the imported tree.
    :type relative_source_path: str
    :param tags: Tags to apply.
    :type tags: Sequence[str]
    :return: None.
    :rtype: None
    :raises ValueError: If a markdown file cannot be decoded as Unicode Transformation Format 8.
    """
    item_id = str(uuid.uuid4())
    destination_relpath = str(Path(DEFAULT_RAW_DIR, "imports", import_id, relative_source_path))
    destination_path = (self.root / destination_relpath).resolve()
    destination_path.parent.mkdir(parents=True, exist_ok=True)

    payload = source_path.read_bytes()
    sha256_digest = _sha256_bytes(payload)

    # The markdown suffixes win over whatever the mimetypes table guesses.
    guessed_type, _ = mimetypes.guess_type(source_path.name)
    if source_path.suffix.lower() in {".md", ".markdown"}:
        media_type = "text/markdown"
    else:
        media_type = guessed_type or "application/octet-stream"
    is_markdown = media_type == "text/markdown"

    title: Optional[str] = None
    frontmatter_metadata: Dict[str, Any] = {}
    if is_markdown:
        try:
            markdown_text = payload.decode("utf-8")
        except UnicodeDecodeError as decode_error:
            raise ValueError(
                f"Markdown file must be Unicode Transformation Format 8: {relative_source_path}"
            ) from decode_error
        frontmatter_metadata = dict(parse_front_matter(markdown_text).metadata)
        candidate_title = frontmatter_metadata.get("title")
        if isinstance(candidate_title, str):
            stripped_title = candidate_title.strip()
            if stripped_title:
                title = stripped_title

    # The copy is written only after a markdown payload has been decoded successfully.
    destination_path.write_bytes(payload)

    source_uri = source_path.as_uri()
    sidecar: Dict[str, Any] = {}
    if tags:
        sidecar["tags"] = [
            raw_tag.strip() for raw_tag in tags if isinstance(raw_tag, str) and raw_tag.strip()
        ]
    if not is_markdown:
        sidecar["media_type"] = media_type
    sidecar["biblicus"] = {"id": item_id, "source": source_uri}
    _write_sidecar(destination_path, sidecar)

    merged_metadata = _merge_metadata(frontmatter_metadata, sidecar)
    resolved_tags = _merge_tags([], merged_metadata.get("tags"))

    self._upsert_catalog_item(
        CatalogItem(
            id=item_id,
            relpath=destination_relpath,
            sha256=sha256_digest,
            bytes=len(payload),
            media_type=media_type,
            title=title,
            tags=list(resolved_tags),
            metadata=dict(merged_metadata or {}),
            created_at=utc_now_iso(),
            source_uri=source_uri,
        )
    )
1153
+
741
1154
  def list_items(self, *, limit: int = 50) -> List[CatalogItem]:
742
1155
  """
743
1156
  List items from the catalog.
@@ -747,11 +1160,8 @@ class Corpus:
747
1160
  :return: Catalog items ordered by recency.
748
1161
  :rtype: list[CatalogItem]
749
1162
  """
750
-
751
1163
  catalog = self._load_catalog()
752
- ordered_ids = (
753
- catalog.order[:limit] if catalog.order else list(catalog.items.keys())[:limit]
754
- )
1164
+ ordered_ids = catalog.order[:limit] if catalog.order else list(catalog.items.keys())[:limit]
755
1165
  collected_items: List[CatalogItem] = []
756
1166
  for item_id in ordered_ids:
757
1167
  item = catalog.items.get(item_id)
@@ -769,7 +1179,6 @@ class Corpus:
769
1179
  :rtype: CatalogItem
770
1180
  :raises KeyError: If the item identifier is unknown.
771
1181
  """
772
-
773
1182
  catalog = self._load_catalog()
774
1183
  item = catalog.items.get(item_id)
775
1184
  if item is None:
@@ -787,7 +1196,6 @@ class Corpus:
787
1196
  :rtype: dict[str, int]
788
1197
  :raises ValueError: If a markdown file cannot be decoded as Unicode Transformation Format 8.
789
1198
  """
790
-
791
1199
  self._init_catalog()
792
1200
  existing_catalog = self._load_catalog()
793
1201
  stats = {"scanned": 0, "skipped": 0, "inserted": 0, "updated": 0}
@@ -862,7 +1270,9 @@ class Corpus:
862
1270
 
863
1271
  previous_item = existing_catalog.items.get(item_id)
864
1272
  created_at = previous_item.created_at if previous_item is not None else utc_now_iso()
865
- source_uri = source_uri or (previous_item.source_uri if previous_item is not None else None)
1273
+ source_uri = source_uri or (
1274
+ previous_item.source_uri if previous_item is not None else None
1275
+ )
866
1276
 
867
1277
  if previous_item is None:
868
1278
  stats["inserted"] += 1
@@ -909,7 +1319,6 @@ class Corpus:
909
1319
  :return: Corpus name.
910
1320
  :rtype: str
911
1321
  """
912
-
913
1322
  return self.root.name
914
1323
 
915
1324
  def purge(self, *, confirm: str) -> None:
@@ -922,10 +1331,11 @@ class Corpus:
922
1331
  :rtype: None
923
1332
  :raises ValueError: If the confirmation does not match.
924
1333
  """
925
-
926
1334
  expected = self.name
927
1335
  if confirm != expected:
928
- raise ValueError(f"Confirmation mismatch: pass --confirm {expected!r} to purge this corpus")
1336
+ raise ValueError(
1337
+ f"Confirmation mismatch: pass --confirm {expected!r} to purge this corpus"
1338
+ )
929
1339
 
930
1340
  if self.raw_dir.exists():
931
1341
  shutil.rmtree(self.raw_dir)