biblicus-0.1.1-py3-none-any.whl → biblicus-0.2.0-py3-none-any.whl

This diff shows the changes between publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registry.
biblicus/corpus.py CHANGED
@@ -14,8 +14,20 @@ from typing import Any, Dict, List, Optional, Sequence
 
 import yaml
 
-from .constants import CORPUS_DIR_NAME, DEFAULT_RAW_DIR, RUNS_DIR_NAME, SCHEMA_VERSION, SIDECAR_SUFFIX
+from .constants import (
+    CORPUS_DIR_NAME,
+    DEFAULT_RAW_DIR,
+    EXTRACTION_RUNS_DIR_NAME,
+    RUNS_DIR_NAME,
+    SCHEMA_VERSION,
+    SIDECAR_SUFFIX,
+)
 from .frontmatter import parse_front_matter, render_front_matter
+from pydantic import ValidationError
+
+from .hook_manager import HookManager
+from .hooks import HookPoint
+from .ignore import load_corpus_ignore_spec
 from .models import CatalogItem, CorpusCatalog, CorpusConfig, IngestResult, RetrievalRun
 from .sources import load_source
 from .time import utc_now_iso
@@ -35,6 +47,34 @@ def _sha256_bytes(data: bytes) -> str:
     return hashlib.sha256(data).hexdigest()
 
 
+def _write_stream_and_hash(stream, destination_path: Path, *, chunk_size: int = 1024 * 1024) -> Dict[str, object]:
+    """
+    Write a binary stream to disk while computing a digest.
+
+    :param stream: Binary stream to read from.
+    :type stream: object
+    :param destination_path: Destination path to write to.
+    :type destination_path: Path
+    :param chunk_size: Chunk size for reads.
+    :type chunk_size: int
+    :return: Mapping containing sha256 and bytes_written.
+    :rtype: dict[str, object]
+    :raises OSError: If the destination cannot be written.
+    """
+
+    hasher = hashlib.sha256()
+    bytes_written = 0
+    with destination_path.open("wb") as destination_handle:
+        while True:
+            chunk = stream.read(chunk_size)
+            if not chunk:
+                break
+            hasher.update(chunk)
+            destination_handle.write(chunk)
+            bytes_written += len(chunk)
+    return {"sha256": hasher.hexdigest(), "bytes_written": bytes_written}
+
+
 def _sanitize_filename(name: str) -> str:
     """
     Sanitize a filename into a portable, filesystem-friendly form.
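
The new helper digests and writes in a single pass over 1 MiB chunks, so memory use stays flat however large the payload is. A quick way to check that the chunked digest matches a whole-payload digest (an illustrative, in-memory sketch; the helper is private, so this is not a supported API):

    import hashlib
    import io
    from pathlib import Path
    from biblicus.corpus import _write_stream_and_hash  # private helper, shown for illustration

    payload = b"x" * (3 * 1024 * 1024 + 17)  # spans several chunks plus a partial one
    result = _write_stream_and_hash(io.BytesIO(payload), Path("blob.bin"))
    assert result["sha256"] == hashlib.sha256(payload).hexdigest()
    assert result["bytes_written"] == len(payload)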
@@ -289,6 +329,7 @@ class Corpus:
         self.meta_dir = self.root / CORPUS_DIR_NAME
         self.raw_dir = self.root / DEFAULT_RAW_DIR
         self.config = self._load_config()
+        self._hooks = self._load_hooks()
 
     @property
     def uri(self) -> str:
@@ -314,7 +355,33 @@ class Corpus:
         if not path.is_file():
             return None
         data = json.loads(path.read_text(encoding="utf-8"))
-        return CorpusConfig.model_validate(data)
+        try:
+            return CorpusConfig.model_validate(data)
+        except ValidationError as exc:
+            has_hook_error = any(
+                isinstance(error.get("loc"), tuple) and error.get("loc") and error.get("loc")[0] == "hooks"
+                for error in exc.errors()
+            )
+            if has_hook_error:
+                raise ValueError(f"Invalid hook specification: {exc}") from exc
+            raise ValueError(f"Invalid corpus config: {exc}") from exc
+
+    def _load_hooks(self) -> Optional[HookManager]:
+        """
+        Load the hook manager from config if hooks are configured.
+
+        :return: Hook manager or None.
+        :rtype: HookManager or None
+        :raises ValueError: If hook specifications are invalid.
+        """
+
+        if self.config is None or not self.config.hooks:
+            return None
+        return HookManager.from_config(
+            corpus_root=self.root,
+            corpus_uri=self.uri,
+            hook_specs=self.config.hooks,
+        )
 
     @classmethod
     def find(cls, start: Path) -> "Corpus":
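
Config validation failures are now mapped to ValueError, with a distinct message when the failing field is hooks. Assuming a corpus whose stored config carries a malformed hooks entry, the failure surfaces roughly like this (a sketch; Corpus.find is the lookup shown in this diff):

    from pathlib import Path
    from biblicus.corpus import Corpus

    try:
        corpus = Corpus.find(Path.cwd())
    except ValueError as exc:
        # "Invalid hook specification: ..." when the pydantic error points at hooks,
        # "Invalid corpus config: ..." for any other validation failure
        print(exc)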
@@ -468,6 +535,51 @@ class Corpus:
 
         return self.meta_dir / RUNS_DIR_NAME
 
+    @property
+    def extraction_runs_dir(self) -> Path:
+        """
+        Location of extraction run artifacts.
+
+        :return: Path to the extraction runs directory.
+        :rtype: Path
+        """
+
+        return self.runs_dir / EXTRACTION_RUNS_DIR_NAME
+
+    def extraction_run_dir(self, *, extractor_id: str, run_id: str) -> Path:
+        """
+        Resolve an extraction run directory.
+
+        :param extractor_id: Extractor plugin identifier.
+        :type extractor_id: str
+        :param run_id: Extraction run identifier.
+        :type run_id: str
+        :return: Extraction run directory.
+        :rtype: Path
+        """
+
+        return self.extraction_runs_dir / extractor_id / run_id
+
+    def read_extracted_text(self, *, extractor_id: str, run_id: str, item_id: str) -> Optional[str]:
+        """
+        Read extracted text for an item from an extraction run, when present.
+
+        :param extractor_id: Extractor plugin identifier.
+        :type extractor_id: str
+        :param run_id: Extraction run identifier.
+        :type run_id: str
+        :param item_id: Item identifier.
+        :type item_id: str
+        :return: Extracted text or None if the artifact does not exist.
+        :rtype: str or None
+        :raises OSError: If the file exists but cannot be read.
+        """
+
+        path = self.extraction_run_dir(extractor_id=extractor_id, run_id=run_id) / "text" / f"{item_id}.txt"
+        if not path.is_file():
+            return None
+        return path.read_text(encoding="utf-8")
+
     def _ensure_runs_dir(self) -> None:
         """
         Ensure the retrieval runs directory exists.
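
Extraction artifacts therefore resolve to extraction_runs_dir/<extractor_id>/<run_id>/text/<item_id>.txt, and read_extracted_text returns None rather than raising when no artifact exists. A minimal read, with hypothetical extractor and run identifiers:

    from pathlib import Path
    from biblicus.corpus import Corpus

    corpus = Corpus.find(Path.cwd())
    text = corpus.read_extracted_text(
        extractor_id="pdf-text",   # hypothetical extractor plugin id
        run_id="run-0001",         # hypothetical run id
        item_id="some-item-id",    # catalog item id
    )
    if text is None:
        pass  # no extracted text for this item in that run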
@@ -608,6 +720,21 @@ class Corpus:
         if resolved_tags and "tags" not in metadata_input:
             metadata_input["tags"] = list(resolved_tags)
 
+        if self._hooks is not None:
+            mutation = self._hooks.run_ingest_hooks(
+                hook_point=HookPoint.before_ingest,
+                filename=filename,
+                media_type=media_type,
+                title=resolved_title,
+                tags=list(resolved_tags),
+                metadata=dict(metadata_input),
+                source_uri=source_uri,
+            )
+            if mutation.add_tags:
+                for tag in mutation.add_tags:
+                    if tag not in resolved_tags:
+                        resolved_tags.append(tag)
+
         frontmatter: Dict[str, Any] = {}
 
         if media_type == "text/markdown":
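
The before_ingest hook runs before any bytes are written, and its add_tags are merged in order without duplicates. The merge in isolation:

    resolved_tags = ["alpha", "beta"]
    mutation_add_tags = ["beta", "gamma"]
    for tag in mutation_add_tags:
        if tag not in resolved_tags:
            resolved_tags.append(tag)
    assert resolved_tags == ["alpha", "beta", "gamma"]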
@@ -656,6 +783,32 @@ class Corpus:
         _write_sidecar(output_path, sidecar)
         frontmatter = sidecar
 
+        if self._hooks is not None:
+            mutation = self._hooks.run_ingest_hooks(
+                hook_point=HookPoint.after_ingest,
+                filename=filename,
+                media_type=media_type,
+                title=resolved_title,
+                tags=list(resolved_tags),
+                metadata=dict(metadata_input),
+                source_uri=source_uri,
+                item_id=item_id,
+                relpath=relpath,
+            )
+            if mutation.add_tags:
+                updated_tags = list(resolved_tags)
+                for tag in mutation.add_tags:
+                    if tag not in updated_tags:
+                        updated_tags.append(tag)
+                resolved_tags = updated_tags
+                sidecar_metadata = _load_sidecar(output_path)
+                sidecar_metadata["tags"] = resolved_tags
+                if media_type != "text/markdown":
+                    sidecar_metadata["media_type"] = media_type
+                sidecar_metadata["biblicus"] = {"id": item_id, "source": source_uri}
+                _write_sidecar(output_path, sidecar_metadata)
+                frontmatter = _merge_metadata(frontmatter if isinstance(frontmatter, dict) else {}, sidecar_metadata)
+
         created_at = utc_now_iso()
         item_record = CatalogItem(
             id=item_id,
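
At after_ingest the item already exists on disk, so a tag mutation also rewrites the stored sidecar and re-merges it into the catalog metadata. Assuming a hook added "ocr" to a PDF item, the rewritten sidecar mapping would look roughly like this (illustrative values; item_id and source_uri come from the ingest call):

    sidecar_metadata = {
        "tags": ["scanned", "ocr"],                        # "ocr" appended by the hook
        "media_type": "application/pdf",                   # recorded for non-markdown items
        "biblicus": {"id": item_id, "source": source_uri},
    }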
@@ -673,6 +826,130 @@ class Corpus:
 
         return IngestResult(item_id=item_id, relpath=relpath, sha256=sha256_digest)
 
+    def ingest_item_stream(
+        self,
+        stream,
+        *,
+        filename: Optional[str] = None,
+        media_type: str = "application/octet-stream",
+        tags: Sequence[str] = (),
+        metadata: Optional[Dict[str, Any]] = None,
+        source_uri: str = "unknown",
+    ) -> IngestResult:
+        """
+        Ingest a binary item from a readable stream.
+
+        This method is intended for large non-markdown items. It writes bytes to disk incrementally
+        while computing a checksum.
+
+        :param stream: Readable binary stream.
+        :type stream: object
+        :param filename: Optional filename for the stored item.
+        :type filename: str or None
+        :param media_type: Internet Assigned Numbers Authority media type for the item.
+        :type media_type: str
+        :param tags: Tags to associate with the item.
+        :type tags: Sequence[str]
+        :param metadata: Optional metadata mapping.
+        :type metadata: dict[str, Any] or None
+        :param source_uri: Source uniform resource identifier for provenance.
+        :type source_uri: str
+        :return: Ingestion result summary.
+        :rtype: IngestResult
+        :raises ValueError: If the media_type is text/markdown.
+        """
+
+        if media_type == "text/markdown":
+            raise ValueError("Stream ingestion is not supported for Markdown")
+
+        item_id = str(uuid.uuid4())
+        safe_filename = _sanitize_filename(filename) if filename else ""
+        if safe_filename:
+            safe_filename = _ensure_filename_extension(safe_filename, media_type=media_type)
+
+        if safe_filename:
+            output_name = f"{item_id}--{safe_filename}"
+        else:
+            extension = _preferred_extension_for_media_type(media_type) or ""
+            output_name = f"{item_id}{extension}" if extension else f"{item_id}"
+
+        relpath = str(Path(DEFAULT_RAW_DIR) / output_name)
+        output_path = self.root / relpath
+
+        resolved_tags = list(tags)
+        metadata_input: Dict[str, Any] = dict(metadata or {})
+        if resolved_tags and "tags" not in metadata_input:
+            metadata_input["tags"] = list(resolved_tags)
+
+        if self._hooks is not None:
+            mutation = self._hooks.run_ingest_hooks(
+                hook_point=HookPoint.before_ingest,
+                filename=filename,
+                media_type=media_type,
+                title=None,
+                tags=list(resolved_tags),
+                metadata=dict(metadata_input),
+                source_uri=source_uri,
+            )
+            if mutation.add_tags:
+                for tag in mutation.add_tags:
+                    if tag not in resolved_tags:
+                        resolved_tags.append(tag)
+
+        write_result = _write_stream_and_hash(stream, output_path)
+        sha256_digest = str(write_result["sha256"])
+        bytes_written = int(write_result["bytes_written"])
+
+        sidecar: Dict[str, Any] = {}
+        sidecar["media_type"] = media_type
+        if resolved_tags:
+            sidecar["tags"] = resolved_tags
+        if metadata_input:
+            for metadata_key, metadata_value in metadata_input.items():
+                if metadata_key in {"tags", "biblicus"}:
+                    continue
+                sidecar[metadata_key] = metadata_value
+        sidecar["biblicus"] = {"id": item_id, "source": source_uri}
+        _write_sidecar(output_path, sidecar)
+
+        if self._hooks is not None:
+            mutation = self._hooks.run_ingest_hooks(
+                hook_point=HookPoint.after_ingest,
+                filename=filename,
+                media_type=media_type,
+                title=None,
+                tags=list(resolved_tags),
+                metadata=dict(metadata_input),
+                source_uri=source_uri,
+                item_id=item_id,
+                relpath=relpath,
+            )
+            if mutation.add_tags:
+                updated_tags = list(resolved_tags)
+                for tag in mutation.add_tags:
+                    if tag not in updated_tags:
+                        updated_tags.append(tag)
+                resolved_tags = updated_tags
+                sidecar["tags"] = resolved_tags
+                _write_sidecar(output_path, sidecar)
+
+        created_at = utc_now_iso()
+        item_record = CatalogItem(
+            id=item_id,
+            relpath=relpath,
+            sha256=sha256_digest,
+            bytes=bytes_written,
+            media_type=media_type,
+            title=None,
+            tags=list(resolved_tags),
+            metadata=dict(sidecar or {}),
+            created_at=created_at,
+            source_uri=source_uri,
+        )
+        self._upsert_catalog_item(item_record)
+
+        return IngestResult(item_id=item_id, relpath=relpath, sha256=sha256_digest)
+
     def ingest_note(
         self,
         text: str,
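
Assuming a corpus resolved via Corpus.find and an illustrative local file, streaming ingestion looks like this; the file is never buffered whole in memory:

    from pathlib import Path
    from biblicus.corpus import Corpus

    corpus = Corpus.find(Path.cwd())
    source = Path("footage.mp4")  # illustrative path
    with source.open("rb") as handle:
        result = corpus.ingest_item_stream(
            handle,
            filename=source.name,
            media_type="video/mp4",
            tags=["footage"],
            source_uri=source.resolve().as_uri(),
        )
    print(result.item_id, result.relpath, result.sha256)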
@@ -727,6 +1004,36 @@ class Corpus:
         :rtype: IngestResult
         """
 
+        candidate_path = Path(source) if isinstance(source, str) and "://" not in source else None
+        if isinstance(source, Path) or (candidate_path is not None and candidate_path.exists()):
+            path = source if isinstance(source, Path) else candidate_path
+            assert isinstance(path, Path)
+            path = path.resolve()
+            filename = path.name
+            media_type, _ = mimetypes.guess_type(filename)
+            media_type = media_type or "application/octet-stream"
+            if path.suffix.lower() in {".md", ".markdown"}:
+                media_type = "text/markdown"
+            if media_type == "text/markdown":
+                return self.ingest_item(
+                    path.read_bytes(),
+                    filename=filename,
+                    media_type=media_type,
+                    title=None,
+                    tags=tags,
+                    metadata=None,
+                    source_uri=source_uri or path.as_uri(),
+                )
+            with path.open("rb") as handle:
+                return self.ingest_item_stream(
+                    handle,
+                    filename=filename,
+                    media_type=media_type,
+                    tags=tags,
+                    metadata=None,
+                    source_uri=source_uri or path.as_uri(),
+                )
+
         payload = load_source(source, source_uri=source_uri)
         return self.ingest_item(
             payload.data,
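
The dispatch rule: a Path always takes the local-file branch, a string takes it only when it contains no "://" and names an existing file, and everything else falls through to load_source. The predicate in isolation:

    from pathlib import Path

    def takes_local_branch(source) -> bool:
        candidate = Path(source) if isinstance(source, str) and "://" not in source else None
        return isinstance(source, Path) or (candidate is not None and candidate.exists())

    assert takes_local_branch(Path("anything"))                # a Path takes the branch even if missing
    assert not takes_local_branch("https://example.com/a.md")  # URIs go to load_source
    assert not takes_local_branch("no/such/file.txt")          # missing string paths fall through too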
@@ -738,6 +1045,128 @@ class Corpus:
             source_uri=payload.source_uri,
         )
 
+    def import_tree(self, source_root: Path, *, tags: Sequence[str] = ()) -> Dict[str, int]:
+        """
+        Import a folder tree into the corpus, preserving relative paths and provenance.
+
+        Imported content is stored under the raw directory in a dedicated import namespace so that
+        operators can inspect and back up imported content as a structured tree.
+
+        :param source_root: Root directory of the folder tree to import.
+        :type source_root: Path
+        :param tags: Tags to associate with imported items.
+        :type tags: Sequence[str]
+        :return: Import statistics.
+        :rtype: dict[str, int]
+        :raises FileNotFoundError: If the source_root does not exist.
+        :raises ValueError: If a markdown file cannot be decoded as Unicode Transformation Format 8.
+        """
+
+        source_root = source_root.resolve()
+        if not source_root.is_dir():
+            raise FileNotFoundError(f"Import source root does not exist: {source_root}")
+
+        ignore_spec = load_corpus_ignore_spec(self.root)
+        import_id = str(uuid.uuid4())
+        stats = {"scanned": 0, "ignored": 0, "imported": 0}
+
+        for source_path in sorted(source_root.rglob("*")):
+            if not source_path.is_file():
+                continue
+            relative_source_path = source_path.relative_to(source_root).as_posix()
+            stats["scanned"] += 1
+            if ignore_spec.matches(relative_source_path):
+                stats["ignored"] += 1
+                continue
+            self._import_file(
+                source_path=source_path,
+                import_id=import_id,
+                relative_source_path=relative_source_path,
+                tags=tags,
+            )
+            stats["imported"] += 1
+
+        return stats
+
+    def _import_file(
+        self,
+        *,
+        source_path: Path,
+        import_id: str,
+        relative_source_path: str,
+        tags: Sequence[str],
+    ) -> None:
+        """
+        Import a single file into the corpus under an import namespace.
+
+        :param source_path: Source file path to import.
+        :type source_path: Path
+        :param import_id: Import identifier.
+        :type import_id: str
+        :param relative_source_path: Relative path within the imported tree.
+        :type relative_source_path: str
+        :param tags: Tags to apply.
+        :type tags: Sequence[str]
+        :return: None.
+        :rtype: None
+        :raises ValueError: If a markdown file cannot be decoded as Unicode Transformation Format 8.
+        """
+
+        item_id = str(uuid.uuid4())
+        destination_relpath = str(Path(DEFAULT_RAW_DIR) / "imports" / import_id / relative_source_path)
+        destination_path = (self.root / destination_relpath).resolve()
+        destination_path.parent.mkdir(parents=True, exist_ok=True)
+
+        raw_bytes = source_path.read_bytes()
+        sha256_digest = _sha256_bytes(raw_bytes)
+
+        media_type, _ = mimetypes.guess_type(source_path.name)
+        media_type = media_type or "application/octet-stream"
+        if source_path.suffix.lower() in {".md", ".markdown"}:
+            media_type = "text/markdown"
+
+        title: Optional[str] = None
+        frontmatter_metadata: Dict[str, Any] = {}
+        if media_type == "text/markdown":
+            try:
+                text = raw_bytes.decode("utf-8")
+            except UnicodeDecodeError as decode_error:
+                raise ValueError(
+                    f"Markdown file must be Unicode Transformation Format 8: {relative_source_path}"
+                ) from decode_error
+            parsed_document = parse_front_matter(text)
+            frontmatter_metadata = dict(parsed_document.metadata)
+            title_value = frontmatter_metadata.get("title")
+            if isinstance(title_value, str) and title_value.strip():
+                title = title_value.strip()
+
+        destination_path.write_bytes(raw_bytes)
+
+        sidecar: Dict[str, Any] = {}
+        if tags:
+            sidecar["tags"] = [t.strip() for t in tags if isinstance(t, str) and t.strip()]
+        if media_type != "text/markdown":
+            sidecar["media_type"] = media_type
+        sidecar["biblicus"] = {"id": item_id, "source": source_path.as_uri()}
+        _write_sidecar(destination_path, sidecar)
+
+        merged_metadata = _merge_metadata(frontmatter_metadata, sidecar)
+        resolved_tags = _merge_tags([], merged_metadata.get("tags"))
+
+        item_record = CatalogItem(
+            id=item_id,
+            relpath=destination_relpath,
+            sha256=sha256_digest,
+            bytes=len(raw_bytes),
+            media_type=media_type,
+            title=title,
+            tags=list(resolved_tags),
+            metadata=dict(merged_metadata or {}),
+            created_at=utc_now_iso(),
+            source_uri=source_path.as_uri(),
+        )
+        self._upsert_catalog_item(item_record)
+
     def list_items(self, *, limit: int = 50) -> List[CatalogItem]:
         """
        List items from the catalog.
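
Assuming an illustrative archive folder, a tree import returns counters rather than per-item results; ignored files are those matched by the corpus ignore spec:

    from pathlib import Path
    from biblicus.corpus import Corpus

    corpus = Corpus.find(Path.cwd())
    stats = corpus.import_tree(Path("archive-notes"), tags=["archive"])
    print(stats)  # e.g. {"scanned": 120, "ignored": 5, "imported": 115}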