biblicus-0.1.1-py3-none-any.whl → biblicus-0.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +1 -1
- biblicus/backends/scan.py +81 -4
- biblicus/backends/sqlite_full_text_search.py +63 -2
- biblicus/cli.py +123 -0
- biblicus/constants.py +2 -0
- biblicus/corpus.py +431 -2
- biblicus/extraction.py +330 -0
- biblicus/extractors/__init__.py +33 -0
- biblicus/extractors/base.py +61 -0
- biblicus/extractors/cascade.py +101 -0
- biblicus/extractors/metadata_text.py +98 -0
- biblicus/extractors/pass_through_text.py +74 -0
- biblicus/hook_logging.py +185 -0
- biblicus/hook_manager.py +205 -0
- biblicus/hooks.py +265 -0
- biblicus/ignore.py +67 -0
- biblicus/models.py +20 -0
- biblicus/sources.py +45 -0
- {biblicus-0.1.1.dist-info → biblicus-0.2.0.dist-info}/METADATA +101 -1
- biblicus-0.2.0.dist-info/RECORD +32 -0
- biblicus-0.1.1.dist-info/RECORD +0 -22
- {biblicus-0.1.1.dist-info → biblicus-0.2.0.dist-info}/WHEEL +0 -0
- {biblicus-0.1.1.dist-info → biblicus-0.2.0.dist-info}/entry_points.txt +0 -0
- {biblicus-0.1.1.dist-info → biblicus-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-0.1.1.dist-info → biblicus-0.2.0.dist-info}/top_level.txt +0 -0
biblicus/corpus.py
CHANGED
```diff
@@ -14,8 +14,20 @@ from typing import Any, Dict, List, Optional, Sequence
 
 import yaml
 
-from .constants import
+from .constants import (
+    CORPUS_DIR_NAME,
+    DEFAULT_RAW_DIR,
+    EXTRACTION_RUNS_DIR_NAME,
+    RUNS_DIR_NAME,
+    SCHEMA_VERSION,
+    SIDECAR_SUFFIX,
+)
 from .frontmatter import parse_front_matter, render_front_matter
+from pydantic import ValidationError
+
+from .hook_manager import HookManager
+from .hooks import HookPoint
+from .ignore import load_corpus_ignore_spec
 from .models import CatalogItem, CorpusCatalog, CorpusConfig, IngestResult, RetrievalRun
 from .sources import load_source
 from .time import utc_now_iso
```
```diff
@@ -35,6 +47,34 @@ def _sha256_bytes(data: bytes) -> str:
     return hashlib.sha256(data).hexdigest()
 
 
+def _write_stream_and_hash(stream, destination_path: Path, *, chunk_size: int = 1024 * 1024) -> Dict[str, object]:
+    """
+    Write a binary stream to disk while computing a digest.
+
+    :param stream: Binary stream to read from.
+    :type stream: object
+    :param destination_path: Destination path to write to.
+    :type destination_path: Path
+    :param chunk_size: Chunk size for reads.
+    :type chunk_size: int
+    :return: Mapping containing sha256 and bytes_written.
+    :rtype: dict[str, object]
+    :raises OSError: If the destination cannot be written.
+    """
+
+    hasher = hashlib.sha256()
+    bytes_written = 0
+    with destination_path.open("wb") as destination_handle:
+        while True:
+            chunk = stream.read(chunk_size)
+            if not chunk:
+                break
+            hasher.update(chunk)
+            destination_handle.write(chunk)
+            bytes_written += len(chunk)
+    return {"sha256": hasher.hexdigest(), "bytes_written": bytes_written}
+
+
 def _sanitize_filename(name: str) -> str:
     """
     Sanitize a filename into a portable, filesystem-friendly form.
```
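This helper bounds memory use by hashing and writing one chunk at a time instead of buffering the whole payload. A minimal standalone sketch of the same hash-while-copying pattern (the file paths are hypothetical, for illustration only):

```python
import hashlib
from pathlib import Path


def copy_with_digest(source: Path, destination: Path, chunk_size: int = 1024 * 1024) -> str:
    """Copy source to destination in fixed-size chunks, returning the SHA-256 hex digest."""
    hasher = hashlib.sha256()
    with source.open("rb") as src, destination.open("wb") as dst:
        while chunk := src.read(chunk_size):
            hasher.update(chunk)
            dst.write(chunk)
    return hasher.hexdigest()


# Hypothetical paths for illustration only.
print(copy_with_digest(Path("source.bin"), Path("copy.bin")))
```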
```diff
@@ -289,6 +329,7 @@ class Corpus:
         self.meta_dir = self.root / CORPUS_DIR_NAME
         self.raw_dir = self.root / DEFAULT_RAW_DIR
         self.config = self._load_config()
+        self._hooks = self._load_hooks()
 
     @property
     def uri(self) -> str:
```
```diff
@@ -314,7 +355,33 @@ class Corpus:
         if not path.is_file():
             return None
         data = json.loads(path.read_text(encoding="utf-8"))
-
+        try:
+            return CorpusConfig.model_validate(data)
+        except ValidationError as exc:
+            has_hook_error = any(
+                isinstance(error.get("loc"), tuple) and error.get("loc") and error.get("loc")[0] == "hooks"
+                for error in exc.errors()
+            )
+            if has_hook_error:
+                raise ValueError(f"Invalid hook specification: {exc}") from exc
+            raise ValueError(f"Invalid corpus config: {exc}") from exc
+
+    def _load_hooks(self) -> Optional[HookManager]:
+        """
+        Load the hook manager from config if hooks are configured.
+
+        :return: Hook manager or None.
+        :rtype: HookManager or None
+        :raises ValueError: If hook specifications are invalid.
+        """
+
+        if self.config is None or not self.config.hooks:
+            return None
+        return HookManager.from_config(
+            corpus_root=self.root,
+            corpus_uri=self.uri,
+            hook_specs=self.config.hooks,
+        )
 
     @classmethod
     def find(cls, start: Path) -> "Corpus":
```
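Hook configuration is therefore validated as soon as the corpus is opened, and a bad `hooks` entry fails fast with a distinct message. A minimal sketch of catching that failure (the corpus path is hypothetical, and other exceptions from `Corpus.find` are not handled here):

```python
from pathlib import Path

from biblicus.corpus import Corpus

# Hypothetical corpus location for illustration only.
try:
    corpus = Corpus.find(Path("./my-corpus"))
except ValueError as error:
    # Raised when the config is invalid; hook problems are
    # prefixed with "Invalid hook specification".
    print(f"could not open corpus: {error}")
```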
```diff
@@ -468,6 +535,51 @@ class Corpus:
 
         return self.meta_dir / RUNS_DIR_NAME
 
+    @property
+    def extraction_runs_dir(self) -> Path:
+        """
+        Location of extraction run artifacts.
+
+        :return: Path to the extraction runs directory.
+        :rtype: Path
+        """
+
+        return self.runs_dir / EXTRACTION_RUNS_DIR_NAME
+
+    def extraction_run_dir(self, *, extractor_id: str, run_id: str) -> Path:
+        """
+        Resolve an extraction run directory.
+
+        :param extractor_id: Extractor plugin identifier.
+        :type extractor_id: str
+        :param run_id: Extraction run identifier.
+        :type run_id: str
+        :return: Extraction run directory.
+        :rtype: Path
+        """
+
+        return self.extraction_runs_dir / extractor_id / run_id
+
+    def read_extracted_text(self, *, extractor_id: str, run_id: str, item_id: str) -> Optional[str]:
+        """
+        Read extracted text for an item from an extraction run, when present.
+
+        :param extractor_id: Extractor plugin identifier.
+        :type extractor_id: str
+        :param run_id: Extraction run identifier.
+        :type run_id: str
+        :param item_id: Item identifier.
+        :type item_id: str
+        :return: Extracted text or None if the artifact does not exist.
+        :rtype: str or None
+        :raises OSError: If the file exists but cannot be read.
+        """
+
+        path = self.extraction_run_dir(extractor_id=extractor_id, run_id=run_id) / "text" / f"{item_id}.txt"
+        if not path.is_file():
+            return None
+        return path.read_text(encoding="utf-8")
+
     def _ensure_runs_dir(self) -> None:
         """
         Ensure the retrieval runs directory exists.
```
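Extracted text artifacts live under the extraction runs directory, one `<item_id>.txt` per item. A minimal read sketch (all identifiers below are hypothetical placeholders):

```python
from pathlib import Path

from biblicus.corpus import Corpus

corpus = Corpus.find(Path("."))

# Extractor, run, and item identifiers are placeholders for illustration only.
text = corpus.read_extracted_text(
    extractor_id="example-extractor",
    run_id="example-run",
    item_id="example-item-id",
)
if text is None:
    print("no extracted text for this item in that run")
else:
    print(text[:200])
```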
```diff
@@ -608,6 +720,21 @@ class Corpus:
         if resolved_tags and "tags" not in metadata_input:
             metadata_input["tags"] = list(resolved_tags)
 
+        if self._hooks is not None:
+            mutation = self._hooks.run_ingest_hooks(
+                hook_point=HookPoint.before_ingest,
+                filename=filename,
+                media_type=media_type,
+                title=resolved_title,
+                tags=list(resolved_tags),
+                metadata=dict(metadata_input),
+                source_uri=source_uri,
+            )
+            if mutation.add_tags:
+                for tag in mutation.add_tags:
+                    if tag not in resolved_tags:
+                        resolved_tags.append(tag)
+
         frontmatter: Dict[str, Any] = {}
 
         if media_type == "text/markdown":
```
```diff
@@ -656,6 +783,32 @@ class Corpus:
             _write_sidecar(output_path, sidecar)
             frontmatter = sidecar
 
+        if self._hooks is not None:
+            mutation = self._hooks.run_ingest_hooks(
+                hook_point=HookPoint.after_ingest,
+                filename=filename,
+                media_type=media_type,
+                title=resolved_title,
+                tags=list(resolved_tags),
+                metadata=dict(metadata_input),
+                source_uri=source_uri,
+                item_id=item_id,
+                relpath=relpath,
+            )
+            if mutation.add_tags:
+                updated_tags = list(resolved_tags)
+                for tag in mutation.add_tags:
+                    if tag not in updated_tags:
+                        updated_tags.append(tag)
+                resolved_tags = updated_tags
+                sidecar_metadata = _load_sidecar(output_path)
+                sidecar_metadata["tags"] = resolved_tags
+                if media_type != "text/markdown":
+                    sidecar_metadata["media_type"] = media_type
+                sidecar_metadata["biblicus"] = {"id": item_id, "source": source_uri}
+                _write_sidecar(output_path, sidecar_metadata)
+                frontmatter = _merge_metadata(frontmatter if isinstance(frontmatter, dict) else {}, sidecar_metadata)
+
         created_at = utc_now_iso()
         item_record = CatalogItem(
             id=item_id,
```
```diff
@@ -673,6 +826,130 @@ class Corpus:
 
         return IngestResult(item_id=item_id, relpath=relpath, sha256=sha256_digest)
 
+    def ingest_item_stream(
+        self,
+        stream,
+        *,
+        filename: Optional[str] = None,
+        media_type: str = "application/octet-stream",
+        tags: Sequence[str] = (),
+        metadata: Optional[Dict[str, Any]] = None,
+        source_uri: str = "unknown",
+    ) -> IngestResult:
+        """
+        Ingest a binary item from a readable stream.
+
+        This method is intended for large non-markdown items. It writes bytes to disk incrementally
+        while computing a checksum.
+
+        :param stream: Readable binary stream.
+        :type stream: object
+        :param filename: Optional filename for the stored item.
+        :type filename: str or None
+        :param media_type: Internet Assigned Numbers Authority media type for the item.
+        :type media_type: str
+        :param tags: Tags to associate with the item.
+        :type tags: Sequence[str]
+        :param metadata: Optional metadata mapping.
+        :type metadata: dict[str, Any] or None
+        :param source_uri: Source uniform resource identifier for provenance.
+        :type source_uri: str
+        :return: Ingestion result summary.
+        :rtype: IngestResult
+        :raises ValueError: If the media_type is text/markdown.
+        """
+
+        if media_type == "text/markdown":
+            raise ValueError("Stream ingestion is not supported for Markdown")
+
+        item_id = str(uuid.uuid4())
+        safe_filename = _sanitize_filename(filename) if filename else ""
+        if safe_filename:
+            safe_filename = _ensure_filename_extension(safe_filename, media_type=media_type)
+
+        if safe_filename:
+            output_name = f"{item_id}--{safe_filename}"
+        else:
+            extension = _preferred_extension_for_media_type(media_type) or ""
+            output_name = f"{item_id}{extension}" if extension else f"{item_id}"
+
+        relpath = str(Path(DEFAULT_RAW_DIR) / output_name)
+        output_path = self.root / relpath
+
+        resolved_tags = list(tags)
+        metadata_input: Dict[str, Any] = dict(metadata or {})
+        if resolved_tags and "tags" not in metadata_input:
+            metadata_input["tags"] = list(resolved_tags)
+
+        if self._hooks is not None:
+            mutation = self._hooks.run_ingest_hooks(
+                hook_point=HookPoint.before_ingest,
+                filename=filename,
+                media_type=media_type,
+                title=None,
+                tags=list(resolved_tags),
+                metadata=dict(metadata_input),
+                source_uri=source_uri,
+            )
+            if mutation.add_tags:
+                for tag in mutation.add_tags:
+                    if tag not in resolved_tags:
+                        resolved_tags.append(tag)
+
+        write_result = _write_stream_and_hash(stream, output_path)
+        sha256_digest = str(write_result["sha256"])
+        bytes_written = int(write_result["bytes_written"])
+
+        sidecar: Dict[str, Any] = {}
+        sidecar["media_type"] = media_type
+        if resolved_tags:
+            sidecar["tags"] = resolved_tags
+        if metadata_input:
+            for metadata_key, metadata_value in metadata_input.items():
+                if metadata_key in {"tags", "biblicus"}:
+                    continue
+                sidecar[metadata_key] = metadata_value
+        sidecar["biblicus"] = {"id": item_id, "source": source_uri}
+        _write_sidecar(output_path, sidecar)
+
+        if self._hooks is not None:
+            mutation = self._hooks.run_ingest_hooks(
+                hook_point=HookPoint.after_ingest,
+                filename=filename,
+                media_type=media_type,
+                title=None,
+                tags=list(resolved_tags),
+                metadata=dict(metadata_input),
+                source_uri=source_uri,
+                item_id=item_id,
+                relpath=relpath,
+            )
+            if mutation.add_tags:
+                updated_tags = list(resolved_tags)
+                for tag in mutation.add_tags:
+                    if tag not in updated_tags:
+                        updated_tags.append(tag)
+                resolved_tags = updated_tags
+                sidecar["tags"] = resolved_tags
+                _write_sidecar(output_path, sidecar)
+
+        created_at = utc_now_iso()
+        item_record = CatalogItem(
+            id=item_id,
+            relpath=relpath,
+            sha256=sha256_digest,
+            bytes=bytes_written,
+            media_type=media_type,
+            title=None,
+            tags=list(resolved_tags),
+            metadata=dict(sidecar or {}),
+            created_at=created_at,
+            source_uri=source_uri,
+        )
+        self._upsert_catalog_item(item_record)
+
+        return IngestResult(item_id=item_id, relpath=relpath, sha256=sha256_digest)
+
     def ingest_note(
         self,
         text: str,
```
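Since `ingest_item_stream` feeds the payload through `_write_stream_and_hash`, a large binary can be ingested without buffering it in memory. A minimal usage sketch (the archive path and tag are hypothetical):

```python
from pathlib import Path

from biblicus.corpus import Corpus

corpus = Corpus.find(Path("."))

# Hypothetical archive path for illustration only.
archive = Path("backups/archive.zip")
with archive.open("rb") as stream:
    result = corpus.ingest_item_stream(
        stream,
        filename=archive.name,
        media_type="application/zip",
        tags=["backup"],
        source_uri=archive.as_uri(),
    )
print(result.item_id, result.relpath, result.sha256)
```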
```diff
@@ -727,6 +1004,36 @@ class Corpus:
         :rtype: IngestResult
         """
 
+        candidate_path = Path(source) if isinstance(source, str) and "://" not in source else None
+        if isinstance(source, Path) or (candidate_path is not None and candidate_path.exists()):
+            path = source if isinstance(source, Path) else candidate_path
+            assert isinstance(path, Path)
+            path = path.resolve()
+            filename = path.name
+            media_type, _ = mimetypes.guess_type(filename)
+            media_type = media_type or "application/octet-stream"
+            if path.suffix.lower() in {".md", ".markdown"}:
+                media_type = "text/markdown"
+            if media_type == "text/markdown":
+                return self.ingest_item(
+                    path.read_bytes(),
+                    filename=filename,
+                    media_type=media_type,
+                    title=None,
+                    tags=tags,
+                    metadata=None,
+                    source_uri=source_uri or path.as_uri(),
+                )
+            with path.open("rb") as handle:
+                return self.ingest_item_stream(
+                    handle,
+                    filename=filename,
+                    media_type=media_type,
+                    tags=tags,
+                    metadata=None,
+                    source_uri=source_uri or path.as_uri(),
+                )
+
         payload = load_source(source, source_uri=source_uri)
         return self.ingest_item(
             payload.data,
```
```diff
@@ -738,6 +1045,128 @@ class Corpus:
             source_uri=payload.source_uri,
         )
 
+    def import_tree(self, source_root: Path, *, tags: Sequence[str] = ()) -> Dict[str, int]:
+        """
+        Import a folder tree into the corpus, preserving relative paths and provenance.
+
+        Imported content is stored under the raw directory in a dedicated import namespace so that
+        operators can inspect and back up imported content as a structured tree.
+
+        :param source_root: Root directory of the folder tree to import.
+        :type source_root: Path
+        :param tags: Tags to associate with imported items.
+        :type tags: Sequence[str]
+        :return: Import statistics.
+        :rtype: dict[str, int]
+        :raises FileNotFoundError: If the source_root does not exist.
+        :raises ValueError: If a markdown file cannot be decoded as Unicode Transformation Format 8.
+        """
+
+        source_root = source_root.resolve()
+        if not source_root.is_dir():
+            raise FileNotFoundError(f"Import source root does not exist: {source_root}")
+
+        ignore_spec = load_corpus_ignore_spec(self.root)
+        import_id = str(uuid.uuid4())
+        stats = {"scanned": 0, "ignored": 0, "imported": 0}
+
+        for source_path in sorted(source_root.rglob("*")):
+            if not source_path.is_file():
+                continue
+            relative_source_path = source_path.relative_to(source_root).as_posix()
+            stats["scanned"] += 1
+            if ignore_spec.matches(relative_source_path):
+                stats["ignored"] += 1
+                continue
+            self._import_file(
+                source_path=source_path,
+                import_id=import_id,
+                relative_source_path=relative_source_path,
+                tags=tags,
+            )
+            stats["imported"] += 1
+
+        return stats
+
+    def _import_file(
+        self,
+        *,
+        source_path: Path,
+        import_id: str,
+        relative_source_path: str,
+        tags: Sequence[str],
+    ) -> None:
+        """
+        Import a single file into the corpus under an import namespace.
+
+        :param source_path: Source file path to import.
+        :type source_path: Path
+        :param import_id: Import identifier.
+        :type import_id: str
+        :param relative_source_path: Relative path within the imported tree.
+        :type relative_source_path: str
+        :param tags: Tags to apply.
+        :type tags: Sequence[str]
+        :return: None.
+        :rtype: None
+        :raises ValueError: If a markdown file cannot be decoded as Unicode Transformation Format 8.
+        """
+
+        item_id = str(uuid.uuid4())
+        destination_relpath = str(Path(DEFAULT_RAW_DIR) / "imports" / import_id / relative_source_path)
+        destination_path = (self.root / destination_relpath).resolve()
+        destination_path.parent.mkdir(parents=True, exist_ok=True)
+
+        raw_bytes = source_path.read_bytes()
+        sha256_digest = _sha256_bytes(raw_bytes)
+
+        media_type, _ = mimetypes.guess_type(source_path.name)
+        media_type = media_type or "application/octet-stream"
+        if source_path.suffix.lower() in {".md", ".markdown"}:
+            media_type = "text/markdown"
+
+        title: Optional[str] = None
+        frontmatter_metadata: Dict[str, Any] = {}
+        if media_type == "text/markdown":
+            try:
+                text = raw_bytes.decode("utf-8")
+            except UnicodeDecodeError as decode_error:
+                raise ValueError(
+                    f"Markdown file must be Unicode Transformation Format 8: {relative_source_path}"
+                ) from decode_error
+            parsed_document = parse_front_matter(text)
+            frontmatter_metadata = dict(parsed_document.metadata)
+            title_value = frontmatter_metadata.get("title")
+            if isinstance(title_value, str) and title_value.strip():
+                title = title_value.strip()
+
+        destination_path.write_bytes(raw_bytes)
+
+        sidecar: Dict[str, Any] = {}
+        if tags:
+            sidecar["tags"] = [t.strip() for t in tags if isinstance(t, str) and t.strip()]
+        if media_type != "text/markdown":
+            sidecar["media_type"] = media_type
+        sidecar["biblicus"] = {"id": item_id, "source": source_path.as_uri()}
+        _write_sidecar(destination_path, sidecar)
+
+        merged_metadata = _merge_metadata(frontmatter_metadata, sidecar)
+        resolved_tags = _merge_tags([], merged_metadata.get("tags"))
+
+        item_record = CatalogItem(
+            id=item_id,
+            relpath=destination_relpath,
+            sha256=sha256_digest,
+            bytes=len(raw_bytes),
+            media_type=media_type,
+            title=title,
+            tags=list(resolved_tags),
+            metadata=dict(merged_metadata or {}),
+            created_at=utc_now_iso(),
+            source_uri=source_path.as_uri(),
+        )
+        self._upsert_catalog_item(item_record)
+
     def list_items(self, *, limit: int = 50) -> List[CatalogItem]:
         """
         List items from the catalog.
```
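A tree import scans files in sorted order, applies the corpus ignore rules, and catalogs each surviving file under an import namespace keyed by a fresh import identifier, so repeated imports never collide. A minimal usage sketch (the source directory and tag are hypothetical):

```python
from pathlib import Path

from biblicus.corpus import Corpus

corpus = Corpus.find(Path("."))

# Hypothetical source tree for illustration only.
stats = corpus.import_tree(Path("~/notes").expanduser(), tags=["imported"])
print(f"scanned={stats['scanned']} ignored={stats['ignored']} imported={stats['imported']}")
```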
|