biblicus 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. biblicus/__init__.py +2 -2
  2. biblicus/_vendor/dotyaml/__init__.py +14 -0
  3. biblicus/_vendor/dotyaml/interpolation.py +63 -0
  4. biblicus/_vendor/dotyaml/loader.py +181 -0
  5. biblicus/_vendor/dotyaml/transformer.py +135 -0
  6. biblicus/backends/__init__.py +0 -2
  7. biblicus/backends/base.py +3 -3
  8. biblicus/backends/scan.py +21 -15
  9. biblicus/backends/sqlite_full_text_search.py +14 -15
  10. biblicus/cli.py +177 -53
  11. biblicus/corpus.py +209 -59
  12. biblicus/crawl.py +186 -0
  13. biblicus/errors.py +15 -0
  14. biblicus/evaluation.py +4 -8
  15. biblicus/extraction.py +280 -79
  16. biblicus/extractors/__init__.py +14 -3
  17. biblicus/extractors/base.py +12 -5
  18. biblicus/extractors/metadata_text.py +13 -5
  19. biblicus/extractors/openai_stt.py +180 -0
  20. biblicus/extractors/pass_through_text.py +16 -6
  21. biblicus/extractors/pdf_text.py +100 -0
  22. biblicus/extractors/pipeline.py +105 -0
  23. biblicus/extractors/rapidocr_text.py +129 -0
  24. biblicus/extractors/select_longest_text.py +105 -0
  25. biblicus/extractors/select_text.py +100 -0
  26. biblicus/extractors/unstructured_text.py +100 -0
  27. biblicus/frontmatter.py +0 -3
  28. biblicus/hook_logging.py +0 -5
  29. biblicus/hook_manager.py +3 -5
  30. biblicus/hooks.py +3 -7
  31. biblicus/ignore.py +0 -3
  32. biblicus/models.py +118 -0
  33. biblicus/retrieval.py +0 -4
  34. biblicus/sources.py +44 -9
  35. biblicus/time.py +1 -2
  36. biblicus/uris.py +3 -4
  37. biblicus/user_config.py +138 -0
  38. {biblicus-0.2.0.dist-info → biblicus-0.4.0.dist-info}/METADATA +96 -18
  39. biblicus-0.4.0.dist-info/RECORD +45 -0
  40. biblicus/extractors/cascade.py +0 -101
  41. biblicus-0.2.0.dist-info/RECORD +0 -32
  42. {biblicus-0.2.0.dist-info → biblicus-0.4.0.dist-info}/WHEEL +0 -0
  43. {biblicus-0.2.0.dist-info → biblicus-0.4.0.dist-info}/entry_points.txt +0 -0
  44. {biblicus-0.2.0.dist-info → biblicus-0.4.0.dist-info}/licenses/LICENSE +0 -0
  45. {biblicus-0.2.0.dist-info → biblicus-0.4.0.dist-info}/top_level.txt +0 -0
biblicus/corpus.py CHANGED
@@ -13,6 +13,7 @@ from pathlib import Path
13
13
  from typing import Any, Dict, List, Optional, Sequence
14
14
 
15
15
  import yaml
16
+ from pydantic import ValidationError
16
17
 
17
18
  from .constants import (
18
19
  CORPUS_DIR_NAME,
@@ -23,15 +24,20 @@ from .constants import (
23
24
  SIDECAR_SUFFIX,
24
25
  )
25
26
  from .frontmatter import parse_front_matter, render_front_matter
26
- from pydantic import ValidationError
27
-
28
27
  from .hook_manager import HookManager
29
28
  from .hooks import HookPoint
30
29
  from .ignore import load_corpus_ignore_spec
31
- from .models import CatalogItem, CorpusCatalog, CorpusConfig, IngestResult, RetrievalRun
30
+ from .models import (
31
+ CatalogItem,
32
+ CorpusCatalog,
33
+ CorpusConfig,
34
+ ExtractionRunListEntry,
35
+ IngestResult,
36
+ RetrievalRun,
37
+ )
32
38
  from .sources import load_source
33
39
  from .time import utc_now_iso
34
- from .uris import normalize_corpus_uri, corpus_ref_to_path
40
+ from .uris import corpus_ref_to_path, normalize_corpus_uri
35
41
 
36
42
 
37
43
  def _sha256_bytes(data: bytes) -> str:
@@ -43,11 +49,12 @@ def _sha256_bytes(data: bytes) -> str:
43
49
  :return: Secure Hash Algorithm 256 hex digest.
44
50
  :rtype: str
45
51
  """
46
-
47
52
  return hashlib.sha256(data).hexdigest()
48
53
 
49
54
 
50
- def _write_stream_and_hash(stream, destination_path: Path, *, chunk_size: int = 1024 * 1024) -> Dict[str, object]:
55
+ def _write_stream_and_hash(
56
+ stream, destination_path: Path, *, chunk_size: int = 1024 * 1024
57
+ ) -> Dict[str, object]:
51
58
  """
52
59
  Write a binary stream to disk while computing a digest.
53
60
 
@@ -61,7 +68,6 @@ def _write_stream_and_hash(stream, destination_path: Path, *, chunk_size: int =
61
68
  :rtype: dict[str, object]
62
69
  :raises OSError: If the destination cannot be written.
63
70
  """
64
-
65
71
  hasher = hashlib.sha256()
66
72
  bytes_written = 0
67
73
  with destination_path.open("wb") as destination_handle:
@@ -84,7 +90,6 @@ def _sanitize_filename(name: str) -> str:
84
90
  :return: Sanitized filename.
85
91
  :rtype: str
86
92
  """
87
-
88
93
  allowed_characters = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-._() ")
89
94
  sanitized_name = "".join(
90
95
  (character if character in allowed_characters else "_") for character in name
@@ -101,9 +106,9 @@ def _preferred_extension_for_media_type(media_type: str) -> Optional[str]:
101
106
  :return: Preferred extension or None.
102
107
  :rtype: str or None
103
108
  """
104
-
105
109
  media_type_overrides = {
106
110
  "image/jpeg": ".jpg",
111
+ "audio/ogg": ".ogg",
107
112
  }
108
113
  if media_type in media_type_overrides:
109
114
  return media_type_overrides[media_type]
@@ -121,7 +126,6 @@ def _ensure_filename_extension(filename: str, *, media_type: str) -> str:
121
126
  :return: Filename with a compatible extension.
122
127
  :rtype: str
123
128
  """
124
-
125
129
  raw_name = filename.strip()
126
130
 
127
131
  if media_type == "text/markdown":
@@ -129,11 +133,12 @@ def _ensure_filename_extension(filename: str, *, media_type: str) -> str:
129
133
  return raw_name
130
134
  return raw_name + ".md"
131
135
 
136
+ if Path(raw_name).suffix:
137
+ return raw_name
138
+
132
139
  ext = _preferred_extension_for_media_type(media_type)
133
140
  if not ext:
134
141
  return raw_name
135
- if raw_name.lower().endswith(ext.lower()):
136
- return raw_name
137
142
  return raw_name + ext
138
143
 
139
144
 
@@ -148,7 +153,6 @@ def _merge_tags(explicit: Sequence[str], from_frontmatter: Any) -> List[str]:
148
153
  :return: Deduplicated tag list preserving order.
149
154
  :rtype: list[str]
150
155
  """
151
-
152
156
  merged_tags: List[str] = []
153
157
 
154
158
  for explicit_tag in explicit:
@@ -181,7 +185,6 @@ def _sidecar_path_for(content_path: Path) -> Path:
181
185
  :return: Sidecar path.
182
186
  :rtype: Path
183
187
  """
184
-
185
188
  return content_path.with_name(content_path.name + SIDECAR_SUFFIX)
186
189
 
187
190
 
@@ -195,7 +198,6 @@ def _load_sidecar(content_path: Path) -> Dict[str, Any]:
195
198
  :rtype: dict[str, Any]
196
199
  :raises ValueError: If the sidecar content is not a mapping.
197
200
  """
198
-
199
201
  path = _sidecar_path_for(content_path)
200
202
  if not path.is_file():
201
203
  return {}
@@ -226,7 +228,9 @@ def _write_sidecar(content_path: Path, metadata: Dict[str, Any]) -> None:
226
228
  path.write_text(text + "\n", encoding="utf-8")
227
229
 
228
230
 
229
- def _ensure_biblicus_block(metadata: Dict[str, Any], *, item_id: str, source_uri: str) -> Dict[str, Any]:
231
+ def _ensure_biblicus_block(
232
+ metadata: Dict[str, Any], *, item_id: str, source_uri: str
233
+ ) -> Dict[str, Any]:
230
234
  """
231
235
  Ensure the biblicus metadata block exists and is populated.
232
236
 
@@ -324,7 +328,6 @@ class Corpus:
324
328
  :param root: Corpus root directory.
325
329
  :type root: Path
326
330
  """
327
-
328
331
  self.root = root
329
332
  self.meta_dir = self.root / CORPUS_DIR_NAME
330
333
  self.raw_dir = self.root / DEFAULT_RAW_DIR
@@ -339,7 +342,6 @@ class Corpus:
339
342
  :return: Corpus uniform resource identifier.
340
343
  :rtype: str
341
344
  """
342
-
343
345
  return self.root.as_uri()
344
346
 
345
347
  def _load_config(self) -> Optional[CorpusConfig]:
@@ -350,7 +352,6 @@ class Corpus:
350
352
  :rtype: CorpusConfig or None
351
353
  :raises ValueError: If the config schema is invalid.
352
354
  """
353
-
354
355
  path = self.meta_dir / "config.json"
355
356
  if not path.is_file():
356
357
  return None
@@ -359,7 +360,9 @@ class Corpus:
359
360
  return CorpusConfig.model_validate(data)
360
361
  except ValidationError as exc:
361
362
  has_hook_error = any(
362
- isinstance(error.get("loc"), tuple) and error.get("loc") and error.get("loc")[0] == "hooks"
363
+ isinstance(error.get("loc"), tuple)
364
+ and error.get("loc")
365
+ and error.get("loc")[0] == "hooks"
363
366
  for error in exc.errors()
364
367
  )
365
368
  if has_hook_error:
@@ -374,7 +377,6 @@ class Corpus:
374
377
  :rtype: HookManager or None
375
378
  :raises ValueError: If hook specifications are invalid.
376
379
  """
377
-
378
380
  if self.config is None or not self.config.hooks:
379
381
  return None
380
382
  return HookManager.from_config(
@@ -394,7 +396,6 @@ class Corpus:
394
396
  :rtype: Corpus
395
397
  :raises FileNotFoundError: If no corpus config is found.
396
398
  """
397
-
398
399
  start = start.resolve()
399
400
  for candidate in [start, *start.parents]:
400
401
  if (candidate / CORPUS_DIR_NAME / "config.json").is_file():
@@ -413,7 +414,6 @@ class Corpus:
413
414
  :return: Opened corpus instance.
414
415
  :rtype: Corpus
415
416
  """
416
-
417
417
  return cls.find(corpus_ref_to_path(ref))
418
418
 
419
419
  @classmethod
@@ -429,7 +429,6 @@ class Corpus:
429
429
  :rtype: Corpus
430
430
  :raises FileExistsError: If the corpus already exists and force is False.
431
431
  """
432
-
433
432
  root = root.resolve()
434
433
  corpus = cls(root)
435
434
 
@@ -459,7 +458,6 @@ class Corpus:
459
458
  :return: Catalog file path.
460
459
  :rtype: Path
461
460
  """
462
-
463
461
  return self.meta_dir / "catalog.json"
464
462
 
465
463
  def _init_catalog(self) -> None:
@@ -469,7 +467,6 @@ class Corpus:
469
467
  :return: None.
470
468
  :rtype: None
471
469
  """
472
-
473
470
  if self.catalog_path.exists():
474
471
  return
475
472
  catalog = CorpusCatalog(
@@ -492,7 +489,6 @@ class Corpus:
492
489
  :raises FileNotFoundError: If the catalog file does not exist.
493
490
  :raises ValueError: If the catalog schema is invalid.
494
491
  """
495
-
496
492
  if not self.catalog_path.is_file():
497
493
  raise FileNotFoundError(f"Missing corpus catalog: {self.catalog_path}")
498
494
  catalog_data = json.loads(self.catalog_path.read_text(encoding="utf-8"))
@@ -507,7 +503,6 @@ class Corpus:
507
503
  :raises FileNotFoundError: If the catalog file does not exist.
508
504
  :raises ValueError: If the catalog schema is invalid.
509
505
  """
510
-
511
506
  return self._load_catalog()
512
507
 
513
508
  def _write_catalog(self, catalog: CorpusCatalog) -> None:
@@ -519,7 +514,6 @@ class Corpus:
519
514
  :return: None.
520
515
  :rtype: None
521
516
  """
522
-
523
517
  temp_path = self.catalog_path.with_suffix(".json.tmp")
524
518
  temp_path.write_text(catalog.model_dump_json(indent=2) + "\n", encoding="utf-8")
525
519
  temp_path.replace(self.catalog_path)
@@ -532,7 +526,6 @@ class Corpus:
532
526
  :return: Path to the runs directory.
533
527
  :rtype: Path
534
528
  """
535
-
536
529
  return self.meta_dir / RUNS_DIR_NAME
537
530
 
538
531
  @property
@@ -543,7 +536,6 @@ class Corpus:
543
536
  :return: Path to the extraction runs directory.
544
537
  :rtype: Path
545
538
  """
546
-
547
539
  return self.runs_dir / EXTRACTION_RUNS_DIR_NAME
548
540
 
549
541
  def extraction_run_dir(self, *, extractor_id: str, run_id: str) -> Path:
@@ -557,7 +549,6 @@ class Corpus:
557
549
  :return: Extraction run directory.
558
550
  :rtype: Path
559
551
  """
560
-
561
552
  return self.extraction_runs_dir / extractor_id / run_id
562
553
 
563
554
  def read_extracted_text(self, *, extractor_id: str, run_id: str, item_id: str) -> Optional[str]:
@@ -574,12 +565,105 @@ class Corpus:
574
565
  :rtype: str or None
575
566
  :raises OSError: If the file exists but cannot be read.
576
567
  """
577
-
578
- path = self.extraction_run_dir(extractor_id=extractor_id, run_id=run_id) / "text" / f"{item_id}.txt"
568
+ path = (
569
+ self.extraction_run_dir(extractor_id=extractor_id, run_id=run_id)
570
+ / "text"
571
+ / f"{item_id}.txt"
572
+ )
579
573
  if not path.is_file():
580
574
  return None
581
575
  return path.read_text(encoding="utf-8")
582
576
 
577
+ def load_extraction_run_manifest(self, *, extractor_id: str, run_id: str):
578
+ """
579
+ Load an extraction run manifest from the corpus.
580
+
581
+ :param extractor_id: Extractor plugin identifier.
582
+ :type extractor_id: str
583
+ :param run_id: Extraction run identifier.
584
+ :type run_id: str
585
+ :return: Parsed extraction run manifest.
586
+ :rtype: biblicus.extraction.ExtractionRunManifest
587
+ :raises FileNotFoundError: If the manifest file does not exist.
588
+ :raises ValueError: If the manifest data is invalid.
589
+ """
590
+ from .extraction import ExtractionRunManifest
591
+
592
+ manifest_path = (
593
+ self.extraction_run_dir(extractor_id=extractor_id, run_id=run_id) / "manifest.json"
594
+ )
595
+ if not manifest_path.is_file():
596
+ raise FileNotFoundError(f"Missing extraction run manifest: {manifest_path}")
597
+ data = json.loads(manifest_path.read_text(encoding="utf-8"))
598
+ return ExtractionRunManifest.model_validate(data)
599
+
600
+ def list_extraction_runs(self, *, extractor_id: Optional[str] = None) -> List[ExtractionRunListEntry]:
601
+ """
602
+ List extraction runs stored under the corpus.
603
+
604
+ :param extractor_id: Optional extractor identifier filter.
605
+ :type extractor_id: str or None
606
+ :return: Summary list entries for each run.
607
+ :rtype: list[biblicus.models.ExtractionRunListEntry]
608
+ """
609
+ runs_root = self.extraction_runs_dir
610
+ if not runs_root.is_dir():
611
+ return []
612
+
613
+ extractor_dirs: List[Path]
614
+ if extractor_id is None:
615
+ extractor_dirs = [path for path in sorted(runs_root.iterdir()) if path.is_dir()]
616
+ else:
617
+ extractor_path = runs_root / extractor_id
618
+ extractor_dirs = [extractor_path] if extractor_path.is_dir() else []
619
+
620
+ entries: List[ExtractionRunListEntry] = []
621
+ for extractor_dir in extractor_dirs:
622
+ for run_dir in sorted(extractor_dir.iterdir()):
623
+ if not run_dir.is_dir():
624
+ continue
625
+ manifest_path = run_dir / "manifest.json"
626
+ if not manifest_path.is_file():
627
+ continue
628
+ try:
629
+ manifest = self.load_extraction_run_manifest(
630
+ extractor_id=extractor_dir.name,
631
+ run_id=run_dir.name,
632
+ )
633
+ except (FileNotFoundError, ValueError):
634
+ continue
635
+ entries.append(
636
+ ExtractionRunListEntry(
637
+ extractor_id=extractor_dir.name,
638
+ run_id=run_dir.name,
639
+ recipe_id=manifest.recipe.recipe_id,
640
+ recipe_name=manifest.recipe.name,
641
+ catalog_generated_at=manifest.catalog_generated_at,
642
+ created_at=manifest.created_at,
643
+ stats=dict(manifest.stats),
644
+ )
645
+ )
646
+
647
+ entries.sort(key=lambda entry: (entry.created_at, entry.extractor_id, entry.run_id), reverse=True)
648
+ return entries
649
+
650
+ def delete_extraction_run(self, *, extractor_id: str, run_id: str) -> None:
651
+ """
652
+ Delete an extraction run directory and its derived artifacts.
653
+
654
+ :param extractor_id: Extractor plugin identifier.
655
+ :type extractor_id: str
656
+ :param run_id: Extraction run identifier.
657
+ :type run_id: str
658
+ :return: None.
659
+ :rtype: None
660
+ :raises FileNotFoundError: If the extraction run directory does not exist.
661
+ """
662
+ run_dir = self.extraction_run_dir(extractor_id=extractor_id, run_id=run_id)
663
+ if not run_dir.is_dir():
664
+ raise FileNotFoundError(f"Missing extraction run directory: {run_dir}")
665
+ shutil.rmtree(run_dir)
666
+
583
667
  def _ensure_runs_dir(self) -> None:
584
668
  """
585
669
  Ensure the retrieval runs directory exists.
@@ -587,7 +671,6 @@ class Corpus:
587
671
  :return: None.
588
672
  :rtype: None
589
673
  """
590
-
591
674
  self.runs_dir.mkdir(parents=True, exist_ok=True)
592
675
 
593
676
  def write_run(self, run: RetrievalRun) -> None:
@@ -599,7 +682,6 @@ class Corpus:
599
682
  :return: None.
600
683
  :rtype: None
601
684
  """
602
-
603
685
  self._ensure_runs_dir()
604
686
  path = self.runs_dir / f"{run.run_id}.json"
605
687
  path.write_text(run.model_dump_json(indent=2) + "\n", encoding="utf-8")
@@ -618,7 +700,6 @@ class Corpus:
618
700
  :rtype: RetrievalRun
619
701
  :raises FileNotFoundError: If the run manifest does not exist.
620
702
  """
621
-
622
703
  path = self.runs_dir / f"{run_id}.json"
623
704
  if not path.is_file():
624
705
  raise FileNotFoundError(f"Missing run manifest: {path}")
@@ -633,7 +714,6 @@ class Corpus:
633
714
  :return: Latest run identifier or None.
634
715
  :rtype: str or None
635
716
  """
636
-
637
717
  return self._load_catalog().latest_run_id
638
718
 
639
719
  def _upsert_catalog_item(self, item: CatalogItem) -> None:
@@ -645,7 +725,6 @@ class Corpus:
645
725
  :return: None.
646
726
  :rtype: None
647
727
  """
648
-
649
728
  self._init_catalog()
650
729
  catalog = self._load_catalog()
651
730
  catalog.items[item.id] = item
@@ -693,7 +772,6 @@ class Corpus:
693
772
  :rtype: IngestResult
694
773
  :raises ValueError: If markdown is not Unicode Transformation Format 8.
695
774
  """
696
-
697
775
  item_id = str(uuid.uuid4())
698
776
  safe_filename = _sanitize_filename(filename) if filename else ""
699
777
 
@@ -741,7 +819,9 @@ class Corpus:
741
819
  try:
742
820
  markdown_text = data.decode("utf-8")
743
821
  except UnicodeDecodeError as decode_error:
744
- raise ValueError("Markdown must be Unicode Transformation Format 8") from decode_error
822
+ raise ValueError(
823
+ "Markdown must be Unicode Transformation Format 8"
824
+ ) from decode_error
745
825
 
746
826
  parsed_document = parse_front_matter(markdown_text)
747
827
  frontmatter = dict(parsed_document.metadata)
@@ -760,7 +840,9 @@ class Corpus:
760
840
  if isinstance(title_value, str) and title_value.strip():
761
841
  resolved_title = title_value.strip()
762
842
 
763
- frontmatter = _ensure_biblicus_block(frontmatter, item_id=item_id, source_uri=source_uri)
843
+ frontmatter = _ensure_biblicus_block(
844
+ frontmatter, item_id=item_id, source_uri=source_uri
845
+ )
764
846
  rendered_document = render_front_matter(frontmatter, parsed_document.body)
765
847
  data_to_write = rendered_document.encode("utf-8")
766
848
  else:
@@ -807,7 +889,9 @@ class Corpus:
807
889
  sidecar_metadata["media_type"] = media_type
808
890
  sidecar_metadata["biblicus"] = {"id": item_id, "source": source_uri}
809
891
  _write_sidecar(output_path, sidecar_metadata)
810
- frontmatter = _merge_metadata(frontmatter if isinstance(frontmatter, dict) else {}, sidecar_metadata)
892
+ frontmatter = _merge_metadata(
893
+ frontmatter if isinstance(frontmatter, dict) else {}, sidecar_metadata
894
+ )
811
895
 
812
896
  created_at = utc_now_iso()
813
897
  item_record = CatalogItem(
@@ -858,7 +942,6 @@ class Corpus:
858
942
  :rtype: IngestResult
859
943
  :raises ValueError: If the media_type is text/markdown.
860
944
  """
861
-
862
945
  if media_type == "text/markdown":
863
946
  raise ValueError("Stream ingestion is not supported for Markdown")
864
947
 
@@ -972,7 +1055,6 @@ class Corpus:
972
1055
  :return: Ingestion result summary.
973
1056
  :rtype: IngestResult
974
1057
  """
975
-
976
1058
  data = text.encode("utf-8")
977
1059
  return self.ingest_item(
978
1060
  data,
@@ -1003,7 +1085,6 @@ class Corpus:
1003
1085
  :return: Ingestion result summary.
1004
1086
  :rtype: IngestResult
1005
1087
  """
1006
-
1007
1088
  candidate_path = Path(source) if isinstance(source, str) and "://" not in source else None
1008
1089
  if isinstance(source, Path) or (candidate_path is not None and candidate_path.exists()):
1009
1090
  path = source if isinstance(source, Path) else candidate_path
@@ -1061,7 +1142,6 @@ class Corpus:
1061
1142
  :raises FileNotFoundError: If the source_root does not exist.
1062
1143
  :raises ValueError: If a markdown file cannot be decoded as Unicode Transformation Format 8.
1063
1144
  """
1064
-
1065
1145
  source_root = source_root.resolve()
1066
1146
  if not source_root.is_dir():
1067
1147
  raise FileNotFoundError(f"Import source root does not exist: {source_root}")
@@ -1111,9 +1191,10 @@ class Corpus:
1111
1191
  :rtype: None
1112
1192
  :raises ValueError: If a markdown file cannot be decoded as Unicode Transformation Format 8.
1113
1193
  """
1114
-
1115
1194
  item_id = str(uuid.uuid4())
1116
- destination_relpath = str(Path(DEFAULT_RAW_DIR) / "imports" / import_id / relative_source_path)
1195
+ destination_relpath = str(
1196
+ Path(DEFAULT_RAW_DIR) / "imports" / import_id / relative_source_path
1197
+ )
1117
1198
  destination_path = (self.root / destination_relpath).resolve()
1118
1199
  destination_path.parent.mkdir(parents=True, exist_ok=True)
1119
1200
 
@@ -1176,11 +1257,8 @@ class Corpus:
1176
1257
  :return: Catalog items ordered by recency.
1177
1258
  :rtype: list[CatalogItem]
1178
1259
  """
1179
-
1180
1260
  catalog = self._load_catalog()
1181
- ordered_ids = (
1182
- catalog.order[:limit] if catalog.order else list(catalog.items.keys())[:limit]
1183
- )
1261
+ ordered_ids = catalog.order[:limit] if catalog.order else list(catalog.items.keys())[:limit]
1184
1262
  collected_items: List[CatalogItem] = []
1185
1263
  for item_id in ordered_ids:
1186
1264
  item = catalog.items.get(item_id)
@@ -1198,13 +1276,84 @@ class Corpus:
1198
1276
  :rtype: CatalogItem
1199
1277
  :raises KeyError: If the item identifier is unknown.
1200
1278
  """
1201
-
1202
1279
  catalog = self._load_catalog()
1203
1280
  item = catalog.items.get(item_id)
1204
1281
  if item is None:
1205
1282
  raise KeyError(f"Unknown item identifier: {item_id}")
1206
1283
  return item
1207
1284
 
1285
+ def create_crawl_id(self) -> str:
1286
+ """
1287
+ Create a new crawl identifier.
1288
+
1289
+ :return: Crawl identifier.
1290
+ :rtype: str
1291
+ """
1292
+ return str(uuid.uuid4())
1293
+
1294
+ def ingest_crawled_payload(
1295
+ self,
1296
+ *,
1297
+ crawl_id: str,
1298
+ relative_path: str,
1299
+ data: bytes,
1300
+ filename: str,
1301
+ media_type: str,
1302
+ source_uri: str,
1303
+ tags: Sequence[str],
1304
+ ) -> None:
1305
+ """
1306
+ Ingest a crawled payload under a crawl import namespace.
1307
+
1308
+ :param crawl_id: Crawl identifier used to group crawled artifacts.
1309
+ :type crawl_id: str
1310
+ :param relative_path: Relative path within the crawl prefix.
1311
+ :type relative_path: str
1312
+ :param data: Raw payload bytes.
1313
+ :type data: bytes
1314
+ :param filename: Suggested filename from the payload metadata.
1315
+ :type filename: str
1316
+ :param media_type: Internet Assigned Numbers Authority media type.
1317
+ :type media_type: str
1318
+ :param source_uri: Source uniform resource identifier (typically an http or https uniform resource locator).
1319
+ :type source_uri: str
1320
+ :param tags: Tags to attach to the stored item.
1321
+ :type tags: Sequence[str]
1322
+ :return: None.
1323
+ :rtype: None
1324
+ """
1325
+ _ = filename
1326
+ item_id = str(uuid.uuid4())
1327
+ destination_relpath = str(Path(DEFAULT_RAW_DIR) / "imports" / "crawl" / crawl_id / relative_path)
1328
+ destination_path = (self.root / destination_relpath).resolve()
1329
+ destination_path.parent.mkdir(parents=True, exist_ok=True)
1330
+ destination_path.write_bytes(data)
1331
+
1332
+ sha256_digest = _sha256_bytes(data)
1333
+
1334
+ sidecar: Dict[str, Any] = {}
1335
+ sidecar["tags"] = [t.strip() for t in tags if isinstance(t, str) and t.strip()]
1336
+ sidecar["media_type"] = media_type
1337
+ sidecar["biblicus"] = {"id": item_id, "source": source_uri}
1338
+ _write_sidecar(destination_path, sidecar)
1339
+
1340
+ merged_metadata = _merge_metadata({}, sidecar)
1341
+ resolved_tags = _merge_tags([], merged_metadata.get("tags"))
1342
+
1343
+ item_record = CatalogItem(
1344
+ id=item_id,
1345
+ relpath=destination_relpath,
1346
+ sha256=sha256_digest,
1347
+ bytes=len(data),
1348
+ media_type=media_type,
1349
+ title=None,
1350
+ tags=list(resolved_tags),
1351
+ metadata=dict(merged_metadata or {}),
1352
+ created_at=utc_now_iso(),
1353
+ source_uri=source_uri,
1354
+ )
1355
+ self._upsert_catalog_item(item_record)
1356
+
1208
1357
  def reindex(self) -> Dict[str, int]:
1209
1358
  """
1210
1359
  Rebuild/refresh the corpus catalog from the current on-disk corpus contents.
@@ -1216,7 +1365,6 @@ class Corpus:
1216
1365
  :rtype: dict[str, int]
1217
1366
  :raises ValueError: If a markdown file cannot be decoded as Unicode Transformation Format 8.
1218
1367
  """
1219
-
1220
1368
  self._init_catalog()
1221
1369
  existing_catalog = self._load_catalog()
1222
1370
  stats = {"scanned": 0, "skipped": 0, "inserted": 0, "updated": 0}
@@ -1291,7 +1439,9 @@ class Corpus:
1291
1439
 
1292
1440
  previous_item = existing_catalog.items.get(item_id)
1293
1441
  created_at = previous_item.created_at if previous_item is not None else utc_now_iso()
1294
- source_uri = source_uri or (previous_item.source_uri if previous_item is not None else None)
1442
+ source_uri = source_uri or (
1443
+ previous_item.source_uri if previous_item is not None else None
1444
+ )
1295
1445
 
1296
1446
  if previous_item is None:
1297
1447
  stats["inserted"] += 1
@@ -1338,7 +1488,6 @@ class Corpus:
1338
1488
  :return: Corpus name.
1339
1489
  :rtype: str
1340
1490
  """
1341
-
1342
1491
  return self.root.name
1343
1492
 
1344
1493
  def purge(self, *, confirm: str) -> None:
@@ -1351,10 +1500,11 @@ class Corpus:
1351
1500
  :rtype: None
1352
1501
  :raises ValueError: If the confirmation does not match.
1353
1502
  """
1354
-
1355
1503
  expected = self.name
1356
1504
  if confirm != expected:
1357
- raise ValueError(f"Confirmation mismatch: pass --confirm {expected!r} to purge this corpus")
1505
+ raise ValueError(
1506
+ f"Confirmation mismatch: pass --confirm {expected!r} to purge this corpus"
1507
+ )
1358
1508
 
1359
1509
  if self.raw_dir.exists():
1360
1510
  shutil.rmtree(self.raw_dir)