biblicus 0.16.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +25 -5
- biblicus/analysis/__init__.py +1 -1
- biblicus/analysis/base.py +10 -10
- biblicus/analysis/markov.py +78 -68
- biblicus/analysis/models.py +47 -47
- biblicus/analysis/profiling.py +58 -48
- biblicus/analysis/topic_modeling.py +56 -51
- biblicus/cli.py +248 -191
- biblicus/{recipes.py → configuration.py} +14 -14
- biblicus/constants.py +2 -2
- biblicus/context.py +27 -12
- biblicus/context_engine/__init__.py +53 -0
- biblicus/context_engine/assembler.py +1090 -0
- biblicus/context_engine/compaction.py +110 -0
- biblicus/context_engine/models.py +423 -0
- biblicus/context_engine/retrieval.py +133 -0
- biblicus/corpus.py +233 -124
- biblicus/errors.py +27 -3
- biblicus/evaluation.py +27 -25
- biblicus/extraction.py +103 -98
- biblicus/extraction_evaluation.py +26 -26
- biblicus/extractors/deepgram_stt.py +7 -7
- biblicus/extractors/docling_granite_text.py +11 -11
- biblicus/extractors/docling_smol_text.py +11 -11
- biblicus/extractors/markitdown_text.py +4 -4
- biblicus/extractors/openai_stt.py +7 -7
- biblicus/extractors/paddleocr_vl_text.py +20 -18
- biblicus/extractors/pipeline.py +8 -8
- biblicus/extractors/rapidocr_text.py +3 -3
- biblicus/extractors/unstructured_text.py +3 -3
- biblicus/hooks.py +4 -4
- biblicus/knowledge_base.py +34 -32
- biblicus/models.py +84 -81
- biblicus/retrieval.py +49 -42
- biblicus/retrievers/__init__.py +50 -0
- biblicus/retrievers/base.py +65 -0
- biblicus/{backends → retrievers}/embedding_index_common.py +80 -44
- biblicus/{backends → retrievers}/embedding_index_file.py +96 -61
- biblicus/{backends → retrievers}/embedding_index_inmemory.py +100 -69
- biblicus/retrievers/hybrid.py +301 -0
- biblicus/{backends → retrievers}/scan.py +84 -73
- biblicus/{backends → retrievers}/sqlite_full_text_search.py +115 -101
- biblicus/{backends → retrievers}/tf_vector.py +103 -100
- biblicus/sources.py +46 -11
- biblicus/text/link.py +6 -0
- biblicus/text/prompts.py +18 -8
- biblicus/text/tool_loop.py +63 -5
- {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/METADATA +32 -23
- biblicus-1.1.0.dist-info/RECORD +91 -0
- biblicus/backends/__init__.py +0 -50
- biblicus/backends/base.py +0 -65
- biblicus/backends/hybrid.py +0 -291
- biblicus-0.16.0.dist-info/RECORD +0 -86
- {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/WHEEL +0 -0
- {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/entry_points.txt +0 -0
- {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/top_level.txt +0 -0
biblicus/corpus.py
CHANGED
|
@@ -11,6 +11,7 @@ import shutil
|
|
|
11
11
|
import uuid
|
|
12
12
|
from pathlib import Path
|
|
13
13
|
from typing import Any, Dict, List, Optional, Sequence
|
|
14
|
+
from urllib.parse import quote, unquote, urlparse
|
|
14
15
|
|
|
15
16
|
import yaml
|
|
16
17
|
from pydantic import ValidationError
|
|
@@ -19,11 +20,12 @@ from .constants import (
|
|
|
19
20
|
ANALYSIS_RUNS_DIR_NAME,
|
|
20
21
|
CORPUS_DIR_NAME,
|
|
21
22
|
DEFAULT_RAW_DIR,
|
|
22
|
-
|
|
23
|
-
RUNS_DIR_NAME,
|
|
23
|
+
EXTRACTION_SNAPSHOTS_DIR_NAME,
|
|
24
24
|
SCHEMA_VERSION,
|
|
25
25
|
SIDECAR_SUFFIX,
|
|
26
|
+
SNAPSHOTS_DIR_NAME,
|
|
26
27
|
)
|
|
28
|
+
from .errors import IngestCollisionError
|
|
27
29
|
from .frontmatter import parse_front_matter, render_front_matter
|
|
28
30
|
from .hook_manager import HookManager
|
|
29
31
|
from .hooks import HookPoint
|
|
@@ -32,10 +34,10 @@ from .models import (
|
|
|
32
34
|
CatalogItem,
|
|
33
35
|
CorpusCatalog,
|
|
34
36
|
CorpusConfig,
|
|
35
|
-
|
|
36
|
-
|
|
37
|
+
ExtractionSnapshotListEntry,
|
|
38
|
+
ExtractionSnapshotReference,
|
|
37
39
|
IngestResult,
|
|
38
|
-
|
|
40
|
+
RetrievalSnapshot,
|
|
39
41
|
)
|
|
40
42
|
from .sources import load_source
|
|
41
43
|
from .time import utc_now_iso
|
|
@@ -110,7 +112,10 @@ def _preferred_extension_for_media_type(media_type: str) -> Optional[str]:
|
|
|
110
112
|
"""
|
|
111
113
|
media_type_overrides = {
|
|
112
114
|
"image/jpeg": ".jpg",
|
|
115
|
+
"audio/mpeg": ".mp3",
|
|
113
116
|
"audio/ogg": ".ogg",
|
|
117
|
+
"audio/wav": ".wav",
|
|
118
|
+
"audio/x-wav": ".wav",
|
|
114
119
|
}
|
|
115
120
|
if media_type in media_type_overrides:
|
|
116
121
|
return media_type_overrides[media_type]
|
|
@@ -136,7 +141,16 @@ def _ensure_filename_extension(filename: str, *, media_type: str) -> str:
|
|
|
136
141
|
return raw_name + ".md"
|
|
137
142
|
|
|
138
143
|
if Path(raw_name).suffix:
|
|
139
|
-
|
|
144
|
+
if "%2F" in raw_name or "%3A" in raw_name:
|
|
145
|
+
decoded = unquote(raw_name)
|
|
146
|
+
parsed = urlparse(decoded)
|
|
147
|
+
decoded_path = parsed.path if parsed.scheme else decoded
|
|
148
|
+
if not Path(decoded_path).suffix:
|
|
149
|
+
pass
|
|
150
|
+
else:
|
|
151
|
+
return raw_name
|
|
152
|
+
else:
|
|
153
|
+
return raw_name
|
|
140
154
|
|
|
141
155
|
ext = _preferred_extension_for_media_type(media_type)
|
|
142
156
|
if not ext:
|
|
@@ -144,6 +158,55 @@ def _ensure_filename_extension(filename: str, *, media_type: str) -> str:
|
|
|
144
158
|
return raw_name + ext
|
|
145
159
|
|
|
146
160
|
|
|
161
|
+
def _encode_source_uri_for_filename(source_uri: str) -> str:
|
|
162
|
+
"""
|
|
163
|
+
Percent-encode a source uniform resource identifier for filename use.
|
|
164
|
+
|
|
165
|
+
:param source_uri: Source uniform resource identifier to encode.
|
|
166
|
+
:type source_uri: str
|
|
167
|
+
:return: Percent-encoded uniform resource identifier safe for filenames.
|
|
168
|
+
:rtype: str
|
|
169
|
+
"""
|
|
170
|
+
return quote(source_uri, safe="")
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def _storage_filename_for_ingest(
    *, filename: Optional[str], media_type: str, source_uri: Optional[str]
) -> str:
    """
    Build the filename under which an ingested payload is stored.

    When a source uniform resource identifier is given, its percent-encoded form is the stem,
    which keeps identical basenames from different sources apart. A sanitized filename hint is
    appended after ``--`` unless the source uses the ``file:`` scheme. Without a source uniform
    resource identifier, the sanitized filename hint alone is the stem. Stems longer than
    180 characters are replaced by ``hash-`` followed by their SHA-256 hex digest.

    :param filename: Optional filename hint from the caller.
    :type filename: str or None
    :param media_type: Media type of the payload, used to pick an extension.
    :type media_type: str
    :param source_uri: Optional source uniform resource identifier for provenance.
    :type source_uri: str or None
    :return: Storage filename with an extension applied, or an empty string when neither a
        source uniform resource identifier nor a usable filename hint is available.
    :rtype: str
    """
    if source_uri:
        stem = _encode_source_uri_for_filename(source_uri)
        # The filename hint is only appended for sources that are not file: URIs.
        if filename and not source_uri.startswith("file:"):
            hint = _sanitize_filename(filename)
            if hint:
                stem = f"{stem}--{hint}"
    elif filename:
        stem = _sanitize_filename(filename)
    else:
        stem = ""

    if not stem:
        return ""

    if len(stem) > 180:
        # Over-long stems collapse to a fixed-length digest of the full stem.
        stem = f"hash-{hashlib.sha256(stem.encode('utf-8')).hexdigest()}"

    return _ensure_filename_extension(stem, media_type=media_type)
|
|
208
|
+
|
|
209
|
+
|
|
147
210
|
def _merge_tags(explicit: Sequence[str], from_frontmatter: Any) -> List[str]:
|
|
148
211
|
"""
|
|
149
212
|
Merge tags from explicit input and front matter values.
|
|
@@ -476,7 +539,7 @@ class Corpus:
|
|
|
476
539
|
generated_at=utc_now_iso(),
|
|
477
540
|
corpus_uri=normalize_corpus_uri(self.root),
|
|
478
541
|
raw_dir=DEFAULT_RAW_DIR,
|
|
479
|
-
|
|
542
|
+
latest_snapshot_id=None,
|
|
480
543
|
items={},
|
|
481
544
|
order=[],
|
|
482
545
|
)
|
|
@@ -520,70 +583,90 @@ class Corpus:
|
|
|
520
583
|
temp_path.write_text(catalog.model_dump_json(indent=2) + "\n", encoding="utf-8")
|
|
521
584
|
temp_path.replace(self.catalog_path)
|
|
522
585
|
|
|
586
|
+
def _find_item_by_source_uri(self, source_uri: str) -> Optional[CatalogItem]:
|
|
587
|
+
"""
|
|
588
|
+
Locate an existing catalog item by source uniform resource identifier.
|
|
589
|
+
|
|
590
|
+
:param source_uri: Source uniform resource identifier to search for.
|
|
591
|
+
:type source_uri: str
|
|
592
|
+
:return: Matching catalog item or None.
|
|
593
|
+
:rtype: CatalogItem or None
|
|
594
|
+
"""
|
|
595
|
+
if not source_uri:
|
|
596
|
+
return None
|
|
597
|
+
self._init_catalog()
|
|
598
|
+
catalog = self._load_catalog()
|
|
599
|
+
for item in catalog.items.values():
|
|
600
|
+
if item.source_uri == source_uri:
|
|
601
|
+
return item
|
|
602
|
+
return None
|
|
603
|
+
|
|
523
604
|
@property
|
|
524
|
-
def
|
|
605
|
+
def snapshots_dir(self) -> Path:
|
|
525
606
|
"""
|
|
526
|
-
Location of retrieval
|
|
607
|
+
Location of retrieval snapshot manifests.
|
|
527
608
|
|
|
528
|
-
:return: Path to the
|
|
609
|
+
:return: Path to the snapshots directory.
|
|
529
610
|
:rtype: Path
|
|
530
611
|
"""
|
|
531
|
-
return self.meta_dir /
|
|
612
|
+
return self.meta_dir / SNAPSHOTS_DIR_NAME
|
|
532
613
|
|
|
533
614
|
@property
|
|
534
|
-
def
|
|
615
|
+
def extraction_snapshots_dir(self) -> Path:
|
|
535
616
|
"""
|
|
536
|
-
Location of extraction
|
|
617
|
+
Location of extraction snapshot artifacts.
|
|
537
618
|
|
|
538
|
-
:return: Path to the extraction
|
|
619
|
+
:return: Path to the extraction snapshots directory.
|
|
539
620
|
:rtype: Path
|
|
540
621
|
"""
|
|
541
|
-
return self.
|
|
622
|
+
return self.snapshots_dir / EXTRACTION_SNAPSHOTS_DIR_NAME
|
|
542
623
|
|
|
543
624
|
@property
|
|
544
625
|
def analysis_runs_dir(self) -> Path:
|
|
545
626
|
"""
|
|
546
|
-
Location of analysis
|
|
627
|
+
Location of analysis snapshot artifacts.
|
|
547
628
|
|
|
548
|
-
:return: Path to the analysis
|
|
629
|
+
:return: Path to the analysis snapshots directory.
|
|
549
630
|
:rtype: Path
|
|
550
631
|
"""
|
|
551
|
-
return self.
|
|
632
|
+
return self.snapshots_dir / ANALYSIS_RUNS_DIR_NAME
|
|
552
633
|
|
|
553
|
-
def
|
|
634
|
+
def extraction_snapshot_dir(self, *, extractor_id: str, snapshot_id: str) -> Path:
|
|
554
635
|
"""
|
|
555
|
-
Resolve an extraction
|
|
636
|
+
Resolve an extraction snapshot directory.
|
|
556
637
|
|
|
557
638
|
:param extractor_id: Extractor plugin identifier.
|
|
558
639
|
:type extractor_id: str
|
|
559
|
-
:param
|
|
560
|
-
:type
|
|
561
|
-
:return: Extraction
|
|
640
|
+
:param snapshot_id: Extraction snapshot identifier.
|
|
641
|
+
:type snapshot_id: str
|
|
642
|
+
:return: Extraction snapshot directory.
|
|
562
643
|
:rtype: Path
|
|
563
644
|
"""
|
|
564
|
-
return self.
|
|
645
|
+
return self.extraction_snapshots_dir / extractor_id / snapshot_id
|
|
565
646
|
|
|
566
|
-
def analysis_run_dir(self, *, analysis_id: str,
|
|
647
|
+
def analysis_run_dir(self, *, analysis_id: str, snapshot_id: str) -> Path:
|
|
567
648
|
"""
|
|
568
|
-
Resolve an analysis
|
|
649
|
+
Resolve an analysis snapshot directory.
|
|
569
650
|
|
|
570
651
|
:param analysis_id: Analysis backend identifier.
|
|
571
652
|
:type analysis_id: str
|
|
572
|
-
:param
|
|
573
|
-
:type
|
|
574
|
-
:return: Analysis
|
|
653
|
+
:param snapshot_id: Analysis snapshot identifier.
|
|
654
|
+
:type snapshot_id: str
|
|
655
|
+
:return: Analysis snapshot directory.
|
|
575
656
|
:rtype: Path
|
|
576
657
|
"""
|
|
577
|
-
return self.analysis_runs_dir / analysis_id /
|
|
658
|
+
return self.analysis_runs_dir / analysis_id / snapshot_id
|
|
578
659
|
|
|
579
|
-
def read_extracted_text(
|
|
660
|
+
def read_extracted_text(
|
|
661
|
+
self, *, extractor_id: str, snapshot_id: str, item_id: str
|
|
662
|
+
) -> Optional[str]:
|
|
580
663
|
"""
|
|
581
|
-
Read extracted text for an item from an extraction
|
|
664
|
+
Read extracted text for an item from an extraction snapshot, when present.
|
|
582
665
|
|
|
583
666
|
:param extractor_id: Extractor plugin identifier.
|
|
584
667
|
:type extractor_id: str
|
|
585
|
-
:param
|
|
586
|
-
:type
|
|
668
|
+
:param snapshot_id: Extraction snapshot identifier.
|
|
669
|
+
:type snapshot_id: str
|
|
587
670
|
:param item_id: Item identifier.
|
|
588
671
|
:type item_id: str
|
|
589
672
|
:return: Extracted text or None if the artifact does not exist.
|
|
@@ -591,7 +674,7 @@ class Corpus:
|
|
|
591
674
|
:raises OSError: If the file exists but cannot be read.
|
|
592
675
|
"""
|
|
593
676
|
path = (
|
|
594
|
-
self.
|
|
677
|
+
self.extraction_snapshot_dir(extractor_id=extractor_id, snapshot_id=snapshot_id)
|
|
595
678
|
/ "text"
|
|
596
679
|
/ f"{item_id}.txt"
|
|
597
680
|
)
|
|
@@ -599,72 +682,73 @@ class Corpus:
|
|
|
599
682
|
return None
|
|
600
683
|
return path.read_text(encoding="utf-8")
|
|
601
684
|
|
|
602
|
-
def
|
|
685
|
+
def load_extraction_snapshot_manifest(self, *, extractor_id: str, snapshot_id: str):
|
|
603
686
|
"""
|
|
604
|
-
Load an extraction
|
|
687
|
+
Load an extraction snapshot manifest from the corpus.
|
|
605
688
|
|
|
606
689
|
:param extractor_id: Extractor plugin identifier.
|
|
607
690
|
:type extractor_id: str
|
|
608
|
-
:param
|
|
609
|
-
:type
|
|
610
|
-
:return: Parsed extraction
|
|
611
|
-
:rtype: biblicus.extraction.
|
|
691
|
+
:param snapshot_id: Extraction snapshot identifier.
|
|
692
|
+
:type snapshot_id: str
|
|
693
|
+
:return: Parsed extraction snapshot manifest.
|
|
694
|
+
:rtype: biblicus.extraction.ExtractionSnapshotManifest
|
|
612
695
|
:raises FileNotFoundError: If the manifest file does not exist.
|
|
613
696
|
:raises ValueError: If the manifest data is invalid.
|
|
614
697
|
"""
|
|
615
|
-
from .extraction import
|
|
698
|
+
from .extraction import ExtractionSnapshotManifest
|
|
616
699
|
|
|
617
700
|
manifest_path = (
|
|
618
|
-
self.
|
|
701
|
+
self.extraction_snapshot_dir(extractor_id=extractor_id, snapshot_id=snapshot_id)
|
|
702
|
+
/ "manifest.json"
|
|
619
703
|
)
|
|
620
704
|
if not manifest_path.is_file():
|
|
621
|
-
raise FileNotFoundError(f"Missing extraction
|
|
705
|
+
raise FileNotFoundError(f"Missing extraction snapshot manifest: {manifest_path}")
|
|
622
706
|
data = json.loads(manifest_path.read_text(encoding="utf-8"))
|
|
623
|
-
return
|
|
707
|
+
return ExtractionSnapshotManifest.model_validate(data)
|
|
624
708
|
|
|
625
|
-
def
|
|
709
|
+
def list_extraction_snapshots(
|
|
626
710
|
self, *, extractor_id: Optional[str] = None
|
|
627
|
-
) -> List[
|
|
711
|
+
) -> List[ExtractionSnapshotListEntry]:
|
|
628
712
|
"""
|
|
629
|
-
List extraction
|
|
713
|
+
List extraction snapshots stored under the corpus.
|
|
630
714
|
|
|
631
715
|
:param extractor_id: Optional extractor identifier filter.
|
|
632
716
|
:type extractor_id: str or None
|
|
633
|
-
:return: Summary list entries for each
|
|
634
|
-
:rtype: list[biblicus.models.
|
|
717
|
+
:return: Summary list entries for each snapshot.
|
|
718
|
+
:rtype: list[biblicus.models.ExtractionSnapshotListEntry]
|
|
635
719
|
"""
|
|
636
|
-
|
|
637
|
-
if not
|
|
720
|
+
snapshots_root = self.extraction_snapshots_dir
|
|
721
|
+
if not snapshots_root.is_dir():
|
|
638
722
|
return []
|
|
639
723
|
|
|
640
724
|
extractor_dirs: List[Path]
|
|
641
725
|
if extractor_id is None:
|
|
642
|
-
extractor_dirs = [path for path in sorted(
|
|
726
|
+
extractor_dirs = [path for path in sorted(snapshots_root.iterdir()) if path.is_dir()]
|
|
643
727
|
else:
|
|
644
|
-
extractor_path =
|
|
728
|
+
extractor_path = snapshots_root / extractor_id
|
|
645
729
|
extractor_dirs = [extractor_path] if extractor_path.is_dir() else []
|
|
646
730
|
|
|
647
|
-
entries: List[
|
|
731
|
+
entries: List[ExtractionSnapshotListEntry] = []
|
|
648
732
|
for extractor_dir in extractor_dirs:
|
|
649
|
-
for
|
|
650
|
-
if not
|
|
733
|
+
for snapshot_dir in sorted(extractor_dir.iterdir()):
|
|
734
|
+
if not snapshot_dir.is_dir():
|
|
651
735
|
continue
|
|
652
|
-
manifest_path =
|
|
736
|
+
manifest_path = snapshot_dir / "manifest.json"
|
|
653
737
|
if not manifest_path.is_file():
|
|
654
738
|
continue
|
|
655
739
|
try:
|
|
656
|
-
manifest = self.
|
|
740
|
+
manifest = self.load_extraction_snapshot_manifest(
|
|
657
741
|
extractor_id=extractor_dir.name,
|
|
658
|
-
|
|
742
|
+
snapshot_id=snapshot_dir.name,
|
|
659
743
|
)
|
|
660
744
|
except (FileNotFoundError, ValueError):
|
|
661
745
|
continue
|
|
662
746
|
entries.append(
|
|
663
|
-
|
|
747
|
+
ExtractionSnapshotListEntry(
|
|
664
748
|
extractor_id=extractor_dir.name,
|
|
665
|
-
|
|
666
|
-
|
|
667
|
-
|
|
749
|
+
snapshot_id=snapshot_dir.name,
|
|
750
|
+
configuration_id=manifest.configuration.configuration_id,
|
|
751
|
+
configuration_name=manifest.configuration.name,
|
|
668
752
|
catalog_generated_at=manifest.catalog_generated_at,
|
|
669
753
|
created_at=manifest.created_at,
|
|
670
754
|
stats=dict(manifest.stats),
|
|
@@ -672,95 +756,100 @@ class Corpus:
|
|
|
672
756
|
)
|
|
673
757
|
|
|
674
758
|
entries.sort(
|
|
675
|
-
key=lambda entry: (entry.created_at, entry.extractor_id, entry.
|
|
759
|
+
key=lambda entry: (entry.created_at, entry.extractor_id, entry.snapshot_id),
|
|
760
|
+
reverse=True,
|
|
676
761
|
)
|
|
677
762
|
return entries
|
|
678
763
|
|
|
679
|
-
def
|
|
764
|
+
def latest_extraction_snapshot_reference(
|
|
680
765
|
self, *, extractor_id: Optional[str] = None
|
|
681
|
-
) -> Optional[
|
|
766
|
+
) -> Optional[ExtractionSnapshotReference]:
|
|
682
767
|
"""
|
|
683
|
-
Return the most recent extraction
|
|
768
|
+
Return the most recent extraction snapshot reference.
|
|
684
769
|
|
|
685
770
|
:param extractor_id: Optional extractor identifier filter.
|
|
686
771
|
:type extractor_id: str or None
|
|
687
|
-
:return: Latest extraction
|
|
688
|
-
:rtype: biblicus.models.
|
|
772
|
+
:return: Latest extraction snapshot reference or None when no snapshots exist.
|
|
773
|
+
:rtype: biblicus.models.ExtractionSnapshotReference or None
|
|
689
774
|
"""
|
|
690
|
-
entries = self.
|
|
775
|
+
entries = self.list_extraction_snapshots(extractor_id=extractor_id)
|
|
691
776
|
if not entries:
|
|
692
777
|
return None
|
|
693
778
|
latest = entries[0]
|
|
694
|
-
return
|
|
779
|
+
return ExtractionSnapshotReference(
|
|
780
|
+
extractor_id=latest.extractor_id, snapshot_id=latest.snapshot_id
|
|
781
|
+
)
|
|
695
782
|
|
|
696
|
-
def
|
|
783
|
+
def delete_extraction_snapshot(self, *, extractor_id: str, snapshot_id: str) -> None:
|
|
697
784
|
"""
|
|
698
|
-
Delete an extraction
|
|
785
|
+
Delete an extraction snapshot directory and its derived artifacts.
|
|
699
786
|
|
|
700
787
|
:param extractor_id: Extractor plugin identifier.
|
|
701
788
|
:type extractor_id: str
|
|
702
|
-
:param
|
|
703
|
-
:type
|
|
789
|
+
:param snapshot_id: Extraction snapshot identifier.
|
|
790
|
+
:type snapshot_id: str
|
|
704
791
|
:return: None.
|
|
705
792
|
:rtype: None
|
|
706
|
-
:raises FileNotFoundError: If the extraction
|
|
793
|
+
:raises FileNotFoundError: If the extraction snapshot directory does not exist.
|
|
707
794
|
"""
|
|
708
|
-
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
|
|
795
|
+
snapshot_dir = self.extraction_snapshot_dir(
|
|
796
|
+
extractor_id=extractor_id, snapshot_id=snapshot_id
|
|
797
|
+
)
|
|
798
|
+
if not snapshot_dir.is_dir():
|
|
799
|
+
raise FileNotFoundError(f"Missing extraction snapshot directory: {snapshot_dir}")
|
|
800
|
+
shutil.rmtree(snapshot_dir)
|
|
712
801
|
|
|
713
|
-
def
|
|
802
|
+
def _ensure_snapshots_dir(self) -> None:
|
|
714
803
|
"""
|
|
715
|
-
Ensure the retrieval
|
|
804
|
+
Ensure the retrieval snapshots directory exists.
|
|
716
805
|
|
|
717
806
|
:return: None.
|
|
718
807
|
:rtype: None
|
|
719
808
|
"""
|
|
720
|
-
self.
|
|
809
|
+
self.snapshots_dir.mkdir(parents=True, exist_ok=True)
|
|
721
810
|
|
|
722
|
-
def
|
|
811
|
+
def write_snapshot(self, snapshot: RetrievalSnapshot) -> None:
|
|
723
812
|
"""
|
|
724
|
-
Persist a retrieval
|
|
813
|
+
Persist a retrieval snapshot manifest and update the catalog pointer.
|
|
725
814
|
|
|
726
|
-
:param
|
|
727
|
-
:type
|
|
815
|
+
:param snapshot: Snapshot manifest to persist.
|
|
816
|
+
:type snapshot: RetrievalSnapshot
|
|
728
817
|
:return: None.
|
|
729
818
|
:rtype: None
|
|
730
819
|
"""
|
|
731
|
-
self.
|
|
732
|
-
path = self.
|
|
733
|
-
path.write_text(
|
|
820
|
+
self._ensure_snapshots_dir()
|
|
821
|
+
path = self.snapshots_dir / f"{snapshot.snapshot_id}.json"
|
|
822
|
+
path.write_text(snapshot.model_dump_json(indent=2) + "\n", encoding="utf-8")
|
|
734
823
|
catalog = self._load_catalog()
|
|
735
|
-
catalog.
|
|
824
|
+
catalog.latest_snapshot_id = snapshot.snapshot_id
|
|
736
825
|
catalog.generated_at = utc_now_iso()
|
|
737
826
|
self._write_catalog(catalog)
|
|
738
827
|
|
|
739
|
-
def
|
|
828
|
+
def load_snapshot(self, snapshot_id: str) -> RetrievalSnapshot:
|
|
740
829
|
"""
|
|
741
|
-
Load a retrieval
|
|
830
|
+
Load a retrieval snapshot manifest by identifier.
|
|
742
831
|
|
|
743
|
-
:param
|
|
744
|
-
:type
|
|
745
|
-
:return: Parsed
|
|
746
|
-
:rtype:
|
|
747
|
-
:raises FileNotFoundError: If the
|
|
832
|
+
:param snapshot_id: Snapshot identifier.
|
|
833
|
+
:type snapshot_id: str
|
|
834
|
+
:return: Parsed snapshot manifest.
|
|
835
|
+
:rtype: RetrievalSnapshot
|
|
836
|
+
:raises FileNotFoundError: If the snapshot manifest does not exist.
|
|
748
837
|
"""
|
|
749
|
-
path = self.
|
|
838
|
+
path = self.snapshots_dir / f"{snapshot_id}.json"
|
|
750
839
|
if not path.is_file():
|
|
751
|
-
raise FileNotFoundError(f"Missing
|
|
840
|
+
raise FileNotFoundError(f"Missing snapshot manifest: {path}")
|
|
752
841
|
data = json.loads(path.read_text(encoding="utf-8"))
|
|
753
|
-
return
|
|
842
|
+
return RetrievalSnapshot.model_validate(data)
|
|
754
843
|
|
|
755
844
|
@property
|
|
756
|
-
def
|
|
845
|
+
def latest_snapshot_id(self) -> Optional[str]:
|
|
757
846
|
"""
|
|
758
|
-
Latest retrieval
|
|
847
|
+
Latest retrieval snapshot identifier recorded in the catalog.
|
|
759
848
|
|
|
760
|
-
:return: Latest
|
|
849
|
+
:return: Latest snapshot identifier or None.
|
|
761
850
|
:rtype: str or None
|
|
762
851
|
"""
|
|
763
|
-
return self._load_catalog().
|
|
852
|
+
return self._load_catalog().latest_snapshot_id
|
|
764
853
|
|
|
765
854
|
def _upsert_catalog_item(self, item: CatalogItem) -> None:
|
|
766
855
|
"""
|
|
@@ -779,7 +868,7 @@ class Corpus:
|
|
|
779
868
|
ordered_ids.insert(0, item.id)
|
|
780
869
|
catalog.order = ordered_ids
|
|
781
870
|
catalog.generated_at = utc_now_iso()
|
|
782
|
-
catalog.
|
|
871
|
+
catalog.latest_snapshot_id = None
|
|
783
872
|
|
|
784
873
|
self._write_catalog(catalog)
|
|
785
874
|
|
|
@@ -817,18 +906,26 @@ class Corpus:
|
|
|
817
906
|
:return: Ingestion result summary.
|
|
818
907
|
:rtype: IngestResult
|
|
819
908
|
:raises ValueError: If markdown is not Unicode Transformation Format 8.
|
|
909
|
+
:raises IngestCollisionError: If a source uniform resource identifier is already ingested.
|
|
820
910
|
"""
|
|
821
|
-
|
|
822
|
-
|
|
911
|
+
existing_item = self._find_item_by_source_uri(source_uri)
|
|
912
|
+
if existing_item is not None:
|
|
913
|
+
raise IngestCollisionError(
|
|
914
|
+
source_uri=source_uri,
|
|
915
|
+
existing_item_id=existing_item.id,
|
|
916
|
+
existing_relpath=existing_item.relpath,
|
|
917
|
+
)
|
|
823
918
|
|
|
824
|
-
|
|
825
|
-
|
|
919
|
+
item_id = str(uuid.uuid4())
|
|
920
|
+
storage_filename = _storage_filename_for_ingest(
|
|
921
|
+
filename=filename, media_type=media_type, source_uri=source_uri
|
|
922
|
+
)
|
|
826
923
|
|
|
827
924
|
if media_type == "text/markdown":
|
|
828
|
-
output_name = f"{item_id}--{
|
|
925
|
+
output_name = f"{item_id}--{storage_filename}" if storage_filename else f"{item_id}.md"
|
|
829
926
|
else:
|
|
830
|
-
if
|
|
831
|
-
output_name = f"{item_id}--{
|
|
927
|
+
if storage_filename:
|
|
928
|
+
output_name = f"{item_id}--{storage_filename}"
|
|
832
929
|
else:
|
|
833
930
|
extension = _preferred_extension_for_media_type(media_type) or ""
|
|
834
931
|
output_name = f"{item_id}{extension}" if extension else f"{item_id}"
|
|
@@ -991,13 +1088,21 @@ class Corpus:
|
|
|
991
1088
|
if media_type == "text/markdown":
|
|
992
1089
|
raise ValueError("Stream ingestion is not supported for Markdown")
|
|
993
1090
|
|
|
1091
|
+
existing_item = self._find_item_by_source_uri(source_uri)
|
|
1092
|
+
if existing_item is not None:
|
|
1093
|
+
raise IngestCollisionError(
|
|
1094
|
+
source_uri=source_uri,
|
|
1095
|
+
existing_item_id=existing_item.id,
|
|
1096
|
+
existing_relpath=existing_item.relpath,
|
|
1097
|
+
)
|
|
1098
|
+
|
|
994
1099
|
item_id = str(uuid.uuid4())
|
|
995
|
-
|
|
996
|
-
|
|
997
|
-
|
|
1100
|
+
storage_filename = _storage_filename_for_ingest(
|
|
1101
|
+
filename=filename, media_type=media_type, source_uri=source_uri
|
|
1102
|
+
)
|
|
998
1103
|
|
|
999
|
-
if
|
|
1000
|
-
output_name = f"{item_id}--{
|
|
1104
|
+
if storage_filename:
|
|
1105
|
+
output_name = f"{item_id}--{storage_filename}"
|
|
1001
1106
|
else:
|
|
1002
1107
|
extension = _preferred_extension_for_media_type(media_type) or ""
|
|
1003
1108
|
output_name = f"{item_id}{extension}" if extension else f"{item_id}"
|
|
@@ -1085,7 +1190,7 @@ class Corpus:
|
|
|
1085
1190
|
*,
|
|
1086
1191
|
title: Optional[str] = None,
|
|
1087
1192
|
tags: Sequence[str] = (),
|
|
1088
|
-
source_uri: str =
|
|
1193
|
+
source_uri: Optional[str] = None,
|
|
1089
1194
|
) -> IngestResult:
|
|
1090
1195
|
"""
|
|
1091
1196
|
Ingest a text note as Markdown.
|
|
@@ -1096,11 +1201,15 @@ class Corpus:
|
|
|
1096
1201
|
:type title: str or None
|
|
1097
1202
|
:param tags: Tags to associate with the note.
|
|
1098
1203
|
:type tags: Sequence[str]
|
|
1099
|
-
:param source_uri:
|
|
1100
|
-
:type source_uri: str
|
|
1204
|
+
:param source_uri: Optional source uniform resource identifier for provenance.
|
|
1205
|
+
:type source_uri: str or None
|
|
1101
1206
|
:return: Ingestion result summary.
|
|
1102
1207
|
:rtype: IngestResult
|
|
1103
1208
|
"""
|
|
1209
|
+
if source_uri is None:
|
|
1210
|
+
digest_source = (title or "") + "\n" + text
|
|
1211
|
+
digest = hashlib.sha256(digest_source.encode("utf-8")).hexdigest()
|
|
1212
|
+
source_uri = f"text:{digest}"
|
|
1104
1213
|
data = text.encode("utf-8")
|
|
1105
1214
|
return self.ingest_item(
|
|
1106
1215
|
data,
|
|
@@ -1520,7 +1629,7 @@ class Corpus:
|
|
|
1520
1629
|
generated_at=utc_now_iso(),
|
|
1521
1630
|
corpus_uri=normalize_corpus_uri(self.root),
|
|
1522
1631
|
raw_dir=DEFAULT_RAW_DIR,
|
|
1523
|
-
|
|
1632
|
+
latest_snapshot_id=None,
|
|
1524
1633
|
items=new_items,
|
|
1525
1634
|
order=order,
|
|
1526
1635
|
)
|
|
@@ -1572,7 +1681,7 @@ class Corpus:
|
|
|
1572
1681
|
generated_at=utc_now_iso(),
|
|
1573
1682
|
corpus_uri=normalize_corpus_uri(self.root),
|
|
1574
1683
|
raw_dir=DEFAULT_RAW_DIR,
|
|
1575
|
-
|
|
1684
|
+
latest_snapshot_id=None,
|
|
1576
1685
|
items={},
|
|
1577
1686
|
order=[],
|
|
1578
1687
|
)
|
biblicus/errors.py
CHANGED
|
@@ -5,11 +5,35 @@ Error types for Biblicus.
|
|
|
5
5
|
from __future__ import annotations
|
|
6
6
|
|
|
7
7
|
|
|
8
|
-
class
|
|
8
|
+
class ExtractionSnapshotFatalError(RuntimeError):
|
|
9
9
|
"""
|
|
10
|
-
Fatal extraction
|
|
10
|
+
Fatal extraction snapshot error that should abort the entire snapshot.
|
|
11
11
|
|
|
12
12
|
This exception is used for conditions that indicate a configuration or environment problem
|
|
13
13
|
rather than a per-item extraction failure. For example, a selection extractor that depends
|
|
14
|
-
on referenced extraction
|
|
14
|
+
on referenced extraction snapshot manifests treats missing manifests as fatal.
|
|
15
15
|
"""
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class IngestCollisionError(RuntimeError):
    """
    Ingest collision for an already ingested source.

    The constructor takes keyword-only arguments, so the default exception reduction (which
    rebuilds from the positional ``args`` tuple) cannot recreate an instance. ``__reduce__`` is
    overridden so the error survives ``pickle`` and ``copy`` round trips.

    :param source_uri: Source uniform resource identifier that caused the collision.
    :type source_uri: str
    :param existing_item_id: Identifier of the existing catalog item.
    :type existing_item_id: str
    :param existing_relpath: Raw storage relpath of the existing item.
    :type existing_relpath: str
    """

    def __init__(self, *, source_uri: str, existing_item_id: str, existing_relpath: str) -> None:
        self.source_uri = source_uri
        self.existing_item_id = existing_item_id
        self.existing_relpath = existing_relpath
        message = (
            "Source already ingested"
            f": source_uri={source_uri} existing_item_id={existing_item_id}"
            f" existing_relpath={existing_relpath}"
        )
        super().__init__(message)

    @classmethod
    def _rebuild(
        cls, source_uri: str, existing_item_id: str, existing_relpath: str
    ) -> "IngestCollisionError":
        """
        Recreate an instance from positional values.

        :param source_uri: Source uniform resource identifier that caused the collision.
        :type source_uri: str
        :param existing_item_id: Identifier of the existing catalog item.
        :type existing_item_id: str
        :param existing_relpath: Raw storage relpath of the existing item.
        :type existing_relpath: str
        :return: Equivalent error instance.
        :rtype: IngestCollisionError
        """
        return cls(
            source_uri=source_uri,
            existing_item_id=existing_item_id,
            existing_relpath=existing_relpath,
        )

    def __reduce__(self):
        """
        Describe how to rebuild this error when it is pickled or copied.

        :return: Callable and positional arguments that recreate the error.
        :rtype: tuple
        """
        # Route through _rebuild because __init__ rejects positional arguments.
        return (
            type(self)._rebuild,
            (self.source_uri, self.existing_item_id, self.existing_relpath),
        )
|