biblicus 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +2 -2
- biblicus/_vendor/dotyaml/__init__.py +14 -0
- biblicus/_vendor/dotyaml/interpolation.py +63 -0
- biblicus/_vendor/dotyaml/loader.py +181 -0
- biblicus/_vendor/dotyaml/transformer.py +135 -0
- biblicus/backends/__init__.py +0 -2
- biblicus/backends/base.py +3 -3
- biblicus/backends/scan.py +21 -15
- biblicus/backends/sqlite_full_text_search.py +14 -15
- biblicus/cli.py +177 -53
- biblicus/corpus.py +209 -59
- biblicus/crawl.py +186 -0
- biblicus/errors.py +15 -0
- biblicus/evaluation.py +4 -8
- biblicus/extraction.py +280 -79
- biblicus/extractors/__init__.py +14 -3
- biblicus/extractors/base.py +12 -5
- biblicus/extractors/metadata_text.py +13 -5
- biblicus/extractors/openai_stt.py +180 -0
- biblicus/extractors/pass_through_text.py +16 -6
- biblicus/extractors/pdf_text.py +100 -0
- biblicus/extractors/pipeline.py +105 -0
- biblicus/extractors/rapidocr_text.py +129 -0
- biblicus/extractors/select_longest_text.py +105 -0
- biblicus/extractors/select_text.py +100 -0
- biblicus/extractors/unstructured_text.py +100 -0
- biblicus/frontmatter.py +0 -3
- biblicus/hook_logging.py +0 -5
- biblicus/hook_manager.py +3 -5
- biblicus/hooks.py +3 -7
- biblicus/ignore.py +0 -3
- biblicus/models.py +118 -0
- biblicus/retrieval.py +0 -4
- biblicus/sources.py +44 -9
- biblicus/time.py +1 -2
- biblicus/uris.py +3 -4
- biblicus/user_config.py +138 -0
- {biblicus-0.2.0.dist-info → biblicus-0.4.0.dist-info}/METADATA +96 -18
- biblicus-0.4.0.dist-info/RECORD +45 -0
- biblicus/extractors/cascade.py +0 -101
- biblicus-0.2.0.dist-info/RECORD +0 -32
- {biblicus-0.2.0.dist-info → biblicus-0.4.0.dist-info}/WHEEL +0 -0
- {biblicus-0.2.0.dist-info → biblicus-0.4.0.dist-info}/entry_points.txt +0 -0
- {biblicus-0.2.0.dist-info → biblicus-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-0.2.0.dist-info → biblicus-0.4.0.dist-info}/top_level.txt +0 -0
biblicus/corpus.py
CHANGED
|
@@ -13,6 +13,7 @@ from pathlib import Path
|
|
|
13
13
|
from typing import Any, Dict, List, Optional, Sequence
|
|
14
14
|
|
|
15
15
|
import yaml
|
|
16
|
+
from pydantic import ValidationError
|
|
16
17
|
|
|
17
18
|
from .constants import (
|
|
18
19
|
CORPUS_DIR_NAME,
|
|
@@ -23,15 +24,20 @@ from .constants import (
|
|
|
23
24
|
SIDECAR_SUFFIX,
|
|
24
25
|
)
|
|
25
26
|
from .frontmatter import parse_front_matter, render_front_matter
|
|
26
|
-
from pydantic import ValidationError
|
|
27
|
-
|
|
28
27
|
from .hook_manager import HookManager
|
|
29
28
|
from .hooks import HookPoint
|
|
30
29
|
from .ignore import load_corpus_ignore_spec
|
|
31
|
-
from .models import
|
|
30
|
+
from .models import (
|
|
31
|
+
CatalogItem,
|
|
32
|
+
CorpusCatalog,
|
|
33
|
+
CorpusConfig,
|
|
34
|
+
ExtractionRunListEntry,
|
|
35
|
+
IngestResult,
|
|
36
|
+
RetrievalRun,
|
|
37
|
+
)
|
|
32
38
|
from .sources import load_source
|
|
33
39
|
from .time import utc_now_iso
|
|
34
|
-
from .uris import
|
|
40
|
+
from .uris import corpus_ref_to_path, normalize_corpus_uri
|
|
35
41
|
|
|
36
42
|
|
|
37
43
|
def _sha256_bytes(data: bytes) -> str:
|
|
@@ -43,11 +49,12 @@ def _sha256_bytes(data: bytes) -> str:
|
|
|
43
49
|
:return: Secure Hash Algorithm 256 hex digest.
|
|
44
50
|
:rtype: str
|
|
45
51
|
"""
|
|
46
|
-
|
|
47
52
|
return hashlib.sha256(data).hexdigest()
|
|
48
53
|
|
|
49
54
|
|
|
50
|
-
def _write_stream_and_hash(
|
|
55
|
+
def _write_stream_and_hash(
|
|
56
|
+
stream, destination_path: Path, *, chunk_size: int = 1024 * 1024
|
|
57
|
+
) -> Dict[str, object]:
|
|
51
58
|
"""
|
|
52
59
|
Write a binary stream to disk while computing a digest.
|
|
53
60
|
|
|
@@ -61,7 +68,6 @@ def _write_stream_and_hash(stream, destination_path: Path, *, chunk_size: int =
|
|
|
61
68
|
:rtype: dict[str, object]
|
|
62
69
|
:raises OSError: If the destination cannot be written.
|
|
63
70
|
"""
|
|
64
|
-
|
|
65
71
|
hasher = hashlib.sha256()
|
|
66
72
|
bytes_written = 0
|
|
67
73
|
with destination_path.open("wb") as destination_handle:
|
|
@@ -84,7 +90,6 @@ def _sanitize_filename(name: str) -> str:
|
|
|
84
90
|
:return: Sanitized filename.
|
|
85
91
|
:rtype: str
|
|
86
92
|
"""
|
|
87
|
-
|
|
88
93
|
allowed_characters = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-._() ")
|
|
89
94
|
sanitized_name = "".join(
|
|
90
95
|
(character if character in allowed_characters else "_") for character in name
|
|
@@ -101,9 +106,9 @@ def _preferred_extension_for_media_type(media_type: str) -> Optional[str]:
|
|
|
101
106
|
:return: Preferred extension or None.
|
|
102
107
|
:rtype: str or None
|
|
103
108
|
"""
|
|
104
|
-
|
|
105
109
|
media_type_overrides = {
|
|
106
110
|
"image/jpeg": ".jpg",
|
|
111
|
+
"audio/ogg": ".ogg",
|
|
107
112
|
}
|
|
108
113
|
if media_type in media_type_overrides:
|
|
109
114
|
return media_type_overrides[media_type]
|
|
@@ -121,7 +126,6 @@ def _ensure_filename_extension(filename: str, *, media_type: str) -> str:
|
|
|
121
126
|
:return: Filename with a compatible extension.
|
|
122
127
|
:rtype: str
|
|
123
128
|
"""
|
|
124
|
-
|
|
125
129
|
raw_name = filename.strip()
|
|
126
130
|
|
|
127
131
|
if media_type == "text/markdown":
|
|
@@ -129,11 +133,12 @@ def _ensure_filename_extension(filename: str, *, media_type: str) -> str:
|
|
|
129
133
|
return raw_name
|
|
130
134
|
return raw_name + ".md"
|
|
131
135
|
|
|
136
|
+
if Path(raw_name).suffix:
|
|
137
|
+
return raw_name
|
|
138
|
+
|
|
132
139
|
ext = _preferred_extension_for_media_type(media_type)
|
|
133
140
|
if not ext:
|
|
134
141
|
return raw_name
|
|
135
|
-
if raw_name.lower().endswith(ext.lower()):
|
|
136
|
-
return raw_name
|
|
137
142
|
return raw_name + ext
|
|
138
143
|
|
|
139
144
|
|
|
@@ -148,7 +153,6 @@ def _merge_tags(explicit: Sequence[str], from_frontmatter: Any) -> List[str]:
|
|
|
148
153
|
:return: Deduplicated tag list preserving order.
|
|
149
154
|
:rtype: list[str]
|
|
150
155
|
"""
|
|
151
|
-
|
|
152
156
|
merged_tags: List[str] = []
|
|
153
157
|
|
|
154
158
|
for explicit_tag in explicit:
|
|
@@ -181,7 +185,6 @@ def _sidecar_path_for(content_path: Path) -> Path:
|
|
|
181
185
|
:return: Sidecar path.
|
|
182
186
|
:rtype: Path
|
|
183
187
|
"""
|
|
184
|
-
|
|
185
188
|
return content_path.with_name(content_path.name + SIDECAR_SUFFIX)
|
|
186
189
|
|
|
187
190
|
|
|
@@ -195,7 +198,6 @@ def _load_sidecar(content_path: Path) -> Dict[str, Any]:
|
|
|
195
198
|
:rtype: dict[str, Any]
|
|
196
199
|
:raises ValueError: If the sidecar content is not a mapping.
|
|
197
200
|
"""
|
|
198
|
-
|
|
199
201
|
path = _sidecar_path_for(content_path)
|
|
200
202
|
if not path.is_file():
|
|
201
203
|
return {}
|
|
@@ -226,7 +228,9 @@ def _write_sidecar(content_path: Path, metadata: Dict[str, Any]) -> None:
|
|
|
226
228
|
path.write_text(text + "\n", encoding="utf-8")
|
|
227
229
|
|
|
228
230
|
|
|
229
|
-
def _ensure_biblicus_block(
|
|
231
|
+
def _ensure_biblicus_block(
|
|
232
|
+
metadata: Dict[str, Any], *, item_id: str, source_uri: str
|
|
233
|
+
) -> Dict[str, Any]:
|
|
230
234
|
"""
|
|
231
235
|
Ensure the biblicus metadata block exists and is populated.
|
|
232
236
|
|
|
@@ -324,7 +328,6 @@ class Corpus:
|
|
|
324
328
|
:param root: Corpus root directory.
|
|
325
329
|
:type root: Path
|
|
326
330
|
"""
|
|
327
|
-
|
|
328
331
|
self.root = root
|
|
329
332
|
self.meta_dir = self.root / CORPUS_DIR_NAME
|
|
330
333
|
self.raw_dir = self.root / DEFAULT_RAW_DIR
|
|
@@ -339,7 +342,6 @@ class Corpus:
|
|
|
339
342
|
:return: Corpus uniform resource identifier.
|
|
340
343
|
:rtype: str
|
|
341
344
|
"""
|
|
342
|
-
|
|
343
345
|
return self.root.as_uri()
|
|
344
346
|
|
|
345
347
|
def _load_config(self) -> Optional[CorpusConfig]:
|
|
@@ -350,7 +352,6 @@ class Corpus:
|
|
|
350
352
|
:rtype: CorpusConfig or None
|
|
351
353
|
:raises ValueError: If the config schema is invalid.
|
|
352
354
|
"""
|
|
353
|
-
|
|
354
355
|
path = self.meta_dir / "config.json"
|
|
355
356
|
if not path.is_file():
|
|
356
357
|
return None
|
|
@@ -359,7 +360,9 @@ class Corpus:
|
|
|
359
360
|
return CorpusConfig.model_validate(data)
|
|
360
361
|
except ValidationError as exc:
|
|
361
362
|
has_hook_error = any(
|
|
362
|
-
isinstance(error.get("loc"), tuple)
|
|
363
|
+
isinstance(error.get("loc"), tuple)
|
|
364
|
+
and error.get("loc")
|
|
365
|
+
and error.get("loc")[0] == "hooks"
|
|
363
366
|
for error in exc.errors()
|
|
364
367
|
)
|
|
365
368
|
if has_hook_error:
|
|
@@ -374,7 +377,6 @@ class Corpus:
|
|
|
374
377
|
:rtype: HookManager or None
|
|
375
378
|
:raises ValueError: If hook specifications are invalid.
|
|
376
379
|
"""
|
|
377
|
-
|
|
378
380
|
if self.config is None or not self.config.hooks:
|
|
379
381
|
return None
|
|
380
382
|
return HookManager.from_config(
|
|
@@ -394,7 +396,6 @@ class Corpus:
|
|
|
394
396
|
:rtype: Corpus
|
|
395
397
|
:raises FileNotFoundError: If no corpus config is found.
|
|
396
398
|
"""
|
|
397
|
-
|
|
398
399
|
start = start.resolve()
|
|
399
400
|
for candidate in [start, *start.parents]:
|
|
400
401
|
if (candidate / CORPUS_DIR_NAME / "config.json").is_file():
|
|
@@ -413,7 +414,6 @@ class Corpus:
|
|
|
413
414
|
:return: Opened corpus instance.
|
|
414
415
|
:rtype: Corpus
|
|
415
416
|
"""
|
|
416
|
-
|
|
417
417
|
return cls.find(corpus_ref_to_path(ref))
|
|
418
418
|
|
|
419
419
|
@classmethod
|
|
@@ -429,7 +429,6 @@ class Corpus:
|
|
|
429
429
|
:rtype: Corpus
|
|
430
430
|
:raises FileExistsError: If the corpus already exists and force is False.
|
|
431
431
|
"""
|
|
432
|
-
|
|
433
432
|
root = root.resolve()
|
|
434
433
|
corpus = cls(root)
|
|
435
434
|
|
|
@@ -459,7 +458,6 @@ class Corpus:
|
|
|
459
458
|
:return: Catalog file path.
|
|
460
459
|
:rtype: Path
|
|
461
460
|
"""
|
|
462
|
-
|
|
463
461
|
return self.meta_dir / "catalog.json"
|
|
464
462
|
|
|
465
463
|
def _init_catalog(self) -> None:
|
|
@@ -469,7 +467,6 @@ class Corpus:
|
|
|
469
467
|
:return: None.
|
|
470
468
|
:rtype: None
|
|
471
469
|
"""
|
|
472
|
-
|
|
473
470
|
if self.catalog_path.exists():
|
|
474
471
|
return
|
|
475
472
|
catalog = CorpusCatalog(
|
|
@@ -492,7 +489,6 @@ class Corpus:
|
|
|
492
489
|
:raises FileNotFoundError: If the catalog file does not exist.
|
|
493
490
|
:raises ValueError: If the catalog schema is invalid.
|
|
494
491
|
"""
|
|
495
|
-
|
|
496
492
|
if not self.catalog_path.is_file():
|
|
497
493
|
raise FileNotFoundError(f"Missing corpus catalog: {self.catalog_path}")
|
|
498
494
|
catalog_data = json.loads(self.catalog_path.read_text(encoding="utf-8"))
|
|
@@ -507,7 +503,6 @@ class Corpus:
|
|
|
507
503
|
:raises FileNotFoundError: If the catalog file does not exist.
|
|
508
504
|
:raises ValueError: If the catalog schema is invalid.
|
|
509
505
|
"""
|
|
510
|
-
|
|
511
506
|
return self._load_catalog()
|
|
512
507
|
|
|
513
508
|
def _write_catalog(self, catalog: CorpusCatalog) -> None:
|
|
@@ -519,7 +514,6 @@ class Corpus:
|
|
|
519
514
|
:return: None.
|
|
520
515
|
:rtype: None
|
|
521
516
|
"""
|
|
522
|
-
|
|
523
517
|
temp_path = self.catalog_path.with_suffix(".json.tmp")
|
|
524
518
|
temp_path.write_text(catalog.model_dump_json(indent=2) + "\n", encoding="utf-8")
|
|
525
519
|
temp_path.replace(self.catalog_path)
|
|
@@ -532,7 +526,6 @@ class Corpus:
|
|
|
532
526
|
:return: Path to the runs directory.
|
|
533
527
|
:rtype: Path
|
|
534
528
|
"""
|
|
535
|
-
|
|
536
529
|
return self.meta_dir / RUNS_DIR_NAME
|
|
537
530
|
|
|
538
531
|
@property
|
|
@@ -543,7 +536,6 @@ class Corpus:
|
|
|
543
536
|
:return: Path to the extraction runs directory.
|
|
544
537
|
:rtype: Path
|
|
545
538
|
"""
|
|
546
|
-
|
|
547
539
|
return self.runs_dir / EXTRACTION_RUNS_DIR_NAME
|
|
548
540
|
|
|
549
541
|
def extraction_run_dir(self, *, extractor_id: str, run_id: str) -> Path:
|
|
@@ -557,7 +549,6 @@ class Corpus:
|
|
|
557
549
|
:return: Extraction run directory.
|
|
558
550
|
:rtype: Path
|
|
559
551
|
"""
|
|
560
|
-
|
|
561
552
|
return self.extraction_runs_dir / extractor_id / run_id
|
|
562
553
|
|
|
563
554
|
def read_extracted_text(self, *, extractor_id: str, run_id: str, item_id: str) -> Optional[str]:
|
|
@@ -574,12 +565,105 @@ class Corpus:
|
|
|
574
565
|
:rtype: str or None
|
|
575
566
|
:raises OSError: If the file exists but cannot be read.
|
|
576
567
|
"""
|
|
577
|
-
|
|
578
|
-
|
|
568
|
+
path = (
|
|
569
|
+
self.extraction_run_dir(extractor_id=extractor_id, run_id=run_id)
|
|
570
|
+
/ "text"
|
|
571
|
+
/ f"{item_id}.txt"
|
|
572
|
+
)
|
|
579
573
|
if not path.is_file():
|
|
580
574
|
return None
|
|
581
575
|
return path.read_text(encoding="utf-8")
|
|
582
576
|
|
|
577
|
+
def load_extraction_run_manifest(self, *, extractor_id: str, run_id: str):
|
|
578
|
+
"""
|
|
579
|
+
Load an extraction run manifest from the corpus.
|
|
580
|
+
|
|
581
|
+
:param extractor_id: Extractor plugin identifier.
|
|
582
|
+
:type extractor_id: str
|
|
583
|
+
:param run_id: Extraction run identifier.
|
|
584
|
+
:type run_id: str
|
|
585
|
+
:return: Parsed extraction run manifest.
|
|
586
|
+
:rtype: biblicus.extraction.ExtractionRunManifest
|
|
587
|
+
:raises FileNotFoundError: If the manifest file does not exist.
|
|
588
|
+
:raises ValueError: If the manifest data is invalid.
|
|
589
|
+
"""
|
|
590
|
+
from .extraction import ExtractionRunManifest
|
|
591
|
+
|
|
592
|
+
manifest_path = (
|
|
593
|
+
self.extraction_run_dir(extractor_id=extractor_id, run_id=run_id) / "manifest.json"
|
|
594
|
+
)
|
|
595
|
+
if not manifest_path.is_file():
|
|
596
|
+
raise FileNotFoundError(f"Missing extraction run manifest: {manifest_path}")
|
|
597
|
+
data = json.loads(manifest_path.read_text(encoding="utf-8"))
|
|
598
|
+
return ExtractionRunManifest.model_validate(data)
|
|
599
|
+
|
|
600
|
+
def list_extraction_runs(self, *, extractor_id: Optional[str] = None) -> List[ExtractionRunListEntry]:
|
|
601
|
+
"""
|
|
602
|
+
List extraction runs stored under the corpus.
|
|
603
|
+
|
|
604
|
+
:param extractor_id: Optional extractor identifier filter.
|
|
605
|
+
:type extractor_id: str or None
|
|
606
|
+
:return: Summary list entries for each run.
|
|
607
|
+
:rtype: list[biblicus.models.ExtractionRunListEntry]
|
|
608
|
+
"""
|
|
609
|
+
runs_root = self.extraction_runs_dir
|
|
610
|
+
if not runs_root.is_dir():
|
|
611
|
+
return []
|
|
612
|
+
|
|
613
|
+
extractor_dirs: List[Path]
|
|
614
|
+
if extractor_id is None:
|
|
615
|
+
extractor_dirs = [path for path in sorted(runs_root.iterdir()) if path.is_dir()]
|
|
616
|
+
else:
|
|
617
|
+
extractor_path = runs_root / extractor_id
|
|
618
|
+
extractor_dirs = [extractor_path] if extractor_path.is_dir() else []
|
|
619
|
+
|
|
620
|
+
entries: List[ExtractionRunListEntry] = []
|
|
621
|
+
for extractor_dir in extractor_dirs:
|
|
622
|
+
for run_dir in sorted(extractor_dir.iterdir()):
|
|
623
|
+
if not run_dir.is_dir():
|
|
624
|
+
continue
|
|
625
|
+
manifest_path = run_dir / "manifest.json"
|
|
626
|
+
if not manifest_path.is_file():
|
|
627
|
+
continue
|
|
628
|
+
try:
|
|
629
|
+
manifest = self.load_extraction_run_manifest(
|
|
630
|
+
extractor_id=extractor_dir.name,
|
|
631
|
+
run_id=run_dir.name,
|
|
632
|
+
)
|
|
633
|
+
except (FileNotFoundError, ValueError):
|
|
634
|
+
continue
|
|
635
|
+
entries.append(
|
|
636
|
+
ExtractionRunListEntry(
|
|
637
|
+
extractor_id=extractor_dir.name,
|
|
638
|
+
run_id=run_dir.name,
|
|
639
|
+
recipe_id=manifest.recipe.recipe_id,
|
|
640
|
+
recipe_name=manifest.recipe.name,
|
|
641
|
+
catalog_generated_at=manifest.catalog_generated_at,
|
|
642
|
+
created_at=manifest.created_at,
|
|
643
|
+
stats=dict(manifest.stats),
|
|
644
|
+
)
|
|
645
|
+
)
|
|
646
|
+
|
|
647
|
+
entries.sort(key=lambda entry: (entry.created_at, entry.extractor_id, entry.run_id), reverse=True)
|
|
648
|
+
return entries
|
|
649
|
+
|
|
650
|
+
def delete_extraction_run(self, *, extractor_id: str, run_id: str) -> None:
|
|
651
|
+
"""
|
|
652
|
+
Delete an extraction run directory and its derived artifacts.
|
|
653
|
+
|
|
654
|
+
:param extractor_id: Extractor plugin identifier.
|
|
655
|
+
:type extractor_id: str
|
|
656
|
+
:param run_id: Extraction run identifier.
|
|
657
|
+
:type run_id: str
|
|
658
|
+
:return: None.
|
|
659
|
+
:rtype: None
|
|
660
|
+
:raises FileNotFoundError: If the extraction run directory does not exist.
|
|
661
|
+
"""
|
|
662
|
+
run_dir = self.extraction_run_dir(extractor_id=extractor_id, run_id=run_id)
|
|
663
|
+
if not run_dir.is_dir():
|
|
664
|
+
raise FileNotFoundError(f"Missing extraction run directory: {run_dir}")
|
|
665
|
+
shutil.rmtree(run_dir)
|
|
666
|
+
|
|
583
667
|
def _ensure_runs_dir(self) -> None:
|
|
584
668
|
"""
|
|
585
669
|
Ensure the retrieval runs directory exists.
|
|
@@ -587,7 +671,6 @@ class Corpus:
|
|
|
587
671
|
:return: None.
|
|
588
672
|
:rtype: None
|
|
589
673
|
"""
|
|
590
|
-
|
|
591
674
|
self.runs_dir.mkdir(parents=True, exist_ok=True)
|
|
592
675
|
|
|
593
676
|
def write_run(self, run: RetrievalRun) -> None:
|
|
@@ -599,7 +682,6 @@ class Corpus:
|
|
|
599
682
|
:return: None.
|
|
600
683
|
:rtype: None
|
|
601
684
|
"""
|
|
602
|
-
|
|
603
685
|
self._ensure_runs_dir()
|
|
604
686
|
path = self.runs_dir / f"{run.run_id}.json"
|
|
605
687
|
path.write_text(run.model_dump_json(indent=2) + "\n", encoding="utf-8")
|
|
@@ -618,7 +700,6 @@ class Corpus:
|
|
|
618
700
|
:rtype: RetrievalRun
|
|
619
701
|
:raises FileNotFoundError: If the run manifest does not exist.
|
|
620
702
|
"""
|
|
621
|
-
|
|
622
703
|
path = self.runs_dir / f"{run_id}.json"
|
|
623
704
|
if not path.is_file():
|
|
624
705
|
raise FileNotFoundError(f"Missing run manifest: {path}")
|
|
@@ -633,7 +714,6 @@ class Corpus:
|
|
|
633
714
|
:return: Latest run identifier or None.
|
|
634
715
|
:rtype: str or None
|
|
635
716
|
"""
|
|
636
|
-
|
|
637
717
|
return self._load_catalog().latest_run_id
|
|
638
718
|
|
|
639
719
|
def _upsert_catalog_item(self, item: CatalogItem) -> None:
|
|
@@ -645,7 +725,6 @@ class Corpus:
|
|
|
645
725
|
:return: None.
|
|
646
726
|
:rtype: None
|
|
647
727
|
"""
|
|
648
|
-
|
|
649
728
|
self._init_catalog()
|
|
650
729
|
catalog = self._load_catalog()
|
|
651
730
|
catalog.items[item.id] = item
|
|
@@ -693,7 +772,6 @@ class Corpus:
|
|
|
693
772
|
:rtype: IngestResult
|
|
694
773
|
:raises ValueError: If markdown is not Unicode Transformation Format 8.
|
|
695
774
|
"""
|
|
696
|
-
|
|
697
775
|
item_id = str(uuid.uuid4())
|
|
698
776
|
safe_filename = _sanitize_filename(filename) if filename else ""
|
|
699
777
|
|
|
@@ -741,7 +819,9 @@ class Corpus:
|
|
|
741
819
|
try:
|
|
742
820
|
markdown_text = data.decode("utf-8")
|
|
743
821
|
except UnicodeDecodeError as decode_error:
|
|
744
|
-
raise ValueError(
|
|
822
|
+
raise ValueError(
|
|
823
|
+
"Markdown must be Unicode Transformation Format 8"
|
|
824
|
+
) from decode_error
|
|
745
825
|
|
|
746
826
|
parsed_document = parse_front_matter(markdown_text)
|
|
747
827
|
frontmatter = dict(parsed_document.metadata)
|
|
@@ -760,7 +840,9 @@ class Corpus:
|
|
|
760
840
|
if isinstance(title_value, str) and title_value.strip():
|
|
761
841
|
resolved_title = title_value.strip()
|
|
762
842
|
|
|
763
|
-
frontmatter = _ensure_biblicus_block(
|
|
843
|
+
frontmatter = _ensure_biblicus_block(
|
|
844
|
+
frontmatter, item_id=item_id, source_uri=source_uri
|
|
845
|
+
)
|
|
764
846
|
rendered_document = render_front_matter(frontmatter, parsed_document.body)
|
|
765
847
|
data_to_write = rendered_document.encode("utf-8")
|
|
766
848
|
else:
|
|
@@ -807,7 +889,9 @@ class Corpus:
|
|
|
807
889
|
sidecar_metadata["media_type"] = media_type
|
|
808
890
|
sidecar_metadata["biblicus"] = {"id": item_id, "source": source_uri}
|
|
809
891
|
_write_sidecar(output_path, sidecar_metadata)
|
|
810
|
-
frontmatter = _merge_metadata(
|
|
892
|
+
frontmatter = _merge_metadata(
|
|
893
|
+
frontmatter if isinstance(frontmatter, dict) else {}, sidecar_metadata
|
|
894
|
+
)
|
|
811
895
|
|
|
812
896
|
created_at = utc_now_iso()
|
|
813
897
|
item_record = CatalogItem(
|
|
@@ -858,7 +942,6 @@ class Corpus:
|
|
|
858
942
|
:rtype: IngestResult
|
|
859
943
|
:raises ValueError: If the media_type is text/markdown.
|
|
860
944
|
"""
|
|
861
|
-
|
|
862
945
|
if media_type == "text/markdown":
|
|
863
946
|
raise ValueError("Stream ingestion is not supported for Markdown")
|
|
864
947
|
|
|
@@ -972,7 +1055,6 @@ class Corpus:
|
|
|
972
1055
|
:return: Ingestion result summary.
|
|
973
1056
|
:rtype: IngestResult
|
|
974
1057
|
"""
|
|
975
|
-
|
|
976
1058
|
data = text.encode("utf-8")
|
|
977
1059
|
return self.ingest_item(
|
|
978
1060
|
data,
|
|
@@ -1003,7 +1085,6 @@ class Corpus:
|
|
|
1003
1085
|
:return: Ingestion result summary.
|
|
1004
1086
|
:rtype: IngestResult
|
|
1005
1087
|
"""
|
|
1006
|
-
|
|
1007
1088
|
candidate_path = Path(source) if isinstance(source, str) and "://" not in source else None
|
|
1008
1089
|
if isinstance(source, Path) or (candidate_path is not None and candidate_path.exists()):
|
|
1009
1090
|
path = source if isinstance(source, Path) else candidate_path
|
|
@@ -1061,7 +1142,6 @@ class Corpus:
|
|
|
1061
1142
|
:raises FileNotFoundError: If the source_root does not exist.
|
|
1062
1143
|
:raises ValueError: If a markdown file cannot be decoded as Unicode Transformation Format 8.
|
|
1063
1144
|
"""
|
|
1064
|
-
|
|
1065
1145
|
source_root = source_root.resolve()
|
|
1066
1146
|
if not source_root.is_dir():
|
|
1067
1147
|
raise FileNotFoundError(f"Import source root does not exist: {source_root}")
|
|
@@ -1111,9 +1191,10 @@ class Corpus:
|
|
|
1111
1191
|
:rtype: None
|
|
1112
1192
|
:raises ValueError: If a markdown file cannot be decoded as Unicode Transformation Format 8.
|
|
1113
1193
|
"""
|
|
1114
|
-
|
|
1115
1194
|
item_id = str(uuid.uuid4())
|
|
1116
|
-
destination_relpath = str(
|
|
1195
|
+
destination_relpath = str(
|
|
1196
|
+
Path(DEFAULT_RAW_DIR) / "imports" / import_id / relative_source_path
|
|
1197
|
+
)
|
|
1117
1198
|
destination_path = (self.root / destination_relpath).resolve()
|
|
1118
1199
|
destination_path.parent.mkdir(parents=True, exist_ok=True)
|
|
1119
1200
|
|
|
@@ -1176,11 +1257,8 @@ class Corpus:
|
|
|
1176
1257
|
:return: Catalog items ordered by recency.
|
|
1177
1258
|
:rtype: list[CatalogItem]
|
|
1178
1259
|
"""
|
|
1179
|
-
|
|
1180
1260
|
catalog = self._load_catalog()
|
|
1181
|
-
ordered_ids = (
|
|
1182
|
-
catalog.order[:limit] if catalog.order else list(catalog.items.keys())[:limit]
|
|
1183
|
-
)
|
|
1261
|
+
ordered_ids = catalog.order[:limit] if catalog.order else list(catalog.items.keys())[:limit]
|
|
1184
1262
|
collected_items: List[CatalogItem] = []
|
|
1185
1263
|
for item_id in ordered_ids:
|
|
1186
1264
|
item = catalog.items.get(item_id)
|
|
@@ -1198,13 +1276,84 @@ class Corpus:
|
|
|
1198
1276
|
:rtype: CatalogItem
|
|
1199
1277
|
:raises KeyError: If the item identifier is unknown.
|
|
1200
1278
|
"""
|
|
1201
|
-
|
|
1202
1279
|
catalog = self._load_catalog()
|
|
1203
1280
|
item = catalog.items.get(item_id)
|
|
1204
1281
|
if item is None:
|
|
1205
1282
|
raise KeyError(f"Unknown item identifier: {item_id}")
|
|
1206
1283
|
return item
|
|
1207
1284
|
|
|
1285
|
+
def create_crawl_id(self) -> str:
|
|
1286
|
+
"""
|
|
1287
|
+
Create a new crawl identifier.
|
|
1288
|
+
|
|
1289
|
+
:return: Crawl identifier.
|
|
1290
|
+
:rtype: str
|
|
1291
|
+
"""
|
|
1292
|
+
return str(uuid.uuid4())
|
|
1293
|
+
|
|
1294
|
+
def ingest_crawled_payload(
|
|
1295
|
+
self,
|
|
1296
|
+
*,
|
|
1297
|
+
crawl_id: str,
|
|
1298
|
+
relative_path: str,
|
|
1299
|
+
data: bytes,
|
|
1300
|
+
filename: str,
|
|
1301
|
+
media_type: str,
|
|
1302
|
+
source_uri: str,
|
|
1303
|
+
tags: Sequence[str],
|
|
1304
|
+
) -> None:
|
|
1305
|
+
"""
|
|
1306
|
+
Ingest a crawled payload under a crawl import namespace.
|
|
1307
|
+
|
|
1308
|
+
:param crawl_id: Crawl identifier used to group crawled artifacts.
|
|
1309
|
+
:type crawl_id: str
|
|
1310
|
+
:param relative_path: Relative path within the crawl prefix.
|
|
1311
|
+
:type relative_path: str
|
|
1312
|
+
:param data: Raw payload bytes.
|
|
1313
|
+
:type data: bytes
|
|
1314
|
+
:param filename: Suggested filename from the payload metadata.
|
|
1315
|
+
:type filename: str
|
|
1316
|
+
:param media_type: Internet Assigned Numbers Authority media type.
|
|
1317
|
+
:type media_type: str
|
|
1318
|
+
:param source_uri: Source uniform resource identifier (typically an http or https uniform resource locator).
|
|
1319
|
+
:type source_uri: str
|
|
1320
|
+
:param tags: Tags to attach to the stored item.
|
|
1321
|
+
:type tags: Sequence[str]
|
|
1322
|
+
:return: None.
|
|
1323
|
+
:rtype: None
|
|
1324
|
+
"""
|
|
1325
|
+
_ = filename
|
|
1326
|
+
item_id = str(uuid.uuid4())
|
|
1327
|
+
destination_relpath = str(Path(DEFAULT_RAW_DIR) / "imports" / "crawl" / crawl_id / relative_path)
|
|
1328
|
+
destination_path = (self.root / destination_relpath).resolve()
|
|
1329
|
+
destination_path.parent.mkdir(parents=True, exist_ok=True)
|
|
1330
|
+
destination_path.write_bytes(data)
|
|
1331
|
+
|
|
1332
|
+
sha256_digest = _sha256_bytes(data)
|
|
1333
|
+
|
|
1334
|
+
sidecar: Dict[str, Any] = {}
|
|
1335
|
+
sidecar["tags"] = [t.strip() for t in tags if isinstance(t, str) and t.strip()]
|
|
1336
|
+
sidecar["media_type"] = media_type
|
|
1337
|
+
sidecar["biblicus"] = {"id": item_id, "source": source_uri}
|
|
1338
|
+
_write_sidecar(destination_path, sidecar)
|
|
1339
|
+
|
|
1340
|
+
merged_metadata = _merge_metadata({}, sidecar)
|
|
1341
|
+
resolved_tags = _merge_tags([], merged_metadata.get("tags"))
|
|
1342
|
+
|
|
1343
|
+
item_record = CatalogItem(
|
|
1344
|
+
id=item_id,
|
|
1345
|
+
relpath=destination_relpath,
|
|
1346
|
+
sha256=sha256_digest,
|
|
1347
|
+
bytes=len(data),
|
|
1348
|
+
media_type=media_type,
|
|
1349
|
+
title=None,
|
|
1350
|
+
tags=list(resolved_tags),
|
|
1351
|
+
metadata=dict(merged_metadata or {}),
|
|
1352
|
+
created_at=utc_now_iso(),
|
|
1353
|
+
source_uri=source_uri,
|
|
1354
|
+
)
|
|
1355
|
+
self._upsert_catalog_item(item_record)
|
|
1356
|
+
|
|
1208
1357
|
def reindex(self) -> Dict[str, int]:
|
|
1209
1358
|
"""
|
|
1210
1359
|
Rebuild/refresh the corpus catalog from the current on-disk corpus contents.
|
|
@@ -1216,7 +1365,6 @@ class Corpus:
|
|
|
1216
1365
|
:rtype: dict[str, int]
|
|
1217
1366
|
:raises ValueError: If a markdown file cannot be decoded as Unicode Transformation Format 8.
|
|
1218
1367
|
"""
|
|
1219
|
-
|
|
1220
1368
|
self._init_catalog()
|
|
1221
1369
|
existing_catalog = self._load_catalog()
|
|
1222
1370
|
stats = {"scanned": 0, "skipped": 0, "inserted": 0, "updated": 0}
|
|
@@ -1291,7 +1439,9 @@ class Corpus:
|
|
|
1291
1439
|
|
|
1292
1440
|
previous_item = existing_catalog.items.get(item_id)
|
|
1293
1441
|
created_at = previous_item.created_at if previous_item is not None else utc_now_iso()
|
|
1294
|
-
source_uri = source_uri or (
|
|
1442
|
+
source_uri = source_uri or (
|
|
1443
|
+
previous_item.source_uri if previous_item is not None else None
|
|
1444
|
+
)
|
|
1295
1445
|
|
|
1296
1446
|
if previous_item is None:
|
|
1297
1447
|
stats["inserted"] += 1
|
|
@@ -1338,7 +1488,6 @@ class Corpus:
|
|
|
1338
1488
|
:return: Corpus name.
|
|
1339
1489
|
:rtype: str
|
|
1340
1490
|
"""
|
|
1341
|
-
|
|
1342
1491
|
return self.root.name
|
|
1343
1492
|
|
|
1344
1493
|
def purge(self, *, confirm: str) -> None:
|
|
@@ -1351,10 +1500,11 @@ class Corpus:
|
|
|
1351
1500
|
:rtype: None
|
|
1352
1501
|
:raises ValueError: If the confirmation does not match.
|
|
1353
1502
|
"""
|
|
1354
|
-
|
|
1355
1503
|
expected = self.name
|
|
1356
1504
|
if confirm != expected:
|
|
1357
|
-
raise ValueError(
|
|
1505
|
+
raise ValueError(
|
|
1506
|
+
f"Confirmation mismatch: pass --confirm {expected!r} to purge this corpus"
|
|
1507
|
+
)
|
|
1358
1508
|
|
|
1359
1509
|
if self.raw_dir.exists():
|
|
1360
1510
|
shutil.rmtree(self.raw_dir)
|