biblicus 0.1.1__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +2 -2
- biblicus/_vendor/dotyaml/__init__.py +14 -0
- biblicus/_vendor/dotyaml/interpolation.py +63 -0
- biblicus/_vendor/dotyaml/loader.py +181 -0
- biblicus/_vendor/dotyaml/transformer.py +135 -0
- biblicus/backends/__init__.py +0 -2
- biblicus/backends/base.py +3 -3
- biblicus/backends/scan.py +96 -13
- biblicus/backends/sqlite_full_text_search.py +74 -14
- biblicus/cli.py +126 -19
- biblicus/constants.py +2 -0
- biblicus/corpus.py +455 -45
- biblicus/errors.py +15 -0
- biblicus/evaluation.py +4 -8
- biblicus/extraction.py +529 -0
- biblicus/extractors/__init__.py +44 -0
- biblicus/extractors/base.py +68 -0
- biblicus/extractors/metadata_text.py +106 -0
- biblicus/extractors/openai_stt.py +180 -0
- biblicus/extractors/pass_through_text.py +84 -0
- biblicus/extractors/pdf_text.py +100 -0
- biblicus/extractors/pipeline.py +105 -0
- biblicus/extractors/rapidocr_text.py +129 -0
- biblicus/extractors/select_longest_text.py +105 -0
- biblicus/extractors/select_text.py +100 -0
- biblicus/extractors/unstructured_text.py +100 -0
- biblicus/frontmatter.py +0 -3
- biblicus/hook_logging.py +180 -0
- biblicus/hook_manager.py +203 -0
- biblicus/hooks.py +261 -0
- biblicus/ignore.py +64 -0
- biblicus/models.py +107 -0
- biblicus/retrieval.py +0 -4
- biblicus/sources.py +85 -5
- biblicus/time.py +0 -1
- biblicus/uris.py +3 -4
- biblicus/user_config.py +138 -0
- biblicus-0.3.0.dist-info/METADATA +336 -0
- biblicus-0.3.0.dist-info/RECORD +44 -0
- biblicus-0.1.1.dist-info/METADATA +0 -174
- biblicus-0.1.1.dist-info/RECORD +0 -22
- {biblicus-0.1.1.dist-info → biblicus-0.3.0.dist-info}/WHEEL +0 -0
- {biblicus-0.1.1.dist-info → biblicus-0.3.0.dist-info}/entry_points.txt +0 -0
- {biblicus-0.1.1.dist-info → biblicus-0.3.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-0.1.1.dist-info → biblicus-0.3.0.dist-info}/top_level.txt +0 -0
biblicus/corpus.py
CHANGED
@@ -13,13 +13,24 @@ from pathlib import Path
 from typing import Any, Dict, List, Optional, Sequence

 import yaml
-
-
+from pydantic import ValidationError
+
+from .constants import (
+    CORPUS_DIR_NAME,
+    DEFAULT_RAW_DIR,
+    EXTRACTION_RUNS_DIR_NAME,
+    RUNS_DIR_NAME,
+    SCHEMA_VERSION,
+    SIDECAR_SUFFIX,
+)
 from .frontmatter import parse_front_matter, render_front_matter
+from .hook_manager import HookManager
+from .hooks import HookPoint
+from .ignore import load_corpus_ignore_spec
 from .models import CatalogItem, CorpusCatalog, CorpusConfig, IngestResult, RetrievalRun
 from .sources import load_source
 from .time import utc_now_iso
-from .uris import
+from .uris import corpus_ref_to_path, normalize_corpus_uri


 def _sha256_bytes(data: bytes) -> str:
@@ -31,10 +42,38 @@ def _sha256_bytes(data: bytes) -> str:
     :return: Secure Hash Algorithm 256 hex digest.
     :rtype: str
     """
-
     return hashlib.sha256(data).hexdigest()


+def _write_stream_and_hash(
+    stream, destination_path: Path, *, chunk_size: int = 1024 * 1024
+) -> Dict[str, object]:
+    """
+    Write a binary stream to disk while computing a digest.
+
+    :param stream: Binary stream to read from.
+    :type stream: object
+    :param destination_path: Destination path to write to.
+    :type destination_path: Path
+    :param chunk_size: Chunk size for reads.
+    :type chunk_size: int
+    :return: Mapping containing sha256 and bytes_written.
+    :rtype: dict[str, object]
+    :raises OSError: If the destination cannot be written.
+    """
+    hasher = hashlib.sha256()
+    bytes_written = 0
+    with destination_path.open("wb") as destination_handle:
+        while True:
+            chunk = stream.read(chunk_size)
+            if not chunk:
+                break
+            hasher.update(chunk)
+            destination_handle.write(chunk)
+            bytes_written += len(chunk)
+    return {"sha256": hasher.hexdigest(), "bytes_written": bytes_written}
+
+
 def _sanitize_filename(name: str) -> str:
     """
     Sanitize a filename into a portable, filesystem-friendly form.
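
A quick sanity sketch of the new `_write_stream_and_hash` helper (the payload and destination path are hypothetical; the function is a private helper in `biblicus.corpus`): the digest computed in chunks while writing must match hashing the whole payload in memory, which is what keeps streamed ingestion compatible with `_sha256_bytes`.

    import hashlib
    import io
    from pathlib import Path

    from biblicus.corpus import _write_stream_and_hash

    payload = b"example payload" * 100_000
    destination = Path("/tmp/biblicus-stream-demo.bin")

    # Write in 1 MiB chunks, hashing as we go.
    result = _write_stream_and_hash(io.BytesIO(payload), destination)

    # The chunked digest equals the digest of the whole payload at once.
    assert result["sha256"] == hashlib.sha256(payload).hexdigest()
    assert result["bytes_written"] == len(payload)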
@@ -44,7 +83,6 @@ def _sanitize_filename(name: str) -> str:
     :return: Sanitized filename.
     :rtype: str
     """
-
     allowed_characters = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-._() ")
     sanitized_name = "".join(
         (character if character in allowed_characters else "_") for character in name
@@ -61,9 +99,9 @@ def _preferred_extension_for_media_type(media_type: str) -> Optional[str]:
     :return: Preferred extension or None.
     :rtype: str or None
     """
-
     media_type_overrides = {
         "image/jpeg": ".jpg",
+        "audio/ogg": ".ogg",
     }
     if media_type in media_type_overrides:
         return media_type_overrides[media_type]
@@ -81,7 +119,6 @@ def _ensure_filename_extension(filename: str, *, media_type: str) -> str:
     :return: Filename with a compatible extension.
     :rtype: str
     """
-
     raw_name = filename.strip()

     if media_type == "text/markdown":
@@ -89,11 +126,12 @@ def _ensure_filename_extension(filename: str, *, media_type: str) -> str:
             return raw_name
         return raw_name + ".md"

+    if Path(raw_name).suffix:
+        return raw_name
+
     ext = _preferred_extension_for_media_type(media_type)
     if not ext:
         return raw_name
-    if raw_name.lower().endswith(ext.lower()):
-        return raw_name
     return raw_name + ext


@@ -108,7 +146,6 @@ def _merge_tags(explicit: Sequence[str], from_frontmatter: Any) -> List[str]:
     :return: Deduplicated tag list preserving order.
     :rtype: list[str]
     """
-
     merged_tags: List[str] = []

     for explicit_tag in explicit:
@@ -141,7 +178,6 @@ def _sidecar_path_for(content_path: Path) -> Path:
     :return: Sidecar path.
     :rtype: Path
     """
-
     return content_path.with_name(content_path.name + SIDECAR_SUFFIX)


@@ -155,7 +191,6 @@ def _load_sidecar(content_path: Path) -> Dict[str, Any]:
     :rtype: dict[str, Any]
     :raises ValueError: If the sidecar content is not a mapping.
     """
-
     path = _sidecar_path_for(content_path)
     if not path.is_file():
         return {}
@@ -186,7 +221,9 @@ def _write_sidecar(content_path: Path, metadata: Dict[str, Any]) -> None:
     path.write_text(text + "\n", encoding="utf-8")


-def _ensure_biblicus_block(
+def _ensure_biblicus_block(
+    metadata: Dict[str, Any], *, item_id: str, source_uri: str
+) -> Dict[str, Any]:
     """
     Ensure the biblicus metadata block exists and is populated.

@@ -284,11 +321,11 @@ class Corpus:
         :param root: Corpus root directory.
         :type root: Path
         """
-
         self.root = root
         self.meta_dir = self.root / CORPUS_DIR_NAME
         self.raw_dir = self.root / DEFAULT_RAW_DIR
         self.config = self._load_config()
+        self._hooks = self._load_hooks()

     @property
     def uri(self) -> str:
@@ -298,7 +335,6 @@ class Corpus:
         :return: Corpus uniform resource identifier.
         :rtype: str
         """
-
         return self.root.as_uri()

     def _load_config(self) -> Optional[CorpusConfig]:
@@ -309,12 +345,38 @@ class Corpus:
         :rtype: CorpusConfig or None
         :raises ValueError: If the config schema is invalid.
         """
-
         path = self.meta_dir / "config.json"
         if not path.is_file():
             return None
         data = json.loads(path.read_text(encoding="utf-8"))
-
+        try:
+            return CorpusConfig.model_validate(data)
+        except ValidationError as exc:
+            has_hook_error = any(
+                isinstance(error.get("loc"), tuple)
+                and error.get("loc")
+                and error.get("loc")[0] == "hooks"
+                for error in exc.errors()
+            )
+            if has_hook_error:
+                raise ValueError(f"Invalid hook specification: {exc}") from exc
+            raise ValueError(f"Invalid corpus config: {exc}") from exc
+
+    def _load_hooks(self) -> Optional[HookManager]:
+        """
+        Load the hook manager from config if hooks are configured.
+
+        :return: Hook manager or None.
+        :rtype: HookManager or None
+        :raises ValueError: If hook specifications are invalid.
+        """
+        if self.config is None or not self.config.hooks:
+            return None
+        return HookManager.from_config(
+            corpus_root=self.root,
+            corpus_uri=self.uri,
+            hook_specs=self.config.hooks,
+        )

     @classmethod
     def find(cls, start: Path) -> "Corpus":
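
The `hooks`-specific branch in `_load_config` leans on how pydantic v2 reports failures: `ValidationError.errors()` yields one dict per problem, and the first element of each `loc` tuple names the top-level field that failed. A minimal sketch of that behavior (the model is a hypothetical stand-in for `CorpusConfig`):

    from pydantic import BaseModel, ValidationError

    class ExampleConfig(BaseModel):
        name: str
        hooks: list = []

    try:
        ExampleConfig.model_validate({"name": "demo", "hooks": "not-a-list"})
    except ValidationError as exc:
        for error in exc.errors():
            print(error["loc"])  # ("hooks",) -- the first element names the failing field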
@@ -327,7 +389,6 @@ class Corpus:
         :rtype: Corpus
         :raises FileNotFoundError: If no corpus config is found.
         """
-
         start = start.resolve()
         for candidate in [start, *start.parents]:
             if (candidate / CORPUS_DIR_NAME / "config.json").is_file():
@@ -346,7 +407,6 @@ class Corpus:
         :return: Opened corpus instance.
         :rtype: Corpus
         """
-
         return cls.find(corpus_ref_to_path(ref))

     @classmethod
@@ -362,7 +422,6 @@ class Corpus:
         :rtype: Corpus
         :raises FileExistsError: If the corpus already exists and force is False.
         """
-
         root = root.resolve()
         corpus = cls(root)

@@ -392,7 +451,6 @@ class Corpus:
         :return: Catalog file path.
         :rtype: Path
         """
-
         return self.meta_dir / "catalog.json"

     def _init_catalog(self) -> None:
@@ -402,7 +460,6 @@ class Corpus:
         :return: None.
         :rtype: None
         """
-
         if self.catalog_path.exists():
             return
         catalog = CorpusCatalog(
@@ -425,7 +482,6 @@ class Corpus:
         :raises FileNotFoundError: If the catalog file does not exist.
         :raises ValueError: If the catalog schema is invalid.
         """
-
         if not self.catalog_path.is_file():
             raise FileNotFoundError(f"Missing corpus catalog: {self.catalog_path}")
         catalog_data = json.loads(self.catalog_path.read_text(encoding="utf-8"))
@@ -440,7 +496,6 @@ class Corpus:
         :raises FileNotFoundError: If the catalog file does not exist.
         :raises ValueError: If the catalog schema is invalid.
         """
-
         return self._load_catalog()

     def _write_catalog(self, catalog: CorpusCatalog) -> None:
@@ -452,7 +507,6 @@ class Corpus:
         :return: None.
         :rtype: None
         """
-
         temp_path = self.catalog_path.with_suffix(".json.tmp")
         temp_path.write_text(catalog.model_dump_json(indent=2) + "\n", encoding="utf-8")
         temp_path.replace(self.catalog_path)
@@ -465,9 +519,54 @@ class Corpus:
         :return: Path to the runs directory.
         :rtype: Path
         """
-
         return self.meta_dir / RUNS_DIR_NAME

+    @property
+    def extraction_runs_dir(self) -> Path:
+        """
+        Location of extraction run artifacts.
+
+        :return: Path to the extraction runs directory.
+        :rtype: Path
+        """
+        return self.runs_dir / EXTRACTION_RUNS_DIR_NAME
+
+    def extraction_run_dir(self, *, extractor_id: str, run_id: str) -> Path:
+        """
+        Resolve an extraction run directory.
+
+        :param extractor_id: Extractor plugin identifier.
+        :type extractor_id: str
+        :param run_id: Extraction run identifier.
+        :type run_id: str
+        :return: Extraction run directory.
+        :rtype: Path
+        """
+        return self.extraction_runs_dir / extractor_id / run_id
+
+    def read_extracted_text(self, *, extractor_id: str, run_id: str, item_id: str) -> Optional[str]:
+        """
+        Read extracted text for an item from an extraction run, when present.
+
+        :param extractor_id: Extractor plugin identifier.
+        :type extractor_id: str
+        :param run_id: Extraction run identifier.
+        :type run_id: str
+        :param item_id: Item identifier.
+        :type item_id: str
+        :return: Extracted text or None if the artifact does not exist.
+        :rtype: str or None
+        :raises OSError: If the file exists but cannot be read.
+        """
+        path = (
+            self.extraction_run_dir(extractor_id=extractor_id, run_id=run_id)
+            / "text"
+            / f"{item_id}.txt"
+        )
+        if not path.is_file():
+            return None
+        return path.read_text(encoding="utf-8")
+
     def _ensure_runs_dir(self) -> None:
         """
         Ensure the retrieval runs directory exists.
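
Read together, these accessors pin down the artifact layout: extracted text for an item lives at `<runs_dir>/<EXTRACTION_RUNS_DIR_NAME>/<extractor_id>/<run_id>/text/<item_id>.txt`. A hypothetical lookup (the extractor and run identifiers are illustrative):

    from pathlib import Path

    from biblicus.corpus import Corpus

    corpus = Corpus.find(Path("."))
    for item in corpus.list_items(limit=1):
        text = corpus.read_extracted_text(
            extractor_id="pdf-text",  # illustrative extractor id
            run_id="run-0001",        # illustrative run id
            item_id=item.id,
        )
        print(text if text is not None else "no text artifact for this item")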
@@ -475,7 +574,6 @@ class Corpus:
         :return: None.
         :rtype: None
         """
-
         self.runs_dir.mkdir(parents=True, exist_ok=True)

     def write_run(self, run: RetrievalRun) -> None:
@@ -487,7 +585,6 @@ class Corpus:
         :return: None.
         :rtype: None
         """
-
         self._ensure_runs_dir()
         path = self.runs_dir / f"{run.run_id}.json"
         path.write_text(run.model_dump_json(indent=2) + "\n", encoding="utf-8")
@@ -506,7 +603,6 @@ class Corpus:
         :rtype: RetrievalRun
         :raises FileNotFoundError: If the run manifest does not exist.
         """
-
         path = self.runs_dir / f"{run_id}.json"
         if not path.is_file():
             raise FileNotFoundError(f"Missing run manifest: {path}")
@@ -521,7 +617,6 @@ class Corpus:
         :return: Latest run identifier or None.
         :rtype: str or None
         """
-
         return self._load_catalog().latest_run_id

     def _upsert_catalog_item(self, item: CatalogItem) -> None:
@@ -533,7 +628,6 @@ class Corpus:
         :return: None.
         :rtype: None
         """
-
         self._init_catalog()
         catalog = self._load_catalog()
         catalog.items[item.id] = item
@@ -581,7 +675,6 @@ class Corpus:
         :rtype: IngestResult
         :raises ValueError: If markdown is not Unicode Transformation Format 8.
         """
-
         item_id = str(uuid.uuid4())
         safe_filename = _sanitize_filename(filename) if filename else ""

@@ -608,13 +701,30 @@ class Corpus:
         if resolved_tags and "tags" not in metadata_input:
             metadata_input["tags"] = list(resolved_tags)

+        if self._hooks is not None:
+            mutation = self._hooks.run_ingest_hooks(
+                hook_point=HookPoint.before_ingest,
+                filename=filename,
+                media_type=media_type,
+                title=resolved_title,
+                tags=list(resolved_tags),
+                metadata=dict(metadata_input),
+                source_uri=source_uri,
+            )
+            if mutation.add_tags:
+                for tag in mutation.add_tags:
+                    if tag not in resolved_tags:
+                        resolved_tags.append(tag)
+
         frontmatter: Dict[str, Any] = {}

         if media_type == "text/markdown":
             try:
                 markdown_text = data.decode("utf-8")
             except UnicodeDecodeError as decode_error:
-                raise ValueError(
+                raise ValueError(
+                    "Markdown must be Unicode Transformation Format 8"
+                ) from decode_error

             parsed_document = parse_front_matter(markdown_text)
             frontmatter = dict(parsed_document.metadata)
@@ -633,7 +743,9 @@ class Corpus:
             if isinstance(title_value, str) and title_value.strip():
                 resolved_title = title_value.strip()

-            frontmatter = _ensure_biblicus_block(
+            frontmatter = _ensure_biblicus_block(
+                frontmatter, item_id=item_id, source_uri=source_uri
+            )
             rendered_document = render_front_matter(frontmatter, parsed_document.body)
             data_to_write = rendered_document.encode("utf-8")
         else:
@@ -656,6 +768,34 @@ class Corpus:
             _write_sidecar(output_path, sidecar)
             frontmatter = sidecar

+        if self._hooks is not None:
+            mutation = self._hooks.run_ingest_hooks(
+                hook_point=HookPoint.after_ingest,
+                filename=filename,
+                media_type=media_type,
+                title=resolved_title,
+                tags=list(resolved_tags),
+                metadata=dict(metadata_input),
+                source_uri=source_uri,
+                item_id=item_id,
+                relpath=relpath,
+            )
+            if mutation.add_tags:
+                updated_tags = list(resolved_tags)
+                for tag in mutation.add_tags:
+                    if tag not in updated_tags:
+                        updated_tags.append(tag)
+                resolved_tags = updated_tags
+                sidecar_metadata = _load_sidecar(output_path)
+                sidecar_metadata["tags"] = resolved_tags
+                if media_type != "text/markdown":
+                    sidecar_metadata["media_type"] = media_type
+                sidecar_metadata["biblicus"] = {"id": item_id, "source": source_uri}
+                _write_sidecar(output_path, sidecar_metadata)
+                frontmatter = _merge_metadata(
+                    frontmatter if isinstance(frontmatter, dict) else {}, sidecar_metadata
+                )
+
         created_at = utc_now_iso()
         item_record = CatalogItem(
             id=item_id,
@@ -673,6 +813,129 @@ class Corpus:

         return IngestResult(item_id=item_id, relpath=relpath, sha256=sha256_digest)

+    def ingest_item_stream(
+        self,
+        stream,
+        *,
+        filename: Optional[str] = None,
+        media_type: str = "application/octet-stream",
+        tags: Sequence[str] = (),
+        metadata: Optional[Dict[str, Any]] = None,
+        source_uri: str = "unknown",
+    ) -> IngestResult:
+        """
+        Ingest a binary item from a readable stream.
+
+        This method is intended for large non-markdown items. It writes bytes to disk incrementally
+        while computing a checksum.
+
+        :param stream: Readable binary stream.
+        :type stream: object
+        :param filename: Optional filename for the stored item.
+        :type filename: str or None
+        :param media_type: Internet Assigned Numbers Authority media type for the item.
+        :type media_type: str
+        :param tags: Tags to associate with the item.
+        :type tags: Sequence[str]
+        :param metadata: Optional metadata mapping.
+        :type metadata: dict[str, Any] or None
+        :param source_uri: Source uniform resource identifier for provenance.
+        :type source_uri: str
+        :return: Ingestion result summary.
+        :rtype: IngestResult
+        :raises ValueError: If the media_type is text/markdown.
+        """
+        if media_type == "text/markdown":
+            raise ValueError("Stream ingestion is not supported for Markdown")
+
+        item_id = str(uuid.uuid4())
+        safe_filename = _sanitize_filename(filename) if filename else ""
+        if safe_filename:
+            safe_filename = _ensure_filename_extension(safe_filename, media_type=media_type)
+
+        if safe_filename:
+            output_name = f"{item_id}--{safe_filename}"
+        else:
+            extension = _preferred_extension_for_media_type(media_type) or ""
+            output_name = f"{item_id}{extension}" if extension else f"{item_id}"
+
+        relpath = str(Path(DEFAULT_RAW_DIR) / output_name)
+        output_path = self.root / relpath
+
+        resolved_tags = list(tags)
+        metadata_input: Dict[str, Any] = dict(metadata or {})
+        if resolved_tags and "tags" not in metadata_input:
+            metadata_input["tags"] = list(resolved_tags)
+
+        if self._hooks is not None:
+            mutation = self._hooks.run_ingest_hooks(
+                hook_point=HookPoint.before_ingest,
+                filename=filename,
+                media_type=media_type,
+                title=None,
+                tags=list(resolved_tags),
+                metadata=dict(metadata_input),
+                source_uri=source_uri,
+            )
+            if mutation.add_tags:
+                for tag in mutation.add_tags:
+                    if tag not in resolved_tags:
+                        resolved_tags.append(tag)
+
+        write_result = _write_stream_and_hash(stream, output_path)
+        sha256_digest = str(write_result["sha256"])
+        bytes_written = int(write_result["bytes_written"])
+
+        sidecar: Dict[str, Any] = {}
+        sidecar["media_type"] = media_type
+        if resolved_tags:
+            sidecar["tags"] = resolved_tags
+        if metadata_input:
+            for metadata_key, metadata_value in metadata_input.items():
+                if metadata_key in {"tags", "biblicus"}:
+                    continue
+                sidecar[metadata_key] = metadata_value
+        sidecar["biblicus"] = {"id": item_id, "source": source_uri}
+        _write_sidecar(output_path, sidecar)
+
+        if self._hooks is not None:
+            mutation = self._hooks.run_ingest_hooks(
+                hook_point=HookPoint.after_ingest,
+                filename=filename,
+                media_type=media_type,
+                title=None,
+                tags=list(resolved_tags),
+                metadata=dict(metadata_input),
+                source_uri=source_uri,
+                item_id=item_id,
+                relpath=relpath,
+            )
+            if mutation.add_tags:
+                updated_tags = list(resolved_tags)
+                for tag in mutation.add_tags:
+                    if tag not in updated_tags:
+                        updated_tags.append(tag)
+                resolved_tags = updated_tags
+                sidecar["tags"] = resolved_tags
+                _write_sidecar(output_path, sidecar)
+
+        created_at = utc_now_iso()
+        item_record = CatalogItem(
+            id=item_id,
+            relpath=relpath,
+            sha256=sha256_digest,
+            bytes=bytes_written,
+            media_type=media_type,
+            title=None,
+            tags=list(resolved_tags),
+            metadata=dict(sidecar or {}),
+            created_at=created_at,
+            source_uri=source_uri,
+        )
+        self._upsert_catalog_item(item_record)
+
+        return IngestResult(item_id=item_id, relpath=relpath, sha256=sha256_digest)
+
     def ingest_note(
         self,
         text: str,
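
A hypothetical use of the new streaming path (the file name and tags are illustrative): a large binary is written to the corpus in chunks rather than read into memory, and the returned `IngestResult` carries the digest computed during the write.

    from pathlib import Path

    from biblicus.corpus import Corpus

    corpus = Corpus.find(Path("."))
    source = Path("recordings/interview.ogg")
    with source.open("rb") as handle:
        result = corpus.ingest_item_stream(
            handle,
            filename=source.name,
            media_type="audio/ogg",
            tags=["interview"],
            source_uri=source.as_uri(),
        )
    print(result.item_id, result.sha256)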
@@ -695,7 +958,6 @@ class Corpus:
         :return: Ingestion result summary.
         :rtype: IngestResult
         """
-
         data = text.encode("utf-8")
         return self.ingest_item(
             data,
@@ -726,6 +988,35 @@ class Corpus:
         :return: Ingestion result summary.
         :rtype: IngestResult
         """
+        candidate_path = Path(source) if isinstance(source, str) and "://" not in source else None
+        if isinstance(source, Path) or (candidate_path is not None and candidate_path.exists()):
+            path = source if isinstance(source, Path) else candidate_path
+            assert isinstance(path, Path)
+            path = path.resolve()
+            filename = path.name
+            media_type, _ = mimetypes.guess_type(filename)
+            media_type = media_type or "application/octet-stream"
+            if path.suffix.lower() in {".md", ".markdown"}:
+                media_type = "text/markdown"
+            if media_type == "text/markdown":
+                return self.ingest_item(
+                    path.read_bytes(),
+                    filename=filename,
+                    media_type=media_type,
+                    title=None,
+                    tags=tags,
+                    metadata=None,
+                    source_uri=source_uri or path.as_uri(),
+                )
+            with path.open("rb") as handle:
+                return self.ingest_item_stream(
+                    handle,
+                    filename=filename,
+                    media_type=media_type,
+                    tags=tags,
+                    metadata=None,
+                    source_uri=source_uri or path.as_uri(),
+                )

         payload = load_source(source, source_uri=source_uri)
         return self.ingest_item(
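
In short, `ingest_source` now front-runs `load_source` for anything that looks like a local file: strings without a scheme are probed as paths, markdown stays on the in-memory `ingest_item` path so front matter can be parsed, and other local files are streamed. A hypothetical dispatch summary (paths are illustrative):

    corpus.ingest_source("notes/today.md")            # existing local markdown -> ingest_item
    corpus.ingest_source(Path("scans/page-1.png"))    # existing local binary -> ingest_item_stream
    corpus.ingest_source("https://example.com/a.md")  # contains "://" -> load_source fallback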
@@ -738,6 +1029,128 @@ class Corpus:
             source_uri=payload.source_uri,
         )

+    def import_tree(self, source_root: Path, *, tags: Sequence[str] = ()) -> Dict[str, int]:
+        """
+        Import a folder tree into the corpus, preserving relative paths and provenance.
+
+        Imported content is stored under the raw directory in a dedicated import namespace so that
+        operators can inspect and back up imported content as a structured tree.
+
+        :param source_root: Root directory of the folder tree to import.
+        :type source_root: Path
+        :param tags: Tags to associate with imported items.
+        :type tags: Sequence[str]
+        :return: Import statistics.
+        :rtype: dict[str, int]
+        :raises FileNotFoundError: If the source_root does not exist.
+        :raises ValueError: If a markdown file cannot be decoded as Unicode Transformation Format 8.
+        """
+        source_root = source_root.resolve()
+        if not source_root.is_dir():
+            raise FileNotFoundError(f"Import source root does not exist: {source_root}")
+
+        ignore_spec = load_corpus_ignore_spec(self.root)
+        import_id = str(uuid.uuid4())
+        stats = {"scanned": 0, "ignored": 0, "imported": 0}
+
+        for source_path in sorted(source_root.rglob("*")):
+            if not source_path.is_file():
+                continue
+            relative_source_path = source_path.relative_to(source_root).as_posix()
+            stats["scanned"] += 1
+            if ignore_spec.matches(relative_source_path):
+                stats["ignored"] += 1
+                continue
+            self._import_file(
+                source_path=source_path,
+                import_id=import_id,
+                relative_source_path=relative_source_path,
+                tags=tags,
+            )
+            stats["imported"] += 1
+
+        return stats
+
+    def _import_file(
+        self,
+        *,
+        source_path: Path,
+        import_id: str,
+        relative_source_path: str,
+        tags: Sequence[str],
+    ) -> None:
+        """
+        Import a single file into the corpus under an import namespace.
+
+        :param source_path: Source file path to import.
+        :type source_path: Path
+        :param import_id: Import identifier.
+        :type import_id: str
+        :param relative_source_path: Relative path within the imported tree.
+        :type relative_source_path: str
+        :param tags: Tags to apply.
+        :type tags: Sequence[str]
+        :return: None.
+        :rtype: None
+        :raises ValueError: If a markdown file cannot be decoded as Unicode Transformation Format 8.
+        """
+        item_id = str(uuid.uuid4())
+        destination_relpath = str(
+            Path(DEFAULT_RAW_DIR) / "imports" / import_id / relative_source_path
+        )
+        destination_path = (self.root / destination_relpath).resolve()
+        destination_path.parent.mkdir(parents=True, exist_ok=True)
+
+        raw_bytes = source_path.read_bytes()
+        sha256_digest = _sha256_bytes(raw_bytes)
+
+        media_type, _ = mimetypes.guess_type(source_path.name)
+        media_type = media_type or "application/octet-stream"
+        if source_path.suffix.lower() in {".md", ".markdown"}:
+            media_type = "text/markdown"
+
+        title: Optional[str] = None
+        frontmatter_metadata: Dict[str, Any] = {}
+        if media_type == "text/markdown":
+            try:
+                text = raw_bytes.decode("utf-8")
+            except UnicodeDecodeError as decode_error:
+                raise ValueError(
+                    f"Markdown file must be Unicode Transformation Format 8: {relative_source_path}"
+                ) from decode_error
+            parsed_document = parse_front_matter(text)
+            frontmatter_metadata = dict(parsed_document.metadata)
+            title_value = frontmatter_metadata.get("title")
+            if isinstance(title_value, str) and title_value.strip():
+                title = title_value.strip()
+
+        destination_path.write_bytes(raw_bytes)
+
+        sidecar: Dict[str, Any] = {}
+        if tags:
+            sidecar["tags"] = [t.strip() for t in tags if isinstance(t, str) and t.strip()]
+        if media_type != "text/markdown":
+            sidecar["media_type"] = media_type
+        sidecar["biblicus"] = {"id": item_id, "source": source_path.as_uri()}
+        _write_sidecar(destination_path, sidecar)
+
+        merged_metadata = _merge_metadata(frontmatter_metadata, sidecar)
+        resolved_tags = _merge_tags([], merged_metadata.get("tags"))
+
+        item_record = CatalogItem(
+            id=item_id,
+            relpath=destination_relpath,
+            sha256=sha256_digest,
+            bytes=len(raw_bytes),
+            media_type=media_type,
+            title=title,
+            tags=list(resolved_tags),
+            metadata=dict(merged_metadata or {}),
+            created_at=utc_now_iso(),
+            source_uri=source_path.as_uri(),
+        )
+        self._upsert_catalog_item(item_record)
+
     def list_items(self, *, limit: int = 50) -> List[CatalogItem]:
         """
         List items from the catalog.
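
A hypothetical end-to-end import (the source directory and tag are illustrative): each file keeps its relative path under an `imports/<import-id>/` namespace inside the raw directory, and ignore rules from `load_corpus_ignore_spec` are applied per file before anything is copied.

    from pathlib import Path

    from biblicus.corpus import Corpus

    corpus = Corpus.find(Path("."))
    stats = corpus.import_tree(Path("~/old-notes").expanduser(), tags=["legacy"])
    print(stats)  # e.g. {"scanned": 42, "ignored": 3, "imported": 39}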
@@ -747,11 +1160,8 @@ class Corpus:
         :return: Catalog items ordered by recency.
         :rtype: list[CatalogItem]
         """
-
         catalog = self._load_catalog()
-        ordered_ids = (
-            catalog.order[:limit] if catalog.order else list(catalog.items.keys())[:limit]
-        )
+        ordered_ids = catalog.order[:limit] if catalog.order else list(catalog.items.keys())[:limit]
         collected_items: List[CatalogItem] = []
         for item_id in ordered_ids:
             item = catalog.items.get(item_id)
@@ -769,7 +1179,6 @@ class Corpus:
         :rtype: CatalogItem
         :raises KeyError: If the item identifier is unknown.
         """
-
         catalog = self._load_catalog()
         item = catalog.items.get(item_id)
         if item is None:
@@ -787,7 +1196,6 @@ class Corpus:
         :rtype: dict[str, int]
         :raises ValueError: If a markdown file cannot be decoded as Unicode Transformation Format 8.
         """
-
         self._init_catalog()
         existing_catalog = self._load_catalog()
         stats = {"scanned": 0, "skipped": 0, "inserted": 0, "updated": 0}
@@ -862,7 +1270,9 @@ class Corpus:

             previous_item = existing_catalog.items.get(item_id)
             created_at = previous_item.created_at if previous_item is not None else utc_now_iso()
-            source_uri = source_uri or (
+            source_uri = source_uri or (
+                previous_item.source_uri if previous_item is not None else None
+            )

             if previous_item is None:
                 stats["inserted"] += 1
@@ -909,7 +1319,6 @@ class Corpus:
         :return: Corpus name.
         :rtype: str
         """
-
         return self.root.name

     def purge(self, *, confirm: str) -> None:
@@ -922,10 +1331,11 @@ class Corpus:
         :rtype: None
         :raises ValueError: If the confirmation does not match.
         """
-
         expected = self.name
         if confirm != expected:
-            raise ValueError(
+            raise ValueError(
+                f"Confirmation mismatch: pass --confirm {expected!r} to purge this corpus"
+            )

         if self.raw_dir.exists():
             shutil.rmtree(self.raw_dir)
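
The purge contract visible in this last hunk: the caller must echo the corpus name (the root directory's name) back verbatim, otherwise `purge` raises `ValueError` before touching the raw tree. A minimal sketch:

    from pathlib import Path

    from biblicus.corpus import Corpus

    corpus = Corpus.find(Path("."))
    corpus.purge(confirm=corpus.name)  # any other string raises ValueError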