biblicus 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as published to a supported registry. It is provided for informational purposes only.
biblicus/__init__.py CHANGED
@@ -25,4 +25,4 @@ __all__ = [
     "RetrievalRun",
 ]
 
-__version__ = "0.3.0"
+__version__ = "0.5.0"
biblicus/cli.py CHANGED
@@ -13,11 +13,19 @@ from typing import Dict, List, Optional
 from pydantic import ValidationError
 
 from .backends import get_backend
+from .context import (
+    ContextPackPolicy,
+    TokenBudget,
+    build_context_pack,
+    fit_context_pack_to_token_budget,
+)
 from .corpus import Corpus
+from .crawl import CrawlRequest, crawl_into_corpus
 from .errors import ExtractionRunFatalError
 from .evaluation import evaluate_run, load_dataset
+from .evidence_processing import apply_evidence_filter, apply_evidence_reranker
 from .extraction import build_extraction_run
-from .models import QueryBudget
+from .models import QueryBudget, RetrievalResult, parse_extraction_run_reference
 from .uris import corpus_ref_to_path
 
 
@@ -327,7 +335,7 @@ def cmd_build(arguments: argparse.Namespace) -> int:
     return 0
 
 
-def cmd_extract(arguments: argparse.Namespace) -> int:
+def cmd_extract_build(arguments: argparse.Namespace) -> int:
     """
     Build a text extraction run for the corpus using a pipeline of extractors.
 
@@ -359,6 +367,69 @@ def cmd_extract(arguments: argparse.Namespace) -> int:
     return 0
 
 
+def cmd_extract_list(arguments: argparse.Namespace) -> int:
+    """
+    List extraction runs stored under the corpus.
+
+    :param arguments: Parsed command-line interface arguments.
+    :type arguments: argparse.Namespace
+    :return: Exit code.
+    :rtype: int
+    """
+    corpus = (
+        Corpus.open(arguments.corpus)
+        if getattr(arguments, "corpus", None)
+        else Corpus.find(Path.cwd())
+    )
+    runs = corpus.list_extraction_runs(extractor_id=arguments.extractor_id)
+    print(json.dumps([entry.model_dump() for entry in runs], indent=2))
+    return 0
+
+
+def cmd_extract_show(arguments: argparse.Namespace) -> int:
+    """
+    Show an extraction run manifest.
+
+    :param arguments: Parsed command-line interface arguments.
+    :type arguments: argparse.Namespace
+    :return: Exit code.
+    :rtype: int
+    """
+    corpus = (
+        Corpus.open(arguments.corpus)
+        if getattr(arguments, "corpus", None)
+        else Corpus.find(Path.cwd())
+    )
+    reference = parse_extraction_run_reference(arguments.run)
+    manifest = corpus.load_extraction_run_manifest(
+        extractor_id=reference.extractor_id, run_id=reference.run_id
+    )
+    print(manifest.model_dump_json(indent=2))
+    return 0
+
+
+def cmd_extract_delete(arguments: argparse.Namespace) -> int:
+    """
+    Delete an extraction run directory and its derived artifacts.
+
+    :param arguments: Parsed command-line interface arguments.
+    :type arguments: argparse.Namespace
+    :return: Exit code.
+    :rtype: int
+    """
+    corpus = (
+        Corpus.open(arguments.corpus)
+        if getattr(arguments, "corpus", None)
+        else Corpus.find(Path.cwd())
+    )
+    if arguments.confirm != arguments.run:
+        raise ValueError("Refusing to delete extraction run without an exact --confirm match.")
+    reference = parse_extraction_run_reference(arguments.run)
+    corpus.delete_extraction_run(extractor_id=reference.extractor_id, run_id=reference.run_id)
+    print(json.dumps({"deleted": True, "run": arguments.run}, indent=2))
+    return 0
+
+
 def cmd_query(arguments: argparse.Namespace) -> int:
     """
     Execute a retrieval query.
@@ -385,10 +456,62 @@ def cmd_query(arguments: argparse.Namespace) -> int:
     query_text = arguments.query if arguments.query is not None else sys.stdin.read()
     budget = _budget_from_args(arguments)
     result = backend.query(corpus, run=run, query_text=query_text, budget=budget)
+    processed_evidence = result.evidence
+    if getattr(arguments, "reranker_id", None):
+        processed_evidence = apply_evidence_reranker(
+            reranker_id=arguments.reranker_id,
+            query_text=result.query_text,
+            evidence=processed_evidence,
+        )
+    if getattr(arguments, "minimum_score", None) is not None:
+        processed_evidence = apply_evidence_filter(
+            filter_id="filter-minimum-score",
+            query_text=result.query_text,
+            evidence=processed_evidence,
+            config={"minimum_score": float(arguments.minimum_score)},
+        )
+    if processed_evidence is not result.evidence:
+        result = result.model_copy(update={"evidence": processed_evidence})
     print(result.model_dump_json(indent=2))
     return 0
 
 
+def cmd_context_pack_build(arguments: argparse.Namespace) -> int:
+    """
+    Build a context pack from a retrieval result.
+
+    The retrieval result is read from standard input as JavaScript Object Notation.
+
+    :param arguments: Parsed command-line interface arguments.
+    :type arguments: argparse.Namespace
+    :return: Exit code.
+    :rtype: int
+    """
+    input_text = sys.stdin.read()
+    if not input_text.strip():
+        raise ValueError("Context pack build requires a retrieval result JavaScript Object Notation on standard input")
+    retrieval_result = RetrievalResult.model_validate_json(input_text)
+    join_with = bytes(arguments.join_with, "utf-8").decode("unicode_escape")
+    policy = ContextPackPolicy(join_with=join_with)
+    context_pack = build_context_pack(retrieval_result, policy=policy)
+    if arguments.max_tokens is not None:
+        context_pack = fit_context_pack_to_token_budget(
+            context_pack,
+            policy=policy,
+            token_budget=TokenBudget(max_tokens=int(arguments.max_tokens)),
+        )
+    print(
+        json.dumps(
+            {
+                "policy": policy.model_dump(),
+                "context_pack": context_pack.model_dump(),
+            },
+            indent=2,
+        )
+    )
+    return 0
+
+
 def cmd_eval(arguments: argparse.Namespace) -> int:
     """
     Evaluate a retrieval run against a dataset.
@@ -414,6 +537,32 @@ def cmd_eval(arguments: argparse.Namespace) -> int:
     return 0
 
 
+def cmd_crawl(arguments: argparse.Namespace) -> int:
+    """
+    Crawl a website prefix into a corpus.
+
+    :param arguments: Parsed command-line interface arguments.
+    :type arguments: argparse.Namespace
+    :return: Exit code.
+    :rtype: int
+    """
+    corpus = (
+        Corpus.open(arguments.corpus)
+        if getattr(arguments, "corpus", None)
+        else Corpus.find(Path.cwd())
+    )
+    tags = _parse_tags(arguments.tags, arguments.tag)
+    request = CrawlRequest(
+        root_url=arguments.root_url,
+        allowed_prefix=arguments.allowed_prefix,
+        max_items=arguments.max_items,
+        tags=tags,
+    )
+    result = crawl_into_corpus(corpus=corpus, request=request)
+    print(result.model_dump_json(indent=2))
+    return 0
+
+
 def build_parser() -> argparse.ArgumentParser:
     """
     Build the command-line interface argument parser.
@@ -511,16 +660,53 @@ def build_parser() -> argparse.ArgumentParser:
     )
     p_build.set_defaults(func=cmd_build)
 
-    p_extract = sub.add_parser("extract", help="Build a text extraction run for the corpus.")
-    _add_common_corpus_arg(p_extract)
-    p_extract.add_argument("--recipe-name", default="default", help="Human-readable recipe name.")
-    p_extract.add_argument(
+    p_extract = sub.add_parser("extract", help="Work with text extraction runs for the corpus.")
+    extract_sub = p_extract.add_subparsers(dest="extract_command", required=True)
+
+    p_extract_build = extract_sub.add_parser("build", help="Build a text extraction run.")
+    _add_common_corpus_arg(p_extract_build)
+    p_extract_build.add_argument(
+        "--recipe-name", default="default", help="Human-readable recipe name."
+    )
+    p_extract_build.add_argument(
         "--step",
         action="append",
         default=None,
         help="Pipeline step spec in the form extractor_id or extractor_id:key=value,key=value (repeatable).",
     )
-    p_extract.set_defaults(func=cmd_extract)
+    p_extract_build.set_defaults(func=cmd_extract_build)
+
+    p_extract_list = extract_sub.add_parser("list", help="List extraction runs.")
+    _add_common_corpus_arg(p_extract_list)
+    p_extract_list.add_argument(
+        "--extractor-id",
+        default=None,
+        help="Optional extractor identifier filter (for example: pipeline).",
+    )
+    p_extract_list.set_defaults(func=cmd_extract_list)
+
+    p_extract_show = extract_sub.add_parser("show", help="Show an extraction run manifest.")
+    _add_common_corpus_arg(p_extract_show)
+    p_extract_show.add_argument(
+        "--run",
+        required=True,
+        help="Extraction run reference in the form extractor_id:run_id.",
+    )
+    p_extract_show.set_defaults(func=cmd_extract_show)
+
+    p_extract_delete = extract_sub.add_parser("delete", help="Delete an extraction run directory.")
+    _add_common_corpus_arg(p_extract_delete)
+    p_extract_delete.add_argument(
+        "--run",
+        required=True,
+        help="Extraction run reference in the form extractor_id:run_id.",
+    )
+    p_extract_delete.add_argument(
+        "--confirm",
+        required=True,
+        help="Type the exact extractor_id:run_id to confirm deletion.",
+    )
+    p_extract_delete.set_defaults(func=cmd_extract_delete)
 
     p_query = sub.add_parser("query", help="Run a retrieval query.")
     _add_common_corpus_arg(p_query)
@@ -530,8 +716,38 @@ def build_parser() -> argparse.ArgumentParser:
     p_query.add_argument("--max-total-items", type=int, default=5)
     p_query.add_argument("--max-total-characters", type=int, default=2000)
     p_query.add_argument("--max-items-per-source", type=int, default=5)
+    p_query.add_argument(
+        "--reranker-id",
+        default=None,
+        help="Optional reranker identifier to apply after retrieval (for example: rerank-longest-text).",
+    )
+    p_query.add_argument(
+        "--minimum-score",
+        type=float,
+        default=None,
+        help="Optional minimum score threshold to filter evidence after retrieval.",
+    )
     p_query.set_defaults(func=cmd_query)
 
+    p_context_pack = sub.add_parser("context-pack", help="Build context pack text from evidence.")
+    context_pack_sub = p_context_pack.add_subparsers(dest="context_pack_command", required=True)
+
+    p_context_pack_build = context_pack_sub.add_parser(
+        "build", help="Build a context pack from a retrieval result JavaScript Object Notation."
+    )
+    p_context_pack_build.add_argument(
+        "--join-with",
+        default="\\n\\n",
+        help="Separator between evidence blocks (escape sequences supported, default is two newlines).",
+    )
+    p_context_pack_build.add_argument(
+        "--max-tokens",
+        default=None,
+        type=int,
+        help="Optional token budget for the final context pack using the naive-whitespace tokenizer.",
+    )
+    p_context_pack_build.set_defaults(func=cmd_context_pack_build)
+
     p_eval = sub.add_parser("eval", help="Evaluate a run against a dataset.")
     _add_common_corpus_arg(p_eval)
     p_eval.add_argument("--run", default=None, help="Run identifier (defaults to latest run).")
@@ -545,6 +761,19 @@ def build_parser() -> argparse.ArgumentParser:
     p_eval.add_argument("--max-items-per-source", type=int, default=5)
     p_eval.set_defaults(func=cmd_eval)
 
+    p_crawl = sub.add_parser("crawl", help="Crawl a website prefix into the corpus.")
+    _add_common_corpus_arg(p_crawl)
+    p_crawl.add_argument("--root-url", required=True, help="Root uniform resource locator to fetch.")
+    p_crawl.add_argument(
+        "--allowed-prefix",
+        required=True,
+        help="Uniform resource locator prefix that limits which links are eligible for crawl.",
+    )
+    p_crawl.add_argument("--max-items", type=int, default=50, help="Maximum number of items to store.")
+    p_crawl.add_argument("--tags", default=None, help="Comma-separated tags to apply to stored items.")
+    p_crawl.add_argument("--tag", action="append", help="Repeatable tag to apply to stored items.")
+    p_crawl.set_defaults(func=cmd_crawl)
+
     return parser
 
 
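The extract command is now a subcommand group (build, list, show, delete), with context-pack build and crawl joining it at the top level; every subparser binds its handler through set_defaults(func=...). A minimal sketch of driving the new parser programmatically, assuming a corpus is discoverable from the working directory (the extractor identifier is illustrative):

    from biblicus.cli import build_parser

    # Parse arguments exactly as the console entry point would.
    parser = build_parser()
    arguments = parser.parse_args(["extract", "list", "--extractor-id", "pipeline"])

    # Dispatch to cmd_extract_list via the func default; returns an exit code.
    exit_code = arguments.func(arguments)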
biblicus/context.py ADDED
@@ -0,0 +1,183 @@
+"""
+Context pack building for Biblicus.
+
+A context pack is the text that your application sends to a large language model.
+Biblicus produces a context pack from structured retrieval results so that evidence remains a
+stable contract while context formatting remains an explicit policy surface.
+"""
+
+from __future__ import annotations
+
+from typing import List, Optional
+
+from pydantic import BaseModel, ConfigDict, Field
+
+from .models import RetrievalResult
+
+
+class ContextPackPolicy(BaseModel):
+    """
+    Policy that controls how evidence becomes context pack text.
+
+    :ivar join_with: Separator inserted between evidence text blocks.
+    :vartype join_with: str
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    join_with: str = Field(default="\n\n")
+
+
+class ContextPack(BaseModel):
+    """
+    Context pack derived from retrieval evidence.
+
+    :ivar text: Context pack text suitable for inclusion in a model call.
+    :vartype text: str
+    :ivar evidence_count: Number of evidence blocks included in the context pack.
+    :vartype evidence_count: int
+    :ivar blocks: Structured blocks that produced the context pack.
+    :vartype blocks: list[ContextPackBlock]
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    text: str
+    evidence_count: int = Field(ge=0)
+    blocks: List["ContextPackBlock"] = Field(default_factory=list)
+
+
+class ContextPackBlock(BaseModel):
+    """
+    A single context pack block derived from one evidence item.
+
+    :ivar evidence_item_id: Item identifier that produced this block.
+    :vartype evidence_item_id: str
+    :ivar text: Text included in this block.
+    :vartype text: str
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    evidence_item_id: str = Field(min_length=1)
+    text: str = Field(min_length=1)
+
+
+class TokenCounter(BaseModel):
+    """
+    Token counter configuration for token budget fitting.
+
+    This is a lightweight model wrapper so token fitting remains explicit and testable even when
+    the underlying tokenizer is provided by an optional dependency.
+
+    :ivar tokenizer_id: Tokenizer identifier (for example, naive-whitespace).
+    :vartype tokenizer_id: str
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    tokenizer_id: str = Field(default="naive-whitespace", min_length=1)
+
+
+class TokenBudget(BaseModel):
+    """
+    Token budget for a context pack.
+
+    :ivar max_tokens: Maximum tokens permitted for the final context pack text.
+    :vartype max_tokens: int
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    max_tokens: int = Field(ge=1)
+
+
+def build_context_pack(result: RetrievalResult, *, policy: ContextPackPolicy) -> ContextPack:
+    """
+    Build a context pack from a retrieval result using an explicit policy.
+
+    :param result: Retrieval result containing ranked evidence.
+    :type result: RetrievalResult
+    :param policy: Policy controlling how evidence text is joined.
+    :type policy: ContextPackPolicy
+    :return: Context pack containing concatenated evidence text.
+    :rtype: ContextPack
+    """
+    selected_blocks: List[ContextPackBlock] = []
+    for evidence in result.evidence:
+        if not isinstance(evidence.text, str):
+            continue
+        trimmed_text = evidence.text.strip()
+        if not trimmed_text:
+            continue
+        selected_blocks.append(
+            ContextPackBlock(evidence_item_id=evidence.item_id, text=trimmed_text)
+        )
+
+    return ContextPack(
+        text=policy.join_with.join([block.text for block in selected_blocks]),
+        evidence_count=len(selected_blocks),
+        blocks=selected_blocks,
+    )
+
+
+def count_tokens(text: str, *, tokenizer_id: str) -> int:
+    """
+    Count tokens in a text using a tokenizer identifier.
+
+    The default tokenizer is naive-whitespace, which counts whitespace-separated tokens.
+
+    :param text: Text payload to count.
+    :type text: str
+    :param tokenizer_id: Tokenizer identifier.
+    :type tokenizer_id: str
+    :return: Token count.
+    :rtype: int
+    :raises KeyError: If the tokenizer identifier is unknown.
+    """
+    tokenizers = {
+        "naive-whitespace": lambda value: len([token for token in value.split() if token]),
+    }
+    tokenizer = tokenizers[tokenizer_id]
+    return int(tokenizer(text))
+
+
+def fit_context_pack_to_token_budget(
+    context_pack: ContextPack,
+    *,
+    policy: ContextPackPolicy,
+    token_budget: TokenBudget,
+    token_counter: Optional[TokenCounter] = None,
+) -> ContextPack:
+    """
+    Fit a context pack to a token budget by dropping trailing blocks.
+
+    This function is deterministic. It never rewrites block text. It only removes blocks from the
+    end of the block list until the token budget is met.
+
+    :param context_pack: Context pack to fit.
+    :type context_pack: ContextPack
+    :param policy: Policy controlling how blocks are joined into text.
+    :type policy: ContextPackPolicy
+    :param token_budget: Token budget to enforce.
+    :type token_budget: TokenBudget
+    :param token_counter: Optional token counter configuration.
+    :type token_counter: TokenCounter or None
+    :return: Fitted context pack.
+    :rtype: ContextPack
+    """
+    token_counter = token_counter or TokenCounter()
+    remaining_blocks: List[ContextPackBlock] = list(context_pack.blocks)
+
+    while remaining_blocks:
+        candidate_text = policy.join_with.join([block.text for block in remaining_blocks])
+        candidate_tokens = count_tokens(candidate_text, tokenizer_id=token_counter.tokenizer_id)
+        if candidate_tokens <= token_budget.max_tokens:
+            return ContextPack(
+                text=candidate_text,
+                evidence_count=len(remaining_blocks),
+                blocks=remaining_blocks,
+            )
+        remaining_blocks = remaining_blocks[:-1]
+
+    return ContextPack(text="", evidence_count=0, blocks=[])
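fit_context_pack_to_token_budget never rewrites block text; it drops whole trailing blocks until the joined text fits the budget under the naive-whitespace tokenizer. A minimal sketch of that behavior using the models above (block texts and identifiers are illustrative):

    from biblicus.context import (
        ContextPack,
        ContextPackBlock,
        ContextPackPolicy,
        TokenBudget,
        fit_context_pack_to_token_budget,
    )

    policy = ContextPackPolicy(join_with="\n\n")
    blocks = [
        ContextPackBlock(evidence_item_id="item-1", text="First evidence block."),
        ContextPackBlock(evidence_item_id="item-2", text="Second evidence block with more words."),
    ]
    pack = ContextPack(
        text=policy.join_with.join(block.text for block in blocks),
        evidence_count=len(blocks),
        blocks=blocks,
    )

    # Both blocks together count 9 whitespace tokens; a budget of 3 keeps only the first.
    fitted = fit_context_pack_to_token_budget(
        pack, policy=policy, token_budget=TokenBudget(max_tokens=3)
    )
    assert fitted.evidence_count == 1
    assert fitted.text == "First evidence block."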
biblicus/corpus.py CHANGED
@@ -27,7 +27,14 @@ from .frontmatter import parse_front_matter, render_front_matter
 from .hook_manager import HookManager
 from .hooks import HookPoint
 from .ignore import load_corpus_ignore_spec
-from .models import CatalogItem, CorpusCatalog, CorpusConfig, IngestResult, RetrievalRun
+from .models import (
+    CatalogItem,
+    CorpusCatalog,
+    CorpusConfig,
+    ExtractionRunListEntry,
+    IngestResult,
+    RetrievalRun,
+)
 from .sources import load_source
 from .time import utc_now_iso
 from .uris import corpus_ref_to_path, normalize_corpus_uri
@@ -567,6 +574,96 @@ class Corpus:
             return None
         return path.read_text(encoding="utf-8")
 
+    def load_extraction_run_manifest(self, *, extractor_id: str, run_id: str):
+        """
+        Load an extraction run manifest from the corpus.
+
+        :param extractor_id: Extractor plugin identifier.
+        :type extractor_id: str
+        :param run_id: Extraction run identifier.
+        :type run_id: str
+        :return: Parsed extraction run manifest.
+        :rtype: biblicus.extraction.ExtractionRunManifest
+        :raises FileNotFoundError: If the manifest file does not exist.
+        :raises ValueError: If the manifest data is invalid.
+        """
+        from .extraction import ExtractionRunManifest
+
+        manifest_path = (
+            self.extraction_run_dir(extractor_id=extractor_id, run_id=run_id) / "manifest.json"
+        )
+        if not manifest_path.is_file():
+            raise FileNotFoundError(f"Missing extraction run manifest: {manifest_path}")
+        data = json.loads(manifest_path.read_text(encoding="utf-8"))
+        return ExtractionRunManifest.model_validate(data)
+
+    def list_extraction_runs(self, *, extractor_id: Optional[str] = None) -> List[ExtractionRunListEntry]:
+        """
+        List extraction runs stored under the corpus.
+
+        :param extractor_id: Optional extractor identifier filter.
+        :type extractor_id: str or None
+        :return: Summary list entries for each run.
+        :rtype: list[biblicus.models.ExtractionRunListEntry]
+        """
+        runs_root = self.extraction_runs_dir
+        if not runs_root.is_dir():
+            return []
+
+        extractor_dirs: List[Path]
+        if extractor_id is None:
+            extractor_dirs = [path for path in sorted(runs_root.iterdir()) if path.is_dir()]
+        else:
+            extractor_path = runs_root / extractor_id
+            extractor_dirs = [extractor_path] if extractor_path.is_dir() else []
+
+        entries: List[ExtractionRunListEntry] = []
+        for extractor_dir in extractor_dirs:
+            for run_dir in sorted(extractor_dir.iterdir()):
+                if not run_dir.is_dir():
+                    continue
+                manifest_path = run_dir / "manifest.json"
+                if not manifest_path.is_file():
+                    continue
+                try:
+                    manifest = self.load_extraction_run_manifest(
+                        extractor_id=extractor_dir.name,
+                        run_id=run_dir.name,
+                    )
+                except (FileNotFoundError, ValueError):
+                    continue
+                entries.append(
+                    ExtractionRunListEntry(
+                        extractor_id=extractor_dir.name,
+                        run_id=run_dir.name,
+                        recipe_id=manifest.recipe.recipe_id,
+                        recipe_name=manifest.recipe.name,
+                        catalog_generated_at=manifest.catalog_generated_at,
+                        created_at=manifest.created_at,
+                        stats=dict(manifest.stats),
+                    )
+                )
+
+        entries.sort(key=lambda entry: (entry.created_at, entry.extractor_id, entry.run_id), reverse=True)
+        return entries
+
+    def delete_extraction_run(self, *, extractor_id: str, run_id: str) -> None:
+        """
+        Delete an extraction run directory and its derived artifacts.
+
+        :param extractor_id: Extractor plugin identifier.
+        :type extractor_id: str
+        :param run_id: Extraction run identifier.
+        :type run_id: str
+        :return: None.
+        :rtype: None
+        :raises FileNotFoundError: If the extraction run directory does not exist.
+        """
+        run_dir = self.extraction_run_dir(extractor_id=extractor_id, run_id=run_id)
+        if not run_dir.is_dir():
+            raise FileNotFoundError(f"Missing extraction run directory: {run_dir}")
+        shutil.rmtree(run_dir)
+
     def _ensure_runs_dir(self) -> None:
         """
         Ensure the retrieval runs directory exists.
@@ -1185,6 +1282,78 @@ class Corpus:
             raise KeyError(f"Unknown item identifier: {item_id}")
         return item
 
+    def create_crawl_id(self) -> str:
+        """
+        Create a new crawl identifier.
+
+        :return: Crawl identifier.
+        :rtype: str
+        """
+        return str(uuid.uuid4())
+
+    def ingest_crawled_payload(
+        self,
+        *,
+        crawl_id: str,
+        relative_path: str,
+        data: bytes,
+        filename: str,
+        media_type: str,
+        source_uri: str,
+        tags: Sequence[str],
+    ) -> None:
+        """
+        Ingest a crawled payload under a crawl import namespace.
+
+        :param crawl_id: Crawl identifier used to group crawled artifacts.
+        :type crawl_id: str
+        :param relative_path: Relative path within the crawl prefix.
+        :type relative_path: str
+        :param data: Raw payload bytes.
+        :type data: bytes
+        :param filename: Suggested filename from the payload metadata.
+        :type filename: str
+        :param media_type: Internet Assigned Numbers Authority media type.
+        :type media_type: str
+        :param source_uri: Source uniform resource identifier (typically an http or https uniform resource locator).
+        :type source_uri: str
+        :param tags: Tags to attach to the stored item.
+        :type tags: Sequence[str]
+        :return: None.
+        :rtype: None
+        """
+        _ = filename
+        item_id = str(uuid.uuid4())
+        destination_relpath = str(Path(DEFAULT_RAW_DIR) / "imports" / "crawl" / crawl_id / relative_path)
+        destination_path = (self.root / destination_relpath).resolve()
+        destination_path.parent.mkdir(parents=True, exist_ok=True)
+        destination_path.write_bytes(data)
+
+        sha256_digest = _sha256_bytes(data)
+
+        sidecar: Dict[str, Any] = {}
+        sidecar["tags"] = [t.strip() for t in tags if isinstance(t, str) and t.strip()]
+        sidecar["media_type"] = media_type
+        sidecar["biblicus"] = {"id": item_id, "source": source_uri}
+        _write_sidecar(destination_path, sidecar)
+
+        merged_metadata = _merge_metadata({}, sidecar)
+        resolved_tags = _merge_tags([], merged_metadata.get("tags"))
+
+        item_record = CatalogItem(
+            id=item_id,
+            relpath=destination_relpath,
+            sha256=sha256_digest,
+            bytes=len(data),
+            media_type=media_type,
+            title=None,
+            tags=list(resolved_tags),
+            metadata=dict(merged_metadata or {}),
+            created_at=utc_now_iso(),
+            source_uri=source_uri,
+        )
+        self._upsert_catalog_item(item_record)
+
     def reindex(self) -> Dict[str, int]:
         """
         Rebuild/refresh the corpus catalog from the current on-disk corpus contents.
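On the Corpus side, extraction runs gain a list/load/delete lifecycle, with entries sorted newest first. A minimal sketch, assuming a corpus is discoverable from the current working directory and at least one extraction run exists:

    from pathlib import Path

    from biblicus.corpus import Corpus

    corpus = Corpus.find(Path.cwd())

    # Entries are sorted by created_at in descending order.
    for entry in corpus.list_extraction_runs(extractor_id="pipeline"):
        print(entry.extractor_id, entry.run_id, entry.created_at)

    # Load the manifest of the most recent run and inspect its recipe.
    latest = corpus.list_extraction_runs()[0]
    manifest = corpus.load_extraction_run_manifest(
        extractor_id=latest.extractor_id, run_id=latest.run_id
    )
    print(manifest.recipe.name)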