biblicus 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +1 -1
- biblicus/cli.py +236 -7
- biblicus/context.py +183 -0
- biblicus/corpus.py +170 -1
- biblicus/crawl.py +186 -0
- biblicus/evidence_processing.py +201 -0
- biblicus/extraction.py +4 -2
- biblicus/models.py +31 -0
- biblicus/time.py +1 -1
- {biblicus-0.3.0.dist-info → biblicus-0.5.0.dist-info}/METADATA +273 -112
- {biblicus-0.3.0.dist-info → biblicus-0.5.0.dist-info}/RECORD +15 -12
- {biblicus-0.3.0.dist-info → biblicus-0.5.0.dist-info}/WHEEL +0 -0
- {biblicus-0.3.0.dist-info → biblicus-0.5.0.dist-info}/entry_points.txt +0 -0
- {biblicus-0.3.0.dist-info → biblicus-0.5.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-0.3.0.dist-info → biblicus-0.5.0.dist-info}/top_level.txt +0 -0
biblicus/__init__.py
CHANGED
biblicus/cli.py
CHANGED
@@ -13,11 +13,19 @@ from typing import Dict, List, Optional
 from pydantic import ValidationError

 from .backends import get_backend
+from .context import (
+    ContextPackPolicy,
+    TokenBudget,
+    build_context_pack,
+    fit_context_pack_to_token_budget,
+)
 from .corpus import Corpus
+from .crawl import CrawlRequest, crawl_into_corpus
 from .errors import ExtractionRunFatalError
 from .evaluation import evaluate_run, load_dataset
+from .evidence_processing import apply_evidence_filter, apply_evidence_reranker
 from .extraction import build_extraction_run
-from .models import QueryBudget
+from .models import QueryBudget, RetrievalResult, parse_extraction_run_reference
 from .uris import corpus_ref_to_path

@@ -327,7 +335,7 @@ def cmd_build(arguments: argparse.Namespace) -> int:
     return 0


-def cmd_extract(arguments: argparse.Namespace) -> int:
+def cmd_extract_build(arguments: argparse.Namespace) -> int:
     """
     Build a text extraction run for the corpus using a pipeline of extractors.

@@ -359,6 +367,69 @@ def cmd_extract(arguments: argparse.Namespace) -> int:
     return 0


+def cmd_extract_list(arguments: argparse.Namespace) -> int:
+    """
+    List extraction runs stored under the corpus.
+
+    :param arguments: Parsed command-line interface arguments.
+    :type arguments: argparse.Namespace
+    :return: Exit code.
+    :rtype: int
+    """
+    corpus = (
+        Corpus.open(arguments.corpus)
+        if getattr(arguments, "corpus", None)
+        else Corpus.find(Path.cwd())
+    )
+    runs = corpus.list_extraction_runs(extractor_id=arguments.extractor_id)
+    print(json.dumps([entry.model_dump() for entry in runs], indent=2))
+    return 0
+
+
+def cmd_extract_show(arguments: argparse.Namespace) -> int:
+    """
+    Show an extraction run manifest.
+
+    :param arguments: Parsed command-line interface arguments.
+    :type arguments: argparse.Namespace
+    :return: Exit code.
+    :rtype: int
+    """
+    corpus = (
+        Corpus.open(arguments.corpus)
+        if getattr(arguments, "corpus", None)
+        else Corpus.find(Path.cwd())
+    )
+    reference = parse_extraction_run_reference(arguments.run)
+    manifest = corpus.load_extraction_run_manifest(
+        extractor_id=reference.extractor_id, run_id=reference.run_id
+    )
+    print(manifest.model_dump_json(indent=2))
+    return 0
+
+
+def cmd_extract_delete(arguments: argparse.Namespace) -> int:
+    """
+    Delete an extraction run directory and its derived artifacts.
+
+    :param arguments: Parsed command-line interface arguments.
+    :type arguments: argparse.Namespace
+    :return: Exit code.
+    :rtype: int
+    """
+    corpus = (
+        Corpus.open(arguments.corpus)
+        if getattr(arguments, "corpus", None)
+        else Corpus.find(Path.cwd())
+    )
+    if arguments.confirm != arguments.run:
+        raise ValueError("Refusing to delete extraction run without an exact --confirm match.")
+    reference = parse_extraction_run_reference(arguments.run)
+    corpus.delete_extraction_run(extractor_id=reference.extractor_id, run_id=reference.run_id)
+    print(json.dumps({"deleted": True, "run": arguments.run}, indent=2))
+    return 0
+
+
 def cmd_query(arguments: argparse.Namespace) -> int:
     """
     Execute a retrieval query.
@@ -385,10 +456,62 @@ def cmd_query(arguments: argparse.Namespace) -> int:
     query_text = arguments.query if arguments.query is not None else sys.stdin.read()
     budget = _budget_from_args(arguments)
     result = backend.query(corpus, run=run, query_text=query_text, budget=budget)
+    processed_evidence = result.evidence
+    if getattr(arguments, "reranker_id", None):
+        processed_evidence = apply_evidence_reranker(
+            reranker_id=arguments.reranker_id,
+            query_text=result.query_text,
+            evidence=processed_evidence,
+        )
+    if getattr(arguments, "minimum_score", None) is not None:
+        processed_evidence = apply_evidence_filter(
+            filter_id="filter-minimum-score",
+            query_text=result.query_text,
+            evidence=processed_evidence,
+            config={"minimum_score": float(arguments.minimum_score)},
+        )
+    if processed_evidence is not result.evidence:
+        result = result.model_copy(update={"evidence": processed_evidence})
     print(result.model_dump_json(indent=2))
     return 0


+def cmd_context_pack_build(arguments: argparse.Namespace) -> int:
+    """
+    Build a context pack from a retrieval result.
+
+    The retrieval result is read from standard input as JavaScript Object Notation.
+
+    :param arguments: Parsed command-line interface arguments.
+    :type arguments: argparse.Namespace
+    :return: Exit code.
+    :rtype: int
+    """
+    input_text = sys.stdin.read()
+    if not input_text.strip():
+        raise ValueError("Context pack build requires a retrieval result JavaScript Object Notation on standard input")
+    retrieval_result = RetrievalResult.model_validate_json(input_text)
+    join_with = bytes(arguments.join_with, "utf-8").decode("unicode_escape")
+    policy = ContextPackPolicy(join_with=join_with)
+    context_pack = build_context_pack(retrieval_result, policy=policy)
+    if arguments.max_tokens is not None:
+        context_pack = fit_context_pack_to_token_budget(
+            context_pack,
+            policy=policy,
+            token_budget=TokenBudget(max_tokens=int(arguments.max_tokens)),
+        )
+    print(
+        json.dumps(
+            {
+                "policy": policy.model_dump(),
+                "context_pack": context_pack.model_dump(),
+            },
+            indent=2,
+        )
+    )
+    return 0
+
+
 def cmd_eval(arguments: argparse.Namespace) -> int:
     """
     Evaluate a retrieval run against a dataset.
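The same post-processing chain is usable outside the command-line interface. A minimal sketch, assuming a RetrievalResult named result obtained from a backend query as in the hunk above; the reranker identifier comes from the --reranker-id help text, and the 0.5 threshold is an illustrative value:

from biblicus.evidence_processing import apply_evidence_filter, apply_evidence_reranker

# Rerank the retrieved evidence, then drop low-scoring items.
evidence = apply_evidence_reranker(
    reranker_id="rerank-longest-text",
    query_text=result.query_text,
    evidence=result.evidence,
)
evidence = apply_evidence_filter(
    filter_id="filter-minimum-score",
    query_text=result.query_text,
    evidence=evidence,
    config={"minimum_score": 0.5},
)
result = result.model_copy(update={"evidence": evidence})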
@@ -414,6 +537,32 @@ def cmd_eval(arguments: argparse.Namespace) -> int:
     return 0


+def cmd_crawl(arguments: argparse.Namespace) -> int:
+    """
+    Crawl a website prefix into a corpus.
+
+    :param arguments: Parsed command-line interface arguments.
+    :type arguments: argparse.Namespace
+    :return: Exit code.
+    :rtype: int
+    """
+    corpus = (
+        Corpus.open(arguments.corpus)
+        if getattr(arguments, "corpus", None)
+        else Corpus.find(Path.cwd())
+    )
+    tags = _parse_tags(arguments.tags, arguments.tag)
+    request = CrawlRequest(
+        root_url=arguments.root_url,
+        allowed_prefix=arguments.allowed_prefix,
+        max_items=arguments.max_items,
+        tags=tags,
+    )
+    result = crawl_into_corpus(corpus=corpus, request=request)
+    print(result.model_dump_json(indent=2))
+    return 0
+
+
 def build_parser() -> argparse.ArgumentParser:
     """
     Build the command-line interface argument parser.
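The crawl path is also reachable programmatically without argparse. A minimal sketch, assuming an existing corpus; the corpus path and the URLs are illustrative placeholders:

from biblicus.corpus import Corpus
from biblicus.crawl import CrawlRequest, crawl_into_corpus

corpus = Corpus.open("./my-corpus")
# Only links under allowed_prefix are eligible; at most max_items are stored.
request = CrawlRequest(
    root_url="https://example.com/docs/",
    allowed_prefix="https://example.com/docs/",
    max_items=50,
    tags=["docs"],
)
result = crawl_into_corpus(corpus=corpus, request=request)
print(result.model_dump_json(indent=2))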
@@ -511,16 +660,53 @@ def build_parser() -> argparse.ArgumentParser:
     )
     p_build.set_defaults(func=cmd_build)

-    p_extract = sub.add_parser("extract", help="
-
-
-
+    p_extract = sub.add_parser("extract", help="Work with text extraction runs for the corpus.")
+    extract_sub = p_extract.add_subparsers(dest="extract_command", required=True)
+
+    p_extract_build = extract_sub.add_parser("build", help="Build a text extraction run.")
+    _add_common_corpus_arg(p_extract_build)
+    p_extract_build.add_argument(
+        "--recipe-name", default="default", help="Human-readable recipe name."
+    )
+    p_extract_build.add_argument(
         "--step",
         action="append",
         default=None,
         help="Pipeline step spec in the form extractor_id or extractor_id:key=value,key=value (repeatable).",
     )
-
+    p_extract_build.set_defaults(func=cmd_extract_build)
+
+    p_extract_list = extract_sub.add_parser("list", help="List extraction runs.")
+    _add_common_corpus_arg(p_extract_list)
+    p_extract_list.add_argument(
+        "--extractor-id",
+        default=None,
+        help="Optional extractor identifier filter (for example: pipeline).",
+    )
+    p_extract_list.set_defaults(func=cmd_extract_list)
+
+    p_extract_show = extract_sub.add_parser("show", help="Show an extraction run manifest.")
+    _add_common_corpus_arg(p_extract_show)
+    p_extract_show.add_argument(
+        "--run",
+        required=True,
+        help="Extraction run reference in the form extractor_id:run_id.",
+    )
+    p_extract_show.set_defaults(func=cmd_extract_show)
+
+    p_extract_delete = extract_sub.add_parser("delete", help="Delete an extraction run directory.")
+    _add_common_corpus_arg(p_extract_delete)
+    p_extract_delete.add_argument(
+        "--run",
+        required=True,
+        help="Extraction run reference in the form extractor_id:run_id.",
+    )
+    p_extract_delete.add_argument(
+        "--confirm",
+        required=True,
+        help="Type the exact extractor_id:run_id to confirm deletion.",
+    )
+    p_extract_delete.set_defaults(func=cmd_extract_delete)

     p_query = sub.add_parser("query", help="Run a retrieval query.")
     _add_common_corpus_arg(p_query)
@@ -530,8 +716,38 @@ def build_parser() -> argparse.ArgumentParser:
     p_query.add_argument("--max-total-items", type=int, default=5)
     p_query.add_argument("--max-total-characters", type=int, default=2000)
     p_query.add_argument("--max-items-per-source", type=int, default=5)
+    p_query.add_argument(
+        "--reranker-id",
+        default=None,
+        help="Optional reranker identifier to apply after retrieval (for example: rerank-longest-text).",
+    )
+    p_query.add_argument(
+        "--minimum-score",
+        type=float,
+        default=None,
+        help="Optional minimum score threshold to filter evidence after retrieval.",
+    )
     p_query.set_defaults(func=cmd_query)

+    p_context_pack = sub.add_parser("context-pack", help="Build context pack text from evidence.")
+    context_pack_sub = p_context_pack.add_subparsers(dest="context_pack_command", required=True)
+
+    p_context_pack_build = context_pack_sub.add_parser(
+        "build", help="Build a context pack from a retrieval result JavaScript Object Notation."
+    )
+    p_context_pack_build.add_argument(
+        "--join-with",
+        default="\\n\\n",
+        help="Separator between evidence blocks (escape sequences supported, default is two newlines).",
+    )
+    p_context_pack_build.add_argument(
+        "--max-tokens",
+        default=None,
+        type=int,
+        help="Optional token budget for the final context pack using the naive-whitespace tokenizer.",
+    )
+    p_context_pack_build.set_defaults(func=cmd_context_pack_build)
+
     p_eval = sub.add_parser("eval", help="Evaluate a run against a dataset.")
     _add_common_corpus_arg(p_eval)
     p_eval.add_argument("--run", default=None, help="Run identifier (defaults to latest run).")
@@ -545,6 +761,19 @@ def build_parser() -> argparse.ArgumentParser:
     p_eval.add_argument("--max-items-per-source", type=int, default=5)
     p_eval.set_defaults(func=cmd_eval)

+    p_crawl = sub.add_parser("crawl", help="Crawl a website prefix into the corpus.")
+    _add_common_corpus_arg(p_crawl)
+    p_crawl.add_argument("--root-url", required=True, help="Root uniform resource locator to fetch.")
+    p_crawl.add_argument(
+        "--allowed-prefix",
+        required=True,
+        help="Uniform resource locator prefix that limits which links are eligible for crawl.",
+    )
+    p_crawl.add_argument("--max-items", type=int, default=50, help="Maximum number of items to store.")
+    p_crawl.add_argument("--tags", default=None, help="Comma-separated tags to apply to stored items.")
+    p_crawl.add_argument("--tag", action="append", help="Repeatable tag to apply to stored items.")
+    p_crawl.set_defaults(func=cmd_crawl)
+
     return parser

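Taken together, the new wiring adds extract list/show/delete, context-pack build, crawl, and the query post-processing flags to the command-line interface. A minimal sketch of driving one of them through build_parser(), assuming the wheel is installed and a corpus is discoverable from the working directory:

from biblicus.cli import build_parser

parser = build_parser()
# Subcommand handlers are attached via set_defaults(func=...), so dispatch is one call.
arguments = parser.parse_args(["extract", "list", "--extractor-id", "pipeline"])
raise SystemExit(arguments.func(arguments))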
biblicus/context.py
ADDED
@@ -0,0 +1,183 @@
+"""
+Context pack building for Biblicus.
+
+A context pack is the text that your application sends to a large language model.
+Biblicus produces a context pack from structured retrieval results so that evidence remains a
+stable contract while context formatting remains an explicit policy surface.
+"""
+
+from __future__ import annotations
+
+from typing import List, Optional
+
+from pydantic import BaseModel, ConfigDict, Field
+
+from .models import RetrievalResult
+
+
+class ContextPackPolicy(BaseModel):
+    """
+    Policy that controls how evidence becomes context pack text.
+
+    :ivar join_with: Separator inserted between evidence text blocks.
+    :vartype join_with: str
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    join_with: str = Field(default="\n\n")
+
+
+class ContextPack(BaseModel):
+    """
+    Context pack derived from retrieval evidence.
+
+    :ivar text: Context pack text suitable for inclusion in a model call.
+    :vartype text: str
+    :ivar evidence_count: Number of evidence blocks included in the context pack.
+    :vartype evidence_count: int
+    :ivar blocks: Structured blocks that produced the context pack.
+    :vartype blocks: list[ContextPackBlock]
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    text: str
+    evidence_count: int = Field(ge=0)
+    blocks: List["ContextPackBlock"] = Field(default_factory=list)
+
+
+class ContextPackBlock(BaseModel):
+    """
+    A single context pack block derived from one evidence item.
+
+    :ivar evidence_item_id: Item identifier that produced this block.
+    :vartype evidence_item_id: str
+    :ivar text: Text included in this block.
+    :vartype text: str
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    evidence_item_id: str = Field(min_length=1)
+    text: str = Field(min_length=1)
+
+
+class TokenCounter(BaseModel):
+    """
+    Token counter configuration for token budget fitting.
+
+    This is a lightweight model wrapper so token fitting remains explicit and testable even when
+    the underlying tokenizer is provided by an optional dependency.
+
+    :ivar tokenizer_id: Tokenizer identifier (for example, naive-whitespace).
+    :vartype tokenizer_id: str
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    tokenizer_id: str = Field(default="naive-whitespace", min_length=1)
+
+
+class TokenBudget(BaseModel):
+    """
+    Token budget for a context pack.
+
+    :ivar max_tokens: Maximum tokens permitted for the final context pack text.
+    :vartype max_tokens: int
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    max_tokens: int = Field(ge=1)
+
+
+def build_context_pack(result: RetrievalResult, *, policy: ContextPackPolicy) -> ContextPack:
+    """
+    Build a context pack from a retrieval result using an explicit policy.
+
+    :param result: Retrieval result containing ranked evidence.
+    :type result: RetrievalResult
+    :param policy: Policy controlling how evidence text is joined.
+    :type policy: ContextPackPolicy
+    :return: Context pack containing concatenated evidence text.
+    :rtype: ContextPack
+    """
+    selected_blocks: List[ContextPackBlock] = []
+    for evidence in result.evidence:
+        if not isinstance(evidence.text, str):
+            continue
+        trimmed_text = evidence.text.strip()
+        if not trimmed_text:
+            continue
+        selected_blocks.append(
+            ContextPackBlock(evidence_item_id=evidence.item_id, text=trimmed_text)
+        )
+
+    return ContextPack(
+        text=policy.join_with.join([block.text for block in selected_blocks]),
+        evidence_count=len(selected_blocks),
+        blocks=selected_blocks,
+    )
+
+
+def count_tokens(text: str, *, tokenizer_id: str) -> int:
+    """
+    Count tokens in a text using a tokenizer identifier.
+
+    The default tokenizer is naive-whitespace, which counts whitespace-separated tokens.
+
+    :param text: Text payload to count.
+    :type text: str
+    :param tokenizer_id: Tokenizer identifier.
+    :type tokenizer_id: str
+    :return: Token count.
+    :rtype: int
+    :raises KeyError: If the tokenizer identifier is unknown.
+    """
+    tokenizers = {
+        "naive-whitespace": lambda value: len([token for token in value.split() if token]),
+    }
+    tokenizer = tokenizers[tokenizer_id]
+    return int(tokenizer(text))
+
+
+def fit_context_pack_to_token_budget(
+    context_pack: ContextPack,
+    *,
+    policy: ContextPackPolicy,
+    token_budget: TokenBudget,
+    token_counter: Optional[TokenCounter] = None,
+) -> ContextPack:
+    """
+    Fit a context pack to a token budget by dropping trailing blocks.
+
+    This function is deterministic. It never rewrites block text. It only removes blocks from the
+    end of the block list until the token budget is met.
+
+    :param context_pack: Context pack to fit.
+    :type context_pack: ContextPack
+    :param policy: Policy controlling how blocks are joined into text.
+    :type policy: ContextPackPolicy
+    :param token_budget: Token budget to enforce.
+    :type token_budget: TokenBudget
+    :param token_counter: Optional token counter configuration.
+    :type token_counter: TokenCounter or None
+    :return: Fitted context pack.
+    :rtype: ContextPack
+    """
+    token_counter = token_counter or TokenCounter()
+    remaining_blocks: List[ContextPackBlock] = list(context_pack.blocks)
+
+    while remaining_blocks:
+        candidate_text = policy.join_with.join([block.text for block in remaining_blocks])
+        candidate_tokens = count_tokens(candidate_text, tokenizer_id=token_counter.tokenizer_id)
+        if candidate_tokens <= token_budget.max_tokens:
+            return ContextPack(
+                text=candidate_text,
+                evidence_count=len(remaining_blocks),
+                blocks=remaining_blocks,
+            )
+        remaining_blocks = remaining_blocks[:-1]
+
+    return ContextPack(text="", evidence_count=0, blocks=[])
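A minimal end-to-end sketch of the new module, assuming a retrieval result JSON payload (for example, the output of biblicus query) is available on standard input:

import sys

from biblicus.context import (
    ContextPackPolicy,
    TokenBudget,
    build_context_pack,
    fit_context_pack_to_token_budget,
)
from biblicus.models import RetrievalResult

retrieval_result = RetrievalResult.model_validate_json(sys.stdin.read())
policy = ContextPackPolicy(join_with="\n\n")
context_pack = build_context_pack(retrieval_result, policy=policy)
# Enforce a 512-token ceiling with the default naive-whitespace counter;
# whole trailing blocks are dropped, block text is never rewritten.
context_pack = fit_context_pack_to_token_budget(
    context_pack,
    policy=policy,
    token_budget=TokenBudget(max_tokens=512),
)
print(context_pack.text)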
biblicus/corpus.py
CHANGED
@@ -27,7 +27,14 @@ from .frontmatter import parse_front_matter, render_front_matter
 from .hook_manager import HookManager
 from .hooks import HookPoint
 from .ignore import load_corpus_ignore_spec
-from .models import
+from .models import (
+    CatalogItem,
+    CorpusCatalog,
+    CorpusConfig,
+    ExtractionRunListEntry,
+    IngestResult,
+    RetrievalRun,
+)
 from .sources import load_source
 from .time import utc_now_iso
 from .uris import corpus_ref_to_path, normalize_corpus_uri
@@ -567,6 +574,96 @@ class Corpus:
             return None
         return path.read_text(encoding="utf-8")

+    def load_extraction_run_manifest(self, *, extractor_id: str, run_id: str):
+        """
+        Load an extraction run manifest from the corpus.
+
+        :param extractor_id: Extractor plugin identifier.
+        :type extractor_id: str
+        :param run_id: Extraction run identifier.
+        :type run_id: str
+        :return: Parsed extraction run manifest.
+        :rtype: biblicus.extraction.ExtractionRunManifest
+        :raises FileNotFoundError: If the manifest file does not exist.
+        :raises ValueError: If the manifest data is invalid.
+        """
+        from .extraction import ExtractionRunManifest
+
+        manifest_path = (
+            self.extraction_run_dir(extractor_id=extractor_id, run_id=run_id) / "manifest.json"
+        )
+        if not manifest_path.is_file():
+            raise FileNotFoundError(f"Missing extraction run manifest: {manifest_path}")
+        data = json.loads(manifest_path.read_text(encoding="utf-8"))
+        return ExtractionRunManifest.model_validate(data)
+
+    def list_extraction_runs(self, *, extractor_id: Optional[str] = None) -> List[ExtractionRunListEntry]:
+        """
+        List extraction runs stored under the corpus.
+
+        :param extractor_id: Optional extractor identifier filter.
+        :type extractor_id: str or None
+        :return: Summary list entries for each run.
+        :rtype: list[biblicus.models.ExtractionRunListEntry]
+        """
+        runs_root = self.extraction_runs_dir
+        if not runs_root.is_dir():
+            return []
+
+        extractor_dirs: List[Path]
+        if extractor_id is None:
+            extractor_dirs = [path for path in sorted(runs_root.iterdir()) if path.is_dir()]
+        else:
+            extractor_path = runs_root / extractor_id
+            extractor_dirs = [extractor_path] if extractor_path.is_dir() else []
+
+        entries: List[ExtractionRunListEntry] = []
+        for extractor_dir in extractor_dirs:
+            for run_dir in sorted(extractor_dir.iterdir()):
+                if not run_dir.is_dir():
+                    continue
+                manifest_path = run_dir / "manifest.json"
+                if not manifest_path.is_file():
+                    continue
+                try:
+                    manifest = self.load_extraction_run_manifest(
+                        extractor_id=extractor_dir.name,
+                        run_id=run_dir.name,
+                    )
+                except (FileNotFoundError, ValueError):
+                    continue
+                entries.append(
+                    ExtractionRunListEntry(
+                        extractor_id=extractor_dir.name,
+                        run_id=run_dir.name,
+                        recipe_id=manifest.recipe.recipe_id,
+                        recipe_name=manifest.recipe.name,
+                        catalog_generated_at=manifest.catalog_generated_at,
+                        created_at=manifest.created_at,
+                        stats=dict(manifest.stats),
+                    )
+                )
+
+        entries.sort(key=lambda entry: (entry.created_at, entry.extractor_id, entry.run_id), reverse=True)
+        return entries
+
+    def delete_extraction_run(self, *, extractor_id: str, run_id: str) -> None:
+        """
+        Delete an extraction run directory and its derived artifacts.
+
+        :param extractor_id: Extractor plugin identifier.
+        :type extractor_id: str
+        :param run_id: Extraction run identifier.
+        :type run_id: str
+        :return: None.
+        :rtype: None
+        :raises FileNotFoundError: If the extraction run directory does not exist.
+        """
+        run_dir = self.extraction_run_dir(extractor_id=extractor_id, run_id=run_id)
+        if not run_dir.is_dir():
+            raise FileNotFoundError(f"Missing extraction run directory: {run_dir}")
+        shutil.rmtree(run_dir)
+
     def _ensure_runs_dir(self) -> None:
         """
         Ensure the retrieval runs directory exists.
@@ -1185,6 +1282,78 @@ class Corpus:
             raise KeyError(f"Unknown item identifier: {item_id}")
         return item

+    def create_crawl_id(self) -> str:
+        """
+        Create a new crawl identifier.
+
+        :return: Crawl identifier.
+        :rtype: str
+        """
+        return str(uuid.uuid4())
+
+    def ingest_crawled_payload(
+        self,
+        *,
+        crawl_id: str,
+        relative_path: str,
+        data: bytes,
+        filename: str,
+        media_type: str,
+        source_uri: str,
+        tags: Sequence[str],
+    ) -> None:
+        """
+        Ingest a crawled payload under a crawl import namespace.
+
+        :param crawl_id: Crawl identifier used to group crawled artifacts.
+        :type crawl_id: str
+        :param relative_path: Relative path within the crawl prefix.
+        :type relative_path: str
+        :param data: Raw payload bytes.
+        :type data: bytes
+        :param filename: Suggested filename from the payload metadata.
+        :type filename: str
+        :param media_type: Internet Assigned Numbers Authority media type.
+        :type media_type: str
+        :param source_uri: Source uniform resource identifier (typically an http or https uniform resource locator).
+        :type source_uri: str
+        :param tags: Tags to attach to the stored item.
+        :type tags: Sequence[str]
+        :return: None.
+        :rtype: None
+        """
+        _ = filename
+        item_id = str(uuid.uuid4())
+        destination_relpath = str(Path(DEFAULT_RAW_DIR) / "imports" / "crawl" / crawl_id / relative_path)
+        destination_path = (self.root / destination_relpath).resolve()
+        destination_path.parent.mkdir(parents=True, exist_ok=True)
+        destination_path.write_bytes(data)
+
+        sha256_digest = _sha256_bytes(data)
+
+        sidecar: Dict[str, Any] = {}
+        sidecar["tags"] = [t.strip() for t in tags if isinstance(t, str) and t.strip()]
+        sidecar["media_type"] = media_type
+        sidecar["biblicus"] = {"id": item_id, "source": source_uri}
+        _write_sidecar(destination_path, sidecar)
+
+        merged_metadata = _merge_metadata({}, sidecar)
+        resolved_tags = _merge_tags([], merged_metadata.get("tags"))
+
+        item_record = CatalogItem(
+            id=item_id,
+            relpath=destination_relpath,
+            sha256=sha256_digest,
+            bytes=len(data),
+            media_type=media_type,
+            title=None,
+            tags=list(resolved_tags),
+            metadata=dict(merged_metadata or {}),
+            created_at=utc_now_iso(),
+            source_uri=source_uri,
+        )
+        self._upsert_catalog_item(item_record)
+
     def reindex(self) -> Dict[str, int]:
         """
         Rebuild/refresh the corpus catalog from the current on-disk corpus contents.
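A minimal sketch of the new Corpus extraction-run methods, assuming an existing corpus; the path and the run identifier are illustrative placeholders:

from biblicus.corpus import Corpus

corpus = Corpus.open("./my-corpus")
# Entries are sorted newest-first by created_at.
for entry in corpus.list_extraction_runs():
    print(entry.extractor_id, entry.run_id, entry.created_at)

# Deleting a run removes its directory and derived artifacts on disk.
corpus.delete_extraction_run(extractor_id="pipeline", run_id="some-run-id")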
|