biblicus-0.11.0-py3-none-any.whl → biblicus-0.13.0-py3-none-any.whl
- biblicus/__init__.py +1 -1
- biblicus/cli.py +99 -1
- biblicus/constants.py +1 -0
- biblicus/context.py +138 -4
- biblicus/extraction_evaluation.py +312 -0
- {biblicus-0.11.0.dist-info → biblicus-0.13.0.dist-info}/METADATA +6 -2
- {biblicus-0.11.0.dist-info → biblicus-0.13.0.dist-info}/RECORD +11 -10
- {biblicus-0.11.0.dist-info → biblicus-0.13.0.dist-info}/WHEEL +0 -0
- {biblicus-0.11.0.dist-info → biblicus-0.13.0.dist-info}/entry_points.txt +0 -0
- {biblicus-0.11.0.dist-info → biblicus-0.13.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-0.11.0.dist-info → biblicus-0.13.0.dist-info}/top_level.txt +0 -0
biblicus/__init__.py
CHANGED
biblicus/cli.py
CHANGED
@@ -15,9 +15,11 @@ from pydantic import ValidationError
 from .analysis import get_analysis_backend
 from .backends import get_backend
 from .context import (
+    CharacterBudget,
     ContextPackPolicy,
     TokenBudget,
     build_context_pack,
+    fit_context_pack_to_character_budget,
     fit_context_pack_to_token_budget,
 )
 from .corpus import Corpus
@@ -26,6 +28,11 @@ from .errors import ExtractionRunFatalError
 from .evaluation import evaluate_run, load_dataset
 from .evidence_processing import apply_evidence_filter, apply_evidence_reranker
 from .extraction import build_extraction_run
+from .extraction_evaluation import (
+    evaluate_extraction_run,
+    load_extraction_dataset,
+    write_extraction_evaluation_result,
+)
 from .models import QueryBudget, RetrievalResult, parse_extraction_run_reference
 from .uris import corpus_ref_to_path
 
@@ -504,6 +511,54 @@ def cmd_extract_delete(arguments: argparse.Namespace) -> int:
     return 0
 
 
+def cmd_extract_evaluate(arguments: argparse.Namespace) -> int:
+    """
+    Evaluate an extraction run against a dataset.
+
+    :param arguments: Parsed command-line interface arguments.
+    :type arguments: argparse.Namespace
+    :return: Exit code.
+    :rtype: int
+    """
+    corpus = (
+        Corpus.open(arguments.corpus)
+        if getattr(arguments, "corpus", None)
+        else Corpus.find(Path.cwd())
+    )
+    if arguments.run:
+        run_ref = parse_extraction_run_reference(arguments.run)
+    else:
+        run_ref = corpus.latest_extraction_run_reference()
+        if run_ref is None:
+            raise ValueError("Extraction evaluation requires an extraction run")
+        print(
+            "Warning: using latest extraction run; pass --run for reproducibility.",
+            file=sys.stderr,
+        )
+
+    dataset_path = Path(arguments.dataset)
+    if not dataset_path.is_file():
+        raise FileNotFoundError(f"Dataset file not found: {dataset_path}")
+    try:
+        dataset = load_extraction_dataset(dataset_path)
+    except ValidationError as exc:
+        raise ValueError(f"Invalid extraction dataset: {exc}") from exc
+
+    run = corpus.load_extraction_run_manifest(
+        extractor_id=run_ref.extractor_id,
+        run_id=run_ref.run_id,
+    )
+    result = evaluate_extraction_run(
+        corpus=corpus,
+        run=run,
+        extractor_id=run_ref.extractor_id,
+        dataset=dataset,
+    )
+    write_extraction_evaluation_result(corpus=corpus, run_id=run.run_id, result=result)
+    print(result.model_dump_json(indent=2))
+    return 0
+
+
 def cmd_query(arguments: argparse.Namespace) -> int:
     """
     Execute a retrieval query.
@@ -568,7 +623,11 @@ def cmd_context_pack_build(arguments: argparse.Namespace) -> int:
     )
     retrieval_result = RetrievalResult.model_validate_json(input_text)
     join_with = bytes(arguments.join_with, "utf-8").decode("unicode_escape")
-    policy = ContextPackPolicy(join_with=join_with)
+    policy = ContextPackPolicy(
+        join_with=join_with,
+        ordering=arguments.ordering,
+        include_metadata=arguments.include_metadata,
+    )
     context_pack = build_context_pack(retrieval_result, policy=policy)
     if arguments.max_tokens is not None:
         context_pack = fit_context_pack_to_token_budget(
@@ -576,6 +635,12 @@ def cmd_context_pack_build(arguments: argparse.Namespace) -> int:
             policy=policy,
             token_budget=TokenBudget(max_tokens=int(arguments.max_tokens)),
         )
+    if arguments.max_characters is not None:
+        context_pack = fit_context_pack_to_character_budget(
+            context_pack,
+            policy=policy,
+            character_budget=CharacterBudget(max_characters=int(arguments.max_characters)),
+        )
     print(
         json.dumps(
             {
@@ -889,6 +954,22 @@ def build_parser() -> argparse.ArgumentParser:
     )
     p_extract_delete.set_defaults(func=cmd_extract_delete)
 
+    p_extract_evaluate = extract_sub.add_parser(
+        "evaluate", help="Evaluate an extraction run against a dataset."
+    )
+    _add_common_corpus_arg(p_extract_evaluate)
+    p_extract_evaluate.add_argument(
+        "--run",
+        default=None,
+        help="Extraction run reference in the form extractor_id:run_id (defaults to latest run).",
+    )
+    p_extract_evaluate.add_argument(
+        "--dataset",
+        required=True,
+        help="Path to the extraction evaluation dataset JSON file.",
+    )
+    p_extract_evaluate.set_defaults(func=cmd_extract_evaluate)
+
     p_query = sub.add_parser("query", help="Run a retrieval query.")
     _add_common_corpus_arg(p_query)
     p_query.add_argument("--run", default=None, help="Run identifier (defaults to latest run).")
@@ -921,12 +1002,29 @@ def build_parser() -> argparse.ArgumentParser:
         default="\\n\\n",
         help="Separator between evidence blocks (escape sequences supported, default is two newlines).",
     )
+    p_context_pack_build.add_argument(
+        "--ordering",
+        choices=["rank", "score", "source"],
+        default="rank",
+        help="Evidence ordering policy (rank, score, source).",
+    )
+    p_context_pack_build.add_argument(
+        "--include-metadata",
+        action="store_true",
+        help="Include evidence metadata in each context pack block.",
+    )
     p_context_pack_build.add_argument(
         "--max-tokens",
        default=None,
         type=int,
         help="Optional token budget for the final context pack using the naive-whitespace tokenizer.",
     )
+    p_context_pack_build.add_argument(
+        "--max-characters",
+        default=None,
+        type=int,
+        help="Optional character budget for the final context pack.",
+    )
     p_context_pack_build.set_defaults(func=cmd_context_pack_build)
 
     p_eval = sub.add_parser("eval", help="Evaluate a run against a dataset.")
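The new subcommand is wired through `set_defaults(func=cmd_extract_evaluate)`, so a parsed invocation dispatches straight to the handler. A minimal sketch of driving it programmatically, assuming the `extract` command group is registered under the top-level parser as the `extract_sub` wiring above suggests; the run reference and dataset path are hypothetical:

```python
from biblicus.cli import build_parser

# Equivalent to the shell invocation:
#   biblicus extract evaluate --run my-extractor:run-0001 --dataset dataset.json
parser = build_parser()
arguments = parser.parse_args(
    ["extract", "evaluate", "--run", "my-extractor:run-0001", "--dataset", "dataset.json"]
)

# arguments.func is cmd_extract_evaluate; it loads the run manifest, evaluates
# it against the dataset, persists the result under the corpus, and prints the
# result JSON. Without a corpus argument, the corpus is discovered from the
# working directory via Corpus.find(Path.cwd()).
exit_code = arguments.func(arguments)
```

Omitting `--run` falls back to the latest extraction run and prints a reproducibility warning to stderr, as the handler above shows.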
biblicus/constants.py
CHANGED
biblicus/context.py
CHANGED
@@ -8,11 +8,11 @@ stable contract while context formatting remains an explicit policy surface.
 
 from __future__ import annotations
 
-from typing import List, Optional
+from typing import Dict, List, Literal, Optional
 
 from pydantic import BaseModel, ConfigDict, Field
 
-from .models import RetrievalResult
+from .models import Evidence, RetrievalResult
 
 
 class ContextPackPolicy(BaseModel):
@@ -21,11 +21,17 @@ class ContextPackPolicy(BaseModel):
 
     :ivar join_with: Separator inserted between evidence text blocks.
     :vartype join_with: str
+    :ivar ordering: Evidence ordering policy (rank, score, or source).
+    :vartype ordering: str
+    :ivar include_metadata: Whether to include evidence metadata lines in each block.
+    :vartype include_metadata: bool
     """
 
     model_config = ConfigDict(extra="forbid")
 
     join_with: str = Field(default="\n\n")
+    ordering: Literal["rank", "score", "source"] = Field(default="rank")
+    include_metadata: bool = Field(default=False)
 
 
 class ContextPack(BaseModel):
@@ -55,12 +61,15 @@ class ContextPackBlock(BaseModel):
     :vartype evidence_item_id: str
     :ivar text: Text included in this block.
     :vartype text: str
+    :ivar metadata: Optional metadata included with the block.
+    :vartype metadata: dict[str, object] or None
     """
 
     model_config = ConfigDict(extra="forbid")
 
     evidence_item_id: str = Field(min_length=1)
     text: str = Field(min_length=1)
+    metadata: Optional[Dict[str, object]] = None
 
 
 class TokenCounter(BaseModel):
@@ -92,6 +101,19 @@ class TokenBudget(BaseModel):
     max_tokens: int = Field(ge=1)
 
 
+class CharacterBudget(BaseModel):
+    """
+    Character budget for a context pack.
+
+    :ivar max_characters: Maximum characters permitted for the final context pack text.
+    :vartype max_characters: int
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    max_characters: int = Field(ge=1)
+
+
 def build_context_pack(result: RetrievalResult, *, policy: ContextPackPolicy) -> ContextPack:
     """
     Build a context pack from a retrieval result using an explicit policy.
@@ -104,14 +126,20 @@ def build_context_pack(result: RetrievalResult, *, policy: ContextPackPolicy) -> ContextPack:
     :rtype: ContextPack
     """
     selected_blocks: List[ContextPackBlock] = []
-    for evidence in result.evidence:
+    for evidence in _order_evidence(result.evidence, policy=policy):
         if not isinstance(evidence.text, str):
             continue
         trimmed_text = evidence.text.strip()
         if not trimmed_text:
            continue
+        metadata = _metadata_for_evidence(evidence) if policy.include_metadata else None
+        block_text = _format_block_text(trimmed_text, metadata=metadata)
         selected_blocks.append(
-            ContextPackBlock(evidence_item_id=evidence.item_id, text=trimmed_text)
+            ContextPackBlock(
+                evidence_item_id=evidence.item_id,
+                text=block_text,
+                metadata=metadata,
+            )
         )
 
     return ContextPack(
@@ -181,3 +209,109 @@ def fit_context_pack_to_token_budget(
         remaining_blocks = remaining_blocks[:-1]
 
     return ContextPack(text="", evidence_count=0, blocks=[])
+
+
+def fit_context_pack_to_character_budget(
+    context_pack: ContextPack,
+    *,
+    policy: ContextPackPolicy,
+    character_budget: CharacterBudget,
+) -> ContextPack:
+    """
+    Fit a context pack to a character budget by dropping trailing blocks.
+
+    :param context_pack: Context pack to fit.
+    :type context_pack: ContextPack
+    :param policy: Policy controlling how blocks are joined into text.
+    :type policy: ContextPackPolicy
+    :param character_budget: Character budget to enforce.
+    :type character_budget: CharacterBudget
+    :return: Fitted context pack.
+    :rtype: ContextPack
+    """
+    remaining_blocks: List[ContextPackBlock] = list(context_pack.blocks)
+    max_characters = character_budget.max_characters
+
+    while remaining_blocks:
+        candidate_text = policy.join_with.join([block.text for block in remaining_blocks])
+        if len(candidate_text) <= max_characters:
+            return ContextPack(
+                text=candidate_text,
+                evidence_count=len(remaining_blocks),
+                blocks=remaining_blocks,
+            )
+        remaining_blocks = remaining_blocks[:-1]
+
+    return ContextPack(text="", evidence_count=0, blocks=[])
+
+
+def _order_evidence(
+    evidence: List[Evidence],
+    *,
+    policy: ContextPackPolicy,
+) -> List[Evidence]:
+    """
+    Order evidence items according to the context pack policy.
+
+    :param evidence: Evidence list to order.
+    :type evidence: list[Evidence]
+    :param policy: Context pack policy.
+    :type policy: ContextPackPolicy
+    :return: Ordered evidence list.
+    :rtype: list[Evidence]
+    """
+    if policy.ordering == "rank":
+        return sorted(evidence, key=lambda item: (item.rank, item.item_id))
+    if policy.ordering == "score":
+        return sorted(evidence, key=lambda item: (-item.score, item.item_id))
+    if policy.ordering == "source":
+        return sorted(
+            evidence,
+            key=lambda item: (
+                item.source_uri or item.item_id,
+                -item.score,
+                item.item_id,
+            ),
+        )
+    raise ValueError(f"Unknown context pack ordering: {policy.ordering}")
+
+
+def _metadata_for_evidence(evidence: Evidence) -> Dict[str, object]:
+    """
+    Build metadata for a context pack block.
+
+    :param evidence: Evidence item to describe.
+    :type evidence: Evidence
+    :return: Metadata mapping.
+    :rtype: dict[str, object]
+    """
+    return {
+        "item_id": evidence.item_id,
+        "source_uri": evidence.source_uri or "none",
+        "score": evidence.score,
+        "stage": evidence.stage,
+    }
+
+
+def _format_block_text(text: str, *, metadata: Optional[Dict[str, object]]) -> str:
+    """
+    Format a context pack block text with optional metadata.
+
+    :param text: Evidence text.
+    :type text: str
+    :param metadata: Optional metadata mapping.
+    :type metadata: dict[str, object] or None
+    :return: Formatted block text.
+    :rtype: str
+    """
+    if not metadata:
+        return text
+    metadata_lines = "\n".join(
+        [
+            f"item_id: {metadata['item_id']}",
+            f"source_uri: {metadata['source_uri']}",
+            f"score: {metadata['score']}",
+            f"stage: {metadata['stage']}",
+        ]
+    )
+    return f"{metadata_lines}\n{text}"
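`fit_context_pack_to_character_budget` mirrors the existing token-budget fitter: it repeatedly re-joins the blocks and drops the trailing block until the joined text fits. A minimal sketch using only the models shown in this diff; the block contents are made up:

```python
from biblicus.context import (
    CharacterBudget,
    ContextPack,
    ContextPackBlock,
    ContextPackPolicy,
    fit_context_pack_to_character_budget,
)

policy = ContextPackPolicy(join_with="\n\n", ordering="rank", include_metadata=False)
blocks = [
    ContextPackBlock(evidence_item_id="item-a", text="a" * 80),
    ContextPackBlock(evidence_item_id="item-b", text="b" * 80),
]
pack = ContextPack(
    text=policy.join_with.join(block.text for block in blocks),
    evidence_count=len(blocks),
    blocks=blocks,
)

# 80 + 2 + 80 = 162 joined characters exceed the budget, so the trailing
# block is dropped and the 80-character first block fits on its own.
fitted = fit_context_pack_to_character_budget(
    pack,
    policy=policy,
    character_budget=CharacterBudget(max_characters=100),
)
assert fitted.evidence_count == 1
```

With `ordering="score"` evidence is sorted by descending score (ties broken by item identifier), and `include_metadata=True` prefixes each block with `item_id`, `source_uri`, `score`, and `stage` lines, as `_format_block_text` shows.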
biblicus/extraction_evaluation.py
ADDED
@@ -0,0 +1,312 @@
+"""
+Extraction evaluation utilities for Biblicus.
+"""
+
+from __future__ import annotations
+
+import json
+from difflib import SequenceMatcher
+from pathlib import Path
+from typing import Dict, List, Optional
+
+from pydantic import BaseModel, ConfigDict, Field, model_validator
+
+from .constants import EXTRACTION_DATASET_SCHEMA_VERSION
+from .corpus import Corpus
+from .extraction import ExtractionRunManifest
+from .models import CatalogItem
+from .time import utc_now_iso
+
+
+class ExtractionEvaluationItem(BaseModel):
+    """
+    Dataset item for extraction evaluation.
+
+    :ivar item_id: Optional item identifier.
+    :vartype item_id: str or None
+    :ivar source_uri: Optional source uniform resource identifier.
+    :vartype source_uri: str or None
+    :ivar expected_text: Expected extracted text.
+    :vartype expected_text: str
+    :ivar kind: Label kind (gold or synthetic).
+    :vartype kind: str
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    item_id: Optional[str] = None
+    source_uri: Optional[str] = None
+    expected_text: str
+    kind: str = Field(default="gold")
+
+    @model_validator(mode="after")
+    def _require_locator(self) -> "ExtractionEvaluationItem":
+        if not self.item_id and not self.source_uri:
+            raise ValueError("Evaluation items must include item_id or source_uri")
+        return self
+
+
+class ExtractionEvaluationDataset(BaseModel):
+    """
+    Dataset for extraction evaluation.
+
+    :ivar schema_version: Dataset schema version.
+    :vartype schema_version: int
+    :ivar name: Dataset name.
+    :vartype name: str
+    :ivar description: Optional description.
+    :vartype description: str or None
+    :ivar items: Labeled evaluation items.
+    :vartype items: list[ExtractionEvaluationItem]
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    schema_version: int = Field(ge=1)
+    name: str
+    description: Optional[str] = None
+    items: List[ExtractionEvaluationItem] = Field(default_factory=list)
+
+    @model_validator(mode="after")
+    def _enforce_schema_version(self) -> "ExtractionEvaluationDataset":
+        if self.schema_version != EXTRACTION_DATASET_SCHEMA_VERSION:
+            raise ValueError(
+                f"Unsupported extraction dataset schema version: {self.schema_version}"
+            )
+        return self
+
+
+class ExtractionEvaluationItemReport(BaseModel):
+    """
+    Per-item report for extraction evaluation.
+
+    :ivar item_id: Item identifier.
+    :vartype item_id: str
+    :ivar source_uri: Source uniform resource identifier.
+    :vartype source_uri: str
+    :ivar expected_text: Expected text from the dataset.
+    :vartype expected_text: str
+    :ivar extracted_text: Extracted text when available.
+    :vartype extracted_text: str or None
+    :ivar coverage_status: Coverage status (present, empty, missing).
+    :vartype coverage_status: str
+    :ivar extraction_status: Extraction status from the run (extracted, skipped, errored, missing).
+    :vartype extraction_status: str
+    :ivar similarity_score: Similarity score between expected and extracted text.
+    :vartype similarity_score: float
+    :ivar kind: Label kind from the dataset.
+    :vartype kind: str
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    item_id: str
+    source_uri: str
+    expected_text: str
+    extracted_text: Optional[str] = None
+    coverage_status: str
+    extraction_status: str
+    similarity_score: float
+    kind: str
+
+
+class ExtractionEvaluationResult(BaseModel):
+    """
+    Result bundle for an extraction evaluation.
+
+    :ivar dataset: Dataset metadata.
+    :vartype dataset: dict[str, object]
+    :ivar extractor_id: Extractor identifier.
+    :vartype extractor_id: str
+    :ivar run_id: Extraction run identifier.
+    :vartype run_id: str
+    :ivar recipe_id: Extraction recipe identifier.
+    :vartype recipe_id: str
+    :ivar recipe_name: Extraction recipe name.
+    :vartype recipe_name: str
+    :ivar evaluated_at: International Organization for Standardization 8601 timestamp.
+    :vartype evaluated_at: str
+    :ivar metrics: Evaluation metrics for coverage and accuracy.
+    :vartype metrics: dict[str, float]
+    :ivar items: Per-item evaluation reports.
+    :vartype items: list[ExtractionEvaluationItemReport]
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    dataset: Dict[str, object]
+    extractor_id: str
+    run_id: str
+    recipe_id: str
+    recipe_name: str
+    evaluated_at: str
+    metrics: Dict[str, float]
+    items: List[ExtractionEvaluationItemReport]
+
+
+def load_extraction_dataset(path: Path) -> ExtractionEvaluationDataset:
+    """
+    Load an extraction evaluation dataset from JavaScript Object Notation.
+
+    :param path: Path to the dataset file.
+    :type path: Path
+    :return: Parsed extraction evaluation dataset.
+    :rtype: ExtractionEvaluationDataset
+    """
+    try:
+        data = json.loads(path.read_text(encoding="utf-8"))
+    except json.JSONDecodeError as exc:
+        raise ValueError("Invalid extraction dataset") from exc
+    return ExtractionEvaluationDataset.model_validate(data)
+
+
+def evaluate_extraction_run(
+    *,
+    corpus: Corpus,
+    run: ExtractionRunManifest,
+    extractor_id: str,
+    dataset: ExtractionEvaluationDataset,
+) -> ExtractionEvaluationResult:
+    """
+    Evaluate an extraction run against a dataset.
+
+    :param corpus: Corpus associated with the run.
+    :type corpus: Corpus
+    :param run: Extraction run manifest.
+    :type run: ExtractionRunManifest
+    :param extractor_id: Extractor identifier for the run.
+    :type extractor_id: str
+    :param dataset: Extraction evaluation dataset.
+    :type dataset: ExtractionEvaluationDataset
+    :return: Extraction evaluation result bundle.
+    :rtype: ExtractionEvaluationResult
+    """
+    catalog = corpus.load_catalog()
+    item_index = {item.item_id: item for item in run.items}
+    coverage_present = 0
+    coverage_empty = 0
+    coverage_missing = 0
+    processable = 0
+    similarity_scores: List[float] = []
+    item_reports: List[ExtractionEvaluationItemReport] = []
+
+    for dataset_item in dataset.items:
+        item_id = _resolve_item_id(dataset_item, catalog_items=catalog.items)
+        catalog_item = catalog.items.get(item_id)
+        if catalog_item is None:
+            raise ValueError(f"Unknown item identifier: {item_id}")
+        extraction_item = item_index.get(item_id)
+        extraction_status = extraction_item.status if extraction_item else "missing"
+        if extraction_status != "errored" and extraction_status != "missing":
+            processable += 1
+
+        extracted_text = corpus.read_extracted_text(
+            extractor_id=extractor_id, run_id=run.run_id, item_id=item_id
+        )
+        coverage_status = _coverage_status(extracted_text)
+        if coverage_status == "present":
+            coverage_present += 1
+        elif coverage_status == "empty":
+            coverage_empty += 1
+        else:
+            coverage_missing += 1
+
+        similarity_score = _similarity_score(
+            expected_text=dataset_item.expected_text, extracted_text=extracted_text
+        )
+        similarity_scores.append(similarity_score)
+        item_reports.append(
+            ExtractionEvaluationItemReport(
+                item_id=item_id,
+                source_uri=catalog_item.source_uri,
+                expected_text=dataset_item.expected_text,
+                extracted_text=extracted_text,
+                coverage_status=coverage_status,
+                extraction_status=extraction_status,
+                similarity_score=similarity_score,
+                kind=dataset_item.kind,
+            )
+        )
+
+    total_items = max(len(dataset.items), 1)
+    average_similarity = sum(similarity_scores) / total_items if similarity_scores else 0.0
+    metrics = {
+        "coverage_present": float(coverage_present),
+        "coverage_empty": float(coverage_empty),
+        "coverage_missing": float(coverage_missing),
+        "processable_fraction": processable / total_items,
+        "average_similarity": average_similarity,
+    }
+    dataset_meta = {
+        "name": dataset.name,
+        "description": dataset.description,
+        "items": len(dataset.items),
+    }
+    return ExtractionEvaluationResult(
+        dataset=dataset_meta,
+        extractor_id=extractor_id,
+        run_id=run.run_id,
+        recipe_id=run.recipe.recipe_id,
+        recipe_name=run.recipe.name,
+        evaluated_at=utc_now_iso(),
+        metrics=metrics,
+        items=item_reports,
+    )
+
+
+def write_extraction_evaluation_result(
+    *, corpus: Corpus, run_id: str, result: ExtractionEvaluationResult
+) -> Path:
+    """
+    Persist extraction evaluation output under the corpus.
+
+    :param corpus: Corpus associated with the evaluation.
+    :type corpus: Corpus
+    :param run_id: Extraction run identifier.
+    :type run_id: str
+    :param result: Evaluation result to write.
+    :type result: ExtractionEvaluationResult
+    :return: Output path.
+    :rtype: Path
+    """
+    output_dir = corpus.runs_dir / "evaluation" / "extraction" / run_id
+    output_dir.mkdir(parents=True, exist_ok=True)
+    output_path = output_dir / "output.json"
+    output_path.write_text(result.model_dump_json(indent=2) + "\n", encoding="utf-8")
+    return output_path
+
+
+def _resolve_item_id(
+    dataset_item: ExtractionEvaluationItem, *, catalog_items: Dict[str, CatalogItem]
+) -> str:
+    if dataset_item.item_id:
+        return dataset_item.item_id
+    source_uri = dataset_item.source_uri
+    if not source_uri:
+        raise ValueError("Evaluation item is missing item_id and source_uri")
+    for item_id, catalog_item in catalog_items.items():
+        if getattr(catalog_item, "source_uri", None) == source_uri:
+            return item_id
+    raise ValueError(f"Unknown source uniform resource identifier: {source_uri}")
+
+
+def _coverage_status(extracted_text: Optional[str]) -> str:
+    if extracted_text is None:
+        return "missing"
+    if extracted_text.strip():
+        return "present"
+    return "empty"
+
+
+def _normalize_text(text: str) -> str:
+    return " ".join(text.lower().split())
+
+
+def _similarity_score(*, expected_text: str, extracted_text: Optional[str]) -> float:
+    if extracted_text is None:
+        return 0.0
+    expected = _normalize_text(expected_text)
+    actual = _normalize_text(extracted_text)
+    if not expected and not actual:
+        return 1.0
+    return SequenceMatcher(None, expected, actual).ratio()
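The dataset model accepts either `item_id` or `source_uri` per item and enforces the schema version constant from `biblicus/constants.py` (the constant is added in this release, but its value is not visible in this diff). A minimal sketch of writing and loading a dataset; the file name and item contents are hypothetical:

```python
import json
from pathlib import Path

from biblicus.constants import EXTRACTION_DATASET_SCHEMA_VERSION
from biblicus.extraction_evaluation import load_extraction_dataset

# Each item needs expected_text plus at least one locator (item_id or source_uri).
dataset_path = Path("extraction-dataset.json")
dataset_path.write_text(
    json.dumps(
        {
            "schema_version": EXTRACTION_DATASET_SCHEMA_VERSION,
            "name": "smoke-test",
            "description": "Two labeled items.",
            "items": [
                {"item_id": "item-0001", "expected_text": "Hello world."},
                {"source_uri": "https://example.com/doc", "expected_text": "Second document."},
            ],
        }
    ),
    encoding="utf-8",
)

dataset = load_extraction_dataset(dataset_path)
assert dataset.items[0].kind == "gold"  # kind defaults to "gold"
```

Scoring is intentionally simple: `_similarity_score` lowercases and collapses whitespace on both sides, then returns `difflib.SequenceMatcher(...).ratio()`, so 1.0 means the normalized texts match exactly and 0.0 means the item produced no extracted text.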
{biblicus-0.11.0.dist-info → biblicus-0.13.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: biblicus
-Version: 0.11.0
+Version: 0.13.0
 Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
 License: MIT
 Requires-Python: >=3.9
@@ -486,10 +486,11 @@ corpus/
 
 ## Retrieval backends
 
-Two backends are included.
+Three backends are included.
 
 - `scan` is a minimal baseline that scans raw items directly.
 - `sqlite-full-text-search` is a practical baseline that builds a full text search index in SQLite.
+- `vector` is a deterministic term-frequency vector baseline with cosine similarity scoring.
 
 For detailed documentation including configuration options, performance characteristics, and usage examples, see the [Backend Reference][backend-reference].
 
@@ -535,6 +536,9 @@ These extractors are built in. Optional ones require extra dependencies. See [te
 
 For detailed documentation on all extractors, see the [Extractor Reference][extractor-reference].
 
+For extraction evaluation workflows, dataset formats, and report interpretation, see
+`docs/EXTRACTION_EVALUATION.md`.
+
 ## Topic modeling analysis
 
 Biblicus can run analysis pipelines on extracted text without changing the raw corpus. Profiling and topic modeling
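The new `vector` backend is described as a deterministic term-frequency baseline with cosine similarity scoring. Its implementation is not part of this diff; as a concept-only sketch, the standard term-frequency cosine looks like this:

```python
from collections import Counter
from math import sqrt


def term_frequency_cosine(query: str, document: str) -> float:
    """Cosine similarity of naive term-frequency vectors (illustration, not the package code)."""
    query_counts = Counter(query.lower().split())
    document_counts = Counter(document.lower().split())
    # Dot product over the shared vocabulary.
    dot = sum(
        query_counts[term] * document_counts[term]
        for term in query_counts.keys() & document_counts.keys()
    )
    norm = sqrt(sum(v * v for v in query_counts.values())) * sqrt(
        sum(v * v for v in document_counts.values())
    )
    return dot / norm if norm else 0.0


print(term_frequency_cosine("corpus retrieval", "retrieval over a raw corpus"))
```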
{biblicus-0.11.0.dist-info → biblicus-0.13.0.dist-info}/RECORD
CHANGED
@@ -1,14 +1,15 @@
-biblicus/__init__.py,sha256=
+biblicus/__init__.py,sha256=pD55sYei6AGGLcN1AWnpUY6-zPIPq1WxOp-sexOOlT0,496
 biblicus/__main__.py,sha256=ipfkUoTlocVnrQDM69C7TeBqQxmHVeiWMRaT3G9rtnk,117
-biblicus/cli.py,sha256=
-biblicus/constants.py,sha256
-biblicus/context.py,sha256=
+biblicus/cli.py,sha256=cMoirLFPhTwftNuqaadajCcRUEz_FBaLkupjVxpAxO8,38403
+biblicus/constants.py,sha256=gAlEVJhxdFj-eWWJrlYbP7H1X3c5gwhrIBq9NQ1Vq_E,371
+biblicus/context.py,sha256=U7qkOwMdqNgYnqaC9hgQY0kv0R-6qcjV6bhXQl2WUkE,10215
 biblicus/corpus.py,sha256=qSDnYJXhWlF2p_BbFLl6xtI53lIIPxwyKLLGLC432Sg,55612
 biblicus/crawl.py,sha256=n8rXBMnziBK9vtKQQCXYOpBzqsPCswj2PzVJUb370KY,6250
 biblicus/errors.py,sha256=uMajd5DvgnJ_-jq5sbeom1GV8DPUc-kojBaECFi6CsY,467
 biblicus/evaluation.py,sha256=5xWpb-8f49Osh9aHzo1ab3AXOmls3Imc5rdnEC0pN-8,8143
 biblicus/evidence_processing.py,sha256=sJe6T1nLxvU0xs9yMH8JZZS19zHXMR-Fpr5lWi5ndUM,6120
 biblicus/extraction.py,sha256=qvrsq6zSz2Kg-cap-18HPHC9pQlqEGo7pyID2uKCyBo,19760
+biblicus/extraction_evaluation.py,sha256=cBC2B1nQCtXmOcVWUhHyO2NJRX8QSDuqhVjEc8PXrOA,10400
 biblicus/frontmatter.py,sha256=JOGjIDzbbOkebQw2RzA-3WDVMAMtJta2INjS4e7-LMg,2463
 biblicus/hook_logging.py,sha256=IMvde-JhVWrx9tNz3eDJ1CY_rr5Sj7DZ2YNomYCZbz0,5366
 biblicus/hook_manager.py,sha256=ZCAkE5wLvn4lnQz8jho_o0HGEC9KdQd9qitkAEUQRcw,6997
@@ -57,9 +58,9 @@ biblicus/extractors/select_override.py,sha256=gSpffFmn1ux9pGtFvHD5Uu_LO8TmmJC4L_
 biblicus/extractors/select_smart_override.py,sha256=-sLMnNoeXbCB3dO9zflQq324eHuLbd6hpveSwduXP-U,6763
 biblicus/extractors/select_text.py,sha256=w0ATmDy3tWWbOObzW87jGZuHbgXllUhotX5XyySLs-o,3395
 biblicus/extractors/unstructured_text.py,sha256=l2S_wD_htu7ZHoJQNQtP-kGlEgOeKV_w2IzAC93lePE,3564
-biblicus-0.
-biblicus-0.
-biblicus-0.
-biblicus-0.
-biblicus-0.
-biblicus-0.
+biblicus-0.13.0.dist-info/licenses/LICENSE,sha256=lw44GXFG_Q0fS8m5VoEvv_xtdBXK26pBcbSPUCXee_Q,1078
+biblicus-0.13.0.dist-info/METADATA,sha256=Ae0gttdvOggyE1vQVab4IOSmbx-JklxzvBZJ_3UyxIA,27979
+biblicus-0.13.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+biblicus-0.13.0.dist-info/entry_points.txt,sha256=BZmO4H8Uz00fyi1RAFryOCGfZgX7eHWkY2NE-G54U5A,47
+biblicus-0.13.0.dist-info/top_level.txt,sha256=sUD_XVZwDxZ29-FBv1MknTGh4mgDXznGuP28KJY_WKc,9
+biblicus-0.13.0.dist-info/RECORD,,

{biblicus-0.11.0.dist-info → biblicus-0.13.0.dist-info}/WHEEL
File without changes
{biblicus-0.11.0.dist-info → biblicus-0.13.0.dist-info}/entry_points.txt
File without changes
{biblicus-0.11.0.dist-info → biblicus-0.13.0.dist-info}/licenses/LICENSE
File without changes
{biblicus-0.11.0.dist-info → biblicus-0.13.0.dist-info}/top_level.txt
File without changes