biblicus 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. biblicus/__init__.py +2 -2
  2. biblicus/_vendor/dotyaml/__init__.py +14 -0
  3. biblicus/_vendor/dotyaml/interpolation.py +63 -0
  4. biblicus/_vendor/dotyaml/loader.py +181 -0
  5. biblicus/_vendor/dotyaml/transformer.py +135 -0
  6. biblicus/backends/__init__.py +0 -2
  7. biblicus/backends/base.py +3 -3
  8. biblicus/backends/scan.py +21 -15
  9. biblicus/backends/sqlite_full_text_search.py +14 -15
  10. biblicus/cli.py +33 -49
  11. biblicus/corpus.py +39 -58
  12. biblicus/errors.py +15 -0
  13. biblicus/evaluation.py +4 -8
  14. biblicus/extraction.py +276 -77
  15. biblicus/extractors/__init__.py +14 -3
  16. biblicus/extractors/base.py +12 -5
  17. biblicus/extractors/metadata_text.py +13 -5
  18. biblicus/extractors/openai_stt.py +180 -0
  19. biblicus/extractors/pass_through_text.py +16 -6
  20. biblicus/extractors/pdf_text.py +100 -0
  21. biblicus/extractors/pipeline.py +105 -0
  22. biblicus/extractors/rapidocr_text.py +129 -0
  23. biblicus/extractors/select_longest_text.py +105 -0
  24. biblicus/extractors/select_text.py +100 -0
  25. biblicus/extractors/unstructured_text.py +100 -0
  26. biblicus/frontmatter.py +0 -3
  27. biblicus/hook_logging.py +0 -5
  28. biblicus/hook_manager.py +3 -5
  29. biblicus/hooks.py +3 -7
  30. biblicus/ignore.py +0 -3
  31. biblicus/models.py +87 -0
  32. biblicus/retrieval.py +0 -4
  33. biblicus/sources.py +44 -9
  34. biblicus/time.py +0 -1
  35. biblicus/uris.py +3 -4
  36. biblicus/user_config.py +138 -0
  37. {biblicus-0.2.0.dist-info → biblicus-0.3.0.dist-info}/METADATA +78 -16
  38. biblicus-0.3.0.dist-info/RECORD +44 -0
  39. biblicus/extractors/cascade.py +0 -101
  40. biblicus-0.2.0.dist-info/RECORD +0 -32
  41. {biblicus-0.2.0.dist-info → biblicus-0.3.0.dist-info}/WHEEL +0 -0
  42. {biblicus-0.2.0.dist-info → biblicus-0.3.0.dist-info}/entry_points.txt +0 -0
  43. {biblicus-0.2.0.dist-info → biblicus-0.3.0.dist-info}/licenses/LICENSE +0 -0
  44. {biblicus-0.2.0.dist-info → biblicus-0.3.0.dist-info}/top_level.txt +0 -0
biblicus/corpus.py CHANGED
@@ -13,6 +13,7 @@ from pathlib import Path
13
13
  from typing import Any, Dict, List, Optional, Sequence
14
14
 
15
15
  import yaml
16
+ from pydantic import ValidationError
16
17
 
17
18
  from .constants import (
18
19
  CORPUS_DIR_NAME,
@@ -23,15 +24,13 @@ from .constants import (
23
24
  SIDECAR_SUFFIX,
24
25
  )
25
26
  from .frontmatter import parse_front_matter, render_front_matter
26
- from pydantic import ValidationError
27
-
28
27
  from .hook_manager import HookManager
29
28
  from .hooks import HookPoint
30
29
  from .ignore import load_corpus_ignore_spec
31
30
  from .models import CatalogItem, CorpusCatalog, CorpusConfig, IngestResult, RetrievalRun
32
31
  from .sources import load_source
33
32
  from .time import utc_now_iso
34
- from .uris import normalize_corpus_uri, corpus_ref_to_path
33
+ from .uris import corpus_ref_to_path, normalize_corpus_uri
35
34
 
36
35
 
37
36
  def _sha256_bytes(data: bytes) -> str:
@@ -43,11 +42,12 @@ def _sha256_bytes(data: bytes) -> str:
43
42
  :return: Secure Hash Algorithm 256 hex digest.
44
43
  :rtype: str
45
44
  """
46
-
47
45
  return hashlib.sha256(data).hexdigest()
48
46
 
49
47
 
50
- def _write_stream_and_hash(stream, destination_path: Path, *, chunk_size: int = 1024 * 1024) -> Dict[str, object]:
48
+ def _write_stream_and_hash(
49
+ stream, destination_path: Path, *, chunk_size: int = 1024 * 1024
50
+ ) -> Dict[str, object]:
51
51
  """
52
52
  Write a binary stream to disk while computing a digest.
53
53
 
@@ -61,7 +61,6 @@ def _write_stream_and_hash(stream, destination_path: Path, *, chunk_size: int =
61
61
  :rtype: dict[str, object]
62
62
  :raises OSError: If the destination cannot be written.
63
63
  """
64
-
65
64
  hasher = hashlib.sha256()
66
65
  bytes_written = 0
67
66
  with destination_path.open("wb") as destination_handle:
@@ -84,7 +83,6 @@ def _sanitize_filename(name: str) -> str:
84
83
  :return: Sanitized filename.
85
84
  :rtype: str
86
85
  """
87
-
88
86
  allowed_characters = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-._() ")
89
87
  sanitized_name = "".join(
90
88
  (character if character in allowed_characters else "_") for character in name
@@ -101,9 +99,9 @@ def _preferred_extension_for_media_type(media_type: str) -> Optional[str]:
101
99
  :return: Preferred extension or None.
102
100
  :rtype: str or None
103
101
  """
104
-
105
102
  media_type_overrides = {
106
103
  "image/jpeg": ".jpg",
104
+ "audio/ogg": ".ogg",
107
105
  }
108
106
  if media_type in media_type_overrides:
109
107
  return media_type_overrides[media_type]
@@ -121,7 +119,6 @@ def _ensure_filename_extension(filename: str, *, media_type: str) -> str:
121
119
  :return: Filename with a compatible extension.
122
120
  :rtype: str
123
121
  """
124
-
125
122
  raw_name = filename.strip()
126
123
 
127
124
  if media_type == "text/markdown":
@@ -129,11 +126,12 @@ def _ensure_filename_extension(filename: str, *, media_type: str) -> str:
129
126
  return raw_name
130
127
  return raw_name + ".md"
131
128
 
129
+ if Path(raw_name).suffix:
130
+ return raw_name
131
+
132
132
  ext = _preferred_extension_for_media_type(media_type)
133
133
  if not ext:
134
134
  return raw_name
135
- if raw_name.lower().endswith(ext.lower()):
136
- return raw_name
137
135
  return raw_name + ext
138
136
 
139
137
 
@@ -148,7 +146,6 @@ def _merge_tags(explicit: Sequence[str], from_frontmatter: Any) -> List[str]:
148
146
  :return: Deduplicated tag list preserving order.
149
147
  :rtype: list[str]
150
148
  """
151
-
152
149
  merged_tags: List[str] = []
153
150
 
154
151
  for explicit_tag in explicit:
@@ -181,7 +178,6 @@ def _sidecar_path_for(content_path: Path) -> Path:
181
178
  :return: Sidecar path.
182
179
  :rtype: Path
183
180
  """
184
-
185
181
  return content_path.with_name(content_path.name + SIDECAR_SUFFIX)
186
182
 
187
183
 
@@ -195,7 +191,6 @@ def _load_sidecar(content_path: Path) -> Dict[str, Any]:
195
191
  :rtype: dict[str, Any]
196
192
  :raises ValueError: If the sidecar content is not a mapping.
197
193
  """
198
-
199
194
  path = _sidecar_path_for(content_path)
200
195
  if not path.is_file():
201
196
  return {}
@@ -226,7 +221,9 @@ def _write_sidecar(content_path: Path, metadata: Dict[str, Any]) -> None:
226
221
  path.write_text(text + "\n", encoding="utf-8")
227
222
 
228
223
 
229
- def _ensure_biblicus_block(metadata: Dict[str, Any], *, item_id: str, source_uri: str) -> Dict[str, Any]:
224
+ def _ensure_biblicus_block(
225
+ metadata: Dict[str, Any], *, item_id: str, source_uri: str
226
+ ) -> Dict[str, Any]:
230
227
  """
231
228
  Ensure the biblicus metadata block exists and is populated.
232
229
 
@@ -324,7 +321,6 @@ class Corpus:
324
321
  :param root: Corpus root directory.
325
322
  :type root: Path
326
323
  """
327
-
328
324
  self.root = root
329
325
  self.meta_dir = self.root / CORPUS_DIR_NAME
330
326
  self.raw_dir = self.root / DEFAULT_RAW_DIR
@@ -339,7 +335,6 @@ class Corpus:
339
335
  :return: Corpus uniform resource identifier.
340
336
  :rtype: str
341
337
  """
342
-
343
338
  return self.root.as_uri()
344
339
 
345
340
  def _load_config(self) -> Optional[CorpusConfig]:
@@ -350,7 +345,6 @@ class Corpus:
350
345
  :rtype: CorpusConfig or None
351
346
  :raises ValueError: If the config schema is invalid.
352
347
  """
353
-
354
348
  path = self.meta_dir / "config.json"
355
349
  if not path.is_file():
356
350
  return None
@@ -359,7 +353,9 @@ class Corpus:
359
353
  return CorpusConfig.model_validate(data)
360
354
  except ValidationError as exc:
361
355
  has_hook_error = any(
362
- isinstance(error.get("loc"), tuple) and error.get("loc") and error.get("loc")[0] == "hooks"
356
+ isinstance(error.get("loc"), tuple)
357
+ and error.get("loc")
358
+ and error.get("loc")[0] == "hooks"
363
359
  for error in exc.errors()
364
360
  )
365
361
  if has_hook_error:
@@ -374,7 +370,6 @@ class Corpus:
374
370
  :rtype: HookManager or None
375
371
  :raises ValueError: If hook specifications are invalid.
376
372
  """
377
-
378
373
  if self.config is None or not self.config.hooks:
379
374
  return None
380
375
  return HookManager.from_config(
@@ -394,7 +389,6 @@ class Corpus:
394
389
  :rtype: Corpus
395
390
  :raises FileNotFoundError: If no corpus config is found.
396
391
  """
397
-
398
392
  start = start.resolve()
399
393
  for candidate in [start, *start.parents]:
400
394
  if (candidate / CORPUS_DIR_NAME / "config.json").is_file():
@@ -413,7 +407,6 @@ class Corpus:
413
407
  :return: Opened corpus instance.
414
408
  :rtype: Corpus
415
409
  """
416
-
417
410
  return cls.find(corpus_ref_to_path(ref))
418
411
 
419
412
  @classmethod
@@ -429,7 +422,6 @@ class Corpus:
429
422
  :rtype: Corpus
430
423
  :raises FileExistsError: If the corpus already exists and force is False.
431
424
  """
432
-
433
425
  root = root.resolve()
434
426
  corpus = cls(root)
435
427
 
@@ -459,7 +451,6 @@ class Corpus:
459
451
  :return: Catalog file path.
460
452
  :rtype: Path
461
453
  """
462
-
463
454
  return self.meta_dir / "catalog.json"
464
455
 
465
456
  def _init_catalog(self) -> None:
@@ -469,7 +460,6 @@ class Corpus:
469
460
  :return: None.
470
461
  :rtype: None
471
462
  """
472
-
473
463
  if self.catalog_path.exists():
474
464
  return
475
465
  catalog = CorpusCatalog(
@@ -492,7 +482,6 @@ class Corpus:
492
482
  :raises FileNotFoundError: If the catalog file does not exist.
493
483
  :raises ValueError: If the catalog schema is invalid.
494
484
  """
495
-
496
485
  if not self.catalog_path.is_file():
497
486
  raise FileNotFoundError(f"Missing corpus catalog: {self.catalog_path}")
498
487
  catalog_data = json.loads(self.catalog_path.read_text(encoding="utf-8"))
@@ -507,7 +496,6 @@ class Corpus:
507
496
  :raises FileNotFoundError: If the catalog file does not exist.
508
497
  :raises ValueError: If the catalog schema is invalid.
509
498
  """
510
-
511
499
  return self._load_catalog()
512
500
 
513
501
  def _write_catalog(self, catalog: CorpusCatalog) -> None:
@@ -519,7 +507,6 @@ class Corpus:
519
507
  :return: None.
520
508
  :rtype: None
521
509
  """
522
-
523
510
  temp_path = self.catalog_path.with_suffix(".json.tmp")
524
511
  temp_path.write_text(catalog.model_dump_json(indent=2) + "\n", encoding="utf-8")
525
512
  temp_path.replace(self.catalog_path)
@@ -532,7 +519,6 @@ class Corpus:
532
519
  :return: Path to the runs directory.
533
520
  :rtype: Path
534
521
  """
535
-
536
522
  return self.meta_dir / RUNS_DIR_NAME
537
523
 
538
524
  @property
@@ -543,7 +529,6 @@ class Corpus:
543
529
  :return: Path to the extraction runs directory.
544
530
  :rtype: Path
545
531
  """
546
-
547
532
  return self.runs_dir / EXTRACTION_RUNS_DIR_NAME
548
533
 
549
534
  def extraction_run_dir(self, *, extractor_id: str, run_id: str) -> Path:
@@ -557,7 +542,6 @@ class Corpus:
557
542
  :return: Extraction run directory.
558
543
  :rtype: Path
559
544
  """
560
-
561
545
  return self.extraction_runs_dir / extractor_id / run_id
562
546
 
563
547
  def read_extracted_text(self, *, extractor_id: str, run_id: str, item_id: str) -> Optional[str]:
@@ -574,8 +558,11 @@ class Corpus:
574
558
  :rtype: str or None
575
559
  :raises OSError: If the file exists but cannot be read.
576
560
  """
577
-
578
- path = self.extraction_run_dir(extractor_id=extractor_id, run_id=run_id) / "text" / f"{item_id}.txt"
561
+ path = (
562
+ self.extraction_run_dir(extractor_id=extractor_id, run_id=run_id)
563
+ / "text"
564
+ / f"{item_id}.txt"
565
+ )
579
566
  if not path.is_file():
580
567
  return None
581
568
  return path.read_text(encoding="utf-8")
@@ -587,7 +574,6 @@ class Corpus:
587
574
  :return: None.
588
575
  :rtype: None
589
576
  """
590
-
591
577
  self.runs_dir.mkdir(parents=True, exist_ok=True)
592
578
 
593
579
  def write_run(self, run: RetrievalRun) -> None:
@@ -599,7 +585,6 @@ class Corpus:
599
585
  :return: None.
600
586
  :rtype: None
601
587
  """
602
-
603
588
  self._ensure_runs_dir()
604
589
  path = self.runs_dir / f"{run.run_id}.json"
605
590
  path.write_text(run.model_dump_json(indent=2) + "\n", encoding="utf-8")
@@ -618,7 +603,6 @@ class Corpus:
618
603
  :rtype: RetrievalRun
619
604
  :raises FileNotFoundError: If the run manifest does not exist.
620
605
  """
621
-
622
606
  path = self.runs_dir / f"{run_id}.json"
623
607
  if not path.is_file():
624
608
  raise FileNotFoundError(f"Missing run manifest: {path}")
@@ -633,7 +617,6 @@ class Corpus:
633
617
  :return: Latest run identifier or None.
634
618
  :rtype: str or None
635
619
  """
636
-
637
620
  return self._load_catalog().latest_run_id
638
621
 
639
622
  def _upsert_catalog_item(self, item: CatalogItem) -> None:
@@ -645,7 +628,6 @@ class Corpus:
645
628
  :return: None.
646
629
  :rtype: None
647
630
  """
648
-
649
631
  self._init_catalog()
650
632
  catalog = self._load_catalog()
651
633
  catalog.items[item.id] = item
@@ -693,7 +675,6 @@ class Corpus:
693
675
  :rtype: IngestResult
694
676
  :raises ValueError: If markdown is not Unicode Transformation Format 8.
695
677
  """
696
-
697
678
  item_id = str(uuid.uuid4())
698
679
  safe_filename = _sanitize_filename(filename) if filename else ""
699
680
 
@@ -741,7 +722,9 @@ class Corpus:
741
722
  try:
742
723
  markdown_text = data.decode("utf-8")
743
724
  except UnicodeDecodeError as decode_error:
744
- raise ValueError("Markdown must be Unicode Transformation Format 8") from decode_error
725
+ raise ValueError(
726
+ "Markdown must be Unicode Transformation Format 8"
727
+ ) from decode_error
745
728
 
746
729
  parsed_document = parse_front_matter(markdown_text)
747
730
  frontmatter = dict(parsed_document.metadata)
@@ -760,7 +743,9 @@ class Corpus:
760
743
  if isinstance(title_value, str) and title_value.strip():
761
744
  resolved_title = title_value.strip()
762
745
 
763
- frontmatter = _ensure_biblicus_block(frontmatter, item_id=item_id, source_uri=source_uri)
746
+ frontmatter = _ensure_biblicus_block(
747
+ frontmatter, item_id=item_id, source_uri=source_uri
748
+ )
764
749
  rendered_document = render_front_matter(frontmatter, parsed_document.body)
765
750
  data_to_write = rendered_document.encode("utf-8")
766
751
  else:
@@ -807,7 +792,9 @@ class Corpus:
807
792
  sidecar_metadata["media_type"] = media_type
808
793
  sidecar_metadata["biblicus"] = {"id": item_id, "source": source_uri}
809
794
  _write_sidecar(output_path, sidecar_metadata)
810
- frontmatter = _merge_metadata(frontmatter if isinstance(frontmatter, dict) else {}, sidecar_metadata)
795
+ frontmatter = _merge_metadata(
796
+ frontmatter if isinstance(frontmatter, dict) else {}, sidecar_metadata
797
+ )
811
798
 
812
799
  created_at = utc_now_iso()
813
800
  item_record = CatalogItem(
@@ -858,7 +845,6 @@ class Corpus:
858
845
  :rtype: IngestResult
859
846
  :raises ValueError: If the media_type is text/markdown.
860
847
  """
861
-
862
848
  if media_type == "text/markdown":
863
849
  raise ValueError("Stream ingestion is not supported for Markdown")
864
850
 
@@ -972,7 +958,6 @@ class Corpus:
972
958
  :return: Ingestion result summary.
973
959
  :rtype: IngestResult
974
960
  """
975
-
976
961
  data = text.encode("utf-8")
977
962
  return self.ingest_item(
978
963
  data,
@@ -1003,7 +988,6 @@ class Corpus:
1003
988
  :return: Ingestion result summary.
1004
989
  :rtype: IngestResult
1005
990
  """
1006
-
1007
991
  candidate_path = Path(source) if isinstance(source, str) and "://" not in source else None
1008
992
  if isinstance(source, Path) or (candidate_path is not None and candidate_path.exists()):
1009
993
  path = source if isinstance(source, Path) else candidate_path
@@ -1061,7 +1045,6 @@ class Corpus:
1061
1045
  :raises FileNotFoundError: If the source_root does not exist.
1062
1046
  :raises ValueError: If a markdown file cannot be decoded as Unicode Transformation Format 8.
1063
1047
  """
1064
-
1065
1048
  source_root = source_root.resolve()
1066
1049
  if not source_root.is_dir():
1067
1050
  raise FileNotFoundError(f"Import source root does not exist: {source_root}")
@@ -1111,9 +1094,10 @@ class Corpus:
1111
1094
  :rtype: None
1112
1095
  :raises ValueError: If a markdown file cannot be decoded as Unicode Transformation Format 8.
1113
1096
  """
1114
-
1115
1097
  item_id = str(uuid.uuid4())
1116
- destination_relpath = str(Path(DEFAULT_RAW_DIR) / "imports" / import_id / relative_source_path)
1098
+ destination_relpath = str(
1099
+ Path(DEFAULT_RAW_DIR) / "imports" / import_id / relative_source_path
1100
+ )
1117
1101
  destination_path = (self.root / destination_relpath).resolve()
1118
1102
  destination_path.parent.mkdir(parents=True, exist_ok=True)
1119
1103
 
@@ -1176,11 +1160,8 @@ class Corpus:
1176
1160
  :return: Catalog items ordered by recency.
1177
1161
  :rtype: list[CatalogItem]
1178
1162
  """
1179
-
1180
1163
  catalog = self._load_catalog()
1181
- ordered_ids = (
1182
- catalog.order[:limit] if catalog.order else list(catalog.items.keys())[:limit]
1183
- )
1164
+ ordered_ids = catalog.order[:limit] if catalog.order else list(catalog.items.keys())[:limit]
1184
1165
  collected_items: List[CatalogItem] = []
1185
1166
  for item_id in ordered_ids:
1186
1167
  item = catalog.items.get(item_id)
@@ -1198,7 +1179,6 @@ class Corpus:
1198
1179
  :rtype: CatalogItem
1199
1180
  :raises KeyError: If the item identifier is unknown.
1200
1181
  """
1201
-
1202
1182
  catalog = self._load_catalog()
1203
1183
  item = catalog.items.get(item_id)
1204
1184
  if item is None:
@@ -1216,7 +1196,6 @@ class Corpus:
1216
1196
  :rtype: dict[str, int]
1217
1197
  :raises ValueError: If a markdown file cannot be decoded as Unicode Transformation Format 8.
1218
1198
  """
1219
-
1220
1199
  self._init_catalog()
1221
1200
  existing_catalog = self._load_catalog()
1222
1201
  stats = {"scanned": 0, "skipped": 0, "inserted": 0, "updated": 0}
@@ -1291,7 +1270,9 @@ class Corpus:
1291
1270
 
1292
1271
  previous_item = existing_catalog.items.get(item_id)
1293
1272
  created_at = previous_item.created_at if previous_item is not None else utc_now_iso()
1294
- source_uri = source_uri or (previous_item.source_uri if previous_item is not None else None)
1273
+ source_uri = source_uri or (
1274
+ previous_item.source_uri if previous_item is not None else None
1275
+ )
1295
1276
 
1296
1277
  if previous_item is None:
1297
1278
  stats["inserted"] += 1
@@ -1338,7 +1319,6 @@ class Corpus:
1338
1319
  :return: Corpus name.
1339
1320
  :rtype: str
1340
1321
  """
1341
-
1342
1322
  return self.root.name
1343
1323
 
1344
1324
  def purge(self, *, confirm: str) -> None:
@@ -1351,10 +1331,11 @@ class Corpus:
1351
1331
  :rtype: None
1352
1332
  :raises ValueError: If the confirmation does not match.
1353
1333
  """
1354
-
1355
1334
  expected = self.name
1356
1335
  if confirm != expected:
1357
- raise ValueError(f"Confirmation mismatch: pass --confirm {expected!r} to purge this corpus")
1336
+ raise ValueError(
1337
+ f"Confirmation mismatch: pass --confirm {expected!r} to purge this corpus"
1338
+ )
1358
1339
 
1359
1340
  if self.raw_dir.exists():
1360
1341
  shutil.rmtree(self.raw_dir)
biblicus/errors.py ADDED
@@ -0,0 +1,15 @@
1
+ """
2
+ Error types for Biblicus.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+
8
+ class ExtractionRunFatalError(RuntimeError):
9
+ """
10
+ Fatal extraction run error that should abort the entire run.
11
+
12
+ This exception is used for conditions that indicate a configuration or environment problem
13
+ rather than a per-item extraction failure. For example, a selection extractor that depends
14
+ on referenced extraction run manifests treats missing manifests as fatal.
15
+ """
biblicus/evaluation.py CHANGED
@@ -11,8 +11,8 @@ from typing import Dict, List, Optional
11
11
 
12
12
  from pydantic import BaseModel, ConfigDict, Field, model_validator
13
13
 
14
- from .constants import DATASET_SCHEMA_VERSION
15
14
  from .backends import get_backend
15
+ from .constants import DATASET_SCHEMA_VERSION
16
16
  from .corpus import Corpus
17
17
  from .models import QueryBudget, RetrievalResult, RetrievalRun
18
18
  from .time import utc_now_iso
@@ -45,7 +45,9 @@ class EvaluationQuery(BaseModel):
45
45
  @model_validator(mode="after")
46
46
  def _require_expectation(self) -> "EvaluationQuery":
47
47
  if not self.expected_item_id and not self.expected_source_uri:
48
- raise ValueError("Evaluation queries must include expected_item_id or expected_source_uri")
48
+ raise ValueError(
49
+ "Evaluation queries must include expected_item_id or expected_source_uri"
50
+ )
49
51
  return self
50
52
 
51
53
 
@@ -114,7 +116,6 @@ def load_dataset(path: Path) -> EvaluationDataset:
114
116
  :return: Parsed evaluation dataset.
115
117
  :rtype: EvaluationDataset
116
118
  """
117
-
118
119
  data = json.loads(path.read_text(encoding="utf-8"))
119
120
  return EvaluationDataset.model_validate(data)
120
121
 
@@ -140,7 +141,6 @@ def evaluate_run(
140
141
  :return: Evaluation result bundle.
141
142
  :rtype: EvaluationResult
142
143
  """
143
-
144
144
  backend = get_backend(run.recipe.backend_id)
145
145
  latency_seconds: List[float] = []
146
146
  hit_count = 0
@@ -200,7 +200,6 @@ def _expected_rank(result: RetrievalResult, query: EvaluationQuery) -> Optional[
200
200
  :return: Rank of the first matching evidence item, or None.
201
201
  :rtype: int or None
202
202
  """
203
-
204
203
  for evidence in result.evidence:
205
204
  if query.expected_item_id and evidence.item_id == query.expected_item_id:
206
205
  return evidence.rank
@@ -218,7 +217,6 @@ def _average_latency_milliseconds(latencies: List[float]) -> float:
218
217
  :return: Average latency in milliseconds.
219
218
  :rtype: float
220
219
  """
221
-
222
220
  if not latencies:
223
221
  return 0.0
224
222
  return sum(latencies) / len(latencies) * 1000.0
@@ -233,7 +231,6 @@ def _percentile_95_latency_milliseconds(latencies: List[float]) -> float:
233
231
  :return: Percentile 95 latency in milliseconds.
234
232
  :rtype: float
235
233
  """
236
-
237
234
  if not latencies:
238
235
  return 0.0
239
236
  sorted_latencies = sorted(latencies)
@@ -252,7 +249,6 @@ def _run_artifact_bytes(corpus: Corpus, run: RetrievalRun) -> int:
252
249
  :return: Total artifact bytes.
253
250
  :rtype: int
254
251
  """
255
-
256
252
  total_bytes = 0
257
253
  for artifact_relpath in run.artifact_paths:
258
254
  artifact_path = corpus.root / artifact_relpath